R/read_spreadsheet.R


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120

#' Read datasets and relevant meta information from a spreadsheet file
#'
#' This function imports one dataset from each sheet of a spreadsheet file.
#' These sheets are selected based on the contents of a sheet 'Datasets', with
#' a column called 'Dataset Number', containing numbers identifying the dataset
#' sheets to be read in. In the second column there must be a grouping
#' variable, which will often be named 'Soil'. Optionally, time normalization
#' factors can be given in columns named 'Temperature' and 'Moisture'.
#'
#' There must be a sheet 'Compounds', with columns 'Name' and 'Acronym'.
#' The first row read after the header read in from this sheet is assumed
#' to contain name and acronym of the parent compound.
#'
#' The dataset sheets should be named using the dataset numbers read in from
#' the 'Datasets' sheet, i.e. '1', '2', ... . In each dataset sheet, name
#' of the observed variable (e.g. the acronym of the parent compound or
#' one of its transformation products) should be in the first column,
#' the time values should be in the second colum, and the observed value
#' in the third column.
#'
#' In case relevant covariate data are available, they should be given
#' in a sheet 'Covariates', containing one line for each value of the grouping
#' variable specified in 'Datasets'. These values should be in the first
#' column and the column must have the same name as the second column in
#' 'Datasets'. Covariates will be read in from columns four and higher.
#' Their names should preferably not contain special characters like spaces,
#' so they can be easily used for specifying covariate models.
#'
#' An similar data structure is defined as the R6 class [mkindsg], but
#' is probably more complicated to use.
#'
#' @param path Absolute or relative path to the spreadsheet file
#' @param valid_datasets Optional numeric index of the valid datasets, default is
#' to use all datasets
#' @param parent_only Should only the parent data be used?
#' @param normalize Should the time scale be normalized using temperature
#' and moisture normalisation factors in the sheet 'Datasets'?
#' @export
read_spreadsheet <- function(path, valid_datasets = "all",
  parent_only = FALSE, normalize = TRUE)
{
  if (!requireNamespace("readxl", quietly = TRUE))
    stop("Please install the readxl package to use this function")

  # Read the compound table
  compounds <- readxl::read_excel(path, sheet = "Compounds")
  parent <- compounds[1, ]$Acronym

  # Read in meta information
  ds_meta <- readxl::read_excel(path, sheet = "Datasets")
  ds_meta["Dataset Number"] <- as.character(ds_meta[["Dataset Number"]])

  # Select valid datasets
  if (valid_datasets[1] == "all") valid_datasets <- 1:nrow(ds_meta)
  ds_numbers_valid <- ds_meta[valid_datasets, ]$`Dataset Number`
  grouping_factor <- names(ds_meta[2]) # Often "Soil"

  # Read in valid datasets
  ds_raw <- lapply(ds_numbers_valid,
    function(dsn) readxl::read_excel(path, sheet = as.character(dsn)))

  # Make data frames compatible with mmkin
  ds_tmp <- lapply(ds_raw, function(x) {
    ds_ret <- x[1:3] |>
      rlang::set_names(c("name", "time", "value")) |>
      transform(value = as.numeric(value))
  })
  names(ds_tmp) <- ds_numbers_valid

  # Normalize with temperature and moisture correction factors
  if (normalize) {
    ds_norm <- lapply(ds_numbers_valid, function(ds_number) {
      f_corr <- as.numeric(ds_meta[ds_number, c("Temperature", "Moisture")])
      ds_corr <- ds_tmp[[ds_number]] |>
        transform(time = time * f_corr[1] * f_corr[2])
      return(ds_corr)
    })
  } else {
    ds_norm <- ds_tmp
  }
  names(ds_norm) <- ds_numbers_valid

  # Select parent data only if requested
  if (parent_only) {
    ds_norm <- lapply(ds_norm, function(x) subset(x, name == parent))
    compounds <- compounds[1, ]
  }

  # Create a single long table to combine datasets with the same group name
  ds_all <- vctrs::vec_rbind(!!!ds_norm, .names_to = "Dataset Number")
  ds_all_group <- merge(ds_all, ds_meta[c("Dataset Number", grouping_factor)])
  groups <- unique(ds_meta[valid_datasets, ][[grouping_factor]])

  ds <- lapply(groups, function(x) {
      ret <- ds_all_group[ds_all_group[[grouping_factor]] == x, ]
      ret[c("name", "time", "value")]
    }
  )
  names(ds) <- groups

  # Get covariates
  covariates_raw <- readxl::read_excel(path, sheet = "Covariates")
  covariates <- as.data.frame(covariates_raw[4:ncol(covariates_raw)])
  nocov <- setdiff(groups, covariates_raw[[1]])
  if (length(nocov) > 0) {
    message("Did not find covariate data for ", paste(nocov, collapse = ", "))
    message("Not returning covariate data")
    attr(ds, "covariates") <- NULL
  } else {
    rownames(covariates) <- covariates_raw[[1]]
    covariates <- covariates[which(colnames(covariates) != "Remarks")]
    # Attach covariate data if available
    attr(ds, "covariates") <- covariates[groups, , drop = FALSE]
  }

  # Attach the compound list to support automatic model building
  attr(ds, "compounds") <- as.data.frame(compounds)

  return(ds)
}