Add read_spreadsheet

author: Johannes Ranke <jranke@uni-bremen.de> 2022-11-01 17:26:05 +0100
committer: Johannes Ranke <jranke@uni-bremen.de> 2022-11-01 17:26:05 +0100
commit: b53ca24f26c759a8ecf5e00f791a4134dd1a926c (patch)
tree: f3de2c40da72939e8a68af19520e2b24555fcca4
parent: 630e657f1794ea441afc9ff10663309fec5e847e (diff)
5 files changed, 171 insertions, 2 deletions
diff --git a/DESCRIPTION b/DESCRIPTION
index aad2ebdc..aa5cfee2 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -26,7 +26,7 @@ Depends: R (>= 2.15.1),
 Imports: stats, graphics, methods, parallel, deSolve, R6, inline (>= 0.3.19),
   numDeriv, lmtest, pkgbuild, nlme (>= 3.1-151), saemix (>= 3.1), rlang, vctrs
 Suggests: knitr, rbenchmark, tikzDevice, testthat, rmarkdown, covr, vdiffr,
-  benchmarkme, tibble, stats4
+  benchmarkme, tibble, stats4, readxl
 License: GPL
 LazyLoad: yes
 LazyData: yes
diff --git a/NAMESPACE b/NAMESPACE
index 9f87fce6..8dff6a26 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -136,6 +136,7 @@ export(parplot)
 export(plot_err)
 export(plot_res)
 export(plot_sep)
+export(read_spreadsheet)
 export(saem)
 export(saemix_data)
 export(saemix_model)
diff --git a/R/read_spreadsheet.R b/R/read_spreadsheet.R
new file mode 100644
index 00000000..a20af6db
--- /dev/null
+++ b/R/read_spreadsheet.R
@@ -0,0 +1,113 @@
+#' Read datasets and relevant meta information from a spreadsheet file
+#'
+#' This function imports one dataset from each sheet of a spreadsheet file.
+#' These sheets are selected based on the contents of a sheet 'Datasets', with
+#' a column called 'Dataset Number', containing numbers identifying the dataset
+#' sheets to be read in. In the second column there must be a grouping
+#' variable, which will often be named 'Soil'. Optionally, time normalization
+#' factors can be given in columns named 'Temperature' and 'Moisture'.
+#'
+#' There must be a sheet 'Compounds', with columns 'Name' and 'Acronym'.
+#' The first row read after the header read in from this sheet is assumed
+#' to contain name and acronym of the parent compound.
+#'
+#' The dataset sheets should be named using the dataset numbers read in from
+#' the 'Datasets' sheet, i.e. '1', '2', ... . In each dataset sheet, name
+#' of the observed variable (e.g. the acronym of the parent compound or
+#' one of its transformation products) should be in the first column,
+#' the time values should be in the second colum, and the observed value
+#' in the third column.
+#'
+#' In case relevant covariate data are available, they should be given
+#' in a sheet 'Covariates', containing one line for each value of the grouping
+#' variable specified in 'Datasets'. These values should be in the first
+#' column and the column must have the same name as the second column in
+#' 'Datasets'. Covariates will be read in from columns four and higher.
+#' Their names should preferably not contain special characters like spaces,
+#' so they can be easily used for specifying covariate models.
+#'
+#' An similar data structure is defined as the R6 class [mkindsg], but
+#' is probably more complicated to use.
+#'
+#' @param path Absolute or relative path to the spreadsheet file
+#' @param valid_datasets Optional numeric index of the valid datasets, default is
+#' to use all datasets
+#' @param parent_only Should only the parent data be used?
+#' @param normalize Should the time scale be normalized using temperature
+#' and moisture normalisation factors in the sheet 'Datasets'?
+#' @export
+read_spreadsheet <- function(path, valid_datasets = "all",
+  parent_only = TRUE, normalize = TRUE)
+{
+  if (!requireNamespace("readxl", quietly = TRUE))
+    stop("Please install the readxl package to use this function")
+
+  # Read the compound table
+  compounds <- readxl::read_excel(path, sheet = "Compounds")
+  parent <- compounds[1, ]$Acronym
+
+  # Read in meta information
+  ds_meta <- readxl::read_excel(path, sheet = "Datasets")
+  ds_meta["Dataset Number"] <- as.character(ds_meta[["Dataset Number"]])
+
+  # Select valid datasets
+  if (valid_datasets[1] == "all") valid_datasets <- 1:nrow(ds_meta)
+  ds_numbers_valid <- ds_meta[valid_datasets, ]$`Dataset Number`
+  grouping_factor <- names(ds_meta[2]) # Often "Soil"
+
+  # Read in valid datasets
+  ds_raw <- lapply(ds_numbers_valid,
+    function(dsn) readxl::read_excel(path, sheet = as.character(dsn)))
+
+  # Make data frames compatible with mmkin
+  ds_tmp <- lapply(ds_raw, function(x) {
+    ds_ret <- x[1:3] |>
+      rlang::set_names(c("name", "time", "value")) |>
+      transform(value = as.numeric(value))
+  })
+  names(ds_tmp) <- ds_numbers_valid
+
+  # Normalize with temperature and moisture correction factors
+  if (normalize) {
+    ds_norm <- lapply(ds_numbers_valid, function(ds_number) {
+      f_corr <- as.numeric(ds_meta[ds_number, c("Temperature", "Moisture")])
+      ds_corr <- ds_tmp[[ds_number]] |>
+        transform(time = time * f_corr[1] * f_corr[2])
+      return(ds_corr)
+    })
+  } else {
+    ds_norm <- ds_tmp
+  }
+  names(ds_norm) <- ds_numbers_valid
+
+  # Select parent data only if requested
+  if (parent_only) {
+    ds_norm <- lapply(ds_norm, function(x) subset(x, name == parent))
+    compounds <- compounds[1, ]
+  }
+
+  # Create a single long table to combine datasets with the same group name
+  ds_all <- vctrs::vec_rbind(!!!ds_norm, .names_to = "Dataset Number")
+  ds_all_group <- merge(ds_all, ds_meta[c("Dataset Number", grouping_factor)])
+  groups <- unique(ds_meta[valid_datasets, ][[grouping_factor]])
+
+  ds <- lapply(groups, function(x) {
+      ret <- ds_all_group[ds_all_group[[grouping_factor]] == x, ]
+      ret[c("name", "time", "value")]
+    }
+  )
+  names(ds) <- groups
+
+  # Get covariates
+  covariates_raw <- readxl::read_excel(path, sheet = "Covariates")
+  covariates <- as.data.frame(covariates_raw[4:ncol(covariates_raw)])
+  rownames(covariates) <- covariates_raw[[1]]
+  covariates <- covariates[which(colnames(covariates) != "Remarks")]
+
+  # Attach the compound list to support automatic model building
+  attr(ds, "compounds") <- as.data.frame(compounds)
+
+  # Attach covariate data
+  attr(ds, "covariates") <- covariates[groups, , drop = FALSE]
+  return(ds)
+}
diff --git a/log/check.log b/log/check.log
index 44830443..4092108e 100644
--- a/log/check.log
+++ b/log/check.log
@@ -59,7 +59,7 @@ The Date field is over a month old.
 * checking data for ASCII and uncompressed saves ... OK
 * checking installed files from ‘inst/doc’ ... OK
 * checking files in ‘vignettes’ ... OK
-* checking examples ... [16s/16s] OK
+* checking examples ... [15s/15s] OK
 * checking for unstated dependencies in ‘tests’ ... OK
 * checking tests ... SKIPPED
 * checking for unstated dependencies in vignettes ... OK
diff --git a/man/read_spreadsheet.Rd b/man/read_spreadsheet.Rd
new file mode 100644
index 00000000..147d09bf
--- /dev/null
+++ b/man/read_spreadsheet.Rd
@@ -0,0 +1,55 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/read_spreadsheet.R
+\name{read_spreadsheet}
+\alias{read_spreadsheet}
+\title{Read datasets and relevant meta information from a spreadsheet file}
+\usage{
+read_spreadsheet(
+  path,
+  valid_datasets = "all",
+  parent_only = TRUE,
+  normalize = TRUE
+)
+}
+\arguments{
+\item{path}{Absolute or relative path to the spreadsheet file}
+
+\item{valid_datasets}{Optional numeric index of the valid datasets, default is
+to use all datasets}
+
+\item{parent_only}{Should only the parent data be used?}
+
+\item{normalize}{Should the time scale be normalized using temperature
+and moisture normalisation factors in the sheet 'Datasets'?}
+}
+\description{
+This function imports one dataset from each sheet of a spreadsheet file.
+These sheets are selected based on the contents of a sheet 'Datasets', with
+a column called 'Dataset Number', containing numbers identifying the dataset
+sheets to be read in. In the second column there must be a grouping
+variable, which will often be named 'Soil'. Optionally, time normalization
+factors can be given in columns named 'Temperature' and 'Moisture'.
+}
+\details{
+There must be a sheet 'Compounds', with columns 'Name' and 'Acronym'.
+The first row read after the header read in from this sheet is assumed
+to contain name and acronym of the parent compound.
+
+The dataset sheets should be named using the dataset numbers read in from
+the 'Datasets' sheet, i.e. '1', '2', ... . In each dataset sheet, name
+of the observed variable (e.g. the acronym of the parent compound or
+one of its transformation products) should be in the first column,
+the time values should be in the second colum, and the observed value
+in the third column.
+
+In case relevant covariate data are available, they should be given
+in a sheet 'Covariates', containing one line for each value of the grouping
+variable specified in 'Datasets'. These values should be in the first
+column and the column must have the same name as the second column in
+'Datasets'. Covariates will be read in from columns four and higher.
+Their names should preferably not contain special characters like spaces,
+so they can be easily used for specifying covariate models.
+
+An similar data structure is defined as the R6 class \link{mkindsg}, but
+is probably more complicated to use.
+}
author	Johannes Ranke <jranke@uni-bremen.de>	2022-11-01 17:26:05 +0100
committer	Johannes Ranke <jranke@uni-bremen.de>	2022-11-01 17:26:05 +0100
commit	b53ca24f26c759a8ecf5e00f791a4134dd1a926c (patch)
tree	f3de2c40da72939e8a68af19520e2b24555fcca4
parent	630e657f1794ea441afc9ff10663309fec5e847e (diff)