From b53ca24f26c759a8ecf5e00f791a4134dd1a926c Mon Sep 17 00:00:00 2001 From: Johannes Ranke Date: Tue, 1 Nov 2022 17:26:05 +0100 Subject: Add read_spreadsheet --- DESCRIPTION | 2 +- NAMESPACE | 1 + R/read_spreadsheet.R | 113 ++++++++++++++++++++++++++++++++++++++++++++++++ log/check.log | 2 +- man/read_spreadsheet.Rd | 55 +++++++++++++++++++++++ 5 files changed, 171 insertions(+), 2 deletions(-) create mode 100644 R/read_spreadsheet.R create mode 100644 man/read_spreadsheet.Rd diff --git a/DESCRIPTION b/DESCRIPTION index aad2ebdc..aa5cfee2 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -26,7 +26,7 @@ Depends: R (>= 2.15.1), Imports: stats, graphics, methods, parallel, deSolve, R6, inline (>= 0.3.19), numDeriv, lmtest, pkgbuild, nlme (>= 3.1-151), saemix (>= 3.1), rlang, vctrs Suggests: knitr, rbenchmark, tikzDevice, testthat, rmarkdown, covr, vdiffr, - benchmarkme, tibble, stats4 + benchmarkme, tibble, stats4, readxl License: GPL LazyLoad: yes LazyData: yes diff --git a/NAMESPACE b/NAMESPACE index 9f87fce6..8dff6a26 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -136,6 +136,7 @@ export(parplot) export(plot_err) export(plot_res) export(plot_sep) +export(read_spreadsheet) export(saem) export(saemix_data) export(saemix_model) diff --git a/R/read_spreadsheet.R b/R/read_spreadsheet.R new file mode 100644 index 00000000..a20af6db --- /dev/null +++ b/R/read_spreadsheet.R @@ -0,0 +1,113 @@ +#' Read datasets and relevant meta information from a spreadsheet file +#' +#' This function imports one dataset from each sheet of a spreadsheet file. +#' These sheets are selected based on the contents of a sheet 'Datasets', with +#' a column called 'Dataset Number', containing numbers identifying the dataset +#' sheets to be read in. In the second column there must be a grouping +#' variable, which will often be named 'Soil'. Optionally, time normalization +#' factors can be given in columns named 'Temperature' and 'Moisture'. +#' +#' There must be a sheet 'Compounds', with columns 'Name' and 'Acronym'. +#' The first row read after the header read in from this sheet is assumed +#' to contain name and acronym of the parent compound. +#' +#' The dataset sheets should be named using the dataset numbers read in from +#' the 'Datasets' sheet, i.e. '1', '2', ... . In each dataset sheet, name +#' of the observed variable (e.g. the acronym of the parent compound or +#' one of its transformation products) should be in the first column, +#' the time values should be in the second colum, and the observed value +#' in the third column. +#' +#' In case relevant covariate data are available, they should be given +#' in a sheet 'Covariates', containing one line for each value of the grouping +#' variable specified in 'Datasets'. These values should be in the first +#' column and the column must have the same name as the second column in +#' 'Datasets'. Covariates will be read in from columns four and higher. +#' Their names should preferably not contain special characters like spaces, +#' so they can be easily used for specifying covariate models. +#' +#' An similar data structure is defined as the R6 class [mkindsg], but +#' is probably more complicated to use. +#' +#' @param path Absolute or relative path to the spreadsheet file +#' @param valid_datasets Optional numeric index of the valid datasets, default is +#' to use all datasets +#' @param parent_only Should only the parent data be used? +#' @param normalize Should the time scale be normalized using temperature +#' and moisture normalisation factors in the sheet 'Datasets'? +#' @export +read_spreadsheet <- function(path, valid_datasets = "all", + parent_only = TRUE, normalize = TRUE) +{ + if (!requireNamespace("readxl", quietly = TRUE)) + stop("Please install the readxl package to use this function") + + # Read the compound table + compounds <- readxl::read_excel(path, sheet = "Compounds") + parent <- compounds[1, ]$Acronym + + # Read in meta information + ds_meta <- readxl::read_excel(path, sheet = "Datasets") + ds_meta["Dataset Number"] <- as.character(ds_meta[["Dataset Number"]]) + + # Select valid datasets + if (valid_datasets[1] == "all") valid_datasets <- 1:nrow(ds_meta) + ds_numbers_valid <- ds_meta[valid_datasets, ]$`Dataset Number` + grouping_factor <- names(ds_meta[2]) # Often "Soil" + + # Read in valid datasets + ds_raw <- lapply(ds_numbers_valid, + function(dsn) readxl::read_excel(path, sheet = as.character(dsn))) + + # Make data frames compatible with mmkin + ds_tmp <- lapply(ds_raw, function(x) { + ds_ret <- x[1:3] |> + rlang::set_names(c("name", "time", "value")) |> + transform(value = as.numeric(value)) + }) + names(ds_tmp) <- ds_numbers_valid + + # Normalize with temperature and moisture correction factors + if (normalize) { + ds_norm <- lapply(ds_numbers_valid, function(ds_number) { + f_corr <- as.numeric(ds_meta[ds_number, c("Temperature", "Moisture")]) + ds_corr <- ds_tmp[[ds_number]] |> + transform(time = time * f_corr[1] * f_corr[2]) + return(ds_corr) + }) + } else { + ds_norm <- ds_tmp + } + names(ds_norm) <- ds_numbers_valid + + # Select parent data only if requested + if (parent_only) { + ds_norm <- lapply(ds_norm, function(x) subset(x, name == parent)) + compounds <- compounds[1, ] + } + + # Create a single long table to combine datasets with the same group name + ds_all <- vctrs::vec_rbind(!!!ds_norm, .names_to = "Dataset Number") + ds_all_group <- merge(ds_all, ds_meta[c("Dataset Number", grouping_factor)]) + groups <- unique(ds_meta[valid_datasets, ][[grouping_factor]]) + + ds <- lapply(groups, function(x) { + ret <- ds_all_group[ds_all_group[[grouping_factor]] == x, ] + ret[c("name", "time", "value")] + } + ) + names(ds) <- groups + + # Get covariates + covariates_raw <- readxl::read_excel(path, sheet = "Covariates") + covariates <- as.data.frame(covariates_raw[4:ncol(covariates_raw)]) + rownames(covariates) <- covariates_raw[[1]] + covariates <- covariates[which(colnames(covariates) != "Remarks")] + + # Attach the compound list to support automatic model building + attr(ds, "compounds") <- as.data.frame(compounds) + + # Attach covariate data + attr(ds, "covariates") <- covariates[groups, , drop = FALSE] + return(ds) +} diff --git a/log/check.log b/log/check.log index 44830443..4092108e 100644 --- a/log/check.log +++ b/log/check.log @@ -59,7 +59,7 @@ The Date field is over a month old. * checking data for ASCII and uncompressed saves ... OK * checking installed files from ‘inst/doc’ ... OK * checking files in ‘vignettes’ ... OK -* checking examples ... [16s/16s] OK +* checking examples ... [15s/15s] OK * checking for unstated dependencies in ‘tests’ ... OK * checking tests ... SKIPPED * checking for unstated dependencies in vignettes ... OK diff --git a/man/read_spreadsheet.Rd b/man/read_spreadsheet.Rd new file mode 100644 index 00000000..147d09bf --- /dev/null +++ b/man/read_spreadsheet.Rd @@ -0,0 +1,55 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/read_spreadsheet.R +\name{read_spreadsheet} +\alias{read_spreadsheet} +\title{Read datasets and relevant meta information from a spreadsheet file} +\usage{ +read_spreadsheet( + path, + valid_datasets = "all", + parent_only = TRUE, + normalize = TRUE +) +} +\arguments{ +\item{path}{Absolute or relative path to the spreadsheet file} + +\item{valid_datasets}{Optional numeric index of the valid datasets, default is +to use all datasets} + +\item{parent_only}{Should only the parent data be used?} + +\item{normalize}{Should the time scale be normalized using temperature +and moisture normalisation factors in the sheet 'Datasets'?} +} +\description{ +This function imports one dataset from each sheet of a spreadsheet file. +These sheets are selected based on the contents of a sheet 'Datasets', with +a column called 'Dataset Number', containing numbers identifying the dataset +sheets to be read in. In the second column there must be a grouping +variable, which will often be named 'Soil'. Optionally, time normalization +factors can be given in columns named 'Temperature' and 'Moisture'. +} +\details{ +There must be a sheet 'Compounds', with columns 'Name' and 'Acronym'. +The first row read after the header read in from this sheet is assumed +to contain name and acronym of the parent compound. + +The dataset sheets should be named using the dataset numbers read in from +the 'Datasets' sheet, i.e. '1', '2', ... . In each dataset sheet, name +of the observed variable (e.g. the acronym of the parent compound or +one of its transformation products) should be in the first column, +the time values should be in the second colum, and the observed value +in the third column. + +In case relevant covariate data are available, they should be given +in a sheet 'Covariates', containing one line for each value of the grouping +variable specified in 'Datasets'. These values should be in the first +column and the column must have the same name as the second column in +'Datasets'. Covariates will be read in from columns four and higher. +Their names should preferably not contain special characters like spaces, +so they can be easily used for specifying covariate models. + +An similar data structure is defined as the R6 class \link{mkindsg}, but +is probably more complicated to use. +} -- cgit v1.2.1