From efab37957381919c21d874906ce870f4941c760a Mon Sep 17 00:00:00 2001 From: Johannes Ranke Date: Sat, 9 May 2020 21:18:42 +0200 Subject: Avoid the call to merge for analytical solutions This increases performance up to a factor of five! --- R/create_deg_func.R | 32 +++++++++++------------ R/mkinerrmin.R | 28 ++++++++++---------- R/mkinfit.R | 75 ++++++++++++++++++++++++++--------------------------- R/mkinmod.R | 2 +- R/mkinpredict.R | 17 ++++++++++-- R/plot.mkinfit.R | 1 + 6 files changed, 83 insertions(+), 72 deletions(-) (limited to 'R') diff --git a/R/create_deg_func.R b/R/create_deg_func.R index fb419a98..886c5e8b 100644 --- a/R/create_deg_func.R +++ b/R/create_deg_func.R @@ -21,19 +21,17 @@ create_deg_func <- function(spec, use_of_ff = c("min", "max")) { use_of_ff <- match.arg(use_of_ff) - min_ff <- use_of_ff == "min" - obs_vars <- names(spec) - n <- character(0) - parent <- obs_vars[1] parent_type <- spec[[1]]$type supported <- TRUE # This may be modified below - n[1] <- paste0(parent, " = ", parent_type, ".solution(outtimes, odeini['", parent, + predicted_text <- character(0) + + predicted_text[parent] <- paste0(parent_type, ".solution(t, odeini['", parent, if (parent_type == "SFORB") "_free", "'], ", switch(parent_type, SFO = paste0("k_", parent, if (min_ff) "_sink" else "", ")"), @@ -59,25 +57,27 @@ create_deg_func <- function(spec, use_of_ff = c("min", "max")) { k2 <- paste0("k_", n2) f12 <- paste0("f_", n1, "_to_", n2) if (parent_type == "SFO") { - n[2] <- paste0(n2, " = (((", k2, "-", k1, ")*", n20, "-", f12, "*", k1, "*", n10, ")*exp(-", k2, "*outtimes)+", - f12, "*", k1, "*", n10, "*exp(-", k1, "*outtimes))/(", k2, "-", k1, ")") + predicted_text[n2] <- paste0( + "(((", k2, "-", k1, ")*", n20, "-", f12, "*", k1, "*", n10, ")*exp(-", k2, "*t)+", + f12, "*", k1, "*", n10, "*exp(-", k1, "*t))/(", k2, "-", k1, ")") } } } if (supported) { - all_n <- paste(n, collapse = ",\n") + deg_func <- function(observed, odeini, odeparms) {} f_body <- paste0("{\n", - "out <- with(as.list(odeparms), {\n", - "data.frame(\n", - "time = outtimes,\n", - all_n, ")\n", + "predicted <- numeric(0)\n", + "with(as.list(odeparms), {\n") + for (obs_var in obs_vars) { + f_body <- paste0(f_body, + "t <- observed[observed$name == '", obs_var, "', 'time']\n", + "predicted <<- c(predicted, ", predicted_text[obs_var], ")\n") + } + f_body <- paste0(f_body, "})\n", - "return(out)\n}\n" - ) - - deg_func <- function(odeini, odeparms, outtimes) {} + "return(predicted)\n}\n") body(deg_func) <- parse(text = f_body) return(deg_func) diff --git a/R/mkinerrmin.R b/R/mkinerrmin.R index 0b647b81..1994aed0 100644 --- a/R/mkinerrmin.R +++ b/R/mkinerrmin.R @@ -1,12 +1,12 @@ if(getRversion() >= '2.15.1') utils::globalVariables(c("name", "value_mean")) #' Calculate the minimum error to assume in order to pass the variance test -#' +#' #' This function finds the smallest relative error still resulting in passing #' the chi-squared test as defined in the FOCUS kinetics report from 2006. -#' +#' #' This function is used internally by \code{\link{summary.mkinfit}}. -#' +#' #' @param fit an object of class \code{\link{mkinfit}}. #' @param alpha The confidence level chosen for the chi-squared test. #' @importFrom stats qchisq aggregate @@ -25,43 +25,41 @@ if(getRversion() >= '2.15.1') utils::globalVariables(c("name", "value_mean")) #' \url{http://esdac.jrc.ec.europa.eu/projects/degradation-kinetics} #' @keywords manip #' @examples -#' +#' #' SFO_SFO = mkinmod(parent = mkinsub("SFO", to = "m1"), #' m1 = mkinsub("SFO"), #' use_of_ff = "max") -#' +#' #' fit_FOCUS_D = mkinfit(SFO_SFO, FOCUS_2006_D, quiet = TRUE) #' round(mkinerrmin(fit_FOCUS_D), 4) #' \dontrun{ #' fit_FOCUS_E = mkinfit(SFO_SFO, FOCUS_2006_E, quiet = TRUE) #' round(mkinerrmin(fit_FOCUS_E), 4) #' } -#' +#' #' @export mkinerrmin <- function(fit, alpha = 0.05) { parms.optim <- fit$par kinerrmin <- function(errdata, n.parms) { - means.mean <- mean(errdata$value_mean, na.rm = TRUE) - df = length(errdata$value_mean) - n.parms + means.mean <- mean(errdata$observed, na.rm = TRUE) + df = nrow(errdata) - n.parms err.min <- sqrt((1 / qchisq(1 - alpha, df)) * - sum((errdata$value_mean - errdata$value_pred)^2)/(means.mean^2)) + sum((errdata$observed - errdata$predicted)^2)/(means.mean^2)) return(list(err.min = err.min, n.optim = n.parms, df = df)) } - means <- aggregate(value ~ time + name, data = fit$observed, mean, na.rm=TRUE) - errdata <- merge(means, fit$predicted, by = c("time", "name"), - suffixes = c("_mean", "_pred")) - errdata <- errdata[order(errdata$time, errdata$name), ] + errdata <- aggregate(cbind(observed, predicted) ~ time + variable, data = fit$data, mean, na.rm=TRUE) + errdata <- errdata[order(errdata$time, errdata$variable), ] # Remove values at time zero for variables whose value for state.ini is fixed, # as these will not have any effect in the optimization and should therefore not # be counted as degrees of freedom. fixed_initials = gsub("_0$", "", rownames(subset(fit$fixed, type = "state"))) - errdata <- subset(errdata, !(time == 0 & name %in% fixed_initials)) + errdata <- subset(errdata, !(time == 0 & variable %in% fixed_initials)) n.optim.overall <- length(parms.optim) - length(fit$errparms) @@ -73,7 +71,7 @@ mkinerrmin <- function(fit, alpha = 0.05) # The degrees of freedom are counted according to FOCUS kinetics (2011, p. 164) for (obs_var in fit$obs_vars) { - errdata.var <- subset(errdata, name == obs_var) + errdata.var <- subset(errdata, variable == obs_var) # Check if initial value is optimised n.initials.optim <- length(grep(paste(obs_var, ".*", "_0", sep=""), names(parms.optim))) diff --git a/R/mkinfit.R b/R/mkinfit.R index 6cfb88c0..c33afbf0 100644 --- a/R/mkinfit.R +++ b/R/mkinfit.R @@ -288,6 +288,10 @@ mkinfit <- function(mkinmod, observed, observed <- subset(observed, value != 0) } + # Sort observed values for efficient analytical predictions + observed$name <- ordered(observed$name, levels = obs_vars) + observed <- observed[order(observed$name, observed$time), ] + # Obtain data for decline from maximum mean value if requested if (from_max_mean) { # This is only used for simple decline models @@ -520,7 +524,7 @@ mkinfit <- function(mkinmod, observed, } # Unique outtimes for model solution. - outtimes = sort(unique(observed$time)) + outtimes <- sort(unique(observed$time)) # Define the objective function for optimisation, including (back)transformations cost_function <- function(P, trans = TRUE, OLS = FALSE, fixed_degparms = FALSE, fixed_errparms = FALSE, update_data = TRUE, ...) @@ -578,47 +582,51 @@ mkinfit <- function(mkinmod, observed, } # Solve the system with current parameter values - out <- mkinpredict(mkinmod, parms, - odeini, outtimes, - solution_type = solution_type, - use_compiled = use_compiled, - method.ode = method.ode, - atol = atol, rtol = rtol, ...) - - out_long <- mkin_wide_to_long(out, time = "time") - - # Surprisingly, the next line is the one taking the most time for one, two - # or three observed variables if we use compiled ODE models - # as evidenced by use of github:hadley/lineprof - cost_data <- merge(observed[c("name", "time", "value")], out_long, - by = c("name", "time"), suffixes = c(".observed", ".predicted")) + if (solution_type == "analytical") { + observed$predicted <- mkinmod$deg_func(observed, odeini, parms) + } else { + out <- mkinpredict(mkinmod, parms, + odeini, outtimes, + solution_type = solution_type, + use_compiled = use_compiled, + method.ode = method.ode, + atol = atol, rtol = rtol, ...) + + out_long <- mkin_wide_to_long(out, time = "time") + + out_merged <- merge(observed[c("name", "time")], out_long) + out_merged$name <- ordered(out_merged$name, levels = obs_vars) + out_merged <- out_merged[order(out_merged$name, out_merged$time), ] + observed$predicted <- out_merged$value + } + # Define standard deviation for each observation if (err_mod == "const") { - cost_data$std <- if (OLS) NA else cost_errparms["sigma"] + observed$std <- if (OLS) NA else cost_errparms["sigma"] } if (err_mod == "obs") { - std_names <- paste0("sigma_", cost_data$name) - cost_data$std <- cost_errparms[std_names] + std_names <- paste0("sigma_", observed$name) + observed$std <- cost_errparms[std_names] } if (err_mod == "tc") { - cost_data$std <- sqrt(cost_errparms["sigma_low"]^2 + cost_data$value.predicted^2 * cost_errparms["rsd_high"]^2) + observed$std <- sqrt(cost_errparms["sigma_low"]^2 + observed$predicted^2 * cost_errparms["rsd_high"]^2) } + # Calculate model cost if (OLS) { # Cost is the sum of squared residuals - cost <- with(cost_data, sum((value.observed - value.predicted)^2)) + cost <- with(observed, sum((value - predicted)^2)) } else { # Cost is the negative log-likelihood - cost <- - with(cost_data, - sum(dnorm(x = value.observed, mean = value.predicted, sd = std, log = TRUE))) + cost <- - with(observed, + sum(dnorm(x = value, mean = predicted, sd = std, log = TRUE))) } # We update the current cost and data during the optimisation, not # during hessian calculations if (update_data) { - assign("out_predicted", out_long, inherits = TRUE) - assign("current_data", cost_data, inherits = TRUE) + assign("current_data", observed, inherits = TRUE) if (cost < cost.current) { assign("cost.current", cost, inherits = TRUE) @@ -718,7 +726,7 @@ mkinfit <- function(mkinmod, observed, degparms <- fit$par # Get the maximum likelihood estimate for sigma at the optimum parameter values - current_data$residual <- current_data$value.observed - current_data$value.predicted + current_data$residual <- current_data$value - current_data$predicted sigma_mle <- sqrt(sum(current_data$residual^2)/nrow(current_data)) # Use that estimate for the constant variance, or as first guess if err_mod = "obs" @@ -855,11 +863,7 @@ mkinfit <- function(mkinmod, observed, # We also need the model and a model name for summary and plotting fit$mkinmod <- mkinmod fit$mkinmod$name <- mkinmod_name - - # We need data and predictions for summary and plotting - fit$observed <- observed fit$obs_vars <- obs_vars - fit$predicted <- out_predicted # Residual sum of squares as a function of the fitted parameters fit$rss <- function(P) cost_function(P, OLS = TRUE, update_data = FALSE) @@ -886,15 +890,10 @@ mkinfit <- function(mkinmod, observed, fit$fixed$type = c(rep("state", length(state.ini.fixed)), rep("deparm", length(parms.fixed))) - # Sort observed, predicted and residuals - current_data$name <- ordered(current_data$name, levels = obs_vars) - - ordered_data <- current_data[order(current_data$name, current_data$time), ] - - fit$data <- data.frame(time = ordered_data$time, - variable = ordered_data$name, - observed = ordered_data$value.observed, - predicted = ordered_data$value.predicted) + fit$data <- data.frame(time = current_data$time, + variable = current_data$name, + observed = current_data$value, + predicted = current_data$predicted) fit$data$residual <- fit$data$observed - fit$data$predicted diff --git a/R/mkinmod.R b/R/mkinmod.R index 21551861..7e9e9248 100644 --- a/R/mkinmod.R +++ b/R/mkinmod.R @@ -70,7 +70,7 @@ #' @examples #' #' # Specify the SFO model (this is not needed any more, as we can now mkinfit("SFO", ...) -#' SFO <- mkinmod(parent = list(type = "SFO")) +#' SFO <- mkinmod(parent = mkinsub("SFO")) #' #' # One parent compound, one metabolite, both single first order #' SFO_SFO <- mkinmod( diff --git a/R/mkinpredict.R b/R/mkinpredict.R index 8c76ca05..75582fac 100644 --- a/R/mkinpredict.R +++ b/R/mkinpredict.R @@ -92,10 +92,15 @@ #' c(parent = 100, m1 = 0), seq(0, 20, by = 0.1), #' solution_type = "analytical", use_compiled = FALSE)[201,]) #' } +#' analytical = mkinpredict(SFO_SFO, +#' c(k_parent = 0.15, f_parent_to_m1 = 0.5, k_m1 = 0.01), +#' c(parent = 100, m1 = 0), seq(0, 20, by = 0.1), +#' solution_type = "analytical", use_compiled = FALSE)[201,] #' #' \dontrun{ #' # Predict from a fitted model #' f <- mkinfit(SFO_SFO, FOCUS_2006_C, quiet = TRUE) +#' f <- mkinfit(SFO_SFO, FOCUS_2006_C, quiet = TRUE, solution_type = "analytical") #' head(mkinpredict(f)) #' } #' @@ -131,8 +136,16 @@ mkinpredict.mkinmod <- function(x, } if (solution_type == "analytical") { - out <- x$deg_func(odeini = odeini, - odeparms = odeparms, outtimes = outtimes) + # This is clumsy, as we wanted fast analytical predictions for mkinfit + out <- data.frame(time = outtimes) + obs_vars <- names(x$spec) + pseudo_observed <- + data.frame(name = rep(obs_vars, each = length(outtimes)), + time = rep(outtimes, length(obs_vars))) + pseudo_observed$predicted <- x$deg_func(pseudo_observed, odeini, odeparms) + for (obs_var in obs_vars) { + out[obs_var] <- pseudo_observed[pseudo_observed$name == obs_var, "predicted"] + } } if (solution_type == "eigen") { diff --git a/R/plot.mkinfit.R b/R/plot.mkinfit.R index 4b5bfa3a..b2cb4890 100644 --- a/R/plot.mkinfit.R +++ b/R/plot.mkinfit.R @@ -69,6 +69,7 @@ if(getRversion() >= '2.15.1') utils::globalVariables(c("type", "variable", "obse #' \dontrun{ #' SFO_SFO <- mkinmod(parent = mkinsub("SFO", "m1", full = "Parent"), #' m1 = mkinsub("SFO", full = "Metabolite M1" )) +#' fit <- mkinfit(SFO_SFO, FOCUS_2006_D, quiet = TRUE) #' fit <- mkinfit(SFO_SFO, FOCUS_2006_D, quiet = TRUE, error_model = "tc") #' plot(fit) #' plot_res(fit) -- cgit v1.2.1