From efab37957381919c21d874906ce870f4941c760a Mon Sep 17 00:00:00 2001
From: Johannes Ranke <jranke@uni-bremen.de>
Date: Sat, 9 May 2020 21:18:42 +0200
Subject: Avoid the call to merge for analytical solutions

This increases performance up to a factor of five!
---
 R/create_deg_func.R | 32 +++++++++++------------
 R/mkinerrmin.R      | 28 ++++++++++----------
 R/mkinfit.R         | 75 ++++++++++++++++++++++++++---------------------------
 R/mkinmod.R         |  2 +-
 R/mkinpredict.R     | 17 ++++++++++--
 R/plot.mkinfit.R    |  1 +
 6 files changed, 83 insertions(+), 72 deletions(-)

(limited to 'R')

diff --git a/R/create_deg_func.R b/R/create_deg_func.R
index fb419a98..886c5e8b 100644
--- a/R/create_deg_func.R
+++ b/R/create_deg_func.R
@@ -21,19 +21,17 @@
 create_deg_func <- function(spec, use_of_ff = c("min", "max")) {
 
   use_of_ff <- match.arg(use_of_ff)
-
   min_ff <- use_of_ff == "min"
-
   obs_vars <- names(spec)
 
-  n <- character(0)
-
   parent <- obs_vars[1]
   parent_type <- spec[[1]]$type
 
   supported <- TRUE # This may be modified below
 
-  n[1] <- paste0(parent, " = ", parent_type, ".solution(outtimes, odeini['", parent,
+  predicted_text <- character(0)
+
+  predicted_text[parent] <- paste0(parent_type, ".solution(t, odeini['", parent,
     if (parent_type == "SFORB") "_free", "'], ",
     switch(parent_type,
       SFO = paste0("k_", parent, if (min_ff) "_sink" else "", ")"),
@@ -59,25 +57,27 @@ create_deg_func <- function(spec, use_of_ff = c("min", "max")) {
       k2 <- paste0("k_", n2)
       f12 <- paste0("f_", n1, "_to_", n2)
       if (parent_type == "SFO") {
-        n[2] <- paste0(n2, " = (((", k2, "-", k1, ")*", n20, "-", f12, "*", k1, "*", n10, ")*exp(-", k2, "*outtimes)+",
-          f12, "*", k1, "*", n10, "*exp(-", k1, "*outtimes))/(", k2, "-", k1, ")")
+        predicted_text[n2] <- paste0(
+          "(((", k2, "-", k1, ")*", n20, "-", f12, "*", k1, "*", n10, ")*exp(-", k2, "*t)+",
+          f12, "*", k1, "*", n10, "*exp(-", k1, "*t))/(", k2, "-", k1, ")")
       }
     }
   }
 
   if (supported) {
-    all_n <- paste(n, collapse = ",\n")
+    deg_func <- function(observed, odeini, odeparms) {}
 
     f_body <- paste0("{\n",
-      "out <- with(as.list(odeparms), {\n",
-      "data.frame(\n",
-        "time = outtimes,\n",
-        all_n, ")\n",
+      "predicted <- numeric(0)\n",
+      "with(as.list(odeparms), {\n")
+    for (obs_var in obs_vars) {
+      f_body <- paste0(f_body,
+      "t <- observed[observed$name == '", obs_var, "', 'time']\n",
+      "predicted <<- c(predicted, ", predicted_text[obs_var], ")\n")
+    }
+    f_body <- paste0(f_body,
       "})\n",
-      "return(out)\n}\n"
-    )
-
-    deg_func <- function(odeini, odeparms, outtimes) {}
+      "return(predicted)\n}\n")
 
     body(deg_func) <- parse(text = f_body)
     return(deg_func)
diff --git a/R/mkinerrmin.R b/R/mkinerrmin.R
index 0b647b81..1994aed0 100644
--- a/R/mkinerrmin.R
+++ b/R/mkinerrmin.R
@@ -1,12 +1,12 @@
 if(getRversion() >= '2.15.1') utils::globalVariables(c("name", "value_mean"))
 
 #' Calculate the minimum error to assume in order to pass the variance test
-#' 
+#'
 #' This function finds the smallest relative error still resulting in passing
 #' the chi-squared test as defined in the FOCUS kinetics report from 2006.
-#' 
+#'
 #' This function is used internally by \code{\link{summary.mkinfit}}.
-#' 
+#'
 #' @param fit an object of class \code{\link{mkinfit}}.
 #' @param alpha The confidence level chosen for the chi-squared test.
 #' @importFrom stats qchisq aggregate
@@ -25,43 +25,41 @@ if(getRversion() >= '2.15.1') utils::globalVariables(c("name", "value_mean"))
 #' \url{http://esdac.jrc.ec.europa.eu/projects/degradation-kinetics}
 #' @keywords manip
 #' @examples
-#' 
+#'
 #' SFO_SFO = mkinmod(parent = mkinsub("SFO", to = "m1"),
 #'                   m1 = mkinsub("SFO"),
 #'                   use_of_ff = "max")
-#' 
+#'
 #' fit_FOCUS_D = mkinfit(SFO_SFO, FOCUS_2006_D, quiet = TRUE)
 #' round(mkinerrmin(fit_FOCUS_D), 4)
 #' \dontrun{
 #'   fit_FOCUS_E = mkinfit(SFO_SFO, FOCUS_2006_E, quiet = TRUE)
 #'   round(mkinerrmin(fit_FOCUS_E), 4)
 #' }
-#' 
+#'
 #' @export
 mkinerrmin <- function(fit, alpha = 0.05)
 {
   parms.optim <- fit$par
 
   kinerrmin <- function(errdata, n.parms) {
-    means.mean <- mean(errdata$value_mean, na.rm = TRUE)
-    df = length(errdata$value_mean) - n.parms
+    means.mean <- mean(errdata$observed, na.rm = TRUE)
+    df = nrow(errdata) - n.parms
 
     err.min <- sqrt((1 / qchisq(1 - alpha, df)) *
-               sum((errdata$value_mean - errdata$value_pred)^2)/(means.mean^2))
+               sum((errdata$observed - errdata$predicted)^2)/(means.mean^2))
 
     return(list(err.min = err.min, n.optim = n.parms, df = df))
   }
 
-  means <- aggregate(value ~ time + name, data = fit$observed, mean, na.rm=TRUE)
-  errdata <- merge(means, fit$predicted, by = c("time", "name"),
-    suffixes = c("_mean", "_pred"))
-  errdata <- errdata[order(errdata$time, errdata$name), ]
+  errdata <- aggregate(cbind(observed, predicted) ~ time + variable, data = fit$data, mean, na.rm=TRUE)
+  errdata <- errdata[order(errdata$time, errdata$variable), ]
 
   # Remove values at time zero for variables whose value for state.ini is fixed,
   # as these will not have any effect in the optimization and should therefore not
   # be counted as degrees of freedom.
   fixed_initials = gsub("_0$", "", rownames(subset(fit$fixed, type = "state")))
-  errdata <- subset(errdata, !(time == 0 & name %in% fixed_initials))
+  errdata <- subset(errdata, !(time == 0 & variable %in% fixed_initials))
 
   n.optim.overall <- length(parms.optim) - length(fit$errparms)
 
@@ -73,7 +71,7 @@ mkinerrmin <- function(fit, alpha = 0.05)
   # The degrees of freedom are counted according to FOCUS kinetics (2011, p. 164)
   for (obs_var in fit$obs_vars)
   {
-    errdata.var <- subset(errdata, name == obs_var)
+    errdata.var <- subset(errdata, variable == obs_var)
 
     # Check if initial value is optimised
     n.initials.optim <- length(grep(paste(obs_var, ".*", "_0", sep=""), names(parms.optim)))
diff --git a/R/mkinfit.R b/R/mkinfit.R
index 6cfb88c0..c33afbf0 100644
--- a/R/mkinfit.R
+++ b/R/mkinfit.R
@@ -288,6 +288,10 @@ mkinfit <- function(mkinmod, observed,
     observed <- subset(observed, value != 0)
   }
 
+  # Sort observed values for efficient analytical predictions
+  observed$name <- ordered(observed$name, levels = obs_vars)
+  observed <- observed[order(observed$name, observed$time), ]
+
   # Obtain data for decline from maximum mean value if requested
   if (from_max_mean) {
     # This is only used for simple decline models
@@ -520,7 +524,7 @@ mkinfit <- function(mkinmod, observed,
   }
 
   # Unique outtimes for model solution.
-  outtimes = sort(unique(observed$time))
+  outtimes <- sort(unique(observed$time))
 
   # Define the objective function for optimisation, including (back)transformations
   cost_function <- function(P, trans = TRUE, OLS = FALSE, fixed_degparms = FALSE, fixed_errparms = FALSE, update_data = TRUE, ...)
@@ -578,47 +582,51 @@ mkinfit <- function(mkinmod, observed,
     }
 
     # Solve the system with current parameter values
-    out <- mkinpredict(mkinmod, parms,
-                       odeini, outtimes,
-                       solution_type = solution_type,
-                       use_compiled = use_compiled,
-                       method.ode = method.ode,
-                       atol = atol, rtol = rtol, ...)
-
-    out_long <- mkin_wide_to_long(out, time = "time")
-
-    # Surprisingly, the next line is the one taking the most time for one, two
-    # or three observed variables if we use compiled ODE models
-    # as evidenced by use of github:hadley/lineprof
-    cost_data <- merge(observed[c("name", "time", "value")], out_long,
-                         by = c("name", "time"), suffixes = c(".observed", ".predicted"))
+    if (solution_type == "analytical") {
+      observed$predicted <- mkinmod$deg_func(observed, odeini, parms)
+    } else {
+      out <- mkinpredict(mkinmod, parms,
+                         odeini, outtimes,
+                         solution_type = solution_type,
+                         use_compiled = use_compiled,
+                         method.ode = method.ode,
+                         atol = atol, rtol = rtol, ...)
+
+      out_long <- mkin_wide_to_long(out, time = "time")
+
+      out_merged <- merge(observed[c("name", "time")], out_long)
+      out_merged$name <- ordered(out_merged$name, levels = obs_vars)
+      out_merged <- out_merged[order(out_merged$name, out_merged$time), ]
+      observed$predicted <- out_merged$value
+    }
 
+    # Define standard deviation for each observation
     if (err_mod == "const") {
-      cost_data$std <- if (OLS) NA else cost_errparms["sigma"]
+      observed$std <- if (OLS) NA else cost_errparms["sigma"]
     }
     if (err_mod == "obs") {
-      std_names <- paste0("sigma_", cost_data$name)
-      cost_data$std <- cost_errparms[std_names]
+      std_names <- paste0("sigma_", observed$name)
+      observed$std <- cost_errparms[std_names]
     }
     if (err_mod == "tc") {
-      cost_data$std <- sqrt(cost_errparms["sigma_low"]^2 + cost_data$value.predicted^2 * cost_errparms["rsd_high"]^2)
+      observed$std <- sqrt(cost_errparms["sigma_low"]^2 + observed$predicted^2 * cost_errparms["rsd_high"]^2)
     }
 
+    # Calculate model cost
     if (OLS) {
       # Cost is the sum of squared residuals
-      cost <- with(cost_data, sum((value.observed - value.predicted)^2))
+      cost <- with(observed, sum((value - predicted)^2))
     } else {
       # Cost is the negative log-likelihood
-      cost <- - with(cost_data,
-        sum(dnorm(x = value.observed, mean = value.predicted, sd = std, log = TRUE)))
+      cost <- - with(observed,
+        sum(dnorm(x = value, mean = predicted, sd = std, log = TRUE)))
     }
 
     # We update the current cost and data during the optimisation, not
     # during hessian calculations
     if (update_data) {
 
-      assign("out_predicted", out_long, inherits = TRUE)
-      assign("current_data", cost_data, inherits = TRUE)
+      assign("current_data", observed, inherits = TRUE)
 
       if (cost < cost.current) {
         assign("cost.current", cost, inherits = TRUE)
@@ -718,7 +726,7 @@ mkinfit <- function(mkinmod, observed,
       degparms <- fit$par
 
       # Get the maximum likelihood estimate for sigma at the optimum parameter values
-      current_data$residual <- current_data$value.observed - current_data$value.predicted
+      current_data$residual <- current_data$value - current_data$predicted
       sigma_mle <- sqrt(sum(current_data$residual^2)/nrow(current_data))
 
       # Use that estimate for the constant variance, or as first guess if err_mod = "obs"
@@ -855,11 +863,7 @@ mkinfit <- function(mkinmod, observed,
   # We also need the model and a model name for summary and plotting
   fit$mkinmod <- mkinmod
   fit$mkinmod$name <- mkinmod_name
-
-  # We need data and predictions for summary and plotting
-  fit$observed <- observed
   fit$obs_vars <- obs_vars
-  fit$predicted <- out_predicted
 
   # Residual sum of squares as a function of the fitted parameters
   fit$rss <- function(P) cost_function(P, OLS = TRUE, update_data = FALSE)
@@ -886,15 +890,10 @@ mkinfit <- function(mkinmod, observed,
   fit$fixed$type = c(rep("state", length(state.ini.fixed)),
                      rep("deparm", length(parms.fixed)))
 
-  # Sort observed, predicted and residuals
-  current_data$name <- ordered(current_data$name, levels = obs_vars)
-
-  ordered_data <- current_data[order(current_data$name, current_data$time), ]
-
-  fit$data <- data.frame(time = ordered_data$time,
-                         variable = ordered_data$name,
-                         observed = ordered_data$value.observed,
-                         predicted = ordered_data$value.predicted)
+  fit$data <- data.frame(time = current_data$time,
+                         variable = current_data$name,
+                         observed = current_data$value,
+                         predicted = current_data$predicted)
 
   fit$data$residual <- fit$data$observed - fit$data$predicted
 
diff --git a/R/mkinmod.R b/R/mkinmod.R
index 21551861..7e9e9248 100644
--- a/R/mkinmod.R
+++ b/R/mkinmod.R
@@ -70,7 +70,7 @@
 #' @examples
 #'
 #' # Specify the SFO model (this is not needed any more, as we can now mkinfit("SFO", ...)
-#' SFO <- mkinmod(parent = list(type = "SFO"))
+#' SFO <- mkinmod(parent = mkinsub("SFO"))
 #'
 #' # One parent compound, one metabolite, both single first order
 #' SFO_SFO <- mkinmod(
diff --git a/R/mkinpredict.R b/R/mkinpredict.R
index 8c76ca05..75582fac 100644
--- a/R/mkinpredict.R
+++ b/R/mkinpredict.R
@@ -92,10 +92,15 @@
 #'       c(parent = 100, m1 = 0), seq(0, 20, by = 0.1),
 #'       solution_type = "analytical", use_compiled = FALSE)[201,])
 #' }
+#'     analytical = mkinpredict(SFO_SFO,
+#'       c(k_parent = 0.15, f_parent_to_m1 = 0.5, k_m1 = 0.01),
+#'       c(parent = 100, m1 = 0), seq(0, 20, by = 0.1),
+#'       solution_type = "analytical", use_compiled = FALSE)[201,]
 #'
 #' \dontrun{
 #'   # Predict from a fitted model
 #'   f <- mkinfit(SFO_SFO, FOCUS_2006_C, quiet = TRUE)
+#'   f <- mkinfit(SFO_SFO, FOCUS_2006_C, quiet = TRUE, solution_type = "analytical")
 #'   head(mkinpredict(f))
 #' }
 #'
@@ -131,8 +136,16 @@ mkinpredict.mkinmod <- function(x,
   }
 
   if (solution_type == "analytical") {
-    out <- x$deg_func(odeini = odeini,
-      odeparms = odeparms, outtimes = outtimes)
+    # This is clumsy, as we wanted fast analytical predictions for mkinfit
+    out <- data.frame(time = outtimes)
+    obs_vars <- names(x$spec)
+    pseudo_observed <-
+      data.frame(name = rep(obs_vars, each = length(outtimes)),
+      time = rep(outtimes, length(obs_vars)))
+    pseudo_observed$predicted <- x$deg_func(pseudo_observed, odeini, odeparms)
+    for (obs_var in obs_vars) {
+      out[obs_var] <- pseudo_observed[pseudo_observed$name == obs_var, "predicted"]
+    }
   }
 
   if (solution_type == "eigen") {
diff --git a/R/plot.mkinfit.R b/R/plot.mkinfit.R
index 4b5bfa3a..b2cb4890 100644
--- a/R/plot.mkinfit.R
+++ b/R/plot.mkinfit.R
@@ -69,6 +69,7 @@ if(getRversion() >= '2.15.1') utils::globalVariables(c("type", "variable", "obse
 #' \dontrun{
 #' SFO_SFO <- mkinmod(parent = mkinsub("SFO", "m1", full = "Parent"),
 #'                    m1 = mkinsub("SFO", full = "Metabolite M1" ))
+#' fit <- mkinfit(SFO_SFO, FOCUS_2006_D, quiet = TRUE)
 #' fit <- mkinfit(SFO_SFO, FOCUS_2006_D, quiet = TRUE, error_model = "tc")
 #' plot(fit)
 #' plot_res(fit)
-- 
cgit v1.2.3