## ----------------------------------------------------------------------------- ## The model cost and residuals ## ----------------------------------------------------------------------------- # Some of the CAKE R modules are based on mkin. # Call to approx is only performed if there are multiple non NA values # which should prevent most of the crashes - Rob Nelson (Tessella) # # Modifications developed by Hybrid Intelligence (formerly Tessella), part of # Capgemini Engineering, for Syngenta, Copyright (C) 2011-2022 Syngenta # Tessella Project Reference: 6245, 7247, 8361, 7414, 10091 # # The CAKE R modules are free software: you can # redistribute them and/or modify them under the # terms of the GNU General Public License as published by the Free Software # Foundation, either version 3 of the License, or (at your option) any later # version. # # This program is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more # details. # # You should have received a copy of the GNU General Public License along with # this program. If not, see CakeCost <- function (model, obs, x = "time", y = NULL, err = NULL, weight = "none", scaleVar = FALSE, cost = NULL, ...) { ## Sometimes a fit is encountered for which the model is unable to calculate ## values on the full range of observed values. In this case, we will return ## an infinite cost to ensure this value is not selected. modelCalculatedFully <- all(unlist(obs[x]) %in% unlist(model[x])) ## convert vector to matrix if (is.vector(obs)) { cn <- names(obs) obs <- matrix(data = obs, nrow = 1) colnames(obs) <- cn } if (is.vector(model)) { cn <- names(model) model <- matrix(data=model, nrow = 1) colnames(model) <- cn } ##============================================================================= ## Observations ##============================================================================= ## The position of independent variable(s) ix <- 0 if (! is.null(x)) { # mapping required... ## For now multiple independent variables are not supported... if (length(x) > 1) stop ("multiple independent variables in 'obs' are not yet supported") if (! is.character(x)) stop ("'x' should be the *name* of the column with the independent variable in 'obs' or NULL") ix <- which(colnames(obs) %in% x) if (length(ix) != length(x)) stop(paste("Independent variable column not found in observations", x)) } else ix <- NULL ## The position of weighing values ierr <- 0 if (! is.null(err)) { if (! is.character(err)) stop ("'err' should be the *name* of the column with the error estimates in obs or NULL") ierr <- which(colnames(obs) == err) # only one if (length(ierr) == 0) stop(paste("Column with error estimates not found in observations", err)) } ## The dependent variables type <- 1 # data input type: type 2 is table format, type 1 is long format... if (!is.null(y)) { # it is in table format; first column are names of observed data... Names <- as.character(unique(obs[, 1])) # Names of data sets, all data should be model variables... Ndat <- length(Names) # Number of data sets ilist <- 1:Ndat if (! is.character(y)) stop ("'y' should be the *name* of the column with the values of the dependent variable in obs") iy <- which(colnames(obs) == y) if (length(iy) == 0) stop(paste("Column with value of dependent variable not found in observations", y)) type <- 2 } else { # it is a matrix, variable names are column names Ndat <- NCOL(obs)-1 Names <- colnames(obs) ilist <- (1:NCOL(obs)) # column positions of the (dependent) observed variables exclude <- ix # exclude columns that are not if (ierr > 0) exclude <- c(ix, ierr) # exclude columns that are not if (length(exclude) > 0) ilist <- ilist[-exclude] } #================================ # The model results #================================ ModNames <- colnames(model) # Names of model variables if (length(ix) > 1) { ixMod <- NULL for ( i in 1:length(ix)) { ix2 <- which(colnames(model) == x[i]) if (length(ix2) == 0) stop(paste("Cannot calculate cost: independent variable not found in model output", x[i])) ixMod <- c(ixMod, ix2) } xMod <- model[,ixMod] # Independent variable, model } else if (length(ix) == 1) { ixMod <- which(colnames(model) == x) if (length(ixMod) == 0) stop(paste("Cannot calculate cost: independent variable not found in model output", x)) xMod <- model[,ixMod] # Independent variable, model } Residual <- NULL CostVar <- NULL #================================ # Compare model and data... #================================ xDat <- 0 iDat <- 1:nrow(obs) for (i in ilist) { # for each observed variable ... ii <- which(ModNames == Names[i]) if (length(ii) == 0) stop(paste("observed variable not found in model output", Names[i])) yMod <- model[, ii] if (type == 2) { # table format iDat <- which (obs[,1] == Names[i]) if (length(ix) > 0) xDat <- obs[iDat, ix] obsdat <- obs[iDat, iy] } else { if (length(ix) > 0) xDat <- obs[, 1] obsdat <- obs[,i] } ii <- which(is.na(obsdat)) if (length(ii) > 0) { xDat <- xDat[-ii] obsdat <- obsdat[-ii] } # CAKE version - Added tests for multiple non-NA values if (length(ix) > 0 && length(unique(xMod[!is.na(xMod)]))>1 && length(yMod[!is.na(yMod)])>1) { ModVar <- approx(xMod, yMod, xout = xDat)$y } else { cat("CakeCost Warning: Only one valid point - using mean (yMod was", yMod, ")\n") ModVar <- mean(yMod[!is.na(yMod)]) obsdat <- mean(obsdat) } iex <- which(!is.na(ModVar)) ModVar <- ModVar[iex] obsdat <- obsdat[iex] xDat <- xDat[iex] if (ierr > 0) { Err <- obs[iDat, ierr] Err <- Err[iex] } else { if (weight == "std") Err <- sd(obsdat) else if (weight == "mean") Err <- mean(abs(obsdat)) else if (weight == "none") Err <- 1 else stop("error: do not recognize 'weight'; should be one of 'none', 'std', 'mean'") } if (any(is.na(Err))) stop(paste("error: cannot estimate weighing for observed variable: ", Names[i])) if (min(Err) <= 0) stop(paste("error: weighing for observed variable is 0 or negative:", Names[i])) if (scaleVar) Scale <- 1/length(obsdat) else Scale <- 1 if(!modelCalculatedFully){ # In this case, the model is unable to predict on the full range, set cost to Inf xDat <- 0 obsdat <- 0 ModVar <- Inf Res <- Inf res <- Inf weight_for_residual <- Inf } else{ Res <- (ModVar- obsdat) res <- Res / Err weight_for_residual <- 1 / Err } resScaled <- res * Scale Residual <- rbind(Residual, data.frame( name = Names[i], x = xDat, obs = obsdat, mod = ModVar, weight = weight_for_residual, res.unweighted = Res, res = res)) CostVar <- rbind(CostVar, data.frame( name = Names[i], scale = Scale, N = length(Res), SSR.unweighted = sum(Res^2), SSR.unscaled = sum(res^2), SSR = sum(resScaled^2))) } # end loop over all observed variables ## SSR Cost <- sum(CostVar$SSR * CostVar$scale) Lprob <- -sum(log(pmax(0, dnorm(Residual$mod, Residual$obs, Err)))) # avoid log of negative values #Lprob <- -sum(log(pmax(.Machine$double.xmin, dnorm(Residual$mod, Residual$obs, Err)))) #avoid log(0) if (! is.null(cost)) { Cost <- Cost + cost$model CostVar <- rbind(CostVar, cost$var) Residual <- rbind(Residual, cost$residuals) Lprob <- Lprob + cost$minlogp } out <- list(model = Cost, cost = Cost, minlogp = Lprob, var = CostVar, residuals = Residual) class(out) <- "modCost" return(out) } ## ----------------------------------------------------------------------------- ## S3 methods of modCost ## ----------------------------------------------------------------------------- plot.modCost<- function(x, legpos="topleft", ...) { nvar <- nrow(x$var) dots <- list(...) dots$xlab <- if(is.null(dots$xlab)) "x" else dots$xlab dots$ylab <- if(is.null(dots$ylab)) "weighted residuals" else dots$ylab DotsPch <- if(is.null(dots$pch)) (16:24) else dots$pch dots$pch <- if(is.null(dots$pch)) (16:24)[x$residuals$name] else dots$pch[x$residuals$name] DotsCol <- if(is.null(dots$col)) (1:nvar) else dots$col dots$col <- if(is.null(dots$col)) (1:nvar)[x$residuals$name] else dots$col[x$residuals$name] do.call("plot", c(alist(x$residuals$x, x$residuals$res), dots)) # plot(x$residuals$x, x$residuals$res, xlab="x", ylab="weighted residuals", # pch=c(16:24)[x$residuals$name],col=c(1:nvar)[x$residuals$name],...) if (! is.na(legpos)) legend(legpos, legend = x$var$name, col = DotsCol, pch = DotsPch) } ## ----------------------------------------------------------------------------- ## Internal cost function for optimisers ## ----------------------------------------------------------------------------- # Cost function. The returned structure must have $model # We need to preserve state between calls so make a closure CakeInternalCostFunctions <- function(mkinmod, state.ini.optim, state.ini.optim.boxnames, state.ini.fixed, parms.fixed, observed, mkindiff, quiet, atol=1e-6, solution="deSolve", err="err"){ cost.old <- 1e+100 calls <- 0 out_predicted <- NA get.predicted <- function(){ out_predicted } get.best.cost <- function(){ cost.old } reset.best.cost <- function() { cost.old<<-1e+100 } get.calls <- function(){ calls } set.calls <- function(newcalls){ calls <<- newcalls } set.error<-function(err) { observed$err <<- err } # The called cost function cost <- function(P) { assign("calls", calls + 1, inherits = TRUE) print(P) if (length(state.ini.optim) > 0) { odeini <- c(P[1:length(state.ini.optim)], state.ini.fixed) names(odeini) <- c(state.ini.optim.boxnames, names(state.ini.fixed)) } else { odeini <- state.ini.fixed } odeparms <- c(P[(length(state.ini.optim) + 1):length(P)], parms.fixed) # Ensure initial state is at time 0 outtimes = unique(c(0,observed$time)) odeini <- AdjustOdeInitialValues(odeini, mkinmod, odeparms) if (solution == "analytical") { evalparse <- function(string) { eval(parse(text=string), as.list(c(odeparms, odeini))) } parent.type = names(mkinmod$map[[1]])[1] parent.name = names(mkinmod$diffs)[[1]] o <- switch(parent.type, SFO = SFO.solution(outtimes, evalparse(parent.name), evalparse(paste("k", parent.name, sep="_"))), FOMC = FOMC.solution(outtimes, evalparse(parent.name), evalparse("alpha"), evalparse("beta")), DFOP = DFOP.solution(outtimes, evalparse(parent.name), evalparse(paste("k1", parent.name, sep="_")), evalparse(paste("k2", parent.name, sep="_")), evalparse(paste("g", parent.name, sep="_"))), HS = HS.solution(outtimes, evalparse(parent.name), evalparse("k1"), evalparse("k2"), evalparse("tb")), IORE = IORE.solution(outtimes, evalparse(parent.name), evalparse(paste("k", parent.name, sep="_")), evalparse("N"))) out <- cbind(outtimes, o) dimnames(out) <- list(outtimes, c("time", sub("_free", "", parent.name))) } if (solution == "deSolve") { out <- ode(y = odeini, times = outtimes, func = mkindiff, parms = odeparms, atol = atol) } out_transformed <- PostProcessOdeOutput(out, mkinmod, atol) assign("out_predicted", out_transformed, inherits = TRUE) mC <- CakeCost(out_transformed, observed, y = "value", err = err, scaleVar = FALSE) mC$penalties <- CakePenalties(odeparms, out_transformed, observed) mC$model <- mC$cost + mC$penalties if (mC$model < cost.old) { if (!quiet) { cat("Model cost at call ", calls, ": m", mC$cost, 'p:', mC$penalties, 'o:', mC$model, "\n") } assign("cost.old", mC$model, inherits = TRUE) } # HACK to make nls.lm respect the penalty, as it just uses residuals and ignores the cost if(mC$penalties > 0){ mC$residuals$res <- mC$residuals$res + (sign(mC$residuals$res) * mC$penalties / length(mC$residuals$res)) } return(mC) } list(cost=cost, get.predicted=get.predicted, get.calls=get.calls, set.calls=set.calls, get.best.cost=get.best.cost, reset.best.cost=reset.best.cost, set.error=set.error ) }