path: root/CakeCost.R



## -----------------------------------------------------------------------------
## The model cost and residuals
## -----------------------------------------------------------------------------
# The CAKE R modules are based on mkin, 
# Call to approx is only performed if there are multiple non NA values
# which should prevent most of the crashes - Rob Nelson (Tessella)
#
# Modifications developed by Tessella Plc for Syngenta, Copyright (C) 2011 Syngenta
# Authors: Rob Nelson, Richard Smith
# Tessella Project Reference: 6245
#
# This program is free software: you can
# redistribute them and/or modify them under the
# terms of the GNU General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later
# version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along with
# this program. If not, see <http://www.gnu.org/licenses/># 
#
# $Id$

CakeCost <- function (model, obs, x = "time", y = NULL, err = NULL,
                     weight = "none", scaleVar = FALSE, cost = NULL, ...) {

  ## convert vector to matrix
  if (is.vector(obs)) {
    cn <- names(obs)
    obs   <- matrix(data = obs, nrow = 1)
    colnames(obs) <- cn
  }
  if (is.vector(model)) {
    cn <- names(model)
    model <- matrix(data=model, nrow = 1)
    colnames(model) <- cn
  }

  ##=============================================================================
  ## Observations
  ##=============================================================================

  ## The position of independent variable(s)
  ix <- 0
  if (! is.null(x))  {   # mapping required...
    ## For now multiple independent variables are not supported...
    if (length(x) > 1)
      stop ("multiple independent variables in 'obs' are not yet supported")

    if (! is.character(x))
      stop ("'x' should be the *name* of the column with the independent variable in 'obs' or NULL")
    ix  <- which(colnames(obs) %in% x)
    if (length(ix) != length(x))
      stop(paste("Independent variable column not found in observations", x))
  } else ix <- NULL

  ## The position of weighing values
  ierr <- 0
  if (! is.null(err)) {
    if (! is.character(err))
      stop ("'err' should be the *name* of the column with the error estimates in obs or NULL")
    ierr <- which(colnames(obs) == err)    # only one
    if (length(ierr) == 0)
      stop(paste("Column with error estimates not found in observations", err))
  }

  ## The dependent variables
  type <- 1           # data input type: type 2 is table format, type 1 is long format...

  if (!is.null(y)) {   # it is in table format; first column are names of observed data...

    Names    <- as.character(unique(obs[, 1]))  # Names of data sets, all data should be model variables...
    Ndat     <- length(Names)                   # Number of data sets
    ilist    <- 1:Ndat
    if (! is.character(y))
      stop ("'y' should be the *name* of the column with the values of the dependent variable in obs")
    iy  <- which(colnames(obs) == y)
    if (length(iy) == 0)
      stop(paste("Column with value of dependent variable not found in observations", y))
    type <- 2

  } else  {             # it is a matrix, variable names are column names
    Ndat     <- NCOL(obs)-1
    Names    <- colnames(obs)
    ilist    <- (1:NCOL(obs))        # column positions of the (dependent) observed variables
    exclude  <- ix                   # exclude columns that are not
    if (ierr > 0)
      exclude <- c(ix, ierr)          # exclude columns that are not
    if (length(exclude) > 0)
      ilist <- ilist[-exclude]
  }

  #================================
  # The model results
  #================================

  ModNames <- colnames(model)  # Names of model variables
  if (length(ix) > 1) {
    ixMod <- NULL

    for ( i in 1:length(ix)) {
      ix2 <- which(colnames(model) == x[i])
      if (length(ix2) == 0)
        stop(paste("Cannot calculate cost: independent variable not found in model output", x[i]))
      ixMod <- c(ixMod, ix2)
    }

  xMod     <- model[,ixMod]    # Independent variable, model
  } else if (length(ix) == 1) {
   ixMod    <- which(colnames(model) == x)
   if (length(ixMod) == 0)
     stop(paste("Cannot calculate cost: independent variable not found in model output", x))
   xMod     <- model[,ixMod]    # Independent variable, model
  }
  Residual <- NULL
  CostVar  <- NULL

  #================================
  # Compare model and data...
  #================================
  xDat <- 0
  iDat <- 1:nrow(obs)

  for (i in ilist) {   # for each observed variable ...
    ii <- which(ModNames == Names[i])
    if (length(ii) == 0) stop(paste("observed variable not found in model output", Names[i]))
    yMod <- model[,ii]
    if (type == 2)  {  # table format
      iDat <- which (obs[,1] == Names[i])
      if (length(ix) > 0) xDat <- obs[iDat, ix]
      obsdat <- obs[iDat, iy]
    } else {
      if (length(ix) > 0) xDat <- obs[, 1]
      obsdat <- obs[,i]
    }
    ii <- which(is.na(obsdat))
    if (length(ii) > 0) {
      xDat   <- xDat[-ii]
      obsdat <- obsdat[-ii]
    }

    # CAKE version - Added tests for multiple non-NA values 
    if (length(ix) > 0 && length(unique(xMod[!is.na(xMod)]))>1 && length(yMod[!is.na(yMod)])>1)
    {
      ModVar <- approx(xMod, yMod, xout = xDat)$y
    }
    else {
      cat("CakeCost Warning: Only one valid point - using mean (yMod was", yMod, ")\n")
      ModVar <- mean(yMod[!is.na(yMod)])
      obsdat <- mean(obsdat)
    }
    iex <- which(!is.na(ModVar))
    ModVar <- ModVar[iex]
    obsdat <- obsdat[iex]
    xDat   <- xDat[iex]
    if (ierr > 0) {
      Err <- obs[iDat, ierr]
      Err <- Err[iex]
    } else {
      if (weight == "std")
        Err <- sd(obsdat)
      else if (weight == "mean")
        Err <- mean(abs(obsdat))
      else if (weight == "none")
        Err <- 1
      else
       stop("error: do not recognize 'weight'; should be one of 'none', 'std', 'mean'")
    }
    if (any(is.na(Err)))
      stop(paste("error: cannot estimate weighing for observed variable: ", Names[i]))
    if (min(Err) <= 0)
      stop(paste("error: weighing for observed variable is 0 or negative:", Names[i]))
    if (scaleVar)
      Scale <- 1/length(obsdat)
    else Scale <- 1
    Res <- (ModVar- obsdat)
    res <- Res/Err
    resScaled <- res*Scale

    # print("===========================");
    # print("#Names[i]");
    # print(Names[i]);
    # print("xDat");
    # print(xDat);
    # print("obsdat");
    # print(obsdat);
    # print("ModVar");
    # print(ModVar);
    # print("1/Err");
    # print(1/Err);
    # print("Res");
    # print(Res);
    # print("res");
    # print(res);
    # print("===========================");
    
    Residual <- rbind(Residual,
                      data.frame(
                        name   = Names[i],
                        x      = xDat,
                        obs    = obsdat,
                        mod    = ModVar,
                        weight = 1/Err,
                        res.unweighted = Res,
                        res    = res))

    CostVar <- rbind(CostVar,
                  data.frame(
                    name           = Names[i],
                    scale          = Scale,
                    N              = length(Res),
                    SSR.unweighted = sum(Res^2),
                    SSR.unscaled   = sum(res^2),
                    SSR            = sum(resScaled^2)))
                    
    # print("************************")
    # print(CostVar)
    # print("************************")
  }  # end loop over all observed variables

  ## SSR
  Cost  <- sum(CostVar$SSR * CostVar$scale)
  
  Lprob <- -sum(log(pmax(0, dnorm(Residual$mod, Residual$obs, Err)))) # avoid log of negative values
  #Lprob <- -sum(log(pmax(.Machine$double.xmin, dnorm(Residual$mod, Residual$obs, Err)))) #avoid log(0)

  if (! is.null(cost)) {
    Cost     <- Cost + cost$model
    CostVar  <- rbind(CostVar, cost$var)
    Residual <- rbind(Residual, cost$residuals)
    Lprob    <- Lprob + cost$minlogp
  }
  out <- list(model = Cost, cost = Cost, minlogp = Lprob, var = CostVar, residuals = Residual)
  class(out) <- "modCost"
  return(out)
}

## -----------------------------------------------------------------------------
## S3 methods of modCost
## -----------------------------------------------------------------------------

plot.modCost<- function(x, legpos="topleft", ...) {
  nvar <- nrow(x$var)

  dots <- list(...)

  dots$xlab <- if(is.null(dots$xlab)) "x" else dots$xlab
  dots$ylab <- if(is.null(dots$ylab)) "weighted residuals" else dots$ylab
  DotsPch   <- if(is.null(dots$pch)) (16:24) else dots$pch
  dots$pch  <- if(is.null(dots$pch)) (16:24)[x$residuals$name] else dots$pch[x$residuals$name]
  DotsCol   <- if(is.null(dots$col)) (1:nvar) else dots$col
  dots$col  <- if(is.null(dots$col)) (1:nvar)[x$residuals$name] else dots$col[x$residuals$name]

  do.call("plot", c(alist(x$residuals$x, x$residuals$res), dots))

#  plot(x$residuals$x, x$residuals$res, xlab="x", ylab="weighted residuals",
#     pch=c(16:24)[x$residuals$name],col=c(1:nvar)[x$residuals$name],...)

  if (! is.na(legpos))
    legend(legpos, legend = x$var$name, col = DotsCol, pch = DotsPch)
}

## -----------------------------------------------------------------------------
## Internal cost function for optimisers
## -----------------------------------------------------------------------------
# Cost function. The returned structure must have $model
# Called from the middle of IrlsFit and OlsFit
# We need to preserve state between calls so make a closure
CakeInternalCostFunctions <- function(mkinmod, state.ini.optim, state.ini.optim.boxnames, 
                                    state.ini.fixed, parms.fixed, observed, mkindiff, scaleVar, 
                                    quiet, plot=FALSE, atol=1e-6){
    cost.old <- 1e+100
    calls <- 0
    out_predicted <- NA
	
	get.predicted <- function(){ out_predicted }
	
	get.best.cost <- function(){ cost.old }
	reset.best.cost <- function() { cost.old<<-1e+100 }
	
	get.calls <- function(){ calls }
	set.calls <- function(newcalls){ calls <<- newcalls }
	
	set.error<-function(err) { observed$err <<- err }
	
	cost <- function(P) {
		assign("calls", calls + 1, inherits = TRUE)
		print(P)
		if (length(state.ini.optim) > 0) {
			odeini <- c(P[1:length(state.ini.optim)], state.ini.fixed)
			names(odeini) <- c(state.ini.optim.boxnames, names(state.ini.fixed))
		}
		else odeini <- state.ini.fixed
		odeparms <- c(P[(length(state.ini.optim) + 1):length(P)], 
			parms.fixed)
		# Ensure initial state is at time 0
		outtimes = unique(c(0,observed$time))
		out <- ode(y = odeini, times = outtimes, func = mkindiff, 
			parms = odeparms, atol=atol)
		out_transformed <- data.frame(time = out[, "time"])
		for (var in names(mkinmod$map)) {
			if (length(mkinmod$map[[var]]) == 1) {
				out_transformed[var] <- out[, var]
			}
			else {
				out_transformed[var] <- rowSums(out[, mkinmod$map[[var]]])
			}
		}
		assign("out_predicted", out_transformed, inherits = TRUE)
		mC <- CakeCost(out_transformed, observed, y = "value", 
			err = 'err', scaleVar = scaleVar)
		mC$penalties <- CakePenalties(odeparms, out_transformed, observed)
		mC$model <- mC$cost + mC$penalties;
		if (mC$model < cost.old) {
			if (!quiet) 
				cat("Model cost at call ", calls, ": m", mC$cost, 'p:', mC$penalties, 'o:', mC$model,
				  "\n")
			if (plot) {
				outtimes_plot = seq(min(observed$time), max(observed$time), 
				  length.out = 100)
				out_plot <- ode(y = odeini, times = outtimes_plot, 
				  func = mkindiff, parms = odeparms, atol=atol)
				out_transformed_plot <- data.frame(time = out_plot[, 
				  "time"])
				for (var in names(mkinmod$map)) {
				  if (length(mkinmod$map[[var]]) == 1) {
					out_transformed_plot[var] <- out_plot[, var]
				  }
				  else {
					out_transformed_plot[var] <- rowSums(out_plot[, 
					  mkinmod$map[[var]]])
				  }
				}
				plot(0, type = "n", xlim = range(observed$time), 
				  ylim = range(observed$value, na.rm = TRUE), 
				  xlab = "Time", ylab = "Observed")
				col_obs <- pch_obs <- 1:length(obs_vars)
				names(col_obs) <- names(pch_obs) <- obs_vars
				for (obs_var in obs_vars) {
				  points(subset(observed, name == obs_var, c(time, 
					value)), pch = pch_obs[obs_var], col = col_obs[obs_var])
				}
				matlines(out_transformed_plot$time, out_transformed_plot[-1])
				legend("topright", inset = c(0.05, 0.05), legend = obs_vars, 
				  col = col_obs, pch = pch_obs, lty = 1:length(pch_obs))
			}
			assign("cost.old", mC$model, inherits = TRUE)
		}
		# HACK to make nls.lm respect the penalty, as it just uses residuals and ignores the cost
		if(mC$penalties > 0){
			#cat("Penalty adjustment: ",mC$penalties)
			mC$residuals$res <- mC$residuals$res + mC$penalties / length(mC$residuals$res)
		}
		
		return(mC)
	}
		

	list(cost=cost, 
		get.predicted=get.predicted,
		get.calls=get.calls, set.calls=set.calls,
		get.best.cost=get.best.cost, reset.best.cost=reset.best.cost,
		set.error=set.error
	)
}