From 280d36230052de4f94e384648c1283031fbc9840 Mon Sep 17 00:00:00 2001 From: Johannes Ranke Date: Tue, 17 Jul 2018 17:29:14 +0200 Subject: Fix inverse predictions for replicate measurements For details, see NEWS.md --- vignettes/chemCal.R | 39 +++++-------- vignettes/chemCal.Rmd | 97 +++++++++++++++++++++------------ vignettes/chemCal.html | 57 ++++++++++++------- vignettes/figure/unnamed-chunk-1-1.pdf | Bin 12114 -> 0 bytes vignettes/figure/unnamed-chunk-2-1.pdf | Bin 5027 -> 0 bytes vignettes/references.bib | 35 ++++++++++++ vignettes/refs.bib | 7 --- 7 files changed, 148 insertions(+), 87 deletions(-) delete mode 100644 vignettes/figure/unnamed-chunk-1-1.pdf delete mode 100644 vignettes/figure/unnamed-chunk-2-1.pdf create mode 100644 vignettes/references.bib delete mode 100644 vignettes/refs.bib (limited to 'vignettes') diff --git a/vignettes/chemCal.R b/vignettes/chemCal.R index 701db7b..d9015e9 100644 --- a/vignettes/chemCal.R +++ b/vignettes/chemCal.R @@ -1,36 +1,23 @@ -### R code from vignette source '/home/jranke/git/chemCal/vignettes/chemCal.Rnw' - -################################################### -### code chunk number 1: chemCal.Rnw:38-42 -################################################### +## ------------------------------------------------------------------------ library(chemCal) -data(massart97ex3) m0 <- lm(y ~ x, data = massart97ex3) calplot(m0) +## ------------------------------------------------------------------------ +plot(m0, which=3) -################################################### -### code chunk number 2: chemCal.Rnw:49-50 -################################################### -plot(m0,which=3) - +## ---- message = FALSE, echo = TRUE--------------------------------------- +weights <- with(massart97ex3, { + yx <- split(y, x) + ybar <- sapply(yx, mean) + s <- round(sapply(yx, sd), digits = 2) + w <- round(1 / (s^2), digits = 3) +}) +massart97ex3.means <- aggregate(y ~ x, massart97ex3, mean) 
-################################################### -### code chunk number 3: chemCal.Rnw:56-63 -################################################### -attach(massart97ex3) -yx <- split(y, x) -ybar <- sapply(yx, mean) -s <- round(sapply(yx, sd), digits = 2) -w <- round(1 / (s^2), digits = 3) -weights <- w[factor(x)] -m <- lm(y ~ x, w = weights) +m <- lm(y ~ x, w = weights, data = massart97ex3.means) - -################################################### -### code chunk number 4: chemCal.Rnw:69-71 -################################################### +## ------------------------------------------------------------------------ inverse.predict(m, 15, ws=1.67) inverse.predict(m, 90, ws = 0.145) - diff --git a/vignettes/chemCal.Rmd b/vignettes/chemCal.Rmd index 2515abb..ccbbdc7 100644 --- a/vignettes/chemCal.Rmd +++ b/vignettes/chemCal.Rmd @@ -8,19 +8,14 @@ output: toc_float: true code_folding: show fig_retina: null -bibliography: refs.bib +bibliography: references.bib vignette: > %\VignetteEngine{knitr::rmarkdown} %\VignetteIndexEntry{Introduction to chemCal} + %\VignetteEncoding{UTF-8} --- -[Wissenschaftlicher Berater, Kronacher Str. 12, 79639 Grenzach-Wyhlen, Germany](http://www.jrwb.de)
- -```{r, include = FALSE} -require(knitr) -opts_chunk$set(engine='R', tidy=FALSE) -``` -# Basic calibration functions for analytical chemistry +# Basic calibration functions The `chemCal` package was first designed in the course of a lecture and lab course on "Analytics of Organic Trace Contaminants" at the University of Bremen @@ -28,13 +23,17 @@ from October to December 2004. In the fall 2005, an email exchange with Ron Wehrens led to the belief that it would be desirable to implement the inverse prediction method given in @massart97 since it also covers the case of weighted regression. Studies of the IUPAC orange book and of DIN 32645 -as well as publications by Currie and the Analytical Method Committee of the -Royal Society of Chemistry and a nice paper by Castillo and Castells provided -further understanding of the matter. - -At the moment, the package consists of four functions, working on univariate -linear models of class `lm` or `rlm`, plus two datasets for -validation. +(equivalent to ISO 11843), publications by @currie97 and the Analytical +Method Committee of the Royal Society of Chemistry [@amc89] and a nice paper by +Castells and Castillo [@castells00] provided some further understanding of the matter. + +At the moment, the package consists of four functions +([calplot](https://pkgdown.jrwb.de/chemCal/reference/calplot.lm.html), +[lod](https://pkgdown.jrwb.de/chemCal/reference/lod.html), +[loq](https://pkgdown.jrwb.de/chemCal/reference/loq.html) and +[inverse.predict](https://pkgdown.jrwb.de/chemCal/reference/inverse.predict.html)), +working on univariate linear models of class `lm` or `rlm`, plus several +datasets for validation. A [bug report](http://bugs.r-project.org/bugzilla3/show_bug.cgi?id=8877) and the following e-mail exchange on the r-devel mailing list about @@ -42,8 +41,21 @@ prediction intervals from weighted regression entailed some further studies on this subject. 
However, I did not encounter any proof or explanation of the formula cited below yet, so I can't really confirm that Massart's method is correct. +In fact, in June 2018 I was made aware of the fact that the inverse prediction +method implemented in chemCal version 0.1.37 and before did not take the +variance of replicate calibration standards about their means into account, nor +the number of replicates when calculating the degrees of freedom. Thanks to +PhD student Anna Burniol Figols for reporting this issue! + +As a consequence, I rewrote `inverse.predict` not to automatically work with +the mean responses for each calibration standard any more. The example +calculations from @massart97 can still be reproduced when the regression model +is calculated using the means of the calibration data as shown below. + +# Usage + When calibrating an analytical method, the first task is to generate a suitable -model. If we want to use the `chemCal` functions, we will have to restrict +model. If we want to use the `chemCal` functions, we have to restrict ourselves to univariate, possibly weighted, linear regression so far. Once such a model has been created, the calibration can be graphically @@ -64,16 +76,21 @@ plot(m0, which=3) ``` Therefore, in Example 8 in @massart97, weighted regression -is proposed which can be reproduced by +is proposed which can be reproduced by the following code. +Note that we are building the model on the mean values for +each standard in order to be able to reproduce the results +given in the book with the current version of chemCal. 
```{r, message = FALSE, echo = TRUE} -attach(massart97ex3) -yx <- split(y, x) -ybar <- sapply(yx, mean) -s <- round(sapply(yx, sd), digits = 2) -w <- round(1 / (s^2), digits = 3) -weights <- w[factor(x)] -m <- lm(y ~ x, w = weights) +weights <- with(massart97ex3, { + yx <- split(y, x) + ybar <- sapply(yx, mean) + s <- round(sapply(yx, sd), digits = 2) + w <- round(1 / (s^2), digits = 3) +}) +massart97ex3.means <- aggregate(y ~ x, massart97ex3, mean) + +m <- lm(y ~ x, w = weights, data = massart97ex3.means) ``` If we now want to predict a new x value from measured y values, @@ -88,7 +105,7 @@ The weight `ws` assigned to the measured y value has to be given by the user in the case of weighted regression, or alternatively, the approximate variance `var.s` at this location. -# Some theory for `inverse.predict` +# Background for `inverse.predict` Equation 8.28 in @massart97 gives a general equation for predicting the standard error $s_{\hat{x_s}}$ for an $x$ value predicted from measurements of @@ -107,20 +124,32 @@ with s_e = \sqrt{ \frac{\sum w_i (y_i - \hat{y_i})^2}{n - 2}} \end{equation} -where $w_i$ is the weight for calibration standard $i$, $y_i$ is the $y$ -value observed for standard $i$, $\hat{y_i}$ is the estimated value for -standard $i$, $n$ is the number of calibration samples, $w_s$ is the weight -attributed to the sample $s$, $m$ is the number of replicate measurements of -sample $s$, $\bar{y_s}$ is the mean response for the sample, -$\bar{y_w} = \frac{\sum{w_i y_i}}{\sum{w_i}}$ is the weighted mean of responses -$y_i$, and $x_i$ is the given $x$ value for standard $i$. +In chemCal version before 0.2, I interpreted $w_i$ to be the weight for +calibration standard $i$, $y_i$ to be the mean value observed for standard $i$, +and $n$ to be the number of calibration standards. With this implementation +I was able to reproduce the examples given in the book. 
However, as noted above, +I was made aware of the fact that this way of calculation does not take the +variation of the y values about the means into account. Furthermore, I noticed +that for the case of unweighted linear calibration with replicate standards, +`inverse.predict` produced different results than `calibrate` from the +`investr` package when using the Wald method. + +Both issues are now addressed in chemCal starting from version 0.2.1. Here, +$y_i$ is calibration measurement $i$, $\hat{y_i}$ is the estimated value for +calibration measurement $i$ and $n$ is the total number of calibration +measurements. + +$w_s$ is the weight attributed to the sample $s$, $m$ is the number of +replicate measurements of sample $s$, $\bar{y_s}$ is the mean response for the +sample, $\bar{y_w} = \frac{\sum{w_i y_i}}{\sum{w_i}}$ is the weighted mean of +responses $y_i$, and $x_i$ is the given $x$ value for standard $i$. The weight $w_s$ for the sample should be estimated or calculated in accordance to the weights used in the linear regression. -I adjusted the above equation in order to be able to take a different +I had also adjusted the above equation in order to be able to take a different precisions in standards and samples into account. In analogy to Equation 8.26 -from \cite{massart97} we get +from \cite{massart97} I am using \begin{equation} s_{\hat{x_s}} = \frac{1}{b_1} \sqrt{\frac{{s_s}^2}{w_s m} + diff --git a/vignettes/chemCal.html b/vignettes/chemCal.html index 222ca8f..5779746 100644 --- a/vignettes/chemCal.html +++ b/vignettes/chemCal.html @@ -11,7 +11,7 @@ - + Introduction to chemCal @@ -236,18 +236,22 @@ div.tocify {

Introduction to chemCal

Johannes Ranke

-

2018-07-05

+

2018-07-17

-

Wissenschaftlicher Berater, Kronacher Str. 12, 79639 Grenzach-Wyhlen, Germany

-
-

Basic calibration functions for analytical chemistry

-

The chemCal package was first designed in the course of a lecture and lab course on “Analytics of Organic Trace Contaminants” at the University of Bremen from October to December 2004. In the fall 2005, an email exchange with Ron Wehrens led to the belief that it would be desirable to implement the inverse prediction method given in Massart et al. (1997) since it also covers the case of weighted regression. Studies of the IUPAC orange book and of DIN 32645 as well as publications by Currie and the Analytical Method Committee of the Royal Society of Chemistry and a nice paper by Castillo and Castells provided further understanding of the matter.

-

At the moment, the package consists of four functions, working on univariate linear models of class lm or rlm, plus two datasets for validation.

+
+

Basic calibration functions

+

The chemCal package was first designed in the course of a lecture and lab course on “Analytics of Organic Trace Contaminants” at the University of Bremen from October to December 2004. In the fall 2005, an email exchange with Ron Wehrens led to the belief that it would be desirable to implement the inverse prediction method given in Massart et al. (1997) since it also covers the case of weighted regression. Studies of the IUPAC orange book and of DIN 32645 (equivalent to ISO 11843), publications by Currie (1997) and the Analytical Method Committee of the Royal Society of Chemistry (Analytical Methods Committee 1989) and a nice paper by Castells and Castillo (Castells and Castillo 2000) provided some further understanding of the matter.

+

At the moment, the package consists of four functions (calplot, lod, loq and inverse.predict), working on univariate linear models of class lm or rlm, plus several datasets for validation.

A bug report and the following e-mail exchange on the r-devel mailing list about prediction intervals from weighted regression entailed some further studies on this subject. However, I have not yet encountered any proof or explanation of the formula cited below, so I can’t really confirm that Massart’s method is correct.

-

When calibrating an analytical method, the first task is to generate a suitable model. If we want to use the chemCal functions, we will have to restrict ourselves to univariate, possibly weighted, linear regression so far.

+

In fact, in June 2018 I was made aware of the fact that the inverse prediction method implemented in chemCal version 0.1.37 and before did not take the variance of replicate calibration standards about their means into account, nor the number of replicates when calculating the degrees of freedom. Thanks to PhD student Anna Burniol Figols for reporting this issue!

+

As a consequence, I rewrote inverse.predict so that it no longer automatically works with the mean responses for each calibration standard. The example calculations from Massart et al. (1997) can still be reproduced when the regression model is calculated using the means of the calibration data as shown below.

+
+
+

Usage

+

When calibrating an analytical method, the first task is to generate a suitable model. If we want to use the chemCal functions, we have to restrict ourselves to univariate, possibly weighted, linear regression so far.

Once such a model has been created, the calibration can be graphically shown by using the calplot function:

library(chemCal)
 m0 <- lm(y ~ x, data = massart97ex3)
@@ -256,14 +260,16 @@ calplot(m0)

As we can see, the scatter increases with increasing x. This is also illustrated by one of the diagnostic plots for linear models provided by R:

plot(m0, which=3)

-

Therefore, in Example 8 in Massart et al. (1997), weighted regression is proposed which can be reproduced by

-
attach(massart97ex3)
-yx <- split(y, x)
-ybar <- sapply(yx, mean)
-s <- round(sapply(yx, sd), digits = 2)
-w <- round(1 / (s^2), digits = 3)
-weights <- w[factor(x)]
-m <- lm(y ~ x, w = weights)
+

Therefore, in Example 8 in Massart et al. (1997), weighted regression is proposed which can be reproduced by the following code. Note that we are building the model on the mean values for each standard in order to be able to reproduce the results given in the book with the current version of chemCal.

+
weights <- with(massart97ex3, {
+  yx <- split(y, x)
+  ybar <- sapply(yx, mean)
+  s <- round(sapply(yx, sd), digits = 2)
+  w <- round(1 / (s^2), digits = 3)
+})
+massart97ex3.means <- aggregate(y ~ x, massart97ex3, mean)
+
+m <- lm(y ~ x, w = weights, data = massart97ex3.means)

If we now want to predict a new x value from measured y values, we use the inverse.predict function:

inverse.predict(m, 15, ws=1.67)
## $Prediction
@@ -291,8 +297,8 @@ m <- lm(y ~ x, w = weights)
## [1] 36.20523 51.91526

The weight ws assigned to the measured y value has to be given by the user in the case of weighted regression, or alternatively, the approximate variance var.s at this location.

-
-

Some theory for inverse.predict

+
+

Background for inverse.predict

Equation 8.28 in Massart et al. (1997) gives a general equation for predicting the standard error \(s_{\hat{x_s}}\) for an \(x\) value predicted from measurements of \(y\) according to the linear calibration function \(y = b_0 + b_1 \cdot x\):

\[\begin{equation} s_{\hat{x_s}} = \frac{s_e}{b_1} \sqrt{\frac{1}{w_s m} + \frac{1}{\sum{w_i}} + @@ -304,9 +310,11 @@ s_{\hat{x_s}} = \frac{s_e}{b_1} \sqrt{\frac{1}{w_s m} + \frac{1}{\sum{w_i}} + \[\begin{equation} s_e = \sqrt{ \frac{\sum w_i (y_i - \hat{y_i})^2}{n - 2}} \end{equation}\] -

where \(w_i\) is the weight for calibration standard \(i\), \(y_i\) is the mean \(y\) value (!) observed for standard \(i\), \(\hat{y_i}\) is the estimated value for standard \(i\), \(n\) is the number calibration standards, \(w_s\) is the weight attributed to the sample \(s\), \(m\) is the number of replicate measurements of sample \(s\), \(\bar{y_s}\) is the mean response for the sample, \(\bar{y_w} = \frac{\sum{w_i y_i}}{\sum{w_i}}\) is the weighted mean of responses \(y_i\), and \(x_i\) is the given \(x\) value for standard \(i\).

+

In chemCal versions before 0.2, I interpreted \(w_i\) to be the weight for calibration standard \(i\), \(y_i\) to be the mean value observed for standard \(i\), and \(n\) to be the number of calibration standards. With this implementation I was able to reproduce the examples given in the book. However, as noted above, I was made aware of the fact that this way of calculation does not take the variation of the y values about the means into account. Furthermore, I noticed that for the case of unweighted linear calibration with replicate standards, inverse.predict produced different results than calibrate from the investr package when using the Wald method.

+

Both issues are now addressed in chemCal starting from version 0.2.1. Here, \(y_i\) is calibration measurement \(i\), \(\hat{y_i}\) is the estimated value for calibration measurement \(i\) and \(n\) is the total number of calibration measurements.

+

\(w_s\) is the weight attributed to the sample \(s\), \(m\) is the number of replicate measurements of sample \(s\), \(\bar{y_s}\) is the mean response for the sample, \(\bar{y_w} = \frac{\sum{w_i y_i}}{\sum{w_i}}\) is the weighted mean of responses \(y_i\), and \(x_i\) is the given \(x\) value for standard \(i\).

The weight \(w_s\) for the sample should be estimated or calculated in accordance with the weights used in the linear regression.

-

I adjusted the above equation in order to be able to take a different precisions in standards and samples into account. In analogy to Equation 8.26 from we get

+

I had also adjusted the above equation in order to be able to take different precisions in standards and samples into account. In analogy to Equation 8.26 from Massart et al. (1997) I am using

\[\begin{equation} s_{\hat{x_s}} = \frac{1}{b_1} \sqrt{\frac{{s_s}^2}{w_s m} + {s_e}^2 \left( \frac{1}{\sum{w_i}} + @@ -315,6 +323,15 @@ s_{\hat{x_s}} = \frac{1}{b_1} \sqrt{\frac{{s_s}^2}{w_s m} + \end{equation}\]

where I interpret \(\frac{{s_s}^2}{w_s}\) as an estimator of the variance at location \(\hat{x_s}\), which can be replaced by a user-specified value using the argument var.s of the function inverse.predict.

+
+

Analytical Methods Committee. 1989. “Robust Statistics — How Not to Reject Outliers. Part 1. Basic Concepts.” The Analyst 114: 1693–7.

+
+
+

Castells, Reynaldo César, and Marcela Alejandra Castillo. 2000. “Systematic Errors: Detection and Correction by Means of Standard Calibration, Youden Calibration and Standard Additions Method in Conjunction with a Method Response Model.” Analytica Chimica Acta 423: 179–85.

+
+
+

Currie, L. A. 1997. “Nomenclature in Evaluation of Analytical Methods Including Detection and Quantification Capabilities (IUPAC Recommendations 1995).” Analytica Chimica Acta 391: 105–26.

+

Massart, D. L, B. G. M. Vandeginste, L. M. C. Buydens, S. De Jong, P. J. Lewi, and J Smeyers-Verbeke. 1997. Handbook of Chemometrics and Qualimetrics: Part A. Amsterdam: Elsevier.

diff --git a/vignettes/figure/unnamed-chunk-1-1.pdf b/vignettes/figure/unnamed-chunk-1-1.pdf deleted file mode 100644 index c70b645..0000000 Binary files a/vignettes/figure/unnamed-chunk-1-1.pdf and /dev/null differ diff --git a/vignettes/figure/unnamed-chunk-2-1.pdf b/vignettes/figure/unnamed-chunk-2-1.pdf deleted file mode 100644 index c1b934d..0000000 Binary files a/vignettes/figure/unnamed-chunk-2-1.pdf and /dev/null differ diff --git a/vignettes/references.bib b/vignettes/references.bib new file mode 100644 index 0000000..a710662 --- /dev/null +++ b/vignettes/references.bib @@ -0,0 +1,35 @@ +% Encoding: UTF-8 +@book{massart97, + author = "Massart, D. L and Vandeginste, B. G. M. and Buydens, L. M. C. and De Jong, S. and Lewi, P. J. and Smeyers-Verbeke, J", + title = "Handbook of {C}hemometrics and {Q}ualimetrics: Part {A}", + publisher = "Elsevier", + address = "Amsterdam", + year = "1997" +} +@article{currie97, + author = "Currie, L. A.", + year = "1997", + title = "Nomenclature in evaluation of analytical methods including + detection and quantification capabilities ({IUPAC Recommendations 1995})", + journal = "Analytica Chimica Acta", + volume = "391", + pages = "105 - 126" +} +@article{amc89, + Title = {Robust statistics --- how not to reject outliers. {Part} 1. 
{Basic} concepts}, + Author = {{Analytical Methods Committee}}, + Journal = {The Analyst}, + Year = {1989}, + Pages = {1693--1697}, + Volume = {114} +} +@article{castells00, + author = {Reynaldo César Castells and Marcela Alejandra Castillo}, + title = {Systematic errors: detection and correction by means of standard calibration, Youden calibration and standard additions method in conjunction with a method response model}, + journal = "Analytica Chimica Acta", + volume = "423", + year = "2000", + pages = "179-185" +} + +@Comment{jabref-meta: databaseType:bibtex;} diff --git a/vignettes/refs.bib b/vignettes/refs.bib deleted file mode 100644 index 514d76b..0000000 --- a/vignettes/refs.bib +++ /dev/null @@ -1,7 +0,0 @@ -@book{massart97, - author = "Massart, D. L and Vandeginste, B. G. M. and Buydens, L. M. C. and De Jong, S. and Lewi, P. J. and Smeyers-Verbeke, J", - title = "Handbook of {C}hemometrics and {Q}ualimetrics: Part {A}", - publisher = "Elsevier", - address = "Amsterdam", - year = "1997" -} -- cgit v1.2.1