\documentclass[notitlepage]{article}
\usepackage{../../Math598}
\usepackage{listings}
\usepackage{numprint}
\usepackage{enumerate}
\usepackage{multirow}
\usepackage{bbm}
\usepackage{bbold}
\usepackage{amsfonts,amsmath,amssymb}
\setlength{\parindent}{0pt}
\lstloadlanguages{R}
\definecolor{keywordcolor}{rgb}{0,0.6,0.6}
\definecolor{delimcolor}{rgb}{0.461,0.039,0.102}
\definecolor{Rcommentcolor}{rgb}{0.101,0.043,0.432}
\lstdefinestyle{Rsettings}{
  basicstyle=\ttfamily,
  breaklines=true,
  showstringspaces=false,
  keywords={if, else, function, theFunction, tmp}, % Write as many keywords
  otherkeywords={},
  commentstyle=\itshape\color{Rcommentcolor},
  keywordstyle=\color{keywordcolor},
  moredelim=[s][\color{delimcolor}]{"}{"},
}
\lstset{basicstyle=\ttfamily, numbers=none, literate={~} {$\sim$}{2}}
\begin{document}

<<>>=
library(knitr)
# global chunk options
opts_chunk$set(cache=TRUE, autodep=TRUE)
options(scipen=999)
options(repos=c(CRAN="https://cloud.r-project.org/"))
inline_hook <- function(x) {
  if (is.numeric(x)) {
    # ifelse does a vectorized comparison
    # If integer, print without decimal; otherwise print six decimal places
    res <- ifelse(x == round(x),
      sprintf("%6i", x),
      sprintf("%.6f", x)
    )
    paste(res, collapse = ", ")
  }
}
knit_hooks$set(inline = inline_hook)
@

\begin{center}
\textsclarge{MATH 598: TOPICS IN STATISTICS}

\vspace{0.1 in}

\textsc{Bayesian Inference for Nonlinear Regression}
\end{center}

A non-linear regression model is a model for the conditional expectation of a response variable $Y$ in terms of a function of predictors $\bx$ that is non-linear in the parameters. For example, in pharmacokinetics, the concentration $Y$ of a drug in the bloodstream at time $x$ after an initial dose $D$ can be modelled using ordinary differential equations. One simple `compartment' model represents the concentration $y(x)$ at time $x$ as
\[
y(x) = D \beta_0 (\exp\{-\beta_1 x\} - \exp\{-(\beta_1+\beta_2) x\}) = \mu(x,D,\beta_0,\beta_1,\beta_2) \qquad x > 0
\]
say, for parameters $\beta_0,\beta_1,\beta_2 >0$.

<<>>=
D<-10
pk.model<-function(xv,b0,b1,b2,Dose){
  # mean concentration at times xv for dose Dose and parameters (b0,b1,b2)
  yv<-Dose*b0*(exp(-b1*xv)-exp(-(b1+b2)*xv))
  return(yv)
}
x<-seq(0,10,by=0.01)
be0<-1;be1<-0.4;be2<-0.6
y<-pk.model(x,be0,be1,be2,D)
par(mar=c(4,4,1,0))
plot(x,y,type='l',ylim=range(0,12))
be0<-2;be1<-0.4;be2<-1
y<-pk.model(x,be0,be1,be2,D)
lines(x,y,col='red')
be0<-1.5;be1<-0.3;be2<-0.2
y<-pk.model(x,be0,be1,be2,D)
lines(x,y,col='blue')
legend(5,10,c(expression(paste(beta[0]==1.0,', ',beta[1]==0.4,', ',beta[2]==0.6)),
              expression(paste(beta[0]==2.0,', ',beta[1]==0.4,', ',beta[2]==1.0)),
              expression(paste(beta[0]==1.5,', ',beta[1]==0.3,', ',beta[2]==0.2))),
       lty=1,col=c('black','red','blue'))
@

In practice, data are collected from individual patients across time, and these data are measured with observation error. One common model represents the data using multiplicative errors, that is,
\[
Y(x_i) = \mu(x_i,D,\beta_0,\beta_1,\beta_2) \epsilon_i
\]
for $i=1,\ldots,n$, where $\log \epsilon_i \sim Normal(0,\sigma^2)$. Below, $n=20$ data points, with sampling times drawn uniformly on $(0,10)$, are simulated according to this model, with $(\beta_0,\beta_1,\beta_2) = (2,0.4,1)$ and $\sigma = 0.1$, for $D=10$.
<<>>=
set.seed(37)
D<-10
n<-20
sig<-0.1
x0<-seq(0,10,by=0.01)
x<-sort(runif(n,0,10))
epsilon<-exp(rnorm(n,0,sig))
be0<-2;be1<-0.4;be2<-1
y0<-pk.model(x0,be0,be1,be2,D)
y<-pk.model(x,be0,be1,be2,D)*epsilon
par(mar=c(4,4,1,0))
plot(x,y,xlim=range(0,10),ylim=range(0,12),pch=19,cex=0.6)
lines(x0,y0,col='red')
@

This suggests that the statistical model to be used for inference is
\[
\log Y(x_i) = \log \mu(x_i,D,\beta_0,\beta_1,\beta_2) + \log \epsilon_i
\]
for $i=1,\ldots,n$. Frequentist estimation can be carried out using non-linear least squares; we can achieve the minimization of
\[
\sum_{i=1}^n (\log y_i - \log \mu(x_i,D,\beta_0,\beta_1,\beta_2))^2
\]
using \texttt{optim}.

<<>>=
pk.like<-function(be,Dv,xv,yv){
  # sum of squared residuals on the log scale
  res<-log(yv) - log(Dv*be[1]*(exp(-be[2]*xv)-exp(-(be[2]+be[3])*xv)))
  return(sum(res^2))
}
be.start<-c(1,0.1,0.2)
pk.fit<-optim(be.start,fn=pk.like,Dv=D,xv=x,yv=y)
pk.fit        #par gives parameter estimates
par(mar=c(4,4,1,0))
plot(x,y,xlim=range(0,10),ylim=range(0,12),pch=19,cex=0.6)
lines(x0,y0,col='red')
be0.hat<-pk.fit$par[1];be1.hat<-pk.fit$par[2];be2.hat<-pk.fit$par[3]
y.hat<-pk.model(x0,be0.hat,be1.hat,be2.hat,D)
lines(x0,y.hat,col='red',lty=2)
legend(5,12,c('True','Fitted'),col='red',lty=c(1,2))
@

We can carry out the same optimization, but first reparameterize onto the log scale, say
\[
\theta_0 = \log \beta_0 \qquad \theta_1 = \log \beta_1 \qquad \theta_2 = \log \beta_2
\]
as this should make no difference to the minimization.

<<>>=
pk.like.th<-function(th,Dv,xv,yv){
  be<-exp(th)
  res<-log(yv) - log(Dv*be[1]*(exp(-be[2]*xv)-exp(-(be[2]+be[3])*xv)))
  return(sum(res^2))
}
th.start<-log(be.start)
pk.fit.th<-optim(th.start,fn=pk.like.th,Dv=D,xv=x,yv=y)
pk.fit.th$par        #par gives parameter estimates on theta scale
exp(pk.fit.th$par)   #on the beta scale - essentially identical to the previous analysis
par(mar=c(4,4,1,0))
plot(x,y,xlim=range(0,10),ylim=range(0,12),pch=19,cex=0.6)
lines(x0,y0,col='red')
be0.hat2<-exp(pk.fit.th$par[1]);be1.hat2<-exp(pk.fit.th$par[2]);be2.hat2<-exp(pk.fit.th$par[3])
y.hat2<-pk.model(x0,be0.hat2,be1.hat2,be2.hat2,D)
lines(x0,y.hat,col='red',lty=2)
lines(x0,y.hat2,col='blue',lty=2)
legend(5,10,c('True','Fitted',expression(paste('Fitted (',theta,' parameterization)'))),
       col=c('red','red','blue'),lty=c(1,2,2))
@

For a Bayesian analysis, we use the same model to generate the likelihood: in the $\theta$ parameterization, we have
\begin{align*}
\mathcal{L}_n(\theta,\sigma^2) & = \prod_{i=1}^n \left( \frac{1}{2 \pi \sigma^2 } \right)^{1/2} \exp \left\{ -\frac{1}{2 \sigma^2} (\log y_i - \log \mu(x_i,D,e^{\theta_0},e^{\theta_1},e^{\theta_2}))^2 \right\} \\[6pt]
& = \left( \frac{1}{2 \pi} \right)^{n/2} \left( \frac{1}{\sigma^2 } \right)^{n/2} \exp \left\{ -\frac{1}{2 \sigma^2} \sum_{i=1}^n (\log y_i - \log \mu(x_i,D,e^{\theta_0},e^{\theta_1},e^{\theta_2}))^2 \right\} \\[6pt]
& = \left( \frac{1}{2 \pi} \right)^{n/2} \left( \frac{1}{\sigma^2 } \right)^{n/2} \exp \left\{ -\frac{1}{2 \sigma^2} S(\by,\bx,D,{\theta_0},{\theta_1},{\theta_2}) \right\}
\end{align*}
say. For the prior, we may choose independent priors on $\theta_0,\theta_1,\theta_2$ conditional on $\sigma^2$. For $\sigma^2$, as in the linear regression case, we may choose an Inverse Gamma prior
\[
\pi_0(\sigma^2) \equiv InvGamma(a/2,b/2)
\]
for suitable constants $a$ and $b$; to encode the subjective prior belief that $\sigma^2$ is unlikely to exceed 5, we may set $a=b=8$.
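This choice can be checked directly: if $\sigma^2 \sim InvGamma(a/2,b/2)$ then $1/\sigma^2 \sim Gamma(a/2,b/2)$, so the prior probability that $\sigma^2$ exceeds 5 can be computed with \texttt{pgamma}. A quick check of this kind (a minimal sketch, not part of the main analysis) is

<<>>=
# P(sigma^2 > 5) = P(1/sigma^2 < 1/5) under the InvGamma(a/2,b/2) prior with a = b = 8
pgamma(1/5,shape=8/2,rate=8/2)
@

which is less than 0.01, consistent with the stated prior belief.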
Given $\sigma^2$, for $(\theta_0,\theta_1,\theta_2)$ we may select independent $Normal(0,\sigma^2/\lambda)$ priors; more generally, a multivariate Normal prior
\[
\pi_0(\theta_0,\theta_1,\theta_2|\sigma^2) \equiv Normal_3(\mathbf{0},\sigma^2 \bL^{-1})
\]
for some positive definite matrix $\bL$ could be considered. For the posterior distribution, then, we have up to proportionality that
\begin{align*}
\pi_n(\theta_0,\theta_1,\theta_2,\sigma^2) & \propto \; \mathcal{L}_n(\theta_0,\theta_1,\theta_2,\sigma^2) \pi_0(\theta_0,\theta_1,\theta_2|\sigma^2) \pi_0(\sigma^2) \\[6pt]
& \propto \; \left( \frac{1}{\sigma^2 } \right)^{n/2} \exp \left\{ -\frac{1}{2 \sigma^2} S(\by,\bx,D,{\theta_0},{\theta_1},{\theta_2}) \right\} \left(\frac{1}{\sigma^2}\right)^{3/2} \exp \left\{-\frac{1}{2 \sigma^2} \theta^\top \bL \theta \right\} \left( \frac{1}{\sigma^2 } \right)^{a/2+1} \exp \left\{-\frac{b}{2 \sigma^2} \right\} \\[6pt]
& \propto \; \left( \frac{1}{\sigma^2 } \right)^{(n+a+3)/2+1} \exp \left\{ -\frac{1}{2 \sigma^2} \left[S(\by,\bx,D,{\theta_0},{\theta_1},{\theta_2})+ \theta^\top \bL \theta + b \right] \right\}.
\end{align*}
To compute with this posterior distribution, we may first integrate out $\sigma^2$ to leave the marginal posterior for $(\theta_0,\theta_1,\theta_2)$. To integrate out $\sigma^2$ from $\pi_n(\theta_0,\theta_1,\theta_2,\sigma^2)$, we note that, as a function of $\sigma^2$, the integrand is proportional to an Inverse Gamma pdf with parameters $(n+a+3)/2$ and $[S(\by,\bx,D,{\theta_0},{\theta_1},{\theta_2})+ \theta^\top \bL \theta + b]/2$; the integral is therefore proportional, as a function of $(\theta_0,\theta_1,\theta_2)$, to the second parameter raised to the power $-(n+a+3)/2$, so that
\[
\pi_n(\theta_0,\theta_1,\theta_2) \propto \left\{S(\by,\bx,D,{\theta_0},{\theta_1},{\theta_2})+ \theta^\top \bL \theta + b \right\}^{-(n+a+3)/2}.
\]
This posterior is intractable. We may, however, maximize the log posterior, again using \texttt{optim}.

<<>>=
pk.loglike.th<-function(th,Dv,xv,yv,Lm,av,bv){
  # log marginal posterior for theta, up to an additive constant
  be<-exp(th)
  res<-log(yv) - log(Dv*be[1]*(exp(-be[2]*xv)-exp(-(be[2]+be[3])*xv)))
  Sval<-sum(res^2)
  pth<-t(th) %*% (Lm %*% th)
  llike<- -(length(xv)+av+3)/2*log(Sval+pth[1,1]+bv)
  return(-llike)   #optim performs minimization - need to multiply log posterior by -1
}
a<-b<-8
L<-diag(rep(1/10,3))
th.start<-pk.fit.th$par
pk.post.th<-optim(th.start,fn=pk.loglike.th,Dv=D,xv=x,yv=y,Lm=L,av=a,bv=b,hessian=T)
pk.post.th$par        #par gives the posterior mode on the theta scale
exp(pk.post.th$par)   #on the beta scale - close to the least squares estimates
par(mar=c(4,4,1,0))
plot(x,y,xlim=range(0,10),ylim=range(0,12),pch=19,cex=0.6)
lines(x0,y0,col='red')
be0.post<-exp(pk.post.th$par[1]);be1.post<-exp(pk.post.th$par[2]);be2.post<-exp(pk.post.th$par[3])
y.post<-pk.model(x0,be0.post,be1.post,be2.post,D)
lines(x0,y.hat,col='red',lty=2)
lines(x0,y.hat2,col='blue',lty=2)
lines(x0,y.post,col='green',lty=1)
legend(5,12,c('True',expression(paste('LS Fitted (',theta,' parameterization)')),
              'Posterior Mode fit'),col=c('red','blue','green'),lty=c(1,2,1))
@

As for the earlier GLM examples, we may attempt to approximate the posterior using a Normal approximation
\[
\pi_n(\theta) \approx Normal_3(\widehat \theta_n, \widehat M_n^{-1})
\]
where $\widehat M_n$ is the Hessian matrix (the matrix of second partial derivatives) of the negative log posterior, evaluated at the posterior mode $\widehat \theta_n$.

<<>>=
Mn<-pk.post.th$hessian
(pk.post.var<-solve(Mn))
@

For an exact analysis using Monte Carlo methods, we aim to use rejection sampling to produce a sample from the exact posterior.
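Before doing so, note that the Normal approximation can already be used for approximate posterior summaries; for example, approximate posterior standard deviations and 95\% intervals for $\theta$, centred at the posterior mode (a minimal sketch, not part of the main analysis):

<<>>=
# approximate posterior standard deviations from the Normal approximation
se.th<-sqrt(diag(pk.post.var))
# approximate 95% credible intervals on the theta scale
cbind(mode=pk.post.th$par,lower=pk.post.th$par-1.96*se.th,upper=pk.post.th$par+1.96*se.th)
@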
To implement the rejection sampler, we need an easy-to-sample distribution, $\widetilde \pi_n(\theta)$ say, that resembles the true posterior: from the above analysis, the $Normal_3(\widehat \theta_n, \widehat M_n^{-1})$ distribution seems a sensible place to start. However, recall that rejection sampling requires us first to bound the ratio of the target to the proposal density,
\[
\frac{\pi_n(\theta)}{\widetilde \pi_n(\theta)} < M.
\]
In this case, due to the prior assumptions (in particular the Inverse Gamma prior on $\sigma^2$ that has been integrated out), the posterior has heavier-than-Normal tails. Therefore, choosing a Normal proposal density may not be feasible, as the ratio $\pi_n(\theta)/\widetilde \pi_n(\theta)$ may not be bounded in the tails. Instead, we choose a multivariate $Student(5)$ distribution with the same location and scale; this forces the tail behaviour of the proposal density to be \textit{heavier} than that of the target. The multivariate Student-$t$ density and random number generation are available in the \texttt{mvnfast} library, via the \texttt{dmvt} and \texttt{rmvt} functions.

<<>>=
library(mvnfast)
pk.rejection.th<-function(th,Dv,xv,yv,Lm,av,bv,muv,Sigv){
  # negative log ratio of the (unnormalized) target to the Student(5) proposal density
  be<-exp(th)
  res<-log(yv) - log(Dv*be[1]*(exp(-be[2]*xv)-exp(-(be[2]+be[3])*xv)))
  Sval<-sum(res^2)
  pth<-t(th) %*% (Lm %*% th)
  llike<- -(length(xv)+av+3)/2*log(Sval+pth[1,1]+bv)
  lprop<-dmvt(th,muv,Sigv,5,log=T)
  return(-llike+lprop)   #optim performs minimization; minimizing this maximizes the log ratio
}
a<-b<-8
L<-diag(rep(1/10,3))
th.start<-pk.post.th$par
post.mu<-pk.post.th$par
post.Sig<-pk.post.var
pk.rej.th<-optim(th.start,fn=pk.rejection.th,
                 Dv=D,xv=x,yv=y,Lm=L,av=a,bv=b,muv=post.mu,Sigv=post.Sig)
pk.rej.th
@

This analysis gives that
\[
\log \left( \frac{\pi_n(\theta)}{c \widetilde \pi_n(\theta)} \right) \leq -\Sexpr{pk.rej.th$value}.
\]
Recall that we only have $\pi_n(\theta)$ up to proportionality, so there is an indeterminate constant $c$; however, we do not need this constant to implement rejection sampling. We aim to produce a sample of size $N=10000$ from the posterior.

<<>>=
set.seed(94)
N<-10000
nsamp<-0
ico<-0
Mval<- -pk.rej.th$value
theta.rej<-matrix(0,nrow=N,ncol=3)
while(nsamp < N){
  ico<-ico+1
  th<-rmvt(1,post.mu,post.Sig,5)
  be<-exp(th)
  res<-log(y) - log(D*be[1]*(exp(-be[2]*x)-exp(-(be[2]+be[3])*x)))
  Sval<-sum(res^2)
  pth<-th %*% (L %*% t(th))
  llike<- -(length(x)+a+3)/2*log(Sval+pth[1,1]+b)
  lprop<-dmvt(th,post.mu,post.Sig,5,log=T)
  U<-runif(1)
  if(log(U) < llike-lprop-Mval){
    nsamp<-nsamp+1
    theta.rej[nsamp,]<-th
  }
}
print(ico)
@

This analysis tells us that on this run, \Sexpr{ico} proposals were needed to generate the $N = \Sexpr{N}$ samples from the posterior. Thus the acceptance rate is $\Sexpr{N/ico}$, and therefore
\[
\max_\theta \frac{\pi_n(\theta)}{\widetilde \pi_n(\theta)} \bumpeq \frac{1}{\Sexpr{N/ico}} = \Sexpr{ico/N},
\]
which informs us that the proportionality constant $c$ is given approximately by $c \bumpeq \Sexpr{ico/N} \exp\{\Sexpr{pk.rej.th$value}\}$.

<<>>=
par(mar=c(4,4,2,0));pairs(theta.rej,pch=19,cex=0.5,
  labels=c(expression(theta[0]),expression(theta[1]),expression(theta[2])))
@

We can obtain samples from the posterior for $\beta_0,\beta_1,\beta_2$ by transformation.

<<>>=
beta.rej<-exp(theta.rej)
par(mar=c(4,4,2,0));pairs(beta.rej,pch=19,cex=0.5,
  labels=c(expression(beta[0]),expression(beta[1]),expression(beta[2])))
@

Having obtained the posterior samples for $\beta_0,\beta_1,\beta_2$, we can compute sampled fitted curves.
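Before plotting these, the sampled $\beta$ values can also be summarized directly; for example, posterior means and equal-tailed 95\% credible intervals (a brief sketch, not part of the main analysis):

<<>>=
# posterior means and 95% equal-tailed credible intervals for beta0, beta1, beta2
colMeans(beta.rej)
t(apply(beta.rej,2,quantile,probs=c(0.025,0.975)))
@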
<<>>=
par(mar=c(4,4,1,0))
plot(x,y,xlim=range(0,10),ylim=range(0,12),pch=19,cex=0.6)
for(i in 1:20){
  be0<-beta.rej[i,1];be1<-beta.rej[i,2];be2<-beta.rej[i,3]
  y.post<-pk.model(x0,be0,be1,be2,D)
  lines(x0,y.post,col='gray',lty=1)
}
legend(5,12,c('True','Posterior sample fits'),col=c('red','gray'),lty=c(1,1))
lines(x0,y0,col='red')
@

Finally, having sampled the $\theta$ values, we can produce a sample from the posterior for $\sigma^2$ by sampling the conditional posterior
\begin{align*}
\pi_n(\sigma^2|\theta_0,\theta_1,\theta_2) & \propto \; \pi_n(\theta_0,\theta_1,\theta_2,\sigma^2) \\[6pt]
& \propto \left( \frac{1}{\sigma^2 } \right)^{(n+a+3)/2+1} \exp \left\{ -\frac{1}{2 \sigma^2} \left[S(\by,\bx,D,{\theta_0},{\theta_1},{\theta_2})+ \theta^\top \bL \theta + b \right] \right\}\\[6pt]
& \equiv InvGamma \left(\frac{(n+a+3)}{2}, \frac{\left(S(\by,\bx,D,{\theta_0},{\theta_1},{\theta_2})+ \theta^\top \bL \theta + b \right)}{2} \right).
\end{align*}

<<>>=
be.rej<-exp(theta.rej)
sigsq.rej<-rep(0,N)
for(i in 1:N){
  res<-log(y) - log(D*be.rej[i,1]*(exp(-be.rej[i,2]*x)-exp(-(be.rej[i,2]+be.rej[i,3])*x)))
  Sval<-sum(res^2)
  pth<-t(theta.rej[i,]) %*% (L %*% (theta.rej[i,]))
  sigsq.rej[i]<-1/rgamma(1,(length(x)+a+3)/2,(Sval+pth[1,1]+b)/2)
}
par(mar=c(4,4,2,0))
brks<-seq(0,max(1,max(sigsq.rej))+0.04,by=0.02)   #breaks chosen to span all sampled values
hist(sigsq.rej,breaks=brks,xlab=expression(sigma^2),
  main=expression(paste('Posterior for ',sigma^2)),col='gray');box()
xv<-seq(0,1,by=0.001)
dinvgamma<-function(x,al,be){return(exp(al*log(be)-lgamma(al)-(al+1)*log(x)-be/x))}
yv<-dinvgamma(xv,a/2,b/2)   #prior is InvGamma(a/2,b/2)
lines(xv,yv*N*0.02,col='red',lty=2)
legend(0.6,1000,c('Prior'),lty=2,col='red')
@

\textbf{Sampling Importance Resampling:} In sampling importance resampling, we generate a large sample $\theta^{(1)},\ldots,\theta^{(N_0)}$ from a proposal distribution $\widetilde \pi_n(\theta)$ and then produce a sample of size $N$ from the target $\pi_n(\theta)$ by resampling from the generated sample with replacement, using the weights
\[
w_i = \frac{\pi_n(\theta^{(i)})/\widetilde \pi_n(\theta^{(i)})}{\sum\limits_{j=1}^{N_0} \{\pi_n(\theta^{(j)})/\widetilde \pi_n(\theta^{(j)})\}} \qquad i=1,\ldots,N_0.
\]
In the following simulation, we choose the $Student(5)$ proposal above, and set $N_0=50000$ and $N=10000$.

<<>>=
set.seed(94)
N0<-50000
N<-10000
th0<-rmvt(N0,post.mu,post.Sig,5)
lprop<-dmvt(th0,post.mu,post.Sig,5,log=T)
llike<-rep(0,N0)
be0<-exp(th0)
for(i in 1:N0){
  res<-log(y) - log(D*be0[i,1]*(exp(-be0[i,2]*x)-exp(-(be0[i,2]+be0[i,3])*x)))
  Sval<-sum(res^2)
  pth<-t(th0[i,]) %*% (L %*% (th0[i,]))
  llike[i]<- -(length(x)+a+3)/2*log(Sval+pth[1,1]+b)
}
w<-exp(llike-lprop)
w<-w/sum(w)
th.sir<-th0[sample(1:N0,prob=w,rep=T,size=N),]
par(mar=c(4,4,2,0));pairs(th.sir,pch=19,cex=0.5,
  labels=c(expression(theta[0]),expression(theta[1]),expression(theta[2])))
@

The rejection and SIR samples have very similar statistical summaries:

<<>>=
apply(theta.rej,2,mean); apply(th.sir,2,mean)   #Means
cov(theta.rej);cov(th.sir)                      #Covariances
@

We can measure the efficiency of the importance sampling proposal by computing the \textit{effective sample size} (ESS)
\[
\text{ESS} = \left(\sum_{i=1}^{N_0} w_i^2 \right)^{-1}
\]
which is to be compared against the largest possible value, $N_0$, attained when $w_i = 1/N_0$ for all $i$. Here

<<>>=
ESS<-1/sum(w^2)
ESS
@

\end{document}