\documentclass[notitlepage]{article}
\usepackage{../../Math598}
\usepackage{listings}
\usepackage{numprint}
\usepackage{enumerate}
\usepackage{multirow}
\usepackage{bbm}
\usepackage{bbold}
\usepackage{bm}
\usepackage{amsfonts,amsmath,amssymb}
\setlength{\parindent}{0pt}
\lstloadlanguages{R}
\definecolor{keywordcolor}{rgb}{0,0.6,0.6}
\definecolor{delimcolor}{rgb}{0.461,0.039,0.102}
\definecolor{Rcommentcolor}{rgb}{0.101,0.043,0.432}
\lstdefinestyle{Rsettings}{
basicstyle=\ttfamily,
breaklines=true,
showstringspaces=false,
keywords={if, else, function, theFunction, tmp},
otherkeywords={},
commentstyle=\itshape\color{Rcommentcolor},
keywordstyle=\color{keywordcolor},
moredelim=[s][\color{delimcolor}]{"}{"},
}
\lstset{basicstyle=\ttfamily, numbers=none, literate={~} {$\sim$}{2}}
\begin{document}

<<>>=
library(knitr)
# global chunk options
opts_chunk$set(cache=TRUE, autodep=TRUE)
options(scipen=999)
options(repos=c(CRAN="https://cloud.r-project.org/"))
inline_hook <- function (x) {
  if (is.numeric(x)) {
    # ifelse does a vectorized comparison:
    # if integer-valued, print without decimals; otherwise print six decimal places
    res <- ifelse(x == round(x),
      sprintf("%6i", x),
      sprintf("%.6f", x)
    )
    paste(res, collapse = ", ")
  }
}
knit_hooks$set(inline = inline_hook)
@

\begin{center}
\textsclarge{MATH 598: TOPICS IN STATISTICS}

\vspace{0.1 in}

\textsc{Model selection via Reversible Jump MCMC}
\end{center}

Consider two competing pharmacokinetic (PK) models for response data $y(t)$ measured at time points $t_1,\ldots,t_n$:
\begin{itemize}
\item[1.] \textbf{Model $\bm{M_1}$:} one compartment, elimination only;
\[
\E[Y(t)] = A_1 \exp\{ - \lambda_{1} t\} \qquad t \geq 0
\]
\item[2.] \textbf{Model $\bm{M_2}$:} one compartment, absorption and elimination;
\[
\E[Y(t)] = A_2 \left(\exp\{ - \lambda_{21} t\} - \exp\{ - (\lambda_{21}+\lambda_{22}) t\} \right)\qquad t \geq 0
\]
\end{itemize}
where $(A_1,\lambda_1)$ and $(A_2,\lambda_{21},\lambda_{22})$ are positive parameters. Under an assumption of additive, homoscedastic Normal errors, we have two competing explanations for the observed data; both models can be fitted by ordinary least squares.

\medskip

In the following example, we simulate data from Model $\bm{M_2}$, with parameters
\[
\theta_{21} = \log A_2 = 2 \qquad \theta_{22} = \log \lambda_{21} = -0.5 \qquad \theta_{23} = \log \lambda_{22} = -0.5
\]
and noise variance $\sigma^2 = 0.3^2$.
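\medskip

Writing $\mu_k(t;\theta_k)$ for the mean function of model $M_k$ above, the log-likelihood under this error model is
\[
\ell_n^{M_k}(\theta_k,\sigma) = -\frac{n}{2}\log (2\pi\sigma^2) - \frac{1}{2\sigma^2} \sum_{i=1}^{n} \left\{ y(t_i) - \mu_k(t_i;\theta_k) \right\}^2
\]
so that, for fixed $\sigma$, maximizing the likelihood over $\theta_k$ is equivalent to minimizing the residual sum of squares; this is the quantity evaluated (as a negative log-likelihood) by the \texttt{loglike.optim} function in the code below.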
<<>>=
#Function definitions
funct.m1<-function(xv,thv){
  return(thv[1]*exp(-thv[2]*xv))
}
funct.m2<-function(xv,thv){
  return(thv[1]*(exp(-thv[2]*xv)-exp(-(thv[2]+thv[3])*xv)))
}
loglike<-function(yv,xv,thv,sv,fv){
  return(sum(dnorm(yv,fv(xv,thv),sv,log=T)))
}
loglike.optim<-function(x,yv,xv,fv,np){
  thv<-exp(x[-np])
  sv<-exp(x[np])
  return(-loglike(yv,xv,thv,sv,fv))
}
run.RJMCMC<-function(nburn,nsamp,nthin){
  nits<-nburn+nsamp*nthin
  if(!ifix){
    imod<-sample(c(1,2),size=1)
  }
  #initialize the chain from the Normal approximation for the current model
  if(imod == 1){
    np<-2
    old.th<-rmvn(1,ests.1,var.1)
    old.sig<-sig.1
    old.like<-loglike(y,x,exp(old.th),old.sig,funct.m1)
    old.p<-sum(dnorm(old.th,0,tau,log=T))
    old.q<-dmvn(old.th,ests.1,var.1,log=T)
  }else{
    np<-3
    old.th<-rmvn(1,ests.2,var.2)
    old.sig<-sig.2
    old.like<-loglike(y,x,exp(old.th),old.sig,funct.m2)
    old.p<-sum(dnorm(old.th,0,tau,log=T))
    old.q<-dmvn(old.th,ests.2,var.2,log=T)
  }
  theta.samp<-matrix(0,nrow=nsamp,ncol=3)
  sigma.samp<-like.samp<-imod.samp<-rep(0,nsamp)
  ico<-0
  i1<-0;i2<-0
  hvec1<-hvec2<-rep(0,nits)
  for(iter in 1:nits){
    #choose a within-model or between-model move with equal probability
    if(imod == 1){
      imove<-sample(c(1,3),size=1,prob=c(0.5,0.5))
    }else{
      imove<-sample(c(2,4),size=1,prob=c(0.5,0.5))
    }
    if(imod == 1 & imove == 1){
      #move in Model 1
      #jmove=0: joint independence proposal; jmove=1,2: single-component random walk
      jmove<-sample(0:2,size=1)
      if(jmove == 0){
        new.th<-rmvn(1,ests.1,var.1)
        new.like<-loglike(y,x,exp(new.th),old.sig,funct.m1)
        new.p<-sum(dnorm(new.th,0,tau,log=T))
        new.q<-dmvn(new.th,ests.1,var.1,log=T)
        iacc<-log(runif(1)) < (new.like+new.p-new.q)-(old.like+old.p-old.q)
      }else if(jmove == 1){
        new.th<-old.th
        new.th[1]<-old.th[1]+rnorm(1)*0.1
        new.like<-loglike(y,x,exp(new.th),old.sig,funct.m1)
        new.p<-sum(dnorm(new.th,0,tau,log=T))
        new.q<-dmvn(new.th,ests.1,var.1,log=T)
        iacc<-log(runif(1)) < (new.like+new.p)-(old.like+old.p)
      }else if(jmove == 2){
        new.th<-old.th
        new.th[2]<-old.th[2]+rnorm(1)*0.1
        new.like<-loglike(y,x,exp(new.th),old.sig,funct.m1)
        new.p<-sum(dnorm(new.th,0,tau,log=T))
        new.q<-dmvn(new.th,ests.1,var.1,log=T)
        iacc<-log(runif(1)) < (new.like+new.p)-(old.like+old.p)
      }
      if(iacc){
        old.th<-new.th
        old.like<-new.like
        old.p<-new.p
        old.q<-new.q
      }
      ssq<-sum((y-funct.m1(x,exp(old.th)))^2)
      old.sig<-sqrt(1/rgamma(1,n/2+prior.al,ssq/2+prior.be))
      old.like<-loglike(y,x,exp(old.th),old.sig,funct.m1)
    }else if(imod == 2 & imove == 2){
      #Move in Model 2
      #jmove=0: joint independence proposal; otherwise single-component random walk
      jmove<-sample(0:3,size=1)
      if(jmove == 0){
        new.th<-rmvn(1,ests.2,var.2)
        new.like<-loglike(y,x,exp(new.th),old.sig,funct.m2)
        new.p<-sum(dnorm(new.th,0,tau,log=T))
        new.q<-dmvn(new.th,ests.2,var.2,log=T)
        iacc<-log(runif(1)) < (new.like+new.p-new.q)-(old.like+old.p-old.q)
      }else if(jmove == 1){
        new.th<-old.th
        new.th[1]<-old.th[1]+rnorm(1)*0.1
        new.like<-loglike(y,x,exp(new.th),old.sig,funct.m2)
        new.p<-sum(dnorm(new.th,0,tau,log=T))
        new.q<-dmvn(new.th,ests.2,var.2,log=T)
        iacc<-log(runif(1)) < (new.like+new.p)-(old.like+old.p)
      }else if(jmove == 2){
        new.th<-old.th
        new.th[2]<-old.th[2]+rnorm(1)*0.1
        new.like<-loglike(y,x,exp(new.th),old.sig,funct.m2)
        new.p<-sum(dnorm(new.th,0,tau,log=T))
        new.q<-dmvn(new.th,ests.2,var.2,log=T)
        iacc<-log(runif(1)) < (new.like+new.p)-(old.like+old.p)
      }else if(jmove == 3){
        new.th<-old.th
        new.th[3]<-old.th[3]+rnorm(1)*0.1
        new.like<-loglike(y,x,exp(new.th),old.sig,funct.m2)
        new.p<-sum(dnorm(new.th,0,tau,log=T))
        new.q<-dmvn(new.th,ests.2,var.2,log=T)
        iacc<-log(runif(1)) < (new.like+new.p)-(old.like+old.p)
      }
      if(iacc){
        old.th<-new.th
        old.like<-new.like
        old.p<-new.p
        old.q<-new.q
      }
      ssq<-sum((y-funct.m2(x,exp(old.th)))^2)
      old.sig<-sqrt(1/rgamma(1,n/2+prior.al,ssq/2+prior.be))
      old.like<-loglike(y,x,exp(old.th),old.sig,funct.m2)
    }
    if(!ifix){
      if(imod == 1 & imove == 3){
        #Move from Model 1 to Model 2
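        # Between-model move: propose all three components of theta_2 jointly from
        # the Normal approximation to the Model 2 posterior (an independence-type
        # proposal). The log acceptance ratio hval below is
        #   (log-likelihood + log-prior - log-proposal density) under Model 2
        # - (log-likelihood + log-prior - log-proposal density) under Model 1;
        # the Jacobian of the dimension-matching map is 1, so no extra term appears.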
        uvec<-rmvn(1,ests.2,var.2)
        new.th<-uvec
        new.like<-loglike(y,x,exp(new.th),old.sig,funct.m2)
        new.p<-sum(dnorm(new.th,0,tau,log=T))
        new.q<-dmvn(new.th,ests.2,var.2,log=T)
        hval<-(new.like+new.p-new.q)-(old.like+old.p-old.q)
        i1<-i1+1
        hvec1[i1]<-hval
        if(log(runif(1)) < hval){
          imod<-2
          np<-3
          old.th<-new.th
          old.like<-new.like
          old.p<-new.p
          old.q<-new.q
        }
      }else if(imod == 2 & imove == 4){
        #Move from Model 2 to Model 1
        vvec<-rmvn(1,ests.1,var.1)
        new.th<-vvec
        new.like<-loglike(y,x,exp(new.th),old.sig,funct.m1)
        new.p<-sum(dnorm(new.th,0,tau,log=T))
        new.q<-dmvn(new.th,ests.1,var.1,log=T)
        hval<-(new.like+new.p-new.q)-(old.like+old.p-old.q)
        i2<-i2+1
        hvec2[i2]<-hval
        if(log(runif(1)) < hval){
          imod<-1
          np<-2
          old.th<-new.th
          old.like<-new.like
          old.p<-new.p
          old.q<-new.q
        }
      }
    }
    #Gibbs update of the error standard deviation under the current model
    if(imod == 1){
      ssq<-sum((y-funct.m1(x,exp(old.th)))^2)
      old.sig<-sqrt(1/rgamma(1,n/2+prior.al,ssq/2+prior.be))
      old.like<-loglike(y,x,exp(old.th),old.sig,funct.m1)
    }else{
      ssq<-sum((y-funct.m2(x,exp(old.th)))^2)
      old.sig<-sqrt(1/rgamma(1,n/2+prior.al,ssq/2+prior.be))
      old.like<-loglike(y,x,exp(old.th),old.sig,funct.m2)
    }
    if(iter > nburn && iter %% nthin == 0){
      ico<-ico+1
      theta.samp[ico,1:np]<-old.th
      sigma.samp[ico]<-old.sig
      like.samp[ico]<-old.like
      imod.samp[ico]<-imod
    }
  }
  return(list(th=theta.samp,sig=sigma.samp,like=like.samp,model=imod.samp))
}
@

<<>>=
library(MASS)
library(mvnfast)
#Simulate data
set.seed(4263374)
n<-20
x<-sort(runif(n,0,5))
th<-exp(c(2,-0.5,-0.5))
y<-funct.m2(x,th)+rnorm(n,0,.3)
par(mar=c(4,4,2,0))
plot(x,y,xlim=range(0,5),ylim=range(0,2.5),pch=19,xlab="t",ylab="y(t)")
xval<-seq(0,5,by=0.01)
yval<-funct.m2(xval,th)
lines(xval,yval,col="red")
#ML fits of the two models (parameters and error sd on the log scale)
fit.1<-optim(c(0,0,0),fn=loglike.optim,yv=y,xv=x,
             fv=funct.m1,np=3,control=list(maxit=1000),hessian=T)
yval1<-funct.m1(xval,exp(fit.1$par[1:2]))
lines(xval,yval1,col="green")
fit.2<-optim(c(0,0,0,0),fn=loglike.optim,yv=y,xv=x,
             fv=funct.m2,np=4,control=list(maxit=10000),hessian=T)
yval2<-funct.m2(xval,exp(fit.2$par[1:3]))
lines(xval,yval2,col="blue")
@

We can compare the two models using the BIC: for $k=1,2$, we have
\[
\text{BIC}_k = -2 \log \mathcal{L}_n^{M_k}(\widehat \theta_k) + d_k \log n
\]
where $d_k$ is the total number of parameters for model $M_k$.

<<>>=
#Compute the BIC
BIC1<-2*fit.1$value+3*log(n)
BIC2<-2*fit.2$value+4*log(n)
c(BIC1,BIC2)
#Check using nls
PK.data<-data.frame(x,y)
AIC(nls(y~A*exp(-B*x),data=PK.data,start=list(A=1,B=0.5)),k=log(20))
AIC(nls(y~A*exp(-B*x)*(1-exp(-C*x)),data=PK.data,start=list(A=1,B=1,C=1)),k=log(20))
@

Under BIC, the evidence favours model $M_2$, for which $\textrm{BIC}_2 = \Sexpr{BIC2}$, over model $M_1$, for which $\textrm{BIC}_1 = \Sexpr{BIC1}$.

\bigskip

We now consider a fully Bayesian solution using reversible jump MCMC. We work in the log-scale parameterization
\[
\theta_1 = (\log A_1, \log \lambda_1) \qquad \qquad \theta_2 = (\log A_2, \log \lambda_{21}, \log \lambda_{22}).
\]
We place equal prior probabilities on $M_1$ and $M_2$, and then place independent $N(0,\tau^2)$ priors on the components of $\theta_1$ and $\theta_2$. The prior on the residual error variance $\sigma^2$ is Inverse Gamma with parameters 20 and 8.

\medskip

The ML estimates $\widehat{\theta}_1$ and $\widehat{\theta}_2$ can be computed easily, as can the Hessian matrices $\widehat{\mathbf{I}}_1$ and $\widehat{\mathbf{I}}_2$; these likelihood-based quantities yield reasonable Normal approximations to the conditional posterior distributions, which can be used to construct independence Metropolis--Hastings proposals.
Specifically, at the ML estimates of $\sigma$ under the two models, we may approximate the conditional posterior for $\theta_k$ by the Normal density
\begin{equation}
\pi_n^{M_k}(\theta_k|\widehat \sigma_k,\by) \bumpeq \text{Normal}(\widehat \theta_k, n^{-1} \widehat \sigma_k^2 \widehat{\mathbf{I}}_k^{-1}).
\label{eq:AsympNormal}
\end{equation}
We may also introduce the prior information and use the conjugate analysis to return an approximate Normal conditional posterior distribution. Finally, on fitting by ML, the estimates of $\sigma$ under the two models are found to be reasonably similar ($M_1: \widehat \sigma_1 = 0.252$, $M_2: \widehat \sigma_2 = 0.329$).

\bigskip

\textbf{Reversible Jump MCMC:} A reversible jump MCMC algorithm can be constructed as follows: we again consider four move types:
\begin{enumerate}
\item $m=1$: move \textbf{within} $M_1$; update $\theta_1$ from $\pi_n^{M_1}(\theta_1|M_1, \sigma)$.
\item $m=2$: move \textbf{within} $M_2$; update $\theta_2$ from $\pi_n^{M_2}(\theta_2|M_2, \sigma)$.
\item $m=3$: move \textbf{from} $M_1$ \textbf{to} $M_2$; propose a new $\theta_2$, and carry out an accept/reject step.
\item $m=4$: move \textbf{from} $M_2$ \textbf{to} $M_1$; propose a new $\theta_1$, and carry out an accept/reject step.
\end{enumerate}
with the remaining parameter $\sigma^2$ updated in a Gibbs step at each iteration.

\medskip

\begin{itemizeWide}
\item For moves $m=1,2$, we use standard Metropolis--Hastings steps, either jointly on the whole parameter vector, or for each of the parameters individually. The asymptotic approximations in \eqref{eq:AsympNormal} can be used as proposal distributions.

\item Moves $m=3,4$ are a forward/reverse move pair. For move 3, several options are available; for example, we could adopt the earlier strategy, generate a new variate $u$ from the prior for the additional parameter, and then merely use the mapping
\[
(\theta_{11},\theta_{12},u) \longmapsto (\theta_{21}=\theta_{11},\theta_{22}=\theta_{12},\theta_{23}=u)
\]
with the reverse move setting $u = \theta_{23}$.

\medskip

This approach may be adequate, but would probably not facilitate good mixing across the models. A better strategy is to consider a different augmentation, in which we generate $\bu = (u_1,u_2,u_3)$ from the approximation in \eqref{eq:AsympNormal} for $k=2$, and map $(\theta_{11},\theta_{12},u_1,u_2,u_3)$ to
\[
(\theta_{21}=u_1,\theta_{22}=u_2,\theta_{23}=u_3,v_1=\theta_{11},v_2=\theta_{12})
\]
with the paired reverse move being to generate $\bv = (v_1,v_2)$ from the approximation in \eqref{eq:AsympNormal} for $k=1$.

\medskip

This guarantees that the proposed value $\theta_2$ lies in a region with reasonably high posterior support under model $M_2$, although it does not guarantee that the move will be accepted with high probability.

\medskip

In the Hastings ratio, the Jacobian of the transformation is 1, and under equal probabilities of forward/reverse moves, we have that
\[
\frac{\pi_n(M_2,\theta_2)p_V(v_1,v_2)}{\pi_n(M_1,\theta_1)p_U(u_1,u_2,u_3)}
\]
can be written
\[
\dfrac{\mathcal{L}_n^{M_2}(\theta_2) \left\{\prod\limits_{j=1}^3 \phi(\theta_{2j}/\tau)/\tau\right\} \phi_2(\theta_{11},\theta_{12}; \widehat\theta_1, \widehat {\mathbf{I}}_1^{-1})}
{\mathcal{L}_n^{M_1}(\theta_1) \left\{\prod\limits_{j=1}^2 \phi(\theta_{1j}/\tau)/\tau\right\} \phi_3(\theta_{21},\theta_{22},\theta_{23}; \widehat\theta_2, \widehat {\mathbf{I}}_2^{-1})}
\]
where $\tau$ is the prior standard deviation of the $\theta$ parameters, $\phi$ is the standard Normal density, and $\phi_d(\cdot\,;m,V)$ denotes the $d$-dimensional Normal density with mean $m$ and covariance $V$. A short code fragment following this list shows how this ratio is computed in the sampler.

\medskip

The logic of this construction is that numerically
\[
\mathcal{L}_n^{M_1}(\theta_1) \bumpeq \text{Normal}_2(\widehat\theta_1, \widehat {\mathbf{I}}_1^{-1}) \qquad \mathcal{L}_n^{M_2}(\theta_2) \bumpeq \text{Normal}_3(\widehat\theta_2, \widehat {\mathbf{I}}_2^{-1}).
\]
Note that if the prior information is also incorporated, the asymptotic approximation to the posterior may be improved further.
\end{itemizeWide}
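\medskip

On the log scale, the Hastings ratio above is exactly the quantity \texttt{hval} computed inside \texttt{run.RJMCMC} when a switch from $M_1$ to $M_2$ is proposed. The following fragment (shown for illustration only and not evaluated; it re-uses the data \texttt{y}, \texttt{x}, the prior standard deviation \texttt{tau}, the Normal approximation \texttt{ests.2}, \texttt{var.2}, and the sampler's current state \texttt{old.th}, \texttt{old.sig}, \texttt{old.like}, \texttt{old.p}, \texttt{old.q}) makes the correspondence with the formula explicit:

<<eval=FALSE>>=
new.th   <- rmvn(1,ests.2,var.2)                       # draw (u1,u2,u3) = theta_2 from phi_3
new.like <- loglike(y,x,exp(new.th),old.sig,funct.m2)  # log L_n^{M2}(theta_2)
new.p    <- sum(dnorm(new.th,0,tau,log=T))             # log prior for theta_2
new.q    <- dmvn(new.th,ests.2,var.2,log=T)            # log phi_3 evaluated at theta_2
hval     <- (new.like+new.p-new.q)-(old.like+old.p-old.q)
accept   <- log(runif(1)) < hval                       # Jacobian contributes log(1) = 0
@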
The sampler was run 20 times; each run used 1000 burn-in iterations followed by 10000 sampling iterations (with no thinning), so that 200000 posterior samples were collected in total.

<<>>=
#Prior
tau<-4
prior.al<-20
prior.be<-8
#Normal approximations to the conditional posterior distributions
var.1 <-solve(fit.1$hessian[1:2,1:2]+diag(1/tau^2,2))
ests.1<-var.1 %*% (fit.1$hessian[1:2,1:2] %*% fit.1$par[1:2])
sig.1<-exp(fit.1$par[3])
prec.1<-solve(var.1)
var.2<-solve(fit.2$hessian[1:3,1:3]+diag(1/tau^2,3))
ests.2<-var.2 %*% (fit.2$hessian[1:3,1:3] %*% fit.2$par[1:3])
sig.2<-exp(fit.2$par[4])
prec.2<-solve(var.2)
#Run MCMC
nsamp<-10000
nburn<-1000
nthin<-1
imod.tot<-c(0,0)
nr<-20
for(irep in 1:nr){
  ifix<-F
  imod<-1
  Res<-run.RJMCMC(nburn,nsamp,nthin)
  imod.tot<-imod.tot+tabulate(Res$model,nbins=2) #tabulate guards against runs visiting only one model
  print(c(irep,tabulate(Res$model,nbins=2)/nsamp))
}
#Overall estimate of model probabilities
imod.tot/(nr*nsamp)
@

With $\tau=4$, the sampler spent about 66\% of its time in model $M_1$, indicating that the posterior model probabilities are approximately
\[
\pi_n(M_1) \bumpeq 0.66 \qquad \pi_n(M_2) \bumpeq 0.34.
\]
The posterior model probabilities vary with the choice of $\tau$; this is to be expected, as they are determined by the marginal likelihoods (prior predictive densities) of the observed data, that is, the expectation of the likelihood with respect to the prior distribution. It is evident from this that the prior specification acts as a penalty for complexity. For illustration, if $\tau = 1$, the model probabilities change to $(0.43, 0.57)$; if $\tau = 10$, the model probabilities are approximately $(0.80, 0.20)$.

<<>>=
th1<-Res$th[Res$model==1,]
th2<-Res$th[Res$model==2,]
par(mar=c(4,4,0,1))
plot(th1,pch=19,cex=0.5,xlab=expression(theta[11]),ylab=expression(theta[12]))
pairs(th2,pch=19,cex=0.5,labels=c(expression(theta[21]),expression(theta[22]),expression(theta[23])))
@

Conditional on $M_1$ or $M_2$ being true, we can perform inference about the parameters of the two models, and also reconstruct estimates and posterior credible intervals for $\E[Y(t)]$.

<<>>=
xv<-seq(0,5,by=0.01)
ymat1<-matrix(0,nrow=nrow(th1),ncol=length(xv))
for(irep in 1:nrow(th1)){
  ymat1[irep,]<-funct.m1(xv,exp(th1[irep,]))
}
ymat2<-matrix(0,nrow=nrow(th2),ncol=length(xv))
for(irep in 1:nrow(th2)){
  ymat2[irep,]<-funct.m2(xv,exp(th2[irep,]))
}
yq1<-apply(ymat1,2,quantile,probs=c(0.025,0.50,0.975))
yq2<-apply(ymat2,2,quantile,probs=c(0.025,0.50,0.975))
par(mar=c(4,4,2,0))
plot(x,y,type="n",xlim=range(0,5),ylim=range(0,2.5),pch=19,xlab="t",ylab="y(t)")
polygon(c(xv,rev(xv)),c(yq1[1,],rev(yq1[3,])),col="lightgray")
lines(xv,yq1[2,],lwd=2)
points(x,y,pch=19)
abline(v=5,col="white")
title("Model 1")
par(mar=c(4,4,2,0))
plot(x,y,type="n",xlim=range(0,5),ylim=range(0,2.5),pch=19,xlab="t",ylab="y(t)")
polygon(c(xv,rev(xv)),c(yq2[1,],rev(yq2[3,])),col="lightgray")
lines(xv,yq2[2,],lwd=2)
points(x,y,pch=19)
abline(v=5,col="white")
title("Model 2")
boxplot(Res$sig^2~Res$model,names=c(expression(sigma[1]^2),expression(sigma[2]^2)))
@

\end{document}