\documentclass[notitlepage]{article}
\usepackage{../../Math598}
\usepackage{listings}
\usepackage{numprint}
\usepackage{enumerate}
\usepackage{multirow}
\usepackage{bbm}
\usepackage{bbold}
\usepackage{bm}
\usepackage{amsfonts,amsmath,amssymb}
\setlength{\parindent}{0pt}
\lstloadlanguages{R}
\definecolor{keywordcolor}{rgb}{0,0.6,0.6}
\definecolor{delimcolor}{rgb}{0.461,0.039,0.102}
\definecolor{Rcommentcolor}{rgb}{0.101,0.043,0.432}
\lstdefinestyle{Rsettings}{
basicstyle=\ttfamily,
breaklines=true,
showstringspaces=false,
keywords={if, else, function, theFunction, tmp},
otherkeywords={},
commentstyle=\itshape\color{Rcommentcolor},
keywordstyle=\color{keywordcolor},
moredelim=[s][\color{delimcolor}]{"}{"},
}
\lstset{basicstyle=\ttfamily, numbers=none, literate={~} {$\sim$}{2}}
\begin{document}

<<>>=
library(knitr)
# global chunk options
opts_chunk$set(cache=TRUE, autodep=TRUE)
options(scipen=999)
options(repos=c(CRAN="https://cloud.r-project.org/"))
inline_hook <- function (x) {
  if (is.numeric(x)) {
    # ifelse does a vectorized comparison:
    # if integer-valued, print without decimals; otherwise print six decimal places
    res <- ifelse(x == round(x),
      sprintf("%6i", x),
      sprintf("%.6f", x)
    )
    paste(res, collapse = ", ")
  }
}
knit_hooks$set(inline = inline_hook)
@

\begin{center}
\textsclarge{MATH 598: TOPICS IN STATISTICS}

\vspace{0.1 in}

\textsc{Model selection via Reversible Jump MCMC}
\end{center}

Consider two competing pharmacokinetic (PK) models for response data $y(t)$ measured at time points $t_1,\ldots,t_n$:
\begin{itemize}
\item[1.] \textbf{Model $\bm{M_1}$:} one compartment, elimination only;
\[
\E[Y(t)] = A_1 \exp\{ - \lambda_{1} t\} \qquad t \geq 0
\]
\item[2.] \textbf{Model $\bm{M_2}$:} one compartment, absorption and elimination;
\[
\E[Y(t)] = A_2 \left(\exp\{ - \lambda_{21} t\} - \exp\{ - (\lambda_{21}+\lambda_{22}) t\} \right)\qquad t \geq 0
\]
\end{itemize}
where $(A_1,\lambda_1)$ and $(A_2,\lambda_{21},\lambda_{22})$ are positive parameters. Under an assumption of additive, homoscedastic Normal errors, we have two competing explanations for the observed data; both models can be fitted by ordinary least squares.

\medskip

In the following example, we simulate data from Model $\bm{M_2}$, with parameters
\[
\theta_{21} = \log A_2 = 2 \qquad \theta_{22} = \log \lambda_{21} = -0.5 \qquad \theta_{23} = \log \lambda_{22} = -0.5
\]
and noise variance $\sigma^2 = 0.3^2$.
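\medskip

Writing $\mu_k(t;\theta_k)$ for the mean function of model $M_k$ above, the log-likelihood under this error model is
\[
\ell_n^{M_k}(\theta_k,\sigma) = -\frac{n}{2}\log (2\pi\sigma^2) - \frac{1}{2\sigma^2} \sum_{i=1}^{n} \left\{ y(t_i) - \mu_k(t_i;\theta_k) \right\}^2
\]
so that, for fixed $\sigma$, maximizing the likelihood over $\theta_k$ is equivalent to minimizing the residual sum of squares; this is the quantity evaluated (as a negative log-likelihood) by the \texttt{loglike.optim} function in the code below.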
<<>>=
#Function definitions
funct.m1<-function(xv,thv){
  return(thv[1]*exp(-thv[2]*xv))
}
funct.m2<-function(xv,thv){
  return(thv[1]*(exp(-thv[2]*xv)-exp(-(thv[2]+thv[3])*xv)))
}
loglike<-function(yv,xv,thv,sv,fv){
  return(sum(dnorm(yv,fv(xv,thv),sv,log=T)))
}
loglike.optim<-function(x,yv,xv,fv,np){
  thv<-exp(x[-np])
  sv<-exp(x[np])
  return(-loglike(yv,xv,thv,sv,fv))
}
run.RJMCMC<-function(nburn,nsamp,nthin){
  nits<-nburn+nsamp*nthin
  if(!ifix){
    imod<-sample(c(1,2),size=1)
  }
  #initialize the chain from the Normal approximation for the current model
  if(imod == 1){
    np<-2
    old.th<-rmvn(1,ests.1,var.1)
    old.sig<-sig.1
    old.like<-loglike(y,x,exp(old.th),old.sig,funct.m1)
    old.p<-sum(dnorm(old.th,0,tau,log=T))
    old.q<-dmvn(old.th,ests.1,var.1,log=T)
  }else{
    np<-3
    old.th<-rmvn(1,ests.2,var.2)
    old.sig<-sig.2
    old.like<-loglike(y,x,exp(old.th),old.sig,funct.m2)
    old.p<-sum(dnorm(old.th,0,tau,log=T))
    old.q<-dmvn(old.th,ests.2,var.2,log=T)
  }
  theta.samp<-matrix(0,nrow=nsamp,ncol=3)
  sigma.samp<-like.samp<-imod.samp<-rep(0,nsamp)
  ico<-0
  i1<-0;i2<-0
  hvec1<-hvec2<-rep(0,nits)
  for(iter in 1:nits){
    #choose a within-model or between-model move with equal probability
    if(imod == 1){
      imove<-sample(c(1,3),size=1,prob=c(0.5,0.5))
    }else{
      imove<-sample(c(2,4),size=1,prob=c(0.5,0.5))
    }
    if(imod == 1 & imove == 1){
      #move in Model 1
      #jmove=0: joint independence proposal; jmove=1,2: single-component random walk
      jmove<-sample(0:2,size=1)
      if(jmove == 0){
        new.th<-rmvn(1,ests.1,var.1)
        new.like<-loglike(y,x,exp(new.th),old.sig,funct.m1)
        new.p<-sum(dnorm(new.th,0,tau,log=T))
        new.q<-dmvn(new.th,ests.1,var.1,log=T)
        iacc<-log(runif(1)) < (new.like+new.p-new.q)-(old.like+old.p-old.q)
      }else if(jmove == 1){
        new.th<-old.th
        new.th[1]<-old.th[1]+rnorm(1)*0.1
        new.like<-loglike(y,x,exp(new.th),old.sig,funct.m1)
        new.p<-sum(dnorm(new.th,0,tau,log=T))
        new.q<-dmvn(new.th,ests.1,var.1,log=T)
        iacc<-log(runif(1)) < (new.like+new.p)-(old.like+old.p)
      }else if(jmove == 2){
        new.th<-old.th
        new.th[2]<-old.th[2]+rnorm(1)*0.1
        new.like<-loglike(y,x,exp(new.th),old.sig,funct.m1)
        new.p<-sum(dnorm(new.th,0,tau,log=T))
        new.q<-dmvn(new.th,ests.1,var.1,log=T)
        iacc<-log(runif(1)) < (new.like+new.p)-(old.like+old.p)
      }
      if(iacc){
        old.th<-new.th
        old.like<-new.like
        old.p<-new.p
        old.q<-new.q
      }
      ssq<-sum((y-funct.m1(x,exp(old.th)))^2)
      old.sig<-sqrt(1/rgamma(1,n/2+prior.al,ssq/2+prior.be))
      old.like<-loglike(y,x,exp(old.th),old.sig,funct.m1)
    }else if(imod == 2 & imove == 2){
      #Move in Model 2
      #jmove=0: joint independence proposal; otherwise single-component random walk
      jmove<-sample(0:3,size=1)
      if(jmove == 0){
        new.th<-rmvn(1,ests.2,var.2)
        new.like<-loglike(y,x,exp(new.th),old.sig,funct.m2)
        new.p<-sum(dnorm(new.th,0,tau,log=T))
        new.q<-dmvn(new.th,ests.2,var.2,log=T)
        iacc<-log(runif(1)) < (new.like+new.p-new.q)-(old.like+old.p-old.q)
      }else if(jmove == 1){
        new.th<-old.th
        new.th[1]<-old.th[1]+rnorm(1)*0.1
        new.like<-loglike(y,x,exp(new.th),old.sig,funct.m2)
        new.p<-sum(dnorm(new.th,0,tau,log=T))
        new.q<-dmvn(new.th,ests.2,var.2,log=T)
        iacc<-log(runif(1)) < (new.like+new.p)-(old.like+old.p)
      }else if(jmove == 2){
        new.th<-old.th
        new.th[2]<-old.th[2]+rnorm(1)*0.1
        new.like<-loglike(y,x,exp(new.th),old.sig,funct.m2)
        new.p<-sum(dnorm(new.th,0,tau,log=T))
        new.q<-dmvn(new.th,ests.2,var.2,log=T)
        iacc<-log(runif(1)) < (new.like+new.p)-(old.like+old.p)
      }else if(jmove == 3){
        new.th<-old.th
        new.th[3]<-old.th[3]+rnorm(1)*0.1
        new.like<-loglike(y,x,exp(new.th),old.sig,funct.m2)
        new.p<-sum(dnorm(new.th,0,tau,log=T))
        new.q<-dmvn(new.th,ests.2,var.2,log=T)
        iacc<-log(runif(1)) < (new.like+new.p)-(old.like+old.p)
      }
      if(iacc){
        old.th<-new.th
        old.like<-new.like
        old.p<-new.p
        old.q<-new.q
      }
      ssq<-sum((y-funct.m2(x,exp(old.th)))^2)
      old.sig<-sqrt(1/rgamma(1,n/2+prior.al,ssq/2+prior.be))
      old.like<-loglike(y,x,exp(old.th),old.sig,funct.m2)
    }
    if(!ifix){
      if(imod == 1 & imove == 3){
        #Move from Model 1 to Model 2
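        # Between-model move: propose all three components of theta_2 jointly from
        # the Normal approximation to the Model 2 posterior (an independence-type
        # proposal). The log acceptance ratio hval below is
        #   (log-likelihood + log-prior - log-proposal density) under Model 2
        # - (log-likelihood + log-prior - log-proposal density) under Model 1;
        # the Jacobian of the dimension-matching map is 1, so no extra term appears.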
        uvec<-rmvn(1,ests.2,var.2)
        new.th<-uvec
        new.like<-loglike(y,x,exp(new.th),old.sig,funct.m2)
        new.p<-sum(dnorm(new.th,0,tau,log=T))
        new.q<-dmvn(new.th,ests.2,var.2,log=T)
        hval<-(new.like+new.p-new.q)-(old.like+old.p-old.q)
        i1<-i1+1
        hvec1[i1]<-hval
        if(log(runif(1)) < hval){
          imod<-2
          np<-3
          old.th<-new.th
          old.like<-new.like
          old.p<-new.p
          old.q<-new.q
        }
      }else if(imod == 2 & imove == 4){
        #Move from Model 2 to Model 1
        vvec<-rmvn(1,ests.1,var.1)
        new.th<-vvec
        new.like<-loglike(y,x,exp(new.th),old.sig,funct.m1)
        new.p<-sum(dnorm(new.th,0,tau,log=T))
        new.q<-dmvn(new.th,ests.1,var.1,log=T)
        hval<-(new.like+new.p-new.q)-(old.like+old.p-old.q)
        i2<-i2+1
        hvec2[i2]<-hval
        if(log(runif(1)) < hval){
          imod<-1
          np<-2
          old.th<-new.th
          old.like<-new.like
          old.p<-new.p
          old.q<-new.q
        }
      }
    }
    #Gibbs update of the error standard deviation under the current model
    if(imod == 1){
      ssq<-sum((y-funct.m1(x,exp(old.th)))^2)
      old.sig<-sqrt(1/rgamma(1,n/2+prior.al,ssq/2+prior.be))
      old.like<-loglike(y,x,exp(old.th),old.sig,funct.m1)
    }else{
      ssq<-sum((y-funct.m2(x,exp(old.th)))^2)
      old.sig<-sqrt(1/rgamma(1,n/2+prior.al,ssq/2+prior.be))
      old.like<-loglike(y,x,exp(old.th),old.sig,funct.m2)
    }
    if(iter > nburn && iter %% nthin == 0){
      ico<-ico+1
      theta.samp[ico,1:np]<-old.th
      sigma.samp[ico]<-old.sig
      like.samp[ico]<-old.like
      imod.samp[ico]<-imod
    }
  }
  return(list(th=theta.samp,sig=sigma.samp,like=like.samp,model=imod.samp))
}
@

<<>>=
library(MASS)
library(mvnfast)
#Simulate data
set.seed(4263374)
n<-20
x<-sort(runif(n,0,5))
th<-exp(c(2,-0.5,-0.5))
y<-funct.m2(x,th)+rnorm(n,0,.3)
par(mar=c(4,4,2,0))
plot(x,y,xlim=range(0,5),ylim=range(0,2.5),pch=19,xlab="t",ylab="y(t)")
xval<-seq(0,5,by=0.01)
yval<-funct.m2(xval,th)
lines(xval,yval,col="red")
#ML fits of the two models (parameters and error sd on the log scale)
fit.1<-optim(c(0,0,0),fn=loglike.optim,yv=y,xv=x,
             fv=funct.m1,np=3,control=list(maxit=1000),hessian=T)
yval1<-funct.m1(xval,exp(fit.1$par[1:2]))
lines(xval,yval1,col="green")
fit.2<-optim(c(0,0,0,0),fn=loglike.optim,yv=y,xv=x,
             fv=funct.m2,np=4,control=list(maxit=10000),hessian=T)
yval2<-funct.m2(xval,exp(fit.2$par[1:3]))
lines(xval,yval2,col="blue")
@

We can compare the two models using the BIC: for $k=1,2$, we have
\[
\text{BIC}_k = -2 \log \mathcal{L}_n^{M_k}(\widehat \theta_k) + d_k \log n
\]
where $d_k$ is the total number of parameters for model $M_k$.

<<>>=
#Compute the BIC
BIC1<-2*fit.1$value+3*log(n)
BIC2<-2*fit.2$value+4*log(n)
c(BIC1,BIC2)
#Check using nls
PK.data<-data.frame(x,y)
AIC(nls(y~A*exp(-B*x),data=PK.data,start=list(A=1,B=0.5)),k=log(20))
AIC(nls(y~A*exp(-B*x)*(1-exp(-C*x)),data=PK.data,start=list(A=1,B=1,C=1)),k=log(20))
@

Under BIC, the evidence favours model $M_2$, for which $\textrm{BIC}_2 = \Sexpr{BIC2}$, over model $M_1$, for which $\textrm{BIC}_1 = \Sexpr{BIC1}$.

\bigskip

We now consider a fully Bayesian solution using reversible jump MCMC. We work in the log-scale parameterization
\[
\theta_1 = (\log A_1, \log \lambda_1) \qquad \qquad \theta_2 = (\log A_2, \log \lambda_{21}, \log \lambda_{22}).
\]
We place equal prior probabilities on $M_1$ and $M_2$, and then place independent $N(0,\tau^2)$ priors on the components of $\theta_1$ and $\theta_2$. The prior on the residual error variance $\sigma^2$ is Inverse Gamma with parameters 20 and 8.

\medskip

The ML estimates $\widehat{\theta}_1$ and $\widehat{\theta}_2$ can be computed easily, as can the Hessian matrices $\widehat{\mathbf{I}}_1$ and $\widehat{\mathbf{I}}_2$; these likelihood-based quantities yield reasonable Normal approximations to the conditional posterior distributions, which can be used to construct independence Metropolis--Hastings proposals.
Specifically, at the ML estimates of $\sigma$ under the two models, we may approximate the conditional posterior for $\theta_k$ by the Normal density
\begin{equation}
\pi_n^{M_k}(\theta_k|\widehat \sigma_k,\by) \bumpeq \text{Normal}(\widehat \theta_k, n^{-1} \widehat \sigma_k^2 \widehat{\mathbf{I}}_k^{-1}).
\label{eq:AsympNormal}
\end{equation}
We may also introduce the prior information and use the conjugate analysis to return an approximate Normal conditional posterior distribution. Finally, on fitting by ML, the estimates of $\sigma$ under the two models are found to be reasonably similar ($M_1: \widehat \sigma_1 = 0.252$, $M_2: \widehat \sigma_2 = 0.329$).

\bigskip

\textbf{Reversible Jump MCMC:} A reversible jump MCMC algorithm can be constructed as follows: we again consider four move types:
\begin{enumerate}
\item $m=1$: move \textbf{within} $M_1$; update $\theta_1$ from $\pi_n^{M_1}(\theta_1|M_1, \sigma)$.
\item $m=2$: move \textbf{within} $M_2$; update $\theta_2$ from $\pi_n^{M_2}(\theta_2|M_2, \sigma)$.
\item $m=3$: move \textbf{from} $M_1$ \textbf{to} $M_2$; propose a new $\theta_2$, and carry out an accept/reject step.
\item $m=4$: move \textbf{from} $M_2$ \textbf{to} $M_1$; propose a new $\theta_1$, and carry out an accept/reject step.
\end{enumerate}
with the remaining parameter $\sigma^2$ updated in a Gibbs step at each iteration.

\medskip

\begin{itemizeWide}
\item For moves $m=1,2$, we use standard Metropolis--Hastings steps, either jointly on the whole parameter vector, or for each of the parameters individually. The asymptotic approximations in \eqref{eq:AsympNormal} can be used as proposal distributions.

\item Moves $m=3,4$ are a forward/reverse move pair. For move 3, several options are available; for example, we could adopt the earlier strategy, generate a new variate $u$ from the prior for the additional parameter, and then merely use the mapping
\[
(\theta_{11},\theta_{12},u) \longmapsto (\theta_{21}=\theta_{11},\theta_{22}=\theta_{12},\theta_{23}=u)
\]
with the reverse move setting $u = \theta_{23}$.

\medskip

This approach may be adequate, but would probably not facilitate good mixing across the models. A better strategy is to consider a different augmentation, in which we generate $\bu = (u_1,u_2,u_3)$ from the approximation in \eqref{eq:AsympNormal} for $k=2$, and map $(\theta_{11},\theta_{12},u_1,u_2,u_3)$ to
\[
(\theta_{21}=u_1,\theta_{22}=u_2,\theta_{23}=u_3,v_1=\theta_{11},v_2=\theta_{12})
\]
with the paired reverse move being to generate $\bv = (v_1,v_2)$ from the approximation in \eqref{eq:AsympNormal} for $k=1$.

\medskip

This guarantees that the proposed value $\theta_2$ lies in a region with reasonably high posterior support under model $M_2$, although it does not guarantee that the move will be accepted with high probability.

\medskip

In the Hastings ratio, the Jacobian of the transformation is 1, and under equal probabilities of forward/reverse moves, we have that
\[
\frac{\pi_n(M_2,\theta_2)p_V(v_1,v_2)}{\pi_n(M_1,\theta_1)p_U(u_1,u_2,u_3)}
\]
can be written
\[
\dfrac{\mathcal{L}_n^{M_2}(\theta_2) \left\{\prod\limits_{j=1}^3 \phi(\theta_{2j}/\tau)/\tau\right\} \phi_2(\theta_{11},\theta_{12}; \widehat\theta_1, \widehat {\mathbf{I}}_1^{-1})}
{\mathcal{L}_n^{M_1}(\theta_1) \left\{\prod\limits_{j=1}^2 \phi(\theta_{1j}/\tau)/\tau\right\} \phi_3(\theta_{21},\theta_{22},\theta_{23}; \widehat\theta_2, \widehat {\mathbf{I}}_2^{-1})}
\]
where $\tau$ is the prior standard deviation of the $\theta$ parameters, $\phi$ is the standard Normal density, and $\phi_d(\cdot\,;m,V)$ denotes the $d$-dimensional Normal density with mean $m$ and covariance $V$. A short code fragment following this list shows how this ratio is computed in the sampler.

\medskip

The logic of this construction is that numerically
\[
\mathcal{L}_n^{M_1}(\theta_1) \bumpeq \text{Normal}_2(\widehat\theta_1, \widehat {\mathbf{I}}_1^{-1}) \qquad \mathcal{L}_n^{M_2}(\theta_2) \bumpeq \text{Normal}_3(\widehat\theta_2, \widehat {\mathbf{I}}_2^{-1}).
\]
Note that if the prior information is also incorporated, the asymptotic approximation to the posterior may be improved further.
\end{itemizeWide}
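\medskip

On the log scale, the Hastings ratio above is exactly the quantity \texttt{hval} computed inside \texttt{run.RJMCMC} when a switch from $M_1$ to $M_2$ is proposed. The following fragment (shown for illustration only and not evaluated; it re-uses the data \texttt{y}, \texttt{x}, the prior standard deviation \texttt{tau}, the Normal approximation \texttt{ests.2}, \texttt{var.2}, and the sampler's current state \texttt{old.th}, \texttt{old.sig}, \texttt{old.like}, \texttt{old.p}, \texttt{old.q}) makes the correspondence with the formula explicit:

<<eval=FALSE>>=
new.th   <- rmvn(1,ests.2,var.2)                       # draw (u1,u2,u3) = theta_2 from phi_3
new.like <- loglike(y,x,exp(new.th),old.sig,funct.m2)  # log L_n^{M2}(theta_2)
new.p    <- sum(dnorm(new.th,0,tau,log=T))             # log prior for theta_2
new.q    <- dmvn(new.th,ests.2,var.2,log=T)            # log phi_3 evaluated at theta_2
hval     <- (new.like+new.p-new.q)-(old.like+old.p-old.q)
accept   <- log(runif(1)) < hval                       # Jacobian contributes log(1) = 0
@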
The sampler was run 20 times; each run used 1000 burn-in iterations followed by 10000 sampling iterations (with no thinning), so that 200000 posterior samples were collected in total.

<<>>=
#Prior
tau<-4
prior.al<-20
prior.be<-8
#Normal approximations to the conditional posterior distributions
var.1 <-solve(fit.1$hessian[1:2,1:2]+diag(1/tau^2,2))
ests.1<-var.1 %*% (fit.1$hessian[1:2,1:2] %*% fit.1$par[1:2])
sig.1<-exp(fit.1$par[3])
prec.1<-solve(var.1)
var.2<-solve(fit.2$hessian[1:3,1:3]+diag(1/tau^2,3))
ests.2<-var.2 %*% (fit.2$hessian[1:3,1:3] %*% fit.2$par[1:3])
sig.2<-exp(fit.2$par[4])
prec.2<-solve(var.2)
#Run MCMC
nsamp<-10000
nburn<-1000
nthin<-1
imod.tot<-c(0,0)
nr<-20
for(irep in 1:nr){
  ifix<-F
  imod<-1
  Res<-run.RJMCMC(nburn,nsamp,nthin)
  imod.tot<-imod.tot+tabulate(Res$model,nbins=2) #tabulate guards against runs visiting only one model
  print(c(irep,tabulate(Res$model,nbins=2)/nsamp))
}
#Overall estimate of model probabilities
imod.tot/(nr*nsamp)
@

With $\tau=4$, the sampler spent about 66\% of its time in model $M_1$, indicating that the posterior model probabilities are approximately
\[
\pi_n(M_1) \bumpeq 0.66 \qquad \pi_n(M_2) \bumpeq 0.34.
\]
The posterior model probabilities vary with the choice of $\tau$; this is to be expected, as they are determined by the marginal likelihoods (prior predictive densities) of the observed data, that is, the expectation of the likelihood with respect to the prior distribution. It is evident from this that the prior specification acts as a penalty for complexity. For illustration, if $\tau = 1$, the model probabilities change to $(0.43, 0.57)$; if $\tau = 10$, the model probabilities are approximately $(0.80, 0.20)$.

<<>>=
th1<-Res$th[Res$model==1,]
th2<-Res$th[Res$model==2,]
par(mar=c(4,4,0,1))
plot(th1,pch=19,cex=0.5,xlab=expression(theta[11]),ylab=expression(theta[12]))
pairs(th2,pch=19,cex=0.5,labels=c(expression(theta[21]),expression(theta[22]),expression(theta[23])))
@

Conditional on $M_1$ or $M_2$ being true, we can perform inference about the parameters of the two models, and also reconstruct estimates and posterior credible intervals for $\E[Y(t)]$.

<<>>=
xv<-seq(0,5,by=0.01)
ymat1<-matrix(0,nrow=nrow(th1),ncol=length(xv))
for(irep in 1:nrow(th1)){
  ymat1[irep,]<-funct.m1(xv,exp(th1[irep,]))
}
ymat2<-matrix(0,nrow=nrow(th2),ncol=length(xv))
for(irep in 1:nrow(th2)){
  ymat2[irep,]<-funct.m2(xv,exp(th2[irep,]))
}
yq1<-apply(ymat1,2,quantile,probs=c(0.025,0.50,0.975))
yq2<-apply(ymat2,2,quantile,probs=c(0.025,0.50,0.975))
par(mar=c(4,4,2,0))
plot(x,y,type="n",xlim=range(0,5),ylim=range(0,2.5),pch=19,xlab="t",ylab="y(t)")
polygon(c(xv,rev(xv)),c(yq1[1,],rev(yq1[3,])),col="lightgray")
lines(xv,yq1[2,],lwd=2)
points(x,y,pch=19)
abline(v=5,col="white")
title("Model 1")
par(mar=c(4,4,2,0))
plot(x,y,type="n",xlim=range(0,5),ylim=range(0,2.5),pch=19,xlab="t",ylab="y(t)")
polygon(c(xv,rev(xv)),c(yq2[1,],rev(yq2[3,])),col="lightgray")
lines(xv,yq2[2,],lwd=2)
points(x,y,pch=19)
abline(v=5,col="white")
title("Model 2")
boxplot(Res$sig^2~Res$model,names=c(expression(sigma[1]^2),expression(sigma[2]^2)))
@

\end{document}