\documentclass[notitlepage]{article}
\usepackage{../../Math598}
\usepackage{listings}
\usepackage{numprint}
\usepackage{enumerate}
\usepackage{multirow}
\usepackage{bbm}
\usepackage{bbold}
\usepackage{amsfonts,amsmath,amssymb}
\setlength{\parindent}{0pt}
\lstloadlanguages{R}
\definecolor{keywordcolor}{rgb}{0,0.6,0.6}
\definecolor{delimcolor}{rgb}{0.461,0.039,0.102}
\definecolor{Rcommentcolor}{rgb}{0.101,0.043,0.432}
\lstdefinestyle{Rsettings}{
  basicstyle=\ttfamily,
  breaklines=true,
  showstringspaces=false,
  keywords={if, else, function, theFunction, tmp}, % R keywords to highlight
  otherkeywords={},
  commentstyle=\itshape\color{Rcommentcolor},
  keywordstyle=\color{keywordcolor},
  moredelim=[s][\color{delimcolor}]{"}{"},
}
\lstset{basicstyle=\ttfamily, numbers=none, literate={~} {$\sim$}{2}}
\begin{document}

<<setup>>=
library(knitr)
# global chunk options
opts_chunk$set(cache=TRUE, autodep=TRUE)
options(scipen=999)
options(repos=c(CRAN="https://cloud.r-project.org/"))
inline_hook <- function(x) {
  if (is.numeric(x)) {
    # vectorized comparison: integer-valued entries print without decimals,
    # others to six decimal places (round() keeps %d valid for doubles)
    res <- ifelse(x == round(x), sprintf("%6d", round(x)), sprintf("%.6f", x))
    paste(res, collapse = ", ")
  } else {
    x
  }
}
knit_hooks$set(inline = inline_hook)
@

\begin{center}
\textsclarge{MATH 598: TOPICS IN STATISTICS}

\vspace{0.1 in}

\textsc{MCMC for hierarchical non-linear regression models}
\end{center}

\textbf{Non-linear regression:} We consider non-linear regression data from $m$ individuals. Suppose that the mean model for individual $i$ is
\[
\mu(x;\beta_{i0},\beta_{i1},\beta_{i2}) = e^{\beta_{i2}} \text{expit}\{\beta_{i0}+\beta_{i1} x\}
\]
where $\text{expit}(x) = e^x/(1+e^x)$, and the parameters $(\beta_{i0},\beta_{i1},\beta_{i2})$ are allowed to differ across individuals, necessitating a hierarchical model: if $i=1,\ldots,m$ indexes individuals in the study, and $j=1,\ldots,n$ indexes the observations made on each individual, then
\begin{itemizeWide}
\item \textrm{STAGE 1:} \textbf{Observed data:}
\[
Y_{ij} \sim {Normal}(\mu(x_{ij};\beta_{i0},\beta_{i1},\beta_{i2}),\sigma_i^2) \qquad i=1,\ldots,m, \; j=1,\ldots,n ~\text{independently.}
\]
\item \textrm{STAGE 2:} \textbf{Population model:} for $i=1,\ldots,m$ independently, assume that
\[
(\beta_{i0},\beta_{i1},\beta_{i2}) \sim {Normal}_3(\mu, \Sigma) \qquad \qquad \sigma_i^2 \sim InverseGamma(a/2,b/2)
\]
where $\mu \in \R^3$, $\Sigma$ is a $3 \times 3$ positive definite matrix, and $a,b > 0$ are fixed scalars.
\item \textrm{STAGE 3:} \textbf{Prior model:} the prior model is the \textbf{Normal-Inverse Wishart} model
\[
\Sigma \sim InverseWishart(\nu,\Psi) \qquad \qquad \mu|\Sigma \sim Normal_3(\eta,\Sigma/\lambda)
\]
where $\eta \in \R^3$, $\lambda > 0$ and $\nu > 2$ are scalars, and $\Psi$ is a $3 \times 3$ positive definite matrix.
\end{itemizeWide}

\medskip
\textbf{MCMC:} We now formulate a Gibbs sampler strategy:
\begin{enumerate}[1.]
\item For the \textbf{first stage} parameters conditional on the second stage parameters, the full conditional posteriors exhibit a conditionally independent structure for $i=1,\ldots,m$. However, because the mean function is non-linear in the parameters, the analysis is not the same as in the linear case, and we use a Metropolis-within-Gibbs structure.
If $\by_i$ is the $n \times 1$ vector of response data for the $i$th individual, we have
\begin{itemizeWide}
\item $\beta_i | \sigma_i^2,\mu, \Sigma$ : the full conditional posterior is proportional to
\[
\exp\left\{-\frac{1}{2 \sigma_i^2} (\by_i - \mu(x_{i};\beta_{i0},\beta_{i1},\beta_{i2}))^\top (\by_i-\mu(x_{i};\beta_{i0},\beta_{i1},\beta_{i2})) \right\} \exp\left\{-\frac{1}{2} (\beta_i - \mu)^\top \Sigma^{-1} (\beta_i - \mu) \right\}
\]
where $\mu(x_{i};\beta_{i0},\beta_{i1},\beta_{i2})$ is the $n \times 1$ vector of means for individual $i$. This full conditional posterior is not available in closed form, so we must use the Metropolis-Hastings algorithm: this can be achieved using a block update of all the parameters for an individual together, or using a Metropolis-within-Gibbs approach where each parameter is updated separately conditional on the others.
\item $\sigma_i^2 | \beta_i,\mu, \Sigma$ : the full conditional posterior is proportional to
\[
\left(\frac{1}{\sigma_i^2} \right)^{n/2} \exp\left\{-\frac{1}{2 \sigma_i^2} (\by_i - \mu(x_{i};\beta_{i0},\beta_{i1},\beta_{i2}))^\top (\by_i-\mu(x_{i};\beta_{i0},\beta_{i1},\beta_{i2})) \right\} \left(\frac{1}{\sigma_i^2} \right)^{a/2+1} \exp\left\{-\frac{b}{2\sigma_i^2} \right\}
\]
and by inspection we see that
\[
\pi_n(\sigma_i^2|\beta_i, \mu, \Sigma) \equiv InverseGamma(a_{n}/2,b_{ni}/2)
\]
where
\[
a_n = n + a \qquad b_{ni} = (\by_i - \mu(x_{i};\beta_{i0},\beta_{i1},\beta_{i2}))^\top (\by_i-\mu(x_{i};\beta_{i0},\beta_{i1},\beta_{i2})) + b.
\]
\end{itemizeWide}
The second of these full conditional distributions can be sampled directly using the \texttt{rgamma} function (after reciprocation); the first requires a Metropolis-Hastings step, sketched after this list.
\item For the \textbf{second stage} parameters conditional on the first stage parameters, if $\beta_i = (\beta_{i0},\beta_{i1},\beta_{i2})$ are regarded as known quantities, then independently for $i=1,\ldots,m$
\[
\beta_i | \mu, \Sigma \sim Normal_3(\mu, \Sigma).
\]
We treat the $\beta_i$ vectors as pseudo-data, and perform inference for the second stage ``population'' parameters. Here, the $NIW(\eta,\lambda,\nu,\Psi)$ prior is conjugate to the corresponding second-stage ``likelihood'', and it follows that the posterior distribution for $\mu$ and $\Sigma$ is $NIW(\eta_n,\lambda_n,\nu_n,\Psi_n)$ where
\[
\eta_n = \frac{m \overline{\beta} + \lambda \eta}{m+\lambda} \qquad \lambda_n = m + \lambda \qquad \nu_n = m + \nu
\]
and
\[
\Psi_n = \sum_{i=1}^m (\beta_i - \overline{\beta}) (\beta_i - \overline{\beta})^\top + \frac{m\lambda}{m+\lambda}(\overline{\beta}- \eta) (\overline{\beta} - \eta)^\top + \Psi \qquad \qquad \overline{\beta} = \frac{1}{m} \sum_{i=1}^m \beta_i
\]
where $\eta,\lambda,\nu$ and $\Psi$ are the hyperparameters. Then we have the results
\[
\pi_n(\mu|\Sigma,-) \equiv Normal_3(\eta_n,\Sigma/\lambda_n) \qquad \qquad \pi_n(\Sigma|-) \equiv InverseWishart(\nu_n, \Psi_n).
\]
These two full conditional distributions can be sampled directly using \texttt{rmvn} from the \texttt{mvnfast} library, and the \texttt{riwish} function from the \texttt{MCMCpack} library; a sketch follows this list.
\end{enumerate}
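To make the Metropolis-within-Gibbs step explicit, the following sketch (not evaluated here; the full sampler appears below) performs a random-walk update of a single component $k$ of $\beta_i$. It assumes that the data \texttt{y} and \texttt{x}, the mean function \texttt{mu.func}, the current state variables \texttt{old.beta}, \texttt{old.sigsq}, \texttt{old.ssq}, \texttt{old.prior}, \texttt{old.mu}, \texttt{old.Sigma}, and the proposal scales \texttt{mh.sig} are in scope, as in the implementation that follows.

<<mh-sketch,eval=FALSE>>=
# Sketch: random-walk Metropolis update of component k of beta_i, targeting
# the full conditional pi(beta_i | sigma_i^2, mu, Sigma)
new.beta<-old.beta[i,]
new.beta[k]<-old.beta[i,k]+rnorm(1)*mh.sig[k]   # Normal random-walk proposal
new.ssq<-sum((y[i,]-mu.func(x,new.beta[1],new.beta[2],new.beta[3]))^2)
new.prior<-dmvn(new.beta,old.mu,old.Sigma,log=TRUE)
# log acceptance probability: change in log-likelihood plus change in log-prior
log.alpha<-(-0.5*new.ssq/old.sigsq[i]+new.prior)-
  (-0.5*old.ssq[i]/old.sigsq[i]+old.prior[i])
if(log(runif(1)) < log.alpha){
  old.beta[i,k]<-new.beta[k]   # accept; otherwise the chain stays put
  old.ssq[i]<-new.ssq
  old.prior[i]<-new.prior
}
@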
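Similarly, the second stage update reduces to a direct draw from the $NIW(\eta_n,\lambda_n,\nu_n,\Psi_n)$ posterior given the current $\beta_i$ values. A minimal sketch (again not evaluated here), assuming that the $m \times 3$ matrix \texttt{old.beta} of current values, the count \texttt{m}, and the hyperparameters \texttt{eta}, \texttt{lambda}, \texttt{nu} and \texttt{Psi} are in scope:

<<niw-sketch,eval=FALSE>>=
library(mvnfast)    # rmvn: multivariate Normal sampling
library(MCMCpack)   # riwish: inverse Wishart sampling
beta.bar<-colMeans(old.beta)
eta.n<-(m*beta.bar+lambda*eta)/(m+lambda)
lambda.n<-lambda+m
nu.n<-nu+m
# var(old.beta)*(m-1) is the sum of squares about beta.bar
Psi.n<-Psi+var(old.beta)*(m-1)+
  ((m*lambda)/(m+lambda))*tcrossprod(beta.bar-eta)
old.Sigma<-riwish(nu.n,Psi.n)                 # Sigma | beta ~ InverseWishart
old.mu<-c(rmvn(1,eta.n,old.Sigma/lambda.n))   # mu | Sigma, beta ~ Normal
@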
We examine the performance of the algorithm for the following simulated data.

<<simdata>>=
set.seed(2300)
# Simulate hierarchical non-linear regression data
library(mvnfast)
m<-20
mu<-c(1.5,2.0,4)
# Symmetric correlation matrix for the individual-level parameters
Cor.mat<-matrix(c(1,0.75,0.25,0.75,1,0.25,0.25,0.25,1),3,3)
Sigma<-2*diag(c(1,0.05,0.2))%*%(Cor.mat%*%diag(c(1,0.05,0.2)))
be<-rmvn(m,mu,Sigma)          # individual-level regression parameters
ysd<-sqrt(1/rgamma(m,4,4))    # individual-level error standard deviations
expit<-function(x){return(exp(x)/(1+exp(x)))}
mu.func<-function(x,b0,b1,b2){
  return(exp(b2)*expit(b0+b1*x))
}
xv<-seq(-5,5,by=0.01)
x<-seq(-5,5.0,by=0.5)
n<-length(x)
par(mar=c(4,4,1,0))
plot(xv,xv,type='n',ylim=range(-0.25,100),xlab='x',ylab='y')
y<-matrix(0,nrow=m,ncol=n)
for(i in 1:m){
  # plot each individual mean curve and generate the responses
  muv<-mu.func(x,be[i,1],be[i,2],be[i,3])
  lines(xv,mu.func(xv,be[i,1],be[i,2],be[i,3]))
  y[i,]<-muv+rnorm(n)*ysd[i]
}
@

The Gibbs sampler proceeds as follows.

<<gibbsfit>>=
library(MCMCpack)
# Hyperparameters
a<-b<-1
eta<-c(0,0,0)
lambda<-0.1
nu<-4
Psi<-nu*diag(rep(1,3))
# Starting values via non-linear least squares
old.beta<-matrix(0,nrow=m,ncol=3)
old.sigsq<-old.ssq<-old.prior<-rep(0,m)
for(i in 1:m){
  yi<-y[i,]
  fit0<-nls(yi~exp(b2+b0+b1*x)/(1+exp(b0+b1*x)),start=list(b0=0.5,b1=1,b2=4))
  old.beta[i,]<-coef(fit0)
  old.sigsq[i]<-summary(fit0)$sigma^2
}
old.mu<-apply(old.beta,2,mean)
old.Sigma<-var(old.beta)
for(i in 1:m){
  old.ssq[i]<-sum((y[i,]-mu.func(x,old.beta[i,1],old.beta[i,2],old.beta[i,3]))^2)
  old.prior[i]<-dmvn(old.beta[i,],old.mu,old.Sigma,log=TRUE)
}
nsamp<-2000; nburn<-50; nthin<-1
nits<-nburn+nsamp*nthin
ico<-0
beta.samp<-array(0,c(nsamp,m,3))
sigsq.samp<-matrix(0,nrow=nsamp,ncol=m)
mu.samp<-matrix(0,nrow=nsamp,ncol=3)
Sigma.samp<-array(0,c(nsamp,3,3))
mh.sig<-c(0.05,0.05,0.005)   # Metropolis-Hastings proposal scales
for(iter in 1:nits){
  # Update the betas, one component at a time, by Metropolis-within-Gibbs
  new.ssq<-new.prior<-rep(0,m)
  for(i in 1:m){
    for(k in 1:3){
      new.beta<-old.beta[i,]
      new.beta[k]<-old.beta[i,k]+rnorm(1)*mh.sig[k]
      new.ssq[i]<-sum((y[i,]-mu.func(x,new.beta[1],new.beta[2],new.beta[3]))^2)
      new.prior[i]<-dmvn(new.beta,old.mu,old.Sigma,log=TRUE)
      if(log(runif(1)) < -0.5*new.ssq[i]/old.sigsq[i]+new.prior[i]+
                          0.5*old.ssq[i]/old.sigsq[i]-old.prior[i]){
        old.beta[i,k]<-new.beta[k]
        old.ssq[i]<-new.ssq[i]
        old.prior[i]<-new.prior[i]
      }
    }
  }
  # Update the sigmas: InverseGamma(a.n/2, b.n/2) with a.n = a + n
  a.n<-a+n
  b.n<-old.ssq+b
  old.sigsq<-1/rgamma(m,a.n/2,b.n/2)
  # Update Sigma: InverseWishart(nu.n, Psi.n)
  nu.n<-nu+m
  beta.bar<-apply(old.beta,2,mean)
  Psi.n<-Psi+((m*lambda)/(m+lambda))*((beta.bar-eta)%*%t(beta.bar-eta))+
    var(old.beta)*(m-1)
  old.Sigma<-riwish(nu.n,Psi.n)
  # Update mu given Sigma: Normal(eta.n, Sigma/lambda.n)
  lambda.n<-lambda+m
  eta.n<-(m*beta.bar+lambda*eta)/(m+lambda)
  old.mu<-c(rmvn(1,eta.n,old.Sigma/lambda.n))
  if(iter > nburn & iter %% nthin == 0){
    ico<-ico+1
    beta.samp[ico,,]<-old.beta
    sigsq.samp[ico,]<-old.sigsq
    mu.samp[ico,]<-old.mu
    Sigma.samp[ico,,]<-old.Sigma
  }
  # Recompute stored quantities under the new mu and Sigma
  for(i in 1:m){
    old.ssq[i]<-sum((y[i,]-mu.func(x,old.beta[i,1],old.beta[i,2],old.beta[i,3]))^2)
    old.prior[i]<-dmvn(old.beta[i,],old.mu,old.Sigma,log=TRUE)
  }
}
@

<<traceplot-mu>>=
par(mar=c(4,4,2,1),mfrow=c(1,1))
plot(mu.samp[,1],type='l',ylim=range(-3,5),ylab='',col='green')
lines(mu.samp[,2],col='blue')
lines(mu.samp[,3],col='cyan')
title(expression(paste('Trace plots of ',mu,' parameters')),line=1)
legend(1500,0,c(expression(mu[1]),expression(mu[2]),expression(mu[3])),
  col=c('green','blue','cyan'),lty=1)
@

<<hist-mu>>=
par(mar=c(4,4,2,1),mfrow=c(2,2))
hist(mu.samp[,1],col='green',breaks=seq(0,3,by=0.1),ylim=range(0,800),
  xlab=' ',main=expression(mu[1]));box()
abline(v=mu[1],col='red')   # True value of mu1
hist(mu.samp[,2],col='blue',breaks=seq(0,3,by=0.1),ylim=range(0,800),
  xlab=' ',main=expression(mu[2]));box()
abline(v=mu[2],col='red')   # True value of mu2
hist(mu.samp[,3],col='cyan',breaks=seq(2,5,by=0.1),ylim=range(0,800),
  xlab=' ',main=expression(mu[3]));box()
abline(v=mu[3],col='red')   # True value of mu3
@
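The graphical summaries can be supplemented by comparing posterior summaries of the population parameters with the values used to simulate the data:

<<posterior-summary>>=
# Posterior means and SDs of mu against the true simulation values
data.frame(true=mu,post.mean=apply(mu.samp,2,mean),post.sd=apply(mu.samp,2,sd))
# Posterior mean of Sigma against the true Sigma
round(apply(Sigma.samp,c(2,3),mean),3)
round(Sigma,3)
@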
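The proposal scales \texttt{mh.sig} should give moderate acceptance rates. Since the acceptance decisions themselves are not stored, an empirical rate can be recovered from the saved chains as the proportion of successive draws that differ (exact here as \texttt{nthin} is 1):

<<mh-acceptance>>=
# Empirical acceptance rate for each individual/component Metropolis step,
# computed as the proportion of iterations at which the stored value moved
acc<-apply(beta.samp,c(2,3),function(b){mean(diff(b)!=0)})
round(apply(acc,2,range),2)   # min and max across individuals, per component
@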
<<traceplot-Sigma>>=
par(mar=c(4,4,2,1),mfrow=c(3,3))
plot(Sigma.samp[,1,1],type='l',ylim=range(-1,5),ylab='')
title(expression(Sigma[11]))
plot(Sigma.samp[,1,2],type='l',ylim=range(-1,5),ylab='')
title(expression(Sigma[12]))
plot(Sigma.samp[,1,3],type='l',ylim=range(-1,5),ylab='')
title(expression(Sigma[13]))
plot.new()
plot(Sigma.samp[,2,2],type='l',ylim=range(-1,5),ylab='')
title(expression(Sigma[22]))
plot(Sigma.samp[,2,3],type='l',ylim=range(-1,5),ylab='')
title(expression(Sigma[23]))
plot.new()
plot.new()
plot(Sigma.samp[,3,3],type='l',ylim=range(-1,4),ylab='')
title(expression(Sigma[33]))
@

<<boxplots-beta>>=
par(mar=c(4,4,2,0))
boxplot(beta.samp[,,1],pch=19,cex=0.25,cex.axis=0.8)
points(1:m,be[,1],pch=19,col='red')   # true values
title(expression(paste('Boxplots of ',beta[i0],' parameters')))
boxplot(beta.samp[,,2],pch=19,cex=0.25,cex.axis=0.8)
points(1:m,be[,2],pch=19,col='red')   # true values
title(expression(paste('Boxplots of ',beta[i1],' parameters')))
boxplot(beta.samp[,,3],pch=19,cex=0.25,cex.axis=0.8)
points(1:m,be[,3],pch=19,col='red')   # true values
title(expression(paste('Boxplots of ',beta[i2],' parameters')))
@

<<traceplot-beta1>>=
par(mar=c(4,4,2,0),mfrow=c(2,2))
plot(beta.samp[,1,1],type='l',ylab='')
title(expression(beta[10]),line=1)
plot(beta.samp[,1,2],type='l',ylab='')
title(expression(beta[11]),line=1)
plot(beta.samp[,1,3],type='l',ylab='')
title(expression(beta[12]),line=1)
@

<<pairs-beta1>>=
par(mar=c(4,4,2,0))
pairs(beta.samp[,1,],pch=19,cex=0.75,
  labels=c(expression(beta[10]),expression(beta[11]),expression(beta[12])))
@

\end{document}