\documentclass[notitlepage]{article}
\usepackage{../../Math598}
\usepackage{listings}
\usepackage{numprint}
\usepackage{enumerate}
\usepackage{multirow}
\usepackage{bbm}
\usepackage{bbold}
\usepackage{amsfonts,amsmath,amssymb}
\setlength{\parindent}{0pt}
\lstloadlanguages{R}
\definecolor{keywordcolor}{rgb}{0,0.6,0.6}
\definecolor{delimcolor}{rgb}{0.461,0.039,0.102}
\definecolor{Rcommentcolor}{rgb}{0.101,0.043,0.432}
\lstdefinestyle{Rsettings}{
    basicstyle=\ttfamily,
    breaklines=true,
    showstringspaces=false,
    keywords={if, else, function, theFunction, tmp}, % R keywords to highlight
    otherkeywords={},
    commentstyle=\itshape\color{Rcommentcolor},
    keywordstyle=\color{keywordcolor},
    moredelim=[s][\color{delimcolor}]{"}{"},
}
\lstset{basicstyle=\ttfamily, numbers=none, literate={~} {$\sim$}{2}}
\begin{document}

<<>>=
library(knitr)
# global chunk options
opts_chunk$set(cache=TRUE, autodep=TRUE)
options(scipen=999)
options(repos=c(CRAN="https://cloud.r-project.org/"))
inline_hook <- function (x) {
  if (is.numeric(x)) {
    # ifelse does a vectorized comparison
    # If integer, print without decimal; otherwise print six decimal places
    res <- ifelse(x == round(x),
                  sprintf("%6d", round(x)),
                  sprintf("%.6f", x)
                  )
    paste(res, collapse = ", ")
  } else {
    x  # return non-numeric inline values unchanged
  }
}
knit_hooks$set(inline = inline_hook)
@

\begin{center}
\textsclarge{MATH 598: TOPICS IN STATISTICS}

\vspace{0.1 in}

\textsc{MCMC for hierarchical regression models}
\end{center}

\textbf{Simple linear regression:} We consider linear regression data from $m$ individuals, where the simple linear regression parameters $(\beta_0,\beta_1)$ are allowed to differ across individuals, thereby necessitating a hierarchical model: if $i=1,\ldots,m$ indexes individuals in the study, and $j=1,\ldots,n$ indexes the observations made on each individual, then
\begin{itemizeWide}
\item \textrm{STAGE 1:} \textbf{Observed data:}
\[
Y_{ij} \sim {Normal}(\beta_{i0}+\beta_{i1} x_{ij},\sigma_i^2) \qquad i=1,\ldots,m, \ j=1,\ldots,n ~\text{independently}
\]
\item \textrm{STAGE 2:} \textbf{Population model:} for $i=1,\ldots,m$ independently, assume that
\[
(\beta_{i0},\beta_{i1}) \sim {Normal}_2(\mu, \Sigma) \qquad \qquad \sigma_i^2 \sim InverseGamma(a/2,b/2)
\]
where $\mu \in \R^2$, $\Sigma$ is a $2 \times 2$ positive definite matrix, and $a,b > 0$ are fixed scalars. Notice that this is not an identical specification to that used in the non-hierarchical version of the model.
\item \textrm{STAGE 3:} \textbf{Prior model:} the prior model in this case is the \textbf{Normal-Inverse Wishart} model
\[
\Sigma \sim InverseWishart(\nu,\Psi) \qquad \qquad \mu|\Sigma \sim Normal_2(\eta,\Sigma/\lambda)
\]
where $\eta \in \R^2$, $\nu > 1$ and $\lambda > 0$ are scalars, and $\Psi$ is a $2 \times 2$ positive definite matrix.
\end{itemizeWide}

This prior is the standard conjugate prior for multivariate Normal problems. The Inverse Wishart prior is a conjugate prior for $d \times d$ covariance matrices. It has two hyperparameters:
\begin{itemize}
\item $\nu$, a degrees of freedom parameter, which is a real-valued scalar such that $\nu > d-1$; as $\nu$ increases, the prior becomes more concentrated;
\item $\Psi$, a positive definite $d \times d$ ``scale'' matrix.
\end{itemize}
The expected value of this prior is
\[
\frac{1}{\nu - d - 1} \Psi
\]
provided $\nu > d + 1$; the variance is finite if $\nu > d + 3$, and decreases as $\nu$ increases.
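For instance, with the hyperparameter values used in the code below, that is, $d=2$, $\nu = 4$ and $\Psi = \nu I_2 = 4 I_2$, the prior expectation is
\[
E[\Sigma] = \frac{1}{\nu - d - 1} \Psi = \frac{1}{4 - 2 - 1} \, 4 I_2 = 4 I_2,
\]
while the prior variance is not finite, since $\nu = 4 \leq d + 3 = 5$: a deliberately diffuse specification.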
\medskip

\textbf{MCMC:} We now formulate a Gibbs sampler strategy:
\begin{enumerate}[1.]
\item For the \textbf{first stage} parameters conditional on the second stage parameters, we have a standard linear model, and the full conditional posteriors exhibit a conditional independence structure across $i=1,\ldots,m$. However, the analysis is not quite the same as in the non-hierarchical case, and so we again use a Gibbs sampler structure. If $\bX_i$ is the $n \times 2$ design matrix for the $i$th simple linear regression and $\by_i$ is the corresponding vector of response data, we have
\begin{itemize}
\item $\beta_i | \sigma_i^2,\mu, \Sigma$ : the full conditional posterior is proportional to
\[
\exp\left\{-\frac{1}{2 \sigma_i^2} (\by_i - \bX_i \beta_i)^\top (\by_i-\bX_i \beta_i) \right\} \exp\left\{-\frac{1}{2} (\beta_i - \mu)^\top \Sigma^{-1} (\beta_i - \mu) \right\}
\]
and therefore
\[
\pi_n(\beta_{i}|\sigma_i^2,\mu, \Sigma) \equiv Normal_2(\mathbf{m}_{ni}, \bM_{ni})
\]
where
\[
\mathbf{m}_{ni} = (\sigma_i^{-2} \bX_i^\top \bX_i + \Sigma^{-1})^{-1} (\sigma_i^{-2} \bX_i^\top \by_i + \Sigma^{-1} \mu) \qquad \qquad \bM_{ni} = (\sigma_i^{-2} \bX_i^\top \bX_i + \Sigma^{-1})^{-1}.
\]
\item $\sigma_i^2 | \beta_i,\mu, \Sigma$ : the full conditional posterior is proportional to
\[
\left(\frac{1}{\sigma_i^2} \right)^{n/2} \exp\left\{-\frac{1}{2 \sigma_i^2} (\by_i - \bX_i \beta_i)^\top (\by_i-\bX_i \beta_i) \right\} \left(\frac{1}{\sigma_i^2} \right)^{a/2+1} \exp\left\{-\frac{b}{2 \sigma_i^2} \right\}
\]
and then by inspection we see that
\[
\pi_n(\sigma_i^2|\beta_i, \mu, \Sigma) \equiv InverseGamma(a_{n}/2,b_{ni}/2)
\]
where
\[
a_n = n + a \qquad \qquad b_{ni} = (\by_i - \bX_i \beta_i)^\top (\by_i-\bX_i \beta_i) + b.
\]
\end{itemize}
These two full conditional distributions can be sampled directly using \texttt{rmvn} from the \texttt{mvnfast} library, and the \texttt{rgamma} function (after reciprocation), respectively; a sketch of this update appears after this list.
\item For the \textbf{second stage} parameters conditional on the first stage parameters: if the $\beta_i = (\beta_{i0},\beta_{i1})$ are regarded as known quantities, then independently for $i=1,\ldots,m$,
\[
\beta_i | \mu, \Sigma \sim Normal_2(\mu, \Sigma).
\]
We treat the $\beta_i$ vectors as pseudo-data, and perform inference for the second stage ``population'' parameters. Here, the $NIW(\eta,\lambda,\nu,\Psi)$ prior is conjugate to the corresponding second-stage ``likelihood'', and it follows that the joint posterior distribution for $(\mu,\Sigma)$ is $NIW(\eta_n,\lambda_n,\nu_n,\Psi_n)$, where
\[
\eta_n = \frac{m \overline{\beta} + \lambda \eta}{m+\lambda} \qquad \lambda_n = m + \lambda \qquad \nu_n = m + \nu
\]
and
\[
\Psi_n = \sum_{i=1}^m (\beta_i - \overline{\beta}) (\beta_i - \overline{\beta})^\top + \frac{m\lambda}{m+\lambda}(\overline{\beta}- \eta) (\overline{\beta} - \eta)^\top + \Psi \qquad \qquad \overline{\beta} = \frac{1}{m} \sum_{i=1}^m \beta_i,
\]
with $\eta,\lambda,\nu$ and $\Psi$ the hyperparameters. Then we have the results
\[
\pi_n(\mu|\Sigma,-) \equiv Normal_2(\eta_n,\Sigma/\lambda_n) \qquad \qquad \pi_n(\Sigma|-) \equiv InverseWishart(\nu_n, \Psi_n).
\]
These two full conditional distributions can be sampled directly using \texttt{rmvn} from the \texttt{mvnfast} library, and the \texttt{riwish} function from the \texttt{MCMCpack} library; a sketch of this update also appears after this list.
\end{enumerate}
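The first-stage update for a single individual can be encapsulated as follows. This is a minimal sketch assuming the \texttt{mvnfast} library; the function and argument names (\texttt{draw.stage1}, \texttt{y.i}, \texttt{sigsq.i}) are illustrative only, and the sampler implemented below performs the same algebra inline.

<<stage1sketch,eval=FALSE>>=
library(mvnfast)
#Illustrative sketch: draw beta_i | sigsq_i, mu, Sigma, then sigsq_i | beta_i
draw.stage1 <- function(X, y.i, sigsq.i, mu, Sigma, a, b){
    n <- length(y.i)
    Siginv <- solve(Sigma)
    #beta_i full conditional: Normal_2(m.n, M.n)
    M.n <- solve(crossprod(X)/sigsq.i + Siginv)
    m.n <- drop(M.n %*% (crossprod(X, y.i)/sigsq.i + Siginv %*% mu))
    beta.i <- c(rmvn(1, m.n, M.n))
    #sigsq_i full conditional: InverseGamma((n+a)/2, (ssq+b)/2)
    ssq <- sum((y.i - X %*% beta.i)^2)
    sigsq.i <- 1/rgamma(1, (n + a)/2, (ssq + b)/2)
    return(list(beta = beta.i, sigsq = sigsq.i))
}
@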
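The second-stage update admits a similar sketch, assuming \texttt{riwish} from the \texttt{MCMCpack} library; the function name \texttt{niw.draw} is again illustrative only. Note that $\Sigma$ is drawn first from its marginal full conditional (with $\mu$ integrated out), and $\mu$ is then drawn conditional on $\Sigma$.

<<niwsketch,eval=FALSE>>=
library(mvnfast)
library(MCMCpack)
#Illustrative sketch: draw Sigma | beta, then mu | Sigma, beta, from the NIW posterior
niw.draw <- function(beta, eta, lambda, nu, Psi){
    m <- nrow(beta)
    beta.bar <- colMeans(beta)
    S <- var(beta)*(m-1)   #sum of squares about the mean of the beta_i
    Psi.n <- S + ((m*lambda)/(m+lambda))*tcrossprod(beta.bar - eta) + Psi
    Sigma <- riwish(nu + m, Psi.n)
    eta.n <- (m*beta.bar + lambda*eta)/(m + lambda)
    mu <- c(rmvn(1, eta.n, Sigma/(lambda + m)))
    return(list(mu = mu, Sigma = Sigma))
}
@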
We examine the performance of the algorithm for the following simulated data.

<<>>=
set.seed(2300)
#Simulation
library(mvnfast)
m<-10
mu<-c(1.5,2.0)
Sigma<-2*diag(c(1,1.5)) %*% matrix(c(1,0.75,0.75,1),2,2) %*% diag(c(1,1.5))
be<-rmvn(m,mu,Sigma)
ysd<-sqrt(1/rgamma(m,2,2))
xv<-seq(0,10,by=0.01)
x<-seq(1,10.0,by=0.5)
n<-length(x)
par(mar=c(4,4,1,0))
plot(xv,xv,type='n',ylim=range(-30,60),xlab='x',ylab='y')
y<-matrix(0,nrow=m,ncol=n)
for(i in 1:m){
    muv<-be[i,1]+be[i,2]*x
    lines(xv,be[i,1]+be[i,2]*xv)
    y[i,]<-muv+rnorm(n)*ysd[i]
    points(x,y[i,],cex=0.4,pch=i)
}
@

The Gibbs sampler proceeds as follows.

<<>>=
library(MCMCpack)
X<-cbind(1,x)
XTX<-crossprod(X)
tXy<-(t(X) %*% t(y))   #2 x m matrix whose ith column is X^T y_i
a<-b<-1
eta<-c(0,0)
lambda<-0.1
nu<-4
Psi<-nu*matrix(c(1,0,0,1),2,2)
#Starting values: least squares fits for each individual
old.beta<-matrix(0,nrow=m,ncol=2)
old.sigsq<-rep(0,m)
for(i in 1:m){
    fit0<-lm(y[i,]~x)
    old.beta[i,]<-coef(fit0)
    old.sigsq[i]<-summary(fit0)$sigma^2
}
old.mu<-apply(old.beta,2,mean)
old.Sigma<-var(old.beta)
nsamp<-2000; nburn<-50; nthin<-1
nits<-nburn+nsamp*nthin
ico<-0
beta.samp<-array(0,c(nsamp,m,2))
sigsq.samp<-matrix(0,nrow=nsamp,ncol=m)
mu.samp<-matrix(0,nrow=nsamp,ncol=2)
Sigma.samp<-array(0,c(nsamp,2,2))
for(iter in 1:nits){
    #Update the betas
    old.Siginv<-solve(old.Sigma)
    ssq<-rep(0,m)
    for(i in 1:m){
        M.ninv<-((XTX/old.sigsq[i])+old.Siginv)
        M.n<-solve(M.ninv)
        m.n<-M.n %*% (old.Siginv %*% old.mu + tXy[,i]/old.sigsq[i])
        old.beta[i,]<-rmvn(1,m.n,M.n)
        ssq[i]<-sum((y[i,]-X%*%old.beta[i,])^2)
    }
    #Update the sigmas
    a.n<-a+n
    b.n<-ssq+b
    old.sigsq<-1/rgamma(m,a.n/2,b.n/2)
    #Update Sigma
    nu.n<-nu+m
    beta.bar<-apply(old.beta,2,mean)
    Psi.n<-Psi+((m*lambda)/(m+lambda))*((beta.bar-eta) %*% t(beta.bar-eta))+var(old.beta)*(m-1)
    old.Sigma<-riwish(nu.n,Psi.n)
    #Update mu given Sigma
    lambda.n<-lambda+m
    eta.n<-(m*beta.bar+lambda*eta)/(m+lambda)
    old.mu<-matrix(rmvn(1,eta.n,old.Sigma/lambda.n),ncol=1)
    #Store the sampled values after burn-in, with thinning
    if(iter > nburn & iter %% nthin == 0){
        ico<-ico+1
        beta.samp[ico,,]<-old.beta
        sigsq.samp[ico,]<-old.sigsq
        mu.samp[ico,]<-old.mu
        Sigma.samp[ico,,]<-old.Sigma
    }
}
@
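As a quick numerical check before inspecting the graphical summaries, we compare the posterior means of $\mu$ and $\Sigma$ with the values used to simulate the data.

<<>>=
#Posterior means of the population parameters: compare with the
#true values mu=c(1.5,2.0) and Sigma used in the simulation
round(apply(mu.samp,2,mean),3)
round(apply(Sigma.samp,c(2,3),mean),3)
@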
<<>>=
par(mar=c(4,4,2,1),mfrow=c(1,1))
plot(mu.samp[,1],type='l',ylim=range(-3,5),ylab='',col='green')
lines(mu.samp[,2],col='blue')
title(expression(paste('Trace plots of ',mu,' parameters')),line=1)
legend(1200,-1,c(expression(mu[1]),expression(mu[2])),col=c('green','blue'),lty=1)
par(mar=c(4,4,2,1),mfrow=c(1,2))
hist(mu.samp[,1],col='green',breaks=seq(-3,5,by=0.25),ylim=range(0,800),
    xlab=' ',main=expression(mu[1]));box()
abline(v=mu[1],col='red')   #True value of mu1
hist(mu.samp[,2],col='blue',breaks=seq(-3,5,by=0.25),ylim=range(0,800),
    xlab=' ',main=expression(mu[2]));box()
abline(v=mu[2],col='red')   #True value of mu2
@

<<>>=
par(mar=c(4,4,2,1),mfrow=c(2,2))
plot(Sigma.samp[,1,1],type='l',ylim=range(-3,15),ylab='',col=rgb(0,255/256,0))
title(expression(Sigma[11]))
plot(Sigma.samp[,1,2],type='l',ylim=range(-3,15),ylab='',col=rgb(0,128/256,0))
title(expression(Sigma[12]))
plot.new()
plot(Sigma.samp[,2,2],type='l',ylim=range(-3,15),ylab='',col=rgb(0,64/256,0))
title(expression(Sigma[22]))
par(mar=c(4,4,2,1),mfrow=c(2,2))
ivec<-Sigma.samp[,1,1] > -5 & Sigma.samp[,1,1] < 15
hist(Sigma.samp[ivec,1,1],col=rgb(0,255/256,0),breaks=seq(-5,15,by=0.5),ylim=range(0,800),
    xlab=' ',main=expression(Sigma[11]));box()
abline(v=Sigma[1,1],col='red')   #True value of Sigma11
ivec<-Sigma.samp[,1,2] > -5 & Sigma.samp[,1,2] < 15
hist(Sigma.samp[ivec,1,2],col=rgb(0,128/256,0),breaks=seq(-5,15,by=0.5),ylim=range(0,800),
    xlab=' ',main=expression(Sigma[12]));box()
abline(v=Sigma[1,2],col='red')   #True value of Sigma12
plot.new()
ivec<-Sigma.samp[,2,2] > -5 & Sigma.samp[,2,2] < 15
hist(Sigma.samp[ivec,2,2],col=rgb(0,64/256,0),breaks=seq(-5,15,by=0.5),ylim=range(0,800),
    xlab=' ',main=expression(Sigma[22]));box()
abline(v=Sigma[2,2],col='red')   #True value of Sigma22
@

<<>>=
par(mar=c(4,4,2,0))
boxplot(beta.samp[,,1],pch=19,cex=0.25)
points(1:m,be[,1],pch=19,col='red')   #True values in red
title(expression(paste('Boxplots of ',beta[i0],' parameters')))
boxplot(beta.samp[,,2],pch=19,cex=0.25)
points(1:m,be[,2],pch=19,col='red')   #True values in red
title(expression(paste('Boxplots of ',beta[i1],' parameters')))
@

\end{document}