\documentclass[notitlepage]{article}
\usepackage{../../Math598}
\usepackage{listings}
\usepackage{numprint}
\usepackage{enumerate}
\usepackage{multirow}
\usepackage{bbm}
\usepackage{bbold}
\usepackage{amsfonts,amsmath,amssymb}
\setlength{\parindent}{0pt}
\lstloadlanguages{R}
\definecolor{keywordcolor}{rgb}{0,0.6,0.6}
\definecolor{delimcolor}{rgb}{0.461,0.039,0.102}
\definecolor{Rcommentcolor}{rgb}{0.101,0.043,0.432}
\lstdefinestyle{Rsettings}{
    basicstyle=\ttfamily,
    breaklines=true,
    showstringspaces=false,
    keywords={if, else, function, theFunction, tmp}, % R keywords to highlight
    otherkeywords={},
    commentstyle=\itshape\color{Rcommentcolor},
    keywordstyle=\color{keywordcolor},
    moredelim=[s][\color{delimcolor}]{"}{"},
}
\lstset{basicstyle=\ttfamily, numbers=none, literate={~} {$\sim$}{2}}
\begin{document}

<<>>=
library(knitr)
# global chunk options
opts_chunk$set(cache=TRUE, autodep=TRUE)
options(scipen=999)
options(repos=c(CRAN="https://cloud.r-project.org/"))
inline_hook <- function (x) {
  if (is.numeric(x)) {
    # ifelse does a vectorized comparison
    # If integer, print without decimal; otherwise print six decimal places
    res <- ifelse(x == round(x),
                  sprintf("%6d", round(x)),
                  sprintf("%.6f", x)
                  )
    paste(res, collapse = ", ")
  } else {
    x  # return non-numeric inline values unchanged
  }
}
knit_hooks$set(inline = inline_hook)
@

\begin{center}
\textsclarge{MATH 598: TOPICS IN STATISTICS}

\vspace{0.1 in}

\textsc{MCMC for hierarchical regression models}
\end{center}

\textbf{Simple linear regression:} We consider linear regression data from $m$ individuals, where the simple linear regression parameters $(\beta_0,\beta_1)$ are allowed to differ across individuals, thereby necessitating a hierarchical model: if $i=1,\ldots,m$ indexes individuals in the study, and $j=1,\ldots,n$ indexes the observations made on each individual, then
\begin{itemizeWide}
\item \textrm{STAGE 1:} \textbf{Observed data:}
\[
Y_{ij} \sim {Normal}(\beta_{i0}+\beta_{i1} x_{ij},\sigma_i^2) \qquad i=1,\ldots,m, \ j=1,\ldots,n ~\text{independently}
\]
\item \textrm{STAGE 2:} \textbf{Population model:} for $i=1,\ldots,m$ independently, assume that
\[
(\beta_{i0},\beta_{i1}) \sim {Normal}_2(\mu, \Sigma) \qquad \qquad \sigma_i^2 \sim InverseGamma(a/2,b/2)
\]
where $\mu \in \R^2$, $\Sigma$ is a $2 \times 2$ positive definite matrix, and $a,b > 0$ are fixed scalars. Notice that this is not an identical specification to that used in the non-hierarchical version of the model.
\item \textrm{STAGE 3:} \textbf{Prior model:} the prior model in this case is the \textbf{Normal-Inverse Wishart} model
\[
\Sigma \sim InverseWishart(\nu,\Psi) \qquad \qquad \mu|\Sigma \sim Normal_2(\eta,\Sigma/\lambda)
\]
where $\eta \in \R^2$, $\nu > 1$ and $\lambda > 0$ are scalars, and $\Psi$ is a $2 \times 2$ positive definite matrix.
\end{itemizeWide}

This prior is the standard conjugate prior for multivariate Normal problems. The Inverse Wishart prior is a conjugate prior for $d \times d$ covariance matrices. It has two hyperparameters:
\begin{itemize}
\item $\nu$, a degrees of freedom parameter, which is a real-valued scalar such that $\nu > d-1$; as $\nu$ increases, the prior becomes more concentrated;
\item $\Psi$, a positive definite $d \times d$ ``scale'' matrix.
\end{itemize}
The expected value of this prior is
\[
\frac{1}{\nu - d - 1} \Psi
\]
provided $\nu > d + 1$; the variance is finite if $\nu > d + 3$, and decreases as $\nu$ increases.
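For instance, with the hyperparameter values used in the code below, that is, $d=2$, $\nu = 4$ and $\Psi = \nu I_2 = 4 I_2$, the prior expectation is
\[
E[\Sigma] = \frac{1}{\nu - d - 1} \Psi = \frac{1}{4 - 2 - 1} \, 4 I_2 = 4 I_2,
\]
while the prior variance is not finite, since $\nu = 4 \leq d + 3 = 5$: a deliberately diffuse specification.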
\medskip

\textbf{MCMC:} We now formulate a Gibbs sampler strategy:
\begin{enumerate}[1.]
\item For the \textbf{first stage} parameters conditional on the second stage parameters, we have a standard linear model, and the full conditional posteriors exhibit a conditional independence structure across $i=1,\ldots,m$. However, the analysis is not quite the same as in the non-hierarchical case, and so we again use a Gibbs sampler structure. If $\bX_i$ is the $n \times 2$ design matrix for the $i$th simple linear regression and $\by_i$ is the corresponding vector of response data, we have
\begin{itemize}
\item $\beta_i | \sigma_i^2,\mu, \Sigma$ : the full conditional posterior is proportional to
\[
\exp\left\{-\frac{1}{2 \sigma_i^2} (\by_i - \bX_i \beta_i)^\top (\by_i-\bX_i \beta_i) \right\} \exp\left\{-\frac{1}{2} (\beta_i - \mu)^\top \Sigma^{-1} (\beta_i - \mu) \right\}
\]
and therefore
\[
\pi_n(\beta_{i}|\sigma_i^2,\mu, \Sigma) \equiv Normal_2(\mathbf{m}_{ni}, \bM_{ni})
\]
where
\[
\mathbf{m}_{ni} = (\sigma_i^{-2} \bX_i^\top \bX_i + \Sigma^{-1})^{-1} (\sigma_i^{-2} \bX_i^\top \by_i + \Sigma^{-1} \mu) \qquad \qquad \bM_{ni} = (\sigma_i^{-2} \bX_i^\top \bX_i + \Sigma^{-1})^{-1}.
\]
\item $\sigma_i^2 | \beta_i,\mu, \Sigma$ : the full conditional posterior is proportional to
\[
\left(\frac{1}{\sigma_i^2} \right)^{n/2} \exp\left\{-\frac{1}{2 \sigma_i^2} (\by_i - \bX_i \beta_i)^\top (\by_i-\bX_i \beta_i) \right\} \left(\frac{1}{\sigma_i^2} \right)^{a/2+1} \exp\left\{-\frac{b}{2 \sigma_i^2} \right\}
\]
and then by inspection we see that
\[
\pi_n(\sigma_i^2|\beta_i, \mu, \Sigma) \equiv InverseGamma(a_{n}/2,b_{ni}/2)
\]
where
\[
a_n = n + a \qquad \qquad b_{ni} = (\by_i - \bX_i \beta_i)^\top (\by_i-\bX_i \beta_i) + b.
\]
\end{itemize}
These two full conditional distributions can be sampled directly using \texttt{rmvn} from the \texttt{mvnfast} library, and the \texttt{rgamma} function (after reciprocation), respectively; a sketch of this update appears after this list.
\item For the \textbf{second stage} parameters conditional on the first stage parameters: if the $\beta_i = (\beta_{i0},\beta_{i1})$ are regarded as known quantities, then independently for $i=1,\ldots,m$,
\[
\beta_i | \mu, \Sigma \sim Normal_2(\mu, \Sigma).
\]
We treat the $\beta_i$ vectors as pseudo-data, and perform inference for the second stage ``population'' parameters. Here, the $NIW(\eta,\lambda,\nu,\Psi)$ prior is conjugate to the corresponding second-stage ``likelihood'', and it follows that the joint posterior distribution for $(\mu,\Sigma)$ is $NIW(\eta_n,\lambda_n,\nu_n,\Psi_n)$, where
\[
\eta_n = \frac{m \overline{\beta} + \lambda \eta}{m+\lambda} \qquad \lambda_n = m + \lambda \qquad \nu_n = m + \nu
\]
and
\[
\Psi_n = \sum_{i=1}^m (\beta_i - \overline{\beta}) (\beta_i - \overline{\beta})^\top + \frac{m\lambda}{m+\lambda}(\overline{\beta}- \eta) (\overline{\beta} - \eta)^\top + \Psi \qquad \qquad \overline{\beta} = \frac{1}{m} \sum_{i=1}^m \beta_i,
\]
with $\eta,\lambda,\nu$ and $\Psi$ the hyperparameters. Then we have the results
\[
\pi_n(\mu|\Sigma,-) \equiv Normal_2(\eta_n,\Sigma/\lambda_n) \qquad \qquad \pi_n(\Sigma|-) \equiv InverseWishart(\nu_n, \Psi_n).
\]
These two full conditional distributions can be sampled directly using \texttt{rmvn} from the \texttt{mvnfast} library, and the \texttt{riwish} function from the \texttt{MCMCpack} library; a sketch of this update also appears after this list.
\end{enumerate}
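The first-stage update for a single individual can be encapsulated as follows. This is a minimal sketch assuming the \texttt{mvnfast} library; the function and argument names (\texttt{draw.stage1}, \texttt{y.i}, \texttt{sigsq.i}) are illustrative only, and the sampler implemented below performs the same algebra inline.

<<stage1sketch,eval=FALSE>>=
library(mvnfast)
#Illustrative sketch: draw beta_i | sigsq_i, mu, Sigma, then sigsq_i | beta_i
draw.stage1 <- function(X, y.i, sigsq.i, mu, Sigma, a, b){
    n <- length(y.i)
    Siginv <- solve(Sigma)
    #beta_i full conditional: Normal_2(m.n, M.n)
    M.n <- solve(crossprod(X)/sigsq.i + Siginv)
    m.n <- drop(M.n %*% (crossprod(X, y.i)/sigsq.i + Siginv %*% mu))
    beta.i <- c(rmvn(1, m.n, M.n))
    #sigsq_i full conditional: InverseGamma((n+a)/2, (ssq+b)/2)
    ssq <- sum((y.i - X %*% beta.i)^2)
    sigsq.i <- 1/rgamma(1, (n + a)/2, (ssq + b)/2)
    return(list(beta = beta.i, sigsq = sigsq.i))
}
@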
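The second-stage update admits a similar sketch, assuming \texttt{riwish} from the \texttt{MCMCpack} library; the function name \texttt{niw.draw} is again illustrative only. Note that $\Sigma$ is drawn first from its marginal full conditional (with $\mu$ integrated out), and $\mu$ is then drawn conditional on $\Sigma$.

<<niwsketch,eval=FALSE>>=
library(mvnfast)
library(MCMCpack)
#Illustrative sketch: draw Sigma | beta, then mu | Sigma, beta, from the NIW posterior
niw.draw <- function(beta, eta, lambda, nu, Psi){
    m <- nrow(beta)
    beta.bar <- colMeans(beta)
    S <- var(beta)*(m-1)   #sum of squares about the mean of the beta_i
    Psi.n <- S + ((m*lambda)/(m+lambda))*tcrossprod(beta.bar - eta) + Psi
    Sigma <- riwish(nu + m, Psi.n)
    eta.n <- (m*beta.bar + lambda*eta)/(m + lambda)
    mu <- c(rmvn(1, eta.n, Sigma/(lambda + m)))
    return(list(mu = mu, Sigma = Sigma))
}
@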
We examine the performance of the algorithm for the following simulated data.

<<>>=
set.seed(2300)
#Simulation
library(mvnfast)
m<-10
mu<-c(1.5,2.0)
Sigma<-2*diag(c(1,1.5)) %*% matrix(c(1,0.75,0.75,1),2,2) %*% diag(c(1,1.5))
be<-rmvn(m,mu,Sigma)
ysd<-sqrt(1/rgamma(m,2,2))
xv<-seq(0,10,by=0.01)
x<-seq(1,10.0,by=0.5)
n<-length(x)
par(mar=c(4,4,1,0))
plot(xv,xv,type='n',ylim=range(-30,60),xlab='x',ylab='y')
y<-matrix(0,nrow=m,ncol=n)
for(i in 1:m){
    muv<-be[i,1]+be[i,2]*x
    lines(xv,be[i,1]+be[i,2]*xv)
    y[i,]<-muv+rnorm(n)*ysd[i]
    points(x,y[i,],cex=0.4,pch=i)
}
@

The Gibbs sampler proceeds as follows.

<<>>=
library(MCMCpack)
X<-cbind(1,x)
XTX<-crossprod(X)
tXy<-(t(X) %*% t(y))   #2 x m matrix whose ith column is X^T y_i
a<-b<-1
eta<-c(0,0)
lambda<-0.1
nu<-4
Psi<-nu*matrix(c(1,0,0,1),2,2)
#Starting values: least squares fits for each individual
old.beta<-matrix(0,nrow=m,ncol=2)
old.sigsq<-rep(0,m)
for(i in 1:m){
    fit0<-lm(y[i,]~x)
    old.beta[i,]<-coef(fit0)
    old.sigsq[i]<-summary(fit0)$sigma^2
}
old.mu<-apply(old.beta,2,mean)
old.Sigma<-var(old.beta)
nsamp<-2000; nburn<-50; nthin<-1
nits<-nburn+nsamp*nthin
ico<-0
beta.samp<-array(0,c(nsamp,m,2))
sigsq.samp<-matrix(0,nrow=nsamp,ncol=m)
mu.samp<-matrix(0,nrow=nsamp,ncol=2)
Sigma.samp<-array(0,c(nsamp,2,2))
for(iter in 1:nits){
    #Update the betas
    old.Siginv<-solve(old.Sigma)
    ssq<-rep(0,m)
    for(i in 1:m){
        M.ninv<-((XTX/old.sigsq[i])+old.Siginv)
        M.n<-solve(M.ninv)
        m.n<-M.n %*% (old.Siginv %*% old.mu + tXy[,i]/old.sigsq[i])
        old.beta[i,]<-rmvn(1,m.n,M.n)
        ssq[i]<-sum((y[i,]-X%*%old.beta[i,])^2)
    }
    #Update the sigmas
    a.n<-a+n
    b.n<-ssq+b
    old.sigsq<-1/rgamma(m,a.n/2,b.n/2)
    #Update Sigma
    nu.n<-nu+m
    beta.bar<-apply(old.beta,2,mean)
    Psi.n<-Psi+((m*lambda)/(m+lambda))*((beta.bar-eta) %*% t(beta.bar-eta))+var(old.beta)*(m-1)
    old.Sigma<-riwish(nu.n,Psi.n)
    #Update mu given Sigma
    lambda.n<-lambda+m
    eta.n<-(m*beta.bar+lambda*eta)/(m+lambda)
    old.mu<-matrix(rmvn(1,eta.n,old.Sigma/lambda.n),ncol=1)
    #Store the sampled values after burn-in, with thinning
    if(iter > nburn & iter %% nthin == 0){
        ico<-ico+1
        beta.samp[ico,,]<-old.beta
        sigsq.samp[ico,]<-old.sigsq
        mu.samp[ico,]<-old.mu
        Sigma.samp[ico,,]<-old.Sigma
    }
}
@
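As a quick numerical check before inspecting the graphical summaries, we compare the posterior means of $\mu$ and $\Sigma$ with the values used to simulate the data.

<<>>=
#Posterior means of the population parameters: compare with the
#true values mu=c(1.5,2.0) and Sigma used in the simulation
round(apply(mu.samp,2,mean),3)
round(apply(Sigma.samp,c(2,3),mean),3)
@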
<<>>=
par(mar=c(4,4,2,1),mfrow=c(1,1))
plot(mu.samp[,1],type='l',ylim=range(-3,5),ylab='',col='green')
lines(mu.samp[,2],col='blue')
title(expression(paste('Trace plots of ',mu,' parameters')),line=1)
legend(1200,-1,c(expression(mu[1]),expression(mu[2])),col=c('green','blue'),lty=1)
par(mar=c(4,4,2,1),mfrow=c(1,2))
hist(mu.samp[,1],col='green',breaks=seq(-3,5,by=0.25),ylim=range(0,800),
    xlab=' ',main=expression(mu[1]));box()
abline(v=mu[1],col='red')   #True value of mu1
hist(mu.samp[,2],col='blue',breaks=seq(-3,5,by=0.25),ylim=range(0,800),
    xlab=' ',main=expression(mu[2]));box()
abline(v=mu[2],col='red')   #True value of mu2
@

<<>>=
par(mar=c(4,4,2,1),mfrow=c(2,2))
plot(Sigma.samp[,1,1],type='l',ylim=range(-3,15),ylab='',col=rgb(0,255/256,0))
title(expression(Sigma[11]))
plot(Sigma.samp[,1,2],type='l',ylim=range(-3,15),ylab='',col=rgb(0,128/256,0))
title(expression(Sigma[12]))
plot.new()
plot(Sigma.samp[,2,2],type='l',ylim=range(-3,15),ylab='',col=rgb(0,64/256,0))
title(expression(Sigma[22]))
par(mar=c(4,4,2,1),mfrow=c(2,2))
ivec<-Sigma.samp[,1,1] > -5 & Sigma.samp[,1,1] < 15
hist(Sigma.samp[ivec,1,1],col=rgb(0,255/256,0),breaks=seq(-5,15,by=0.5),ylim=range(0,800),
    xlab=' ',main=expression(Sigma[11]));box()
abline(v=Sigma[1,1],col='red')   #True value of Sigma11
ivec<-Sigma.samp[,1,2] > -5 & Sigma.samp[,1,2] < 15
hist(Sigma.samp[ivec,1,2],col=rgb(0,128/256,0),breaks=seq(-5,15,by=0.5),ylim=range(0,800),
    xlab=' ',main=expression(Sigma[12]));box()
abline(v=Sigma[1,2],col='red')   #True value of Sigma12
plot.new()
ivec<-Sigma.samp[,2,2] > -5 & Sigma.samp[,2,2] < 15
hist(Sigma.samp[ivec,2,2],col=rgb(0,64/256,0),breaks=seq(-5,15,by=0.5),ylim=range(0,800),
    xlab=' ',main=expression(Sigma[22]));box()
abline(v=Sigma[2,2],col='red')   #True value of Sigma22
@

<<>>=
par(mar=c(4,4,2,0))
boxplot(beta.samp[,,1],pch=19,cex=0.25)
points(1:m,be[,1],pch=19,col='red')   #True values in red
title(expression(paste('Boxplots of ',beta[i0],' parameters')))
boxplot(beta.samp[,,2],pch=19,cex=0.25)
points(1:m,be[,2],pch=19,col='red')   #True values in red
title(expression(paste('Boxplots of ',beta[i1],' parameters')))
@

\end{document}