\documentclass[notitlepage,11pt]{article}
\usepackage{../../Math598}
\usepackage{listings}
\usepackage{numprint}
\usepackage{enumitem}
\usepackage{multirow}
\usepackage{bbm}
\usepackage{amsfonts,amsmath,amssymb}
\setlength{\parindent}{0pt}
\def\E{\Expect}
\lstloadlanguages{R}
\definecolor{keywordcolor}{rgb}{0,0.6,0.6}
\definecolor{delimcolor}{rgb}{0.461,0.039,0.102}
\definecolor{Rcommentcolor}{rgb}{0.101,0.043,0.432}
\lstdefinestyle{Rsettings}{
  basicstyle=\ttfamily,
  breaklines=true,
  showstringspaces=false,
  keywords={if, else, function, theFunction, tmp}, % Write as many keywords
  otherkeywords={},
  commentstyle=\itshape\color{Rcommentcolor},
  keywordstyle=\color{keywordcolor},
  moredelim=[s][\color{delimcolor}]{"}{"},
}
\lstset{basicstyle=\ttfamily, numbers=none, literate={~} {$\sim$}{2}}
\begin{document}

<<>>=
library(knitr)
# global chunk options
opts_chunk$set(cache=TRUE, autodep=TRUE)
options(scipen=999)
options(repos=c(CRAN="https://cloud.r-project.org/"))
inline_hook <- function(x) {
  if (is.numeric(x)) {
    # print all inline numeric values to three decimal places
    res <- sprintf("%.3f", x)
    paste(res, collapse = ", ")
  }
}
knit_hooks$set(inline = inline_hook)
@

\begin{center}
\textsclarge{MATH 559: Bayesian Theory and Methods}

\vspace{0.1 in}

\textsc{Bayesian modelling with the Student-t model}
\end{center}

A model is to be constructed under an assumption of exchangeability with the following components:
\begin{itemize}
\item Finite realization $y_1,\ldots,y_n$ recorded;
\smallskip
\item $\mathcal{Y} \equiv \R$;
\smallskip
\item $\theta = (\mu,\sigma)$;
\smallskip
\item $f_{Y}(y;\theta) \equiv Student(\nu,\mu,\sigma)$
\[
f_{Y}(y;\theta) = \dfrac{\Gamma \left( {\dfrac{\nu +1}{2}}\right) }{\Gamma \left( {\dfrac{\nu }{2}}\right) \sigma \sqrt{\pi \nu }%
\left\{ 1+\dfrac{1}{\nu}\left(\dfrac{y-\mu}{\sigma }\right)^2 \right\} ^{(\nu +1)/2}}
\]
where $\nu > 0$ is a \textbf{fixed constant}.
\smallskip
\item $\pi_0(\mu,\sigma)$ a prior density on $\R \times \R^+$.
\end{itemize}

Consider a data-generating scenario where the true values of the parameters are $\mu_0 = 2, \sigma_0 = 1$, and consider a sample of size $n=20$ with $\nu = 3$. Note that if $Y \sim Student(\nu,\mu,\sigma)$, then we have that
\[
Z = \frac{Y - \mu}{\sigma} \sim Student(\nu,0,1) \equiv Student(\nu).
\]
Thus $Z$ has the `standard' Student-t distribution with $\nu$ degrees of freedom, and in \texttt{R} the functions \texttt{rt} and \texttt{dt} simulate from, and evaluate the density of, this standard version.

<<>>=
#Data generation
set.seed(234)
n<-20
mu0<-2;sigma0<-1
nu<-3
y<-rt(n,nu)*sigma0+mu0
round(y,4)
@

A conjugate analysis is not possible in this model, so we are forced to use numerical methods. We will retain the Normal-Inverse Gamma prior that would be conjugate for a Normal likelihood, with
\[
\pi_0(\mu,\sigma^2) = \pi_0(\mu|\sigma^2) \pi_0(\sigma^2)
\]
with factors
\[
\pi_0(\mu|\sigma^2) \equiv Normal(\eta,\sigma^2/\lambda)
\]
and
\[
\pi_0(\sigma^2) \equiv InvGamma(\alpha_0/2,\beta_0/2)
\]
where we take $\eta = 2, \lambda = 0.1, \alpha_0=2$ and $\beta_0=4$ in the illustrative analysis.
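As a quick supplementary check of how diffuse this prior actually is, we can draw from it by composition, first $\sigma^2$ from its Inverse Gamma marginal and then $\mu$ from its conditional Normal; the objects \texttt{sig2.prior} and \texttt{mu.prior} below are scratch quantities for this illustration only, with the hyperparameter values written in as literals:

<<>>=
#Supplementary prior check: draw (mu, sigma^2) from the
#Normal-Inverse Gamma prior by composition
set.seed(111)
sig2.prior<-1/rgamma(10000,1,2)                #sigma^2 ~ InvGamma(alpha0/2=1, beta0/2=2)
mu.prior<-rnorm(10000,2,sqrt(sig2.prior/0.1))  #mu | sigma^2 ~ Normal(eta=2, sigma^2/lambda)
quantile(sig2.prior,c(0.025,0.975))            #central 95% prior interval for sigma^2
quantile(mu.prior,c(0.025,0.975))              #central 95% prior interval for mu
@

The wide intervals confirm that this prior is only weakly informative about both parameters.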
\bigskip

\textbf{Approach 1:} \textbf{Direct approach using a Gibbs sampler}

Here the two parameters are updated in turn from their full conditionals
\[
\pi_n(\mu|\sigma^2) \qquad \pi_n(\sigma^2|\mu)
\]
using Metropolis-Hastings updates for each, as there is no standard form for either density. Note that if
\[
\mathcal{L}_n(\mu,\sigma^2) = \prod_{i=1}^n f_Y(y_i;\mu,\sigma^2)
\]
represents the likelihood, we have that
\begin{align*}
\pi_n(\mu|\sigma^2) & \propto \ \mathcal{L}_n(\mu,\sigma^2) \pi_0(\mu|\sigma^2)\\[6pt]
\pi_n(\sigma^2|\mu) & \propto \ \mathcal{L}_n(\mu,\sigma^2) \pi_0(\mu|\sigma^2) \pi_0(\sigma^2)
\end{align*}
where the factor $\pi_0(\mu|\sigma^2)$ is retained in the second expression because it depends on $\sigma^2$.

<<>>=
#Function definitions
dinvgamma<-function(x,a,b,log=FALSE){
  #Compute the log density directly for numerical stability
  ldx<-a*log(b)-lgamma(a)-(a+1)*log(x)-b/x
  if(log){
    return(ldx)
  }
  return(exp(ldx))
}
log.like<-function(yv,mv,sv,nuv){
  #Use dt on the standardized data, but remember to include
  #the Jacobian for the scale transform
  xv<-(yv-mv)/sv
  return(sum(dt(xv,nuv,log=TRUE)-log(sv)))
}
@

<<>>=
#MCMC collection settings
nsamp<-10000
nburn<-50
nthin<-10
nits<-nburn+nthin*nsamp
#Prior distributions
eta<-2; lambda<-0.1
al<-2; be<-4
#Starting set up
old.mu<-0
old.sigsq<-1
old.like<-log.like(y,old.mu,sqrt(old.sigsq),nu)
old.prior.mu<-dnorm(old.mu,eta,sqrt(old.sigsq/lambda),log=TRUE)
old.prior.sig<-dinvgamma(old.sigsq,al/2,be/2,log=TRUE)
old.post<-old.like+old.prior.mu+old.prior.sig
@

We will run the Gibbs sampler with Metropolis updates, using Normal and reflected Normal proposals for $\mu$ and $\sigma^2$ respectively, that is, using proposal densities
\begin{align*}
q(\mu,z) & \equiv Normal(\mu,\sigma_m^2)\\[6pt]
q(\sigma^2,z) & : Z = |\sigma^2+\delta|, \qquad \delta \sim Normal(0,\sigma_s^2).
\end{align*}
Both proposals are symmetric, so each acceptance probability involves only the ratio of the unnormalized posterior densities.

<<>>=
#Gibbs sampler - Metropolis-within-Gibbs
#Proposal parameters
psig.m<-0.2
psig.s<-0.2
#For the accept/reject steps
logu1<-log(runif(nits))
logu2<-log(runif(nits))
#For collecting the samples
ico<-0
par.samp<-matrix(0,nrow=nsamp,ncol=2)
for(iter in 1:nits){
  #Update mu from its full conditional
  new.mu<-rnorm(1,old.mu,psig.m)
  new.like<-log.like(y,new.mu,sqrt(old.sigsq),nu)
  new.prior.mu<-dnorm(new.mu,eta,sqrt(old.sigsq/lambda),log=TRUE)
  new.post<-new.like+new.prior.mu+old.prior.sig
  if(logu1[iter] < new.post-old.post){ #Accept update
    old.mu<-new.mu
    old.like<-new.like
    old.prior.mu<-new.prior.mu
    old.post<-new.post
  }
  #Update sigma sq from its full conditional
  new.sigsq<-abs(rnorm(1,old.sigsq,psig.s))
  new.like<-log.like(y,old.mu,sqrt(new.sigsq),nu)
  new.prior.mu<-dnorm(old.mu,eta,sqrt(new.sigsq/lambda),log=TRUE)
  new.prior.sig<-dinvgamma(new.sigsq,al/2,be/2,log=TRUE)
  new.post<-new.like+new.prior.mu+new.prior.sig
  if(logu2[iter] < new.post-old.post){ #Accept update
    old.sigsq<-new.sigsq
    old.like<-new.like
    old.prior.mu<-new.prior.mu
    old.prior.sig<-new.prior.sig
    old.post<-new.post
  }
  #Collect output
  if(iter > nburn & iter %% nthin == 0){
    ico<-ico+1
    par.samp[ico,]<-c(old.mu,old.sigsq)
  }
}
@

<<>>=
par(mar=c(4,4,2,0))
mulab<-expression(mu);siglab<-expression(sigma^2)
plot(par.samp,pch=19,cex=0.5,xlab=mulab,ylab=siglab,xlim=range(0,3.5),ylim=range(0,3.5))
#Summaries
mean(par.samp[,1])
mean(par.samp[,2])
var(par.samp)
@

<<>>=
#Posterior histograms
par(mar=c(4,4,4,0),mfrow=c(1,2))
hist(par.samp[,1],breaks=seq(0,3.5,by=0.05),ylim=range(0,2),
     freq=FALSE,main=mulab,xlab='');box()
inc<-par.samp[,2]<3.5   #restrict to the plotted range
hist(par.samp[inc,2],breaks=seq(0,3.5,by=0.05),ylim=range(0,2),
     freq=FALSE,main=siglab,xlab='');box()
@

<<>>=
#Trace plots
par(mar=c(4,4,1,0))
plot(par.samp[,1],type='l',col='red',xlab='Sample',ylab='',ylim=range(0,4))
lines(par.samp[,2],type='l',col='blue')
legend(0,4,c(mulab,siglab),lty=1,col=c('red','blue'))
@

<<>>=
#ACF plots
par(mar=c(4,4,3,0),mfrow=c(1,2))
acf(par.samp[,1],main=mulab)
acf(par.samp[,2],main=siglab)
#Effective sample size
library(coda)
effectiveSize(par.samp)
#Efficiency
effectiveSize(par.samp)/nsamp
@
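As a supplementary diagnostic, the \texttt{summary} method for \texttt{mcmc} objects in \texttt{coda} reports time-series standard errors, which inflate the naive Monte Carlo standard error to account for the autocorrelation visible in the ACF plots:

<<>>=
#Supplementary: autocorrelation-adjusted Monte Carlo standard errors
library(coda)
summary(mcmc(par.samp))
@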
\pagebreak

\textbf{Approach 2:} \textbf{Gibbs sampler with auxiliary variables}

For the Student-t model, we can use an auxiliary variable representation to remove the need for a Metropolis-Hastings update and rely on a Gibbs sampler in which all full conditionals are standard distributions. The representation arises as follows: if
\[
V \sim InvGamma(\nu/2,\nu/2) \qquad \qquad X | V=v \sim Normal(0,v)
\]
then a standard calculation verifies that $X \sim Student(\nu)$.

<<>>=
set.seed(38)
V<-1/rgamma(10000,nu/2,nu/2)   #Generate the inverse gamma V
X<-rnorm(10000,0,sqrt(V))      #Generate the conditional normal
inc<-(abs(X) < 5)
par(mar=c(4,4,2,0))
hist(X[inc],breaks=seq(-5,5,0.2),freq=FALSE,main='',xlab='X');box()
xv<-seq(-5,5,by=0.001)
yv<-dt(xv,nu)                  #Student(nu) density
lines(xv,yv,col='red')
legend(-5,0.3,c(expression(Student(nu))),col='red',lty=1)
@

For our inference problem we may therefore use a likelihood that includes the auxiliary variables, based on the \textbf{joint} density
\[
f_{Y,V}(y,v;\mu,\sigma^2) = f_{Y|V}(y|v;\mu,\sigma^2) f_V(v)
\]
for which the marginal density
\[
f_Y(y;\mu,\sigma^2) = \int_0^\infty f_{Y|V}(y|v;\mu,\sigma^2) f_V(v) \ dv
\]
is the correct target $Student(\nu,\mu,\sigma)$. Specifically, we now have the augmented likelihood that treats the auxiliary variables $\bv = (v_1,\ldots,v_n)$ as unknown quantities
\[
\mathcal{L}_n(\mu,\sigma,\bv) = \prod_{i=1}^n f_{Y|V}(y_i|v_i;\mu,\sigma^2) f_V(v_i)
\]
which takes the form
\begin{align*}
\mathcal{L}_n(\mu,\sigma,\bv) & = \prod_{i=1}^n \left(\dfrac{1}{2 \pi v_i \sigma^2} \right)^{1/2} \exp\left\{-\frac{1}{2 v_i \sigma^2} (y_i - \mu)^2 \right\} \dfrac{(\nu/2)^{\nu/2}}{\Gamma(\nu/2)} \left(\frac{1}{v_i} \right)^{\nu/2+1} \exp\left\{ - \frac{\nu}{2 v_i} \right\} \\[6pt]
& \propto \left(\dfrac{1}{\sigma^2} \right)^{n/2} \exp\left\{-\frac{1}{2\sigma^2} \sum_{i=1}^n \frac{1}{v_i} (y_i - \mu)^2 \right\} \left\{ \prod_{i=1}^n \left( \frac{1}{v_i} \right)^{(\nu + 1)/2+1} \right\} \exp\left\{ - \frac{\nu}{2} \sum_{i=1}^n \frac{1}{v_i} \right\}
\end{align*}
which combines with the prior
\begin{align*}
\pi_0(\mu|\sigma^2) & \propto \left(\frac{1}{\sigma^2}\right)^{1/2} \exp \left\{-\frac{\lambda}{2 \sigma^2} (\mu - \eta)^2 \right\} \\[6pt]
\pi_0(\sigma^2) & \propto \left(\frac{1}{\sigma^2} \right)^{\alpha_0/2 + 1} \exp\left\{-\frac{\beta_0}{2 \sigma^2} \right\}
\end{align*}
to yield the joint posterior $\pi_n(\mu,\sigma^2,\bv)$ up to proportionality. For the Gibbs sampler, we need the full conditional posteriors:
\begin{itemize}[itemsep=0.2in]
\item Full conditional for $\mu$:
\begin{align*}
\pi_n(\mu| \sigma^2, \bv) & \propto \exp\left\{-\frac{1}{2\sigma^2} \sum_{i=1}^n \frac{1}{v_i} (y_i - \mu)^2 \right\} \exp \left\{-\frac{\lambda}{2 \sigma^2} (\mu - \eta)^2 \right\} \\[6pt]
& \propto \exp\left\{-\frac{1}{2\sigma^2} \left[ \sum_{i=1}^n \frac{1}{v_i} (y_i - \mu)^2 + \lambda (\mu - \eta)^2 \right] \right\}\\[6pt]
& \propto \exp\left\{-\frac{\lambda_n(\bv) }{2\sigma^2} (\mu - \eta_n(\bv))^2 \right\}
\end{align*}
where
\[
\eta_n(\bv) = \frac{\sum\limits_{i=1}^n (y_i/v_i) + \lambda \eta}{\sum\limits_{i=1}^n (1/v_i) + \lambda} \qquad \qquad \lambda_n(\bv) = \lambda + \sum\limits_{i=1}^n (1/v_i).
\]
Therefore
\[
\pi_n(\mu| \sigma^2, \bv) \equiv Normal(\eta_n(\bv),\sigma^2/\lambda_n(\bv)).
\]
\item Full conditional for $\sigma^2$:
\[
\pi_n(\sigma^2|\mu,\bv) \propto \left(\dfrac{1}{\sigma^2} \right)^{(n+\alpha_0+1)/2+1} \exp\left\{-\frac{1}{2\sigma^2} \left[ \sum_{i=1}^n \frac{1}{v_i} (y_i - \mu)^2 + \lambda (\mu - \eta)^2 + \beta_0 \right] \right\}
\]
and therefore
\[
\pi_n(\sigma^2|\mu,\bv) \equiv InvGamma(\alpha_n/2,\beta_n(\mu,\bv)/2)
\]
where
\[
\alpha_n = \alpha_0 + n + 1 \qquad \beta_n(\mu,\bv) = \sum_{i=1}^n \frac{1}{v_i} (y_i - \mu)^2 + \lambda (\mu - \eta)^2 + \beta_0.
\]
\item Full conditional for $v_i, i=1,\ldots,n$: we can see from the form of $\pi_n(\mu,\sigma^2,\bv)$ that
\[
\pi_n(\bv|\mu,\sigma^2) = \prod_{i=1}^n \pi_n(v_i|\mu,\sigma^2)
\]
where
\[
\pi_n(v_i|\mu,\sigma^2) \propto \left( \frac{1}{v_i} \right)^{(\nu + 1)/2+1} \exp\left\{ - \frac{1}{2 v_i} \left[ \nu + \frac{(y_i - \mu)^2}{\sigma^2} \right] \right\}
\]
that is,
\[
\pi_n(v_i|\mu,\sigma^2) \equiv InvGamma((\nu+1)/2,\varphi_i(\mu,\sigma^2)/2)
\]
where
\[
\varphi_i(\mu,\sigma^2) = \nu + \frac{(y_i - \mu)^2}{\sigma^2}.
\]
\end{itemize}
Therefore all the full conditionals are standard distributions, and the elements of $\bv$ can be updated independently in parallel.
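As an optional check on the algebra for $\mu$ (an illustration only; \texttt{v.test}, \texttt{eta.t} and \texttt{lam.t} are scratch quantities), we can verify on a grid that the derived Normal form agrees, up to an additive constant on the log scale, with the brute-force product of the conditional likelihood and the prior at a fixed $\sigma^2 = 1$:

<<>>=
#Check the mu full conditional at sigma^2 = 1 and arbitrary v values
set.seed(99)
v.test<-1/rgamma(n,nu/2,nu/2)
mu.grid<-seq(0,4,by=0.01)
#Brute-force log conditional (up to an additive constant)
log.bf<-sapply(mu.grid,function(m)
  sum(dnorm(y,m,sqrt(v.test),log=TRUE))+
  dnorm(m,eta,sqrt(1/lambda),log=TRUE))
#Derived Normal form
lam.t<-sum(1/v.test)+lambda
eta.t<-(sum(y/v.test)+lambda*eta)/lam.t
log.der<-dnorm(mu.grid,eta.t,sqrt(1/lam.t),log=TRUE)
range(log.bf-log.der)   #constant difference => the two forms agree
@

With the algebra confirmed, the Gibbs sampler with these three standard updates is implemented as follows.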
<<>>=
old.mu<-0
old.sigsq<-1
old.v<-rep(1,n)
ico<-0
par.samp.A<-matrix(0,nrow=nsamp,ncol=2)
v.samp.A<-matrix(0,nrow=nsamp,ncol=n)
for(iter in 1:nits){
  #Update mu
  lam.n<-sum(1/old.v)+lambda
  eta.n<-(sum(y/old.v)+eta*lambda)/lam.n
  old.mu<-rnorm(1,eta.n,sqrt(old.sigsq/lam.n))
  #Update sigma^2
  al.n<-al+n+1
  be.n<-sum((y-old.mu)^2/old.v)+lambda*(old.mu-eta)^2+be
  old.sigsq<-1/rgamma(1,al.n/2,be.n/2)
  #Update v
  old.v<-1/rgamma(n,(nu+1)/2,(nu+(y-old.mu)^2/old.sigsq)/2)
  #Collect output
  if(iter > nburn & iter %% nthin == 0){
    ico<-ico+1
    par.samp.A[ico,]<-c(old.mu,old.sigsq)
    v.samp.A[ico,]<-old.v
  }
}
@

<<>>=
par(mar=c(4,4,2,0))
plot(par.samp.A,pch=19,cex=0.5,xlab=mulab,ylab=siglab,xlim=range(0,3.5),ylim=range(0,3.5))
#Summaries
mean(par.samp.A[,1])
mean(par.samp.A[,2])
var(par.samp.A)
@

<<>>=
#Posterior histograms
par(mar=c(4,4,4,0),mfrow=c(1,2))
inc<-par.samp.A[,2]<3.5
hist(par.samp.A[,1],breaks=seq(0,3.5,by=0.05),ylim=range(0,2),
     freq=FALSE,main=mulab,xlab='');box()
hist(par.samp.A[inc,2],breaks=seq(0,3.5,by=0.05),ylim=range(0,2),
     freq=FALSE,main=siglab,xlab='');box()
@

<<>>=
#Trace plots
par(mar=c(4,4,1,0))
plot(par.samp.A[,1],type='l',col='red',xlab='Sample',ylab='',ylim=range(0,4))
lines(par.samp.A[,2],type='l',col='blue')
legend(0,4,c(mulab,siglab),lty=1,col=c('red','blue'))
@

<<>>=
#ACF plots
par(mar=c(4,4,3,0),mfrow=c(1,2))
acf(par.samp.A[,1],main=mulab)
acf(par.samp.A[,2],main=siglab)
#Effective sample size
library(coda)
effectiveSize(par.samp.A)
#Efficiency
effectiveSize(par.samp.A)/nsamp
@

To compare the two MCMC approaches, we may overlay the two posterior samples:

<<>>=
par(mar=c(4,4,2,0))
plot(par.samp.A,pch=19,cex=0.5,col='red',
     xlab=mulab,ylab=siglab,xlim=range(0,3.5),ylim=range(0,3.5))
points(par.samp,col='blue',pch=19,cex=0.5)
legend(0,3.5,c('Auxiliary','Gibbs'),pch=19,cex=0.5,col=c('red','blue'))
@

It appears that the auxiliary variable method explores more of the parameter space.
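To quantify this impression, we can place the per-draw efficiencies of the two samplers side by side (a supplementary comparison using the \texttt{coda} quantities computed above):

<<>>=
#Supplementary: efficiency of the two samplers compared directly
rbind(MetropolisGibbs=effectiveSize(par.samp)/nsamp,
      Auxiliary=effectiveSize(par.samp.A)/nsamp)
@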
\pagebreak

\textbf{Approach 3:} \textbf{Rejection sampling}

For rejection sampling, we may use a bivariate proposal distribution whose parameters are taken from a Normal approximation at the posterior mode, located using \texttt{optim}.

<<>>=
log.post<-function(parv,yv,etv,lamv,a0v,b0v,nuv){
  if(parv[2]<=0){return(-Inf)}  #guard against invalid sigma^2 values
  t1<-log.like(yv,parv[1],sqrt(parv[2]),nuv)
  t2<-dnorm(parv[1],etv,sqrt(parv[2])/sqrt(lamv),log=TRUE)
  t3<-dinvgamma(parv[2],a0v/2,b0v/2,log=TRUE)
  return(t1+t2+t3)
}
pstart<-c(0,1)
olist<-list(fnscale=-1)
post.max<-optim(pstart,fn=log.post,yv=y,etv=eta,lamv=lambda,a0v=al,b0v=be,nuv=nu,
                control=olist,hessian=TRUE)
rej.mu<-post.max$par
rej.mu                          #Proposal mean
rej.Sig<-solve(-post.max$hessian)
rej.Sig                         #Proposal variance matrix
@

<<>>=
gpts<-seq(0,1,by=0.005)
mu1<-sig1<-gpts*3.5             #Grids
#Compute the posterior on the log scale
log.post.func1<-function(muv,sigv,yv,etv,lamv,a0v,b0v,nuv){
  parv<-c(muv,sigv)
  t1<-log.post(parv,yv,etv,lamv,a0v,b0v,nuv)
  return(t1)
}
f <- Vectorize(log.post.func1,vectorize.args=c("muv","sigv"))
log.p1<-outer(mu1,sig1,f,yv=y,etv=eta,lamv=lambda,a0v=al,b0v=be,nuv=nu)
log.p1[log.p1 < -70]<--70
library(fields)                 #for image.plot
par(pty='s',mar=c(2,3,2,2))
colfunc <- colorRampPalette(c("blue","lightblue","white","yellow","orange","red"))
image.plot(mu1,sig1,log.p1,col=colfunc(100),zlim=range(-70,-34),
           xlab=mulab,ylab=siglab,cex.axis=0.8)
contour(mu1,sig1,log.p1,add=TRUE,levels=seq(-70,-34,by=2))
@

We propose from a bivariate Student-t distribution with 2 degrees of freedom, using the \texttt{mvnfast} package functions \texttt{dmvt} and \texttt{rmvt}.

<<>>=
library(mvnfast)
log.rat<-function(parv,yv,etv,lamv,a0v,b0v,nuv,Muv,Sigv){
  if(parv[2]<0){return(-Inf)}
  t1<-log.like(yv,parv[1],sqrt(parv[2]),nuv)
  t2<-dnorm(parv[1],etv,sqrt(parv[2])/sqrt(lamv),log=TRUE)
  t3<-dinvgamma(parv[2],a0v/2,b0v/2,log=TRUE)
  t4<-dmvt(parv,Muv,Sigv,2,log=TRUE)
  return(t1+t2+t3-t4)
}
pstart<-c(0,1)
olist<-list(fnscale=-1)
rat.max<-optim(pstart,fn=log.rat,yv=y,etv=eta,lamv=lambda,a0v=al,b0v=be,nuv=nu,
               Muv=rej.mu,Sigv=rej.Sig,control=olist,hessian=TRUE)
logM<-rat.max$value
nacc<-0
ico<-0
while(nacc < nsamp){
  th<-rmvt(nsamp,rej.mu,rej.Sig,2)
  u<-log(runif(nsamp))
  r<-apply(th,1,log.rat,yv=y,etv=eta,lamv=lambda,a0v=al,b0v=be,nuv=nu,
           Muv=rej.mu,Sigv=rej.Sig)
  acc<-u < r - logM
  if(nacc==0){
    rej.samp<-th[acc,]
  }else{
    rej.samp<-rbind(rej.samp,th[acc,])
  }
  nacc<-nacc+sum(acc)
  ico<-ico+nsamp
}
c(ico,nacc,nacc/ico)   #Proposals, acceptances, acceptance rate
rej.samp<-rej.samp[1:nsamp,]
@

<<>>=
par(mar=c(4,4,2,0))
plot(par.samp.A,pch=19,cex=0.5,col='red',
     xlab=mulab,ylab=siglab,xlim=range(0,3.5),ylim=range(0,3.5))
points(rej.samp,col='green',pch=19,cex=0.5)
legend(0,3.5,c('Auxiliary','Rejection'),pch=19,cex=0.5,col=c('red','green'))
@

A comparison shows that the auxiliary variable MCMC sampler matches the rejection sampler, which is exact provided that the numerically located bound $\log M$ is a true upper bound on the log acceptance ratio.

<<>>=
var(par.samp)    #Gibbs sampler
var(par.samp.A)  #Auxiliary variable sampler
var(rej.samp)    #Rejection sampling
@

\end{document}