\documentclass[notitlepage]{article}
\usepackage{../../Math598}
\usepackage{listings}
\usepackage{numprint}
\usepackage{enumerate}
\usepackage{multirow}
\usepackage{bbm}
\usepackage{bbold}
\usepackage{bm}
\usepackage{amsfonts,amsmath,amssymb}
\setlength{\parindent}{0pt}
\lstloadlanguages{R}
\definecolor{keywordcolor}{rgb}{0,0.6,0.6}
\definecolor{delimcolor}{rgb}{0.461,0.039,0.102}
\definecolor{Rcommentcolor}{rgb}{0.101,0.043,0.432}
\lstdefinestyle{Rsettings}{
  basicstyle=\ttfamily,
  breaklines=true,
  showstringspaces=false,
  keywords={if, else, function, theFunction, tmp}, % Write as many keywords
  otherkeywords={},
  commentstyle=\itshape\color{Rcommentcolor},
  keywordstyle=\color{keywordcolor},
  moredelim=[s][\color{delimcolor}]{"}{"},
}
\lstset{basicstyle=\ttfamily, numbers=none, literate={~} {$\sim$}{2}}
\begin{document}
<>=
library(knitr)
# global chunk options
opts_chunk$set(cache=TRUE, autodep=TRUE)
options(scipen=999)
options(repos=c(CRAN="https://cloud.r-project.org/"))
# Inline output hook: print whole numbers without decimals and other
# numerics to 3 d.p.  Non-numeric values are passed through unchanged
# (the original returned NULL for them, so they printed as nothing).
# Formatting is done element-by-element because sprintf("%6i", x)
# errors in R when x contains any non-whole value, and the original
# ifelse() evaluated both sprintf() branches over the whole vector.
inline_hook <- function (x) {
  if (is.numeric(x)) {
    res <- vapply(x,
                  function(v) {
                    if (v == round(v)) sprintf("%6i", as.integer(v))
                    else sprintf("%.3f", v)
                  },
                  character(1))
    paste(res, collapse = ", ")
  } else {
    x
  }
}
knit_hooks$set(inline = inline_hook)
@
\begin{center}
\textsclarge{MATH 598: TOPICS IN STATISTICS}
\vspace{0.1 in}
\textsc{Hamiltonian Markov chain Monte Carlo}
\end{center}
\emph{Hamiltonian Monte Carlo} appeals to the dynamics of physical systems to perform sampling from a target distribution. Consider two $d$-dimensional vectors that describe the motion of an object in $\R^d$.
\begin{itemize}
\item $x = (x_1,\ldots,x_d)$ denotes the \emph{position} of the object
\item $v = (v_1,\ldots,v_d)$ denotes the \emph{momentum} (proportional to the \emph{velocity}) of the object.
\end{itemize}
Let $z = (x,v)$.
Hamiltonian dynamics is formulated via the \emph{Hamiltonian}, denoted $H(z) \equiv H(x,v)$, which is a function of $z=(x,v)$ but \emph{not} $t$, and which describes how $z$ changes in time via the differential form \[ \frac{dz}{dt} = \begin{bmatrix} \mathbf{0}_d & \Ident_d \\ - \Ident_d & \mathbf{0}_d \end{bmatrix} \frac{\partial H(z)}{\partial z} = \bD \frac{\partial H(z)}{\partial z} \] say, that is \[ \frac{d x}{d t} = \frac{\partial H(x,v)}{ \partial v} \qquad \qquad \frac{d v}{d t} = -\frac{\partial H(x,v)}{ \partial x}. \] Attention focusses on Hamiltonians that satisfy $H(x,v) = H(x,-v)$ and are assumed to be separable \[ H(x,v) = U(x) + K(v) \] so that $K(v) = K(-v)$. \begin{itemize} \item $U(x)$ is termed the \emph{potential energy} \item $K(v)$ is termed the \emph{kinetic energy}, where typically \[ K(v) = \frac{1}{2} v^\top \bM^{-1} v \] where $\bM$ is a positive definite, symmetric matrix. \end{itemize} The Hamiltonian, and Hamilton's equations, define the evolution of the system in continuous time: the objective is to find the solution $z(t) = (x(t),v(t))$ to the equations for all $t$. \bigskip In a statistical problem, we consider joint pdf $\pi_{X,V}(x,v)$ \[ \pi_{X,V}(x,v) \varpropto \exp \left\{ -H(x,v) \right\} = \exp \left\{ - U(x) - K(v) \right\} \] corresponding to independence of corresponding random variables $X$ and $V$ \[ \pi_X(x) \varpropto \exp \left\{ - U(x) \right\} \qquad \text{and} \qquad \pi_V(v) \varpropto \exp \left\{- K(v) \right\}. \] If the marginal $\pi_X(x)$ is our true target, then $V$ is merely an \emph{auxiliary variable}. We are free to choose the distribution $\pi_V(v)$, and a typical choice is the multivariate Normal \[ \pi_V(v) \varpropto \exp \left\{- \frac{1}{2} v^\top \bM^{-1} v \right\} = \exp \left\{- K(v) \right\}. 
\] If $\bM = \text{diag}(m_1,\ldots,m_d)$, then \[ K(v) = \frac{1}{2} \sum_{j=1}^d \frac{v_j^2}{m_j} \] which corresponds to an assumption that $V_j \sim Normal(0,m_j)$ for $j=1,\ldots,d$ are independent. Here \[ \dot{K}(v) = \bM^{-1} v = \left[\begin{array}{c} \dfrac{v_1}{m_1} \\[6pt] \vdots \\[6pt] \dfrac{v_d}{m_d} \end{array} \right] \] Hamiltonian dynamics can be approximated using a discrete time approach. We consider $\{z_k \equiv z(k \delta), k=1,2,\ldots\}$ for time-step $\delta > 0$. \begin{itemize} \item \textbf{Euler method:} the dynamics equation becomes in an Euler approximation \[ z_{k+1} = z_k + \delta \bD \frac{\partial H(z)}{\partial z} \bigg |_{z=z_k}. \] That is, if $H(x,v) = U(x) + K(v)$ \begin{align*} x_{k+1} & = x_k + \delta \frac{\partial H(x,v)}{\partial v} \bigg |_{z=(x_k,v_k)} = x_k + \delta \dot{K}(v_k) \\[6pt] v_{k+1} & = v_k - \delta \frac{\partial H(x,v)}{\partial x} \bigg |_{z=(x_k,v_k)} = v_k - \delta \dot{U}(x_k) \end{align*} \item \textbf{Improved Euler:} Euler's method can be improved by considering \emph{sequential} updating: \begin{align*} x_{k+1} & = x_k + \delta \dot{K}(v_k) \\[6pt] v_{k+1} & = v_k - \delta \dot{U}(x_{k+1}) \end{align*} \item \textbf{Leapfrog method:} The leapfrog method uses half-steps and gives further improvement with the updates \begin{align*} v_{k}^\ast & = v_k - \frac{\delta}{2} \dot{U}(x_k) \\[6pt] x_{k+1} & = x_k + \delta \dot{K}(v_{k}^\ast) \\[6pt] v_{k+1} & = v_{k}^\ast - \frac{\delta}{2} \dot{U}(x_{k+1}) \end{align*} where $v_{k}^\ast \equiv v((k+1/2)\delta )$. \end{itemize} The basic Hamiltonian MCMC algorithm proceeds using the following Metropolis accept/reject approach: we construct an MCMC move $(x_k,v_k) \lra (x_{k+1},v_{k+1})$ as follows: \begin{enumerate}[(I)] \item Generate $v_{1}^\prime \sim Normal_d(\mathbf{0}_d,\bM)$.
\item Perform $L$ dynamics updates with time-step $\delta$: for example, for the leapfrog updates, \begin{enumerate}[(i)] \item set $v_1^\prime$, $x_1^\prime = x_{k}$; \item for $l=1,\ldots,L-1$ \begin{align*} v_{l}^\ast & = v_l^\prime - \frac{\delta}{2} \dot{U}(x_l^\prime) \\[6pt] x_{l+1}^\prime & = x_l^\prime + \delta \dot{K}(v_{l}^\ast) \\[6pt] v_{l+1}^\prime & = v_{l}^\ast - \frac{\delta}{2} \dot{U}(x_{l+1}^\prime); \end{align*} \item set $x_{k+1}^\ast = x_{L}^\prime$ and $v_{k+1}^\ast = - v_{L}^\prime$. \end{enumerate} \item Accept $(x_{k+1}^\ast,v_{k+1}^\ast)$ with probability \[ \min \left \{1,\frac{\pi_{X,V}(x_{k+1}^\ast,v_{k+1}^\ast)}{\pi_{X,V}(x_{k},v_{k})} \right\} \] where \[ \frac{\pi_{X,V}(x_{k+1}^\ast,v_{k+1}^\ast)}{\pi_{X,V}(x_{k},v_{k})} = \exp\{-H(x_{k+1}^\ast,v_{k+1}^\ast) + H(x_{k},v_{k}) \}. \] \end{enumerate} The proposal in step (II) is reversible by construction, so the proposal mechanism does not appear in the acceptance probability as it cancels in numerator and denominator of the ratio. \bigskip \textbf{Example: Bivariate normal} Suppose that $\pi_X(x) \equiv Normal_2(\mathbf{0}_2,\Sigma)$ where \[ \Sigma = \begin{bmatrix} 1 & \rho \\[3pt] \rho & 1 \end{bmatrix} \] so that in the Hamiltonian notation \[ U(x) = \frac{1}{2} x^\top \Sigma^{-1} x \qquad \qquad \dot{U}(x) = \Sigma^{-1} x. \] Suppose also that $\bM = \Ident_2$, so that $\dot{K}(v) = v$. The basic version of Hamiltonian MCMC constructs a move $(x_k,v_k) \lra (x_{k+1},v_{k+1})$ as follows: \begin{enumerate}[(I)] \item Generate $v_{1}^\prime \sim Normal_2(\mathbf{0}_2,\Ident_2)$.
\item For the leapfrog updates, for fixed $\delta$ and $L$
\begin{enumerate}[(i)]
\item set $v_1^\prime$, $x_1^\prime = x_{k}$;
\item for $l=1,\ldots,L-1$
\begin{align*}
v_{l}^\ast & = v_l^\prime - \frac{\delta}{2} \Sigma^{-1} x_l^\prime \\[6pt]
x_{l+1}^\prime & = x_l^\prime + \delta v_{l}^\ast \\[6pt]
v_{l+1}^\prime & = v_{l}^\ast - \frac{\delta}{2} \Sigma^{-1} x_{l+1}^\prime;
\end{align*}
\item set $x_{k+1}^\ast = x_{L}^\prime$ and $v_{k+1}^\ast = - v_{L}^\prime$.
\end{enumerate}
\item Accept $(x_{k+1}^\ast,v_{k+1}^\ast)$ with probability
\[
\min \left \{1,\frac{\pi_{X,V}(x_{k+1}^\ast,v_{k+1}^\ast)}{\pi_{X,V}(x_{k},v_{k})} \right\}
\]
where
\[
\frac{\pi_{X,V}(x_{k+1}^\ast,v_{k+1}^\ast)}{\pi_{X,V}(x_{k},v_{k})} = \exp\{-H(x_{k+1}^\ast,v_{k+1}^\ast) + H(x_{k},v_{k}) \}.
\]
\end{enumerate}
We illustrate this with $\rho = 0.9$, $\delta = 0.25$ and $L=25$.
<>=
rho<-0.9
Sigma<-matrix(c(1,rho,rho,1),2,2)
SigInv<-solve(Sigma)
L<-25
del<-0.25
x<-c(0,0)
nits<-1000
xmat<-matrix(0,nrow=nits,ncol=2)
for(iter in 1:nits){
  #Initialize: resample the momentum
  v<-rnorm(2)
  #Leapfrog: L full steps, propagating the proposal (xnew,vnew).
  #The current state (x,v) is kept unchanged for the Metropolis ratio.
  #(Bug fix: the original never carried the state forward inside this
  #loop, so it performed a single leapfrog step regardless of L.)
  xnew<-x
  vnew<-v
  for(l in 1:L){
    vstar<-vnew-0.5*del*SigInv %*% xnew
    xnew<-xnew+del*vstar
    vnew<-vstar-0.5*del*SigInv %*% xnew
  }
  #Negate the momentum so the proposal is reversible
  vnew<--vnew
  #Metropolis accept/reject: log pi(x,v) = -U(x)-K(v) up to a constant
  lpi.old<--0.5*t(x) %*% (SigInv %*% x) - 0.5*sum(v^2)
  lpi.new<--0.5*t(xnew) %*% (SigInv %*% xnew) - 0.5*sum(vnew^2)
  if(log(runif(1)) < lpi.new-lpi.old){
    x<-xnew
    v<-vnew
  }
  xmat[iter,]<-x
}
@
<>=
par(mar=c(4,3,1,0))
xl<-yl<-range(-3,3)
plot(xmat[,1],type='l',ylim=xl,xlab='Iteration')
lines(xmat[,2],col='red')
legend(600,3,c(expression(x[1]),expression(x[2])),lty=1,col=c('black','red'))
par(mar=c(4,3,1,0),pty='s')
plot(xmat,xlab=expression(x[1]),ylab=expression(x[2]),xlim=xl,ylim=yl,pch=20,cex=0.8)
@
The first 50 steps of the MCMC run are plotted below
<>=
par(mar=c(4,3,1,0),pty='s')
plot(xmat,xlab=expression(x[1]),ylab=expression(x[2]),xlim=xl,type='n',ylim=yl)
for(i in 2:50){lines(xmat[c(i-1,i),1],xmat[c(i-1,i),2])}
@
It is also instructive to look at the path of the leapfrog proposal mechanism.
For $L=50$ steps the generated path is as below: the green dot is the starting value of $x$, and the white dot is the ending position.
<>=
library(mvnfast)
# grid on which the bivariate Normal log-density is evaluated for plotting
xvec<-yvec<-seq(-2.5,2.5,by=0.01)
rho<-0.8
Sigma<-matrix(c(1,rho,rho,1),2,2)
SigInv<-solve(Sigma)
# log-density (lv=TRUE) of Normal_2(muv,sigv) at the point (xv,yv)
dfunc<-function(xv,yv,muv,sigv,lv=TRUE){
dval<-dmvn(c(xv,yv),muv,sigv,log=lv)
}
# vectorize over the grid coordinates so outer() can build the surface
f <- Vectorize(dfunc,vectorize.args=c("xv","yv"))
zmat<-outer(xvec,yvec,f,muv=c(0,0),sigv=Sigma)
library(fields,quietly=TRUE)
par(pty='s',mar=c(4,3,2,2))
colfunc <- colorRampPalette(c("blue","lightblue","white","yellow","orange","red"))
image.plot(xvec,yvec,zmat,col=colfunc(200),
xlab=expression(x[1]),ylab=expression(x[2]),cex.axis=0.8)
contour(xvec,yvec,zmat,add=T,levels=seq(-30,0,by=2))
title(expression(paste('Log ',pi[x](x))))
#Leapfrog run from fixed starting position
L<-50
del<-0.25
x<-runif(2,-2,2)
v<-rnorm(2)
xpath<-matrix(0,nrow=L,ncol=2)
# record potential (U), kinetic (K) and total (Ham) energy along the path
Ham<-U<-K<-rep(0,L)
xpath[1,]<-x
U[1]<-0.5*t(x) %*% (SigInv %*% x)
K[1]<-0.5*sum(v^2)
Ham[1]<-U[1]+K[1]
for(l in 1:(L-1)){
# one leapfrog step: half-step in v, full step in x, half-step in v
vstar<-v-0.5*del*SigInv %*% x
xnew<-x+del*vstar
vnew<-vstar-0.5*del*SigInv %*% xnew
x<-xnew
v<-vnew
xpath[l+1,]<-x
U[l+1]<-0.5*t(x) %*% (SigInv %*% x)
K[l+1]<-0.5*sum(v^2)
Ham[l+1]<-U[l+1]+K[l+1]
}
# overlay the path on the density image; green = start, white = end
for(i in 2:L){lines(xpath[c(i-1,i),1],xpath[c(i-1,i),2],lty=3)}
points(xpath[1,1],xpath[1,2],col='green',pch=20)
points(xpath[L,1],xpath[L,2],col='white',pch=20);
@
If we examine how the $H(x,v)$, $U(x)$ and $K(v)$ functions vary across the path, we see that $H(x,v)$ is almost constant, whereas the other two functions vary in opposition to each other.
<>=
par(mar=c(4,3,1,0))
# total energy H is (near) conserved; U and K trade off along the path
plot(Ham,type='l',ylim=range(0,3),xlab='Leapfrog step')
lines(U,lty=2,col='red')
lines(K,lty=2,col='blue')
legend(30,3,c(expression(H(x,v)),expression(U(x)),expression(K(v))),
lty=c(1,2,2),col=c('black','red','blue'))
@
\pagebreak
For another realization
<>=
par(pty='s',mar=c(4,3,2,2))
image.plot(xvec,yvec,zmat,col=colfunc(200),
xlab=expression(x[1]),ylab=expression(x[2]),cex.axis=0.8)
contour(xvec,yvec,zmat,add=T,levels=seq(-30,0,by=2))
title(expression(paste('Log ',pi[x](x))))
#Leapfrog run from fixed starting position
L<-50
del<-0.25
x<-runif(2,-2,2)
v<-rnorm(2)
xpath<-matrix(0,nrow=L,ncol=2)
Ham<-U<-K<-rep(0,L)
xpath[1,]<-x
U[1]<-0.5*t(x) %*% (SigInv %*% x)
K[1]<-0.5*sum(v^2)
Ham[1]<-U[1]+K[1]
for(l in 1:(L-1)){
# one leapfrog step: half-step in v, full step in x, half-step in v
vstar<-v-0.5*del*SigInv %*% x
xnew<-x+del*vstar
vnew<-vstar-0.5*del*SigInv %*% xnew
x<-xnew
v<-vnew
xpath[l+1,]<-x
U[l+1]<-0.5*t(x) %*% (SigInv %*% x)
K[l+1]<-0.5*sum(v^2)
Ham[l+1]<-U[l+1]+K[l+1]
}
for(i in 2:L){lines(xpath[c(i-1,i),1],xpath[c(i-1,i),2],lty=3)}
points(xpath[1,1],xpath[1,2],col='green',pch=20)
points(xpath[L,1],xpath[L,2],col='white',pch=20);
@
<>=
par(mar=c(4,3,1,0))
plot(Ham,type='l',ylim=range(0,3),xlab='Leapfrog step')
lines(U,lty=2,col='red')
lines(K,lty=2,col='blue')
legend(30,3,c(expression(H(x,v)),expression(U(x)),expression(K(v))),
lty=c(1,2,2),col=c('black','red','blue'))
@
\pagebreak
For a third realization
<>=
par(pty='s',mar=c(4,3,2,2))
image.plot(xvec,yvec,zmat,col=colfunc(200),
xlab=expression(x[1]),ylab=expression(x[2]),cex.axis=0.8)
contour(xvec,yvec,zmat,add=T,levels=seq(-30,0,by=2))
title(expression(paste('Log ',pi[x](x))))
#Leapfrog run from fixed starting position
L<-50
del<-0.25
x<-runif(2,-2,2)
v<-rnorm(2)
xpath<-matrix(0,nrow=L,ncol=2)
Ham<-U<-K<-rep(0,L)
xpath[1,]<-x
U[1]<-0.5*t(x) %*% (SigInv %*% x)
K[1]<-0.5*sum(v^2)
Ham[1]<-U[1]+K[1]
for(l in 1:(L-1)){
# one leapfrog step: half-step in v, full step in x, half-step in v
vstar<-v-0.5*del*SigInv %*% x
xnew<-x+del*vstar
vnew<-vstar-0.5*del*SigInv %*% xnew
x<-xnew
v<-vnew
xpath[l+1,]<-x
U[l+1]<-0.5*t(x) %*%
(SigInv %*% x)
K[l+1]<-0.5*sum(v^2)
Ham[l+1]<-U[l+1]+K[l+1]
}
for(i in 2:L){lines(xpath[c(i-1,i),1],xpath[c(i-1,i),2],lty=3)}
points(xpath[1,1],xpath[1,2],col='green',pch=20)
points(xpath[L,1],xpath[L,2],col='white',pch=20);
@
<>=
par(mar=c(4,3,1,0))
plot(Ham,type='l',ylim=range(0,3),xlab='Leapfrog step')
lines(U,lty=2,col='red')
lines(K,lty=2,col='blue')
legend(30,3,c(expression(H(x,v)),expression(U(x)),expression(K(v))),
lty=c(1,2,2),col=c('black','red','blue'))
@
\end{document}