\documentclass[notitlepage]{article}
\usepackage{../../Math598}
\usepackage{listings}
\usepackage{numprint}
\usepackage{enumerate}
\usepackage{multirow}
\usepackage{bbm}
\usepackage{bbold}
\usepackage{amsfonts,amsmath,amssymb}
\setlength{\parindent}{0pt}
\lstloadlanguages{R}
\definecolor{keywordcolor}{rgb}{0,0.6,0.6}
\definecolor{delimcolor}{rgb}{0.461,0.039,0.102}
\definecolor{Rcommentcolor}{rgb}{0.101,0.043,0.432}
\lstdefinestyle{Rsettings}{
  basicstyle=\ttfamily,
  breaklines=true,
  showstringspaces=false,
  keywords={if, else, function, theFunction, tmp}, % Write as many keywords
  otherkeywords={},
  commentstyle=\itshape\color{Rcommentcolor},
  keywordstyle=\color{keywordcolor},
  moredelim=[s][\color{delimcolor}]{"}{"},
}
\lstset{basicstyle=\ttfamily, numbers=none, literate={~} {$\sim$}{2}}
\begin{document}

<<>>=
library(knitr)
# global chunk options
opts_chunk$set(cache=TRUE, autodep=TRUE)
options(scipen=999)
options(repos=c(CRAN="https://cloud.r-project.org/"))
inline_hook <- function(x) {
  if (is.numeric(x)) {
    # ifelse does a vectorized comparison:
    # if integer, print without decimal; otherwise print six decimal places
    res <- ifelse(x == round(x), sprintf("%6i", x), sprintf("%.6f", x))
    paste(res, collapse = ", ")
  } else x
}
knit_hooks$set(inline = inline_hook)
@

\begin{center}
\textsclarge{MATH 598: TOPICS IN STATISTICS}

\vspace{0.1 in}

\textsc{Tempering and Annealing in Monte Carlo}
\end{center}

Suppose our target density is
\[
\pi(\bx) \equiv \omega \, \text{Normal}_2(\mu_1,\Sigma_1) + (1-\omega) \, \text{Normal}_2(\mu_2,\Sigma_2)
\]
where $\mu_1$ and $\mu_2$ are relatively well separated. Ordinary Metropolis-Hastings methods can struggle to sample this pdf properly, as they become stuck in a local mode around one of the $\mu$ values. In the following example we have $\omega=1/2$, and
\[
\mu_1 = (20,30) \qquad \mu_2 = (60,70) \qquad \Sigma_1 = \begin{bmatrix} 25 & 6 \\ 6 & 4 \end{bmatrix} \qquad \Sigma_2 = \begin{bmatrix} 64 & -72 \\ -72 & 100 \end{bmatrix}
\]

<<>>=
set.seed(34)
xv<-yv<-seq(0,100,0.5)
require(mvnfast)
mu1<-c(20,30)
mu2<-c(60,70)
Sig1<-diag(c(5,2)) %*% matrix(c(1,0.6,0.6,1),2,2) %*% diag(c(5,2))
Sig1
Tau1<-solve(Sig1)
Sig2<-diag(c(8,10)) %*% matrix(c(1,-0.9,-0.9,1),2,2) %*% diag(c(8,10))
Sig2
Tau2<-solve(Sig2)
# log of the (tempered) target: as omega = 1/2, the mixture weights
# cancel in the unnormalized form
joint.pdf<-function(xv,yv,m1,m2,S1,S2,Tmp=1){
  return(log(dmvn(c(xv,yv),m1,S1)+dmvn(c(xv,yv),m2,S2))/Tmp)
}
f <- Vectorize(joint.pdf,vectorize.args=c("xv","yv"))
fmat<-outer(xv,yv,f,m1=mu1,m2=mu2,S1=Sig1,S2=Sig2)
library(fields,quietly=TRUE)
par(pty='s',mar=c(4,4,2,2))
colfunc <- colorRampPalette(c("blue","lightblue","white","yellow","orange","red"))
image.plot(xv,yv,fmat,col=colfunc(100),zlim=range(-200,0),
    xlab=expression(x[1]),ylab=expression(x[2]),cex.axis=0.8)
contour(xv,yv,fmat,add=T,levels=seq(-150,0,by=10))
@

A Metropolis algorithm implementation illustrates the difficulty: the chain becomes stuck in one mode, and cannot traverse the deep valley between the two modes.
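The depth of that valley can be checked directly. The short chunk below is an added illustration (not part of the original analysis); it reuses \texttt{mu1}, \texttt{mu2}, \texttt{Sig1}, \texttt{Sig2} and \texttt{dmvn} from above to evaluate the unnormalized log target along the line segment joining $\mu_1$ and $\mu_2$:

<<>>=
# log target along the straight-line path from mu1 to mu2; the minimum
# shows the depth of the valley a random-walk chain must cross
tv<-seq(0,1,by=0.01)
path.x<-cbind(mu1[1]+tv*(mu2[1]-mu1[1]),mu1[2]+tv*(mu2[2]-mu1[2]))
log.path<-log(0.5*dmvn(path.x,mu1,Sig1)+0.5*dmvn(path.x,mu2,Sig2))
range(log.path)   # valley floor vs. modal value (log scale)
@

The log density falls by tens of log units between the modes and the valley floor, so a random-walk proposal of moderate scale essentially never crosses between the modes.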
<<>>=
old.x<-runif(2,0,100)
old.like<-log(dmvn(old.x,mu1,Sig1)+dmvn(old.x,mu2,Sig2))
nburn<-1000;nsamp<-10000;nthin<-10
nits<-nburn+nsamp*nthin
x.samp0<-array(0,c(nsamp,2))
like.val0<-matrix(0,nsamp)
ico<-0
for(iter in 1:nits){
  #Mutation
  new.x<-old.x+rmvn(1,c(0,0),diag(c(10,10)))
  new.like<-log(dmvn(new.x,mu1,Sig1)+dmvn(new.x,mu2,Sig2))
  if(log(runif(1)) < new.like-old.like){
    old.like<-new.like
    old.x<-new.x
  }
  if(iter > nburn & iter %% nthin == 0){
    ico<-ico+1
    x.samp0[ico,]<-old.x
    like.val0[ico,]<-old.like
  }
}
par(pty='s',mar=c(4,4,0,2))
image.plot(xv,yv,fmat,col=colfunc(100),zlim=range(-200,0),
    xlab=expression(x[1]),ylab=expression(x[2]),cex.axis=0.8)
contour(xv,yv,fmat,add=T,levels=seq(-150,0,by=5));points(x.samp0,pch=19,cex=0.5)
@

<<>>=
par(mfrow=c(1,2),mar=c(4,4,1,2))
plot(x.samp0[,1],ylim=range(0,100),xlab='Iter',ylab=expression(x[1]),type='l',cex.axis=0.8)
plot(x.samp0[,2],ylim=range(0,100),xlab='Iter',ylab=expression(x[2]),type='l',cex.axis=0.8)
@

\medskip

To overcome this, we can use a population approach with tempered distributions. Let
\[
\pi_m(x) \propto \{\pi(x)\}^{1/T_m} \qquad m=1,2,\ldots,M
\]
where the temperatures
\[
1 = T_1 < T_2 < \cdots < T_M
\]
yield progressively flatter tempered forms of $\pi$, with $\pi_1 \equiv \pi$. We run $M$ parallel chains, each with its own target, and then allow for exchange between the tempered chains and the target chain $m=1$. In the following implementation, we use
\begin{itemize}
\item \textbf{Mutation moves:} a standard Metropolis algorithm is used for each chain.
\item \textbf{Exchange moves:} an exchange move between chain $m=1$ and a randomly selected chain $l$, say, is proposed -- this involves merely a swap of the current values of the two chains, and an acceptance of the move with probability
\[
\min \left\{1, \frac{\pi_l(x_m) \pi_m(x_l)}{\pi_l(x_l) \pi_m(x_m)} \right\}.
\]
\end{itemize}

<<>>=
M<-5
set.seed(3764)
Temps<-1+10*c(0:(M-1))/M
old.x<-matrix(runif(2*M,0,100),M,2)
old.like<-log(dmvn(old.x,mu1,Sig1)+dmvn(old.x,mu2,Sig2))/Temps
x.samp<-array(0,c(nsamp,M,2))
like.val<-matrix(0,nsamp,M)
ico<-0
for(iter in 1:nits){
  #Mutation: one Metropolis step for each of the M chains
  new.x<-old.x+rmvn(M,c(0,0),diag(c(10,10)))
  new.like<-log(dmvn(new.x,mu1,Sig1)+dmvn(new.x,mu2,Sig2))/Temps
  ivec<-log(runif(M)) < new.like-old.like
  old.x[ivec,]<-new.x[ivec,]
  old.like[ivec]<-new.like[ivec]
  #Exchange: propose a swap of the full state of the target chain (i=1)
  #with that of a randomly chosen tempered chain j
  new.x<-old.x
  i<-1;j<-sample(2:M,size=1)
  new.x[i,]<-old.x[j,]
  new.x[j,]<-old.x[i,]
  new.like<-log(dmvn(new.x,mu1,Sig1)+dmvn(new.x,mu2,Sig2))/Temps
  if(log(runif(1)) < new.like[i]+new.like[j]-old.like[i]-old.like[j]){
    old.like[i]<-new.like[i]
    old.like[j]<-new.like[j]
    old.x[i,]<-new.x[i,]
    old.x[j,]<-new.x[j,]
  }
  if(iter > nburn & iter %% nthin == 0){
    ico<-ico+1
    x.samp[ico,,]<-old.x
    like.val[ico,]<-old.like
  }
}
par(pty='s',mar=c(4,4,2,2))
image.plot(xv,yv,fmat,col=colfunc(100),zlim=range(-200,0),
    xlab=expression(x[1]),ylab=expression(x[2]),cex.axis=0.8)
contour(xv,yv,fmat,add=T,levels=seq(-150,0,by=5))
points(x.samp[,1,],pch=19,cex=0.5)
@

<<>>=
par(mfrow=c(1,2),mar=c(4,4,0,2))
plot(x.samp[,1,1],ylim=range(0,100),xlab='Iter',ylab=expression(x[1]),type='l',cex.axis=0.8)
plot(x.samp[,1,2],ylim=range(0,100),xlab='Iter',ylab=expression(x[2]),type='l',cex.axis=0.8)
@

\pagebreak

\textbf{Annealed Importance Sampling:} \textit{Annealed Importance Sampling} (AIS) uses a sequential importance sampling (SIS) approach, but with auxiliary densities \textbf{of the same dimension} defined by
\[
p_{0j}(x) = c_j g_{0j}(x) = c_j \{g_0(x)\}^{1-\xi_j} \{g_n(x) \}^{\xi_j}
\]
where $p_0$ is a ``diffuse'' distribution, and
\[
0 = \xi_0 < \xi_1 < \cdots < \xi_n = 1.
\]
We have a ``path'' from $p_{00}(x) \equiv p_0(x)$ to the target distribution
\[
p_{0n}(x) \equiv \pi_n(x) = c_n g_n(x).
\]
For $j=1,\ldots,n-1$, let $P_j$ be a Markov kernel with invariant distribution $p_{0j}$. The AIS algorithm proceeds as follows:
\begin{itemize}
\item[1.] Sample $x_1$ from $p_{00}$, and set $w_0 = 1/g_0(x_1)$.
\medskip
\item[2.] For $j=1,2,\ldots,n-1$,
\begin{itemize}
\item[(i)] Sample $x_{j+1}$ from $P_j(x_{j},\cdot)$;
\smallskip
\item[(ii)] Set
\[
w_j = w_{j-1} \frac{p_{0j}(x_{j})}{p_{0j}(x_{j+1})} = w_{j-1} \frac{g_{0j}(x_{j})}{g_{0j}(x_{j+1})}.
\]
\end{itemize}
\medskip
\item[3.] Set $w_n = w_{n-1} \, g_n(x_n)$, then return to 1.\ and repeat to produce $N$ samples and weights
\[
x_n^{(1)},\ldots,x_n^{(N)} \qquad \qquad w_n^{(1)},\ldots,w_n^{(N)}
\]
where
\[
w_n = \frac{1}{g_0(x_1)} \times \frac{g_{01}(x_{1})}{g_{01}(x_{2})} \times \frac{g_{02}(x_{2})}{g_{02}(x_{3})} \times \cdots \times \frac{g_{0\:n-1}(x_{n-1})}{g_{0\:n-1}(x_{n})} \times g_n(x_n).
\]
\end{itemize}

Consider the \textbf{reverse kernel} $P_j^{R}$ defined using Bayes' Theorem as
\[
P_{j}^R(x_{j+1},x_j) = \frac{P_j(x_j,x_{j+1}) p_{0j}(x_j)}{p_{0j}(x_{j+1})} = \frac{P_j(x_j,x_{j+1}) g_{0j}(x_j)}{g_{0j}(x_{j+1})},
\]
and let
\[
g^\star(x_{1:n}) = g_{n}(x_n) \prod_{j=1}^{n-1} P_{j}^R(x_{j+1},x_{j}) \qquad \qquad g_0^\star(x_{1:n}) = g_{0}(x_1) \prod_{j=1}^{n-1} P_j(x_{j},x_{j+1})
\]
where $g_0^\star(x_{1:n})$ is proportional to the AIS proposal joint density $p_0^\star(x_{1:n})$. Define
\[
w^\star(x_{1:n}) = \frac{g^\star(x_{1:n})}{g_0^\star(x_{1:n})}
\]
as the importance sampling weight for the augmented sample
\[
x_{1:n} = (x_1,\ldots,x_n)
\]
generated from $p_0^\star$. We then have that
\[
w^\star(x_{1:n}) = \dfrac{g_{n}(x_n) \prod\limits_{j=1}^{n-1} P_{j}^R(x_{j+1},x_{j})}{g_{0}(x_1) \prod\limits_{j=1}^{n-1} P_j(x_{j},x_{j+1})} = \dfrac{g_{n}(x_n) \prod\limits_{j=1}^{n-1} \dfrac{P_{j}(x_{j},x_{j+1}) g_{0j}(x_{j})}{g_{0j}(x_{j+1})}}{g_{0}(x_1) \prod\limits_{j=1}^{n-1} P_j(x_{j},x_{j+1})}.
\]
That is, all the terms in $P_j(\cdot,\cdot)$ cancel, and
\[
w^\star(x_{1:n}) = \frac{1}{g_0(x_1)} \times \frac{g_{01}(x_{1})}{g_{01}(x_{2})} \times \frac{g_{02}(x_{2})}{g_{02}(x_{3})} \times \cdots \times \frac{g_{0\:n-1}(x_{n-1})}{g_{0\:n-1}(x_{n})} \times g_n(x_n).
\]
This is identical to the weight $w_n$ computed recursively above. Note also that for each $j=1,\ldots,n-1$,
\[
\int P_j^R(x_{j+1},x_j) \: d x_{j} = 1,
\]
so the marginal distribution of $x_{n}$ under
\[
\pi^\star (x_{1:n}) = c^\star g^\star(x_{1:n}),
\]
obtained by integrating $g^\star(x_{1:n})$ with respect to $x_1, x_2, \ldots, x_{n-1}$, is precisely the true target $\pi_n(x_n)$.
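Since the kernels $P_j$ integrate to one, the expected value of $w^\star$ under $p_0^\star$ equals the ratio of the normalizing constants of $g_n$ and $g_0$, so the average of the AIS weights estimates that ratio. The following one-dimensional chunk is an added sanity check of this identity (not part of the original analysis): with $g_0$ an unnormalized $N(0,5^2)$ density and $g_n$ an unnormalized $N(3,1)$ density, the true ratio is $\sqrt{2\pi}/\sqrt{2\pi \cdot 25} = 1/5$.

<<>>=
# One-dimensional AIS check: the mean weight estimates Z_n/Z_0 = 1/5
set.seed(101)
lg0<-function(x){-x^2/50}         # log g_0: unnormalized N(0,5^2)
lgn<-function(x){-(x-3)^2/2}      # log g_n: unnormalized N(3,1)
n<-50;N<-5000
xi<-c(1:n)/n                      # annealing schedule xi_1,...,xi_n
x<-rnorm(N,0,5)                   # x_1 ~ p_00
lw<- -lg0(x)                      # log w_0 = -log g_0(x_1)
for(j in 1:(n-1)){
  ltar<-function(x){(1-xi[j])*lg0(x)+xi[j]*lgn(x)}  # log g_{0j}
  x.old<-x
  x.new<-x+rnorm(N,0,2)           # Metropolis mutation targeting p_{0j}
  acc<-log(runif(N)) < ltar(x.new)-ltar(x.old)
  x[acc]<-x.new[acc]
  lw<-lw+ltar(x.old)-ltar(x)      # w_j = w_{j-1} g_{0j}(x_j)/g_{0j}(x_{j+1})
}
lw<-lw+lgn(x)                     # final factor g_n(x_n)
mean(exp(lw))                     # should be close to 0.2
@

The same construction, applied to the bivariate mixture target, is implemented below.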
<<>>=
p0j<-function(xv,m0,S0,m1,m2,S1,S2,Tmp){
  # log of the annealed density g_{0j}: (1-xi_j) log g_0 + xi_j log g_n
  Tv<-(1-Tmp)*dmvn(xv,m0,S0,log=T)+Tmp*log(0.5*dmvn(xv,m1,S1)+0.5*dmvn(xv,m2,S2))
  return(Tv)
}
mu0<-c(50,50)
Sig0<-diag(c(200,200))
n<-20
set.seed(3764)
Temps<-c(1:n)/n                   # annealing schedule xi_1,...,xi_n
N<-10000
xmat<-array(0,c(n,N,2))
wmat<-matrix(0,nrow=n,ncol=N)
xmat[1,,]<-rmvn(N,mu0,Sig0)       # x_1 ~ p_00
w0<- -dmvn(xmat[1,,],mu0,Sig0,log=T)   # log w_0 = -log g_0(x_1)
for(j in 1:(n-1)){
  old.x<-xmat[j,,]
  new.x<-old.x+rmvn(N,c(0,0),diag(c(10,10)))
  old.like<-p0j(old.x,mu0,Sig0,mu1,mu2,Sig1,Sig2,Temps[j])
  new.like<-p0j(new.x,mu0,Sig0,mu1,mu2,Sig1,Sig2,Temps[j])
  ivec<-log(runif(N)) < (new.like-old.like)
  old.x[ivec,]<-new.x[ivec,]
  old.like[ivec]<-new.like[ivec]
  xmat[j+1,,]<-old.x
  #log weight increment: log g_{0j}(x_j) - log g_{0j}(x_{j+1})
  wmat[j,]<-p0j(xmat[j,,],mu0,Sig0,mu1,mu2,Sig1,Sig2,Temps[j])-
    p0j(xmat[j+1,,],mu0,Sig0,mu1,mu2,Sig1,Sig2,Temps[j])
}
wmat[n,]<-p0j(xmat[n,,],mu0,Sig0,mu1,mu2,Sig1,Sig2,Temps[n])  # final factor: log g_n(x_n)
wvec<-apply(wmat,2,sum)+w0
wvec<-exp(wvec-max(wvec))
wvec<-wvec/sum(wvec)
xsamp.sir<-xmat[n,sample(1:N,prob=wvec,size=N,rep=T),]  # SIR resample
1/sum(wvec^2)                     # effective sample size of the weights
par(pty='s',mar=c(4,4,2,2))
image.plot(xv,yv,fmat,col=colfunc(100),zlim=range(-200,0),
    xlab=expression(x[1]),ylab=expression(x[2]),cex.axis=0.8)
contour(xv,yv,fmat,add=T,levels=seq(-150,0,by=5))
points(xmat[n,,],pch=19,cex=0.5)
title(expression(paste(x[n],': Sampled points')))
@

<<>>=
par(pty='s',mar=c(4,4,2,2))
image.plot(xv,yv,fmat,col=colfunc(100),zlim=range(-200,0),
    xlab=expression(x[1]),ylab=expression(x[2]),cex.axis=0.8)
contour(xv,yv,fmat,add=T,levels=seq(-150,0,by=5))
points(xsamp.sir,pch=19,cex=0.5)
title(expression(paste(x[n],': Resampled points')))
@

\end{document}