\documentclass[notitlepage]{article}
\usepackage{../../Math598}
\usepackage{listings}
\usepackage{numprint}
\usepackage{enumerate}
\usepackage{multirow}
\usepackage{bbm}
\usepackage{bbold}
\usepackage{bm}
\usepackage{amsfonts,amsmath,amssymb}
\setlength{\parindent}{0pt}
\lstloadlanguages{R}
\definecolor{keywordcolor}{rgb}{0,0.6,0.6}
\definecolor{delimcolor}{rgb}{0.461,0.039,0.102}
\definecolor{Rcommentcolor}{rgb}{0.101,0.043,0.432}
\lstdefinestyle{Rsettings}{
  basicstyle=\ttfamily,
  breaklines=true,
  showstringspaces=false,
  keywords={if, else, function, theFunction, tmp},
  otherkeywords={},
  commentstyle=\itshape\color{Rcommentcolor},
  keywordstyle=\color{keywordcolor},
  moredelim=[s][\color{delimcolor}]{"}{"},
}
\lstset{basicstyle=\ttfamily, numbers=none, literate={~} {$\sim$}{2}}
\begin{document}

<<>>=
library(knitr)
# Global chunk options
opts_chunk$set(cache=TRUE, autodep=TRUE)
options(scipen=999)
options(repos=c(CRAN="https://cloud.r-project.org/"))
inline_hook <- function(x) {
  if (is.numeric(x)) {
    # Vectorized comparison: integers print without decimals,
    # other numerics print to three decimal places
    res <- ifelse(x == round(x), sprintf("%6i", x), sprintf("%.3f", x))
    paste(res, collapse = ", ")
  } else {
    x   # Pass non-numeric inline output through unchanged
  }
}
knit_hooks$set(inline = inline_hook)
@

\begin{center}
\textsclarge{MATH 598: TOPICS IN STATISTICS}

\vspace{0.1 in}

\textsc{The Langevin algorithm}
\end{center}

Denote the target pdf by $\pi(x)$. Consider the Langevin SDE defining the diffusion process $\{X_t, t > 0\}$
\[
dX_t = \frac{1}{2} S(X_t) \, dt + dW_t
\]
where $\{W_t\}$ is standard Brownian motion, and
\[
S(x) = \frac{d}{dx} \log \pi(x) = \nabla \log \pi(x).
\]
It can be shown that the invariant measure of this continuous-time process is $\pi(x)$. A standard scheme for simulating this diffusion is the Euler approximation based on the time-discretization
\[
\{X_{k\delta}, k=1,2,\ldots\}
\]
for some time step $\delta > 0$: set $X_0 = x_0$, and then for $k=0,1,2,\ldots$
\[
X_{k+1} = X_k + \frac{\delta}{2} S(X_k) + \sqrt{\delta} Z_k
\]
where $\{Z_k, k=0,1,2,\ldots\}$ is an iid $\text{Normal}_d(0,\Ident_d)$ sequence. The approximation induced by the time-discretization does not quite preserve $\pi(x)$ as the invariant distribution. This can be corrected using a Metropolis-Hastings step, yielding the Metropolis-adjusted Langevin algorithm (MALA), in which the proposal
\[
X_{k+1}^\star \sim \text{Normal}_d(X_k + \delta S(X_k)/2, \delta \Ident_d)
\]
is \emph{accepted} with probability
\[
\min \left\{1, \frac{\pi(X_{k+1}^\star)}{\pi(X_{k})} \frac{q(X_{k+1}^\star,X_k)}{q(X_k,X_{k+1}^\star)} \right\}
\]
where
\[
q(\bx,\by) \varpropto \exp\left\{ -\frac{1}{2\delta} (\by - \bx - \delta S(\bx)/2)^\top (\by - \bx - \delta S(\bx)/2) \right\}.
\]

\medskip

\textbf{Example:} Suppose $Y \sim \text{Gamma}(\alpha,1)$, and consider the pdf of $X = \log Y$. Then for $x \in \R$
\[
f_X(x) = e^x f_Y(e^x) = \frac{1}{\Gamma(\alpha)} \exp\{\alpha x - e^x\}.
\]
We set $\pi(x) \equiv f_X(x)$ so that
\begin{align*}
\log \pi(x) & = \alpha x - e^x - \log \Gamma(\alpha)\\[6pt]
S(x) = \frac{d}{dx} \log \pi(x) & = \alpha - e^x.
\end{align*}
It is straightforward to sample from this pdf directly:
<<>>=
set.seed(23984)
al<-10
N<-10001
Y<-rgamma(N,al,1)  # Direct draws from Gamma(alpha,1)
X<-log(Y)          # Transform to the log scale
par(mar=c(3,3,3,0))
hist(X,freq=FALSE,breaks=seq(0,5,by=0.1),main='Direct sampling');box()
xvec<-seq(0,5,by=0.001)
yvec<-exp(al*xvec-exp(xvec))/gamma(al)  # Exact target density
lines(xvec,yvec,col='red')
@
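As a quick sanity check on the derivation of $S(x)$, we can compare the analytic score $\alpha - e^x$ with a central finite-difference approximation to the derivative of $\log \pi(x)$. This is a minimal illustrative sketch; the step size \texttt{h} and the evaluation points are arbitrary choices, not part of the algorithm.
<<>>=
# Sanity check (illustrative): analytic score vs central finite difference
h <- 1e-6
xs <- c(1, 2, 3)
S.analytic <- al - exp(xs)
S.numeric  <- ((al*(xs+h) - exp(xs+h)) - (al*(xs-h) - exp(xs-h)))/(2*h)
round(cbind(S.analytic, S.numeric), 6)
@
The two columns should agree to the precision of the finite-difference scheme.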
We may also simulate the Langevin dynamics directly using the Euler approximation. We use $N=10001$ steps, with a time step of $\delta = 0.01$, starting from $x_0 = 2$. We can compute the effective sample size of the resulting chain.
<<>>=
del<-0.01
x<-rep(0,N+1)
x[1]<-2
Z<-rnorm(N)
for(k in 1:N){
  # Euler step: drift plus Gaussian innovation
  x[k+1] <- x[k] + del*(al - exp(x[k]))/2 + sqrt(del)*Z[k]
}
library(coda)
ESS<-effectiveSize(x)
x.1<-x
@
The low effective sample size of \Sexpr{round(as.numeric(ESS),3)} indicates a high degree of autocorrelation. The effective sample size can be increased by increasing $\delta$ to 0.1.
<<>>=
del<-0.1
x<-rep(0,N+1)
x[1]<-2
Z<-rnorm(N)
for(k in 1:N){
  x[k+1] <- x[k] + del*(al - exp(x[k]))/2 + sqrt(del)*Z[k]
}
(ESS<-effectiveSize(x))
x.2<-x
@
The effective sample size is now \Sexpr{round(as.numeric(ESS),3)}, so performance appears to be better. We now compare the two schemes with different $\delta$ values.
<<>>=
par(mar=c(3,3,2,0),mfrow=c(1,2))
plot(x.1,type='l',main=expression(delta==0.01),ylim=range(1,4))
plot(x.2,type='l',main=expression(delta==0.1),ylim=range(1,4))
hist(x.1,freq=FALSE,breaks=seq(0,5,by=0.1),main=expression(delta==0.01),ylim=range(0,1.5));box()
lines(xvec,yvec,col='red')
hist(x.2,freq=FALSE,breaks=seq(0,5,by=0.1),main=expression(delta==0.1),ylim=range(0,1.5));box()
lines(xvec,yvec,col='red')
@
The approximation is poorer when $\delta = 0.1$: the discretization bias of the Euler scheme grows with the step size.

\medskip

We now implement the MALA correction:
<<>>=
log.pi<-function(xv,av){
  # Log target density, up to the additive constant -log Gamma(alpha)
  return(av*xv - exp(xv))
}
dlog.pi<-function(xv,av){
  # Score function S(x) = alpha - exp(x)
  return(av - exp(xv))
}
q.func<-function(xv,yv,av,dv){
  # Log density of the Langevin proposal y given current state x
  m<-xv+0.5*dv*dlog.pi(xv,av)
  return(dnorm(yv,m,sqrt(dv),log=TRUE))
}
del<-0.01
x<-rep(0,N+1)
x[1]<-2
Z<-rnorm(N)
irej<-0
for(k in 1:N){
  x[k+1] <- x[k] + del*(al - exp(x[k]))/2 + sqrt(del)*Z[k]
  # Log Metropolis-Hastings acceptance ratio
  acc<-log.pi(x[k+1],al)-log.pi(x[k],al) +
       q.func(x[k+1],x[k],al,del) - q.func(x[k],x[k+1],al,del)
  if(log(runif(1)) > acc){
    x[k+1]<-x[k]   # Reject: retain the current state
    irej<-irej+1
  }
}
(ESS<-effectiveSize(x))
x.3<-x
acc.rate.3<-(N-irej)/N
@
The low effective sample size of \Sexpr{round(as.numeric(ESS),3)} again indicates a high degree of autocorrelation. The effective sample size can again be increased by increasing $\delta$ to 0.1.
<<>>=
del<-0.1
x<-rep(0,N+1)
x[1]<-2
Z<-rnorm(N)
irej<-0
for(k in 1:N){
  x[k+1] <- x[k] + del*(al - exp(x[k]))/2 + sqrt(del)*Z[k]
  acc<-log.pi(x[k+1],al)-log.pi(x[k],al) +
       q.func(x[k+1],x[k],al,del) - q.func(x[k],x[k+1],al,del)
  if(log(runif(1)) > acc){
    x[k+1]<-x[k]   # Reject
    irej<-irej+1
  }
}
(ESS<-effectiveSize(x))
x.4<-x
acc.rate.4<-(N-irej)/N
print(c(acc.rate.3,acc.rate.4))
@
The effective sample size is now \Sexpr{round(as.numeric(ESS),3)}. The acceptance rates for the Metropolis step are very high in both algorithms.
<<>>=
par(mar=c(3,3,2,0),mfrow=c(1,2))
plot(x.3,type='l',main=expression(delta==0.01),ylim=range(1,4))
plot(x.4,type='l',main=expression(delta==0.1),ylim=range(1,4))
hist(x.3,freq=FALSE,breaks=seq(0,5,by=0.1),main=expression(delta==0.01),ylim=range(0,1.5));box()
lines(xvec,yvec,col='red')
hist(x.4,freq=FALSE,breaks=seq(0,5,by=0.1),main=expression(delta==0.1),ylim=range(0,1.5));box()
lines(xvec,yvec,col='red')
@
The sampling from the target is good in both cases: the inaccuracy seen at $\delta=0.1$ has been corrected by the Metropolis step.
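Since the same sampler is re-run below with several step sizes, it can be wrapped as a single reusable function. The following chunk is an illustrative refactoring only; the name \texttt{mala.run} is ours and is not used in the chunks that follow.
<<>>=
# Illustrative refactoring (not used below): one MALA run as a function
mala.run <- function(N, del, x0, av){
  x <- rep(0, N+1)
  x[1] <- x0
  irej <- 0
  for(k in 1:N){
    # Langevin proposal
    x[k+1] <- x[k] + del*dlog.pi(x[k], av)/2 + sqrt(del)*rnorm(1)
    # Log Metropolis-Hastings acceptance ratio
    acc <- log.pi(x[k+1], av) - log.pi(x[k], av) +
           q.func(x[k+1], x[k], av, del) - q.func(x[k], x[k+1], av, del)
    if(log(runif(1)) > acc){
      x[k+1] <- x[k]   # Reject: retain the current state
      irej <- irej + 1
    }
  }
  list(x = x, acc.rate = (N - irej)/N)
}
# Example call, reproducing the delta = 0.1 setting
fit <- mala.run(N, 0.1, 2, al)
c(effectiveSize(fit$x), fit$acc.rate)
@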
\medskip

We now assess results when $\delta$ is increased to $\delta=0.5$ and $\delta = 1.0$.
<<>>=
del<-0.5
x<-rep(0,N+1)
x[1]<-2
Z<-rnorm(N)
irej<-0
for(k in 1:N){
  x[k+1] <- x[k] + del*(al - exp(x[k]))/2 + sqrt(del)*Z[k]
  acc<-log.pi(x[k+1],al)-log.pi(x[k],al) +
       q.func(x[k+1],x[k],al,del) - q.func(x[k],x[k+1],al,del)
  if(log(runif(1)) > acc){
    x[k+1]<-x[k]   # Reject
    irej<-irej+1
  }
}
(ESS<-effectiveSize(x))
x.5<-x
acc.rate.5<-(N-irej)/N
del<-1
x<-rep(0,N+1)
x[1]<-2
Z<-rnorm(N)
irej<-0
for(k in 1:N){
  x[k+1] <- x[k] + del*(al - exp(x[k]))/2 + sqrt(del)*Z[k]
  acc<-log.pi(x[k+1],al)-log.pi(x[k],al) +
       q.func(x[k+1],x[k],al,del) - q.func(x[k],x[k+1],al,del)
  if(log(runif(1)) > acc){
    x[k+1]<-x[k]   # Reject
    irej<-irej+1
  }
}
(ESS<-effectiveSize(x))
x.6<-x
acc.rate.6<-(N-irej)/N
print(c(acc.rate.5,acc.rate.6))
par(mar=c(3,3,2,0),mfrow=c(1,2))
plot(x.5,type='l',main=expression(delta==0.5),ylim=range(1,4))
plot(x.6,type='l',main=expression(delta==1.0),ylim=range(1,4))
hist(x.5,freq=FALSE,breaks=seq(0,5,by=0.1),main=expression(delta==0.5),ylim=range(0,1.5));box()
lines(xvec,yvec,col='red')
hist(x.6,freq=FALSE,breaks=seq(0,5,by=0.1),main=expression(delta==1.0),ylim=range(0,1.5));box()
lines(xvec,yvec,col='red')
var(X)
apply(cbind(x.1,x.2,x.3,x.4,x.5,x.6),2,var)
@
There is some theory to suggest that the optimal acceptance rate for the MALA algorithm is around 0.574. We examine the variation in the ESS and the acceptance rate as $\delta$ varies from 0.1 to 0.5.
<<>>=
N<-100001
del.vec<-acc.vec<-seq(0.1,0.5,by=0.01)
xd<-matrix(0,nrow=N+1,ncol=length(del.vec))
x<-rep(0,N+1)
for(j in 1:length(del.vec)){
  del<-del.vec[j]
  x[1]<-2
  Z<-rnorm(N)
  irej<-0
  for(k in 1:N){
    x[k+1] <- x[k] + del*(al - exp(x[k]))/2 + sqrt(del)*Z[k]
    acc<-log.pi(x[k+1],al)-log.pi(x[k],al) +
         q.func(x[k+1],x[k],al,del) - q.func(x[k],x[k+1],al,del)
    if(log(runif(1)) > acc){
      x[k+1]<-x[k]   # Reject
      irej<-irej+1
    }
  }
  xd[,j]<-x
  acc.vec[j]<-(N-irej)/N
}
ESSd<-effectiveSize(xd)
@
In the plots below, we compare
\begin{itemize}
\item effective sample size vs $\delta$;
\item effective sample size vs Metropolis acceptance probability.
\end{itemize}
The effective sample sizes are plotted as proportions of the run length $N=100001$.
<<>>=
par(mar=c(4,4,1,0))
plot(del.vec,ESSd/N,ylim=range(0,1),type='l',xlab=expression(delta),ylab='ESS')
points(del.vec,ESSd/N,pch=20)
plot(acc.vec,ESSd/N,ylim=range(0,1),type='l',xlab='Acc. Rate',ylab='ESS')
points(acc.vec,ESSd/N,pch=20)
@
From the plots we deduce that the optimal $\delta$ in this problem is approximately \Sexpr{del.vec[which.max(ESSd)]}, which returns an acceptance rate of \Sexpr{acc.vec[which.max(ESSd)]} and an effective sample size of \Sexpr{as.numeric(max(ESSd))}.
<<>>=
par(mar=c(4,4,1,0))
tval<-substitute(paste('Optimal ',delta)==d,list(d = del.vec[which.max(ESSd)]))
plot(xd[1:10000,which.max(ESSd)],type='l',xlab='k',ylab=expression(X),
     main=tval,ylim=range(1,4))
hist(xd[1:10000,which.max(ESSd)],freq=FALSE,xlab=expression(X),breaks=seq(0,5,by=0.1),
     main=tval,ylim=range(0,1.5));box();lines(xvec,yvec,col='red')
@
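As a final check, we can compare the sample moments of the chain at the optimal $\delta$ with the exact moments of the target: if $Y \sim \text{Gamma}(\alpha,1)$, then $X = \log Y$ has $\mathrm{E}[X] = \psi(\alpha)$ and $\mathrm{Var}[X] = \psi'(\alpha)$, where $\psi$ is the digamma function. The following chunk is a minimal sketch of this comparison; the object names \texttt{x.opt}, \texttt{exact}, and \texttt{est} are ours.
<<>>=
# Moment check (illustrative): chain at the optimal delta vs exact values
x.opt <- xd[, which.max(ESSd)]
exact <- c(digamma(al), trigamma(al))  # E[X] = psi(alpha), Var[X] = psi'(alpha)
est   <- c(mean(x.opt), var(x.opt))
round(rbind(exact, est), 4)
@
Close agreement between the two rows would indicate that the tuned chain is sampling the target accurately.

\end{document}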