\documentclass[notitlepage]{article}
\usepackage{../../Math598}
\usepackage{listings}
\usepackage{numprint}
\usepackage{enumerate}
\usepackage{multirow}
\usepackage{bbm}
\usepackage{bbold}
\usepackage{amsfonts,amsmath,amssymb}
\setlength{\parindent}{0pt}
\lstloadlanguages{R}
\definecolor{keywordcolor}{rgb}{0,0.6,0.6}
\definecolor{delimcolor}{rgb}{0.461,0.039,0.102}
\definecolor{Rcommentcolor}{rgb}{0.101,0.043,0.432}
\lstdefinestyle{Rsettings}{
  basicstyle=\ttfamily,
  breaklines=true,
  showstringspaces=false,
  keywords={if, else, function, theFunction, tmp},
  otherkeywords={},
  commentstyle=\itshape\color{Rcommentcolor},
  keywordstyle=\color{keywordcolor},
  moredelim=[s][\color{delimcolor}]{"}{"},
}
\lstset{basicstyle=\ttfamily, numbers=none, literate={~}{$\sim$}{2}}

\begin{document}

<<>>=
library(knitr)
# global chunk options
opts_chunk$set(cache=TRUE, autodep=TRUE)
options(scipen=999)
options(repos=c(CRAN="https://cloud.r-project.org/"))
inline_hook <- function(x) {
  if (is.numeric(x)) {
    # format all inline numeric output to six decimal places
    res <- sprintf("%.6f", x)
    paste(res, collapse = ", ")
  }
}
knit_hooks$set(inline = inline_hook)
@

\begin{center}
\textsclarge{MATH 598: TOPICS IN STATISTICS}

\vspace{0.1 in}

\textsc{Basic Monte Carlo method}
\end{center}

To compute the integral
\[
I(g) = \int g(y) f(y) \: dy = \E_f[g(Y)]
\]
where $f(y)$ is a probability density, we use the Monte Carlo estimator
\[
\widehat I_N(g) = \frac{1}{N} \sum_{i=1}^N g(Y_i)
\]
where $Y_1,\ldots,Y_N$ are sampled independently from $f(\cdot)$. Under integrability (finite expectation) conditions, we have that
\[
\widehat I_N(g) \Cas I(g)
\]
by the law of large numbers.

\bigskip

\textbf{EXAMPLE}: Suppose $Y \sim Gamma(2,3)$. First we compute $\E[Y^{4}]$. For $r > 0$, with $Y \sim Gamma(\alpha,\beta)$,
\[
\E[Y^r] = \int_0^\infty y^r \frac{\beta^\alpha}{\Gamma(\alpha)} y^{\alpha -1} e^{-\beta y} \: dy = \frac{\beta^\alpha}{\Gamma(\alpha)} \frac{\Gamma(\alpha+r)}{\beta^{\alpha+r}} = \frac{1}{\beta^r} \frac{\Gamma(\alpha+r)}{\Gamma(\alpha)}
\]
so $\E[Y^{4}] = (5 \times 4 \times 3 \times 2)/3^4 = 1.481481$.

<<>>=
set.seed(64)
al<-2;be<-3;r<-4
true.val<-(gamma(al+r)/gamma(al))/be^r
N<-10000
Y<-rgamma(N,al,be)
est<-cumsum(Y^4)/c(1:N)
par(mar=c(4,4,1,0))
plot(1:N,est,type='l',xlab='N',ylab='Estimate',ylim=range(0,3))
abline(h=true.val,col='red',lty=2)
@

Some replicate runs illustrate the variability of the estimator:

<<>>=
set.seed(64)
par(mar=c(4,4,1,0))
plot(1:N,est,type='n',xlab='N',ylab='Estimate',ylim=range(0,3))
abline(h=true.val,col='red',lty=2)
for(j in 1:10){
  Y<-rgamma(N,al,be)
  tot<-cumsum(Y^4)
  est<-tot/c(1:N)
  lines(1:N,est)
  print(tot[N]/N)
}
@

The variance of the estimator is
\[
\frac{1}{N} \Var[Y^4] = \frac{1}{N} \left(\E[Y^8] - \{\E[Y^4]\}^2 \right) = \frac{1}{N} \left(\frac{1}{3^8} \frac{\Gamma(10)}{\Gamma(2)} - \frac{1}{3^8} \left\{\frac{\Gamma(6)}{\Gamma(2)} \right\}^2 \right) = 0.005311385
\]
for $N = 10000$.

<<>>=
(true.var<-(gamma(al+2*r)/gamma(al)-(gamma(al+r)/gamma(al))^2)/be^(2*r)/N) # true value
Ireps<-replicate(10000,mean((rgamma(N,al,be))^r));print(var(Ireps))       # sample-based estimate
@

To compute the definite integral
\[
\int_{a}^{b} f(y) \: dy = \E_f[\Ind_{(a,b)}(Y)]
\]
we can use the \texttt{pgamma} function to compute the cdf, and hence the true value; the Monte Carlo estimator averages the indicator function $g(y) = \Ind_{(a,b)}(y)$ over the sampled $Y_i$.
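The indicator $\Ind_{(a,b)}(Y)$ is a Bernoulli random variable with success probability $p$ equal to the integral, so the Monte Carlo estimator is a sample proportion with variance $p(1-p)/N$. A minimal sketch (the choice of 1000 replications is illustrative, not from the notes) checks this against simulation:

<<>>=
# Sketch: the indicator estimator is a Bernoulli sample mean, so its
# variance is p(1-p)/N; compare the theoretical value with simulation
set.seed(64)
al<-2;be<-3;N<-10000
a<-0.2;b<-0.5
p<-pgamma(b,al,be)-pgamma(a,al,be)  # true probability from the cdf
p*(1-p)/N                           # theoretical variance of the estimator
var(replicate(1000,{Y<-rgamma(N,al,be);mean(Y>a & Y<b)}))
@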
First, for $a=0.2$ and $b=0.5$:

<<>>=
set.seed(64)
a<-0.2;b<-0.5
Y<-rgamma(N,al,be)
par(mar=c(4,4,1,0))
hist(Y[Y<3],breaks=seq(0,3,by=0.1),prob=TRUE,main='Gamma(2,3) pdf',xlab='y');box()
xv<-seq(0,3,by=0.01)
yv<-dgamma(xv,al,be)
lines(xv,yv,col='red')
(true.val<-pgamma(b,al,be)-pgamma(a,al,be))
par(mar=c(4,4,1,0))
plot(1:N,est,type='n',xlab='N',ylab='Estimate',ylim=range(0,1))
abline(h=true.val,col='red',lty=2)
for(j in 1:10){
  Y<-rgamma(N,al,be)
  tot<-cumsum(Y>a & Y<b)
  est<-tot/c(1:N)
  lines(1:N,est)
  print(tot[N]/N)
}
@

If $a=2.5$ and $b=3$, the estimator is much more variable relative to the size of the estimated quantity:

<<>>=
set.seed(64)
Ind<-function(y,a,b){mean(y>a & y<b)}
a<-0.2;b<-0.5
true.val<-pgamma(b,al,be)-pgamma(a,al,be)
Ireps1<-replicate(10000,Ind(rgamma(N,al,be),a,b))
var(Ireps1)/true.val
a<-2.5;b<-3
true.val<-pgamma(b,al,be)-pgamma(a,al,be)
Ireps2<-replicate(10000,Ind(rgamma(N,al,be),a,b))
var(Ireps2)/true.val
@

\pagebreak

\textbf{EXAMPLE}: For random variables $(X,Y)$ that are independently uniform on the unit square, the probability
\[
\Pr((X-0.5)^2+(Y-0.5)^2 < 0.5^2) = \frac{\pi}{4} \approx \Sexpr{round(pi/4,6)}
\]
and we can compute this probability as
\[
\iint_\mathcal{D} \: dx \, dy = \int_{0}^1 \int_0^1 \Ind_{\mathcal{D}}(x,y) \: dx \, dy
\]
where $\mathcal{D}$ is the disk of radius $1/2$ centered at $(1/2,1/2)$. We may estimate this probability using the Monte Carlo estimator
\[
\frac{1}{N} \sum_{i=1}^N \Ind_{\mathcal{D}}(X_i,Y_i)
\]
where $(X_i,Y_i)$, $i=1,\ldots,N$, are drawn with $X_i$ and $Y_i$ independently $Uniform(0,1)$.

<<>>=
set.seed(64)
N<-1000;X<-runif(N,0,1);Y<-runif(N,0,1)
par(mar=c(4,4,0,0),pty='s')
plot(X,Y,ylim=range(0,1),xlim=range(0,1),type='n')
lines(c(0,1,1,0,0),c(0,0,1,1,0))
tv<-seq(0,2*pi,length=1001);xv<-0.5*cos(tv)+0.5;yv<-0.5*sin(tv)+0.5
lines(xv,yv,lty=2)
inside<-(X-0.5)^2 + (Y-0.5)^2 < (0.5)^2
points(X[inside],Y[inside],col='red',pch=19,cex=0.5)
points(X[!inside],Y[!inside],col='black',pch=19,cex=0.5)
sum(inside)/N
@

\textbf{EXAMPLE}: Consider the integral
\[
\int_0^1 \frac{1}{x} \sin \left(\frac{2 \pi}{x} \right) \: dx.
\]
Using the change of variable $t=2 \pi /x$, we see that the integral is equal to
\[
\int_{2 \pi}^\infty \frac{1}{t} \sin (t) \: dt.
\]
The integral
\[
\mathrm{Si}(a) = \int_0^a \frac{1}{t} \sin (t) \: dt
\]
is known as the Sine Integral, a special function that can be computed numerically:
\[
\int_{2 \pi}^\infty \frac{1}{t} \sin (t) \: dt = \mathrm{Si}(\infty) - \mathrm{Si}(2 \pi) = \frac{\pi}{2} - 1.418151 = \Sexpr{round(pi/2-1.418151,6)}.
\]
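As a quick check on the constant $1.418151$, $\mathrm{Si}(2\pi)$ can be evaluated by deterministic quadrature with base R's \texttt{integrate} function; this is a verification sketch using standard numerical integration, not a Monte Carlo calculation.

<<>>=
# Check Si(2*pi) by adaptive quadrature; sin(t)/t is bounded on (0,2*pi)
# with limit 1 at t=0, so guard the removable singularity explicitly
sinc<-function(t){ifelse(t==0,1,sin(t)/t)}
(Si.2pi<-integrate(sinc,lower=0,upper=2*pi)$value)
pi/2-Si.2pi  # the value of the target integral
@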
We try to compute this integral by Monte Carlo. We write
\[
\int_0^1 \frac{1}{x} \sin \left(\frac{2 \pi}{x} \right) \: dx = \E_Y \left[\frac{1}{Y} \sin \left(\frac{2 \pi}{Y} \right) \right]
\]
where $Y \sim Uniform(0,1)$, which suggests the Monte Carlo estimator
\[
\frac{1}{N} \sum_{i=1}^N \frac{1}{Y_i} \sin \left(\frac{2 \pi}{Y_i} \right)
\]
where $Y_1,\ldots,Y_N \sim Uniform(0,1)$ independently. The simulation below shows ten replicate runs.

<<>>=
set.seed(64)
par(mar=c(4,4,1,2))
N<-10000
plot(1:N,1:N,type='n',xlab='N',ylab='Estimate',ylim=range(-20,20))
true.val<-pi/2-1.418151
abline(h=true.val,col='red',lty=2)
for(j in 1:10){
  Y<-runif(N)
  gY<-sin(2*pi/Y)/Y
  tot<-cumsum(gY)
  est<-tot/c(1:N)
  lines(1:N,est)
  print(tot[N]/N)
}
@

The Monte Carlo estimator does not seem to be performing well. The problem persists even at a sample size of $N=100000$:

<<>>=
set.seed(64)
par(mar=c(4,4,1,2))
N<-100000
plot(1:N,1:N,type='n',xlab='N',ylab='Estimate',ylim=range(-20,20))
true.val<-pi/2-1.418151
abline(h=true.val,col='red',lty=2)
for(j in 1:10){
  Y<-runif(N)
  gY<-sin(2*pi/Y)/Y
  tot<-cumsum(gY)
  est<-tot/c(1:N)
  lines(1:N,est)
  print(tot[N]/N)
}
@

The problem remains, and is caused by the fact that the original (Riemann) integral is not \textit{absolutely convergent}; that is,
\[
\int_0^1 \biggl| \frac{1}{x} \sin \left(\frac{2 \pi}{x} \right) \biggr| \: dx = \int_0^1 \frac{1}{x} \biggl| \sin \left(\frac{2 \pi}{x} \right) \biggr| \: dx
\]
is \textbf{not finite}. Absolute convergence is a necessary condition for Monte Carlo estimation (which essentially relies on Lebesgue integration) to yield an unbiased and consistent estimator: if
\[
\int |g(x)| f(x) \: dx
\]
is finite, then we can use Monte Carlo to estimate the integral. This coincides with the result that the expectation of a random variable $Y$ exists and is finite if and only if the expectation of $|Y|$ is finite.

\pagebreak

\textbf{EXAMPLE:} The Central Limit Theorem (CLT) states that the Monte Carlo estimator of $I(g) = \E[g(Y)] < \infty$ is asymptotically Normally distributed,
\[
\sqrt{N} (\widehat I_N(g) - I(g)) \CiD Normal(0,V(g))
\]
provided $V(g) = \Var[g(Y)] < \infty$. If the variance is not finite, then the CLT approach does not apply. For example, consider estimating the mean of a standard $Student(\nu)$ distribution; the variance of this distribution is $\nu/(\nu -2)$ provided $\nu > 2$, but is infinite otherwise. When $\nu = 3$, the asymptotic variance of $\sqrt{N} \, \overline{Y}_N$ is $3/(3-2) = 3$.

<<>>=
set.seed(64)
par(mar=c(3,3,2,1),mfrow=c(1,2))
N<-100000
plot(1:N,1:N,type='n',xlab='N',ylab='Estimate',ylim=range(-1,1),main=expression(bar(Y)[n]))
abline(h=0,col='red',lty=2)
for(j in 1:10){
  Y<-rt(N,3)
  tot<-cumsum(Y)
  est<-tot/c(1:N)
  lines(1:N,est)
}
set.seed(64)
plot(1:N,1:N,type='n',xlab='N',ylab='Estimate',ylim=range(-10,10),main=expression(sqrt(N)*bar(Y)[n]))
abline(h=0,col='red',lty=2)
for(j in 1:10){
  Y<-rt(N,3)
  tot<-cumsum(Y)
  est<-tot/sqrt(c(1:N))
  lines(1:N,est)
}
var(replicate(1000,sqrt(N)*mean(rt(N,3))))
@

\pagebreak

However, when $\nu = 2$, the variance of $\sqrt{N} \, \overline{Y}_N$ is not finite, and the standard CLT does not apply.

<<>>=
set.seed(654)
par(mar=c(3,3,2,1),mfrow=c(1,2))
plot(1:N,1:N,type='n',xlab='N',ylab='Estimate',ylim=range(-1,1),main=expression(bar(Y)[n]))
abline(h=0,col='red',lty=2)
for(j in 1:10){
  Y<-rt(N,2)
  tot<-cumsum(Y)
  est<-tot/c(1:N)
  lines(1:N,est)
}
set.seed(654)
plot(1:N,1:N,type='n',xlab='N',ylab='Estimate',ylim=range(-10,10),main=expression(sqrt(N)*bar(Y)[n]))
abline(h=0,col='red',lty=2)
for(j in 1:10){
  Y<-rt(N,2)
  tot<-cumsum(Y)
  est<-tot/sqrt(c(1:N))
  lines(1:N,est)
}
# with infinite variance, the sample variance of sqrt(N)*Ybar does not
# stabilize: two independent batches of 1000 replications disagree wildly
var(replicate(1000,sqrt(N)*mean(rt(N,2))))
var(replicate(1000,sqrt(N)*mean(rt(N,2))))
@
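When $V(g)$ is finite, the CLT also provides Normal-theory standard errors and confidence intervals for a Monte Carlo estimate. The sketch below (the 95\% level and the 1000 replications are illustrative choices, not taken from the notes above) checks the empirical coverage of the interval $\overline{Y}_N \pm 1.96 \, s_N/\sqrt{N}$, where $s_N$ is the sample standard deviation, for the true mean $0$ under $Student(3)$ sampling; the result should be close to the nominal $0.95$.

<<>>=
# Sketch: with nu=3 the variance (=3) is finite, so the CLT-based 95%
# interval ybar +/- 1.96*s/sqrt(N) should cover the true mean 0 in
# roughly 95% of replications
set.seed(64)
N<-10000
covered<-replicate(1000,{Y<-rt(N,3);abs(mean(Y))<1.96*sd(Y)/sqrt(N)})
mean(covered)
@

\end{document}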