\documentclass[notitlepage]{article}
\usepackage{../../Math598}
\usepackage{listings}
\usepackage{numprint}
\usepackage{enumerate}
\usepackage{multirow}
\usepackage{bbm}
\usepackage{bbold}
\usepackage{amsfonts,amsmath,amssymb}
\setlength{\parindent}{0pt}
\lstloadlanguages{R}
\definecolor{keywordcolor}{rgb}{0,0.6,0.6}
\definecolor{delimcolor}{rgb}{0.461,0.039,0.102}
\definecolor{Rcommentcolor}{rgb}{0.101,0.043,0.432}
\lstdefinestyle{Rsettings}{
  basicstyle=\ttfamily,
  breaklines=true,
  showstringspaces=false,
  keywords={if, else, function, theFunction, tmp}, % add further keywords as needed
  otherkeywords={},
  commentstyle=\itshape\color{Rcommentcolor},
  keywordstyle=\color{keywordcolor},
  moredelim=[s][\color{delimcolor}]{"}{"},
}
\lstset{basicstyle=\ttfamily, numbers=none, literate={~} {$\sim$}{2}}
\begin{document}
<<>>=
library(knitr)
# global chunk options
opts_chunk$set(cache=TRUE, autodep=TRUE)
options(scipen=999)
options(repos=c(CRAN="https://cloud.r-project.org/"))
inline_hook <- function(x) {
  if (is.numeric(x)) {
    # print whole numbers without a decimal part; print other values to six decimal places
    res <- ifelse(x == round(x), sprintf("%d", as.integer(x)), sprintf("%.6f", x))
    paste(res, collapse = ", ")
  } else {
    x
  }
}
knit_hooks$set(inline = inline_hook)
@
\begin{center}
\textsclarge{MATH 559: Bayesian Theory and Methods}

\vspace{0.1 in}

\textsc{Importance sampling and variance reduction}
\end{center}

\textbf{Importance Sampling:} We may write
\[
I(g) \equiv \E_f[g(X)] = \int g(x) f(x) \: dx = \int \frac{g(x) f(x)}{f_0(x)} f_0(x) \: dx = \E_{f_0}\left[\frac{g(X) f(X)}{f_0(X)} \right]
\]
provided that the support of $f_0(x)$ includes the support of $f(x)$. This suggests the \textit{Importance Sampling} (IS) strategy, in which $X_1,\ldots,X_N$ are sampled independently from $f_0$, and we use the estimator
\[
\widehat I_N^{(f_0)}(g) = \frac{1}{N} \sum_{i=1}^N \frac{g(X_i) f(X_i)}{f_0(X_i)}.
\]
The choice $f_0(x) \equiv f(x)$ for all $x$ returns the original MC estimator, $\widehat I_N(g)$. We might use IS if $f$ is hard to sample from whereas $f_0$ is straightforward, or if the variance of $\widehat I_N^{(f_0)}(g)$ is smaller than that of $\widehat I_N(g)$. We have that
\[
\Var_{f_0} \left[\widehat I_N^{(f_0)}(g) \right] = \frac{1}{N} \Var_{f_0} \left[ \frac{g(X) f(X)}{f_0(X)} \right] = \frac{1}{N} \int \left(\frac{g(x) f(x)}{f_0(x)} - I(g) \right)^2 f_0(x) \: dx = \frac{1}{N} \int \left\{\frac{g(x) f(x)}{f_0(x)} \right\}^2 f_0(x) \: dx - \frac{\{I(g) \}^2}{N}
\]
which shows that the variance is finite if and only if
\[
\int \left\{\frac{g(x) f(x)}{f_0(x)} \right\}^2 f_0(x) \: dx < \infty.
\]
By Jensen's inequality
\[
\int \left\{\frac{g(x) f(x)}{f_0(x)} \right\}^2 f_0(x) \: dx \geq \left\{\int \frac{|g(x)| f(x)}{f_0(x)} f_0(x) \: dx \right\}^2 = \left\{ \int |g(x)| f(x) \: dx \right\}^2
\]
so the right-hand side is a lower bound on the left-hand side that does not depend on $f_0$. The bound is attained, and the variance therefore minimized, by setting
\[
f_0(x) = \frac{|g(x)| f(x)}{\int |g(t)| f(t) \: dt}.
\]
However, this choice is infeasible in practice, as the normalizing constant $\int |g(t)| f(t) \: dt$ is essentially the quantity we are trying to compute, so this $f_0$ cannot be evaluated explicitly. It does, however, tell us that we should choose $f_0(x)$ to be close (in `shape') to $|g(x)| f(x)$.
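
As a generic illustration of the estimator $\widehat I_N^{(f_0)}(g)$ (a minimal sketch, separate from the worked example below), the computation can be wrapped in a small helper function; the helper name \texttt{is.estimate} and the Normal target with Student-$t$ proposal used in the call are illustrative choices only.
<<>>=
# Minimal sketch of the IS estimator as a reusable function.
is.estimate<-function(N,g,rf0,df0,df){
  X<-rf0(N)            # sample from the proposal f0
  w<-df(X)/df0(X)      # importance weights f(X)/f0(X)
  mean(g(X)*w)         # IS estimate of E_f[g(X)]
}
set.seed(23)
# illustrative call: estimate E[X^2] = 1 for X ~ Normal(0,1) with a Student-t(5) proposal
is.estimate(10000,g=function(x){x^2},
            rf0=function(n){rt(n,df=5)},df0=function(x){dt(x,df=5)},df=dnorm)
@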
\bigskip

Note also that the statistic
\[
\frac{1}{N} \sum_{i=1}^N \frac{f(X_i)}{f_0(X_i)} \Cas \int \frac{f(x)}{f_0(x)} f_0(x) \: dx = 1
\]
so we may instead use the estimator
\[
\dfrac{\sum\limits_{i=1}^N \dfrac{g(X_i) f(X_i)}{f_0(X_i)}}{\sum\limits_{j=1}^N \dfrac{f(X_j)}{f_0(X_j)}} = \sum_{i=1}^N w(X_i) g(X_i) \qquad \qquad w(X_i) = \dfrac{\dfrac{f(X_i)}{f_0(X_i)}}{\sum\limits_{j=1}^N \dfrac{f(X_j)}{f_0(X_j)}} \qquad i=1,\ldots,N.
\]
This approach can be useful if the densities $f(x)$ and $f_0(x)$ are only known up to proportionality; a short sketch illustrating this follows the example below.

\bigskip

\textbf{EXAMPLE}: Suppose $X \sim Gamma(2,3)$, and we aim to compute $\E[X^{4}]$. For $r > 0$, with $X \sim Gamma(\alpha,\beta)$,
\[
\E[X^r] = \frac{1}{\beta^r} \frac{\Gamma(\alpha+r)}{\Gamma(\alpha)}
\]
and therefore $\E[X^{4}] = (5 \times 4 \times 3 \times 2)/3^4 = 1.481481$.
<<>>=
al<-2;be<-3;r<-4
true.val<-(gamma(al+r)/gamma(al))/be^r
xv<-seq(0,5,by=0.01)
yv<-dgamma(xv,al,be)
yv4<-xv^4
par(mar=c(3,3,1,0))
plot(xv,yv,type='l',xlab='x',ylab=' ')
lines(xv,yv*yv4,col='red')
legend(3,1,c('f(x)','g(x)f(x)'),lty=1,col=c('black','red'))
@
We test the importance sampling estimator with the following choices of $f_0$:
\begin{itemize}
\item $f_0(x) \equiv Gamma(2,3)$ (that is, ordinary MC estimation);
\item $f_0(x) \equiv Gamma(5,2)$;
\item $f_0(x) \equiv Gamma(6,3)$;
\item $f_0(x) \equiv Normal(2,2^2)$;
\item $f_0(x) \equiv Student(5)$ centered at $x=2$.
\end{itemize}
Replicate runs show the variability in each case.
<<>>=
set.seed(64)
N<-10000
nreps<-1000
IS.ests<-matrix(0,nrow=nreps,ncol=5)
for(irep in 1:nreps){
  X1<-rgamma(N,2,3)
  IS.ests[irep,1]<-mean(X1^4)
  X2<-rgamma(N,5,2)
  IS.ests[irep,2]<-mean(X2^4*dgamma(X2,2,3)/dgamma(X2,5,2))
  X3<-rgamma(N,6,3)
  IS.ests[irep,3]<-mean(X3^4*dgamma(X3,2,3)/dgamma(X3,6,3))
  X4<-rnorm(N,2,2)
  IS.ests[irep,4]<-mean(X4^4*dgamma(X4,2,3)/dnorm(X4,2,2))
  X5<-rt(N,df=5)+2
  IS.ests[irep,5]<-mean(X5^4*dgamma(X5,2,3)/dt(X5-2,df=5))
}
par(mar=c(3,3,2,0))
boxplot(IS.ests);title('Boxplot of the five estimators over 1000 replicates')
abline(h=true.val,col='red')
#True value
true.val
apply(IS.ests,2,mean)    #Mean of estimator
N*apply(IS.ests,2,var)   #Variance of estimator
@
Here it is evident that the IS estimators perform much better (in terms of estimator variance) than the ordinary MC estimator. Note that for the case $f_0(x) \equiv Gamma(6,3)$ we have exactly matched the integrand:
\[
f_0(x) \propto x^{6-1} \exp\{-3 x\} = x^4 x^{2-1} \exp\{-3 x\} \propto g(x) f(x).
\]
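
Since the weights involve only the ratio $f(x)/f_0(x)$, any unknown normalizing constants cancel, so the self-normalized estimator above can be computed from unnormalized densities. The chunk below is a minimal sketch of this idea (not part of the worked example above): it re-estimates $\E[X^4]$ for the $Gamma(2,3)$ target using unnormalized density expressions, with a $Gamma(3,3)$ proposal chosen purely for illustration.
<<>>=
# Sketch: self-normalized IS using densities known only up to proportionality.
# f.un is the Gamma(2,3) target density without its normalizing constant;
# f0.un is an illustrative Gamma(3,3) proposal, also unnormalized.
set.seed(64)
N<-10000
f.un<-function(x){x*exp(-3*x)}
f0.un<-function(x){x^2*exp(-3*x)}
X<-rgamma(N,3,3)            # we must still be able to sample from f0
w<-f.un(X)/f0.un(X)         # unnormalized weights: the constants cancel in the ratio
w<-w/sum(w)                 # self-normalized weights
sum(w*X^4)                  # estimate of E[X^4] = 1.481481
@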
\medskip

\textbf{Rejection Sampling:} Rejection sampling is a method for sampling directly from $f(x)$ using draws from an alternative distribution $f_0$, which can be used under the condition
\[
\frac{f(x)}{f_0(x)} < M < \infty
\]
for all $x$. To apply rejection sampling
\begin{enumerate}[(i)]
\item generate $X \sim f_0(x)$ and $U \sim Uniform(0,1)$ independently;
\item \textbf{accept} $X$ as a sample from $f(x)$ if
\[
U \leq \frac{f(X)}{M f_0(X)}
\]
and \textbf{reject} $X$ and return to (i) if
\[
U > \frac{f(X)}{M f_0(X)}.
\]
\end{enumerate}
We have that
\begin{eqnarray*}
\P[X\text{ is accepted}] = \P\left[U \leq \frac{f(X)}{M f_0(X)}\right] = \int_{-\infty}^\infty \left\{ \int_0^{f(x)/(M f_0(x))} \: du \right\} f_0(x) \: dx &=& \int_{-\infty}^\infty \frac{f(x)}{M f_0(x)} f_0(x) \: dx\\[6pt]
& = & \frac{1}{M} \int_{-\infty}^\infty f(x) \: dx = \frac{1}{M}
\end{eqnarray*}
and then that for $x \in \R$,
\begin{eqnarray*}
\P[X \leq x \,|\, X\text{ is accepted}] = \frac{\P[X \leq x, X\text{ is accepted}]}{\P[X\text{ is accepted}]} & = & M \int_{-\infty}^x \left\{ \int_0^{f(t)/(M f_0(t))} \: du \right\} f_0(t) \: dt\\[6pt]
& = & \int_{-\infty}^x \frac{f(t)}{f_0(t)} f_0(t) \: dt = \int_{-\infty}^x f(t) \: dt
\end{eqnarray*}
and therefore the accepted points have distribution $f(x)$.

\bigskip

\textbf{EXAMPLE:} To illustrate the use of rejection sampling, consider the following model:
\[
f(x) = \frac{1}{4} \frac{1}{\sigma_1} \phi \left(\frac{x-\mu_1}{\sigma_1}\right) + \frac{3}{4} \frac{1}{\sigma_2} \phi \left(\frac{x-\mu_2}{\sigma_2}\right)
\]
that is, a mixture of two Normal densities, where $\phi(\cdot)$ is the standard Normal pdf. In the example below
\[
\mu_1 = -2 \qquad \sigma_1 = 1 \qquad \mu_2 = 1 \qquad \sigma_2 = 1.
\]
For $f_0(x)$ we propose to use the $Normal(1,2.5^2)$ distribution.
<<>>=
xv<-seq(-7.5,7.5,by=0.001)
fx<-0.25*dnorm(xv,-2,1)+0.75*dnorm(xv,1,1)
f0x<-dnorm(xv,1,2.5)
f.ratio<-function(xs){
  fxv<-0.25*dnorm(xs,-2,1)+0.75*dnorm(xs,1,1)
  f0xv<-dnorm(xs,1,2.5)
  return(fxv/f0xv)
}
par(mar=c(3,3,2,0))
plot(xv,fx,type="l",lwd=2)
lines(xv,f0x,col="red",lwd=2,lty=2)
legend(-7,0.3,c(expression(f(x)),expression(f[0](x))),col=c('black','red'),lty=c(1,2),lwd=c(2,2))
mval<-optim(c(0),fn=f.ratio,control=list(fnscale=-1),method="BFGS")
M<-max(fx/f0x)
plot(xv,fx/f0x,type="l",xlab="x",ylab=expression(f(x)/f[0](x)),lwd=2)
abline(v=mval$par,col="red",lty=2,lwd=2)
legend(-7,1.6,c(expression(f(x)/f[0](x)),'M'),col=c('black','red'),lty=c(1,2),lwd=c(2,2))
plot(xv,fx,type="l",lwd=2)
lines(xv,mval$val*f0x,col="red",lwd=2,lty=2)
legend(-7,0.3,c(expression(f(x)),expression(M*f[0](x))),col=c('black','red'),lty=c(1,2),lwd=c(2,2))
M<-mval$val
N<-100000
U<-runif(N)
X<-rnorm(N,1,2.5)
pX<-(0.25*dnorm(X,-2,1)+0.75*dnorm(X,1,1))
p0X<-dnorm(X,1,2.5)
ival<-U<(pX/(M*p0X))
X.acc<-X[ival]
acc.rate<-sum(ival)/N
hist(X.acc,br=seq(-6,6,by=0.25),main="Accepted Points",col="black",border="white",xlab="x")
Xv<-seq(-6,6,by=0.01)
Yv<-(0.25*dnorm(Xv,-2,1)+0.75*dnorm(Xv,1,1))
lines(Xv,Yv*length(X.acc)*0.25,col="red",lwd=2);box()
plot(xv,fx,type="l",lwd=2,main="Selected simulations",ylab="Density",xlab="x")
lines(xv,mval$val*f0x,col="red",lty=2)
for(id in c(1000,7200,6000,5000,10000)){
  xh<-X[id]
  yh<-mval$val*dnorm(xh,1,2.5)              # height of the envelope M*f0 at xh
  lines(c(xh,xh),c(0,yh),col="red")
  u<-runif(1)
  # plot the point at height u*yh: pch 18 (filled diamond) if accepted, pch 4 (cross) if rejected
  acc<-4*(u>f.ratio(xh)/mval$val)+18*(u<=f.ratio(xh)/mval$val)
  points(xh,u*yh,pch=acc,cex=1.5)
}
@
The histogram of the accepted points matches the target density $f(x)$, and the empirical acceptance rate is close to the theoretical value $1/M$.

\medskip

\textbf{Antithetic Variates:} A further variance reduction technique uses negatively correlated pairs of samples. Suppose $X_1$ and $X_2$ each have density $f$, but are not independent. Then
\[
\Var\left[\frac{g(X_1)+g(X_2)}{2}\right] = \frac{1}{2}\Var_f[g(X)] + \frac{1}{2}\text{Cov}[g(X_1),g(X_2)]
\]
which is smaller than the variance obtained from two independent samples whenever $\text{Cov}[g(X_1),g(X_2)] < 0$. If $X_1 \sim Uniform(0,1)$, then $X_2 = 1 - X_1 \sim Uniform(0,1)$ also, and $g(X_1)$ and $g(X_2)$ are negatively correlated whenever $g$ is monotone.

\medskip

\textbf{EXAMPLE:} Suppose $X \sim Uniform(0,1)$ and $g(x) = x + x^2$, so that
\[
I(g) = \int_0^1 (x + x^2) \: dx = \frac{1}{2} + \frac{1}{3} = \frac{5}{6}.
\]
We compare the ordinary MC estimator based on $2N$ independent samples with the antithetic estimator based on $N$ pairs $(X_i, 1-X_i)$.
<<>>=
set.seed(64)
N<-5000
nreps<-1000
AV.ests<-matrix(0,nrow=nreps,ncol=2)
for(irep in 1:nreps){
  X<-runif(2*N)
  AV.ests[irep,1]<-mean(X+X^2)
  X1<-runif(N)
  X2<-1-X1
  AV.ests[irep,2]<-mean(X1+X1^2 + X2+X2^2)/2
}
(true.val<-5/6)
apply(AV.ests,2,mean)
cor(X1+X1^2,X2+X2^2)
N*apply(AV.ests,2,var)
var(AV.ests[,1])/var(AV.ests[,2])
@
In this example the antithetic estimator achieves an approximately 30-fold decrease in variance relative to the ordinary MC estimator.
\end{document}