\documentclass[notitlepage]{article}
\usepackage{../../Math556}
\usepackage{listings}
\usepackage{numprint}
\usepackage{enumerate}
\usepackage{multirow}
\usepackage{bbm}
\usepackage{bbold}
\usepackage{amsfonts,amsmath,amssymb}
\setlength{\parindent}{0pt}
\def\E{\Expect}
\lstloadlanguages{R}
\definecolor{keywordcolor}{rgb}{0,0.6,0.6}
\definecolor{delimcolor}{rgb}{0.461,0.039,0.102}
\definecolor{Rcommentcolor}{rgb}{0.101,0.043,0.432}
\lstdefinestyle{Rsettings}{
  basicstyle=\ttfamily,
  breaklines=true,
  showstringspaces=false,
  keywords={if, else, function, theFunction, tmp}, % Write as many keywords
  otherkeywords={},
  commentstyle=\itshape\color{Rcommentcolor},
  keywordstyle=\color{keywordcolor},
  moredelim=[s][\color{delimcolor}]{"}{"},
}
\lstset{basicstyle=\ttfamily, numbers=none, literate={~} {$\sim$}{2}}
\def\htitle{\textsclarge{Random Number Generation and Monte Carlo}}
\begin{document}

<<>>=
library(knitr)
# global chunk options
opts_chunk$set(cache=TRUE, autodep=TRUE)
options(scipen=999)
options(repos=c(CRAN="https://cloud.r-project.org/"))
inline_hook <- function(x) {
  if (is.numeric(x)) {
    # ifelse does a vectorized comparison;
    # both branches currently format values to six decimal places
    res <- ifelse(x == round(x),
                  sprintf("%.6f", x),
                  sprintf("%.6f", x))
    paste(res, collapse = ", ")
  }
}
knit_hooks$set(inline = inline_hook)
@

\begin{center}\coursetitle\end{center}
\begin{center}\htitle\end{center}

Many functions exist in \texttt{R} to produce random samples from probability distributions.
<<>>=
n<-10
args(rnorm)
rnorm(n,5,sqrt(4))            #sample from Normal(5,4)
args(rgamma)
rgamma(n,2,0.5)               #sample from Gamma(2,0.5)
args(rpois)
rpois(n,3.5)                  #sample from Poisson(3.5)
args(sample)
sample(c(1:100),n,replace=T)  #sample discrete uniform on {1,2,...,100} with replacement
sample(c(1:100),n,replace=F)  #sample discrete uniform on {1,2,...,100} without replacement
sample(c(1:5),n,replace=T,prob=c(0.2,0.1,0.4,0.1,0.2))  #sample discrete distn on {1,2,...,5}
@
The \texttt{set.seed()} function can be used to make the random draws reproducible.
<<>>=
set.seed(8910)
rnorm(5);rnorm(5)
set.seed(8910)
rnorm(5)
@
Random samples can be used for several purposes in basic distribution theory.
\begin{itemize}
\item \textbf{Visualization:} $X \sim Weibull(\alpha,\beta)$
\[
f_X(x) = \alpha \beta x^{\alpha -1}\exp\left\{-\beta x^{\alpha } \right\} \qquad x > 0
\]
and zero otherwise, for $\alpha > 0$ and $\beta > 0$. Parameter $\alpha$ is the \textit{shape} parameter, and $\beta$ controls the dispersion of the distribution. An alternative parameterization utilizes the \textit{scale} parameter $\sigma = 1/\beta^{1/\alpha}$, yielding pdf
\[
f_X(x) = \frac{\alpha}{\sigma} \left(\frac{x}{\sigma}\right)^{\alpha -1} \exp \left\{ -\left(\frac{x}{\sigma}\right)^{\alpha} \right\} \qquad x > 0.
\]
This is the parameterization that \texttt{R} uses. The expectation of this distribution is
\[
\Expect_X[X] = \dfrac{\Gamma \left( 1+\dfrac{1}{\alpha}\right) }{\beta ^{1/\alpha }} = \sigma \Gamma \left( 1+\dfrac{1}{\alpha}\right).
\]
<<>>=
n<-10000
al<-2;be<-4
sig<-1/be^{1/al}
X<-rweibull(n,al,sig)
par(mar=c(4,3,1,0))
hist(X,breaks=seq(0,2,by=0.05),main='',freq=FALSE);box()
x<-seq(0,2,by=0.001)
fx<-dweibull(x,al,sig)
lines(x,fx,col='red')
sig*gamma(1+1/al)   #Expectation
mean(X)             #Sample mean
@
\item \textbf{Transformations:} If $U \sim Uniform(0,1)$, then
\[
X = \left(-\frac{1}{\beta} \log (1-U) \right)^{1/\alpha}
\]
ensures that $X \sim Weibull(\alpha,\beta)$.
<<>>=
set.seed(8910)
n<-10000
U<-runif(n)
X<-(-log(1-U)/be)^(1/al)
par(mar=c(4,3,1,0))
hist(X,breaks=seq(0,2,by=0.05),main='',freq=FALSE);box()
lines(x,fx,col='red')
sig*gamma(1+1/al)   #Expectation
mean(X)             #Sample mean
@
Multivariate transformations can also be studied. Suppose that $U_1$ and $U_2$ are independent $Uniform(0,1)$ variables, and consider
\[
X_1 = \sqrt{-2 \log U_1} \cos (2 \pi U_2) \qquad X_2 = \sqrt{-2 \log U_1} \sin (2 \pi U_2).
\]
We may use the multivariate transformation theorem to demonstrate that $X_1$ and $X_2$ are independent $Normal(0,1)$ variables.
<<>>=
set.seed(8910)
n<-10000
U1<-runif(n);U2<-runif(n)
X1<-sqrt(-2*log(U1))*cos(2*pi*U2)
X2<-sqrt(-2*log(U1))*sin(2*pi*U2)
par(mar=c(4,4,1,2),mfrow=c(1,2))
plot(U1,U2,xlim=range(0,1),ylim=range(0,1),pch=19,cex=0.3)
plot(X1,X2,xlim=range(-4.5,4.5),ylim=range(-4.5,4.5),pch=19,cex=0.3)
x<-seq(-4.5,4.5,by=0.001)
fx<-dnorm(x)
hist(X1,breaks=seq(-4.5,4.5,by=0.25),main='',freq=FALSE);box()
lines(x,fx,col='red')
hist(X2,breaks=seq(-4.5,4.5,by=0.25),main='',freq=FALSE);box()
lines(x,fx,col='red')
@
It is also straightforward to study non 1-1 transformations: suppose $X_1,X_2 \sim Exponential(1)$ are independent. We have seen that
\[
Y = X_1 - X_2
\]
has a Double Exponential (or Laplace) distribution, with
\[
f_Y(y) = \frac{1}{2} \exp\{-|y| \} \qquad y \in \R.
\]
<<>>=
set.seed(8910)
n<-10000
X1<-rexp(n)
X2<-rexp(n)
Y<-X1-X2
par(mar=c(4,3,1,0))
y<-seq(-10,10,by=0.001)
fy<-0.5*exp(-abs(y))
hist(Y,breaks=seq(-10,10,by=0.25),main='',freq=FALSE,ylim=range(0,0.5));box()
lines(y,fy,col='red')
@
This can be useful when the analytical calculation is less straightforward: suppose $X_1,X_2 \sim Gamma(\alpha,1)$ are independent. We can study the distribution of
\[
Y = X_1 - X_2
\]
easily using simulation. By direct but more involved calculation, we can demonstrate that, if $\alpha$ is a positive integer, say $\alpha = r+1$ for $r = 0,1,2,\ldots$, then
\[
f_Y(y) = \sum_{j = 0}^r \binom{r}{j} \frac{\Gamma(r+j+1)}{\{\Gamma(r+1)\}^2} \frac{1}{2^{r+j+1}} |y|^{r-j} \exp\{-|y|\} \qquad y \in \R.
\]
To see this, consider the case $y > 0$ and note that
\[
P_{Y}[Y > y] = P_{X_1,X_2}[X_1 - X_2 > y] = \int_0^\infty \int_{x_2 + y}^\infty f_{X_1}(x_1) f_{X_2}(x_2) \ dx_1 \ d x_2.
\]
From this we compute the density by noting that $F_Y(y) = 1 - P_{Y}[Y > y]$, and differentiating with respect to $y$ under the first integral; the differentiation is facilitated by the fundamental theorem of calculus. This yields, for $y > 0$,
\[
f_Y(y) = \frac{1}{\{\Gamma(\alpha)\}^2} e^{-y} \int_0^\infty (x_2 (x_2+y))^{\alpha - 1} \exp\{-2 x_2\} \ dx_2.
\]
To compute this integral for $\alpha = r+1$ an integer, we use the binomial expansion
\[
(x_2+y)^r = \sum_{j = 0}^r \binom{r}{j} x_2^j y^{r-j}
\]
and then integrate term-by-term. We have that
\[
\int_0^\infty x_2^{r+j} \exp\{-2 x_2\} \ d x_2 = \frac{\Gamma(r+j+1)}{2^{r+j+1}}
\]
as the integrand is proportional to a $Gamma(r+j+1,2)$ pdf. The result follows. Note that the pdf is symmetric about zero, as we must have that the distribution of $X_1 - X_2$ is identical to the distribution of $X_2 - X_1$.
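As a quick numerical check of the term-by-term integral identity, we can compare numerical integration in \texttt{R} (via \texttt{integrate}) with the closed form $\Gamma(r+j+1)/2^{r+j+1}$; the values of $r$ and $j$ below are arbitrary illustrative choices.
<<>>=
# check the Gamma integral identity numerically; r and j are arbitrary illustrative values
r<-3; j<-2
integrate(function(x){x^(r+j)*exp(-2*x)},0,Inf)$value   #numerical integral
gamma(r+j+1)/2^(r+j+1)                                  #closed form
@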
<>= set.seed(8910) n<-10000 al<-4 r<-al-1 X1<-rgamma(n,al,1) X2<-rgamma(n,al,1) Y<-X1-X2 par(mar=c(4,3,1,0)) y<-seq(-15,15,by=0.001) ay<-abs(y) fy<-y*0 for(j in 0:r){ fy<-fy+choose(r,j)*(gamma(r+j+1)/gamma(r+1)^2)*(2^(-(r+j+1)))*ay^(r-j)*exp(-ay) } hist(Y,breaks=seq(-15,15,by=0.5),main='',freq=FALSE);box() lines(y,fy,col='red') @ \pagebreak \item \textbf{Monte Carlo:} The general principle of Monte Carlo is that the availability of random samples from a distribution $F_X(x)$ is essentially equivalent to knowledge of $F_X$ itself if the number of samples is large enough. Suppose $X_1,\ldots,X_n$ are independent rvs having the same cdf $F_X$. We may approximate $F_X$ itself using the \textit{empirical distribution function}, $\widehat F_n$, defined by \[ \widehat F_n(x) = \frac{1}{n} \sum_{i=1}^n \One_{(-\infty,x]}(X_i) \equiv \frac{1}{n} \sum_{i=1}^n \One_{[X_i,\infty)}(x) \qquad x \in \R \] which, for any fixed $x$, records the fraction of the $X_i$s that are less than or equal to $x$. If $n$ is large enough, we can show that $\widehat F_n(x)$, when computed with simulated sample values $x_1,\ldots,x_n$, closely approximates $F_X(x)$. Thus we may also approximate the expectation \[ \Expect_X[g(X)] = \int_{-\infty}^\infty g(x) \ d F_X(x) \] by the sample average \[ \widehat \Expect_X[g(X)] = \int_{-\infty}^\infty g(x) \ d \widehat F_n(x) = \frac{1}{n} \sum_{i=1}^n g(x_i). \] This is the same principle that we use in statistical inference when we use the sample mean, sample variance etc. from a data sample to estimate the theoretical mean and variance etc. The discrete distribution \[ \widehat f_n(x) = \sum_{i=1}^n \frac{1}{n} \One_{\{x\}}(X_i) = \sum_{i=1}^n \frac{1}{n} \One_{\{X_i\}}(x) \] with masses $1/n$ placed at the $X_i$s is used to approximate the pmf/pdf of $X$. \medskip In the previous example, with $X_1$ and $X_2$ independent $Gamma(\alpha,1)$, we may compute the variance of $Y=X_1 - X_2$ directly as $\Var_Y[Y] = \Var_{X_1}[X_1] + \Var_{X_2}[X_2] = \alpha + \alpha = 2 \alpha$, and approximate it by Monte Carlo as follows: <>= var(Y) #sample variance 2*al #theoretical variance. @ Note that because the values are randomly drawn, we will get slightly different numerical results each time. For 10 replicate runs, we get <>= vvec<-replicate(10,var(rgamma(n,al,1)-rgamma(n,al,1))) vvec @ If the sample size is increased, the variation becomes smaller. <>= n<-n*10 vvec<-replicate(10,var(rgamma(n,al,1)-rgamma(n,al,1))) vvec n<-n*10 vvec<-replicate(10,var(rgamma(n,al,1)-rgamma(n,al,1))) vvec @ Monte Carlo methods are most often used when there are no straightforward analytic results available. Suppose we have that \[ g(x) = \max\left\{2 |x| \log |x|,5\right\}. \] Then \[ \widehat \Expect_X[g(X)] = \frac{1}{n} \sum_{i=1}^n \max\left\{2 |x_i| \log |x_i|,5\right\} \] where $x_1,\ldots,x_n$ are drawn from $f_X$. <>= n<-100000 ecalc<-function(nv,av){ xv<-rgamma(nv,av,1)-rgamma(nv,av,1) gxv<-pmax(2*abs(xv)*log(abs(xv)),5) return(mean(gxv)) } gvec<-replicate(10,ecalc(n,al)) gvec @ An extension to basic Monte Carlo is \textit{\textbf{importance sampling}}: we have that \[ \Expect_X[g(X)] = \int_{-\infty}^\infty g(x) \frac{dF_X(x)}{dF_0(x)} \ d F_0(x) \] where $F_0$ is some other distribution, provided the quantity \[ \frac{dF_X(x)}{dF_0(x)} \equiv \frac{f_X(x)}{f_0(x)} \] is well-defined and finite on the support of $X$. Thus we may write \[ \Expect_X[g(X)] = \E_0 \left[ g(X) \frac{f_X(X)}{f_0(X)} \right] \] that is, as an expectation with respect to $f_0$. 
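As a simple illustration of importance sampling (the target, proposal, and function $g$ below are arbitrary illustrative choices), suppose $X \sim Normal(0,1)$ and $g(x) = x^2$, so that $\Expect_X[g(X)] = 1$. We may take $f_0$ to be a Student-$t$ density with 3 degrees of freedom: it has heavier tails than $f_X$, so the ratio $f_X(x)/f_0(x)$ is bounded on $\R$.
<<>>=
set.seed(8910)
n<-100000
x0<-rt(n,df=3)              #draws from the proposal f_0
w<-dnorm(x0)/dt(x0,df=3)    #importance weights f_X(x)/f_0(x)
gx<-x0^2                    #g(x) = x^2, so E_X[g(X)] = 1
mean(gx*w)                  #importance sampling estimate, using draws from f_0 only
mean(rnorm(n)^2)            #basic Monte Carlo estimate, for comparison
@
Both estimates should be close to 1; the importance sampling version uses only samples drawn from $f_0$.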
\end{itemize}

\bigskip

\textbf{Cautionary note:} Monte Carlo is a powerful technique, but it is not suitable for solving all problems. Consider computing the (Riemann) integral
\[
\int_0^1 \frac{1}{x} \sin \left(\frac{2 \pi }{x} \right) \: dx
\]
by Monte Carlo. This involves sampling $X_1,\ldots,X_n \sim Uniform(0,1)$ independently, and then computing
\[
\frac{1}{n} \sum_{i=1}^n \frac{1}{x_i} \sin \left(\frac{2 \pi }{x_i} \right).
\]
The Riemann integral can be computed (substituting $t = 1/x$, and then $u = 2 \pi t$) as
\begin{eqnarray*}
\int_0^1 \frac{1}{x} \sin \left(\frac{2 \pi }{x} \right) \: dx = \int_1^\infty \frac{\sin (2 \pi t)}{t} \: dt & = & \int_0^\infty \frac{\sin (t)}{t} \: dt - \int_0^{2\pi} \frac{\sin (t)}{t} \: dt\\[6pt]
& = & \text{Si}(\infty) - \text{Si}(2\pi)
\end{eqnarray*}
where $\text{Si}(.)$ is a special function (the \textit{sine integral}) with $\text{Si}(\infty) = \pi/2$. The numerical value of the integral is 0.1526.
<<>>=
library(pracma)
Si(10^6)-Si(2*pi)   #Riemann integral result
sincalc<-function(nv){
  xv<-runif(nv)
  return(mean(sin(2*pi/xv)/xv))
}
svec<-replicate(20,sincalc(n))   #Monte Carlo replicates
svec
@
The problem is that if $X \sim Uniform(0,1)$,
\[
\Expect_X \left[ \frac{1}{X} \sin \left(\frac{2 \pi}{X} \right) \right]
\]
is not defined. Specifically,
\[
\Expect_X \left[ \bigg| \frac{1}{X} \sin \left(\frac{2 \pi}{X} \right) \bigg| \right]
\]
is not finite.

\end{document}