\documentclass[notitlepage]{article} \usepackage{../Math323} \usepackage{listings} \usepackage{numprint} \usepackage{enumerate} \usepackage{multirow} \usepackage{bbm} \usepackage{amsfonts,amsmath,amssymb} \setlength{\parindent}{0pt} \lstloadlanguages{R} \definecolor{keywordcolor}{rgb}{0,0.6,0.6} \definecolor{delimcolor}{rgb}{0.461,0.039,0.102} \definecolor{Rcommentcolor}{rgb}{0.101,0.043,0.432} \lstdefinestyle{Rsettings}{ basicstyle=\ttfamily, breaklines=true, showstringspaces=false, keywords={if, else, function, theFunction, tmp}, % Write as many keywords otherkeywords={}, commentstyle=\itshape\color{Rcommentcolor}, keywordstyle=\color{keywordcolor}, moredelim=[s][\color{delimcolor}]{"}{"}, } \lstset{basicstyle=\ttfamily, numbers=none, literate={~} {$\sim$}{2}} \begin{document} <>= library(knitr) # global chunk options opts_chunk$set(cache=TRUE, autodep=TRUE) options(scipen=999) options(repos=c(CRAN="https://cloud.r-project.org/")) inline_hook <- function (x) { if (is.numeric(x)) { # ifelse does a vectorized comparison # If integer, print without decimal; otherwise print two places res <- ifelse(x == round(x), sprintf("%.3f", x), sprintf("%.3f", x) ) paste(res, collapse = ", ") } } knit_hooks$set(inline = inline_hook) @ \begin{center} {\textsclarge{Convergence Theorems}} \end{center} Suppose that $Y_1,Y_2,\ldots,Y_n,$ are independent random variables that have the same distribution, and have expectation $\mu$ and variance $\sigma^2$. \begin{itemize} \item \textbf{The Weak Law of Large Numbers (WLLN):} For each $n$, let \[ \overline{Y}_n = \frac{1}{n} \sum_{i=1}^n Y_i \] be the sample mean random variable. Then $\overline{Y}_n \CiP \mu$ as $n \lra \infty$, that is, the probability distribution of $\overline{Y}_n$ becomes concentrated at $\mu$. \medskip \item \textbf{The Central Limit Theorem (CLT):} The random variable \[ U_n = \frac{\sqrt{n}(\overline{Y}_n - \mu)}{\sigma} \] \textit{converges in distribution} to a \textit{standard Normal distribution}. We could also write \[ U_n = \frac{(\overline{Y}_n - \mu)}{\sigma/\sqrt{n}} \qquad \text{or} \qquad U_n = \frac{\left(\sum\limits_{i=1}^n Y_i - n \mu \right)}{\sqrt{n} \sigma}. \] Thus, when $n$ is large, we can approximate the distribution of $\overline{Y}_n$, or of \[ S_n = \sum_{i=1}^n Y_i \] using a Normal distribution, irrespective of the actual distribution of $Y_1,\ldots,Y_n$. \end{itemize} \bigskip \textbf{EXAMPLE: $Bernoulli(p)$}. \smallskip If $Y_i \sim Bernoulli(p)$, then $\E[Y_i] = p$ and $\V[Y_i] = p(1-p)$. The WLLN suggests that \[ \overline{Y}_n \CiP p \] whereas the CLT suggests that \[ U_n = \frac{\sqrt{n}(\overline{Y}_n - p)}{\sqrt{p(1-p)}} \CiD Normal(0,1). \] We can check these results in simulation: we generate \texttt{M} repeated sequences of length \texttt{N}, and study the behaviour of $\overline{Y}_n$ and $U_n$. In the simulation below, we choose $M=10$ and $N=1000$. We first plot the trace plots of $\overline{Y}_n$ for $n=1,\ldots,N$ for 10 different replicated runs. <>= set.seed(1010) M<-10 N<-1000 p<-0.3 ybar<-replicate(M,cumsum(rbinom(N,1,p))/c(1:N)) par(mar=c(4,4,1,0)) plot(1:N,ybar[,1],type='l',xlab='n',ylab=expression(bar(y)[n]),ylim=range(0,1)) for(j in 2:M){lines(1:N,ybar[,j])} abline(h=p,col='red',lwd=2) @ It is clear that the sample means are converging to $p=0.3$ (marked as the red line) for each of the repeated runs. \medskip To check the CLT, we increase $M$ to 2000, and reduce $N$ to 500. For each replicate run, and each $n$, we compute \[ U_n = \frac{\sqrt{n}(\overline{Y}_n - p)}{\sqrt{p(1-p)}} \] which the CLT predicts will converge to a standard Normal random variable as $n$ gets large. <>= M<-2000 N<-500 ybar<-replicate(M,cumsum(rbinom(N,1,p))/c(1:N)) Un<-(ybar-p)/sqrt(p*(1-p)) Un<-apply(Un,2,function(x){return(x*sqrt(1:N))}) @ We can check the claim of Normality by computing mean, variance and \textit{quantiles} of the sampled values; the $\alpha$ quantile of a distribution is the value $y_\alpha$ such that \[ F(y_\alpha) = \alpha. \] For example, the $\alpha=0.5$ quantile is the \textit{median}; the $\alpha=0.25$ quantile is the \textit{lower quartile}; the $\alpha=0.75$ quantile is the \textit{upper quartile} etc. We compare these summary values with the values of the standard Normal distribution for $n=100,200,300,400,500$. <>= Un.sub<-t(Un[100*c(1:5),]) qvec<-c(0.025,0.25,0.5,0.75,0.975) Un.summary<-apply(Un.sub,2,function(x){return(c(mean(x),var(x),quantile(x,qvec)))}) Un.summary<-cbind(Un.summary,c(0,1,qnorm(qvec))) colnames(Un.summary)<-c(paste("n=",seq(100,500,by=100),sep=""),"Z") rownames(Un.summary)[1:2]<-c("mean","var") round(Un.summary,4) @ %<>= %par(mar=c(5,4,3,2),mfrow=c(2,2)) %Un.sub<-t(Un[100*c(1:5),]) %for(j in 1:5){ % uvec<-Un.sub[,j] % uvec<-uvec[uvec >= -3.5 & uvec <= 3.5] % hist(uvec,breaks=seq(-3.5,3.5,by=0.25),col='gray',ylim=range(0,250), % main=substitute( paste("n=",nval),list(nval = 100*j)),xlab=expression(u[n])) % box() %} %@ We can compute the pmf of $U_n$ exactly using transformation methods; we know that \[ S_n = \sum_{i=1}^n Y_i \sim Binomial(n,p) \] -- this can be very simply proved using mgf methods -- so that \[ p_{S_n}(s) = \dbinom{n}{s} p^s (1-p)^{n-s} \qquad s \in \{0,1,2,\ldots,n\} \] with $p_{S_n}(s) = 0$ for other values of $s$. Therefore $U_n$ also has a discrete distribution, with the same probabilities, but restricted to the set of values \[ u = \frac{\sqrt{n}(s/n - p)}{\sqrt{p(1-p)}} \] for $s \in \{0,1,2,\ldots,n\}$. We compare the \textit{cumulative} probabilities associated with the distribution and its approximation; the CLT tells us that \[ F_{U_n}(u) = P(U_n \leq u) \bumpeq \Phi (u) \] where $\Phi(.)$ is the standard Normal cdf. The computation is performed for $n=20,40,60,80,100$. In \texttt{R}, \texttt{pbinom} computes the binomial cdf and \texttt{pnorm} computes the Normal cdf. <>= par(mar=c(4,4,2,2),mfrow=c(3,2)) nvec<-seq(20,100,by=20) xv0<-seq(-2.5,2.5,by=0.5);yv0<-pnorm(xv0) Fmat<-matrix(0,nrow=length(nvec),ncol=length(xv0)) xv<-seq(-5,5,by=0.01);yv<-pnorm(xv) for(j in 1:length(nvec)){ ivec<-0:nvec[j] uvec<-sqrt(nvec[j])*(ivec/nvec[j]-p)/sqrt(p*(1-p)) Pvec<-pbinom(0:nvec[j],nvec[j],p) plot(uvec,Pvec,pch=19,cex=0.5,xlim=range(-5,5), main=substitute( paste("n=",nval),list(nval = nvec[j])), xlab=expression(u),ylab=expression(F[n](u))) lines(xv,yv,col='red') for(k in 1:length(xv0)){Fmat[j,k]<-ifelse(is.na(Pvec[max(which(uvec<=xv0[k]))]),0, Pvec[max(which(uvec<=xv0[k]))])} } Fmat<-rbind(yv0,Fmat);colnames(Fmat)<-xv0; rownames(Fmat)<-c("Z",paste("n=",seq(20,100,by=20),sep="")) round(Fmat,3) @ Here the red line is the plot of the standard normal cdf; clearly the approximation is quite good even for $n=40$. \pagebreak \textbf{EXAMPLE: Continuous $Uniform(0,1)$}. \smallskip If $Y_i \sim Uniform(0,1)$, then $\E[Y_i] = 1/2$ and $\V[Y_i] = 1/12$. The WLLN suggests that \[ \overline{Y}_n \CiP \frac{1}{2} \] whereas the CLT suggests that \[ U_n = \frac{\sqrt{n}(\overline{Y}_n - 1/2)}{\sqrt{11/144}} \CiD Normal(0,1). \] In the simulation below, we choose $M=10$ repeated sequences of maximal length $N=1000$. The trace plots of $\overline{Y}_n$ for $n=1,\ldots,N$ for 10 different replicated runs. It is clear that the sample means are converging to $1/2$ (marked as the red line) for each of the repeated runs. <>= set.seed(1010) M<-10 N<-1000 ybar<-replicate(M,cumsum(runif(N))/c(1:N)) par(mar=c(4,4,1,0)) plot(1:N,ybar[,1],type='l',xlab='n',ylab=expression(bar(y)[n]),ylim=range(0,1)) for(j in 2:M){lines(1:N,ybar[,j])} abline(h=1/2,col='red',lwd=2) @ \medskip To check the CLT, we increase $M$ to 2000, and reduce $N$ to 500. For each replicate run, and each $n$, we compute \[ U_n = \frac{\sqrt{n}(\overline{Y}_n - 1/2)}{\sqrt{11/144}} \] which the CLT predicts will converge to a standard Normal random variable as $n$ gets large. <>= M<-2000 N<-500 ybar<-replicate(M,cumsum(runif(N))/c(1:N)) Un<-(ybar-1/2)/sqrt(11/144) Un<-apply(Un,2,function(x){return(x*sqrt(1:N))}) @ We can check the claim of Normality by computing mean, variance and \textit{quantiles} of the sampled values We compare these summary values with the values of the standard Normal distribution for $n=100,200,300,400,500$. <>= Un.sub<-t(Un[100*c(1:5),]) qvec<-c(0.025,0.25,0.5,0.75,0.975) Un.summary<-apply(Un.sub,2,function(x){return(c(mean(x),var(x),quantile(x,qvec)))}) Un.summary<-cbind(Un.summary,c(0,1,qnorm(qvec))) colnames(Un.summary)<-c(paste("n=",seq(100,500,by=100),sep=""),"Z") rownames(Un.summary)[1:2]<-c("mean","var") round(Un.summary,4) @ In the uniform case, it is not straightforward to compute the distribution of $U_n$ analytically. We instead compare the histograms of $U_n$ across the $M$ replicates with the standard normal pdf. The computation is performed for $n=10,20,30,40,50$. <>= par(mar=c(3,3,1,2),mfrow=c(1,2)) Un.sub<-t(Un[10*c(1:5),]) xv<-seq(-3.5,3.5,by=0.001) yv<-dnorm(xv) for(j in 1:5){ uvec<-Un.sub[,j] uvec<-uvec[uvec >= -3.5 & uvec <= 3.5] hist(uvec,breaks=seq(-3.5,3.5,by=0.25),col='gray',ylim=range(0,250),cex.axis=0.75, main=substitute( paste("n=",nval),list(nval = 10*j)),xlab=expression(u[n])) lines(xv,yv*M*0.25,col='red') box() } @ Here the red line is the plot of the standard normal cdf; the approximation is quite good even for $n=10$. \bigskip \textbf{EXAMPLE: $Exponential (1)$}. \smallskip If $Y_i \sim Exponential(1)$, then $\E[Y_i] = 1$ and $\V[Y_i] = 1$. The WLLN suggests that \[ \overline{Y}_n \CiP 1 \] whereas the CLT suggests that \[ U_n = \frac{\sqrt{n}(\overline{Y}_n - 1)}{\sqrt{1}} \CiD Normal(0,1). \] In the simulation below, we choose $M=10$ repeated sequences of maximal length $N=1000$. The trace plots of $\overline{Y}_n$ for $n=1,\ldots,N$ for 10 different replicated runs. It is clear that the sample means are converging to $1$ (marked as the red line) for each of the repeated runs. <>= set.seed(1010) M<-10 N<-1000 ybar<-replicate(M,cumsum(rexp(N))/c(1:N)) par(mar=c(4,4,1,0)) plot(1:N,ybar[,1],type='l',xlab='n',ylab=expression(bar(y)[n]),ylim=range(0,2)) for(j in 2:M){lines(1:N,ybar[,j])} abline(h=1,col='red',lwd=2) @ \medskip To check the CLT, we increase $M$ to 2000, and reduce $N$ to 500. For each replicate run, and each $n$, we compute \[ U_n = \sqrt{n}(\overline{Y}_n - 1) \] which the CLT predicts will converge to a standard Normal random variable as $n$ gets large. <>= M<-2000;N<-500 ybar<-replicate(M,cumsum(rexp(N))/c(1:N)) Un<-(ybar-1)/sqrt(1) Un<-apply(Un,2,function(x){return(x*sqrt(1:N))}) @ We can check the claim of Normality by computing mean, variance and \textit{quantiles} of the sampled values We compare these summary values with the values of the standard Normal distribution for $n=100,200,300,400,500$. <>= Un.sub<-t(Un[100*c(1:5),]) qvec<-c(0.025,0.25,0.5,0.75,0.975) Un.summary<-apply(Un.sub,2,function(x){return(c(mean(x),var(x),quantile(x,qvec)))}) Un.summary<-cbind(Un.summary,c(0,1,qnorm(qvec))) colnames(Un.summary)<-c(paste("n=",seq(100,500,by=100),sep=""),"Z") rownames(Un.summary)[1:2]<-c("mean","var") round(Un.summary,4) @ In the Exponential case, we can show using mgfs that \[ S_n = \sum_{i=1}^n Y_i \sim Gamma(n,1) \] and therefore we can compute the cdf of $U_n$ analytically from first principles: \[ F_{U_n}(u) = P(U_n \leq u) = P \left(\sqrt{n}(\overline{Y}_n - 1) \leq u \right) = P(S_n \leq \sqrt{n} u + n) = F_{S_n}(\sqrt{n} u + n) \] <>= par(mar=c(4,4,2,2),mfrow=c(1,2)) nvec<-seq(20,100,by=20) xv0<-seq(-2.5,2.5,by=0.5);yv0<-pnorm(xv0) Fmat<-matrix(0,nrow=length(nvec),ncol=length(xv0)) xv<-seq(-5,5,by=0.01);yv<-pnorm(xv) for(j in 1:length(nvec)){ ivec<-0:nvec[j] svec<-sqrt(nvec[j])*xv+nvec[j] Fvec<-pgamma(svec,nvec[j],1) plot(xv,Fvec,pch=19,cex=0.5,xlim=range(-5,5),type='l', main=substitute( paste("n=",nval),list(nval = nvec[j])), xlab=expression(u),ylab=expression(F[n](u))) lines(xv,yv,col='red') Fmat[j,]<-pgamma(sqrt(nvec[j])*xv0+nvec[j],nvec[j],1) } Fmat<-rbind(yv0,Fmat);colnames(Fmat)<-xv0; rownames(Fmat)<-c("Z",paste("n=",seq(20,100,by=20),sep="")) round(Fmat,3) @ In these plots, the black (exact cdf of $U_n$) and red (standard Normal approximation to the distribution) are almost identical. \bigskip We also compare the histograms of $U_n$ across the $M$ replicates with the standard normal pdf. The computation is performed for $n=10,20,30,40,50$. <>= par(mar=c(3,3,1,2),mfrow=c(1,2)) Un.sub<-t(Un[10*c(1:5),]) xv<-seq(-3.5,3.5,by=0.001) yv<-dnorm(xv) for(j in 1:5){ uvec<-Un.sub[,j] uvec<-uvec[uvec >= -3.5 & uvec <= 3.5] hist(uvec,breaks=seq(-3.5,3.5,by=0.25),col='gray',ylim=range(0,250),cex.axis=0.75, main=substitute( paste("n=",nval),list(nval = 10*j)),xlab=expression(u[n])) lines(xv,yv*M*0.25,col='red');box() } @ Here the red line is the plot of the standard normal cdf; the approximation is good for $n=50$, but not so good for smaller $n$. \end{document}