\documentclass[notitlepage]{article}
\usepackage{../../Math598}
\usepackage{listings}
\usepackage{numprint}
\usepackage{enumerate}
\usepackage{multirow}
\usepackage{bbm}
\usepackage{bbold}
\usepackage{bm}
\usepackage{amsfonts,amsmath,amssymb}
\setlength{\parindent}{0pt}
\lstloadlanguages{R}
\definecolor{keywordcolor}{rgb}{0,0.6,0.6}
\definecolor{delimcolor}{rgb}{0.461,0.039,0.102}
\definecolor{Rcommentcolor}{rgb}{0.101,0.043,0.432}
\lstdefinestyle{Rsettings}{
  basicstyle=\ttfamily,
  breaklines=true,
  showstringspaces=false,
  keywords={if, else, function, theFunction, tmp}, % Write as many keywords
  otherkeywords={},
  commentstyle=\itshape\color{Rcommentcolor},
  keywordstyle=\color{keywordcolor},
  moredelim=[s][\color{delimcolor}]{"}{"},
}
\lstset{basicstyle=\ttfamily, numbers=none, literate={~} {$\sim$}{2}}
\begin{document}

<<>>=
library(knitr)
# global chunk options
opts_chunk$set(cache=TRUE, autodep=TRUE)
options(scipen=999)
options(repos=c(CRAN="https://cloud.r-project.org/"))
inline_hook <- function(x) {
  if (is.numeric(x)) {
    # ifelse does a vectorized comparison:
    # if integer-valued, print without decimals; otherwise print six decimal places
    res <- ifelse(x == round(x),
                  sprintf("%6i", x),
                  sprintf("%.6f", x))
    paste(res, collapse = ", ")
  }
}
knit_hooks$set(inline = inline_hook)
@

\begin{center}
\textsclarge{MATH 598: TOPICS IN STATISTICS}

\vspace{0.1 in}

\textsc{The Dirichlet process and the Bayesian bootstrap}
\end{center}

The Dirichlet Process (DP) is a probability distribution on the space of discrete distributions on $\R$. Given a random sample of data $y_1,\ldots,y_n$, under a $DP(\alpha,G_Y)$ prior the posterior distribution can be shown to be again a Dirichlet process, $DP(\alpha^\star,G_Y^\star)$, where
\[
\alpha^\star = \alpha + n \qquad \qquad G_Y^\star(y) = \dfrac{\alpha}{\alpha + n} G_Y (y) + \dfrac{1}{\alpha + n} \sum\limits_{j=1}^n \delta_{\{y_j\}}(y).
\]
In the limit as $\alpha \lra 0$, $G_Y^\star(y)$ becomes the empirical distribution
\[
G_Y^\star(y) = \frac{1}{n} \sum\limits_{j=1}^n \delta_{\{y_j\}}(y)
\]
and therefore a sampled value of the random distribution $\ftilde(y)$ is obtained by sampling weights $W = (W_1,\ldots,W_n)$, with $W \sim Dirichlet(1,1,\ldots,1)$, the $n$-dimensional Dirichlet distribution with all parameters equal to 1, and attaching these weights to $y_1,\ldots,y_n$. Conditional on a realization $w$ of $W$, we have a representation of the predictive distribution $p_n(.)$ derived from a Dirichlet process model as
\[
p_n(y) = \sum_{i=1}^n w_i \delta_{\{y_i\}}(y).
\]
For maximum likelihood estimation in a potentially mis-specified model $f(y;\theta)$, we identify the `true' value of $\theta$, denoted $\theta_0$, as the minimizer of the Kullback--Leibler divergence between the true data-generating density $f_0$ and the model:
\[
\theta_0 = \arg \min_\theta KL(f_0,f(.;\theta)) = \arg \min_\theta \int \log \left\{ \frac{f_0(y)}{f(y;\theta)} \right\} f_0(y) \: dy.
\]
An estimator is obtained when we replace the integral by its empirical (`Monte Carlo') version based on the i.i.d. sample; as the term involving $\log f_0(y)$ does not depend on $\theta$, this amounts to maximizing the log-likelihood
\[
\widehat \theta = \arg \max_\theta \sum_{i=1}^n \log f(y_i;\theta) = \arg \max_\theta \ell_n(\theta).
\]
In alternative form, $\widehat \theta$ is the solution to the estimating equation
\[
\dot{\ell}_n(\theta) = \sum_{i=1}^n \frac{\partial \log f(y_i;\theta) }{\partial \theta } = 0.
\]
A Bayesian version of the calculation replaces the original sample by a sample from the predictive distribution $p_n(y_{(n+1):(n+m)})$.
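As a small illustration of the weight-sampling step described above (a minimal sketch only; the seed and the objects \texttt{y.ill}, \texttt{w.ill} are purely illustrative and are not used in the analysis that follows), a single draw $W \sim Dirichlet(1,\ldots,1)$ can be obtained by normalizing independent $Gamma(1,1)$ draws; attaching these weights to the data gives one realization of $p_n$, and hence one draw of any functional of it, such as the mean.

<<>>=
# Minimal sketch (illustrative objects only): one Bayesian bootstrap draw
set.seed(2023)
y.ill <- rcauchy(8)            # any observed sample; here 8 illustrative values
n.ill <- length(y.ill)
w.ill <- rgamma(n.ill, 1)      # independent Gamma(1,1) draws ...
w.ill <- w.ill/sum(w.ill)      # ... normalized to give Dirichlet(1,...,1) weights
# the realization of p_n puts mass w.ill[i] on y.ill[i]; its mean is one
# posterior draw of the mean functional
sum(w.ill*y.ill)
@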
However, we are not restricted to using the `score' function as the basis of an estimation procedure: we may use any loss function $L(y,\theta)$, say, and define the Bayesian estimator as
\[
\arg \min_\theta \int L(y,\theta) p_n(y) \: dy = \arg \min_\theta \E_{p_n}[L(Y,\theta)].
\]
This is a valid fully Bayesian estimator as it minimizes an expected posterior loss; via this route, we may achieve fully Bayesian inference in a semi-parametric fashion. This leads to the calculation
\[
\E_{p_n}[L(Y,\theta)] = \sum_{i=1}^n w_i L(y_i,\theta) = - \sum_{i=1}^n w_i\ell(y_i;\theta)
\]
where the final equality holds in the specific case of the log-density loss $L(y,\theta) = -\ell(y;\theta) = -\log f(y;\theta)$. Hence, to minimize the expected loss, we must perform the calculation
\[
\theta_{\text{OPT}} = \arg \max_\theta \sum_{i=1}^n w_i\ell(y_i;\theta).
\]
The quantity $\theta_{\text{OPT}}$ is thus a functional of the Dirichlet process posterior, and so we may build up a posterior distribution for it by repeatedly sampling the Dirichlet weights and recomputing $\theta_{\text{OPT}}$ for each sample.

\bigskip

\textbf{EXAMPLE:} Suppose that $Y_1,\ldots,Y_n$ are a random sample from the Cauchy location family
\[
f_Y(y;\theta) = \frac{1}{\pi} \frac{1}{1+(y-\theta)^2} \quad y \in \R.
\]
The log-likelihood from a sample of size $n=10$ is plotted below, along with the maximizing value.

<<>>=
set.seed(7838)
n<-10
y<-rcauchy(n)
# weighted log-likelihood (additive constants omitted)
wlike<-function(th,yv,wv){
  logl<- -wv*log(1+(yv-th)^2)
  return(sum(logl))
}
w<-rep(1,n)
opt0<-optimize(f=wlike,interval=c(-100,100),yv=y,wv=w,maximum=TRUE)
thvec<-seq(-5,5,by=0.001);llvec<-thvec*0
for(i in 1:length(thvec)){llvec[i]<-wlike(thvec[i],y,w)}
par(mar=c(4,4,0,0))
plot(thvec,llvec,type='l',xlab=expression(theta),ylab=expression(l(theta)))
abline(v=opt0$maximum,lty=2,col='red')
@

Using the Bayesian bootstrap procedure, we can simulate $B=5$ resampled versions of the likelihood.

<<>>=
B<-5
par(mar=c(4,4,0,0))
plot(thvec,llvec-opt0$objective,type='l',xlab=expression(theta),ylab=expression(d(theta)))
abline(v=opt0$maximum,lty=2,col='red')
for(b in 1:B){
  w<-rgamma(n,1);w<-w/sum(w)   # Dirichlet(1,...,1) weights
  opt1<-optimize(f=wlike,interval=c(-100,100),yv=y,wv=w,maximum=TRUE)
  for(i in 1:length(thvec)){llvec[i]<-wlike(thvec[i],y,w)}
  lines(thvec,n*(llvec-opt1$objective),lty=3,col='blue')
  abline(v=opt1$maximum,lty=2,col='blue')
}
legend(-4,0,c(expression(l(theta)),'resampled'),col=c('black','blue'),lty=c(1,3))
@

In this plot,
\begin{itemize}
\item each log-likelihood is plotted relative to its maximum value, that is,
\[
d(\theta) = \ell_n(\theta) - \ell_n(\widehat \theta)
\]
is plotted.
\medskip
\item for the Dirichlet re-weighted log-likelihoods, to match scales correctly, the objective function is recentred at its maximum and multiplied by $n$, that is,
\[
d_W(\theta) = n \left\{ \sum_{i=1}^n w_i \ell(y_i;\theta) - \sum_{i=1}^n w_i \ell(y_i;\theta_{\text{OPT}}) \right\}
\]
is plotted.
\medskip
\item the position of the maximizing value $\theta_{\text{OPT}}$ is plotted as a vertical blue dashed line for each of the re-weighted cases, and as a vertical red dashed line for the parametric maximum likelihood estimator.
\end{itemize}
Using $B=5000$, we may generate from the posterior distribution for $\theta_{\text{OPT}}$.
<<>>=
B<-5000
th.samp<-rep(0,B)
for(b in 1:B){
  w<-rgamma(n,1);w<-w/sum(w)
  opt1<-optimize(f=wlike,interval=c(-100,100),yv=y,wv=w,maximum=TRUE)
  th.samp[b]<-opt1$maximum
}
par(mar=c(4,4,3,0))
hist(th.samp,freq=FALSE,breaks=seq(-5,5,by=0.1),main=' ',
     xlab=expression(theta[OPT]),ylab=expression(f(theta[OPT])))
box()
@

In this problem, if we specify a uniform prior for $\theta$ on $(-5,5)$, then
\[
\pi_n(\theta) \varpropto \prod_{i=1}^n \frac{1}{1+(y_i-\theta)^2} \qquad -5 < \theta < 5
\]
and we can compute the normalizing constant using a numerical approximation, for example Simpson's rule, and from this compute the posterior under correct specification, in which case $\theta_{\text{OPT}} = \theta_0 = 0$. Simpson's rule splits the interval $(x_0,x_N)$ into $N$ equal subintervals and uses the approximation
\[
\int_{x_0}^{x_N} g(x) \ dx \bumpeq \frac{(x_N-x_0)}{3N} \left[g(x_0) + 4 \sum_{j=1}^{N/2} g(x_{2j-1}) + 2 \sum_{j=1}^{N/2-1} g(x_{2j}) + g(x_N) \right]
\]
and in the calculation below, we use $N=10^{6}$ subintervals of $(-5,5)$, so that
\[
h = \frac{(x_N-x_0)}{N} = 0.00001.
\]
\pagebreak

<<>>=
w<-rep(1,n)
h<-0.00001
thvec<-seq(-5,5,by=h);llvec<-thvec*0
for(i in 1:length(thvec)){llvec[i]<-wlike(thvec[i],y,w)}
like.n<-exp(llvec-max(llvec))
# Simpson's rule: 4x weight at the odd-indexed points, 2x at the even-indexed interior points
halfn<-(length(thvec)-1)/2
odds<-c(1:halfn)*2
evens<-c(1:(halfn-1))*2+1
const<-(like.n[1]+4*sum(like.n[odds])+2*sum(like.n[evens])+like.n[length(thvec)])*h/3
post.n<-like.n/const
par(mar=c(4,4,1,0))
hist(th.samp,freq=FALSE,breaks=seq(-5,5,by=0.1),main=' ',ylim=range(0,1),
     xlab=expression(theta[OPT]),ylab=expression(pi[n](theta[OPT])))
box()
lines(thvec,post.n,col='red')
@

When the sample size is small, the two methods can differ appreciably.

<<>>=
set.seed(784638)
n<-10
y<-rcauchy(n)
wlike<-function(th,yv,wv){
  logl<- -wv*log(1+(yv-th)^2)
  return(sum(logl))
}
w<-rep(1,n)
opt0<-optimize(f=wlike,interval=c(-100,100),yv=y,wv=w,maximum=TRUE)
thvec<-seq(-5,5,by=0.001);llvec<-thvec*0
for(i in 1:length(thvec)){llvec[i]<-wlike(thvec[i],y,w)}
B<-5000
th.samp<-rep(0,B)
for(b in 1:B){
  w<-rgamma(n,1);w<-w/sum(w)
  opt1<-optimize(f=wlike,interval=c(-100,100),yv=y,wv=w,maximum=TRUE)
  th.samp[b]<-opt1$maximum
}
w<-rep(1,n)
h<-0.00001
thvec<-seq(-5,5,by=h);llvec<-thvec*0
for(i in 1:length(thvec)){llvec[i]<-wlike(thvec[i],y,w)}
like.n<-exp(llvec-max(llvec))
halfn<-(length(thvec)-1)/2
odds<-c(1:halfn)*2
evens<-c(1:(halfn-1))*2+1
const<-(like.n[1]+4*sum(like.n[odds])+2*sum(like.n[evens])+like.n[length(thvec)])*h/3
post.n<-like.n/const
par(mar=c(4,4,3,0))
thv<-th.samp[th.samp > -5 & th.samp < 5]
hist(thv,freq=FALSE,breaks=seq(-5,5,by=0.1),main=' ',ylim=range(0,1),
     xlab=expression(theta[OPT]),ylab=expression(pi[n](theta[OPT])));box()
lines(thvec,post.n,col='red')
@

When the sample size is $n=100$, the two methods coincide more closely.
<<>>=
set.seed(784638)
n<-100
y<-rcauchy(n)
wlike<-function(th,yv,wv){
  logl<- -wv*log(1+(yv-th)^2)
  return(sum(logl))
}
w<-rep(1,n)
opt0<-optimize(f=wlike,interval=c(-100,100),yv=y,wv=w,maximum=TRUE)
thvec<-seq(-5,5,by=0.001);llvec<-thvec*0
for(i in 1:length(thvec)){llvec[i]<-wlike(thvec[i],y,w)}
B<-5000
th.samp<-rep(0,B)
for(b in 1:B){
  w<-rgamma(n,1);w<-w/sum(w)
  opt1<-optimize(f=wlike,interval=c(-100,100),yv=y,wv=w,maximum=TRUE)
  th.samp[b]<-opt1$maximum
}
w<-rep(1,n)
h<-0.00001
thvec<-seq(-2.5,2.5,by=h);llvec<-thvec*0
for(i in 1:length(thvec)){llvec[i]<-wlike(thvec[i],y,w)}
like.n<-exp(llvec-max(llvec))
halfn<-(length(thvec)-1)/2
odds<-c(1:halfn)*2
evens<-c(1:(halfn-1))*2+1
const<-(like.n[1]+4*sum(like.n[odds])+2*sum(like.n[evens])+like.n[length(thvec)])*h/3
post.n<-like.n/const
par(mar=c(4,4,3,0))
thv<-th.samp[th.samp > -2.5 & th.samp < 2.5]
hist(thv,freq=FALSE,breaks=seq(-2.5,2.5,by=0.05),main=' ',ylim=range(0,3.5),
     xlab=expression(theta[OPT]),ylab=expression(pi[n](theta[OPT])));box()
lines(thvec,post.n,col='red')
@

\end{document}