\documentclass[notitlepage,11pt]{article}
\usepackage{../../Math598}
\usepackage{listings}
\usepackage{numprint}
\usepackage{enumerate}
\usepackage{multirow}
\usepackage{bbm}
\usepackage{bbold}
\usepackage{bm}
\usepackage{amsfonts,amsmath,amssymb}
\setlength{\parindent}{0pt}
\lstloadlanguages{R}
\definecolor{keywordcolor}{rgb}{0,0.6,0.6}
\definecolor{delimcolor}{rgb}{0.461,0.039,0.102}
\definecolor{Rcommentcolor}{rgb}{0.101,0.043,0.432}
\lstdefinestyle{Rsettings}{
  basicstyle=\ttfamily,
  breaklines=true,
  showstringspaces=false,
  keywords={if, else, function, theFunction, tmp}, % Write as many keywords
  otherkeywords={},
  commentstyle=\itshape\color{Rcommentcolor},
  keywordstyle=\color{keywordcolor},
  moredelim=[s][\color{delimcolor}]{"}{"},
}
\lstset{basicstyle=\ttfamily, numbers=none, literate={~} {$\sim$}{2}}
\begin{document}

<<>>=
library(knitr)
# global chunk options
opts_chunk$set(cache=TRUE, autodep=TRUE)
options(scipen=999)
options(repos=c(CRAN="https://cloud.r-project.org/"))
inline_hook <- function (x) {
  if (is.numeric(x)) {
    # ifelse does a vectorized comparison
    # If integer, print without decimal; otherwise print six decimal places
    res <- ifelse(x == round(x),
      sprintf("%6i", x),
      sprintf("%.6f", x)
    )
    paste(res, collapse = ", ")
  } else {
    x   # return non-numeric inline values unchanged
  }
}
knit_hooks$set(inline = inline_hook)
@

\begin{center}
\textsclarge{MATH 559: Bayesian Theory and Methods}

\vspace{0.1 in}

\textsc{Bayesian nonparametric methods}
\end{center}

The \textit{Dirichlet Process} is a probability distribution on the space of discrete distributions on $\R$ characterized by
\begin{itemize}
\item a \emph{base distribution}, $G_X$, which is some probability measure on $\R$, from which a countable collection of random variables $X_1,X_2,\ldots$ is drawn;
\item a countable collection of random variables $\pi_1,\pi_2,\ldots$ with $\pi_i > 0$ such that
\[
\sum\limits_{i=1}^\infty \pi_i = 1,
\]
constructed in the following way: $\pi_1 = V_1 \sim Beta(1,\alpha)$ and for $i=2,3,\ldots$,
\[
\pi_i = V_i \prod_{j=1}^{i-1} (1-V_j)
\]
that is
\begin{align*}
\pi_1 & = V_1 \\[3pt]
\pi_2 & = (1-V_1) V_2\\[3pt]
\pi_3 & = (1-V_1) (1-V_2) V_3
\end{align*}
etc., where for each $j$, $V_j \sim Beta(1,\alpha)$ for some parameter $\alpha > 0$.

Suppose that $Z$ is a random variable on $\{1,2,\ldots\}$ with mass function defined by $\{\pi_1,\pi_2,\ldots\}$, that is, $\Pr[Z = i] = \pi_i$. Then
\begin{align*}
\Pr[Z > 2] & = 1 - \pi_1 - \pi_2 = 1-V_1 - (1-V_1)V_2 = (1-V_1)(1-V_2) \\[6pt]
\Pr[Z > 3] & = 1- \pi_1 - \pi_2 - \pi_3 = (1-V_1)(1-V_2) - (1-V_1)(1-V_2) V_3 = (1-V_1)(1-V_2)(1-V_3)
\end{align*}
etc., and in general, for $n \geq 1$,
\[
P_n = \Pr[Z > n] = \prod_{j=1}^n (1-V_j).
\]

The sequence of \textbf{random variables} $\{P_n\}$ is \textbf{strictly decreasing} (with probability 1),
\[
P_1 > P_2 > \cdots > P_n > \cdots
\]
and is bounded below by zero, so it converges almost surely to some limit. Let $\epsilon > 0$. As a product of terms lying in $(0,1)$ can be no larger than its smallest factor,
\[
\Pr[P_n > \epsilon] = \Pr \left[\prod_{j=1}^n (1-V_j) > \epsilon \right] \leq \Pr[ S_n > \epsilon] \qquad S_n = \min\{1-V_1,\ldots,1-V_n\}.
\]
As the random variables $V_j$ are independent, if $F$ is the cdf of the $1-V_j\sim Beta(\alpha,1)$ random variables, so that $F(\epsilon) = \epsilon^\alpha$, then
\[
\Pr[P_n > \epsilon] \leq \{1-F(\epsilon)\}^n = (1-\epsilon^\alpha)^n \lra 0 \qquad n \lra \infty,
\]
and hence the limiting value is 0, that is, the sequence converges (almost surely) to 0 as $n \longrightarrow \infty$. Thus $\sum_{i=1}^\infty \pi_i = 1$ almost surely, and the generated $\pi_i$ constitute a random discrete distribution. Also, by construction, in expectation $\pi_1 > \pi_2 > \cdots$, which facilitates truncation at some finite value $M$.
Note that, up to machine accuracy, the value of $\pi_i$ will eventually become zero in any case; a numerical illustration is given immediately after this list.
\end{itemize}
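As a quick numerical check of this argument (a sketch only; the seed and object names below are arbitrary and not part of the construction above), the chunk simulates the tail products $P_n = \prod_{j=1}^n(1-V_j)$ and compares their Monte Carlo average with the exact value $\{\alpha/(\alpha+1)\}^n$ of the expectation of $P_n$; the tail mass decays geometrically, so truncating the weights at a moderate value of $M$ discards a negligible amount of probability.

<<>>=
# Illustrative check: the tail mass P_n = prod_{j<=n}(1-V_j) decays geometrically,
# so truncating the stick-breaking construction at M terms loses little mass.
set.seed(2023)
alpha<-2; nmax<-41; nsim<-5000
P.mat<-matrix(0,nrow=nsim,ncol=nmax)
for(r in 1:nsim){
  V<-rbeta(nmax,1,alpha)
  P.mat[r,]<-cumprod(1-V)      # P_1,...,P_nmax for one stick-breaking draw
}
nv<-c(5,10,20,40)
rbind(MonteCarlo=colMeans(P.mat)[nv],    # simulated E[P_n]
      Exact=(alpha/(alpha+1))^nv)        # exact E[P_n] = {alpha/(alpha+1)}^n
@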
\pagebreak

\textbf{Simulation:} 1000 replicated runs of the generated probabilities, truncated at $M=41$, with $\alpha=2$.

<<>>=
M<-41
alpha<-2
pi.mat<-matrix(0,nrow=1000,ncol=M)
for(i in 1:1000){
  V<-rbeta(M,1,alpha)
  pi.mat[i,]<-c(V[1],cumprod(1-V[-M])*V[-1])
}
par(mar=c(2,2,3,0))
boxplot(pi.mat,pch=19,cex=0.5,ylim=range(0,1),cex.axis=0.6)
title(expression(paste('Boxplot of stick-breaking draws of ', pi)),line=1)
@

The collections $\{X_1,X_2,\ldots\}$ and $\{\pi_1,\pi_2,\ldots\}$ define a random discrete distribution with mass function $\widetilde f$ where
\[
\widetilde f(x) = \sum_{i=1}^\infty \pi_i \delta_{X_i} (x)
\]
where $\delta_{X_i} (x) = 1 \Longleftrightarrow x = X_i$. We write $\widetilde f \sim DP(\alpha,G_X)$ to denote that $\widetilde f$ is drawn from the Dirichlet Process.

\bigskip

\textbf{EXAMPLE:} Here we choose $\alpha=2$, and set $G_X \equiv Normal(0,10^2)$.

<<>>=
M<-1001;alpha<-2
set.seed(1010)
Xi<-rnorm(M,0,10)    #Base measure draws
V<-rbeta(M,1,alpha)  #Stick-breaking draws
pi.vec<-c(V[1],cumprod(1-V[-M])*V[-1])
Xi1<-Xi
pi1<-pi.vec
Xi<-rnorm(M,0,10); V<-rbeta(M,1,alpha)
pi.vec<-c(V[1],cumprod(1-V[-M])*V[-1])
Xi2<-Xi
pi2<-pi.vec
Xi<-rnorm(M,0,10);V<-rbeta(M,1,alpha)
pi.vec<-c(V[1],cumprod(1-V[-M])*V[-1])
Xi3<-Xi
pi3<-pi.vec
par(mar=c(4,4,1,0))
plot(Xi1,pi1,type="n",ylim=range(0,1),xlab='X',ylab=expression(pi),xlim=range(-30,30))
points(Xi1,pi1,pch=19,xlim=range(-30,30),cex=0.5)
for(i in 1:M){lines(c(Xi1[i],Xi1[i]),c(0,pi1[i]),lty=2)}
@

<<>>=
par(mar=c(4,4,1,0))
plot(Xi1,pi1,type="n",ylim=range(0,1),xlab='X',ylab=expression(pi),xlim=range(-30,30))
points(Xi1[pi1>1e-6],pi1[pi1>1e-6],pch=19,xlim=range(-30,30),cex=0.5)
for(i in 1:M){
  if(pi1[i] < 1e-6) next
  lines(c(Xi1[i],Xi1[i]),c(0,pi1[i]),lty=2)
}
nval<-sum(pi1>1e-6)
text(-30,1,substitute(paste(nv,' points greater than ',10^{-6}),list(nv=nval)),adj=0)
par(mar=c(4,4,2,0))
plot(Xi1,pi1,type="n",ylim=range(0,1),xlab='X',ylab=expression(pi),xlim=range(-30,30))
title(expression(paste('Three draws from the ',DP(alpha,G[X]))))
points(Xi1[pi1>1e-6],pi1[pi1>1e-6],pch=19,xlim=range(-30,30),cex=0.5)
for(i in 1:M){
  if(pi1[i] < 1e-6) next
  lines(c(Xi1[i],Xi1[i]),c(0,pi1[i]),lty=2)
}
points(Xi2[pi2>1e-6],pi2[pi2>1e-6],pch=19,xlim=range(-30,30),col="red",cex=0.5)
for(i in 1:M){
  if(pi2[i] < 1e-6) next
  lines(c(Xi2[i],Xi2[i]),c(0,pi2[i]),lty=2,col="red")
}
points(Xi3[pi3>1e-6],pi3[pi3>1e-6],pch=19,xlim=range(-30,30),col="green",cex=0.5)
for(i in 1:M){
  if(pi3[i] < 1e-6) next
  lines(c(Xi3[i],Xi3[i]),c(0,pi3[i]),lty=2,col="green")
}
@

<<>>=
Xi<-rnorm(M,0,10)
V<-rbeta(M,1,alpha)
pi.vec<-c(V[1],cumprod(1-V[-M])*V[-1])
XiLarge<-Xi
piLarge<-pi.vec
xi.sort<-XiLarge[order(XiLarge)]
pi.sort<-pi.vec[order(XiLarge)]
Pi<-cumsum(pi.sort)
par(mar=c(4,4,2,0))
plot(xi.sort,Pi,type="s",ylim=range(0,1),xlab="x",ylab="Cumul. Prob")
title(expression(paste('Five replicate draws of the random cdf ',(alpha==2))))
for(i in 1:4){
  Xi<-rnorm(M,0,10)
  V<-rbeta(M,1,alpha)
  pi.vec<-c(V[1],cumprod(1-V[-M])*V[-1])
  XiLarge<-Xi
  piLarge<-pi.vec
  xi.sort<-XiLarge[order(XiLarge)]
  pi.sort<-pi.vec[order(XiLarge)]
  Pi<-cumsum(pi.sort)
  lines(xi.sort,Pi,type="s")
}
xv<-seq(-30,30,length=1001)
yv<-pnorm(xv,0,10)
lines(xv,yv,col="red",lwd=2)
@

The parameter $\alpha$ is a precision parameter; as $\alpha$ increases, the random distribution becomes more concentrated around the base measure.
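To quantify this concentration, the following minimal sketch (the grid, seed, replication settings and the function name \texttt{max.dev} are arbitrary choices, and the random cdf is truncated at $M$ atoms) records the average maximum deviation between a draw of the random cdf and the base cdf of $Normal(0,10^2)$ for increasing $\alpha$; the deviation shrinks as $\alpha$ grows. The chunk that follows displays replicate draws of the random cdf for $\alpha=100$.

<<>>=
# Average maximum deviation between truncated random DP cdfs and the base cdf
# Normal(0,10^2), for increasing alpha (illustrative sketch).
set.seed(2525)
xg<-seq(-30,30,by=0.5)
G0<-pnorm(xg,0,10)
max.dev<-function(alpha,M=1001,nrep=20){
  dev<-numeric(nrep)
  for(r in 1:nrep){
    Xi<-rnorm(M,0,10)
    V<-rbeta(M,1,alpha)
    pv<-c(V[1],cumprod(1-V[-M])*V[-1])
    Ftil<-sapply(xg,function(x){sum(pv[Xi<=x])})  # random cdf evaluated on the grid
    dev[r]<-max(abs(Ftil-G0))
  }
  mean(dev)
}
devs<-sapply(c(2,20,100),max.dev)
names(devs)<-c("alpha=2","alpha=20","alpha=100")
round(devs,3)
@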
<<>>=
alpha<-100
Xi<-rnorm(M,0,10)
V<-rbeta(M,1,alpha)
pi.vec<-c(V[1],cumprod(1-V[-M])*V[-1])
XiLarge<-Xi
piLarge<-pi.vec
xi.sort<-XiLarge[order(XiLarge)]
pi.sort<-pi.vec[order(XiLarge)]
Pi<-cumsum(pi.sort)
par(mar=c(4,4,2,0))
plot(xi.sort,Pi,type="s",ylim=range(0,1),xlab="x",ylab="Cumul. Prob")
title(expression(paste('Five replicate draws of the random cdf ',(alpha==100))))
for(i in 1:4){
  Xi<-rnorm(M,0,10)
  V<-rbeta(M,1,alpha)
  pi.vec<-c(V[1],cumprod(1-V[-M])*V[-1])
  XiLarge<-Xi
  piLarge<-pi.vec
  xi.sort<-XiLarge[order(XiLarge)]
  pi.sort<-pi.vec[order(XiLarge)]
  Pi<-cumsum(pi.sort)
  lines(xi.sort,Pi,type="s")
}
xv<-seq(-30,30,length=1001)
yv<-pnorm(xv,0,10)
lines(xv,yv,col="red",lwd=2)
@

The random distribution $\widetilde f$ can be sampled to produce iid data $Y_1,\ldots,Y_n$ in a straightforward fashion. In the simulation below, we simulate $\widetilde f$ with $\alpha = 10$ and $G_X \equiv Normal(0,1)$.

<<>>=
M<-1000
alpha<-10
X<-rnorm(M,0,1)
V<-rbeta(M,1,alpha)
pi.vec<-c(V[1],cumprod(1-V[-M])*V[-1])
n<-1000
Y<-X[sample(1:M,replace=TRUE,size=n,prob=pi.vec)]
par(mar=c(4,4,2,0))
hist(Y,col='gray',breaks=seq(-3,3,by=0.1),
  main=expression(paste('Sample of data from ', widetilde(f))));box()
@

Note that in the sample of size $n=\Sexpr{n}$, there are only \Sexpr{length(table(Y))} distinct $Y$ values

<<>>=
tabY<-table(Y)
names(tabY)<-as.character(round(as.numeric(names(tabY)),5))
tabY
@

and thus we observe \textit{clustering}: this is due to the fact that $\widetilde f$ is discrete.

\newpage

\textbf{Polya Urn:} We can also use a \textit{Polya Urn} scheme to sample these data:
\begin{enumerate}[1.]
\item simulate $Y_1 \sim G_X$;
\item for $i=2,3,\ldots,n$ generate $Y_i$ from the mixture distribution
\[
\frac{\alpha}{\alpha + i-1} G_X(.) + \frac{1}{\alpha+i-1} \sum_{j=1}^{i-1} \delta_{Y_j}(.)
\]
that is
\begin{itemizeWide}
\item with probability $\alpha/(\alpha+i-1)$, simulate $Y_i \sim G_X(.)$;
\item with probability $(i-1)/(\alpha+i-1)$, simulate $Y_i$ uniformly on $\{Y_1,\ldots,Y_{i-1}\}$; that is, each of the previous values is selected with probability $1/(\alpha+i-1)$.
\end{itemizeWide}
\end{enumerate}

<<>>=
n<-1000
Y<-numeric(n)
Y[1]<-rnorm(1,0,1)
alpha<-10
for(i in 2:n){
  u<-runif(1)
  if(u < alpha/(alpha+i-1)){
    Y[i]<-rnorm(1,0,1)
  }else{
    Y[i]<-Y[sample(1:(i-1),size=1)]  # sample an index: avoids sample()'s scalar-x behaviour when i=2
  }
}
par(mar=c(4,4,2,0))
hist(Y,col='gray',breaks=seq(-3,3,by=0.1),
  main=expression(paste('Sample of data from ', widetilde(f),' via Polya Urn')));box()
@

Here there are \Sexpr{length(table(Y))} distinct $Y$ values. The Polya Urn method reconstructs a sample of exchangeable observables according to the de Finetti representation
\[
f_{Y_1,\ldots,Y_n} (y_1,\ldots,y_n) = \int \prod_{i=1}^n \widetilde f(y_i) \: \pi_0(d \widetilde f)
\]
by sampling directly from the left hand side using the factorization
\[
f_{Y_1,\ldots,Y_n} (y_1,\ldots,y_n) = f_{Y_1}(y_1) \prod_{i=2}^n f_{Y_i|Y_1,\ldots,Y_{i-1}}(y_i|y_1,\ldots,y_{i-1})
\]
where, denoting by $g_X(.)$ the density corresponding to $G_X(.)$, we have
\[
f_{Y_i|Y_1,\ldots,Y_{i-1}}(y_i|y_1,\ldots,y_{i-1}) = \frac{\alpha}{\alpha + i-1} g_X(y_i) + \frac{1}{\alpha+i-1} \sum_{j=1}^{i-1} \delta_{Y_j}(y_i).
\]
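Under the Polya Urn, the $i$th draw is a new value from $G_X$ with probability $\alpha/(\alpha+i-1)$, so the expected number of distinct values in a sample of size $n$ is
\[
\sum_{i=1}^{n} \frac{\alpha}{\alpha+i-1},
\]
which grows logarithmically in $n$ (approximately $\alpha \log(1 + n/\alpha)$). The short sketch below is an illustrative check only (the seed and object names are arbitrary); it compares this expectation with the number of distinct values observed in repeated Polya Urn samples.

<<>>=
# Distinct values in a Polya Urn sample: simulation versus the exact expectation
# E[K_n] = sum_{i=1}^n alpha/(alpha+i-1)   (illustrative sketch)
set.seed(3030)
n.urn<-1000; alpha.urn<-10; nrep<-200
Kcount<-numeric(nrep)
for(r in 1:nrep){
  Ysim<-numeric(n.urn)
  Ysim[1]<-rnorm(1,0,1)
  for(i in 2:n.urn){
    if(runif(1) < alpha.urn/(alpha.urn+i-1)){
      Ysim[i]<-rnorm(1,0,1)                    # new draw from the base measure
    }else{
      Ysim[i]<-Ysim[sample(1:(i-1),size=1)]    # copy a previously generated value
    }
  }
  Kcount[r]<-length(unique(Ysim))
}
c(Simulated=mean(Kcount), Exact=sum(alpha.urn/(alpha.urn+(1:n.urn)-1)))
@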
\bigskip

\textbf{Dirichlet Process Mixtures:} In a \textit{Dirichlet Process Mixture} model we add another stage that brings in a continuous distribution. For example, we could treat each $x_i$ as the location of a normal density, and generate a $y$ from the corresponding component:
\begin{eqnarray*}
x_1,x_2,\ldots &\sim& G_X\\[6pt]
\pi_1,\pi_2,\ldots &&\text{generated by stick-breaking}\\[6pt]
y_i \:|\: x_i &\sim & Normal(x_i,\sigma^2) \qquad i=1,2,\ldots
\end{eqnarray*}
Then the random density function generated takes the form
\[
\ftilde(y) = \sum_{i=1}^\infty \pi_i \phi((y - x_i)/\sigma)/\sigma
\]
where $\phi$ is the standard normal density; that is, an \textbf{infinite mixture model}. Below we take $\alpha=2$ and $G_X \equiv Normal(0,\lambda^2)$ with $\lambda=10$ and $\sigma=1$.

<<>>=
set.seed(1010)
M<-1001
alpha<-2; sigma<-1;lambda<-10
Xi<-rnorm(M,0,lambda);V<-rbeta(M,1,alpha)
pi.vec<-c(V[1],cumprod(1-V[-M])*V[-1])
xv<-seq(-30,30,by=0.01);yv<-xv*0
for(i in 1:M){
  yv<-yv+pi.vec[i]*dnorm(xv,Xi[i],sigma)
}
par(mar=c(4,5,2,0))
plot(xv,yv,type='l',xlab='y',ylab=expression(widetilde(f)(y)))
points(Xi,Xi*0,cex=2*pi.vec,pch=19)
@

\newpage

\textbf{Gibbs sampler:} For fully Bayesian inference consider the de Finetti construction
\begin{itemizeWide}
\item a prior model for $f$ that is $DPM(\alpha,G_X,g_Y;\theta)$, where $\theta$ represents the parameters that appear in $G_X$ and $g_Y$;
\item conditional on $f$, data $y_1,y_2,\ldots,y_n \sim f$ independently.
\end{itemizeWide}
We wish to compute the posterior for $f$. We use the hierarchical formulation
\begin{eqnarray*}
y_j | x_j & \stackrel{\text{ind.}}{\sim} & g_Y(y_j|x_j;\theta) \qquad j=1,\ldots,n\\[6pt]
x_1,\ldots,x_n & \sim & DP(\alpha,G_X;\theta)\\[6pt]
\theta & \sim & \pi_0(\theta).
\end{eqnarray*}
The latent variables $x_1,\ldots,x_n$ are also treated as parameters. They can be sampled using an MCMC \textbf{Gibbs sampler} scheme. For $j=1,\ldots,n$, we sample
\[
x_j \: | \: \bm{x}_{(j)},\bm{y} \sim w_0 p(x_j|y_j) + \sum_{l \neq j} w_l \delta_{x_l}
\]
where
\begin{itemize}
\item $\bm{y} = (y_1,\ldots,y_n)^\transpose$;
\item $\bm{x}_{(j)} = (x_1,\ldots,x_{j-1},x_{j+1},\ldots,x_n)^\transpose$;
\item $w_0$ is proportional to $\alpha$ times the \textbf{prior predictive} of $y_j$;
\item $w_l$ is proportional to the \textbf{likelihood} of $y_j|x_l$;
\item $p(x_j|y_j)$ is the \textbf{posterior} for $x_j$ given $y_j$.
\end{itemize}

\bigskip

\textbf{Marginalized sampler:} To be more efficient, we can use the \textbf{clustering} property: suppose that at a given iteration of the MCMC, there are $K$ clusters labelled 1 to $K$, where $K \leq n$. Label the $K$ distinct $x$ values $z_1,\ldots,z_K$ and for each $j$, define the corresponding cluster label $c_j$ where
\[
c_j = k \qquad \Longleftrightarrow \qquad x_j = z_k.
\]
We can update the $c_j$s instead of the $x_j$s, which is more computationally efficient; we are allocating the $x$s to the \textbf{cluster centres} at the $z$ values. For $i=1,\ldots,n$, let
\begin{itemize}
\item $n_1(i),\ldots,n_K(i)$ denote the number of items in clusters $1,\ldots,K$
\vspace{0.05 in}
\item $\bm{y}_1(i),\ldots,\bm{y}_K(i)$ denote the vectors of $y$ values currently allocated to the $K$ clusters
\end{itemize}
\textbf{if the $i$th data point is removed}. For $i=1,\ldots,n$, we sample the cluster labels in a Gibbs sampler with conditional probabilities
\[
\Pr[c_i = k \: | \: c_{(i)} ] \propto \frac{n_k(i)}{\alpha + n-1} p(y_i|\bm{y}_k(i)) \qquad k=1,\ldots,K
\]
and
\[
\Pr[c_i = K+1 \: | \: c_{(i)} ] \propto \frac{\alpha}{\alpha + n-1} p(y_i).
\]
In these expressions
\begin{itemize}
\item $p(y_i|\bm{y}_k(i))$ is the \textbf{posterior predictive} density for $y_i$, assuming that $y_i$ comes from cluster $k$;
\vspace{0.1 in}
\item $p(y_i)$ is the \textbf{prior predictive} density for $y_i$, assuming that $y_i$ comes from a \textbf{new cluster} not currently represented in the data.
\end{itemize}
In this formulation, we have \textbf{integrated out} the Dirichlet Process analytically. Thus we can simply sample the cluster labels in turn, and then sample the $z_1,\ldots,z_K$ values; this will allow us to do density estimation. By the usual calculation
\[
p(y_i|\bm{y}_k(i)) = \int g_Y(y_i \:|\: x) p(x \:|\: \bm{y}_k(i)) \: d x
\]
where
\begin{eqnarray*}
p(x \:|\: \bm{y}_k(i)) & \propto & p(\bm{y}_k(i) \:|\: x ) p(x) =\left\{\prod_{\substack{l \neq i \\ c_l = k}} g_Y(y_l \:|\: x ) \right\} p(x)
\end{eqnarray*}
gives the posterior distribution for the $k$th cluster centre. Similarly
\[
p(y_i) = \int g_Y(y_i \:|\: x) p(x) \: d x.
\]
In the earlier Normal model, suppose for simplicity that $G_X$ is the $Normal(0,\lambda^2)$ density:
\[
x_i \sim Normal(0,\lambda^2) \qquad \qquad y_i \: | \:x_i \sim Normal(x_i,\sigma^2).
\]
Then by standard calculations
\[
p(y_i|\bm{y}_k(i)) \equiv Normal\left(\frac{n_k(i) \overline{y}_k(i)/\sigma^2}{n_k(i)/\sigma^2+1/\lambda^2}, \frac{(n_k(i)+1)/\sigma^2+1/\lambda^2}{n_k(i)/\sigma^2+1/\lambda^2} \sigma^2 \right)
\]
and
\[
p(y_i) \equiv Normal\left(0,\sigma^2+\lambda^2 \right).
\]
The marginalized version of the algorithm often mixes more quickly than the Gibbs sampler on the cluster centres.
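The allocation step can be expressed compactly. The sketch below (the function and argument names are illustrative only, and not part of the sampler code that follows) computes the normalized allocation probabilities for a single observation $y$ under the Normal model, given the cluster counts and cluster sums with that observation removed, using the two predictive densities just derived. The samplers implemented below carry out exactly this computation inside their double loops, while also updating the cluster counts and sums.

<<>>=
# Sketch: allocation probabilities for one observation y under the Normal model,
# given cluster counts nk and cluster sums sk computed with y removed.
# (Names are illustrative; the main samplers below do this calculation in-line.)
alloc.probs<-function(y,nk,sk,alpha,sigma,lambda){
  Mvec<-(sk/sigma^2)/(nk/sigma^2+1/lambda^2)                          # posterior predictive means
  Svec<-sigma^2*((nk+1)/sigma^2+1/lambda^2)/(nk/sigma^2+1/lambda^2)   # posterior predictive variances
  p.exist<-nk*dnorm(y,Mvec,sqrt(Svec))                 # existing clusters k=1,...,K
  p.new<-alpha*dnorm(y,0,sqrt(sigma^2+lambda^2))       # new cluster K+1
  pv<-c(p.exist,p.new)
  pv/sum(pv)
}
# Example call: one observation and two existing clusters
alloc.probs(y=0.2,nk=c(5,3),sk=c(1.0,-2.5),alpha=2,sigma=1,lambda=10)
@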
<<>>=
set.seed(1091)
n<-200
alpha<-2;sigma<-1; lambda<-10
X<-numeric(n);X[1]<-rnorm(1,0,lambda)
for(j in 2:n){
  u<-runif(1)
  if(u < alpha/(alpha+j-1)){
    X[j]<-rnorm(1,0,lambda)
  }else{
    X[j]<-X[sample(1:(j-1),size=1)]  # sample an index: avoids sample()'s scalar-x behaviour when j=2
  }
}
True.K<-length(table(X))
Y<-rnorm(n,X,sigma)
@

<<>>=
par(mar=c(4,4,2,0))
inc<-abs(Y) < 20
hist(Y[inc],breaks=seq(-20,20,by=0.50),freq=FALSE,xlab='Y',
  ylim=range(0,0.25),col='gray',main='Simulated data');box()
@

<<>>=
set.seed(1010)
index<-c(1:n)
Kval<-5
Yq<-quantile(Y,prob=c(0:Kval)/Kval)
Cvec<-as.numeric(cut(Y,Yq,include.lowest=T))   # initial allocation into Kval clusters
Ysum.Vec<-rep(0,Kval)
for(k in 1:Kval){Ysum.Vec[k]<-sum(Y[Cvec==k])}
Nvec<-as.numeric(table(Cvec))
nreps<-10000;Nsamp<-rep(0,nreps)
yv<-seq(-20,20,by=0.01);fyv.samp<-numeric()
for(irep in 1:nreps){
  for(i in 1:n){
    # remove observation i from its current cluster
    sum.Vec<-Ysum.Vec
    sum.Vec[Cvec[i]]<-sum.Vec[Cvec[i]]-Y[i]
    nvec<-Nvec
    nvec[Cvec[i]]<-nvec[Cvec[i]]-1
    if(nvec[Cvec[i]] > 0){
      # allocation probabilities: existing clusters, then a new cluster
      mvec<-sum.Vec/nvec
      Mvec<-(sum.Vec/sigma^2)/(nvec/sigma^2+1/lambda^2)
      Svec<-sigma^2*((nvec+1)/sigma^2+1/lambda^2)/(nvec/sigma^2+1/lambda^2)
      pvec<-nvec*dnorm(rep(Y[i],Kval),Mvec,sqrt(Svec))
      pvec<-c(pvec,alpha*dnorm(Y[i],0,sqrt(sigma^2 + lambda^2)))/(n-1+alpha)
      pvec<-pvec/sum(pvec)
      Ksamp<-sample(c(1:(Kval+1)),size=1,prob=pvec)
      Cvec[i]<-Ksamp
      if(Ksamp == Kval+1){
        Kval<-Kval+1
        Ysum.Vec<-c(sum.Vec,Y[i])
        Nvec<-c(nvec,1)
      }else{
        Ysum.Vec<-sum.Vec
        Ysum.Vec[Ksamp]<-sum.Vec[Ksamp]+Y[i]
        Nvec<-nvec
        Nvec[Ksamp]<-nvec[Ksamp]+1
      }
    }else{
      # observation i was a singleton: delete its cluster and relabel
      Ktmp<-Cvec[i]
      sum.Vec<-sum.Vec[-Ktmp]
      nvec<-nvec[-Ktmp]
      Kval<-Kval-1
      Cvec[Cvec > Ktmp]<-Cvec[Cvec > Ktmp]-1
      mvec<-sum.Vec/nvec
      Mvec<-(sum.Vec/sigma^2)/(nvec/sigma^2+1/lambda^2)
      Svec<-sigma^2*((nvec+1)/sigma^2+1/lambda^2)/(nvec/sigma^2+1/lambda^2)
      pvec<-nvec*dnorm(rep(Y[i],Kval),Mvec,sqrt(Svec))
      pvec<-c(pvec,alpha*dnorm(Y[i],0,sqrt(sigma^2 + lambda^2)))/(n-1+alpha)
      pvec<-pvec/sum(pvec)
      Ksamp<-sample(c(1:(Kval+1)),size=1,prob=pvec)
      Cvec[i]<-Ksamp
      if(Ksamp == Kval+1){
        Kval<-Kval+1
        Ysum.Vec<-c(sum.Vec,Y[i])
        Nvec<-c(nvec,1)
      }else{
        Ysum.Vec<-sum.Vec
        Ysum.Vec[Ksamp]<-sum.Vec[Ksamp]+Y[i]
        Nvec<-nvec
        Nvec[Ksamp]<-nvec[Ksamp]+1
      }
    }
  }
  Nsamp[irep]<-Kval
  if(irep %% 500 == 0){
    # every 500 sweeps: draw the cluster centres and record a density estimate
    MuVec<-(Ysum.Vec/sigma^2)/(Nvec/sigma^2+1/lambda^2)
    SigVec<-1/(Nvec/sigma^2+1/lambda^2)
    Xvec<-rnorm(Kval)*sqrt(SigVec)+MuVec
    fyv<-yv*0
    for(k in 1:Kval){
      fyv<-fyv+Nvec[k]*dnorm(yv,Xvec[k],sigma)/n
    }
    fyv.samp<-cbind(fyv.samp,fyv)
  }
}
@
<<>>=
par(mar=c(4,4,2,0))
inc<-abs(Y) < 20
hist(Y[inc],breaks=seq(-20,20,by=0.5),freq=FALSE,col='gray',xlab='Y',
  ylim=range(0,0.25),main='Posterior density estimates');box()
for(i in 1:ncol(fyv.samp)){lines(yv,fyv.samp[,i],col='red')}
hist(Nsamp,col='gray',breaks=seq(0,30,by=1),freq=FALSE,
  xlab='K',main='Posterior sample for K');box()
@

\textbf{Analysis of the Galaxy data:} From the \texttt{MASS::galaxies} help: the galaxies data are the velocities in km/sec of 82 galaxies from six conic sections of an unfilled survey of the Corona Borealis region. Multimodality in such surveys is evidence for voids and superclusters in the far universe. In the analysis below the velocities are rescaled by a factor of $10^{-4}$, and we take $\alpha=2$, $\sigma=0.1$ and $\lambda=0.5$.

<<>>=
#Galaxy data using Gibbs sampler
Y.galaxy<-MASS::galaxies/10000
n.galaxy<-length(Y.galaxy)
set.seed(1010)
index<-c(1:n.galaxy)
nreps<-10000
alpha<-2.0
Pi<-Y.galaxy
Prior.K.galaxy<-numeric()
# Prior simulation for the number of clusters K: Gibbs sweeps under the DP prior;
# only the number of distinct values is recorded, so the scale of the base
# measure draw is immaterial here.
for(irep in 1:nreps){
  for(i in 1:n.galaxy){
    pvec<-rep(1/(alpha+n.galaxy-1),n.galaxy)
    pvec[i]<-alpha/(alpha+n.galaxy-1)
    ival<-sample(index,size=1,prob=pvec)
    if(ival == i){
      Pi[i]<-rnorm(1,0,lambda)
    }else{
      Pi[i]<-Pi[ival]
    }
  }
  Kval<-length(table(Pi))
  Prior.K.galaxy<-c(Prior.K.galaxy,Kval)
}
# Posterior simulation: Gibbs sampler on the cluster centres x_1,...,x_n
set.seed(1010)
index<-c(1:n.galaxy)
nreps<-10000
sigma<-0.1
lambda<-0.5
Pi<-Y.galaxy
Pvec<-rep(0,n.galaxy)
Posterior.K.galaxy<-numeric()
yv<-seq(0,4,by=0.01)
fyv.samp.GAL<-numeric()
for(irep in 1:nreps){
  for(i in 1:n.galaxy){
    pvec<-dnorm(Y.galaxy[i],Pi,rep(sigma,n.galaxy))
    pvec[i]<-alpha*dnorm(Y.galaxy[i],0,sqrt(sigma^2+lambda^2))
    ival<-sample(index,size=1,prob=pvec)
    if(ival == i){
      muval<-(Y.galaxy[i]/sigma^2)/(1/sigma^2+1/lambda^2)
      sval<-sqrt(1/(1/sigma^2+1/lambda^2))
      Pi[i]<-rnorm(1,muval,sval)
    }else{
      Pi[i]<-Pi[ival]
    }
  }
  Kval<-length(table(Pi))
  Posterior.K.galaxy<-c(Posterior.K.galaxy,Kval)
  if(irep %% 100 == 0){
    # record a density estimate every 100 sweeps
    fyv<-yv*0
    for(k in 1:n.galaxy){
      fyv<-fyv+dnorm(yv,Pi[k],sigma)/n.galaxy
    }
    fyv.samp.GAL<-cbind(fyv.samp.GAL,fyv)
  }
}
@

<<>>=
table(Posterior.K.galaxy)/length(Posterior.K.galaxy)
table(Prior.K.galaxy)/length(Prior.K.galaxy)
@

<<>>=
par(mar=c(4,4,2,0))
boxplot(c(Prior.K.galaxy,Posterior.K.galaxy)~rep(c(1,2),each=nreps),
  names=c("Prior","Posterior"),xlab='',ylab='K',main="Posterior for K",ylim=range(0,20))
hist(Y.galaxy,breaks=seq(0,4.0,by=0.05),ylim=range(0,15),col='gray',
  main="Galaxy Data",xlab="Velocity");box()
for(i in 1:20){lines(yv,fyv.samp.GAL[,i]*n.galaxy*0.05,lwd=1,col='red')}
@
<<>>=
#Galaxy data using marginalized algorithm
Y.galaxy<-MASS::galaxies/10000
n.galaxy<-length(Y.galaxy)
set.seed(1010)
index<-c(1:n.galaxy)
nreps<-10000
alpha<-2.0
sigma<-0.1
lambda<-0.5
index<-c(1:n.galaxy)
Kval<-5
Yq<-quantile(Y.galaxy,prob=c(0:Kval)/Kval)
Cvec<-as.numeric(cut(Y.galaxy,Yq,include.lowest=T))
Ysum.Vec<-rep(0,Kval)
for(k in 1:Kval){Ysum.Vec[k]<-sum(Y.galaxy[Cvec==k])}
Nvec<-as.numeric(table(Cvec))
Posterior.K.marg<-rep(0,nreps)
yv<-seq(0,4,by=0.01)
marg.fyv.samp.GAL<-numeric()
for(irep in 1:nreps){
  for(i in 1:n.galaxy){
    sum.Vec<-Ysum.Vec
    sum.Vec[Cvec[i]]<-sum.Vec[Cvec[i]]-Y.galaxy[i]
    nvec<-Nvec
    nvec[Cvec[i]]<-nvec[Cvec[i]]-1
    if(nvec[Cvec[i]] > 0){
      mvec<-sum.Vec/nvec
      Mvec<-(sum.Vec/sigma^2)/(nvec/sigma^2+1/lambda^2)
      Svec<-sigma^2*((nvec+1)/sigma^2+1/lambda^2)/(nvec/sigma^2+1/lambda^2)
      pvec<-nvec*dnorm(rep(Y.galaxy[i],Kval),Mvec,sqrt(Svec))
      pvec<-c(pvec,alpha*dnorm(Y.galaxy[i],0,sqrt(sigma^2 + lambda^2)))
      pvec<-pvec/sum(pvec)
      Ksamp<-sample(c(1:(Kval+1)),size=1,prob=pvec)
      Cvec[i]<-Ksamp
      if(Ksamp == Kval+1){
        Kval<-Kval+1
        Ysum.Vec<-c(sum.Vec,Y.galaxy[i])
        Nvec<-c(nvec,1)
      }else{
        Ysum.Vec<-sum.Vec
        Ysum.Vec[Ksamp]<-sum.Vec[Ksamp]+Y.galaxy[i]
        Nvec<-nvec
        Nvec[Ksamp]<-nvec[Ksamp]+1
      }
    }else{
      Ktmp<-Cvec[i]
      sum.Vec<-sum.Vec[-Ktmp]
      nvec<-nvec[-Ktmp]
      Kval<-Kval-1
      Cvec[Cvec > Ktmp]<-Cvec[Cvec > Ktmp]-1
      mvec<-sum.Vec/nvec
      Mvec<-(sum.Vec/sigma^2)/(nvec/sigma^2+1/lambda^2)
      Svec<-sigma^2*((nvec+1)/sigma^2+1/lambda^2)/(nvec/sigma^2+1/lambda^2)
      pvec<-nvec*dnorm(rep(Y.galaxy[i],Kval),Mvec,sqrt(Svec))
      pvec<-c(pvec,alpha*dnorm(Y.galaxy[i],0,sqrt(sigma^2 + lambda^2)))/(n.galaxy-1+alpha)
      pvec<-pvec/sum(pvec)
      Ksamp<-sample(c(1:(Kval+1)),size=1,prob=pvec)
      Cvec[i]<-Ksamp
      if(Ksamp == Kval+1){
        Kval<-Kval+1
        Ysum.Vec<-c(sum.Vec,Y.galaxy[i])
        Nvec<-c(nvec,1)
      }else{
        Ysum.Vec<-sum.Vec
        Ysum.Vec[Ksamp]<-sum.Vec[Ksamp]+Y.galaxy[i]
        Nvec<-nvec
        Nvec[Ksamp]<-nvec[Ksamp]+1
      }
    }
  }
  Posterior.K.marg[irep]<-Kval
  if(irep %% 100 == 0){
    MuVec<-(Ysum.Vec/sigma^2)/(Nvec/sigma^2+1/lambda^2)
    SigVec<-1/(Nvec/sigma^2+1/lambda^2)
    Xvec<-rnorm(Kval)*sqrt(SigVec)+MuVec
    fyv<-yv*0
    for(k in 1:Kval){
      fyv<-fyv+Nvec[k]*dnorm(yv,Xvec[k],sigma)/n.galaxy
    }
    marg.fyv.samp.GAL<-cbind(marg.fyv.samp.GAL,fyv)
  }
}
@

<<>>=
table(Posterior.K.marg)/nreps
table(Prior.K.galaxy)/length(Prior.K.galaxy)
@

<<>>=
par(mar=c(4,4,2,0))
boxplot(c(Prior.K.galaxy,Posterior.K.marg)~rep(c(1,2),each=nreps),
  names=c("Prior","Posterior"),xlab='',ylab='K',main="Posterior for K",ylim=range(0,20))
hist(Y.galaxy,breaks=seq(0,4.0,by=0.05),ylim=range(0,15),col='gray',
  main="Galaxy Data",xlab="Velocity");box()
for(i in 1:20){lines(yv,marg.fyv.samp.GAL[,i]*n.galaxy*0.05,lwd=1,col='red')}
@

\end{document}