\documentclass[notitlepage]{article}
\usepackage{../../Math598}
\usepackage{listings}
\usepackage{numprint}
\usepackage{enumerate}
\usepackage{multirow}
\usepackage{bbm}
\usepackage{bbold}
\usepackage{bm}
\usepackage{amsfonts,amsmath,amssymb}
\setlength{\parindent}{0pt}

\lstloadlanguages{R}
\definecolor{keywordcolor}{rgb}{0,0.6,0.6}
\definecolor{delimcolor}{rgb}{0.461,0.039,0.102}
\definecolor{Rcommentcolor}{rgb}{0.101,0.043,0.432}
\lstdefinestyle{Rsettings}{
  basicstyle=\ttfamily,
  breaklines=true,
  showstringspaces=false,
  keywords={if, else, function, theFunction, tmp}, % Write as many keywords
  otherkeywords={},
  commentstyle=\itshape\color{Rcommentcolor},
  keywordstyle=\color{keywordcolor},
  moredelim=[s][\color{delimcolor}]{"}{"},
}
\lstset{basicstyle=\ttfamily, numbers=none, literate={~} {$\sim$}{2}}

\begin{document}

<<>>=
library(knitr)
# global chunk options
opts_chunk$set(cache=TRUE, autodep=TRUE)
options(scipen=999)
options(repos=c(CRAN="https://cloud.r-project.org/"))
inline_hook <- function (x) {
    if (is.numeric(x)) {
        # ifelse does a vectorized comparison
        # If integer, print without decimal; otherwise print six decimal places
        res <- ifelse(x == round(x),
            sprintf("%6i", x),
            sprintf("%.6f", x)
        )
        paste(res, collapse = ", ")
    }
}
knit_hooks$set(inline = inline_hook)
@

\begin{center}
\textsclarge{MATH 598: TOPICS IN STATISTICS}

\vspace{0.1 in}

\textsc{Bayesian Mixture Modelling of the Galaxy Data}
\end{center}

The finite mixture model with $K$ components has density
\[
f(y;\bomega,\theta,K) = \sum_{k=1}^K \omega_k f_k(y;\theta_k)
\]
where $\theta_k$ are the parameters for component density $k$, and $0 < \omega_k < 1, k=1,\ldots, K$ with $\sum\limits_{k=1}^K \omega_k = 1$. Most typically the component densities are taken from the same parametric location-scale family (say, Gaussian) and differ only in location and scale. The model can be represented by introducing latent component indicators $z_1,\ldots,z_n$, so that
\[
f(y_i;\bomega,\theta,K) = \sum_{k=1}^K f(y_i|Z_i = k;\btheta) \Pr[Z_i=k]
\]
and $\Pr[Z_i=k] = \omega_k$ for $k=1,\ldots,K$.

\medskip

For exchangeable data $(y_1,\ldots,y_n)$ from this model, the likelihood does not permit analytical calculation of the posterior distribution, so MCMC is commonly used.
\begin{itemize}
\item If $K$ is \textbf{known}, MCMC is typically implemented using a completed data model
\begin{itemizeWide}
\item Auxiliary variables $z_1,\ldots,z_n$,
\[
f(y,z;\bomega,\theta,K) = \omega_z f_z(y;\theta_z) \qquad z \in \{1,2,\ldots,K\}
\]
\item A Gibbs sampler strategy updates the $\bz$ and $(\bomega,\theta)$ from their full conditional posterior distributions.
\medskip
\item Conditional on $(\bomega,\theta)$, the posterior distribution for each $z_i$ is a discrete distribution on $\{1,2,\ldots,K\}$, whereas conditional on the $\bz$, the posterior for the mixture weights $\bomega$ is a density on the $K$-dimensional simplex, and the posterior for the component parameters $\theta_k$ depends only on those $y_i$ for which $z_i = k$, for $k=1,\ldots,K$.
\end{itemizeWide}
\item If $K$ is \textbf{unknown}, then transdimensional MCMC is also needed.
\medskip
\begin{itemizeWide}
\item A discrete prior on $K$, typically on some finite set, completes the posterior specification, and yields a posterior distribution $\pi_n(K,\theta^{(K)},\bomega^{(K)})$.
\item In this model, dimension-changing moves correspond to changing the value of $K$, and in order to construct an efficient and tunable MCMC algorithm, moves which perturb $K$ by $\pm 1$ are typically considered.
\item For example, moves of type $K\longrightarrow K + 1$ are termed ``birth'' moves, and $K\longrightarrow K - 1$ are termed ``death'' moves.
\item The forward/reverse move pairs are then births from $K$ and deaths from $K+1$.
\item Inference about the parameter $K$ may be of primary interest in a given application.
\end{itemizeWide}
\end{itemize}

\bigskip

\textbf{Normal Model:} For a Normal mixture model:
\begin{itemize}
\item If $y$ is one-dimensional, each component density has two parameters, so a birth/death pair requires the introduction of three latent variables $\bu$: a new mean and variance, and a parameter to introduce a new component weight.
\smallskip
\item If $y$ is $d$-dimensional, $d+d(d+1)/2 + 1$ new scalar parameters are needed, that is, a new mean and covariance matrix, and a new component weight parameter.
\smallskip
\item \textbf{New location/scale parameters:} Either these new parameters are generated independently of the current model parameters from a proposal distribution, or a subset of the current components is selected and used to generate a new additional component.
\smallskip
\item \textbf{New weight parameters:} A random split or rescaling of the currently existing weights is used.
\end{itemize}

\newpage

\textbf{Galaxies Data:} The galaxies data are commonly used to illustrate mixture modelling. From the \texttt{R} help:
\begin{lstlisting}
galaxies                package:MASS                R Documentation

Velocities for 82 Galaxies

Description:

     A numeric vector of velocities in km/sec of 82 galaxies from 6
     well-separated conic sections of an 'unfilled' survey of the
     Corona Borealis region. Multimodality in such surveys is evidence
     for voids and superclusters in the far universe.
\end{lstlisting}

<<>>=
(Y<-MASS::galaxies/10000);n<-length(Y)
par(mar=c(4,4,2,0))
low<-0.5;high<-4;del<-0.1
hist(Y,breaks=seq(low,high,by=del),col='gray',ylim=range(0,20),main='Galaxy data')
box()
@

We can analyze these data using the EM algorithm.

<<>>=
mix.like<-function(muv,sigv,omv,yv){
    dvec<-rep(0,length(yv))
    for(i in 1:length(yv)){
        dvec[i]<-sum(omv*dnorm(yv[i],muv,sigv))
    }
    return(sum(log(dvec)))
}

em.analysis<-function(Yv,Kv,tolv=1e-10,iseed=10101){
    nv<-length(Yv)
    #Starting values
    omega.r<-omega.r1<-rep(1/Kv,Kv)
    if(iseed == 0){
        mu.vec.r<-mu.vec.r1<-quantile(Yv,prob=c(1:Kv)/(Kv+1))
    }else{
        set.seed(iseed)
        mu.vec.r<-mu.vec.r1<-runif(Kv,min(Yv),max(Yv))
    }
    sig.vec.r<-sig.vec.r1<-rep(sqrt(var(Yv)),Kv)
    varomega.r<-matrix(0,nrow=nv,ncol=Kv)
    Ym<-matrix(Yv,ncol=1)
    #Run the EM algorithm until convergence.
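    #Each pass of the loop below performs one EM iteration:
    # E-step: varomega.r[i,k] holds the posterior probability that y_i belongs
    #         to component k given the current parameter values
    # M-step: the weights, means and standard deviations are updated using
    #         these membership probabilities
    #Convergence is declared when the summed squared change in the parameters
    #(del) falls below tolv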
    del<-1
    while(del > tolv){
        for(i in 1:nv){
            for(k in 1:Kv){
                varomega.r[i,k]<-omega.r[k]*dnorm(Yv[i],mu.vec.r[k],sig.vec.r[k])
            }
            varomega.r[i,]<-varomega.r[i,]/sum(varomega.r[i,])
        }
        omega.r1<-apply(varomega.r,2,sum)
        omega.r1<-omega.r1/sum(omega.r1)
        for(k in 1:Kv){
            mu.vec.r1[k]<-sum(varomega.r[,k]*Yv)/sum(varomega.r[,k])
            sig.vec.r1[k]<-sqrt(sum(varomega.r[,k]*(Yv-mu.vec.r1[k])^2)/sum(varomega.r[,k]))
        }
        del<-sum((omega.r-omega.r1)^2)+sum((mu.vec.r-mu.vec.r1)^2)+
            sum((sig.vec.r-sig.vec.r1)^2)
        omega.r<-omega.r1
        mu.vec.r<-mu.vec.r1
        sig.vec.r<-sig.vec.r1
        like<-mix.like(mu.vec.r,sig.vec.r,omega.r,Yv)
    }
    return(list(mu=mu.vec.r,sig=sig.vec.r,omega=omega.r,Like=like))
}
@

The EM algorithm exploits the complete data likelihood obtained by including the latent component indicator variables $\bZ = (Z_1,\ldots,Z_n)$, and iteratively solves
\[
\theta^{(r+1)} = \arg \max_\theta Q(\theta;\theta^{(r)}) = \arg \max_\theta \E_{\bZ|\bY}\left[\log f(\bY,\bZ;\theta)|\by;\theta^{(r)} \right]
\]
for $r=1,2,\ldots,$ where
\[
f(\by,\bz;\theta) = \prod_{i=1}^n \prod_{k=1}^K \left\{ \omega_k f(y_i;\mu_k,\sigma_k^2) \right\}^{\Ind_{\{k\}}(z_i)}
\]
is the complete data likelihood.

\newpage

The EM algorithm can be sensitive to starting values due to the multi-modal nature of the likelihood, so we choose 20 random collections of starting means, and take the maximum value achieved across the different runs.

<<>>=
Like.mat<-matrix(0,nrow=8,ncol=20)
Like.vec<-j.vec<-rep(0,8)
for(k in 1:8){
    rmax<--Inf
    for(j in 1:20){
        f.em<-em.analysis(Y,tolv=1e-10,Kv=k,iseed=j^3+j^2+j+1)
        Like.mat[k,j]<-f.em$Like
        if(f.em$Like > rmax){
            Like.vec[k]<-f.em$Like
            rmax<-f.em$Like
            j.vec[k]<-j
        }
    }
}
@

We can perform model selection using AIC or BIC
\begin{align*}
\text{AIC :} & -2 \log \mathcal{L}_n^{(K)}(\widehat \theta^{(K)}) + 2 d_K \\[6pt]
\text{BIC :} & -2 \log \mathcal{L}_n^{(K)}(\widehat \theta^{(K)}) + d_K \log n
\end{align*}
where $d_K = 3 K -1$ is the number of free parameters ($K$ means, $K$ variances, and $K-1$ weights) -- we select the model for which each quantity is smallest.

<<>>=
np<-3*c(1:8)-1
Res<-rbind(Like.vec,-2*Like.vec+np*2,-2*Like.vec+np*log(n))
colnames(Res)<-1:8
row.names(Res)<-c("Max logLik.","AIC","BIC")
round(Res,3)
@

By AIC, there is little difference between fits for $k=5,6,7,8$, whereas for BIC, $k=3$ seems optimal.

<<>>=
mix.like.par<-function(muv,sigv,omv,yv,Kv){
    dvec<-rep(0,length(yv))
    for(k in 1:Kv){
        dvec<-dvec+omv[k]*dnorm(yv,muv[k],sigv[k])
    }
    return(sum(log(dvec)))
}
f.emA<-em.analysis(Y,tolv=1e-10,Kv=6,iseed=j.vec[6]^3+j.vec[6]^2+j.vec[6]+1)
f.emB<-em.analysis(Y,tolv=1e-10,Kv=3,iseed=j.vec[3]^3+j.vec[3]^2+j.vec[3]+1)
xvec<-seq(low,high,by=0.01)
yvec.A<-yvec.B<-xvec*0
for(k in 1:6){
    yvec.A<-yvec.A+f.emA$omega[k]*dnorm(xvec,f.emA$mu[k],f.emA$sig[k])
}
for(k in 1:3){
    yvec.B<-yvec.B+f.emB$omega[k]*dnorm(xvec,f.emB$mu[k],f.emB$sig[k])
}
par(mar=c(4,4,2,0))
low<-0.5;high<-4;del<-0.1
hist(Y,breaks=seq(low,high,by=del),col='gray',main='Galaxy data',ylim=range(0,20));box()
lines(xvec,yvec.A*n*del)
lines(xvec,yvec.B*n*del,col='red')
legend(0.5,20,c('K=6','K=3'),lty=1,col=c('black','red'))
@

For the Bayesian solution, we specify the following hierarchical prior
\[
\pi_0(K) \pi_0(\sigma_{K1}^2,\ldots,\sigma_{KK}^2|K) \pi_0(\mu_{K1},\ldots,\mu_{KK}|\sigma_{K1}^2,\ldots,\sigma_{KK}^2,K) \pi_0(\omega_{K1},\ldots,\omega_{KK}|K)
\]
\begin{itemizeWide}
\item Prior for $K$: we specify a $Poisson(\gamma)$ prior restricted to the positive integers, with pmf
\[
\pi_0(K) = \frac{\gamma^K \exp\{-\gamma\}}{(1-\exp\{-\gamma\})K!} \qquad K = 1,2,\ldots
\]
for parameter $\gamma > 0$.
\item Prior for $\sigma_{Kk}^2$: we specify independent priors
\[
\pi_0(\sigma_{K1}^2,\ldots,\sigma_{KK}^2|K) = \prod_{k=1}^K \pi_0(\sigma_{Kk}^2)
\]
where $\pi_0(\sigma_{Kk}^2) \equiv InverseGamma(a/2,b/2)$ for hyperparameters $a,b>0$.
\item Prior for $\mu_{Kk}$ given $\sigma_{Kk}^2$: we specify conditionally independent priors
\[
\pi_0(\mu_{K1},\ldots,\mu_{KK}|\sigma_{K1}^2,\ldots,\sigma_{KK}^2,K) = \prod_{k=1}^K \pi_0(\mu_{Kk}|\sigma_{Kk}^2)
\]
where $\pi_0(\mu_{Kk}|\sigma_{Kk}^2) \equiv Normal(\eta,\sigma_{Kk}^2/\lambda)$ for hyperparameters $\eta$ and $\lambda>0$.
\item Prior for $\omega_{K} = (\omega_{K1},\ldots,\omega_{KK})$: we specify a symmetric Dirichlet prior
\[
\pi_0(\omega_{K1},\ldots,\omega_{KK}|K) = \dfrac{\Gamma(K \alpha)}{\{\Gamma(\alpha)\}^K} \prod_{k=1}^K \omega_{Kk}^{\alpha - 1}
\]
on the $K$-dimensional unit simplex.
\end{itemizeWide}

For a reversible jump MCMC solution, we consider Birth and Death move pairs for the transdimensional moves:
\begin{itemize}
\item \textbf{Birth Move:} We consider an update that changes $K \lra K+1$, which requires the generation of a new component. We simulate the new mean and variance parameters from the prior distribution:
\begin{align*}
\sigma_{(K+1)(K+1)}^2 & \sim InverseGamma(a/2,b/2) \\[6pt]
\mu_{(K+1)(K+1)} & \sim Normal(\eta,\sigma_{(K+1)(K+1)}^2/\lambda)
\end{align*}
For the new component weight we generate $\psi \sim Beta(1,K)$, and then set the new component weight vector $\omega_{(K+1)}$ as
\[
(\omega_{(K+1)1},\ldots,\omega_{(K+1)(K+1)}) = ((1-\psi)\omega_{K1},\ldots,(1-\psi)\omega_{KK},\psi).
\]
The Jacobian of the transform is
\[
\frac{1}{(1-\omega_{(K+1)(K+1)})^K}
\]
and therefore the acceptance probability for the Birth move is the minimum of 1 and
\[
\dfrac{\pi_n(K+1,\mu_{(K+1)},\sigma_{(K+1)}^2,\omega_{(K+1)})}{\pi_n(K,\mu_{K},\sigma_{K}^2,\omega_{K}) \ p(\mu_{(K+1)(K+1)},\sigma_{(K+1)(K+1)}^2) \ p(\psi)} \times \frac{1}{(1-\omega_{(K+1)(K+1)})^K}
\]
\item \textbf{Death Move:} For the (reverse) Death move we consider the dimension change $K+1 \lra K$ by picking a component at random from the $K+1$ existing components to eliminate (labelled, without loss of generality, as component $K+1$), and then reversing the transformation from the Birth move to set
\[
\psi = \omega_{(K+1)(K+1)}, \qquad (\omega_{K1},\ldots,\omega_{KK}) = \frac{1}{(1-\psi)} (\omega_{(K+1)1},\ldots,\omega_{(K+1)K})
\]
for which the Jacobian is $(1-\psi)^K$, and the acceptance probability is the minimum of 1 and
\[
\dfrac{\pi_n(K,\mu_{K},\sigma_{K}^2,\omega_{K}) \ p(\mu_{(K+1)(K+1)},\sigma_{(K+1)(K+1)}^2) \ p(\psi)}{\pi_n(K+1,\mu_{(K+1)},\sigma_{(K+1)}^2,\omega_{(K+1)})} \times (1-\psi)^K
\]
\end{itemize}

<<>>=
mix.like.par<-function(muv,sigv,omv,yv,Kv){
    dvec<-rep(0,length(yv))
    for(k in 1:Kv){
        dvec<-dvec+omv[k]*dnorm(yv,muv[k],sigv[k])
    }
    return(sum(log(dvec)))
}
dinvgamma<-function(xv,a,b,log=F){
    lv<-a*log(b)-lgamma(a)-(a+1)*log(xv)-b/xv
    if(log){
        return(lv)
    }else{
        return(exp(lv))
    }
}
dpois0<-function(xv,lam,log=F){
    #Poisson pmf with zero removed; computed on the log scale throughout
    fv<-dpois(xv,lam,log=TRUE)-log(1-exp(-lam))
    if(log){return(fv)
    }else{return(exp(fv))
    }
}
@
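The helper functions above can be combined into a single evaluation of the log joint prior density for a configuration $(K,\mu_K,\sigma_K^2,\omega_K)$, which is the quantity needed repeatedly in the acceptance probabilities described earlier. The sketch below is one possible implementation; the function name \texttt{log.prior} and its argument names are illustrative only, and the same computation appears inline in the sampler later in this document.

<<>>=
log.prior<-function(muv,sigv,omv,Kv,av,bv,etav,lamv,alv,gamv){
    #log prior for the variances: independent InverseGamma(a/2,b/2)
    lp<-sum(dinvgamma(sigv^2,av/2,bv/2,log=T))
    #log prior for the means: Normal(eta,sigma^2/lambda) given the variances
    lp<-lp+sum(dnorm(muv,etav,sigv/sqrt(lamv),log=T))
    #log prior for the weights: symmetric Dirichlet(alpha,...,alpha)
    lp<-lp+lgamma(Kv*alv)-Kv*lgamma(alv)+(alv-1)*sum(log(omv))
    #log prior for K: zero-truncated Poisson(gamma)
    lp<-lp+dpois0(Kv,gamv,log=T)
    return(lp)
}
@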
Choosing the hyperparameters for this conjugate prior can be achieved using simulation methods.

<<>>=
#Prior elicitation
prior.al<-1
prior.eta<-2
prior.lam<-0.01
prior.a<-40
prior.b<-4
prior.K.gam<-2
s1<-1
a1<-1
s<-(1/sqrt(rgamma(10000,prior.a/2,prior.b/2)))
m<-rnorm(10000)*s/sqrt(prior.lam)+prior.eta
par(mar=c(4,4,2,0))
hist(m,col='gray',main=expression(paste('Sample from prior for ',mu)));box()
hist(s,col='gray',main=expression(paste('Sample from prior for ',sigma)));box()
@

To facilitate posterior sampling for the component parameters, we again introduce the latent component indicators $Z_1,\ldots,Z_n$, and sample their values from their full conditional posterior distributions as part of the Gibbs sampler. Conditional on the component parameters, we have that
\[
\Pr[Z_i = k | \by, \btheta,K] = \dfrac{\omega_{Kk} f_k(y_i;\mu_{Kk},\sigma_{Kk}^2)}{\sum\limits_{j=1}^K \omega_{Kj} f_j(y_i;\mu_{Kj},\sigma_{Kj}^2)} \qquad k=1,\ldots,K
\]
independently for $i=1,\ldots,n$. Conditional on the sampled $Z$ values we have a conjugate model: specifically, the posterior distribution of $\sigma_{Kk}^2$ is an $InverseGamma(a_{nk}/2,b_{nk}/2)$ distribution with parameters
\[
a_{nk} = a + n_{Kk} \qquad \qquad b_{nk} = b + \frac{n_{Kk} \lambda}{n_{Kk} + \lambda} (\overline{y}_{nKk}- \eta)^2 + \sum_{i: z_i = k} (y_i - \overline{y}_{nKk})^2
\]
and the conditional posterior for $\mu_{Kk}$ is $Normal(\eta_{nk},\sigma_{Kk}^2/\lambda_{nk})$ where
\[
\eta_{nk} = \frac{\lambda \eta + \sum\limits_{i: z_i = k} y_i }{\lambda + n_{Kk}} \qquad \qquad \lambda_{nk} = \lambda + n_{Kk}
\]
and where
\[
n_{Kk} = \sum_{i = 1}^n \Ind_{\{k \}} (z_i) \qquad \qquad \overline{y}_{nKk} = \dfrac{\sum\limits_{i = 1}^n \Ind_{\{k \}} (z_i) y_i}{\sum\limits_{i = 1}^n \Ind_{\{k \}} (z_i)}
\]
are the component-specific sample size and sample mean at the current iteration. Note that if a component is `empty', that is, if no data are allocated to a given cluster, the conditional posterior reduces to the conditional prior.

For $\omega_K$ we have a simple conjugate analysis given the latent indicators
\[
\omega_{K}| - \sim Dirichlet(K;\alpha + n_{K1},\ldots,\alpha + n_{KK})
\]
Note that the Birth and Death steps can be implemented using the collapsed form of the model, obtained by summing over the possible values of the latent component indicators.
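As a concrete illustration of the conjugate updates above, the following sketch draws the standard deviation and mean of component $k$ from their conditional posterior given the observations currently allocated to that component, reverting to the prior when the component is empty. The function name \texttt{sample.component} and its argument names are illustrative only; the sampler below performs the same update inline.

<<>>=
sample.component<-function(yk,av,bv,etav,lamv){
    nk<-length(yk)
    if(nk == 0){
        #Empty component: conditional posterior equals the prior
        sig<-1/sqrt(rgamma(1,av/2,bv/2))
        mu<-rnorm(1,etav,sig/sqrt(lamv))
    }else{
        #Conjugate update using a_nk, b_nk, eta_nk, lambda_nk as defined above
        post.a<-av+nk
        post.b<-bv+sum((yk-mean(yk))^2)+(nk*lamv)*(mean(yk)-etav)^2/(nk+lamv)
        sig<-1/sqrt(rgamma(1,post.a/2,post.b/2))
        mu<-rnorm(1,(sum(yk)+lamv*etav)/(nk+lamv),sig/sqrt(nk+lamv))
    }
    return(c(mu,sig))
}
#e.g. one draw for a hypothetical set of allocated observations:
#sample.component(Y[1:10],prior.a,prior.b,prior.eta,prior.lam)
@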
<<>>=
set.seed(234)
#Starting values
old.K<-4
old.omega<-old.omega1<-rep(1/old.K,old.K)
old.mu<-old.mu1<-as.numeric(quantile(Y,prob=c(1:old.K)/(old.K+1)))
old.sig<-old.sig1<-rep(sqrt(var(Y)),old.K)
old.z<-rep(0,n)
#Run the MCMC algorithm
nburn<-5000
nsamp<-10000
nthin<-5
nits<-nburn+nsamp*nthin
#Galaxy plot
low<-0.5
high<-4
del<-0.1
par(mar=c(4,4,2,1))
hist(Y,breaks=seq(low,high,by=del),ylim=range(0,20),col='gray',
    main='Galaxy data: posterior sampled pdfs');box()
mu.samp<-matrix(0,nrow=nsamp,ncol=100)
sig.samp<-matrix(0,nrow=nsamp,ncol=100)
omega.samp<-matrix(0,nrow=nsamp,ncol=100)
K.samp<-rep(0,nsamp)
ico<-0
for(iter in 1:nits){
    #Gibbs update for the latent component indicators
    pvec<-rep(0,old.K)
    for(i in 1:n){
        for(k in 1:old.K){
            pvec[k]<-old.omega[k]*dnorm(Y[i],old.mu[k],old.sig[k])
        }
        old.z[i]<-sample(1:old.K,prob=pvec,size=1)
    }
    #Gibbs update for the weights: Dirichlet draw via independent gammas
    for(k in 1:old.K){
        old.omega1[k]<-sum(old.z==k)
    }
    old.omega1<-rgamma(old.K,old.omega1+prior.al,1)
    old.omega<-old.omega1/sum(old.omega1)
    #Gibbs update for the component means and variances
    for(k in 1:old.K){
        yk<-Y[old.z==k]
        nk<-length(yk)
        if(nk == 0){
            old.sig[k]<-1/sqrt(rgamma(1,prior.a/2,prior.b/2))
            old.mu[k]<-rnorm(1,prior.eta,old.sig[k]/sqrt(prior.lam))
        }else{
            ssq<-sum((yk-mean(yk))^2)
            post.a<-prior.a+nk
            post.b<-prior.b+ssq+(nk*prior.lam)*(mean(yk)-prior.eta)^2/(nk+prior.lam)
            old.sig[k]<-1/sqrt(rgamma(1,post.a/2,post.b/2))
            post.sig<-old.sig[k]/sqrt(nk+prior.lam)
            post.mu<-(sum(yk)+prior.lam*prior.eta)/(nk+prior.lam)
            old.mu[k]<-rnorm(1,post.mu,post.sig)
        }
    }
    old.like<-mix.like.par(old.mu,old.sig,old.omega,Y,old.K)
    old.prior<-sum(dinvgamma(old.sig^2,prior.a/2,prior.b/2,log=T))+
        sum(dnorm(old.mu,prior.eta,old.sig/sqrt(prior.lam),log=T))+
        lgamma(old.K*prior.al)-old.K*lgamma(prior.al)+
        (prior.al-1)*sum(log(old.omega))+dpois0(old.K,prior.K.gam,log=T)
    #Transdimensional moves: Birth and Death
    if(runif(1) < 0.5){
        #Death
        new.K<-old.K-1
        if(new.K == 0){
            #Cannot remove last component, prior prob is zero
            mh<--Inf
        }else{
            k<-sample(1:old.K,size=1)
            new.mu<-old.mu[-k]
            new.sig<-old.sig[-k]
            new.omega<-old.omega[-k]/sum(old.omega[-k])
            new.like<-mix.like.par(new.mu,new.sig,new.omega,Y,new.K)
            new.prior<-sum(dinvgamma(new.sig^2,prior.a/2,prior.b/2,log=T))+
                sum(dnorm(new.mu,prior.eta,new.sig/sqrt(prior.lam),log=T))+
                lgamma(new.K*prior.al)-new.K*lgamma(prior.al)+
                (prior.al-1)*sum(log(new.omega))+dpois0(new.K,prior.K.gam,log=T)
            new.q<-dnorm(old.mu[k],prior.eta,old.sig[k]/sqrt(prior.lam),log=T)+
                dinvgamma(old.sig[k]^2,prior.a/2,prior.b/2,log=T)+
                dbeta(old.omega[k],1,new.K,log=T)
            logJac<-new.K*log(1-old.omega[k])
            mh<-(new.like+new.prior)-(old.like+old.prior)+new.q+logJac
        }
        if(log(runif(1)) < mh){
            old.K<-new.K
            old.mu<-new.mu
            old.sig<-new.sig
            old.omega<-new.omega
            old.like<-new.like
            old.prior<-new.prior
        }
    }else{
        #Birth
        new.K<-old.K+1
        new.s<-1/sqrt(rgamma(1,prior.a/2,prior.b/2))
        new.m<-rnorm(1,prior.eta,new.s/sqrt(prior.lam))
        new.mu<-c(old.mu,new.m)
        new.sig<-c(old.sig,new.s)
        psi<-rbeta(1,1,old.K)
        new.omega<-c(old.omega*(1-psi),psi)
        new.like<-mix.like.par(new.mu,new.sig,new.omega,Y,new.K)
        new.prior<-sum(dinvgamma(new.sig^2,prior.a/2,prior.b/2,log=T))+
            sum(dnorm(new.mu,prior.eta,new.sig/sqrt(prior.lam),log=T))+
            lgamma(new.K*prior.al)-new.K*lgamma(prior.al)+
            (prior.al-1)*sum(log(new.omega))+dpois0(new.K,prior.K.gam,log=T)
        new.q<-dnorm(new.m,prior.eta,new.s/sqrt(prior.lam),log=T)+
            dinvgamma(new.s^2,prior.a/2,prior.b/2,log=T)+
            dbeta(psi,1,old.K,log=T)
        logJac<--old.K*log(1-psi)
        mh<-(new.like+new.prior)-(old.like+old.prior)-new.q+logJac
        if(log(runif(1)) < mh){
            old.K<-new.K
            old.mu<-new.mu
            old.sig<-new.sig
            old.omega<-new.omega
            old.like<-new.like
            old.prior<-new.prior
        }
    }
    if(iter %% 1000 == 0){
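        #Every 1000 iterations, recompute the log-likelihood and log-prior at the
        #current state and overlay the current sampled mixture density on the
        #histogram as a visual check on the sampler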
        Ly<-mix.like.par(old.mu,old.sig,old.omega,Y,old.K)
        old.Py<-sum(dinvgamma(old.sig^2,prior.a/2,prior.b/2,log=T))+
            sum(dnorm(old.mu,prior.eta,old.sig/sqrt(prior.lam),log=T))+
            lgamma(old.K*prior.al)-old.K*lgamma(prior.al)+
            (prior.al-1)*sum(log(old.omega))+dpois0(old.K,prior.K.gam,log=T)
        xvec<-seq(low,high,by=0.01)
        yvec<-xvec*0
        for(k in 1:old.K){
            yvec<-yvec+old.omega[k]*dnorm(xvec,old.mu[k],old.sig[k])
        }
        lines(xvec,yvec*n*del)
    }
    if(iter > nburn & iter %% nthin == 0){
        ico<-ico+1
        mu.samp[ico,1:old.K]<-old.mu
        sig.samp[ico,1:old.K]<-old.sig
        omega.samp[ico,1:old.K]<-old.omega
        K.samp[ico]<-old.K
    }
}
@

The approximate posterior distribution for $K$ (under this prior specification) is computed from the sampled values of $K$.

<<>>=
K.post<-table(K.samp)/nsamp
K.post
@

Therefore the approximate posterior probability that $K=3$ is \Sexpr{K.post["3"]}.

\end{document}