\documentclass[notitlepage]{article}
\usepackage{../../Math598}
\usepackage{listings}
\usepackage{numprint}
\usepackage{enumerate}
\usepackage{multirow}
\usepackage{bbm}
\usepackage{amsfonts,amsmath,amssymb}
\setlength{\parindent}{0pt}
\lstloadlanguages{R}
\definecolor{keywordcolor}{rgb}{0,0.6,0.6}
\definecolor{delimcolor}{rgb}{0.461,0.039,0.102}
\definecolor{Rcommentcolor}{rgb}{0.101,0.043,0.432}
\lstdefinestyle{Rsettings}{
  basicstyle=\ttfamily,
  breaklines=true,
  showstringspaces=false,
  keywords={if, else, function, theFunction, tmp}, % R keywords to highlight
  otherkeywords={},
  commentstyle=\itshape\color{Rcommentcolor},
  keywordstyle=\color{keywordcolor},
  moredelim=[s][\color{delimcolor}]{"}{"},
}
\lstset{basicstyle=\ttfamily, numbers=none, literate={~} {$\sim$}{2}}
\begin{document}

<<>>=
library(knitr)
# global chunk options
opts_chunk$set(cache=TRUE, autodep=TRUE)
options(scipen=999)
options(repos=c(CRAN="https://cloud.r-project.org/"))
inline_hook <- function(x) {
  if (is.numeric(x)) {
    # format all inline numeric output to three decimal places
    res <- sprintf("%.3f", x)
    paste(res, collapse = ", ")
  } else x  # pass non-numeric values through unchanged
}
knit_hooks$set(inline = inline_hook)
@

\begin{center}
\textsclarge{MATH 598: TOPICS IN STATISTICS}

\vspace{0.1 in}

\textsc{Bayesian inference for GLMs}
\end{center}

In a generalized linear model (GLM) the conditional expectation of a response variable $Y_i$ is modelled as a function of covariates $\bx_i = (1,x_{i1},\ldots,x_{id})$, specifically
\[
\E[Y_i | \bx_i;\beta] = g(\bx_i \beta) = g(\beta_0 + \beta_1 x_{i1} + \cdots + \beta_{d} x_{id}) = \mu_i
\]
say, for some inverse link function $g(.)$, where the conditional distribution of $Y_i$ is an Exponential-Dispersion family model. In such a model
\[
\Var[Y_i | \bx_i;\beta] = \phi V(\mu_i)
\]
for dispersion parameter $\phi$ and variance function $V(.)$.

\bigskip

\textbf{Poisson Regression:} In a Poisson regression model,
\[
Y_i | \bx_i; \beta \sim Poisson(\mu_i)
\]
and the usual approach is to choose $g(t) = e^t$, so that
\[
\mu_i = \exp(\bx_i \beta) \qquad V(\mu_i) = \mu_i \qquad \phi = 1.
\]
Then the likelihood function is
\[
\mathcal{L}_n(\beta) = \prod_{i=1}^n f_{Y|X}(y_i | \bx_i; \beta) = \prod_{i=1}^n \frac{e^{-\mu_i}}{y_i!} \mu_i^{y_i}
\]
so that the log-likelihood is
\[
\ell_n(\beta) = \sum_{i=1}^n \left(-\mu_i + y_i \log \mu_i - \log y_i! \right) = \sum_{i=1}^n \left(-\exp\{\bx_i \beta \} + y_i \bx_i \beta - \log y_i! \right).
\]
To find the maximum likelihood estimate (mle), we differentiate with respect to $\beta$ and equate the result to zero: we have
\[
\dot{\ell}_n(\beta) = \sum_{i=1}^n \left(-\exp\{\bx_i \beta \} \bx_i^\top + y_i \bx_i^\top \right) = \sum_{i=1}^n \left(y_i -\exp\{\bx_i \beta \} \right)\bx_i^\top = \sum_{i=1}^n \left(y_i - \mu_i \right)\bx_i^\top
\]
and equating to the $(d+1)$-dimensional zero vector yields the mle $\widehat \beta_n$. The second derivative $\ddot{\ell}_n(\beta)$ evaluated at $\widehat \beta_n$ yields the curvature of the log-likelihood at the mle:
\[
\ddot{\ell}_n(\beta) = - \sum_{i=1}^n \exp\{\bx_i \beta \} \bx_i^\top \bx_i = - \sum_{i=1}^n \mu_i \bx_i^\top \bx_i = - \bX^\top \bD_n \bX
\]
say, where
\[
\bX = \begin{pmatrix}\bx_1 \\ \bx_2 \\ \vdots \\ \bx_n \end{pmatrix} \qquad \bD_n = \bD_n (\beta) = \text{diag}(\mu_1,\ldots,\mu_n).
\]
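The score equation has no closed-form solution; \texttt{glm} solves it by iteratively reweighted least squares, which for the canonical log link coincides with the Newton/Fisher-scoring update
\[
\beta^{(t+1)} = \beta^{(t)} + \left(\bX^\top \bD_n(\beta^{(t)}) \bX\right)^{-1} \dot{\ell}_n(\beta^{(t)}).
\]
The chunk below is a minimal sketch of this update; the simulated toy data, starting value, and iteration count are hypothetical choices for illustration only.

<<>>=
# Newton/Fisher-scoring sketch for the Poisson mle (illustration only)
set.seed(111)
xs<-rnorm(10); Xs<-cbind(1,xs)              # toy design matrix (hypothetical)
ys<-rpois(10,exp(1+2*xs))                   # toy responses
be.t<-c(log(mean(ys)),0)                    # safe starting value
for(iter in 1:25){
    mu.t<-as.vector(exp(Xs %*% be.t))
    score<-t(Xs) %*% (ys-mu.t)              # score vector: X^T (y - mu)
    hess<-t(Xs) %*% (mu.t*Xs)               # negative Hessian: X^T D X
    be.t<-be.t+as.vector(solve(hess,score)) # Newton step
}
rbind(newton=be.t,glm=coef(glm(ys~xs,family=poisson)))
@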
Using the quadratic approximation to the log-likelihood, we see that for large $n$, and under a flat (or sufficiently diffuse) prior, the posterior distribution is approximately Normal,
\[
\pi_n(\beta) \approx Normal\left( \widehat \beta_n, (\bX^\top \widehat \bD_n \bX)^{-1} \right)
\]
where $\widehat \bD_n = \bD_n (\widehat \beta_n)$. The mle and variance matrix can be computed using \texttt{glm} in \texttt{R}.

<<>>=
set.seed(234)
n<-20
x<-rnorm(n,-1,2)
X<-cbind(1,x)
be<-c(1,2)
mu<-exp(X %*% be)
y<-rpois(n,mu)
fit1<-glm(y~x,family=poisson)
th.n<-coef(fit1)
post.var.n<-summary(fit1)$cov.unscaled
th.n
post.var.n
#Exact log-likelihood
log.like<-function(th0,th1,xv,yv){
    muv<-exp(th0+th1*xv)
    return(sum(dpois(yv,muv,log=T)))
}
f <- Vectorize(log.like,vectorize.args=c("th0","th1"))
th0v<-seq(0.7,1.2,by=0.001)
th1v<-seq(1.7,2.2,by=0.001)
lmat<-outer(th0v,th1v,f,xv=x,yv=y)
lmax<-max(lmat)
#Quadratic approximation
quad.func<-function(th0,th1,m,M){
    tv<-c(th0,th1)-m
    q<--0.5*t(tv) %*% solve(M) %*% tv
    return(q[1])
}
f <- Vectorize(quad.func,vectorize.args=c("th0","th1"))
qmat<-outer(th0v,th1v,f,m=th.n,M=post.var.n)+lmax
#Plot
library(fields,quietly=TRUE)
par(pty='s',mar=c(2,3,2,2))
colfunc <- colorRampPalette(c("blue","lightblue","white","yellow","orange","red"))
image.plot(th0v,th1v,lmat,col=colfunc(100),
    xlab=expression(beta[0]),ylab=expression(beta[1]),cex.axis=0.8)
contour(th0v,th1v,lmat,add=T,levels=c(seq(-90,-40,by=10),seq(-30,-20,by=2)))
title(expression(paste('Log-likelihood ',l[n](beta[0],beta[1]))))
points(th.n[1],th.n[2],pch=4)
image.plot(th0v,th1v,qmat,col=colfunc(100),
    xlab=expression(beta[0]),ylab=expression(beta[1]),cex.axis=0.8)
contour(th0v,th1v,qmat,add=T,levels=c(seq(-90,-40,by=10),seq(-30,-20,by=2)))
title(expression(paste('Quadratic approximation ',q[n](beta[0],beta[1]))))
points(th.n[1],th.n[2],pch=4)
@

\textbf{Logistic Regression:} In a logistic regression model,
\[
Y_i | \bx_i; \beta \sim Bernoulli(\mu_i)
\]
and the usual approach is to choose $g(t) = e^t/(1+e^t) = \expit\{t\}$, so that
\[
\mu_i = \expit(\bx_i \beta) \qquad V(\mu_i) = \mu_i (1-\mu_i) \qquad \phi = 1.
\]
Then the likelihood function is
\[
\mathcal{L}_n(\beta) = \prod_{i=1}^n f_{Y|X}(y_i | \bx_i; \beta) = \prod_{i=1}^n \mu_i^{y_i} (1-\mu_i)^{1-y_i}
\]
so that the log-likelihood is
\[
\ell_n(\beta) = \sum_{i=1}^n \left( y_i \log \mu_i + (1-y_i) \log (1-\mu_i) \right) = \sum_{i=1}^n \left(y_i \bx_i \beta - \log \left(1+\exp(\bx_i \beta)\right) \right).
\]
To find the mle, we again differentiate with respect to $\beta$ and equate the result to zero: we have
\[
\dot{\ell}_n(\beta) = \sum_{i=1}^n \left(y_i \bx_i^\top - \expit(\bx_i \beta) \bx_i^\top \right) = \sum_{i=1}^n \left(y_i - \mu_i \right)\bx_i^\top
\]
and equating to the $(d+1)$-dimensional zero vector yields the mle $\widehat \beta_n$. The second derivative $\ddot{\ell}_n(\beta)$ evaluated at $\widehat \beta_n$ yields the curvature of the log-likelihood at the mle:
\[
\ddot{\ell}_n(\beta) = - \sum_{i=1}^n \mu_i (1-\mu_i) \bx_i^\top \bx_i = - \bX^\top \bD_n \bX
\]
say, where
\[
\bX = \begin{pmatrix}\bx_1 \\ \bx_2 \\ \vdots \\ \bx_n \end{pmatrix} \qquad \bD_n = \text{diag}(\mu_1 (1-\mu_1),\ldots,\mu_n (1-\mu_n)).
\]
Using the quadratic approximation to the log-likelihood, the posterior distribution is again approximately Normal for large $n$ under a flat prior,
\[
\pi_n(\beta) \approx Normal\left( \widehat \beta_n, (\bX^\top \widehat \bD_n \bX)^{-1} \right)
\]
with $\widehat \bD_n = \bD_n(\widehat \beta_n)$. The mle and variance matrix can again be computed using \texttt{glm} in \texttt{R}.
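The Normal approximation makes approximate posterior summaries immediate: one can draw directly from the Laplace approximation. The chunk below is a minimal sketch, assuming a flat prior and re-using the objects \texttt{th.n} and \texttt{post.var.n} from the Poisson fit above (the seed and number of draws are arbitrary illustrative choices); the same recipe applies verbatim to the logistic fits below.

<<>>=
# Draws from the Normal (Laplace) approximation to the posterior of beta,
# using the Poisson fit from the earlier chunk (flat prior assumed)
library(MASS)
set.seed(777)
be.samp<-mvrnorm(5000,mu=th.n,Sigma=post.var.n)
# approximate 95% posterior credible intervals for beta0 and beta1
apply(be.samp,2,quantile,probs=c(0.025,0.975))
@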
\medskip

In the following plots, the exact log-likelihood and the corresponding quadratic approximation are plotted for
\begin{enumerate}[(i)]
\item $n=20$;
\item $n=200$;
\item $n=2000$.
\end{enumerate}
Even at the largest sample size, the quadratic approximation is accurate only quite close to the maximum value.

<<>>=
set.seed(2634)
n<-20
x<-rnorm(n,-1,1)
X<-cbind(1,x)
be<-c(1,2)
expit<-function(x){return(1/(1+exp(-x)))}
mu<-expit(X %*% be)
y<-rbinom(n,1,mu)
table(y)
fit1<-glm(y~x,family=binomial)
th.n<-coef(fit1)
post.var.n<-summary(fit1)$cov.unscaled
th.n
post.var.n
#Exact log-likelihood
log.like<-function(th0,th1,xv,yv){
    muv<-expit(th0+th1*xv)
    return(sum(dbinom(yv,1,muv,log=T)))
}
f <- Vectorize(log.like,vectorize.args=c("th0","th1"))
th0v<-seq(-6,6,by=0.05)
th1v<-seq(-6,6,by=0.05)
lmat<-outer(th0v,th1v,f,xv=x,yv=y)
lmax<-max(lmat)
#Quadratic approximation
quad.func<-function(th0,th1,m,M){
    tv<-c(th0,th1)-m
    q<--0.5*t(tv) %*% solve(M) %*% tv
    return(q[1])
}
f <- Vectorize(quad.func,vectorize.args=c("th0","th1"))
qmat<-outer(th0v,th1v,f,m=th.n,M=post.var.n)+lmax
#Plot
library(fields,quietly=TRUE)
par(pty='s',mar=c(2,3,2,2))
colfunc <- colorRampPalette(c("blue","lightblue","white","yellow","orange","red"))
image.plot(th0v,th1v,lmat,col=colfunc(100),
    xlab=expression(beta[0]),ylab=expression(beta[1]),cex.axis=0.8)
contour(th0v,th1v,lmat,add=T,nlevels=20)
title(expression(paste('Log-likelihood ',l[n](beta[0],beta[1]))))
points(th.n[1],th.n[2],pch=4)
image.plot(th0v,th1v,qmat,col=colfunc(100),
    xlab=expression(beta[0]),ylab=expression(beta[1]),cex.axis=0.8)
contour(th0v,th1v,qmat,add=T,nlevels=20)
title(expression(paste('Quadratic approximation ',q[n](beta[0],beta[1]))))
points(th.n[1],th.n[2],pch=4)
@

<<>>=
set.seed(2634)
n<-200
x<-rnorm(n,-1,1)
X<-cbind(1,x)
be<-c(1,2)
expit<-function(x){return(1/(1+exp(-x)))}
mu<-expit(X %*% be)
y<-rbinom(n,1,mu)
table(y)
fit1<-glm(y~x,family=binomial)
th.n<-coef(fit1)
post.var.n<-summary(fit1)$cov.unscaled
th.n
post.var.n
#Exact log-likelihood
log.like<-function(th0,th1,xv,yv){
    muv<-expit(th0+th1*xv)
    return(sum(dbinom(yv,1,muv,log=T)))
}
f <- Vectorize(log.like,vectorize.args=c("th0","th1"))
th0v<-seq(-6,6,by=0.05)
th1v<-seq(-6,6,by=0.05)
lmat<-outer(th0v,th1v,f,xv=x,yv=y)
lmax<-max(lmat)
#Quadratic approximation
quad.func<-function(th0,th1,m,M){
    tv<-c(th0,th1)-m
    q<--0.5*t(tv) %*% solve(M) %*% tv
    return(q[1])
}
f <- Vectorize(quad.func,vectorize.args=c("th0","th1"))
qmat<-outer(th0v,th1v,f,m=th.n,M=post.var.n)+lmax
#Plot
library(fields,quietly=TRUE)
par(pty='s',mar=c(2,3,2,2))
colfunc <- colorRampPalette(c("blue","lightblue","white","yellow","orange","red"))
image.plot(th0v,th1v,lmat,col=colfunc(100),
    xlab=expression(beta[0]),ylab=expression(beta[1]),cex.axis=0.8)
contour(th0v,th1v,lmat,add=T,nlevels=20)
title(expression(paste('Log-likelihood ',l[n](beta[0],beta[1]))))
points(th.n[1],th.n[2],pch=4)
image.plot(th0v,th1v,qmat,col=colfunc(100),
    xlab=expression(beta[0]),ylab=expression(beta[1]),cex.axis=0.8)
contour(th0v,th1v,qmat,add=T,nlevels=20)
title(expression(paste('Quadratic approximation ',q[n](beta[0],beta[1]))))
points(th.n[1],th.n[2],pch=4)
@
<<>>=
set.seed(2634)
n<-2000
x<-rnorm(n,-1,1)
X<-cbind(1,x)
be<-c(1,2)
expit<-function(x){return(1/(1+exp(-x)))}
mu<-expit(X %*% be)
y<-rbinom(n,1,mu)
table(y)
fit1<-glm(y~x,family=binomial)
th.n<-coef(fit1)
post.var.n<-summary(fit1)$cov.unscaled
th.n
post.var.n
#Exact log-likelihood
log.like<-function(th0,th1,xv,yv){
    muv<-expit(th0+th1*xv)
    return(sum(dbinom(yv,1,muv,log=T)))
}
f <- Vectorize(log.like,vectorize.args=c("th0","th1"))
th0v<-seq(-6,6,by=0.05)
th1v<-seq(-6,6,by=0.05)
lmat<-outer(th0v,th1v,f,xv=x,yv=y)
lmax<-max(lmat)
#Quadratic approximation
quad.func<-function(th0,th1,m,M){
    tv<-c(th0,th1)-m
    q<--0.5*t(tv) %*% solve(M) %*% tv
    return(q[1])
}
f <- Vectorize(quad.func,vectorize.args=c("th0","th1"))
qmat<-outer(th0v,th1v,f,m=th.n,M=post.var.n)+lmax
#Plot
library(fields,quietly=TRUE)
par(pty='s',mar=c(2,3,2,2))
colfunc <- colorRampPalette(c("blue","lightblue","white","yellow","orange","red"))
image.plot(th0v,th1v,lmat,col=colfunc(100),
    xlab=expression(beta[0]),ylab=expression(beta[1]),cex.axis=0.8)
contour(th0v,th1v,lmat,add=T,nlevels=20)
title(expression(paste('Log-likelihood ',l[n](beta[0],beta[1]))))
points(th.n[1],th.n[2],pch=4)
image.plot(th0v,th1v,qmat,col=colfunc(100),
    xlab=expression(beta[0]),ylab=expression(beta[1]),cex.axis=0.8)
contour(th0v,th1v,qmat,add=T,nlevels=20)
title(expression(paste('Quadratic approximation ',q[n](beta[0],beta[1]))))
points(th.n[1],th.n[2],pch=4)
@
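The quality of the quadratic approximation can also be checked by sampling from the exact posterior. The chunk below is a minimal random-walk Metropolis sketch targeting the exact flat-prior posterior for the $n=2000$ logistic fit, re-using \texttt{log.like}, \texttt{x}, \texttt{y}, \texttt{th.n} and \texttt{post.var.n} from the chunk above; the proposal scale, chain length, and seed are arbitrary illustrative choices, and the chain is started at the mode so no burn-in is discarded.

<<>>=
# Random-walk Metropolis sketch targeting the exact (flat-prior) posterior
# for the n=2000 logistic fit; proposal increments use the Laplace covariance
set.seed(555)
nsamp<-5000
R<-chol(post.var.n)                        # so rnorm(2) %*% R ~ N(0, post.var.n)
be.curr<-as.vector(th.n)                   # start at the mode
ll.curr<-log.like(be.curr[1],be.curr[2],x,y)
be.samp<-matrix(0,nsamp,2)
for(s in 1:nsamp){
    be.prop<-be.curr+as.vector(rnorm(2) %*% R)
    ll.prop<-log.like(be.prop[1],be.prop[2],x,y)
    if(log(runif(1)) < ll.prop-ll.curr){   # Metropolis accept/reject
        be.curr<-be.prop; ll.curr<-ll.prop
    }
    be.samp[s,]<-be.curr
}
# posterior means and sds from MCMC vs. the Laplace approximation
rbind(mcmc=colMeans(be.samp),laplace=as.vector(th.n))
rbind(mcmc=apply(be.samp,2,sd),laplace=sqrt(diag(post.var.n)))
@

\end{document}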