\documentclass[notitlepage]{article}
\usepackage{../../Math598}
\usepackage{listings}
\usepackage{numprint}
\usepackage{enumerate}
\usepackage{multirow}
\usepackage{bbm}
\usepackage{amsfonts,amsmath,amssymb}
\setlength{\parindent}{0pt}
\lstloadlanguages{R}
\definecolor{keywordcolor}{rgb}{0,0.6,0.6}
\definecolor{delimcolor}{rgb}{0.461,0.039,0.102}
\definecolor{Rcommentcolor}{rgb}{0.101,0.043,0.432}
\lstdefinestyle{Rsettings}{
  basicstyle=\ttfamily,
  breaklines=true,
  showstringspaces=false,
  keywords={if, else, function, theFunction, tmp}, % R keywords to highlight
  otherkeywords={},
  commentstyle=\itshape\color{Rcommentcolor},
  keywordstyle=\color{keywordcolor},
  moredelim=[s][\color{delimcolor}]{"}{"},
}
\lstset{basicstyle=\ttfamily, numbers=none, literate={~} {$\sim$}{2}}
\begin{document}

<<>>=
library(knitr)
# global chunk options
opts_chunk$set(cache=TRUE, autodep=TRUE)
options(scipen=999)
options(repos=c(CRAN="https://cloud.r-project.org/"))
inline_hook <- function(x) {
  if (is.numeric(x)) {
    # format all inline numeric output to three decimal places
    res <- sprintf("%.3f", x)
    paste(res, collapse = ", ")
  } else x  # pass non-numeric values through unchanged
}
knit_hooks$set(inline = inline_hook)
@

\begin{center}
\textsclarge{MATH 598: TOPICS IN STATISTICS}

\vspace{0.1 in}

\textsc{Bayesian inference for GLMs}
\end{center}

In a generalized linear model (GLM) the conditional expectation of a response variable $Y_i$ is modelled as a function of covariates $\bx_i = (1,x_{i1},\ldots,x_{id})$, specifically
\[
\E[Y_i | \bx_i;\beta] = g(\bx_i \beta) = g(\beta_0 + \beta_1 x_{i1} + \cdots + \beta_{d} x_{id}) = \mu_i
\]
say, for some inverse link function $g(.)$, where the conditional distribution of $Y_i$ is an Exponential-Dispersion family model. In such a model
\[
\Var[Y_i | \bx_i;\beta] = \phi V(\mu_i)
\]
for dispersion parameter $\phi$ and variance function $V(.)$.

\bigskip

\textbf{Poisson Regression:} In a Poisson regression model,
\[
Y_i | \bx_i; \beta \sim Poisson(\mu_i)
\]
and the usual approach is to choose $g(t) = e^t$, so that
\[
\mu_i = \exp(\bx_i \beta) \qquad V(\mu_i) = \mu_i \qquad \phi = 1.
\]
Then the likelihood function is
\[
\mathcal{L}_n(\beta) = \prod_{i=1}^n f_{Y|X}(y_i | \bx_i; \beta) = \prod_{i=1}^n \frac{e^{-\mu_i}}{y_i!} \mu_i^{y_i}
\]
so that the log-likelihood is
\[
\ell_n(\beta) = \sum_{i=1}^n \left(-\mu_i + y_i \log \mu_i - \log y_i! \right) = \sum_{i=1}^n \left(-\exp\{\bx_i \beta \} + y_i \bx_i \beta - \log y_i! \right).
\]
To find the maximum likelihood estimate (mle), we differentiate with respect to $\beta$ and equate the result to zero: we have
\[
\dot{\ell}_n(\beta) = \sum_{i=1}^n \left(-\exp\{\bx_i \beta \} \bx_i^\top + y_i \bx_i^\top \right) = \sum_{i=1}^n \left(y_i -\exp\{\bx_i \beta \} \right)\bx_i^\top = \sum_{i=1}^n \left(y_i - \mu_i \right)\bx_i^\top
\]
and equating to the $(d+1)$-dimensional zero vector yields the mle $\widehat \beta_n$. The second derivative $\ddot{\ell}_n(\beta)$ evaluated at $\widehat \beta_n$ yields the curvature of the log-likelihood at the mle:
\[
\ddot{\ell}_n(\beta) = - \sum_{i=1}^n \exp\{\bx_i \beta \} \bx_i^\top \bx_i = - \sum_{i=1}^n \mu_i \bx_i^\top \bx_i = - \bX^\top \bD_n \bX
\]
say, where
\[
\bX = \begin{pmatrix}\bx_1 \\ \bx_2 \\ \vdots \\ \bx_n \end{pmatrix} \qquad \bD_n = \bD_n (\beta) = \text{diag}(\mu_1,\ldots,\mu_n).
\]
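The score equation has no closed-form solution; \texttt{glm} solves it by iteratively reweighted least squares, which for the canonical log link coincides with the Newton/Fisher-scoring update
\[
\beta^{(t+1)} = \beta^{(t)} + \left(\bX^\top \bD_n(\beta^{(t)}) \bX\right)^{-1} \dot{\ell}_n(\beta^{(t)}).
\]
The chunk below is a minimal sketch of this update; the simulated toy data, starting value, and iteration count are hypothetical choices for illustration only.

<<>>=
# Newton/Fisher-scoring sketch for the Poisson mle (illustration only)
set.seed(111)
xs<-rnorm(10); Xs<-cbind(1,xs)              # toy design matrix (hypothetical)
ys<-rpois(10,exp(1+2*xs))                   # toy responses
be.t<-c(log(mean(ys)),0)                    # safe starting value
for(iter in 1:25){
    mu.t<-as.vector(exp(Xs %*% be.t))
    score<-t(Xs) %*% (ys-mu.t)              # score vector: X^T (y - mu)
    hess<-t(Xs) %*% (mu.t*Xs)               # negative Hessian: X^T D X
    be.t<-be.t+as.vector(solve(hess,score)) # Newton step
}
rbind(newton=be.t,glm=coef(glm(ys~xs,family=poisson)))
@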
Using the quadratic approximation to the log-likelihood, we see that for large $n$, and under a flat (or sufficiently diffuse) prior, the posterior distribution is approximately Normal,
\[
\pi_n(\beta) \approx Normal\left( \widehat \beta_n, (\bX^\top \widehat \bD_n \bX)^{-1} \right)
\]
where $\widehat \bD_n = \bD_n (\widehat \beta_n)$. The mle and variance matrix can be computed using \texttt{glm} in \texttt{R}.

<<>>=
set.seed(234)
n<-20
x<-rnorm(n,-1,2)
X<-cbind(1,x)
be<-c(1,2)
mu<-exp(X %*% be)
y<-rpois(n,mu)
fit1<-glm(y~x,family=poisson)
th.n<-coef(fit1)
post.var.n<-summary(fit1)$cov.unscaled
th.n
post.var.n
#Exact log-likelihood
log.like<-function(th0,th1,xv,yv){
    muv<-exp(th0+th1*xv)
    return(sum(dpois(yv,muv,log=T)))
}
f <- Vectorize(log.like,vectorize.args=c("th0","th1"))
th0v<-seq(0.7,1.2,by=0.001)
th1v<-seq(1.7,2.2,by=0.001)
lmat<-outer(th0v,th1v,f,xv=x,yv=y)
lmax<-max(lmat)
#Quadratic approximation
quad.func<-function(th0,th1,m,M){
    tv<-c(th0,th1)-m
    q<--0.5*t(tv) %*% solve(M) %*% tv
    return(q[1])
}
f <- Vectorize(quad.func,vectorize.args=c("th0","th1"))
qmat<-outer(th0v,th1v,f,m=th.n,M=post.var.n)+lmax
#Plot
library(fields,quietly=TRUE)
par(pty='s',mar=c(2,3,2,2))
colfunc <- colorRampPalette(c("blue","lightblue","white","yellow","orange","red"))
image.plot(th0v,th1v,lmat,col=colfunc(100),
    xlab=expression(beta[0]),ylab=expression(beta[1]),cex.axis=0.8)
contour(th0v,th1v,lmat,add=T,levels=c(seq(-90,-40,by=10),seq(-30,-20,by=2)))
title(expression(paste('Log-likelihood ',l[n](beta[0],beta[1]))))
points(th.n[1],th.n[2],pch=4)
image.plot(th0v,th1v,qmat,col=colfunc(100),
    xlab=expression(beta[0]),ylab=expression(beta[1]),cex.axis=0.8)
contour(th0v,th1v,qmat,add=T,levels=c(seq(-90,-40,by=10),seq(-30,-20,by=2)))
title(expression(paste('Quadratic approximation ',q[n](beta[0],beta[1]))))
points(th.n[1],th.n[2],pch=4)
@

\textbf{Logistic Regression:} In a logistic regression model,
\[
Y_i | \bx_i; \beta \sim Bernoulli(\mu_i)
\]
and the usual approach is to choose $g(t) = e^t/(1+e^t) = \expit\{t\}$, so that
\[
\mu_i = \expit(\bx_i \beta) \qquad V(\mu_i) = \mu_i (1-\mu_i) \qquad \phi = 1.
\]
Then the likelihood function is
\[
\mathcal{L}_n(\beta) = \prod_{i=1}^n f_{Y|X}(y_i | \bx_i; \beta) = \prod_{i=1}^n \mu_i^{y_i} (1-\mu_i)^{1-y_i}
\]
so that the log-likelihood is
\[
\ell_n(\beta) = \sum_{i=1}^n \left( y_i \log \mu_i + (1-y_i) \log (1-\mu_i) \right) = \sum_{i=1}^n \left(y_i \bx_i \beta - \log \left(1+\exp(\bx_i \beta)\right) \right).
\]
To find the mle, we again differentiate with respect to $\beta$ and equate the result to zero: we have
\[
\dot{\ell}_n(\beta) = \sum_{i=1}^n \left(y_i \bx_i^\top - \expit(\bx_i \beta) \bx_i^\top \right) = \sum_{i=1}^n \left(y_i - \mu_i \right)\bx_i^\top
\]
and equating to the $(d+1)$-dimensional zero vector yields the mle $\widehat \beta_n$. The second derivative $\ddot{\ell}_n(\beta)$ evaluated at $\widehat \beta_n$ yields the curvature of the log-likelihood at the mle:
\[
\ddot{\ell}_n(\beta) = - \sum_{i=1}^n \mu_i (1-\mu_i) \bx_i^\top \bx_i = - \bX^\top \bD_n \bX
\]
say, where
\[
\bX = \begin{pmatrix}\bx_1 \\ \bx_2 \\ \vdots \\ \bx_n \end{pmatrix} \qquad \bD_n = \text{diag}(\mu_1 (1-\mu_1),\ldots,\mu_n (1-\mu_n)).
\]
Using the quadratic approximation to the log-likelihood, the posterior distribution is again approximately Normal for large $n$ under a flat prior,
\[
\pi_n(\beta) \approx Normal\left( \widehat \beta_n, (\bX^\top \widehat \bD_n \bX)^{-1} \right)
\]
with $\widehat \bD_n = \bD_n(\widehat \beta_n)$. The mle and variance matrix can again be computed using \texttt{glm} in \texttt{R}.
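The Normal approximation makes approximate posterior summaries immediate: one can draw directly from the Laplace approximation. The chunk below is a minimal sketch, assuming a flat prior and re-using the objects \texttt{th.n} and \texttt{post.var.n} from the Poisson fit above (the seed and number of draws are arbitrary illustrative choices); the same recipe applies verbatim to the logistic fits below.

<<>>=
# Draws from the Normal (Laplace) approximation to the posterior of beta,
# using the Poisson fit from the earlier chunk (flat prior assumed)
library(MASS)
set.seed(777)
be.samp<-mvrnorm(5000,mu=th.n,Sigma=post.var.n)
# approximate 95% posterior credible intervals for beta0 and beta1
apply(be.samp,2,quantile,probs=c(0.025,0.975))
@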
\medskip

In the following plots, the exact log-likelihood and the corresponding quadratic approximation are plotted for
\begin{enumerate}[(i)]
\item $n=20$;
\item $n=200$;
\item $n=2000$.
\end{enumerate}
Even at the largest sample size, the quadratic approximation is accurate only quite close to the maximum value.

<<>>=
set.seed(2634)
n<-20
x<-rnorm(n,-1,1)
X<-cbind(1,x)
be<-c(1,2)
expit<-function(x){return(1/(1+exp(-x)))}
mu<-expit(X %*% be)
y<-rbinom(n,1,mu)
table(y)
fit1<-glm(y~x,family=binomial)
th.n<-coef(fit1)
post.var.n<-summary(fit1)$cov.unscaled
th.n
post.var.n
#Exact log-likelihood
log.like<-function(th0,th1,xv,yv){
    muv<-expit(th0+th1*xv)
    return(sum(dbinom(yv,1,muv,log=T)))
}
f <- Vectorize(log.like,vectorize.args=c("th0","th1"))
th0v<-seq(-6,6,by=0.05)
th1v<-seq(-6,6,by=0.05)
lmat<-outer(th0v,th1v,f,xv=x,yv=y)
lmax<-max(lmat)
#Quadratic approximation
quad.func<-function(th0,th1,m,M){
    tv<-c(th0,th1)-m
    q<--0.5*t(tv) %*% solve(M) %*% tv
    return(q[1])
}
f <- Vectorize(quad.func,vectorize.args=c("th0","th1"))
qmat<-outer(th0v,th1v,f,m=th.n,M=post.var.n)+lmax
#Plot
library(fields,quietly=TRUE)
par(pty='s',mar=c(2,3,2,2))
colfunc <- colorRampPalette(c("blue","lightblue","white","yellow","orange","red"))
image.plot(th0v,th1v,lmat,col=colfunc(100),
    xlab=expression(beta[0]),ylab=expression(beta[1]),cex.axis=0.8)
contour(th0v,th1v,lmat,add=T,nlevels=20)
title(expression(paste('Log-likelihood ',l[n](beta[0],beta[1]))))
points(th.n[1],th.n[2],pch=4)
image.plot(th0v,th1v,qmat,col=colfunc(100),
    xlab=expression(beta[0]),ylab=expression(beta[1]),cex.axis=0.8)
contour(th0v,th1v,qmat,add=T,nlevels=20)
title(expression(paste('Quadratic approximation ',q[n](beta[0],beta[1]))))
points(th.n[1],th.n[2],pch=4)
@

<<>>=
set.seed(2634)
n<-200
x<-rnorm(n,-1,1)
X<-cbind(1,x)
be<-c(1,2)
expit<-function(x){return(1/(1+exp(-x)))}
mu<-expit(X %*% be)
y<-rbinom(n,1,mu)
table(y)
fit1<-glm(y~x,family=binomial)
th.n<-coef(fit1)
post.var.n<-summary(fit1)$cov.unscaled
th.n
post.var.n
#Exact log-likelihood
log.like<-function(th0,th1,xv,yv){
    muv<-expit(th0+th1*xv)
    return(sum(dbinom(yv,1,muv,log=T)))
}
f <- Vectorize(log.like,vectorize.args=c("th0","th1"))
th0v<-seq(-6,6,by=0.05)
th1v<-seq(-6,6,by=0.05)
lmat<-outer(th0v,th1v,f,xv=x,yv=y)
lmax<-max(lmat)
#Quadratic approximation
quad.func<-function(th0,th1,m,M){
    tv<-c(th0,th1)-m
    q<--0.5*t(tv) %*% solve(M) %*% tv
    return(q[1])
}
f <- Vectorize(quad.func,vectorize.args=c("th0","th1"))
qmat<-outer(th0v,th1v,f,m=th.n,M=post.var.n)+lmax
#Plot
library(fields,quietly=TRUE)
par(pty='s',mar=c(2,3,2,2))
colfunc <- colorRampPalette(c("blue","lightblue","white","yellow","orange","red"))
image.plot(th0v,th1v,lmat,col=colfunc(100),
    xlab=expression(beta[0]),ylab=expression(beta[1]),cex.axis=0.8)
contour(th0v,th1v,lmat,add=T,nlevels=20)
title(expression(paste('Log-likelihood ',l[n](beta[0],beta[1]))))
points(th.n[1],th.n[2],pch=4)
image.plot(th0v,th1v,qmat,col=colfunc(100),
    xlab=expression(beta[0]),ylab=expression(beta[1]),cex.axis=0.8)
contour(th0v,th1v,qmat,add=T,nlevels=20)
title(expression(paste('Quadratic approximation ',q[n](beta[0],beta[1]))))
points(th.n[1],th.n[2],pch=4)
@
<<>>=
set.seed(2634)
n<-2000
x<-rnorm(n,-1,1)
X<-cbind(1,x)
be<-c(1,2)
expit<-function(x){return(1/(1+exp(-x)))}
mu<-expit(X %*% be)
y<-rbinom(n,1,mu)
table(y)
fit1<-glm(y~x,family=binomial)
th.n<-coef(fit1)
post.var.n<-summary(fit1)$cov.unscaled
th.n
post.var.n
#Exact log-likelihood
log.like<-function(th0,th1,xv,yv){
    muv<-expit(th0+th1*xv)
    return(sum(dbinom(yv,1,muv,log=T)))
}
f <- Vectorize(log.like,vectorize.args=c("th0","th1"))
th0v<-seq(-6,6,by=0.05)
th1v<-seq(-6,6,by=0.05)
lmat<-outer(th0v,th1v,f,xv=x,yv=y)
lmax<-max(lmat)
#Quadratic approximation
quad.func<-function(th0,th1,m,M){
    tv<-c(th0,th1)-m
    q<--0.5*t(tv) %*% solve(M) %*% tv
    return(q[1])
}
f <- Vectorize(quad.func,vectorize.args=c("th0","th1"))
qmat<-outer(th0v,th1v,f,m=th.n,M=post.var.n)+lmax
#Plot
library(fields,quietly=TRUE)
par(pty='s',mar=c(2,3,2,2))
colfunc <- colorRampPalette(c("blue","lightblue","white","yellow","orange","red"))
image.plot(th0v,th1v,lmat,col=colfunc(100),
    xlab=expression(beta[0]),ylab=expression(beta[1]),cex.axis=0.8)
contour(th0v,th1v,lmat,add=T,nlevels=20)
title(expression(paste('Log-likelihood ',l[n](beta[0],beta[1]))))
points(th.n[1],th.n[2],pch=4)
image.plot(th0v,th1v,qmat,col=colfunc(100),
    xlab=expression(beta[0]),ylab=expression(beta[1]),cex.axis=0.8)
contour(th0v,th1v,qmat,add=T,nlevels=20)
title(expression(paste('Quadratic approximation ',q[n](beta[0],beta[1]))))
points(th.n[1],th.n[2],pch=4)
@
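The quality of the quadratic approximation can also be checked by sampling from the exact posterior. The chunk below is a minimal random-walk Metropolis sketch targeting the exact flat-prior posterior for the $n=2000$ logistic fit, re-using \texttt{log.like}, \texttt{x}, \texttt{y}, \texttt{th.n} and \texttt{post.var.n} from the chunk above; the proposal scale, chain length, and seed are arbitrary illustrative choices, and the chain is started at the mode so no burn-in is discarded.

<<>>=
# Random-walk Metropolis sketch targeting the exact (flat-prior) posterior
# for the n=2000 logistic fit; proposal increments use the Laplace covariance
set.seed(555)
nsamp<-5000
R<-chol(post.var.n)                        # so rnorm(2) %*% R ~ N(0, post.var.n)
be.curr<-as.vector(th.n)                   # start at the mode
ll.curr<-log.like(be.curr[1],be.curr[2],x,y)
be.samp<-matrix(0,nsamp,2)
for(s in 1:nsamp){
    be.prop<-be.curr+as.vector(rnorm(2) %*% R)
    ll.prop<-log.like(be.prop[1],be.prop[2],x,y)
    if(log(runif(1)) < ll.prop-ll.curr){   # Metropolis accept/reject
        be.curr<-be.prop; ll.curr<-ll.prop
    }
    be.samp[s,]<-be.curr
}
# posterior means and sds from MCMC vs. the Laplace approximation
rbind(mcmc=colMeans(be.samp),laplace=as.vector(th.n))
rbind(mcmc=apply(be.samp,2,sd),laplace=sqrt(diag(post.var.n)))
@

\end{document}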