\documentclass[notitlepage]{article}
\usepackage{Math533}
\usepackage{listings}
\usepackage{numprint}
\usepackage{amsfonts}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{bbm}
\usepackage{bbold}
\usepackage{enumitem}
\usepackage{bm}
\lstloadlanguages{R}
\parindent0in
\lstset{basicstyle=\ttfamily, numbers=none, literate={~} {$\sim$}{2}}
\newcommand{\smallG}{\textnormal{\tiny \text{G}}}
\def\Ind{\mathbb{1}}
\def\SST{\text{SS}_\text{T}}
\def\SSR{\text{SS}_\text{R}}
\def\SSRes{\text{SS}_\text{Res}}
\def\MSRes{\text{MS}_\text{Res}}
\def\SSTover{\overline{\text{SS}}_\text{T}}
\def\SSRover{\overline{\text{SS}}_\text{R}}
\def\SSB{\text{SS}_\text{B}}
\def\FMSE{\text{FMSE}}
\def\bmu{\bm{\mu}}

\begin{document}

\begin{center}
\textsclarge{Model selection criteria in \texttt{R}}
\end{center}

\begin{enumerate}[label=\arabic*.,topsep=2pt,itemsep=2ex,partopsep=1ex,parsep=1ex]

\item \textbf{$R^2$ statistics:} We may use
\[
R^2 = \frac{\SSR}{\SST} = 1 - \frac{\SSRes}{\SST}
\qquad \text{or} \qquad
R_\text{Adj}^2 = 1 - \frac{\SSRes/(n-p)}{\SST/(n-1)} = 1 - \left(\frac{n-1}{n-p}\right) (1-R^2)
\]
where $p$ is the total number of regression parameters, including the intercept. $R^2$ never decreases as predictors are added, so it does not take into account model complexity (that is, the number of parameters fitted), whereas $R_\text{Adj}^2$ does. (Both formulas are checked numerically in the first code chunk following this list.)

\item \textbf{Mean Square Residual:} We consider
\[
\MSRes = \frac{\SSRes}{(n-p)}
\]
and note that
\[
R_\text{Adj}^2 = 1 - \left(\frac{n-1}{n-p}\right) \left(1-\frac{\SSRes}{\SST} \right) = 1 - \frac{\MSRes}{\SST/(n-1)}
\]
so that, since $\SST/(n-1)$ does not depend on the fitted model, maximizing $R_\text{Adj}^2$ corresponds exactly to minimizing $\MSRes$.

\item \textbf{Mallows's $C_p$ statistic:} Let $\mu_i = \Expect_{\bY|\bX}[Y_i|\bx_i]$ and $\widehat \mu_i = \Expect_{\bY|\bX}[\widehat Y_i|\bx_i]$ be the \textit{modelled} and \textit{fitted} expected values of response $Y_i$ at predictor values $\bx_i$ respectively. The expected (or mean) squared error (MSE) of the fit for datum $i$ is
\[
\Expect_{\bY|\bX}[(\widehat Y_i - \mu_i)^2 |\bx_i]
\]
which, as the cross term has expectation zero, can be decomposed as
\begin{align*}
\Expect_{\bY|\bX}[(\widehat Y_i - \mu_i)^2 | \bx_i] = \Expect_{\bY|\bX}[(\widehat Y_i - \widehat \mu_i)^2| \bx_i] + (\widehat \mu_i - \mu_i)^2 & = \Var_{\bY|\bX}[\widehat Y_i|\bx_i] + (\widehat \mu_i - \mu_i)^2\\[6pt]
& = \text{variance for datum $i$} + (\text{bias for datum $i$})^2.
\end{align*}
Let
\[
\SSB = \sum_{i=1}^n (\widehat \mu_i - \mu_i)^2 = (\bmu-\widehat\bmu)^\top(\bmu-\widehat\bmu) = \bmu^\top (\Ident_n - \bH) \bmu
\]
denote the total squared bias, aggregated across all data points; the final equality holds because $\widehat\bmu = \bH \bmu$ and $\Ident_n - \bH$ is symmetric and idempotent. Define the scaled total MSE of the fit
\[
\FMSE = \frac{1}{\sigma^2} \sum_{i=1}^n \left[\Var_{\bY|\bX}[\widehat Y_i|\bx_i] + (\widehat \mu_i - \mu_i)^2 \right] = \frac{1}{\sigma^2} \sum_{i=1}^n \Var_{\bY|\bX}[\widehat Y_i|\bx_i] + \frac{\SSB}{\sigma^2}.
\]
Recall that if $\bH$ is the hat matrix $\bH = \bX (\bX^\top\bX)^{-1} \bX^\top$ then
\[
\Var_{\bY|\bX}[\widehat \bY|\bX] = \Var_{\bY|\bX}[\bH \bY|\bX] = \sigma^2 \bH \bH^\top = \sigma^2 \bH
\]
and so
\[
\sum_{i=1}^n \Var_{\bY|\bX}[\widehat Y_i|\bx_i] = \text{Trace}(\sigma^2 \bH) = \sigma^2 \, \text{Trace}(\bH) = p \sigma^2.
\]
Also, by previous results for quadratic forms,
\begin{align*}
\Expect_{\bY|\bX}[\SSRes|\bX] & = \Expect_{\bY|\bX}\left[\bY^\top (\Ident_n - \bH) \bY \bigg| \bX\right]\\[6pt]
& = \bmu^\top (\Ident_n - \bH) \bmu + \text{Trace}(\sigma^2 (\Ident_n - \bH)) \\[6pt]
& = (\bmu-\widehat\bmu)^\top(\bmu-\widehat\bmu) + (n-p) \sigma^2\\[6pt]
& = \SSB + (n-p) \sigma^2.
\end{align*}
Therefore we may rewrite
\begin{align*}
\FMSE & = \dfrac{1}{\sigma^2} \left[ p \sigma^2 + \Expect_{\bY|\bX}[\SSRes|\bX] - (n-p) \sigma^2 \right] = \dfrac{\Expect_{\bY|\bX}[\SSRes|\bX]}{\sigma^2} -n + 2p.
\end{align*}
An estimator of this quantity is
\[
C_p = \frac{\SSRes}{\widehat \sigma^2} -n + 2p
\]
where $\widehat \sigma^2$ is some estimator of $\sigma^2$ derived, say, from the `largest' model that is being considered.

\medskip

$C_p$ is \textit{Mallows's statistic}. We choose the model that minimizes $C_p$. For a model with negligible bias ($\SSB \approx 0$) we have
\[
\Expect_{\bY|\bX}[C_p|\bX] \approx p.
\]
(A sketch of the $C_p$ computation appears in the second code chunk following this list.)

\item \textbf{Akaike's Information Criterion (AIC):} We define for a probability model with parameters $\theta$
\[
\textrm{AIC} = -2 \ell(\widehat \theta) + 2 \, \text{dim}(\theta)
\]
where $\ell(\theta)$ is the log-likelihood function, $\widehat \theta$ is the maximum likelihood estimate of the parameter $\theta$, and $\text{dim}(\theta)$ is the dimension of $\theta$.

\medskip

For linear regression models under a normality assumption, we have $\theta = (\beta,\sigma^2)$ with
\[
\ell(\beta,\sigma^2) = -\frac{n}{2} \log (2 \pi) - \frac{n}{2} \log \sigma^2 - \frac{1}{2 \sigma^2} \sum_{i=1}^n (y_i - \bx_i \beta)^2.
\]
Plugging in $\widehat \beta$ and $\widehat \sigma_{\text{ML}}^2 = \SSRes/n$, we obtain
\[
\ell(\widehat \beta,\widehat\sigma_{\text{ML}}^2) = -\frac{n}{2} \log (2 \pi) - \frac{n}{2} \log \left( \frac{\SSRes}{n} \right) - \frac{n}{2}
\]
so therefore, writing
\[
c(n) = n \log(2\pi) + n
\]
for the constant function of $n$, and noting that $\text{dim}(\theta) = p + 1$ as $\sigma^2$ counts as a parameter, we have
\[
\text{AIC} = c(n) + n \log \left( \frac{\SSRes}{n} \right) + 2 (p+1).
\]

\medskip

This is Akaike's Information Criterion: we choose the model with the lowest value of $\text{AIC}$. The constant $c(n)$ need not be included in the calculation as it is constant across all models considered.

\item \textbf{Bayesian Information Criterion (BIC):} The Bayesian Information Criterion (BIC) is a modification of AIC in which the penalty of $2$ per parameter is replaced by $\log(n)$. We define
\[
\text{BIC} = n \log \left( \frac{\SSRes}{n} \right) + (p+1) \log (n)
\]
and again choose the model with the smallest $\text{BIC}$. (The third chunk following this list checks both AIC and BIC against the values returned by \texttt{R}.)

\end{enumerate}
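As a quick numerical check of the formulas in items 1 and 2, the following chunk computes $R^2$ and $R_\text{Adj}^2$ directly from $\SSRes$ and $\SST$ and compares them with the values reported by \texttt{summary()}. This is a minimal sketch: the fit \texttt{fit.ex} on the built-in \texttt{mtcars} data is purely illustrative, and any \texttt{lm} fit would do.
<<>>=
fit.ex<-lm(mpg~wt+hp,data=mtcars)    # illustrative fit on a built-in dataset
n.ex<-nobs(fit.ex)                   # number of observations
p.ex<-length(coef(fit.ex))           # total number of parameters
SSRes.ex<-sum(residuals(fit.ex)^2)
SST.ex<-sum((mtcars$mpg-mean(mtcars$mpg))^2)
# R^2 by hand vs. the value from summary()
c(1-SSRes.ex/SST.ex,summary(fit.ex)$r.squared)
# Adjusted R^2 by hand vs. the value from summary()
c(1-(SSRes.ex/(n.ex-p.ex))/(SST.ex/(n.ex-1)),summary(fit.ex)$adj.r.squared)
@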
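The next chunk sketches the $C_p$ computation, with $\widehat\sigma^2$ taken from an assumed `largest' model \texttt{fit.big}; both the model choice and the helper function \texttt{Cp} are illustrative.
<<>>=
fit.big<-lm(mpg~wt+hp+disp,data=mtcars)   # assumed largest model
sig.hat<-summary(fit.big)$sigma
Cp<-function(fit.obj,sig){
    # SSRes/sigma.hat^2 - n + 2p, as defined above
    sum(residuals(fit.obj)^2)/sig^2-nobs(fit.obj)+2*length(coef(fit.obj))
}
# C_p for the submodel and for the largest model
c(Cp(fit.ex,sig.hat),Cp(fit.big,sig.hat))
@
Note that for the model supplying $\widehat\sigma^2$ we get $C_p = p$ exactly, since then $\SSRes/\widehat\sigma^2 = n - p$.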
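Finally, reusing the quantities computed in the first chunk, we can verify that \texttt{R}'s \texttt{AIC()} and \texttt{BIC()} agree with the formulas above once the constant $c(n)$ is included and $\sigma^2$ is counted as a parameter:
<<>>=
# AIC by hand, including c(n)=n*log(2*pi)+n, vs. AIC()
c(n.ex*log(SSRes.ex/n.ex)+n.ex*log(2*pi)+n.ex+2*(p.ex+1),AIC(fit.ex))
# BIC by hand, with penalty log(n) per parameter, vs. BIC()
c(n.ex*log(SSRes.ex/n.ex)+n.ex*log(2*pi)+n.ex+log(n.ex)*(p.ex+1),BIC(fit.ex))
@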
\pagebreak

\begin{center}\textsclarge{Simulation Study}\end{center}

We have the model for three continuous predictors $X_1,X_2,X_3$
\[
Y_i = 2 + 2 x_{i1} + 2 x_{i2} - 2 x_{i1} x_{i2} + \epsilon_i
\]
with $\epsilon_i \sim \text{Normal}(0,\sigma^2)$ and $\sigma^2 = 1$. We have $n=200$. Note that $X_3$ does not enter the data-generating model, so any candidate model including $x_3$ carries a redundant predictor.
Here is the simulation code:
<<>>=
set.seed(798)
n<-200; p<-3
Sig<-rWishart(1,p+2,diag(1,p)/(p+2))[,,1]   # random covariance matrix for the predictors
library(MASS)
x<-mvrnorm(n,mu=rep(0,p),Sigma=Sig)
be<-c(2,2,2,0,-2)                           # true coefficients: x3 has coefficient zero
xm<-cbind(rep(1,n),x,x[,1]*x[,2])
Y<-xm %*% be + rnorm(n)
x1<-x[,1]
x2<-x[,2]
x3<-x[,3]
fit0<-lm(Y~1)
fit1<-lm(Y~x1)
fit2<-lm(Y~x2)
fit3<-lm(Y~x3)
fit12<-lm(Y~x1+x2)
fit13<-lm(Y~x1+x3)
fit23<-lm(Y~x2+x3)
fit123<-lm(Y~x1+x2+x3)
fit12i<-lm(Y~x1*x2)
fit13i<-lm(Y~x1*x3)
fit23i<-lm(Y~x2*x3)
fit123i<-lm(Y~x1*x2*x3)
criteria.eval<-function(fit.obj,nv,bigsig.hat){
    cvec<-rep(0,5)
    SSRes<-sum(residuals(fit.obj)^2)
    p<-length(coef(fit.obj))
    cvec[1]<-summary(fit.obj)$r.squared
    cvec[2]<-summary(fit.obj)$adj.r.squared
    cvec[3]<-SSRes/bigsig.hat^2-nv+2*p      # Mallows's C_p
    # AIC in R computes
    # nv*log(SSRes/nv)+2*(p+1)+nv*log(2*pi)+nv
    cvec[4]<-AIC(fit.obj)
    # BIC in R computes
    # nv*log(SSRes/nv)+log(nv)*(p+1)+nv*log(2*pi)+nv
    cvec[5]<-BIC(fit.obj)
    return(cvec)
}
bigs.hat<-summary(fit123i)$sigma
cvals<-matrix(0,nrow=12,ncol=5)
cvals[1,]<-criteria.eval(fit0,n,bigs.hat)
cvals[2,]<-criteria.eval(fit1,n,bigs.hat)
cvals[3,]<-criteria.eval(fit2,n,bigs.hat)
cvals[4,]<-criteria.eval(fit3,n,bigs.hat)
cvals[5,]<-criteria.eval(fit12,n,bigs.hat)
cvals[6,]<-criteria.eval(fit13,n,bigs.hat)
cvals[7,]<-criteria.eval(fit23,n,bigs.hat)
cvals[8,]<-criteria.eval(fit123,n,bigs.hat)
cvals[9,]<-criteria.eval(fit12i,n,bigs.hat)
cvals[10,]<-criteria.eval(fit13i,n,bigs.hat)
cvals[11,]<-criteria.eval(fit23i,n,bigs.hat)
cvals[12,]<-criteria.eval(fit123i,n,bigs.hat)
Criteria<-data.frame(cvals)
names(Criteria)<-c('Rsq','Adj.Rsq','Cp','AIC','BIC')
rownames(Criteria)<-c('1','x1','x2','x3','x1+x2','x1+x3','x2+x3','x1+x2+x3',
                      'x1*x2','x1*x3','x2*x3','x1*x2*x3')
round(Criteria,4)
@
This reveals the model \texttt{x1*x2}, that is, $X_1+X_2+X_1{:}X_2$ in \texttt{R} formula notation, as the most appropriate model.
<<>>=
summary(fit12i)
@
The parameter estimates are therefore
\[
\widehat \beta_0 = \Sexpr{round(coef(fit12i)[1],4)} \quad
\widehat \beta_1 = \Sexpr{round(coef(fit12i)[2],4)} \quad
\widehat \beta_2 = \Sexpr{round(coef(fit12i)[3],4)} \quad
\widehat \beta_{12} = \Sexpr{round(coef(fit12i)[4],4)}
\]
which are close to the data-generating values.

\pagebreak

For an ANOVA ($F$) test equivalent to the $t$-test on the interaction coefficient in the summary output:
<<>>=
anova(fit12,fit12i)
@

Residual plots for the selected model:
<<fig=TRUE>>=
par(mfrow=c(2,2),mar=c(4,2,1,2))
plot(x1,residuals(fit12i),pch=19,cex=0.75)
plot(x2,residuals(fit12i),pch=19,cex=0.75)
plot(x1*x2,residuals(fit12i),pch=19,cex=0.75)
@

\pagebreak

Finally, for an \textbf{incorrect} model we obtain misleading results:
<<fig=TRUE>>=
summary(fit13i)
par(mfrow=c(2,2),mar=c(4,2,1,2))
plot(x1,residuals(fit13i),pch=19,cex=0.75)
plot(x3,residuals(fit13i),pch=19,cex=0.75)
plot(x1*x3,residuals(fit13i),pch=19,cex=0.75)
@

\end{document}