\documentclass[notitlepage]{article}
\usepackage{Math533}
\usepackage{listings}
\usepackage{numprint}
\usepackage{amsfonts}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{bbm}
\usepackage{bbold}
\usepackage{enumitem}
\usepackage{bm}
\lstloadlanguages{R}
\parindent0in
\lstset{basicstyle=\ttfamily, numbers=none, literate={~} {$\sim$}{2}}
\newcommand{\smallG}{\textnormal{\tiny \text{G}}}
\def\Ind{\mathbb{1}}
\def\SST{\text{SS}_\text{T}}
\def\SSR{\text{SS}_\text{R}}
\def\SSRes{\text{SS}_\text{Res}}
\def\MSRes{\text{MS}_\text{Res}}
\def\SSTover{\overline{\text{SS}}_\text{T}}
\def\SSRover{\overline{\text{SS}}_\text{R}}
\def\SSB{\text{SS}_\text{B}}
\def\FMSE{\text{FMSE}}
\def\bmu{\bm{\mu}}

\begin{document}

\begin{center}
\textsclarge{Model selection criteria in \texttt{R}}
\end{center}

\begin{enumerate}[label=\arabic*.,topsep=2pt,itemsep=2ex,partopsep=1ex,parsep=1ex]

\item \textbf{$R^2$ statistics:} We may use
\[
R^2 = \frac{\SSR}{\SST} = 1 - \frac{\SSRes}{\SST}
\qquad \text{or} \qquad
R_\text{Adj}^2 = 1 - \frac{\SSRes/(n-p)}{\SST/(n-1)} = 1 - \left(\frac{n-1}{n-p}\right) (1-R^2)
\]
where $p$ is the total number of regression parameters, including the intercept. $R^2$ never decreases as predictors are added, so it does not take into account model complexity (that is, the number of parameters fitted), whereas $R_\text{Adj}^2$ does. (Both formulas are checked numerically in the first code chunk following this list.)

\item \textbf{Mean Square Residual:} We consider
\[
\MSRes = \frac{\SSRes}{(n-p)}
\]
and note that
\[
R_\text{Adj}^2 = 1 - \left(\frac{n-1}{n-p}\right) \left(1-\frac{\SSRes}{\SST} \right) = 1 - \frac{\MSRes}{\SST/(n-1)}
\]
so that, since $\SST/(n-1)$ does not depend on the fitted model, maximizing $R_\text{Adj}^2$ corresponds exactly to minimizing $\MSRes$.

\item \textbf{Mallows's $C_p$ statistic:} Let $\mu_i = \Expect_{\bY|\bX}[Y_i|\bx_i]$ and $\widehat \mu_i = \Expect_{\bY|\bX}[\widehat Y_i|\bx_i]$ be the \textit{modelled} and \textit{fitted} expected values of response $Y_i$ at predictor values $\bx_i$ respectively. The expected (or mean) squared error (MSE) of the fit for datum $i$ is
\[
\Expect_{\bY|\bX}[(\widehat Y_i - \mu_i)^2 |\bx_i]
\]
which, as the cross term has expectation zero, can be decomposed as
\begin{align*}
\Expect_{\bY|\bX}[(\widehat Y_i - \mu_i)^2 | \bx_i] = \Expect_{\bY|\bX}[(\widehat Y_i - \widehat \mu_i)^2| \bx_i] + (\widehat \mu_i - \mu_i)^2 & = \Var_{\bY|\bX}[\widehat Y_i|\bx_i] + (\widehat \mu_i - \mu_i)^2\\[6pt]
& = \text{variance for datum $i$} + (\text{bias for datum $i$})^2.
\end{align*}
Let
\[
\SSB = \sum_{i=1}^n (\widehat \mu_i - \mu_i)^2 = (\bmu-\widehat\bmu)^\top(\bmu-\widehat\bmu) = \bmu^\top (\Ident_n - \bH) \bmu
\]
denote the total squared bias, aggregated across all data points; the final equality holds because $\widehat\bmu = \bH \bmu$ and $\Ident_n - \bH$ is symmetric and idempotent. Define the scaled total MSE of the fit
\[
\FMSE = \frac{1}{\sigma^2} \sum_{i=1}^n \left[\Var_{\bY|\bX}[\widehat Y_i|\bx_i] + (\widehat \mu_i - \mu_i)^2 \right] = \frac{1}{\sigma^2} \sum_{i=1}^n \Var_{\bY|\bX}[\widehat Y_i|\bx_i] + \frac{\SSB}{\sigma^2}.
\]
Recall that if $\bH$ is the hat matrix $\bH = \bX (\bX^\top\bX)^{-1} \bX^\top$ then
\[
\Var_{\bY|\bX}[\widehat \bY|\bX] = \Var_{\bY|\bX}[\bH \bY|\bX] = \sigma^2 \bH \bH^\top = \sigma^2 \bH
\]
and so
\[
\sum_{i=1}^n \Var_{\bY|\bX}[\widehat Y_i|\bx_i] = \text{Trace}(\sigma^2 \bH) = \sigma^2 \, \text{Trace}(\bH) = p \sigma^2.
\]
Also, by previous results for quadratic forms,
\begin{align*}
\Expect_{\bY|\bX}[\SSRes|\bX] & = \Expect_{\bY|\bX}\left[\bY^\top (\Ident_n - \bH) \bY \bigg| \bX\right]\\[6pt]
& = \bmu^\top (\Ident_n - \bH) \bmu + \text{Trace}(\sigma^2 (\Ident_n - \bH)) \\[6pt]
& = (\bmu-\widehat\bmu)^\top(\bmu-\widehat\bmu) + (n-p) \sigma^2\\[6pt]
& = \SSB + (n-p) \sigma^2.
\end{align*}
Therefore we may rewrite
\begin{align*}
\FMSE & = \dfrac{1}{\sigma^2} \left[ p \sigma^2 + \Expect_{\bY|\bX}[\SSRes|\bX] - (n-p) \sigma^2 \right] = \dfrac{\Expect_{\bY|\bX}[\SSRes|\bX]}{\sigma^2} -n + 2p.
\end{align*}
An estimator of this quantity is
\[
C_p = \frac{\SSRes}{\widehat \sigma^2} -n + 2p
\]
where $\widehat \sigma^2$ is some estimator of $\sigma^2$ derived, say, from the `largest' model that is being considered.

\medskip

$C_p$ is \textit{Mallows's statistic}. We choose the model that minimizes $C_p$. For a model with negligible bias ($\SSB \approx 0$) we have
\[
\Expect_{\bY|\bX}[C_p|\bX] \approx p.
\]
(A sketch of the $C_p$ computation appears in the second code chunk following this list.)

\item \textbf{Akaike's Information Criterion (AIC):} We define for a probability model with parameters $\theta$
\[
\textrm{AIC} = -2 \ell(\widehat \theta) + 2 \, \text{dim}(\theta)
\]
where $\ell(\theta)$ is the log-likelihood function, $\widehat \theta$ is the maximum likelihood estimate of the parameter $\theta$, and $\text{dim}(\theta)$ is the dimension of $\theta$.

\medskip

For linear regression models under a normality assumption, we have $\theta = (\beta,\sigma^2)$ with
\[
\ell(\beta,\sigma^2) = -\frac{n}{2} \log (2 \pi) - \frac{n}{2} \log \sigma^2 - \frac{1}{2 \sigma^2} \sum_{i=1}^n (y_i - \bx_i \beta)^2.
\]
Plugging in $\widehat \beta$ and $\widehat \sigma_{\text{ML}}^2 = \SSRes/n$, we obtain
\[
\ell(\widehat \beta,\widehat\sigma_{\text{ML}}^2) = -\frac{n}{2} \log (2 \pi) - \frac{n}{2} \log \left( \frac{\SSRes}{n} \right) - \frac{n}{2}
\]
so therefore, writing
\[
c(n) = n \log(2\pi) + n
\]
for the constant function of $n$, and noting that $\text{dim}(\theta) = p + 1$ as $\sigma^2$ counts as a parameter, we have
\[
\text{AIC} = c(n) + n \log \left( \frac{\SSRes}{n} \right) + 2 (p+1).
\]

\medskip

This is Akaike's Information Criterion: we choose the model with the lowest value of $\text{AIC}$. The constant $c(n)$ need not be included in the calculation as it is constant across all models considered.

\item \textbf{Bayesian Information Criterion (BIC):} The Bayesian Information Criterion (BIC) is a modification of AIC in which the penalty of $2$ per parameter is replaced by $\log(n)$. We define
\[
\text{BIC} = n \log \left( \frac{\SSRes}{n} \right) + (p+1) \log (n)
\]
and again choose the model with the smallest $\text{BIC}$. (The third chunk following this list checks both AIC and BIC against the values returned by \texttt{R}.)

\end{enumerate}
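As a quick numerical check of the formulas in items 1 and 2, the following chunk computes $R^2$ and $R_\text{Adj}^2$ directly from $\SSRes$ and $\SST$ and compares them with the values reported by \texttt{summary()}. This is a minimal sketch: the fit \texttt{fit.ex} on the built-in \texttt{mtcars} data is purely illustrative, and any \texttt{lm} fit would do.
<<>>=
fit.ex<-lm(mpg~wt+hp,data=mtcars)    # illustrative fit on a built-in dataset
n.ex<-nobs(fit.ex)                   # number of observations
p.ex<-length(coef(fit.ex))           # total number of parameters
SSRes.ex<-sum(residuals(fit.ex)^2)
SST.ex<-sum((mtcars$mpg-mean(mtcars$mpg))^2)
# R^2 by hand vs. the value from summary()
c(1-SSRes.ex/SST.ex,summary(fit.ex)$r.squared)
# Adjusted R^2 by hand vs. the value from summary()
c(1-(SSRes.ex/(n.ex-p.ex))/(SST.ex/(n.ex-1)),summary(fit.ex)$adj.r.squared)
@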
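The next chunk sketches the $C_p$ computation, with $\widehat\sigma^2$ taken from an assumed `largest' model \texttt{fit.big}; both the model choice and the helper function \texttt{Cp} are illustrative.
<<>>=
fit.big<-lm(mpg~wt+hp+disp,data=mtcars)   # assumed largest model
sig.hat<-summary(fit.big)$sigma
Cp<-function(fit.obj,sig){
    # SSRes/sigma.hat^2 - n + 2p, as defined above
    sum(residuals(fit.obj)^2)/sig^2-nobs(fit.obj)+2*length(coef(fit.obj))
}
# C_p for the submodel and for the largest model
c(Cp(fit.ex,sig.hat),Cp(fit.big,sig.hat))
@
Note that for the model supplying $\widehat\sigma^2$ we get $C_p = p$ exactly, since then $\SSRes/\widehat\sigma^2 = n - p$.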
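Finally, reusing the quantities computed in the first chunk, we can verify that \texttt{R}'s \texttt{AIC()} and \texttt{BIC()} agree with the formulas above once the constant $c(n)$ is included and $\sigma^2$ is counted as a parameter:
<<>>=
# AIC by hand, including c(n)=n*log(2*pi)+n, vs. AIC()
c(n.ex*log(SSRes.ex/n.ex)+n.ex*log(2*pi)+n.ex+2*(p.ex+1),AIC(fit.ex))
# BIC by hand, with penalty log(n) per parameter, vs. BIC()
c(n.ex*log(SSRes.ex/n.ex)+n.ex*log(2*pi)+n.ex+log(n.ex)*(p.ex+1),BIC(fit.ex))
@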
\pagebreak

\begin{center}\textsclarge{Simulation Study}\end{center}

We have the model for three continuous predictors $X_1,X_2,X_3$
\[
Y_i = 2 + 2 x_{i1} + 2 x_{i2} - 2 x_{i1} x_{i2} + \epsilon_i
\]
with $\epsilon_i \sim \text{Normal}(0,\sigma^2)$ and $\sigma^2 = 1$. We have $n=200$. Note that $X_3$ does not enter the data-generating model, so any candidate model including $x_3$ carries a redundant predictor.
Here is the simulation code:
<<>>=
set.seed(798)
n<-200; p<-3
Sig<-rWishart(1,p+2,diag(1,p)/(p+2))[,,1]   # random covariance matrix for the predictors
library(MASS)
x<-mvrnorm(n,mu=rep(0,p),Sigma=Sig)
be<-c(2,2,2,0,-2)                           # true coefficients: x3 has coefficient zero
xm<-cbind(rep(1,n),x,x[,1]*x[,2])
Y<-xm %*% be + rnorm(n)
x1<-x[,1]
x2<-x[,2]
x3<-x[,3]
fit0<-lm(Y~1)
fit1<-lm(Y~x1)
fit2<-lm(Y~x2)
fit3<-lm(Y~x3)
fit12<-lm(Y~x1+x2)
fit13<-lm(Y~x1+x3)
fit23<-lm(Y~x2+x3)
fit123<-lm(Y~x1+x2+x3)
fit12i<-lm(Y~x1*x2)
fit13i<-lm(Y~x1*x3)
fit23i<-lm(Y~x2*x3)
fit123i<-lm(Y~x1*x2*x3)
criteria.eval<-function(fit.obj,nv,bigsig.hat){
    cvec<-rep(0,5)
    SSRes<-sum(residuals(fit.obj)^2)
    p<-length(coef(fit.obj))
    cvec[1]<-summary(fit.obj)$r.squared
    cvec[2]<-summary(fit.obj)$adj.r.squared
    cvec[3]<-SSRes/bigsig.hat^2-nv+2*p      # Mallows's C_p
    # AIC in R computes
    # nv*log(SSRes/nv)+2*(p+1)+nv*log(2*pi)+nv
    cvec[4]<-AIC(fit.obj)
    # BIC in R computes
    # nv*log(SSRes/nv)+log(nv)*(p+1)+nv*log(2*pi)+nv
    cvec[5]<-BIC(fit.obj)
    return(cvec)
}
bigs.hat<-summary(fit123i)$sigma
cvals<-matrix(0,nrow=12,ncol=5)
cvals[1,]<-criteria.eval(fit0,n,bigs.hat)
cvals[2,]<-criteria.eval(fit1,n,bigs.hat)
cvals[3,]<-criteria.eval(fit2,n,bigs.hat)
cvals[4,]<-criteria.eval(fit3,n,bigs.hat)
cvals[5,]<-criteria.eval(fit12,n,bigs.hat)
cvals[6,]<-criteria.eval(fit13,n,bigs.hat)
cvals[7,]<-criteria.eval(fit23,n,bigs.hat)
cvals[8,]<-criteria.eval(fit123,n,bigs.hat)
cvals[9,]<-criteria.eval(fit12i,n,bigs.hat)
cvals[10,]<-criteria.eval(fit13i,n,bigs.hat)
cvals[11,]<-criteria.eval(fit23i,n,bigs.hat)
cvals[12,]<-criteria.eval(fit123i,n,bigs.hat)
Criteria<-data.frame(cvals)
names(Criteria)<-c('Rsq','Adj.Rsq','Cp','AIC','BIC')
rownames(Criteria)<-c('1','x1','x2','x3','x1+x2','x1+x3','x2+x3','x1+x2+x3',
                      'x1*x2','x1*x3','x2*x3','x1*x2*x3')
round(Criteria,4)
@
This reveals the model \texttt{x1*x2}, that is, $X_1+X_2+X_1{:}X_2$ in \texttt{R} formula notation, as the most appropriate model.
<<>>=
summary(fit12i)
@
The parameter estimates are therefore
\[
\widehat \beta_0 = \Sexpr{round(coef(fit12i)[1],4)} \quad
\widehat \beta_1 = \Sexpr{round(coef(fit12i)[2],4)} \quad
\widehat \beta_2 = \Sexpr{round(coef(fit12i)[3],4)} \quad
\widehat \beta_{12} = \Sexpr{round(coef(fit12i)[4],4)}
\]
which are close to the data-generating values.

\pagebreak

For an ANOVA ($F$) test equivalent to the $t$-test on the interaction coefficient in the summary output:
<<>>=
anova(fit12,fit12i)
@

Residual plots for the selected model:
<<fig=TRUE>>=
par(mfrow=c(2,2),mar=c(4,2,1,2))
plot(x1,residuals(fit12i),pch=19,cex=0.75)
plot(x2,residuals(fit12i),pch=19,cex=0.75)
plot(x1*x2,residuals(fit12i),pch=19,cex=0.75)
@

\pagebreak

Finally, for an \textbf{incorrect} model we obtain misleading results:
<<fig=TRUE>>=
summary(fit13i)
par(mfrow=c(2,2),mar=c(4,2,1,2))
plot(x1,residuals(fit13i),pch=19,cex=0.75)
plot(x3,residuals(fit13i),pch=19,cex=0.75)
plot(x1*x3,residuals(fit13i),pch=19,cex=0.75)
@

\end{document}