\documentclass[notitlepage]{article}
\usepackage{Math533}
\usepackage{listings}
\usepackage{numprint}
\usepackage{amsfonts}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{bbm}
\usepackage{bbold}
\usepackage{enumitem}
\usepackage{bm}

\lstloadlanguages{R}
\parindent0in
\lstset{basicstyle=\ttfamily, numbers=none, literate={~} {$\sim$}{2}}

\newcommand{\smallG}{\textnormal{\tiny \text{G}}}
% \newcommand* rather than plain-TeX \def: errors on accidental redefinition.
\newcommand*{\Ind}{\mathbb{1}}
\newcommand*{\SST}{\text{SS}_\text{T}}
\newcommand*{\SSR}{\text{SS}_\text{R}}
\newcommand*{\SSRes}{\text{SS}_\text{Res}}
\newcommand*{\MSRes}{\text{MS}_\text{Res}}
\newcommand*{\SSTover}{\overline{\text{SS}}_\text{T}}
\newcommand*{\SSRover}{\overline{\text{SS}}_\text{R}}
\newcommand*{\SSB}{\text{SS}_\text{B}}
\newcommand*{\FMSE}{\text{FMSE}}
\newcommand*{\bmu}{\bm{\mu}}

\begin{document}

\begin{center}
\textsclarge{The Importance of Correct Model Identification}
\end{center}

This simulation illustrates why it is important to identify the correct model.
We simulate 5000 replicated data sets, each of size $n = 1001$, from a
polynomial regression model with modelled mean
\[
\beta_0 + \beta_1 x_{i1} + \beta_2 x_{i1}^2
\]
with $\beta_0 = 0, \beta_1 = 2, \beta_2 = 1.2$, and independent standard
normal errors. We record the estimates of the regression coefficients and
$\sigma^2$ for the following models:
\begin{itemize}
\item \textbf{Over-simple:} \texttt{fit1}
\[
\beta_0 + \beta_1 x_{i1}
\]
\item \textbf{Correct:} \texttt{fit2}
\[
\beta_0 + \beta_1 x_{i1} + \beta_2 x_{i1}^2
\]
\item \textbf{Over-complex:} \texttt{fit3}
\[
\beta_0 + \beta_1 x_{i1} + \beta_2 x_{i1}^2 + \beta_3 x_{i1}^3
\]
\end{itemize}
where the predictor $x_{i1}$ takes values on the fixed grid
$0, 0.01, 0.02, \ldots, 10$.
<<>>=
# Simulation design: fixed design matrix (x, x^2, x^3) on a grid of
# n = 1001 points; true coefficients be = (0, 2, 1.2, 0), so the true
# mean function is quadratic in x.
nreps<-5000
library(MASS)
set.seed(423)
X<-seq(0,10,by=0.01)
X<-cbind(X,X^2,X^3)
n<-nrow(X)
be<-rep(0,4)
be[2:4]<-c(2,1.2,0)
mean.vec<-cbind(rep(1,n),X)%*%be
@
The following code fits the three models for each of the 5000 replicates,
records the estimates for the three models, and the fitted values at all of
the design points.
<<>>=
# Storage: one row per replicate; coefficient vectors are padded with
# zeros to length 4 so the three fits can be compared column-by-column.
ests.mat1<-ests.mat2<-ests.mat3<-matrix(0,nrow=nreps,ncol=4)
bias.mat1<-bias.mat2<-bias.mat3<-matrix(0,nrow=nreps,ncol=4)
fitted.vals1<-fitted.vals2<-fitted.vals3<-matrix(0,nreps,n)
sigma.ests<-matrix(0,nreps,3)
for(r in 1:nreps){
    Y<-mean.vec+rnorm(n)
    # Over-simple model: intercept and linear term only.
    fit1<-lm(Y~X[,1])
    theta1<-c(coef(fit1),0,0)
    ests.mat1[r,]<-theta1
    bias.mat1[r,]<-theta1-be   # be = (0, 2, 1.2, 0) is the true coefficient vector
    fitted.vals1[r,]<-fitted(fit1)
    sigma.ests[r,1]<-summary(fit1)$sigma^2
    # Correct model: quadratic.
    fit2<-lm(Y~X[,1:2])
    theta2<-c(coef(fit2),0)
    ests.mat2[r,]<-theta2
    bias.mat2[r,]<-theta2-be
    fitted.vals2[r,]<-fitted(fit2)
    sigma.ests[r,2]<-summary(fit2)$sigma^2
    # Over-complex model: spurious cubic term included.
    fit3<-lm(Y~X)
    theta3<-coef(fit3)
    ests.mat3[r,]<-theta3
    bias.mat3[r,]<-theta3-be
    fitted.vals3[r,]<-fitted(fit3)
    sigma.ests[r,3]<-summary(fit3)$sigma^2
}
@
\pagebreak
We now summarize the biases (scaled by $\sqrt{n}$), variances and mean square errors (scaled by $n$):
<<>>=
bias1<-colMeans(bias.mat1)
bias2<-colMeans(bias.mat2)
bias3<-colMeans(bias.mat3)
sqrt(n)*rbind(bias1,bias2,bias3)
var1<-apply(ests.mat1,2,var)
var2<-apply(ests.mat2,2,var)
var3<-apply(ests.mat3,2,var)
n*rbind(var1,var2,var3)
mse1<-var1+bias1^2
mse2<-var2+bias2^2
mse3<-var3+bias3^2
n*rbind(mse1,mse2,mse3)
@
We now compare the fitted values for \texttt{fit2} and \texttt{fit3} to the
true model mean; the model \texttt{fit1} produces biased fitted values, so
we omit that comparison and examine deviations from the true value.
<<>>=
# Pointwise 2.5% and 97.5% quantiles of the fitted values across the
# replicates, plotted as deviations from the true mean function.
true.values<-cbind(1,X) %*% be
band2<-apply(fitted.vals2,2,quantile,prob=c(0.025,0.975))
band3<-apply(fitted.vals3,2,quantile,prob=c(0.025,0.975))
par(mar=c(4,4,1,2))
plot(X[,1],true.values-true.values,type='n',xlab='x',ylab='y',ylim=range(-.5,.5))
lines(X[,1],band2[1,]-true.values,col='red')
lines(X[,1],band2[2,]-true.values,col='red')
lines(X[,1],band3[1,]-true.values,col='blue')
lines(X[,1],band3[2,]-true.values,col='blue')
legend(0,0.5,c('fit2 95% CI','fit3 95% CI'),col=c('red','blue'),lty=c(1,1))
title('Difference from true mean value')
@
Thus the estimator and prediction variance and MSE are \textbf{larger} for
the over-complex model that includes the spurious cubic term.

\end{document}