\documentclass[notitlepage]{article}
\usepackage{Math533}
\usepackage{listings}
\usepackage{numprint}
\usepackage{amsfonts}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{bbm}
\usepackage{bbold}
\usepackage{enumitem}
\usepackage{bm}

\lstloadlanguages{R}
\parindent0in
\lstset{basicstyle=\ttfamily, numbers=none, literate={~} {$\sim$}{2}}

\newcommand{\smallG}{\textnormal{\tiny \text{G}}}
% \newcommand* rather than plain-TeX \def: errors on accidental redefinition.
\newcommand*{\Ind}{\mathbb{1}}
\newcommand*{\SST}{\text{SS}_\text{T}}
\newcommand*{\SSR}{\text{SS}_\text{R}}
\newcommand*{\SSRes}{\text{SS}_\text{Res}}
\newcommand*{\MSRes}{\text{MS}_\text{Res}}
\newcommand*{\SSTover}{\overline{\text{SS}}_\text{T}}
\newcommand*{\SSRover}{\overline{\text{SS}}_\text{R}}
\newcommand*{\SSB}{\text{SS}_\text{B}}
\newcommand*{\FMSE}{\text{FMSE}}
\newcommand*{\bmu}{\bm{\mu}}

\begin{document}

\begin{center}
\textsclarge{The Importance of Correct Model Identification}
\end{center}

This simulation illustrates why it is important to identify the correct model.
We simulate 5000 replicated data sets, each of size $n = 1001$, from a
polynomial regression model with modelled mean
\[
\beta_0 + \beta_1 x_{i1} + \beta_2 x_{i1}^2
\]
with $\beta_0 = 0, \beta_1 = 2, \beta_2 = 1.2$, and independent standard
normal errors. We record the estimates of the regression coefficients and
$\sigma^2$ for the following models:
\begin{itemize}
\item \textbf{Over-simple:} \texttt{fit1}
\[
\beta_0 + \beta_1 x_{i1}
\]
\item \textbf{Correct:} \texttt{fit2}
\[
\beta_0 + \beta_1 x_{i1} + \beta_2 x_{i1}^2
\]
\item \textbf{Over-complex:} \texttt{fit3}
\[
\beta_0 + \beta_1 x_{i1} + \beta_2 x_{i1}^2 + \beta_3 x_{i1}^3
\]
\end{itemize}
where the predictor $x_{i1}$ takes values on the fixed grid
$0, 0.01, 0.02, \ldots, 10$.
<<>>=
# Simulation design: fixed design matrix (x, x^2, x^3) on a grid of
# n = 1001 points; true coefficients be = (0, 2, 1.2, 0), so the true
# mean function is quadratic in x.
nreps<-5000
library(MASS)
set.seed(423)
X<-seq(0,10,by=0.01)
X<-cbind(X,X^2,X^3)
n<-nrow(X)
be<-rep(0,4)
be[2:4]<-c(2,1.2,0)
mean.vec<-cbind(rep(1,n),X)%*%be
@
The following code fits the three models for each of the 5000 replicates,
records the estimates for the three models, and the fitted values at all of
the design points.
<<>>=
# Storage: one row per replicate; coefficient vectors are padded with
# zeros to length 4 so the three fits can be compared column-by-column.
ests.mat1<-ests.mat2<-ests.mat3<-matrix(0,nrow=nreps,ncol=4)
bias.mat1<-bias.mat2<-bias.mat3<-matrix(0,nrow=nreps,ncol=4)
fitted.vals1<-fitted.vals2<-fitted.vals3<-matrix(0,nreps,n)
sigma.ests<-matrix(0,nreps,3)
for(r in 1:nreps){
    Y<-mean.vec+rnorm(n)
    # Over-simple model: intercept and linear term only.
    fit1<-lm(Y~X[,1])
    theta1<-c(coef(fit1),0,0)
    ests.mat1[r,]<-theta1
    bias.mat1[r,]<-theta1-be   # be = (0, 2, 1.2, 0) is the true coefficient vector
    fitted.vals1[r,]<-fitted(fit1)
    sigma.ests[r,1]<-summary(fit1)$sigma^2
    # Correct model: quadratic.
    fit2<-lm(Y~X[,1:2])
    theta2<-c(coef(fit2),0)
    ests.mat2[r,]<-theta2
    bias.mat2[r,]<-theta2-be
    fitted.vals2[r,]<-fitted(fit2)
    sigma.ests[r,2]<-summary(fit2)$sigma^2
    # Over-complex model: spurious cubic term included.
    fit3<-lm(Y~X)
    theta3<-coef(fit3)
    ests.mat3[r,]<-theta3
    bias.mat3[r,]<-theta3-be
    fitted.vals3[r,]<-fitted(fit3)
    sigma.ests[r,3]<-summary(fit3)$sigma^2
}
@
\pagebreak
We now summarize the biases (scaled by $\sqrt{n}$), variances and mean square errors (scaled by $n$):
<<>>=
bias1<-colMeans(bias.mat1)
bias2<-colMeans(bias.mat2)
bias3<-colMeans(bias.mat3)
sqrt(n)*rbind(bias1,bias2,bias3)
var1<-apply(ests.mat1,2,var)
var2<-apply(ests.mat2,2,var)
var3<-apply(ests.mat3,2,var)
n*rbind(var1,var2,var3)
mse1<-var1+bias1^2
mse2<-var2+bias2^2
mse3<-var3+bias3^2
n*rbind(mse1,mse2,mse3)
@
We now compare the fitted values for \texttt{fit2} and \texttt{fit3} to the
true model mean; the model \texttt{fit1} produces biased fitted values, so
we omit that comparison and examine deviations from the true value.
<<>>=
# Pointwise 2.5% and 97.5% quantiles of the fitted values across the
# replicates, plotted as deviations from the true mean function.
true.values<-cbind(1,X) %*% be
band2<-apply(fitted.vals2,2,quantile,prob=c(0.025,0.975))
band3<-apply(fitted.vals3,2,quantile,prob=c(0.025,0.975))
par(mar=c(4,4,1,2))
plot(X[,1],true.values-true.values,type='n',xlab='x',ylab='y',ylim=range(-.5,.5))
lines(X[,1],band2[1,]-true.values,col='red')
lines(X[,1],band2[2,]-true.values,col='red')
lines(X[,1],band3[1,]-true.values,col='blue')
lines(X[,1],band3[2,]-true.values,col='blue')
legend(0,0.5,c('fit2 95% CI','fit3 95% CI'),col=c('red','blue'),lty=c(1,1))
title('Difference from true mean value')
@
Thus the estimator and prediction variance and MSE are \textbf{larger} for
the over-complex model that includes the spurious cubic term.

\end{document}