\documentclass[notitlepage]{article}
\usepackage{fancyhdr}
\usepackage{lastpage}
\usepackage{amsfonts}
\usepackage{amssymb}
\usepackage{amsthm}
\usepackage{amsmath}
\usepackage{bbm}
\usepackage{bbold}
\usepackage{enumitem}
\usepackage{undertilde}
\usepackage{../../MATH533}
\usepackage{listings}
\usepackage{courier}
\usepackage{numprint}
\lstloadlanguages{R}
\lstset{basicstyle=\ttfamily, numbers=left, numberstyle=\ttfamily, stepnumber=1,
        numbersep=5pt, literate={~}{{$\sim$}}1, escapeinside={(*@}{@*)}}
\parindent 0in
\pagestyle{empty}
\def\ExNumber{4}
\lfoot{MATH 423/533 ASSIGNMENT \ExNumber~Solutions}
\cfoot{ }
\rfoot{{\small{\em Page \thepage \ of \pageref{LastPage}}}}
\renewcommand{\headrulewidth}{0.0pt}
\renewcommand{\footrulewidth}{0.0pt}
\def\bM{\mathbf{M}}
\def\bL{\mathbf{L}}
\def\bW{\mathbf{W}}
\input{../../tcilatex}
\lstloadlanguages{R}
%\lstset{basicstyle=\ttfamily, numbers=none, literate={~} {$\sim$}{2}}
\begin{document}

<<echo=FALSE>>=
options(width=70)
library(knitr)
options(nsmall = 4)
# Modified print method for summary.lm objects: adds a 'concise'
# argument that suppresses the residual summary and other detail
# in the printed output.
print.summary.lm <- function (x, digits = max(3L, getOption("digits") - 3L),
    symbolic.cor = x$symbolic.cor,
    signif.stars = getOption("show.signif.stars"), concise = FALSE, ...)
{
    cat("\nCall:", if(!concise) "\n" else " ",
        paste(deparse(x$call), sep = "\n", collapse = "\n"),
        if (!concise) "\n\n", sep = "")
    resid <- x$residuals
    df <- x$df
    rdf <- df[2L]
    if (!concise) {
        cat(if (!is.null(x$weights) && diff(range(x$weights))) "Weighted ",
            "Residuals:\n", sep = "")
    }
    if (rdf > 5L) {
        nam <- c("Min", "1Q", "Median", "3Q", "Max")
        rq <- if (length(dim(resid)) == 2L)
            structure(apply(t(resid), 1L, quantile),
                      dimnames = list(nam, dimnames(resid)[[2L]]))
        else {
            zz <- zapsmall(quantile(resid), digits + 1L)
            structure(zz, names = nam)
        }
        if (!concise) print(rq, digits = digits, ...)
    }
    else if (rdf > 0L) {
        print(resid, digits = digits, ...)
    }
    else {
        cat("ALL", df[1L], "residuals are 0: no residual degrees of freedom!")
        cat("\n")
    }
    if (length(x$aliased) == 0L) {
        cat("\nNo Coefficients\n")
    }
    else {
        if (nsingular <- df[3L] - df[1L])
            cat("\nCoefficients: (", nsingular,
                " not defined because of singularities)\n", sep = "")
        else { cat("\n"); if (!concise) cat("Coefficients:\n") }
        coefs <- x$coefficients
        if (!is.null(aliased <- x$aliased) && any(aliased)) {
            cn <- names(aliased)
            coefs <- matrix(NA, length(aliased), 4,
                            dimnames = list(cn, colnames(coefs)))
            coefs[!aliased, ] <- x$coefficients
        }
        printCoefmat(coefs, digits = digits, signif.stars = signif.stars,
                     signif.legend = (!concise), na.print = "NA",
                     eps.Pvalue = if (!concise) .Machine$double.eps else 1e-4, ...)
    }
    cat("\nResidual standard error:", format(signif(x$sigma, digits)),
        "on", rdf, "degrees of freedom")
    cat("\n")
    if (nzchar(mess <- naprint(x$na.action)))
        cat(" (", mess, ")\n", sep = "")
    if (!is.null(x$fstatistic)) {
        cat("Multiple R-squared: ", formatC(x$r.squared, digits = digits))
        cat(",\tAdjusted R-squared: ", formatC(x$adj.r.squared, digits = digits),
            "\nF-statistic:", formatC(x$fstatistic[1L], digits = digits),
            "on", x$fstatistic[2L], "and", x$fstatistic[3L], "DF, p-value:",
            format.pval(pf(x$fstatistic[1L], x$fstatistic[2L], x$fstatistic[3L],
                           lower.tail = FALSE), digits = digits,
                        if (!concise) .Machine$double.eps else 1e-4))
        cat("\n")
    }
    correl <- x$correlation
    if (!is.null(correl)) {
        p <- NCOL(correl)
        if (p > 1L) {
            cat("\nCorrelation of Coefficients:\n")
            if (is.logical(symbolic.cor) && symbolic.cor) {
                print(symnum(correl, abbr.colnames = NULL))
            }
            else {
                correl <- format(round(correl, 2), nsmall = 2, digits = digits)
                correl[!lower.tri(correl)] <- ""
                print(correl[-1, -p, drop = FALSE], quote = FALSE)
            }
        }
    }
    cat("\n")
    invisible(x)
}
@
\pagestyle{fancy}
%\title{{\textsclarge{MATH 423/533 - Assignment \ExNumber \\ Solutions }}}
%\author{David A. Stephens}
%\maketitle
%\begin{abstract}
%\end{abstract}
\input{Math423-533-2016-Sol4Q}
\pagebreak

\begin{center}
{\textsclarge{Solutions }}
\end{center}

\begin{enumerate}[label=\arabic*.,topsep=2pt,itemsep=2ex,partopsep=1ex,parsep=1ex]
\item The data set \texttt{TestScores.csv} contains data on standardized math test
scores of 45 students from three Faculties in a University.
\begin{enumerate}[label=(\alph*),topsep=1pt,itemsep=1ex,partopsep=1ex,parsep=1ex]
\item With the \texttt{TestScores.csv} file stored in the local directory, we
implement as follows:
<<>>=
Q1data<-read.csv('TestScores.csv',header=TRUE)
Q1data$Faculty<-as.factor(Q1data$Faculty)
fitQ1<-lm(Score~Faculty,data=Q1data)
summary(fitQ1)
anova(fitQ1)
@
We conclude, on the basis of the result of the global F test, with $p$-value equal
to \Sexpr{anova(fitQ1)[1,5]}, that there \textbf{is} a significant difference
between the scores across the Faculties. To make this statement conclusively, we
should also check the residual plots to assess the validity of the model
assumptions, in particular the constant variance assumption (there is one mean per
group, so the residuals will have zero mean in all groups) -- see the plot below,
which perhaps indicates a smaller variance for the Faculty 3 observations, although
with such a small sample this cannot be assessed conclusively.
\hfill{3 Marks}
\item This can be done in two ways: either by using the model formula with the
intercept removed
<<>>=
fitQ2<-lm(Score~-1+Faculty,data=Q1data)
summary(fitQ2)$coef[,1:2]
@
or by using the linear transformation
\[
\beta_1^\smallG = \beta_0 \qquad \beta_2^\smallG = \beta_0+\beta_1^\smallC \qquad
\beta_3^\smallG = \beta_0+\beta_2^\smallC
\]
that is, using the matrix
\[
\bL = \left[ \begin{array}{ccc} 1 & 0 & 0 \\ 1 & 1 & 0 \\ 1 & 0 & 1 \end{array} \right]
\]
so that $\beta^\smallG = \bL \beta$. We can therefore compute from scratch:
<<>>=
L<-matrix(c(1,1,1,0,1,0,0,0,1),3,3,byrow=F)
X<-cbind(1,Q1data$Faculty=='2',Q1data$Faculty=='3')
beta.Sigma<-summary(fitQ1)$sigma^2 * solve(t(X)%*%X)
(betaG.ests<-L %*% coef(fitQ1))    #Estimates
betaG.Sigma<-L%*%beta.Sigma%*%t(L)
sqrt(diag(betaG.Sigma))            #Standard errors
@
\hfill{3 Marks}

The residual plot below again suggests that the residuals for Faculty 3 may have a
lower variance.
<<>>=
res.data<-data.frame(res=residuals(fitQ1),Faculty = Q1data$Faculty)
par(mar=c(4,4,1,2))
stripchart(res~Faculty,res.data,pch=19,vertical=T,ylim=range(-20,20),xlab='Faculty')
abline(h=0,lty=2)
@
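\medskip
As a supplementary check (not required by the question), equality of variances
across the Faculties can also be assessed with a formal test; a minimal sketch
using \texttt{bartlett.test} from base \texttt{R}, applied to the \texttt{Q1data}
frame loaded above:
<<>>=
# Bartlett's test of homogeneity of variances across the three
# Faculties; the test assumes approximate normality within groups
bartlett.test(Score~Faculty,data=Q1data)
@
With only 45 observations split across three groups, this test has limited power,
so a non-significant result would not settle the question conclusively.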
\end{enumerate}

\pagebreak
\item
\begin{enumerate}[label=(\alph*),topsep=1pt,itemsep=1ex,partopsep=1ex,parsep=1ex]
\item Computing these quantities is straightforward in \texttt{R}:
<<>>=
Q2data<-read.csv('Filter.csv',header=TRUE)
mod1<-lm(noise~1,data=Q2data);SS1<-round(anova(mod1)[1,2]/1000,3)
mod2<-lm(noise~carsize,data=Q2data);SS2<-round(anova(mod2)[2,2]/1000,3)
mod3<-lm(noise~type,data=Q2data);SS3<-round(anova(mod3)[2,2]/1000,3)
mod4<-lm(noise~carsize+type,data=Q2data);SS4<-round(anova(mod4)[3,2]/1000,3)
mod5<-lm(noise~carsize*type,data=Q2data);SS5<-round(anova(mod5)[4,2]/1000,3)
@
We may then collect them in a table as follows:
\begin{table}[h]
\centering
\begin{tabular}{|c|r|c|}
\hline
Model & $\SSRes (\times 10^{-3})$ & $p$ \\
\hline
$1$ & \Sexpr{SS1} & 1 \\[6pt]
$1+X_1$ & \Sexpr{SS2} & \Sexpr{anova(mod2)[1,1]+1} \\[6pt]
$1+X_2$ & \Sexpr{SS3} & \Sexpr{anova(mod3)[1,1]+1}\\[6pt]
$1+X_1+X_2$ & \Sexpr{SS4} & \Sexpr{sum(anova(mod4)[1:2,1])+1}\\[6pt]
$1+X_1+X_2+X_1 : X_2$ & \Sexpr{SS5} & \Sexpr{sum(anova(mod5)[1:3,1])+1} \\[6pt]
\hline
\end{tabular}
\end{table}
\hfill{5 Marks}
\item We can do this easily from first principles
<<>>=
n<-nrow(Q2data)
fit1<-lm(noise~carsize,data=Q2data)
fit2<-lm(noise~carsize*type,data=Q2data)
SSRes1<-sum(residuals(fit1)^2)
p1<-length(coef(fit1))
SSRes2<-sum(residuals(fit2)^2)
p2<-length(coef(fit2))
(fstat<-((SSRes1-SSRes2)/(p2-p1))/(SSRes2/(n-p2)))
(pvalue<-1-pf(fstat,p2-p1,n-p2))
@
or using \texttt{anova} in \texttt{R}:
<<>>=
anova(fit1,fit2,test='F')
@
\hfill{3 Marks}
\pagebreak
\end{enumerate}
\item First we explore the structure numerically (using correlations) and
graphically (using a scatterplot matrix):
<<>>=
Q3data<-read.csv('PatSat.csv',header=TRUE)
pairs(Q3data,pch=19)
Q3num<-Q3data;Q3num$Surgery<-as.numeric(Q3num$Surgery)-1
cor(Q3num)
@
There appear to be some strong associations between the variables.
\medskip

In a direct comparison between the surgery groups, it seems that surgery has no
effect on satisfaction level:
<<>>=
fit0<-lm(Satisfaction~Surgery,data=Q3data)
print(summary(fit0), concise=TRUE)
@
The task now is to assess whether any effect is being masked by the other
variables. We start with the main effects model:
<<>>=
fit1<-lm(Satisfaction~Age+Severity+Surgery+Anxiety,data=Q3data)
print(summary(fit1), concise=TRUE)
drop1(fit1,test='F')
@
It seems we can drop the predictors \texttt{Surgery} and \texttt{Anxiety}: we
update the model by omitting these variables, and compute the summaries and
comparison statistics.
\pagebreak
<<>>=
fit2<-update(fit1,~.-Surgery-Anxiety)
print(summary(fit2), concise=TRUE)
anova(fit2,fit1,test='F')
AIC(fit1,fit2)
@
We now check whether the introduction of interactions has any effect:
<<>>=
fit3<-update(fit2,~.+Surgery*Age*Anxiety)
fit4<-update(fit3,~.-Surgery:Age:Anxiety)
anova(fit2,fit4,fit3,test='F')
AIC(fit2,fit4,fit3)
@
These models do not seem to improve the fit, so we attempt other models:
<<>>=
fit5<-update(fit2,~.+Age:Severity)
fit6<-update(fit5,~.+Surgery)
anova(fit2,fit5,fit6,test='F')
AIC(fit2,fit5,fit6)
BIC(fit2,fit5,fit6)
@
The model \texttt{Age + Severity} still seems to be preferred. We can also attempt
some automatic selection methods using the \texttt{step} function and different
starting models: we make comparisons using AIC for convenience, although other
criteria can be used.
<<>>=
step.fit1<-step(lm(Satisfaction~Age*Severity*Anxiety*Surgery,data=Q3data),k=2,trace=0)
print(summary(step.fit1),concise=T)
@
Starting from the model containing up to the four-way interaction, \texttt{step}
simplifies it to the model above, which retains some three-way interactions. The
$R^2$ statistic rises to \Sexpr{round(summary(step.fit1)$r.squared,4)}, although
the adjusted $R^2$ statistic is lower, at
\Sexpr{round(summary(step.fit1)$adj.r.squared,4)}. More compellingly, however, the
AIC value is \Sexpr{round(AIC(step.fit1),3)}, indicating a model inferior to
\texttt{fit2} above. We also note that some of the estimated standard errors are
large, indicating some multicollinearity and variance inflation.
\medskip

We now attempt some simplification by dropping the three-way interactions
<<>>=
step.fit11<-update(step.fit1,~.-Age:Severity:Anxiety-Age:Anxiety:Surgery)
anova(step.fit11,step.fit1)
AIC(fit2,step.fit11)
@
which seems a legitimate simplification. We then re-attempt an automatic fit using
\texttt{step}, but with a different starting model that includes all two-way
interactions:
<<>>=
step.fit2<-step(lm(Satisfaction~(Age+Severity+Anxiety+Surgery)^2,data=Q3data),k=2,trace=0)
print(summary(step.fit2),concise=T)
AIC(fit2,fit5,step.fit2)
@
The AIC values inform us that the three models \texttt{fit2}, \texttt{fit5} and
\texttt{step.fit2} have very similar qualities; recall that these three models are
\begin{itemize}
\item \texttt{fit2}: \texttt{\Sexpr{gsub('~','=',format(formula(fit2)))}}
\item \texttt{fit5}: \texttt{\Sexpr{gsub('~','=',format(formula(fit5)))}}
\item \texttt{step.fit2}: \texttt{\Sexpr{gsub('~','=',format(formula(step.fit2)))}}
\end{itemize}
\begin{table}[ht]
\centering
\begin{tabular}{|l|c|c|c|c|}
\hline
Model & $\SSRes$ & AIC & BIC & $R_\text{Adj}^2$ \\
\hline
\texttt{fit2} & \Sexpr{round(anova(fit2)[3,2],3)} & \Sexpr{round(AIC(fit2),3)} & \Sexpr{round(BIC(fit2),3)} & \Sexpr{round(summary(fit2)$adj.r.squared,3)} \\
\texttt{fit5} & \Sexpr{round(anova(fit5)[4,2],3)} & \Sexpr{round(AIC(fit5),3)} & \Sexpr{round(BIC(fit5),3)} & \Sexpr{round(summary(fit5)$adj.r.squared,3)} \\
\texttt{step.fit2} & \Sexpr{round(anova(step.fit2)[6,2],3)} & \Sexpr{round(AIC(step.fit2),3)} & \Sexpr{round(BIC(step.fit2),3)} & \Sexpr{round(summary(step.fit2)$adj.r.squared,3)} \\
\hline
\end{tabular}
\end{table}
These three models could be equally well supported by the data, and only the last
depends on the \texttt{Surgery} factor. Therefore the conclusion is not at all
clear. To attempt to quantify the effect of surgery, we look at the fitted values
from the last model when \texttt{Surgery} is set to \texttt{No} for all patients,
and then to \texttt{Yes} for all patients, and examine the difference in
satisfaction score.
<<>>=
No.data<-Q3data;No.data$Surgery<-as.factor('No')
Yes.data<-Q3data;Yes.data$Surgery<-as.factor('Yes')
No.fit<-predict(step.fit2,newdata=No.data)
Yes.fit<-predict(step.fit2,newdata=Yes.data)
par(mar=c(4,4,1,1))
plot(Yes.fit-No.fit,pch=19,xlab='Patient',ylab='Predicted Difference in Satisfaction Yes-No')
abline(h=0,lty=2)
@
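To supplement the plot (an additional check, not part of the marked solution), we
can summarize the predicted \texttt{Yes}--\texttt{No} differences and inspect
interval estimates for the coefficients of \texttt{step.fit2}:
<<>>=
# distribution of the predicted differences in satisfaction
summary(Yes.fit-No.fit)
# 95% confidence intervals: Surgery-related intervals containing
# zero are consistent with a non-significant surgery effect
confint(step.fit2)
@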
We might therefore conclude that surgery has a small positive effect on patient
satisfaction overall (most of these differences between \texttt{Surgery==Yes} and
\texttt{Surgery==No} are positive), but it is not clear that this effect is
statistically significant.
\hfill{6 Marks}
\end{enumerate}

\pagebreak
{\textsclarge{Extra Question For Students In Math 533}}
\hrule
In each case we can extract the design matrix using \texttt{model.matrix()} in
\texttt{R}:
\medskip
\begin{enumerate}[label=(\alph*),topsep=2ex,itemsep=1ex,partopsep=1ex,parsep=2ex]
\item \textbf{the main effect model in Q1;}
<<>>=
Xa<-model.matrix(fitQ1)[,]
(XatXa<-data.matrix(t(Xa) %*% Xa))
@
\item \textbf{the main effects only model in Q2;}
<<>>=
Xb<-model.matrix(mod4)[,]
colnames(Xb)<-c('Int','car=medium','car=small','type=Octel')
(XbtXb<-data.matrix(t(Xb) %*% Xb))
@
\item \textbf{the main effects plus interaction model in Q2.}
<<>>=
Xc<-model.matrix(mod5)[,]
colnames(Xc)<-c('Int','car=medium','car=small','type=Octel','med:Oct.','small:Oct.')
(XctXc<-data.matrix(t(Xc) %*% Xc))
@
\end{enumerate}
These matrices are not diagonal, and therefore the dummy predictors given by the
indicator functions on page 1 are not orthogonal.
\hfill{3 Marks}
\medskip

The simplest way to obtain an orthogonal parameterization in (a) is to
reparameterize using the group means, as in Q1 (b), by removing the intercept:
\hfill{2 Marks}
<<>>=
fitQ2<-lm(Score~-1+Faculty,data=Q1data)
Xd<-model.matrix(fitQ2)[,]
(XdtXd<-data.matrix(t(Xd) %*% Xd))
@
However, note that this does NOT work for multi-factor models. For example,
<<>>=
mod4.2<-lm(noise~-1+carsize+type,data=Q2data)
Xe<-model.matrix(mod4.2)[,]
colnames(Xe)<-c('car=large','car=medium','car=small','type=Octel')
(XetXe<-data.matrix(t(Xe) %*% Xe))
@
In order to obtain a fully orthogonal general representation, \textit{polynomial}
contrasts can be used. The \texttt{contr.poly} function defines a new
parameterization in which the predictor columns are mutually orthogonal:
<<>>=
Q2data.poly<-Q2data
contrasts(Q2data.poly$carsize)<-contr.poly(3)
contrasts(Q2data.poly$type)<-contr.poly(2)
mod4.3<-lm(noise~carsize+type,data=Q2data.poly)
Xf<-model.matrix(mod4.3)[,]
(XftXf<-round(data.matrix(t(Xf) %*% Xf),6))
mod5.1<-lm(noise~carsize*type,data=Q2data.poly)
Xg<-model.matrix(mod5.1)[,]
colnames(Xg)<-c('Int','c.L','c.Q','t.L','c.L:t.L','c.Q:t.L')
(XgtXg<-round(data.matrix(t(Xg) %*% Xg),6))
@
The \texttt{contr.poly} parameterization represents the predictor columns using
orthogonal polynomials. The resulting contrasts are especially useful when the
factor levels are believed to be equally spaced on an underlying continuum: the
levels of a factor on an underlying ordinal scale are represented by real values,
and for a factor with $M$ levels the effects of the levels are represented by
polynomials of increasing order $1,2,\ldots,M-1$, so that one may look for
polynomial patterns in the data. See, for example,
\[
\texttt{http://www.ats.ucla.edu/stat/r/library/contrast\underline{ }coding.htm}
\]
Alternatively, a direct reparameterization into an orthogonal matrix can be
obtained by decomposing any $\bX$ matrix: for example, using standard Gram-Schmidt
or QR decomposition, as in the sketch below.
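\medskip
As an illustrative sketch of this last approach (not part of the marked solution),
we can orthogonalize the design matrix \texttt{Xb} from part (b) via its QR
decomposition; the column space, and hence the fitted values and ANOVA tests, are
unchanged:
<<>>=
# Q has orthonormal columns spanning the same column space as Xb
Q<-qr.Q(qr(Xb))
round(t(Q) %*% Q,6)                        # identity: columns are orthogonal
fit.orth<-lm(noise~-1+Q,data=Q2data)
max(abs(fitted(fit.orth)-fitted(mod4)))    # same fitted values as mod4
@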
\medskip
Note, however, that such a reparameterization typically changes the interpretation
of the estimates, and will change the results of inference for the individual
coefficients, but not the ANOVA tests; for example
<<>>=
round(summary(mod5)$coef,3)
round(summary(mod5.1)$coef,3)
anova(mod5,test='F')
anova(mod5.1,test='F')
@
\end{document}