\documentclass[notitlepage]{article}
\usepackage{Math}
%\usepackage{../../Math598}
\usepackage{listings}
\usepackage{numprint}
\usepackage{enumerate}
\usepackage{bm}
\usepackage{bbm}
\usepackage{verbatim}
\usepackage{mathtools}
\usepackage{dsfont}
\usepackage{scalerel}
\usepackage{amsfonts,amsmath,amssymb}
\usepackage{tikz}
\usetikzlibrary{shapes,decorations,arrows,calc,arrows.meta,fit,positioning}
\usepackage{pgfplots}
\pgfplotsset{compat=1.17}
\newcommand*\circled[1]{\tikz[baseline=(char.base)]{
  \node[shape=circle,draw,inner sep=2pt] (char) {#1};}}
\usepackage{xcolor}
\newtheorem{alg}{Algorithm}
\newtheorem{ex}{Example}
\newtheorem{note}{Note}
\newenvironment{proof}{\par\noindent{\normalfont{\bf{Proof }}}}
{\hfill \small$\blacksquare$\\[2mm]}
\setlength{\parindent}{0in}
\def\E{\mathbbmss{E}}
\def\Var{\text{Var}}
\def\Cov{\text{Cov}}
\def\Corr{\text{Corr}}
\def\bW{\mathbf{W}}
\def\bH{\mathbf{H}}
\newcommand{\bs}[1]{\bm{#1}}
\newcommand{\Rho}{\mathrm{P}}
\newcommand{\cif}{\text{ if }}
\newcommand{\Ra}{\Longrightarrow}
\def\bphi{\boldsymbol \phi}
\lstloadlanguages{R}
\definecolor{keywordcolor}{rgb}{0,0.6,0.6}
\definecolor{delimcolor}{rgb}{0.461,0.039,0.102}
\definecolor{Rcommentcolor}{rgb}{0.101,0.043,0.432}
\lstdefinestyle{Rsettings}{
  basicstyle=\ttfamily,
  breaklines=true,
  showstringspaces=false,
  keywords={if, else, function, theFunction, tmp}, % Write as many keywords
  otherkeywords={},
  commentstyle=\itshape\color{Rcommentcolor},
  keywordstyle=\color{keywordcolor},
  moredelim=[s][\color{delimcolor}]{"}{"},
}
\lstset{basicstyle=\ttfamily, numbers=none, literate={~} {$\sim$}{2}}
\def\Xb{\mathbf{X}_{\beta}}
\def\Xp{\mathbf{X}_{\psi}}
\def\beps{\boldsymbol{\epsilon}}
\def\betahat{\widehat \beta}
\def\psihat{\widehat \psi}
\begin{document}
<<>>=
library(knitr)
# global chunk options
opts_chunk$set(cache=TRUE, autodep=TRUE)
options(scipen=6)
options(repos=c(CRAN="https://cloud.r-project.org/"))
knit_hooks$set(document = function(x) {sub('\\usepackage[]{color}', '\\usepackage{xcolor}', x, fixed = TRUE)})
@
\begin{center}
{\textsclarge{Semiparametric Theory for the Linear Model}}
\end{center}
\tikzset{
-Latex,auto,node distance =1 cm and 1 cm,semithick,
state/.style ={circle, draw, minimum width = 0.7 cm},
box/.style ={rectangle, draw, minimum width = 0.7 cm, fill=lightgray},
point/.style = {circle, draw, inner sep=0.08cm,fill,node contents={}},
bidirected/.style={Latex-Latex,dashed},
el/.style = {inner sep=3pt, align=left, sloped}
}
For the $p \times 1$ system of estimating equations
\[
\sum_{i=1}^n \bU_i(\theta) = \zerovec
\]
with $\E[\bU(\theta_0)] = \zerovec$, we have that
\[
\sqrt{n}(\widehat \theta_n - \theta_0) \CiD Normal_p(\zerovec,\bV)
\]
where
\[
\bV = \mathcal{J}^{-1} \mathcal{I} \mathcal{J}^{-\top}
\]
with
\[
\mathcal{I} = \E[ \bU(\theta_0) \bU(\theta_0)^\top] \qquad \qquad \mathcal{J} = \E[\dot{\bU}(\theta_0)]
\]
both $(p \times p)$ matrices, and
\[
\dot{\bU}(\theta_0) = -\dfrac{\partial \bU(\theta)}{\partial \theta^\top} \bigg |_{\theta = \theta_0}.
\]
We proceed by computing estimates, $\widehat I_n$ and $\widehat J_n$, of the two matrices based on the observed data,
\[
\widehat I_n = \frac{1}{n} \sum_{i=1}^n \bU_i(\widehat \theta_n)\{\bU_i(\widehat \theta_n)\}^\top \qquad \widehat J_n = \frac{1}{n} \sum_{i=1}^n \dot{\bU}_i(\widehat \theta_n)
\]
and then computing
\[
\widehat \bV = \widehat J_n^{-1} \widehat I_n \widehat J_n^{-\top}
\]
to estimate the asymptotic variance.
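\medskip
As a minimal numerical sketch of this recipe (not part of the development above), consider the scalar estimating function $U(\theta) = Y - \theta$, whose root is the sample mean. Here $\dot U \equiv 1$, so the sandwich variance reduces to the sample variance of $Y$; the data-generating choice below is arbitrary and for illustration only.
<<>>=
# Minimal sketch of the sandwich recipe for U(theta) = Y - theta,
# whose root is the sample mean; here J = 1.  The exponential sample
# is an arbitrary illustrative choice.
set.seed(41)
y <- rexp(500, rate=0.5)
theta.hat <- mean(y)          # solves sum_i U_i(theta) = 0
U <- y - theta.hat            # U_i evaluated at theta.hat
Ihat <- mean(U^2)             # (1/n) sum_i U_i U_i^T
Jhat <- 1                     # dot{U}_i = 1 for every i
Vhat <- Ihat/Jhat^2           # J^{-1} I J^{-T}
sqrt(Vhat/length(y))          # estimated standard error of theta.hat
sd(y)/sqrt(length(y))         # essentially the usual s/sqrt(n)
@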
\medskip

In the linear model, suppose that we have parameter $\theta = (\psi^\top,\beta^\top)^\top$, with parameter of interest $\psi$ $(q \times 1)$ and nuisance parameter $\beta$ $(r \times 1)$. The estimating function (for a single data point) is
\[
\bU(\theta) = \bx^\top (Y - \bx \theta) = \begin{bmatrix} \bx_2^\top \\[6pt] \bx_1^\top \end{bmatrix} (Y - \bx_2 \psi - \bx_1 \beta)
\]
where $\bx_1$ and $\bx_2$ are $1 \times r$ and $1 \times q$ row vectors, respectively, so that $\bx = (\bx_2,\bx_1)$ is $1 \times p$. We have
\[
\dot{\bU}(\theta) = \begin{bmatrix} \bx_2^\top \bx_2 & \bx_2^\top \bx_1 \\[6pt] \bx_1^\top \bx_2 & \bx_1^\top \bx_1 \end{bmatrix} = \bx^\top \bx \equiv \dot{\bU}
\]
which is a $p \times p$ symmetric block matrix that does not depend on $\theta$. We have that
\[
\calI(\theta_0) = \E[\bx^\top (Y - \bx \theta_0)^2 \bx] = \E[ (Y - \bx \theta_0)^2 \bx^\top\bx]
\]
as $(Y - \bx \theta_0)$ is a scalar quantity, and
\[
\calJ = \E[\bx^\top \bx].
\]
The estimators would be
\[
\widehat I_n = \frac{1}{n} \sum_{i=1}^n (y_i - \bx_i \widehat \theta_n )^2 \bx_i^\top \bx_i \qquad \widehat J_n = \frac{1}{n} \sum_{i=1}^n \bx_i^\top\bx_i
\]
with $\widehat \theta_n$ solving the OLS system.
\begin{enumerate}[1.]
\item Under the most common correctly specified, homoscedastic (i.e., constant variance) model, for iid samples from the joint distribution,
\[
\E_{Y|X}[Y-\bx \theta_0|\bx] = 0 \qquad \E_{Y|X}[(Y-\bx \theta_0)^2|\bx] = \sigma^2
\]
so that by iterated expectation
\[
\calI(\theta_0) = \sigma^2 \E[\bx^\top\bx]
\]
and hence
\[
\bV = \sigma^2 \{ \E[\bx^\top\bx] \}^{-1}.
\]
For the influence function for $\theta$, we have from the theory of $m$-estimation that
\[
\varphi(X,Y) = \{\calJ(\theta_0) \}^{-1} \bU(\theta_0) = \{\E[\bx^\top \bx]\}^{-1} \bx^\top (Y - \bx\theta_0).
\]
Therefore we also have that
\[
\sqrt{n}(\widehat \theta_n - \theta_0) \CiD Normal_p(\zerovec,\sigma^2 \{\E[\bx^\top \bx]\}^{-1}).
\]
\item Focussing on the parameter of interest alone, we now seek a suitable influence or estimating function for estimating $\psi$. As this is a $q$-dimensional parameter, we seek a $q$-dimensional function.
\medskip

The theory from lectures suggests that we can derive a suitable estimating equation by projecting the estimating function for $\psi$ onto the nuisance tangent space and taking the residual. Writing $\varepsilon = Y - \bx_2 \psi_0 - \bx_1 \beta_0$, the nuisance tangent space in this case is merely the linear space $\Lambda$ spanned by the nuisance score $\bx_1^\top \varepsilon$, that is
\[
\Lambda = \{\bB \bx_1^\top \varepsilon \: : \: \bB \text{ a $q \times r$ matrix} \}.
\]
\medskip

We have
\[
\bU(\theta_0) = \begin{bmatrix} \bU_\psi(\theta_0) \\[6pt] \bU_\beta(\theta_0) \end{bmatrix} = \begin{bmatrix} \bx_2^\top \varepsilon \\[6pt] \bx_1^\top \varepsilon \end{bmatrix}.
\]
The projection of $\bU_\psi(\theta_0)$ onto $\Lambda$ is computed by finding $\bB_0$ (a $q \times r$ matrix) such that
\[
\E[ (\bx_2^\top \varepsilon - \bB_0 \bx_1^\top \varepsilon)^\top \bB \bx_1^\top \varepsilon ] = 0
\]
for all $\bB$. As per the results on slide p389, we can write explicitly
\[
\bB_0 = \E[ \varepsilon^2 \bx_2^\top\bx_1] \{\E[\varepsilon^2 \bx_1^\top \bx_1]\}^{-1}
\]
and hence the projection as
\[
\bB_0 \bx_1^\top \varepsilon = \E[ \varepsilon^2 \bx_2^\top\bx_1] \{\E[\varepsilon^2 \bx_1^\top \bx_1]\}^{-1} \bx_1^\top \varepsilon.
\]
Therefore the residual is
\[
\bx_2^\top \varepsilon - \E[ \varepsilon^2 \bx_2^\top\bx_1] \{\E[\varepsilon^2 \bx_1^\top \bx_1]\}^{-1} \bx_1^\top \varepsilon.
\]
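\medskip
The chunk below is a minimal numerical sketch (not taken from the lecture code) of this projection: it simulates data with arbitrary illustrative design, coefficients and a heteroscedastic error model, forms the sample analogue of $\bB_0$, and confirms that the resulting residual is empirically orthogonal to the nuisance score $\bx_1^\top \varepsilon$.
<<>>=
# Minimal sketch: empirical check of the projection onto the nuisance
# tangent space.  Design, coefficients and error model are arbitrary
# illustrative choices.
set.seed(2077)
n <- 5000
x1 <- cbind(1, rnorm(n))                    # rows are the 1 x r vectors x_{i1}
x2 <- cbind(rnorm(n), rnorm(n))             # rows are the 1 x q vectors x_{i2}
eps <- rnorm(n) * (1 + 0.5*abs(x2[,1]))     # heteroscedastic eps = Y - x2 psi0 - x1 beta0
# Sample analogues of E[eps^2 x2^T x1] and E[eps^2 x1^T x1]
A <- crossprod(x2*eps, x1*eps)/n            # q x r
B <- crossprod(x1*eps, x1*eps)/n            # r x r
B0.hat <- A %*% solve(B)                    # estimate of B_0 (q x r)
# Per-observation projected residual and nuisance score
U.psi <- x2*eps                             # row i is (x_{i2}^T eps_i)^T
U.beta <- x1*eps                            # row i is (x_{i1}^T eps_i)^T
res <- U.psi - U.beta %*% t(B0.hat)
# Empirical orthogonality: (1/n) sum_i res_i (x_{i1}^T eps_i)^T is zero
round(crossprod(res, U.beta)/n, 12)
@
Because $\widehat{\bB}_0$ solves the sample normal equations, the displayed $q \times r$ matrix is zero to machine precision.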
\item Under the homoscedastic model, by iterated expectation, $\E[\varepsilon^2 \bx_2^\top\bx_1] = \sigma^2\E[\bx_2^\top\bx_1]$ and $\E[\varepsilon^2 \bx_1^\top\bx_1] = \sigma^2\E[\bx_1^\top\bx_1]$, so the $\sigma^2$ factors cancel and the residual becomes
\[
\bx_2^\top \varepsilon - \E[\bx_2^\top\bx_1] \{\E[\bx_1^\top \bx_1]\}^{-1} \bx_1^\top \varepsilon.
\]
We can estimate the two expectations using sample averages
\[
\widehat \E[\bx_2^\top\bx_1] = \frac{1}{n} \sum_{i=1}^n \bx_{i2}^\top\bx_{i1} = \frac{1}{n} \bX_2^\top \bX_1
\]
and
\[
\widehat \E[\bx_1^\top \bx_1] = \frac{1}{n} \sum_{i=1}^n \bx_{i1}^\top\bx_{i1} = \frac{1}{n} \bX_1^\top \bX_1
\]
where
\[
\bX_1 = \begin{bmatrix} \bx_{11} \\[6pt] \vdots \\[6pt] \bx_{n1} \end{bmatrix} \qquad \bX_2 = \begin{bmatrix} \bx_{12} \\[6pt] \vdots \\[6pt] \bx_{n2} \end{bmatrix}
\]
which are $n \times r$ and $n \times q$ matrices, respectively. This suggests the estimating equation
\[
\sum_{i=1}^n \left(\bx_{i2}^\top \varepsilon_i - \bX_2^\top \bX_1 (\bX_1^\top \bX_1)^{-1} \bx_{i1}^\top \varepsilon_i \right) = \zerovec
\]
or equivalently
\[
\left(\bX_2^\top - \bX_2^\top \bX_1 (\bX_1^\top \bX_1)^{-1} \bX_1^\top \right) \epsilon = \zerovec
\]
where $\epsilon = (\varepsilon_1,\ldots,\varepsilon_n)^\top$.
\item Defining
\[
\bH_1 = \bX_1 (\bX_1^\top \bX_1)^{-1} \bX_1^\top
\]
we see that the estimating equation is
\[
\left(\bX_2^\top - \bX_2^\top \bH_1 \right) \epsilon = \zerovec
\]
or equivalently
\[
\bX_2^\top\left(\Ident_n - \bH_1 \right) (\by - \bX_2 \psi - \bX_1 \beta) = \zerovec.
\]
This is precisely the estimating equation arrived at by the two-step approach that solves for $\beta$ for fixed $\psi$, and then plugs the solution back into the estimating equation to solve for $\psi$ (lecture slides pp 368--378). As
\[
\left(\Ident_n - \bH_1 \right) \bX_1 = \zerovec,
\]
the $n \times r$ zero matrix, the equation can also be written
\[
\bX_2^\top\left(\Ident_n - \bH_1 \right) (\by - \bX_2 \psi) = \zerovec.
\]
The solution is
\[
\widehat \psi = \left( \bX_2^\top\left(\Ident_n - \bH_1 \right) \bX_2 \right)^{-1} \bX_2^\top\left(\Ident_n - \bH_1 \right) \by.
\]
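\medskip
The chunk below is a minimal numerical sketch (not taken from the lecture code) of this equivalence: on arbitrary simulated data (the names \texttt{Z1}, \texttt{Z2}, \texttt{yz} and the coefficient values are illustrative choices), the closed-form expression for $\widehat \psi$ above is compared with the two-step computation that first residualises $\by$ and $\bX_2$ on $\bX_1$ and then regresses residuals on residuals.
<<>>=
# Minimal sketch: psi-hat from the closed form above equals the two-step
# (residual-on-residual) computation.  Data and coefficients are arbitrary.
set.seed(314)
m <- 400
Z1 <- cbind(1, rnorm(m))                     # nuisance design (r = 2)
Z2 <- cbind(rnorm(m), rbinom(m, 1, 0.5))     # design of interest (q = 2)
yz <- drop(Z1 %*% c(1,-1) + Z2 %*% c(2,0.5) + rnorm(m))
H1z <- Z1 %*% solve(crossprod(Z1)) %*% t(Z1) # projection onto the columns of Z1
M1z <- diag(m) - H1z
psi.direct <- solve(t(Z2) %*% M1z %*% Z2, t(Z2) %*% M1z %*% yz)
# Two-step: residualise yz and Z2 on Z1, then regress residuals on residuals
psi.twostep <- coef(lm(resid(lm(yz ~ Z1 - 1)) ~ resid(lm(Z2 ~ Z1 - 1)) - 1))
cbind(psi.direct, psi.twostep)
@
The two columns printed by the final line agree, as the algebra above guarantees (this is the familiar residual-on-residual, or Frisch--Waugh--Lovell, decomposition).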
\item In this case, we get the \textbf{same} solution for $\psi$ whether we solve the joint estimating equation or the projected estimating equation. Under the homoscedastic model, using the joint estimating equation, we have asymptotic variance
\[
\bV = \sigma^2 \{ \E[\bx^\top\bx] \}^{-1}
\]
where
\[
\E[\bx^\top\bx] = \E \begin{bmatrix} \bx_2^\top \bx_2 & \bx_2^\top \bx_1 \\[6pt] \bx_1^\top \bx_2 & \bx_1^\top \bx_1 \end{bmatrix} = \begin{bmatrix} \bV_{22} & \bV_{21} \\[6pt] \bV_{12} & \bV_{11} \end{bmatrix}
\]
say, so by standard results for the inverse of a block matrix, the asymptotic variance for $\psi$ is
\[
\sigma^2 \left( \bV_{22} - \bV_{21} \bV_{11}^{-1} \bV_{12} \right)^{-1} .
\]
Directly from the projected estimating function, which under homoscedasticity (where $\E[\varepsilon^2 \bx_2^\top\bx_1] = \sigma^2 \bV_{21}$ and $\E[\varepsilon^2 \bx_1^\top\bx_1] = \sigma^2 \bV_{11}$) reduces to
\[
\bU_\psi^\proj (\theta_0) = \bx_2^\top \varepsilon - \E[ \varepsilon^2 \bx_2^\top\bx_1] \{\E[\varepsilon^2 \bx_1^\top \bx_1]\}^{-1} \bx_1^\top \varepsilon = \left(\bx_2^\top - \bV_{21} \bV_{11}^{-1} \bx_1^\top \right) \varepsilon,
\]
we have that the asymptotic variance is
\[
\sigma^2 \left\{ \E\left[\left(\bx_2^\top - \bV_{21} \bV_{11}^{-1} \bx_1^\top \right) \left(\bx_2^\top - \bV_{21} \bV_{11}^{-1} \bx_1^\top \right)^\top \right] \right\}^{-1}.
\]
Now
\[
\left(\bx_2^\top - \bV_{21} \bV_{11}^{-1} \bx_1^\top \right) \left(\bx_2^\top - \bV_{21} \bV_{11}^{-1} \bx_1^\top \right)^\top = \bx_2^\top \bx_2 - \bx_2^\top \bx_1 \bV_{11}^{-1} \bV_{12} - \bV_{21} \bV_{11}^{-1} \bx_1^\top \bx_2 + \bV_{21} \bV_{11}^{-1} \bx_1^\top \bx_1 \bV_{11}^{-1} \bV_{12}
\]
and taking expectations yields
\[
\bV_{22} - \bV_{21} \bV_{11}^{-1} \bV_{12} - \bV_{21} \bV_{11}^{-1} \bV_{12} + \bV_{21} \bV_{11}^{-1} \bV_{11} \bV_{11}^{-1} \bV_{12} = \bV_{22} - \bV_{21} \bV_{11}^{-1} \bV_{12}
\]
and the two asymptotic variances match.
\end{enumerate}
<<>>=
#Numerical verification
set.seed(1293)
library(mvnfast)
muX<-c(-1,2,1)
Sig<-0.9^abs(outer(1:3,1:3,'-'))
n<-1000
X<-rmvn(n,muX,Sig)
th<-c(1,1,1,1)
Xm<-cbind(1,X)
Y<- Xm %*% th + rnorm(n)*2
coef(summary(lm(Y~X)))

X1<-Xm[,1:2]  #Take beta to be the first two elements of theta
X2<-Xm[,3:4]  #Take psi to be the last two elements of theta

#Joint estimating equation
th.hat<-solve(t(Xm) %*% Xm) %*% (t(Xm) %*% Y)
th.hat
Y.hat0<- Xm %*% th.hat
sigsq.hat<-sum((Y-Y.hat0)^2)/(n-length(th.hat))
sigsq.hat

#Variance under homoscedasticity
V<-sigsq.hat * solve((t(Xm) %*% Xm)/n)/n
V
sqrt(diag(V))  #Matches the standard error column above

#Variance calc with I and J matrices (sandwich form)
Ihat<-(t(Xm) %*% (Xm * matrix(Y-Y.hat0,nrow=n,ncol=4)^2))/n
Jhat<-(t(Xm) %*% Xm)/n
Vhat<-(solve(Jhat) %*% Ihat %*% solve(t(Jhat)))/n
Vhat
sqrt(diag(Vhat))

######################################################################
#Projected estimating equation
I.n<-diag(rep(1,n))
H1<- X1 %*% solve(t(X1) %*% X1) %*% t(X1)
psi.hat<-solve(t(X2) %*% (I.n-H1) %*% X2) %*% (( t(X2)%*%(I.n-H1)) %*% Y)
psi.hat

#Variance estimation
V11<-(t(X1) %*% X1)/n
V12<-(t(X1) %*% X2)/n
V21<-t(V12)
V22<-(t(X2) %*% X2)/n
V<-sigsq.hat*solve(V22-(V21 %*% solve(V11) %*% V12))/n
sqrt(diag(V))

#Check one of the claimed orthogonalities: (I - H1) X1 should be the zero matrix
M<-(I.n-H1) %*% X1
apply(M,2,mean)
apply(M,2,var)
@
\end{document}