\documentclass[notitlepage]{article}
\usepackage{Math}
%\usepackage{../../Math598}
\usepackage{listings}
\usepackage{numprint}
\usepackage{enumerate}
\usepackage{bm}
\usepackage{bbm}
\usepackage{verbatim}
\usepackage{mathtools}
\usepackage{dsfont}
\usepackage{scalerel}
\usepackage{amsfonts,amsmath,amssymb}
\usepackage{tikz}
\usetikzlibrary{shapes,decorations,arrows,calc,arrows.meta,fit,positioning}
\usepackage{pgfplots}
\pgfplotsset{compat=1.17}
\newcommand*\circled[1]{\tikz[baseline=(char.base)]{
  \node[shape=circle,draw,inner sep=2pt] (char) {#1};}}
\usepackage{xcolor}
\newtheorem{alg}{Algorithm}
\newtheorem{ex}{Example}
\newtheorem{note}{Note}
\newenvironment{proof}{\par\noindent{\normalfont{\bf{Proof }}}}
{\hfill \small$\blacksquare$\\[2mm]}
\setlength{\parindent}{0in}
\def\E{\mathbbmss{E}}
\def\Var{\text{Var}}
\def\Cov{\text{Cov}}
\def\Corr{\text{Corr}}
\def\bW{\mathbf{W}}
\def\bH{\mathbf{H}}
\newcommand{\bs}[1]{\bm{#1}}
\newcommand{\Rho}{\mathrm{P}}
\newcommand{\cif}{\text{ if }}
\newcommand{\Ra}{\Longrightarrow}
\def\bphi{\boldsymbol \phi}
\lstloadlanguages{R}
\definecolor{keywordcolor}{rgb}{0,0.6,0.6}
\definecolor{delimcolor}{rgb}{0.461,0.039,0.102}
\definecolor{Rcommentcolor}{rgb}{0.101,0.043,0.432}
\lstdefinestyle{Rsettings}{
  basicstyle=\ttfamily,
  breaklines=true,
  showstringspaces=false,
  keywords={if, else, function, theFunction, tmp}, % Write as many keywords
  otherkeywords={},
  commentstyle=\itshape\color{Rcommentcolor},
  keywordstyle=\color{keywordcolor},
  moredelim=[s][\color{delimcolor}]{"}{"},
}
\lstset{basicstyle=\ttfamily, numbers=none, literate={~} {$\sim$}{2}}
\def\Xb{\mathbf{X}_{\beta}}
\def\Xp{\mathbf{X}_{\psi}}
\def\beps{\boldsymbol{\epsilon}}
\def\betahat{\widehat \beta}
\def\psihat{\widehat \psi}
\begin{document}
<<>>=
library(knitr)
# global chunk options
opts_chunk$set(cache=TRUE, autodep=TRUE)
options(scipen=6)
options(repos=c(CRAN="https://cloud.r-project.org/"))
knit_hooks$set(document = function(x) {sub('\\usepackage[]{color}', '\\usepackage{xcolor}', x, fixed = TRUE)})
@
\begin{center}
{\textsclarge{Semiparametric Theory for the Linear Model}}
\end{center}
\tikzset{
-Latex,auto,node distance =1 cm and 1 cm,semithick,
state/.style ={circle, draw, minimum width = 0.7 cm},
box/.style ={rectangle, draw, minimum width = 0.7 cm, fill=lightgray},
point/.style = {circle, draw, inner sep=0.08cm,fill,node contents={}},
bidirected/.style={Latex-Latex,dashed},
el/.style = {inner sep=3pt, align=left, sloped}
}
For the $p \times 1$ system of estimating equations
\[
\sum_{i=1}^n \bU_i(\theta) = \zerovec
\]
with $\E[\bU(\theta_0)] = \zerovec$, we have that
\[
\sqrt{n}(\widehat \theta_n - \theta_0) \CiD Normal_p(\zerovec,\bV)
\]
where
\[
\bV = \mathcal{J}^{-1} \mathcal{I} \mathcal{J}^{-\top}
\]
with
\[
\mathcal{I} = \E[ \bU(\theta_0) \bU(\theta_0)^\top] \qquad \qquad \mathcal{J} = \E[\dot{\bU}(\theta_0)]
\]
both $(p \times p)$ matrices, and
\[
\dot{\bU}(\theta_0) = -\dfrac{\partial \bU(\theta)}{\partial \theta^\top} \bigg |_{\theta = \theta_0}.
\]
We proceed by computing estimates, $\widehat I_n$ and $\widehat J_n$, of the two matrices based on the observed data,
\[
\widehat I_n = \frac{1}{n} \sum_{i=1}^n \bU_i(\widehat \theta_n)\{\bU_i(\widehat \theta_n)\}^\top \qquad \widehat J_n = \frac{1}{n} \sum_{i=1}^n \dot{\bU}_i(\widehat \theta_n)
\]
and then computing
\[
\widehat \bV = \widehat J_n^{-1} \widehat I_n \widehat J_n^{-\top}
\]
to estimate the asymptotic variance.
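\medskip
As a minimal numerical sketch of this recipe (not part of the development above), consider the scalar estimating function $U(\theta) = Y - \theta$, whose root is the sample mean. Here $\dot U \equiv 1$, so the sandwich variance reduces to the sample variance of $Y$; the data-generating choice below is arbitrary and for illustration only.
<<>>=
# Minimal sketch of the sandwich recipe for U(theta) = Y - theta,
# whose root is the sample mean; here J = 1.  The exponential sample
# is an arbitrary illustrative choice.
set.seed(41)
y <- rexp(500, rate=0.5)
theta.hat <- mean(y)          # solves sum_i U_i(theta) = 0
U <- y - theta.hat            # U_i evaluated at theta.hat
Ihat <- mean(U^2)             # (1/n) sum_i U_i U_i^T
Jhat <- 1                     # dot{U}_i = 1 for every i
Vhat <- Ihat/Jhat^2           # J^{-1} I J^{-T}
sqrt(Vhat/length(y))          # estimated standard error of theta.hat
sd(y)/sqrt(length(y))         # essentially the usual s/sqrt(n)
@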
\medskip

In the linear model, suppose that we have parameter $\theta = (\psi^\top,\beta^\top)^\top$, with parameter of interest $\psi$ $(q \times 1)$ and nuisance parameter $\beta$ $(r \times 1)$. The estimating function (for a single data point) is
\[
\bU(\theta) = \bx^\top (Y - \bx \theta) = \begin{bmatrix} \bx_2^\top \\[6pt] \bx_1^\top \end{bmatrix} (Y - \bx_2 \psi - \bx_1 \beta)
\]
where $\bx_1$ and $\bx_2$ are $1 \times r$ and $1 \times q$ row vectors, respectively, so that $\bx = (\bx_2,\bx_1)$ is $1 \times p$. We have
\[
\dot{\bU}(\theta) = \begin{bmatrix} \bx_2^\top \bx_2 & \bx_2^\top \bx_1 \\[6pt] \bx_1^\top \bx_2 & \bx_1^\top \bx_1 \end{bmatrix} = \bx^\top \bx \equiv \dot{\bU}
\]
which is a $p \times p$ symmetric block matrix that does not depend on $\theta$. We have that
\[
\calI(\theta_0) = \E[\bx^\top (Y - \bx \theta_0)^2 \bx] = \E[ (Y - \bx \theta_0)^2 \bx^\top\bx]
\]
as $(Y - \bx \theta_0)$ is a scalar quantity, and
\[
\calJ = \E[\bx^\top \bx].
\]
The estimators would be
\[
\widehat I_n = \frac{1}{n} \sum_{i=1}^n (y_i - \bx_i \widehat \theta_n )^2 \bx_i^\top \bx_i \qquad \widehat J_n = \frac{1}{n} \sum_{i=1}^n \bx_i^\top\bx_i
\]
with $\widehat \theta_n$ solving the OLS system.
\begin{enumerate}[1.]
\item Under the most common correctly specified, homoscedastic (i.e., constant variance) model, for iid samples from the joint distribution,
\[
\E_{Y|X}[Y-\bx \theta_0|\bx] = 0 \qquad \E_{Y|X}[(Y-\bx \theta_0)^2|\bx] = \sigma^2
\]
so that by iterated expectation
\[
\calI(\theta_0) = \sigma^2 \E[\bx^\top\bx]
\]
and hence
\[
\bV = \sigma^2 \{ \E[\bx^\top\bx] \}^{-1}.
\]
For the influence function for $\theta$, we have from the theory of $m$-estimation that
\[
\varphi(X,Y) = \{\calJ(\theta_0) \}^{-1} \bU(\theta_0) = \{\E[\bx^\top \bx]\}^{-1} \bx^\top (Y - \bx\theta_0).
\]
Therefore we also have that
\[
\sqrt{n}(\widehat \theta_n - \theta_0) \CiD Normal_p(\zerovec,\sigma^2 \{\E[\bx^\top \bx]\}^{-1}).
\]
\item Focussing on the parameter of interest alone, we now seek a suitable influence or estimating function for estimating $\psi$. As this is a $q$-dimensional parameter, we seek a $q$-dimensional function.
\medskip

The theory from lectures suggests that we can derive a suitable estimating equation by projecting the estimating function for $\psi$ onto the nuisance tangent space and taking the residual. Writing $\varepsilon = Y - \bx_2 \psi_0 - \bx_1 \beta_0$, the nuisance tangent space in this case is merely the linear space $\Lambda$ spanned by the nuisance score $\bx_1^\top \varepsilon$, that is
\[
\Lambda = \{\bB \bx_1^\top \varepsilon \: : \: \bB \text{ a $q \times r$ matrix} \}.
\]
\medskip

We have
\[
\bU(\theta_0) = \begin{bmatrix} \bU_\psi(\theta_0) \\[6pt] \bU_\beta(\theta_0) \end{bmatrix} = \begin{bmatrix} \bx_2^\top \varepsilon \\[6pt] \bx_1^\top \varepsilon \end{bmatrix}.
\]
The projection of $\bU_\psi(\theta_0)$ onto $\Lambda$ is computed by finding $\bB_0$ (a $q \times r$ matrix) such that
\[
\E[ (\bx_2^\top \varepsilon - \bB_0 \bx_1^\top \varepsilon)^\top \bB \bx_1^\top \varepsilon ] = 0
\]
for all $\bB$. As per the results on slide p389, we can write explicitly
\[
\bB_0 = \E[ \varepsilon^2 \bx_2^\top\bx_1] \{\E[\varepsilon^2 \bx_1^\top \bx_1]\}^{-1}
\]
and hence the projection as
\[
\bB_0 \bx_1^\top \varepsilon = \E[ \varepsilon^2 \bx_2^\top\bx_1] \{\E[\varepsilon^2 \bx_1^\top \bx_1]\}^{-1} \bx_1^\top \varepsilon.
\]
Therefore the residual is
\[
\bx_2^\top \varepsilon - \E[ \varepsilon^2 \bx_2^\top\bx_1] \{\E[\varepsilon^2 \bx_1^\top \bx_1]\}^{-1} \bx_1^\top \varepsilon.
\]
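\medskip
The chunk below is a minimal numerical sketch (not taken from the lecture code) of this projection: it simulates data with arbitrary illustrative design, coefficients and a heteroscedastic error model, forms the sample analogue of $\bB_0$, and confirms that the resulting residual is empirically orthogonal to the nuisance score $\bx_1^\top \varepsilon$.
<<>>=
# Minimal sketch: empirical check of the projection onto the nuisance
# tangent space.  Design, coefficients and error model are arbitrary
# illustrative choices.
set.seed(2077)
n <- 5000
x1 <- cbind(1, rnorm(n))                    # rows are the 1 x r vectors x_{i1}
x2 <- cbind(rnorm(n), rnorm(n))             # rows are the 1 x q vectors x_{i2}
eps <- rnorm(n) * (1 + 0.5*abs(x2[,1]))     # heteroscedastic eps = Y - x2 psi0 - x1 beta0
# Sample analogues of E[eps^2 x2^T x1] and E[eps^2 x1^T x1]
A <- crossprod(x2*eps, x1*eps)/n            # q x r
B <- crossprod(x1*eps, x1*eps)/n            # r x r
B0.hat <- A %*% solve(B)                    # estimate of B_0 (q x r)
# Per-observation projected residual and nuisance score
U.psi <- x2*eps                             # row i is (x_{i2}^T eps_i)^T
U.beta <- x1*eps                            # row i is (x_{i1}^T eps_i)^T
res <- U.psi - U.beta %*% t(B0.hat)
# Empirical orthogonality: (1/n) sum_i res_i (x_{i1}^T eps_i)^T is zero
round(crossprod(res, U.beta)/n, 12)
@
Because $\widehat{\bB}_0$ solves the sample normal equations, the displayed $q \times r$ matrix is zero to machine precision.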
\item Under the homoscedastic model, by iterated expectation, $\E[\varepsilon^2 \bx_2^\top\bx_1] = \sigma^2\E[\bx_2^\top\bx_1]$ and $\E[\varepsilon^2 \bx_1^\top\bx_1] = \sigma^2\E[\bx_1^\top\bx_1]$, so the $\sigma^2$ factors cancel and the residual becomes
\[
\bx_2^\top \varepsilon - \E[\bx_2^\top\bx_1] \{\E[\bx_1^\top \bx_1]\}^{-1} \bx_1^\top \varepsilon.
\]
We can estimate the two expectations using sample averages
\[
\widehat \E[\bx_2^\top\bx_1] = \frac{1}{n} \sum_{i=1}^n \bx_{i2}^\top\bx_{i1} = \frac{1}{n} \bX_2^\top \bX_1
\]
and
\[
\widehat \E[\bx_1^\top \bx_1] = \frac{1}{n} \sum_{i=1}^n \bx_{i1}^\top\bx_{i1} = \frac{1}{n} \bX_1^\top \bX_1
\]
where
\[
\bX_1 = \begin{bmatrix} \bx_{11} \\[6pt] \vdots \\[6pt] \bx_{n1} \end{bmatrix} \qquad \bX_2 = \begin{bmatrix} \bx_{12} \\[6pt] \vdots \\[6pt] \bx_{n2} \end{bmatrix}
\]
which are $n \times r$ and $n \times q$ matrices, respectively. This suggests the estimating equation
\[
\sum_{i=1}^n \left(\bx_{i2}^\top \varepsilon_i - \bX_2^\top \bX_1 (\bX_1^\top \bX_1)^{-1} \bx_{i1}^\top \varepsilon_i \right) = \zerovec
\]
or equivalently
\[
\left(\bX_2^\top - \bX_2^\top \bX_1 (\bX_1^\top \bX_1)^{-1} \bX_1^\top \right) \epsilon = \zerovec
\]
where $\epsilon = (\varepsilon_1,\ldots,\varepsilon_n)^\top$.
\item Defining
\[
\bH_1 = \bX_1 (\bX_1^\top \bX_1)^{-1} \bX_1^\top
\]
we see that the estimating equation is
\[
\left(\bX_2^\top - \bX_2^\top \bH_1 \right) \epsilon = \zerovec
\]
or equivalently
\[
\bX_2^\top\left(\Ident_n - \bH_1 \right) (\by - \bX_2 \psi - \bX_1 \beta) = \zerovec.
\]
This is precisely the estimating equation arrived at by the two-step approach that solves for $\beta$ for fixed $\psi$, and then plugs the solution back into the estimating equation to solve for $\psi$ (lecture slides pp 368--378). As
\[
\left(\Ident_n - \bH_1 \right) \bX_1 = \zerovec,
\]
the $n \times r$ zero matrix, the equation can also be written
\[
\bX_2^\top\left(\Ident_n - \bH_1 \right) (\by - \bX_2 \psi) = \zerovec.
\]
The solution is
\[
\widehat \psi = \left( \bX_2^\top\left(\Ident_n - \bH_1 \right) \bX_2 \right)^{-1} \bX_2^\top\left(\Ident_n - \bH_1 \right) \by.
\]
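\medskip
The chunk below is a minimal numerical sketch (not taken from the lecture code) of this equivalence: on arbitrary simulated data (the names \texttt{Z1}, \texttt{Z2}, \texttt{yz} and the coefficient values are illustrative choices), the closed-form expression for $\widehat \psi$ above is compared with the two-step computation that first residualises $\by$ and $\bX_2$ on $\bX_1$ and then regresses residuals on residuals.
<<>>=
# Minimal sketch: psi-hat from the closed form above equals the two-step
# (residual-on-residual) computation.  Data and coefficients are arbitrary.
set.seed(314)
m <- 400
Z1 <- cbind(1, rnorm(m))                     # nuisance design (r = 2)
Z2 <- cbind(rnorm(m), rbinom(m, 1, 0.5))     # design of interest (q = 2)
yz <- drop(Z1 %*% c(1,-1) + Z2 %*% c(2,0.5) + rnorm(m))
H1z <- Z1 %*% solve(crossprod(Z1)) %*% t(Z1) # projection onto the columns of Z1
M1z <- diag(m) - H1z
psi.direct <- solve(t(Z2) %*% M1z %*% Z2, t(Z2) %*% M1z %*% yz)
# Two-step: residualise yz and Z2 on Z1, then regress residuals on residuals
psi.twostep <- coef(lm(resid(lm(yz ~ Z1 - 1)) ~ resid(lm(Z2 ~ Z1 - 1)) - 1))
cbind(psi.direct, psi.twostep)
@
The two columns printed by the final line agree, as the algebra above guarantees (this is the familiar residual-on-residual, or Frisch--Waugh--Lovell, decomposition).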
\item In this case, we get the \textbf{same} solution for $\psi$ whether we solve the joint estimating equation or the projected estimating equation. Under the homoscedastic model, using the joint estimating equation, we have asymptotic variance
\[
\bV = \sigma^2 \{ \E[\bx^\top\bx] \}^{-1}
\]
where
\[
\E[\bx^\top\bx] = \E \begin{bmatrix} \bx_2^\top \bx_2 & \bx_2^\top \bx_1 \\[6pt] \bx_1^\top \bx_2 & \bx_1^\top \bx_1 \end{bmatrix} = \begin{bmatrix} \bV_{22} & \bV_{21} \\[6pt] \bV_{12} & \bV_{11} \end{bmatrix}
\]
say, so by standard results for the inverse of a block matrix, the asymptotic variance for $\psi$ is
\[
\sigma^2 \left( \bV_{22} - \bV_{21} \bV_{11}^{-1} \bV_{12} \right)^{-1} .
\]
Directly from the projected estimating function, which under homoscedasticity (where $\E[\varepsilon^2 \bx_2^\top\bx_1] = \sigma^2 \bV_{21}$ and $\E[\varepsilon^2 \bx_1^\top\bx_1] = \sigma^2 \bV_{11}$) reduces to
\[
\bU_\psi^\proj (\theta_0) = \bx_2^\top \varepsilon - \E[ \varepsilon^2 \bx_2^\top\bx_1] \{\E[\varepsilon^2 \bx_1^\top \bx_1]\}^{-1} \bx_1^\top \varepsilon = \left(\bx_2^\top - \bV_{21} \bV_{11}^{-1} \bx_1^\top \right) \varepsilon,
\]
we have that the asymptotic variance is
\[
\sigma^2 \left\{ \E\left[\left(\bx_2^\top - \bV_{21} \bV_{11}^{-1} \bx_1^\top \right) \left(\bx_2^\top - \bV_{21} \bV_{11}^{-1} \bx_1^\top \right)^\top \right] \right\}^{-1}.
\]
Now
\[
\left(\bx_2^\top - \bV_{21} \bV_{11}^{-1} \bx_1^\top \right) \left(\bx_2^\top - \bV_{21} \bV_{11}^{-1} \bx_1^\top \right)^\top = \bx_2^\top \bx_2 - \bx_2^\top \bx_1 \bV_{11}^{-1} \bV_{12} - \bV_{21} \bV_{11}^{-1} \bx_1^\top \bx_2 + \bV_{21} \bV_{11}^{-1} \bx_1^\top \bx_1 \bV_{11}^{-1} \bV_{12}
\]
and taking expectations yields
\[
\bV_{22} - \bV_{21} \bV_{11}^{-1} \bV_{12} - \bV_{21} \bV_{11}^{-1} \bV_{12} + \bV_{21} \bV_{11}^{-1} \bV_{11} \bV_{11}^{-1} \bV_{12} = \bV_{22} - \bV_{21} \bV_{11}^{-1} \bV_{12}
\]
and the two asymptotic variances match.
\end{enumerate}
<<>>=
#Numerical verification
set.seed(1293)
library(mvnfast)
muX<-c(-1,2,1)
Sig<-0.9^abs(outer(1:3,1:3,'-'))
n<-1000
X<-rmvn(n,muX,Sig)
th<-c(1,1,1,1)
Xm<-cbind(1,X)
Y<- Xm %*% th + rnorm(n)*2
coef(summary(lm(Y~X)))

X1<-Xm[,1:2]  #Take beta to be the first two elements of theta
X2<-Xm[,3:4]  #Take psi to be the last two elements of theta

#Joint estimating equation
th.hat<-solve(t(Xm) %*% Xm) %*% (t(Xm) %*% Y)
th.hat
Y.hat0<- Xm %*% th.hat
sigsq.hat<-sum((Y-Y.hat0)^2)/(n-length(th.hat))
sigsq.hat

#Variance under homoscedasticity
V<-sigsq.hat * solve((t(Xm) %*% Xm)/n)/n
V
sqrt(diag(V))  #Matches the standard error column above

#Variance calc with I and J matrices (sandwich form)
Ihat<-(t(Xm) %*% (Xm * matrix(Y-Y.hat0,nrow=n,ncol=4)^2))/n
Jhat<-(t(Xm) %*% Xm)/n
Vhat<-(solve(Jhat) %*% Ihat %*% solve(t(Jhat)))/n
Vhat
sqrt(diag(Vhat))

######################################################################
#Projected estimating equation
I.n<-diag(rep(1,n))
H1<- X1 %*% solve(t(X1) %*% X1) %*% t(X1)
psi.hat<-solve(t(X2) %*% (I.n-H1) %*% X2) %*% (( t(X2)%*%(I.n-H1)) %*% Y)
psi.hat

#Variance estimation
V11<-(t(X1) %*% X1)/n
V12<-(t(X1) %*% X2)/n
V21<-t(V12)
V22<-(t(X2) %*% X2)/n
V<-sigsq.hat*solve(V22-(V21 %*% solve(V11) %*% V12))/n
sqrt(diag(V))

#Check one of the claimed orthogonalities: (I - H1) X1 should be the zero matrix
M<-(I.n-H1) %*% X1
apply(M,2,mean)
apply(M,2,var)
@
\end{document}