\documentclass[notitlepage]{article}
\usepackage{Math}
%\usepackage{../../Math598}
\usepackage{listings}
\usepackage{numprint}
\usepackage{enumerate}
\usepackage{bm}
\usepackage{bbm}
\usepackage{verbatim}
\usepackage{mathtools}
\usepackage{dsfont}
\usepackage{scalerel}
\usepackage{amsfonts,amsmath,amssymb}
\usepackage{tikz}
\usetikzlibrary{shapes,decorations,arrows,calc,arrows.meta,fit,positioning}
\usepackage{pgfplots}
\pgfplotsset{compat=1.17}
\newcommand*\circled[1]{\tikz[baseline=(char.base)]{
\node[shape=circle,draw,inner sep=2pt] (char) {#1};}}
\usepackage{xcolor}
\newtheorem{alg}{Algorithm}
\newtheorem{ex}{Example}
\newtheorem{note}{Note}
\newenvironment{proof}{\par\noindent{\normalfont{\bf{Proof }}}}
{\hfill \small$\blacksquare$\\[2mm]}
\setlength{\parindent}{0in}
\def\E{\mathbbmss{E}}
\def\Var{\text{Var}}
\def\Cov{\text{Cov}}
\def\Corr{\text{Corr}}
\def\bW{\mathbf{W}}
\def\bH{\mathbf{H}}
\newcommand{\bs}[1]{\bm{#1}}
\newcommand{\Rho}{\mathrm{P}}
\newcommand{\cif}{\text{ if }}
\newcommand{\Ra}{\Longrightarrow}
\def\bphi{\boldsymbol \phi}
\lstloadlanguages{R}
\definecolor{keywordcolor}{rgb}{0,0.6,0.6}
\definecolor{delimcolor}{rgb}{0.461,0.039,0.102}
\definecolor{Rcommentcolor}{rgb}{0.101,0.043,0.432}
\lstdefinestyle{Rsettings}{
  basicstyle=\ttfamily,
  breaklines=true,
  showstringspaces=false,
  keywords={if, else, function, theFunction, tmp}, % Write as many keywords
  otherkeywords={},
  commentstyle=\itshape\color{Rcommentcolor},
  keywordstyle=\color{keywordcolor},
  moredelim=[s][\color{delimcolor}]{"}{"},
}
\lstset{basicstyle=\ttfamily, numbers=none, literate={~} {$\sim$}{2}}
\def\Xb{\mathbf{X}_{\beta}}
\def\Xp{\mathbf{X}_{\psi}}
\def\beps{\boldsymbol{\epsilon}}
\def\betahat{\widehat \beta}
\def\psihat{\widehat \psi}
\begin{document}

<<>>=
library(knitr)
# global chunk options
#opts_chunk$set(cache=TRUE, autodep=TRUE)
options(scipen=6)
options(repos=c(CRAN="https://cloud.r-project.org/"))
knit_hooks$set(document = function(x) {sub('\\usepackage[]{color}', '\\usepackage{xcolor}', x, fixed = TRUE)})
@

\begin{center}
{\textsclarge{Propensity score adjustment with parameter estimation}}
\end{center}

\tikzset{
-Latex,auto,node distance =1 cm and 1 cm,semithick,
state/.style ={circle, draw, minimum width = 0.7 cm},
box/.style ={rectangle, draw, minimum width = 0.7 cm, fill=lightgray},
point/.style = {circle, draw, inner sep=0.08cm,fill,node contents={}},
bidirected/.style={Latex-Latex,dashed},
el/.style = {inner sep=3pt, align=left, sloped}
}

Consider the data generating (structural) conditional mean model for a binary treatment
\[
\E_{Y|X,Z}^\calO[Y|X=x,Z=z] = 2 + 4 x + x^2 + 3 z
\]
and propensity score adjusted estimation of the ATE, which in this case equals $\psi_0 = 3$. Here $X \sim Normal(\mu_X,\sigma_X^2)$ is a confounder (the simulations below use $\mu_X = 2$ and $\sigma_X = 0.5$), and the conditional model for $Z|X=x$ is $Bernoulli(e(x))$ with
\[
e(x) = \frac{\exp\{-3.5 + 2 x\}}{1+\exp\{-3.5 + 2 x\}}.
\]
In the analysis, the propensity score is modelled parametrically with parameter $\alpha$, which is estimated via logistic regression.
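To verify the stated value of $\psi_0$: the treatment effect in this structural model is constant in $x$, since
\[
\E_{Y|X,Z}^\calO[Y|X=x,Z=1] - \E_{Y|X,Z}^\calO[Y|X=x,Z=0] = (2 + 4 x + x^2 + 3) - (2 + 4 x + x^2) = 3,
\]
so, as $X$ is the only confounder, averaging over the distribution of $X$ yields $\psi_0 = 3$ regardless of $\mu_X$ and $\sigma_X$.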
<<>>=
#Calculation for large N
library(mvnfast)
set.seed(22087)
N<-1000000
muX<-2;sigX<-0.5
X1<-rnorm(N,muX,sigX)
al<-c(-3.5,2)
expit<-function(x){return(1/(1+exp(-x)))}
Xm<-cbind(1,X1)
be<-c(2,3,4,1)
sigY<-3
ps.true<-expit(Xm %*% al)
Z<-rbinom(N,1,ps.true)
Xb<-cbind(1,Z,X1,X1^2)
Y<-Xb %*% be + rnorm(N)*sigY
par(mar=c(4,4,2,0))
hist(ps.true,breaks=seq(0,1,by=0.02),main='True propensity score',xlab=expression(e(x)))
box()
@

It is easy to see that a correctly specified regression model estimates the average treatment effect (ATE) correctly, and that a mis-specified model does not.

<<>>=
#Correct model
coef(summary(lm(Y~Z+X1+I(X1^2))))
#Incorrect model
coef(summary(lm(Y~Z)))
@

\bigskip

\textbf{Propensity score regression:} Consider OLS estimation using the (mis-specified) mean model
\[
\E_{Y|X,Z}^\calO[Y|X=x,Z=z] = \beta_0 + z \psi_0 + e(x;\widehat \alpha) \phi_0.
\]

<<>>=
#PS regression: Correct PS model
eX<-fitted(glm(Z~X1,family=binomial))
coef(summary(lm(Y~Z+eX)))
@

This fit is equivalent to solving the OLS (normal) equations
\[
\sum_{i=1}^n \begin{pmatrix} 1 \\[6pt] z_i \\[6pt] e(x_i;\widehat \alpha) \end{pmatrix} (y_i - \beta_0 - z_i \psi_0 - e(x_i;\widehat \alpha) \phi_0) = \begin{pmatrix} 0 \\[6pt] 0 \\[6pt] 0\end{pmatrix}
\]
or equivalently
\[
\sum_{i=1}^n \begin{pmatrix} 1 \\[6pt] (z_i-e(x_i;\widehat \alpha)) \\[6pt] e(x_i;\widehat \alpha) \end{pmatrix} (y_i - \beta_0 - z_i \psi_0 - e(x_i;\widehat \alpha) \phi_0) = \begin{pmatrix} 0 \\[6pt] 0 \\[6pt] 0\end{pmatrix}
\]
to yield
\[
\begin{pmatrix} \widehat \beta_0 \\[6pt] \widehat \psi_0 \\[6pt] \widehat \phi_0 \end{pmatrix} = (\bX^\top \bX)^{-1} \bX^\top \by
\qquad \qquad
\bX = \begin{pmatrix} 1 & z_1 & e(x_1;\widehat \alpha)\\[3pt] 1 & z_2 & e(x_2;\widehat \alpha)\\[3pt] \vdots & \vdots & \vdots\\[3pt] 1 & z_n & e(x_n;\widehat \alpha) \end{pmatrix}.
\]

<<>>=
Xm<-cbind(1,Z,eX)
solve(t(Xm) %*% Xm) %*% t(Xm) %*% Y
@

However, an alternative \textbf{G-estimating} form is
\[
\sum_{i=1}^n \begin{pmatrix} 1 \\ (z_i-e(x_i;\widehat \alpha)) \end{pmatrix} (y_i - \beta_0 - z_i \psi_0) = \begin{pmatrix} 0 \\[6pt] 0 \end{pmatrix}
\]
that is, using the estimating (score) function
\[
\bZ_1^\top (\by - \bZ_2 \theta) = \zerovec
\]
say, where $\theta = (\beta_0, \psi_0)$ and
\[
\bZ_1 = \begin{pmatrix} 1 & z_1 - e(x_1;\widehat \alpha) \\[3pt] 1 & z_2 - e(x_2;\widehat \alpha) \\[3pt] \vdots & \vdots \\[3pt] 1 & z_n - e(x_n;\widehat \alpha) \end{pmatrix}
\qquad
\bZ_2 = \begin{pmatrix} 1 & z_1 \\[3pt] 1 & z_2 \\[3pt] \vdots & \vdots \\[3pt] 1 & z_n \end{pmatrix}.
\]
The solution is therefore
\[
\begin{pmatrix} \widehat \beta_0 \\[6pt] \widehat \psi_0 \end{pmatrix} = (\bZ_1^\top \bZ_2)^{-1} \bZ_1^\top \by.
\]

<<>>=
Zm1<-cbind(1,Z-eX)
Zm2<-cbind(1,Z)
g.ests<-solve(t(Zm1) %*% Zm2) %*% t(Zm1) %*% Y
g.ests
@

We can incorporate the estimation of the propensity score parameters into the same estimating equation formulation by adding in the estimating equation for $\alpha$, namely, for the logistic regression model,
\[
\sum_{i=1}^n \bx_i^\top (z_i - e(x_i;\alpha)) = \zerovec.
\]

\medskip

We can compute an estimate of the variance-covariance matrix using the theory of estimating equations.
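Before doing so, a quick numerical check (not part of the original derivation): the logistic regression fit used to construct \texttt{eX} above already solves the $\alpha$ estimating equation displayed above; the two column sums below should be zero to numerical precision.

<<>>=
#Check: logistic regression score equations for alpha are solved at the fitted values
#(both sums should be numerically zero)
colSums(cbind(1,X1)*(Z-eX))
@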
For the $p \times 1$ system of estimating equations
\[
\sum_{i=1}^n \bU_i(\theta) = \zerovec
\]
with $\E[\bU(\theta_0)] = \zerovec$, we have that
\[
\sqrt{n}(\widehat \theta_n - \theta_0) \CiD Normal_p(\zerovec,\bV)
\]
where
\[
\bV = \mathcal{J}^{-1} \mathcal{I} \mathcal{J}^{-\top}
\]
with
\[
\mathcal{I} = \E[ \bU(\theta_0) \bU(\theta_0)^\top] \qquad \qquad \mathcal{J} = \E[-\dot{\bU}(\theta_0)]
\]
both $(p \times p)$ matrices, and
\[
\dot{\bU}(\theta_0) = \dfrac{\partial \bU(\theta)}{\partial \theta^\top} \bigg |_{\theta = \theta_0}.
\]
We proceed by computing estimates, $\widehat I_n$ and $\widehat J_n$, of the two matrices based on the observed data, and then computing
\[
\widehat \bV = \widehat J_n^{-1} \widehat I_n \widehat J_n^{-\top}
\]
to estimate the asymptotic variance; this approach is known as \textit{sandwich} or \textit{robust} variance estimation.

\begin{itemize}

\item For \textbf{propensity score regression} we stack the estimating equations
\[
\sum_{i=1}^n \begin{pmatrix} (y_i - \beta_0 - z_i \psi_0 - e(x_i; \alpha) \phi_0) \\[6pt] z_i (y_i - \beta_0 - z_i \psi_0 - e(x_i; \alpha) \phi_0) \\[6pt] e(x_i; \alpha) (y_i - \beta_0 - z_i \psi_0 - e(x_i; \alpha) \phi_0) \\[6pt] \bx_i^\top (z_i - e(x_i; \alpha)) \end{pmatrix} = \begin{pmatrix} 0 \\[6pt] 0 \\[6pt] 0 \\[6pt] 0 \end{pmatrix}
\]
and then compute $\mathcal{I}$ and $\mathcal{J}$, and their estimates, in this extended system. The score vector is
\[
\bU(\theta) = \begin{pmatrix} (Y - \beta_0 - Z \psi_0 - e(X; \alpha) \phi_0) \\[6pt] Z (Y - \beta_0 - Z \psi_0 - e(X; \alpha) \phi_0) \\[6pt] e(X; \alpha) (Y - \beta_0 - Z \psi_0 - e(X; \alpha) \phi_0) \\[6pt] \bX^\top (Z - e(X; \alpha)) \end{pmatrix} = \begin{pmatrix} \epsilon_Y \\[6pt] Z \epsilon_Y \\[6pt] e \epsilon_Y \\[6pt] \bX^\top \epsilon_Z \end{pmatrix}
\]
say, denoting $e \equiv e(X;\alpha)$, where $\bX = (1, X)$, and
\[
\epsilon_Y = (Y - \beta_0 - Z \psi_0 - e(X; \alpha) \phi_0) \qquad \qquad \epsilon_Z = (Z - e(X; \alpha)).
\]
Then
\[
\mathcal{I} = \E \begin{bmatrix} \epsilon_Y^2 & Z \epsilon_Y^2 & e \epsilon_Y^2 & \bX \epsilon_Z \epsilon_Y \\[6pt] Z \epsilon_Y^2 & Z^2 \epsilon_Y^2 & Z e \epsilon_Y^2 & \bX Z \epsilon_Z \epsilon_Y \\[6pt] e \epsilon_Y^2 & Z e\epsilon_Y^2 & e^2 \epsilon_Y^2 & \bX e \epsilon_Z \epsilon_Y \\[6pt] \bX^\top \epsilon_Z \epsilon_Y & \bX^\top Z \epsilon_Z \epsilon_Y & \bX^\top e \epsilon_Z \epsilon_Y & \bX^\top \bX \epsilon_Z^2 \end{bmatrix}
\qquad
\mathcal{J} = \E \begin{bmatrix} 1 & Z & e & \bX e(1-e) \phi_0 \\[6pt] Z & Z^2 & Z e & Z \bX e(1-e) \phi_0 \\[6pt] e & Z e & e^2 & \bX e(1-e) (e \phi_0 - \epsilon_Y) \\[6pt] \zerovec & \zerovec & \zerovec & \bX^\top \bX e (1-e) \end{bmatrix}.
\]

<<>>=
#Propensity score regression
psr.ests<-coef(lm(Y~Z+eX))
res1<-Y-psr.ests[1]-psr.ests[2]*Z - psr.ests[3]*eX
res2<-Z-eX
eterm<-eX*(1-eX)
dterm<-eX*psr.ests[3]-res1
Xm1<-cbind(1,Z,eX) * matrix(res1,nrow=N,ncol=3)
Xm2<-cbind(1,X1) * matrix(res2,nrow=N,ncol=2)
Xm<-cbind(Xm1,Xm2)
I.n<-matrix((t(Xm) %*% Xm)/N,5,5)
round(I.n,6)
J.n<-matrix(0,5,5)
J.n[1:3,1:3]<-crossprod(cbind(1,Z,eX))/N
J.n[1,4:5]<-c(mean(eterm*psr.ests[3]),mean(X1*eterm*psr.ests[3]))
J.n[2,4:5]<-c(mean(Z*eterm*psr.ests[3]),mean(Z*X1*eterm*psr.ests[3]))
J.n[3,4:5]<-c(mean(eterm*dterm),mean(X1*eterm*dterm))
J.n[4:5,4:5]<-t(cbind(1,X1) * matrix(eterm,nrow=N,ncol=2)) %*% cbind(1,X1)/N
round(J.n,6)
V1<-solve(J.n) %*% (I.n %*% t(solve(J.n)))
round(V1,6)
@

Thus the asymptotic variance of $\widehat \psi_0$ (that is, the marginal asymptotic variance of $\sqrt{n}(\widehat \psi_0 - \psi_0^\true)$) under propensity score regression is approximately $\Sexpr{V1[2,2]}$.

\bigskip

\item For \textbf{G-estimation} we stack the estimating equations
\[
\sum_{i=1}^n \begin{pmatrix} (y_i - \beta_0 - z_i \psi_0) \\[6pt] (z_i-e(x_i; \alpha))(y_i - \beta_0 - z_i \psi_0) \\[6pt] \bx_i^\top (z_i - e(x_i; \alpha)) \end{pmatrix} = \begin{pmatrix} 0 \\[6pt] 0 \\[6pt] 0 \end{pmatrix}.
\]
The score vector is
\[
\bU(\theta) = \begin{pmatrix} (Y - \beta_0 - Z \psi_0) \\[6pt] (Z-e(X; \alpha))(Y - \beta_0 - Z \psi_0) \\[6pt] \bX^\top (Z - e(X; \alpha)) \end{pmatrix} = \begin{pmatrix} \varepsilon_Y \\[6pt] \epsilon_Z \varepsilon_Y \\[6pt] \bX^\top \epsilon_Z \end{pmatrix}
\]
where
\[
\varepsilon_Y = (Y - \beta_0 - Z \psi_0).
\]
Therefore
\[
\mathcal{I} = \E \begin{bmatrix} \varepsilon_Y^2 & \epsilon_Z \varepsilon_Y^2 & \bX \epsilon_Z \varepsilon_Y \\[6pt] \epsilon_Z \varepsilon_Y^2 & \epsilon_Z^2 \varepsilon_Y^2 & \bX \epsilon_Z^2 \varepsilon_Y \\[6pt] \bX^\top \epsilon_Z \varepsilon_Y & \bX^\top \epsilon_Z^2 \varepsilon_Y & \bX^\top \bX \epsilon_Z^2 \end{bmatrix}
\qquad \qquad
\mathcal{J} = \E \begin{bmatrix} 1 & Z & \zerovec \\[6pt] \epsilon_Z & Z \epsilon_Z & \bX e (1-e) \varepsilon_Y\\[6pt] \zerovec & \zerovec & \bX^\top \bX e (1-e) \end{bmatrix}.
\]
Now, the first component of the score vector, $\varepsilon_Y$, is required to have expectation zero; also, under the assumed outcome model its conditional distribution given $(X,Z)$ does not depend on $Z$. Therefore,
\[
\E_{Y|X,Z}[\varepsilon_Y | X,Z] = h(X) \qquad \text{with} \qquad \E_X[h(X)] = 0
\]
and
\[
\E_{Y|X,Z}[\varepsilon_Y^2 | X,Z] = v(X).
\]
Hence by iterated expectation we must have that
\[
\E_{X,Y,Z}[\varepsilon_Y] = 0.
\]
Similarly we require by the estimating equation for $\alpha$ that
\[
\E_{Z|X}[\epsilon_Z | X] = 0
\]
and hence
\[
\E_{X,Y,Z}[\epsilon_Z] = \E_{X,Y,Z}[\epsilon_Z \varepsilon_Y^2] = 0 \qquad \E_{X,Y,Z}[\bX \epsilon_Z \varepsilon_Y] = \zerovec.
\]
We also know that
\[
\E_{Z|X}[\epsilon_Z^2 | X] = \E_{Z|X}[(Z-e(X))^2 | X] = e(X) (1-e(X)).
\]
Thus
\[
\mathcal{I} = \E \begin{bmatrix} v(X) & 0 & 0 & 0 \\[6pt] 0 & e(1-e) v(X) & e(1-e) h(X) & e(1-e) X h(X) \\[6pt] 0 & e(1-e) h(X) & e(1-e) & e(1-e) X \\[6pt] 0 & e(1-e) X h(X) & e(1-e) X & e(1-e) X^2 \end{bmatrix} = \begin{bmatrix} \mathcal{I}_{11} & \mathcal{I}_{12} \\[6pt] \mathcal{I}_{21} & \mathcal{I}_{22} \end{bmatrix}
\]
and
\[
\mathcal{J} = \E \begin{bmatrix} 1 & e & 0 & 0 \\[6pt] 0 & e(1-e) & e (1-e) h(X) & e (1-e) X h(X) \\[6pt] 0 & 0 & e (1-e) & e (1-e) X \\[6pt] 0 & 0 & e (1-e) X & e (1-e) X^2 \end{bmatrix} = \begin{bmatrix} \mathcal{J}_{11} & \mathcal{J}_{12} \\[6pt] \zerovec & \mathcal{J}_{22} \end{bmatrix}.
\]
On multiplying out, and noting that $\mathcal{I}_{12} = \mathcal{J}_{12}$ and $\mathcal{I}_{22} = \mathcal{J}_{22}$, we can conclude that $\bV = \mathcal{J}^{-1} \mathcal{I} \mathcal{J}^{-\top}$ is \textbf{block diagonal}, that is
\[
\bV = \begin{bmatrix} \bV_{11} & \zerovec \\ \zerovec & \bV_{22} \end{bmatrix}
\]
from which we can conclude that asymptotically $(\widehat \beta_0, \widehat \psi_0) \independent \widehat \alpha$, that is, the estimators are \textbf{independent}.

<<>>=
Zm1<-cbind(1,Z-eX)
Zm2<-cbind(1,Z)
g.ests<-solve(t(Zm1) %*% Zm2) %*% t(Zm1) %*% Y
res1<-Y-g.ests[1]-g.ests[2]*Z
res2<-Z-eX
I.n<-matrix(0,4,4)
I.n[1,1]<-mean(res1^2)
I.n[1,2]<-I.n[2,1]<-mean(res2*res1^2)
I.n[1,3]<-I.n[3,1]<-mean(res2*res1)
I.n[1,4]<-I.n[4,1]<-mean(X1*res2*res1)
I.n[2,2]<-mean(res2^2*res1^2)
I.n[2,3]<-I.n[3,2]<-mean(res2^2*res1)
I.n[2,4]<-I.n[4,2]<-mean(X1*res2^2*res1)
I.n[3,3]<-mean(res2^2)
I.n[3,4]<-I.n[4,3]<-mean(X1*res2^2)
I.n[4,4]<-mean(X1^2*res2^2)
round(I.n,6)
J.n<-matrix(0,4,4)
J.n[1,1]<-1
J.n[1,2]<-mean(Z)
J.n[2,1]<-mean(res2)
J.n[2,2]<-mean(Z*res2)
J.n[2,3]<-mean(eX*(1-eX)*res1)
J.n[2,4]<-mean(X1*eX*(1-eX)*res1)
J.n[3,3]<-mean(eX*(1-eX))
J.n[3,4]<-J.n[4,3]<-mean(X1*eX*(1-eX))
J.n[4,4]<-mean(X1^2*eX*(1-eX))
round(J.n,6)
V2<-solve(J.n) %*% (I.n %*% t(solve(J.n)))
round(V2,6)
@

\bigskip

\item \textbf{Inverse Probability Weighting:} The original IPW estimators are
\[
\widetilde \mu_{\text{IPW}} (0) = \dfrac{1}{n} \sum\limits_{i=1}^n \dfrac{(1-Z_i) Y_i}{1-e(X_i)} \qquad \qquad \widetilde \mu_{\text{IPW}}(1) = \dfrac{1}{n} \sum\limits_{i=1}^n \dfrac{Z_i Y_i}{e(X_i)}
\]
and in standardized weight form
\[
\widehat \mu_{\text{IPW}} (0) = \dfrac{\dfrac{1}{n} \sum\limits_{i=1}^n \dfrac{(1-Z_i) Y_i}{1-e(X_i)} }{\dfrac{1}{n} \sum\limits_{i=1}^n \dfrac{(1-Z_i)}{1-e(X_i)}} \qquad \qquad \widehat \mu_{\text{IPW}}(1) = \dfrac{\dfrac{1}{n} \sum\limits_{i=1}^n \dfrac{Z_i Y_i}{e(X_i)}}{\dfrac{1}{n} \sum\limits_{i=1}^n \dfrac{Z_i}{e(X_i)}}.
\]
The standardized weight estimators arise from the fit of a weighted least squares problem; if we denote $\beta_0 = \mu (0)$ and $\psi_0 = \mu(1) - \mu(0)$, with $\theta = (\beta_0,\psi_0)$, then
\[
(\widehat \beta_0, \widehat \psi_0) = \arg \min_{(\beta_0,\psi_0)} \sum_{i=1}^n R_i (Y_i - \beta_0 - Z_i \psi_0)^2
\]
where for each $i$
\[
R_i = \dfrac{(1-Z_i)}{1-e(X_i)} + \dfrac{Z_i}{e(X_i)} = R_{0i} + R_{1i}
\]
say.

\bigskip

The corresponding estimating equations are
\[
\sum_{i=1}^n \begin{pmatrix} r_i (y_i - \beta_0 - z_i \psi_0) \\[6pt] r_i z_i (y_i - \beta_0 - z_i \psi_0) \\[6pt] \bx_i^\top (z_i - e(x_i; \alpha)) \end{pmatrix} = \begin{pmatrix} 0 \\[6pt] 0 \\[6pt] 0 \end{pmatrix}.
\]
The score vector is
\[
\bU(\theta) = \begin{pmatrix} R (Y - \beta_0 - Z \psi_0) \\[6pt] R Z (Y - \beta_0 - Z \psi_0) \\[6pt] \bX^\top (Z - e(X; \alpha)) \end{pmatrix} = \begin{pmatrix} R \varepsilon_Y \\[6pt] R Z \varepsilon_Y \\[6pt] \bX^\top \epsilon_Z \end{pmatrix}.
\]
Therefore
\[
\mathcal{I} = \E \begin{bmatrix} R^2 \varepsilon_Y^2 & R^2 Z \varepsilon_Y^2 & \bX R \epsilon_Z \varepsilon_Y \\[6pt] R^2 Z \varepsilon_Y^2 & R^2 Z^2 \varepsilon_Y^2 & \bX R Z \epsilon_Z \varepsilon_Y \\[6pt] \bX^\top R \epsilon_Z \varepsilon_Y & \bX^\top R Z \epsilon_Z \varepsilon_Y & \bX^\top \bX \epsilon_Z^2 \end{bmatrix}
\qquad \qquad
\mathcal{J} = \E \begin{bmatrix} R & R Z & -\dot{\bR}^\top \varepsilon_Y \\[6pt] R Z & RZ^2 & -\dot{\bR}^\top Z \varepsilon_Y \\[6pt] \zerovec & \zerovec & \bX^\top \bX e (1-e) \end{bmatrix}
\]
where
\[
\dot{\bR} = \frac{d R}{d \alpha} = \frac{1-Z}{1-e(X)} e(X) \bX^\top - \frac{Z}{e(X)} (1-e(X)) \bX^\top.
\]

<<>>=
eX<-fitted(glm(Z~X1,family=binomial))
w0<-(1-Z)/(1-eX); W0<-w0/sum(w0)
w1<-Z/eX; W1<-w1/sum(w1)
w<-w0+w1
#IPW via original weights
mean(w1*Y)-mean(w0*Y)
#IPW via Standardized weights
sum(W1*Y)-sum(W0*Y)
#IPW via regression
fit.w<-lm(Y~Z,weights=w)
coef(summary(fit.w))
@

<<>>=
ipw.ests<-coef(fit.w)
res1<-Y-ipw.ests[1]-ipw.ests[2]*Z
res2<-Z-eX
R<-w
Rdot1<-((1-Z)/(1-eX))*eX-(Z/eX)*(1-eX)
Rdot2<-X1*Rdot1
I.n<-matrix(0,4,4)
I.n[1,1]<-mean(R^2*res1^2)
I.n[1,2]<-I.n[2,1]<-mean(R^2*Z*res1^2)
I.n[1,3]<-I.n[3,1]<-mean(R*res2*res1)
I.n[1,4]<-I.n[4,1]<-mean(X1*R*res2*res1)
I.n[2,2]<-mean(R^2*Z^2*res1^2)
I.n[2,3]<-I.n[3,2]<-mean(R*Z*res2*res1)
I.n[2,4]<-I.n[4,2]<-mean(X1*R*Z*res2*res1)
I.n[3,3]<-mean(res2^2)
I.n[3,4]<-I.n[4,3]<-mean(X1*res2^2)
I.n[4,4]<-mean(X1^2*res2^2)
round(I.n,6)
J.n<-matrix(0,4,4)
J.n[1,1]<-mean(R)
J.n[1,2]<-mean(R*Z)
J.n[1,3]<-mean(-Rdot1*res1)
J.n[1,4]<-mean(-Rdot2*res1)
J.n[2,1]<-mean(R*Z)
J.n[2,2]<-mean(R*Z^2)
J.n[2,3]<-mean(-Rdot1*Z*res1)
J.n[2,4]<-mean(-Rdot2*Z*res1)
J.n[3,3]<-mean(res2^2)
J.n[3,4]<-J.n[4,3]<-mean(X1*res2^2)
J.n[4,4]<-mean(X1^2*res2^2)
round(J.n,6)
V3<-solve(J.n) %*% (I.n %*% t(solve(J.n)))
round(V3,6)
@

Again, we note that the matrix is block-diagonal, and therefore that asymptotically $(\widehat \beta_0, \widehat \psi_0) \independent \widehat \alpha$, that is, the estimators are \textbf{independent}.

\end{itemize}

\pagebreak

<<>>=
#Monte Carlo study
nreps<-20000
n<-1000
psr.ests<-matrix(0,nrow=nreps,ncol=3)
g.ests<-ipw.ests<-al.ests<-matrix(0,nrow=nreps,ncol=2)
V1.ests<-array(0,c(nreps,5,5))
V2.ests<-V3.ests<-array(0,c(nreps,4,4))
for(irep in 1:nreps){
  X1<-rnorm(n,muX,sigX)
  Xm<-cbind(1,X1)
  ps.true<-expit(Xm %*% al)
  Z<-rbinom(n,1,ps.true)
  Xb<-cbind(1,Z,X1,X1^2)
  Y<-Xb %*% be + rnorm(n)*sigY
  fit.p<-glm(Z~X1,family=binomial)
  eX<-fitted(fit.p)
  w0<-(1-Z)/(1-eX)
  W0<-w0/sum(w0)
  w1<-Z/eX
  W1<-w1/sum(w1)
  w<-w0+w1
  psr.fit<-coef(lm(Y~Z+eX))
  psr.ests[irep,]<-psr.fit
  ipw.ests[irep,]<-coef(summary(lm(Y~Z,weights=w)))[1:2]
  al.ests[irep,]<-coef(fit.p)
  Zm1<-cbind(1,Z-eX)
  Zm2<-cbind(1,Z)
  g.ests[irep,]<-solve(t(Zm1) %*% Zm2) %*% t(Zm1) %*% Y
  #PSR variance estimate
  res1<-Y-psr.fit[1]-psr.fit[2]*Z - psr.fit[3]*eX
  res2<-Z-eX
  eterm<-eX*(1-eX)
  dterm<-eX*psr.fit[3]-res1
  Xm1<-cbind(1,Z,eX) * matrix(res1,nrow=n,ncol=3)
  Xm2<-cbind(1,X1) * matrix(res2,nrow=n,ncol=2)
  Xm<-cbind(Xm1,Xm2)
  I.n<-matrix((t(Xm) %*% Xm)/n,5,5)
  J.n<-matrix(0,5,5)
  J.n[1:3,1:3]<-crossprod(cbind(1,Z,eX))/n
  J.n[1,4:5]<-c(mean(eterm*psr.fit[3]),mean(X1*eterm*psr.fit[3]))
  J.n[2,4:5]<-c(mean(Z*eterm*psr.fit[3]),mean(Z*X1*eterm*psr.fit[3]))
  J.n[3,4:5]<-c(mean(eterm*dterm),mean(X1*eterm*dterm))
  J.n[4:5,4:5]<-t(cbind(1,X1) * matrix(eterm,nrow=n,ncol=2)) %*% cbind(1,X1)/n
  V1.ests[irep,,]<-solve(J.n) %*% (I.n %*% t(solve(J.n)))/n
  #G-estimation variance estimate
  res1<-Y-g.ests[irep,1]-g.ests[irep,2]*Z
  res2<-Z-eX
  I.n<-matrix(0,4,4)
  I.n[1,1]<-mean(res1^2)
  I.n[1,2]<-I.n[2,1]<-mean(res2*res1^2)
  I.n[1,3]<-I.n[3,1]<-mean(res2*res1)
  I.n[1,4]<-I.n[4,1]<-mean(X1*res2*res1)
  I.n[2,2]<-mean(res2^2*res1^2)
  I.n[2,3]<-I.n[3,2]<-mean(res2^2*res1)
  I.n[2,4]<-I.n[4,2]<-mean(X1*res2^2*res1)
  I.n[3,3]<-mean(res2^2)
  I.n[3,4]<-I.n[4,3]<-mean(X1*res2^2)
  I.n[4,4]<-mean(X1^2*res2^2)
  J.n<-matrix(0,4,4)
  J.n[1,1]<-1
  J.n[1,2]<-mean(Z)
  J.n[2,1]<-mean(res2)
  J.n[2,2]<-mean(Z*res2)
  J.n[2,3]<-mean(eX*(1-eX)*res1)
  J.n[2,4]<-mean(X1*eX*(1-eX)*res1)
  J.n[3,3]<-mean(eX*(1-eX))
  J.n[3,4]<-J.n[4,3]<-mean(X1*eX*(1-eX))
  J.n[4,4]<-mean(X1^2*eX*(1-eX))
  V<-solve(J.n) %*% (I.n %*% t(solve(J.n)))/n
  V2.ests[irep,,]<-V
  #IPW Variance estimate
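  #(added note) this mirrors the single-sample IPW sandwich calculation above;
  #the alpha-columns of rows 1-2 of J.n carry the -Rdot terms because the weights R depend on alpha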
  res1<-Y-ipw.ests[irep,1]-ipw.ests[irep,2]*Z
  res2<-Z-eX
  R<-w
  Rdot1<-((1-Z)/(1-eX))*eX-(Z/eX)*(1-eX)
  Rdot2<-X1*Rdot1
  I.n<-matrix(0,4,4)
  I.n[1,1]<-mean(R^2*res1^2)
  I.n[1,2]<-I.n[2,1]<-mean(R^2*Z*res1^2)
  I.n[1,3]<-I.n[3,1]<-mean(R*res2*res1)
  I.n[1,4]<-I.n[4,1]<-mean(X1*R*res2*res1)
  I.n[2,2]<-mean(R^2*Z^2*res1^2)
  I.n[2,3]<-I.n[3,2]<-mean(R*Z*res2*res1)
  I.n[2,4]<-I.n[4,2]<-mean(X1*R*Z*res2*res1)
  I.n[3,3]<-mean(res2^2)
  I.n[3,4]<-I.n[4,3]<-mean(X1*res2^2)
  I.n[4,4]<-mean(X1^2*res2^2)
  J.n<-matrix(0,4,4)
  J.n[1,1]<-mean(R)
  J.n[1,2]<-mean(R*Z)
  J.n[1,3]<-mean(-Rdot1*res1)
  J.n[1,4]<-mean(-Rdot2*res1)
  J.n[2,1]<-mean(R*Z)
  J.n[2,2]<-mean(R*Z^2)
  J.n[2,3]<-mean(-Rdot1*Z*res1)
  J.n[2,4]<-mean(-Rdot2*Z*res1)
  J.n[3,3]<-mean(res2^2)
  J.n[3,4]<-J.n[4,3]<-mean(X1*res2^2)
  J.n[4,4]<-mean(X1^2*res2^2)
  V<-solve(J.n) %*% (I.n %*% t(solve(J.n)))/n
  V3.ests[irep,,]<-V
}
@

<<>>=
par(mar=c(4,4,2,0))
psi0.ests<-cbind(psr.ests[,2],g.ests[,2],ipw.ests[,2])
colnames(psi0.ests)<-c('PSR','G-est','IPW')
boxplot(psi0.ests,names=c('PSR','G-est','IPW'),pch=19,cex=0.5)
abline(h=3,col='red',lty=2)
@

<<>>=
#Variances
apply(psi0.ests,2,var)
@

<<>>=
#Propensity score regression variances
round(apply(V1.ests,2:3,mean),6)      #Mean of sandwich estimates across replications
round(cov(cbind(psr.ests,al.ests)),6) #Empirical (Monte Carlo) covariance of the estimates
round(V1/n,6)                         #Sandwich estimate from the large-sample fit, scaled to n
#G-estimation regression variances
round(apply(V2.ests,2:3,mean),6)      #Mean of sandwich estimates across replications
round(cov(cbind(g.ests,al.ests)),6)   #Empirical (Monte Carlo) covariance of the estimates
round(V2/n,6)                         #Sandwich estimate from the large-sample fit, scaled to n
#IPW variances
round(apply(V3.ests,2:3,mean),6)      #Mean of sandwich estimates across replications
round(cov(cbind(ipw.ests,al.ests)),6) #Empirical (Monte Carlo) covariance of the estimates
round(V3/n,6)                         #Sandwich estimate from the large-sample fit, scaled to n
@

<<>>=
nvec<-c(expression(hat(beta)[0]),expression(hat(psi)[0]),
        expression(hat(alpha)[0]),expression(hat(alpha)[1]))
nvecp<-c(expression(hat(beta)[0]),expression(hat(psi)[0]),expression(hat(phi)[0]),
         expression(hat(alpha)[0]),expression(hat(alpha)[1]))
pairs(cbind(psr.ests,al.ests),labels=nvecp,pch=19,cex=0.5,main = "PSR")
print('Correlation matrix')
cor(cbind(psr.ests,al.ests))
pairs(cbind(g.ests,al.ests),labels=nvec,pch=19,cex=0.5,main = "G-est")
print('Correlation matrix')
cor(cbind(g.ests,al.ests))
pairs(cbind(ipw.ests,al.ests),labels=nvec,pch=19,cex=0.5,main = "IPW")
print('Correlation matrix')
cor(cbind(ipw.ests,al.ests))
@

Here the sandwich estimates match the empirical estimates well.

\end{document}