\documentclass[notitlepage]{article}
\usepackage{Math}
%\usepackage{../../Math598}
\usepackage{listings}
\usepackage{numprint}
\usepackage{enumerate}
\usepackage{bm}
\usepackage{bbm}
\usepackage{verbatim}
\usepackage{mathtools}
\usepackage{dsfont}
\usepackage{scalerel}
\usepackage{amsfonts,amsmath,amssymb}
\usepackage{tikz}
\usetikzlibrary{shapes,decorations,arrows,calc,arrows.meta,fit,positioning}
\usepackage{pgfplots}
\pgfplotsset{compat=1.17}
\newcommand*\circled[1]{\tikz[baseline=(char.base)]{
\node[shape=circle,draw,inner sep=2pt] (char) {#1};}}
\usepackage{xcolor}
\newtheorem{alg}{Algorithm}
\newtheorem{ex}{Example}
\newtheorem{note}{Note}
\newenvironment{proof}{\par\noindent{\normalfont{\bf{Proof }}}}{\hfill \small$\blacksquare$\\[2mm]}
\setlength{\parindent}{0in}
\def\E{\mathbbmss{E}}
\def\Var{\text{Var}}
\def\Cov{\text{Cov}}
\def\Corr{\text{Corr}}
\def\bW{\mathbf{W}}
\def\bH{\mathbf{H}}
\newcommand{\bs}[1]{\bm{#1}}
\newcommand{\Rho}{\mathrm{P}}
\newcommand{\cif}{\text{ if }}
\newcommand{\Ra}{\Longrightarrow}
\def\bphi{\boldsymbol \phi}
\lstloadlanguages{R}
\definecolor{keywordcolor}{rgb}{0,0.6,0.6}
\definecolor{delimcolor}{rgb}{0.461,0.039,0.102}
\definecolor{Rcommentcolor}{rgb}{0.101,0.043,0.432}
\lstdefinestyle{Rsettings}{
basicstyle=\ttfamily,
breaklines=true,
showstringspaces=false,
keywords={if, else, function, theFunction, tmp}, % Write as many keywords
otherkeywords={},
commentstyle=\itshape\color{Rcommentcolor},
keywordstyle=\color{keywordcolor},
moredelim=[s][\color{delimcolor}]{"}{"},
}
\lstset{basicstyle=\ttfamily, numbers=none, literate={~} {$\sim$}{2}}
\def\Xb{\mathbf{X}_{\beta}}
\def\Xp{\mathbf{X}_{\psi}}
\def\beps{\boldsymbol{\epsilon}}
\def\betahat{\widehat \beta}
\def\psihat{\widehat \psi}
\begin{document}

<<>>=
library(knitr)
# global chunk options
opts_chunk$set(cache=TRUE, autodep=TRUE)
options(scipen=6)
options(repos=c(CRAN="https://cloud.r-project.org/"))
knit_hooks$set(document = function(x) {sub('\\usepackage[]{color}', '\\usepackage{xcolor}', x, fixed = TRUE)})
@

\begin{center}
{\textsclarge{Variance comparison for IPW and G-estimation}}
\end{center}

\tikzset{
-Latex,auto,node distance =1 cm and 1 cm,semithick,
state/.style ={circle, draw, minimum width = 0.7 cm},
box/.style ={rectangle, draw, minimum width = 0.7 cm, fill=lightgray},
point/.style = {circle, draw, inner sep=0.08cm,fill,node contents={}},
bidirected/.style={Latex-Latex,dashed},
el/.style = {inner sep=3pt, align=left, sloped}
}

We can compute an estimate of the variance-covariance matrix using the theory of estimating equations. For the $p \times 1$ system of estimating equations
\[
\sum_{i=1}^n \bU_i(\theta) = \zerovec
\]
with $\E[\bU(\theta_0)] = \zerovec$, we have that
\[
\sqrt{n}(\widehat \theta_n - \theta_0) \CiD Normal_p(\zerovec,\bV)
\]
where
\[
\bV = \mathcal{J}^{-1} \mathcal{I} \mathcal{J}^{-\top}
\]
with
\[
\mathcal{I} = \E[ \bU(\theta_0) \bU(\theta_0)^\top] \qquad \qquad \mathcal{J} = \E[\dot{\bU}(\theta_0)]
\]
both $(p \times p)$ matrices, and
\[
\dot{\bU}(\theta_0) = -\dfrac{\partial \bU(\theta)}{\partial \theta^\top} \bigg |_{\theta = \theta_0}.
\]
We proceed by computing estimates, $\widehat I_n$ and $\widehat J_n$, of the two matrices based on the observed data, and then computing
\[
\widehat \bV = \widehat J_n^{-1} \widehat I_n \widehat J_n^{-\top}
\]
to estimate the asymptotic variance; this approach is known as \textit{sandwich} or \textit{robust} variance estimation.
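Before turning to the estimators of interest, the following minimal sketch illustrates the sandwich construction in the simplest possible case, the single estimating function $U(\mu) = Y - \mu$ for a population mean: here $\mathcal{J} = 1$ and $\mathcal{I} = \Var(Y)$, so the sandwich variance should essentially reproduce the usual estimate $\widehat{\Var}(Y)/n$. The chunk is a self-contained toy example, and its variable names are illustrative only.

<<>>=
#Toy illustration of sandwich variance estimation: U(mu) = Y - mu
set.seed(1)
y<-rnorm(500,mean=1,sd=2)
mu.hat<-mean(y)                         #solves sum(y - mu) = 0
I.hat<-mean((y-mu.hat)^2)               #estimate of E[U(mu0)^2]
J.hat<-1                                #E[-dU/dmu] = 1
V.hat<-(1/J.hat)*I.hat*(1/J.hat)/length(y)
c(V.hat,var(y)/length(y))               #nearly identical (n vs. n-1 denominator)
@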
In the following analysis, we denote the ATE by $\psi_0$.

\begin{enumerate}[1.]
\item \textbf{IPW version 1:} For the elementary application of IPW, there is a single unknown parameter of interest, and the estimator is
\[
\widehat \psi_0 = \frac{1}{n} \sum_{i=1}^n \left(\frac{Z_i}{e(X_i)} - \frac{1-Z_i}{1-e(X_i)} \right)Y_i.
\]
\begin{enumerate}[(a)]
\item \textbf{\textit{Known propensity score:}} The score is
\[
U(\theta) = \left(\frac{Z}{e(X)} - \frac{1-Z}{1-e(X)} \right) Y - \psi_0
\]
and hence $\dot{U}(\theta) = 1$.
\item \textbf{\textit{Unknown propensity score:}} The score is
\[
\bU(\theta) = \begin{pmatrix} \left(\dfrac{Z}{e(X;\alpha)} - \dfrac{1-Z}{1-e(X; \alpha)} \right)Y - \psi_0 \\[12pt] \bX^\top (Z - e(X; \alpha)) \end{pmatrix} = \begin{pmatrix} \zeta_Y \\[6pt] \bX^\top \epsilon_Z \end{pmatrix},
\]
say. Here $e(X;\alpha) = \text{expit}(\bX \alpha)$, so that, writing $e \equiv e(X;\alpha)$, we have $\partial e/\partial \alpha^\top = e(1-e) \bX$. Therefore
\[
\mathcal{I} = \E \begin{bmatrix} \zeta_Y^2 & \bX \epsilon_Z \zeta_Y \\[6pt] \bX^\top \epsilon_Z \zeta_Y & \bX^\top \bX \epsilon_Z^2 \end{bmatrix}
\qquad \qquad
\mathcal{J} = \E \begin{bmatrix} 1 & \left(\dfrac{Z}{e^2} + \dfrac{1-Z}{(1-e)^2} \right) Y \bX e(1-e) \\[12pt] \zerovec & \bX^\top \bX e (1-e) \end{bmatrix}.
\]
\end{enumerate}
\item \textbf{IPW version 2:} For the second version of the IPW estimator, we consider weighted least squares and the objective function
\[
\sum_{i=1}^n R_i (Y_i - \beta_0 - Z_i \psi_0)^2
\]
where
\[
R_i = \left(\dfrac{1-Z_i}{1-e(X_i)}+ \dfrac{Z_i}{e(X_i)} \right).
\]
This yields the estimator
\[
\widehat \psi_0 = \dfrac{\displaystyle\sum_{i=1}^n \dfrac{Z_i Y_i}{e(X_i)}}{\displaystyle \sum_{i=1}^n \dfrac{Z_i}{e(X_i)}} - \dfrac{\displaystyle \sum_{i=1}^n \dfrac{(1-Z_i) Y_i}{1-e(X_i)}}{\displaystyle\sum_{i=1}^n \dfrac{(1-Z_i)}{1-e(X_i)}}.
\]
\begin{enumerate}[(a)]
\item \textbf{\textit{Known propensity score:}} The score vector is
\[
\bU(\theta) = \begin{pmatrix} R (Y - \beta_0 - Z \psi_0) \\[6pt] R Z (Y - \beta_0 - Z \psi_0) \end{pmatrix} = \begin{pmatrix} R \varepsilon_Y \\[6pt] R Z \varepsilon_Y \end{pmatrix}
\]
say, where $\varepsilon_Y = (Y - \beta_0 - Z \psi_0)$. Therefore
\[
\mathcal{I} = \E \begin{bmatrix} R^2 \varepsilon_Y^2 & R^2 Z \varepsilon_Y^2 \\[6pt] R^2 Z \varepsilon_Y^2 & R^2 Z^2 \varepsilon_Y^2 \end{bmatrix}
\qquad \qquad
\mathcal{J} = \E \begin{bmatrix} R & R Z \\[6pt] R Z & RZ^2 \end{bmatrix}.
\]
\item \textbf{\textit{Unknown propensity score:}} The score vector is
\[
\bU(\theta) = \begin{pmatrix} R (Y - \beta_0 - Z \psi_0) \\[6pt] R Z (Y - \beta_0 - Z \psi_0) \\[6pt] \bX^\top (Z - e(X; \alpha)) \end{pmatrix} = \begin{pmatrix} R \varepsilon_Y \\[6pt] R Z \varepsilon_Y \\[6pt] \bX^\top \epsilon_Z \end{pmatrix}.
\]
Therefore
\[
\mathcal{I} = \E \begin{bmatrix} R^2 \varepsilon_Y^2 & R^2 Z \varepsilon_Y^2 & \bX R \epsilon_Z \varepsilon_Y \\[6pt] R^2 Z \varepsilon_Y^2 & R^2 Z^2 \varepsilon_Y^2 & \bX R Z \epsilon_Z \varepsilon_Y \\[6pt] \bX^\top R \epsilon_Z \varepsilon_Y & \bX^\top R Z \epsilon_Z \varepsilon_Y & \bX^\top \bX \epsilon_Z^2 \end{bmatrix}
\qquad \qquad
\mathcal{J} = \E \begin{bmatrix} R & R Z & -\dot{\bR}^\top \varepsilon_Y \\[6pt] R Z & RZ^2 & -\dot{\bR}^\top Z \varepsilon_Y \\[6pt] \zerovec & \zerovec & \bX^\top \bX e (1-e) \end{bmatrix}
\]
where
\[
\dot{\bR} = \frac{d R}{d \alpha} = \frac{1-Z}{1-e(X)} e(X) \bX^\top - \frac{Z}{e(X)} (1-e(X)) \bX^\top.
\]
\end{enumerate}
\item \textbf{G-estimation:} The most elementary form of the G-estimating equation is based on solving
\[
\sum_{i=1}^n (Z_i - e(X_i)) (Y_i - Z_i \psi_0) = 0,
\]
which yields the estimator
\[
\widehat \psi_0 = \dfrac{\displaystyle\sum_{i=1}^n (Z_i - e(X_i)) Y_i}{\displaystyle\sum_{i=1}^n (Z_i - e(X_i)) Z_i}.
\]
\begin{enumerate}[(a)]
\item \textbf{\textit{Known propensity score:}} The score is
\[
U(\theta) = (Z-e(X))(Y - Z \psi_0)
\]
with $\dot{U}(\theta) = Z (Z-e(X))$.
\item \textbf{\textit{Unknown propensity score:}} The score vector is
\[
\bU(\theta) = \begin{pmatrix} (Z-e(X; \alpha))(Y - Z \psi_0) \\[6pt] \bX^\top (Z - e(X; \alpha)) \end{pmatrix} = \begin{pmatrix} \epsilon_Z \epsilon_Y \\[6pt] \bX^\top \epsilon_Z \end{pmatrix}
\]
say, where $\epsilon_Y = (Y - Z \psi_0)$. Therefore
\[
\mathcal{I} = \E \begin{bmatrix} \epsilon_Z^2 \epsilon_Y^2 & \bX \epsilon_Z^2 \epsilon_Y \\[6pt] \epsilon_Z^2 \epsilon_Y \bX^\top & \bX^\top \bX \epsilon_Z^2 \end{bmatrix}
\qquad \qquad
\mathcal{J} = \E \begin{bmatrix} Z \epsilon_Z & \bX e (1-e) \epsilon_Y\\[6pt] \zerovec & \bX^\top \bX e (1-e) \end{bmatrix}.
\]
\end{enumerate}
\end{enumerate}

<<>>=
#Monte Carlo study of the six ATE estimators
library(mvnfast)
set.seed(22087)
n<-10000
muX<-2;sigX<-1
al<-c(-3.5,2);be<-c(2,3,4,1);sigY<-3      #true ATE is be[2] = 3
expit<-function(x){return(1/(1+exp(-x)))}
nreps<-10000
ate.ests<-matrix(0,nrow=nreps,ncol=6)
colnames(ate.ests)<-c('1(a)','1(b)','2(a)','2(b)','3(a)','3(b)')
for(irep in 1:nreps){
  X1<-rnorm(n,muX,sigX)
  Xm<-cbind(1,X1)
  eX0<-expit(Xm %*% al)                   #true propensity score
  Z<-rbinom(n,1,eX0)
  Xb<-cbind(1,Z,X1,X1^2)
  Y<-Xb %*% be + rnorm(n)*sigY
  eX<-fitted(glm(Z~X1,family=binomial))   #estimated propensity score
  ate.ests[irep,1]<-mean(Z*Y/eX0)-mean((1-Z)*Y/(1-eX0))
  ate.ests[irep,2]<-mean(Z*Y/eX)-mean((1-Z)*Y/(1-eX))
  w<-Z/eX0+(1-Z)/(1-eX0)
  ate.ests[irep,3]<-coef(lm(Y~Z,weights=w))[2]
  w<-Z/eX+(1-Z)/(1-eX)
  ate.ests[irep,4]<-coef(lm(Y~Z,weights=w))[2]
  res2<-Z-eX0
  ate.ests[irep,5]<-sum(res2*Y)/sum(res2*Z)
  res2<-Z-eX
  ate.ests[irep,6]<-sum(res2*Y)/sum(res2*Z)
}
#apply(ate.ests,2,mean)-3
apply(ate.ests,2,var)
@

<<>>=
par(mar=c(4,4,1,0))
boxplot(ate.ests,names=colnames(ate.ests),pch=19,cex=0.5);abline(h=be[2],lty=2,col='red')
@

<<>>=
#Large simulated sample used to evaluate the population-level I and J matrices
N<-100000
X1<-rnorm(N,muX,sigX)
Xm<-cbind(1,X1)
eX0<-expit(Xm %*% al)
Z<-rbinom(N,1,eX0)
Xb<-cbind(1,Z,X1,X1^2)
Y<-Xb %*% be + rnorm(N)*sigY
Xb0<-cbind(1,0,X1,X1^2)
Xb1<-cbind(1,1,X1,X1^2)
mu0<-Xb0 %*% be
mu1<-Xb1 %*% be
eX<-fitted(glm(Z~X1,family=binomial))
@

<<>>=
#IPW 1(a): PS known
#Direct calculation of the theoretical variance
vX0<-mean((sigY^2+mu0^2)/(1-eX0))
vX1<-mean((sigY^2+mu1^2)/eX0)
ipw.var<-(vX0+vX1)-be[2]^2
ipw.var/n
I.n<-J.n<-matrix(0,1,1)
res1<-(Z/eX0-(1-Z)/(1-eX0))*Y-be[2]
I.n[1,1]<-mean(res1^2)
vx<-eX0*(1-eX0)
J.n[1,1]<-1
I.var<-solve(J.n) %*% (I.n %*% t(solve(J.n)))/n
#This should match the 1(a) variance
I.var[1,1]
var(ate.ests[,1])
########################################################
#IPW 1(b): PS unknown
I.n<-J.n<-matrix(0,3,3)
res1<-(Z/eX-(1-Z)/(1-eX))*Y-be[2]
res2<-(Z-eX)
I.n[1,1]<-mean(res1^2)
I.n[1,2]<-I.n[2,1]<-mean(res1*res2)
I.n[2,2]<-mean(res2^2)
I.n[1,3]<-I.n[3,1]<-mean(X1*res1*res2)
I.n[2,3]<-I.n[3,2]<-mean(X1*res2^2)
I.n[3,3]<-mean(X1^2*res2^2)
I.n
vx<-eX*(1-eX)
J.n[1,1]<-1
J.n[1,2]<-mean(vx*Y*(Z/eX^2+(1-Z)/(1-eX)^2))
J.n[1,3]<-mean(X1*vx*Y*(Z/eX^2+(1-Z)/(1-eX)^2))
J.n[2,2]<-mean(vx)
J.n[2,3]<-J.n[3,2]<-mean(vx*X1)
J.n[3,3]<-mean(vx*X1^2)
J.n
I.var<-solve(J.n) %*% (I.n %*% t(solve(J.n)))/n
#This should match the 1(b) variance
I.var[1,1]
var(ate.ests[,2])
@
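In practice only a single dataset is available and the true values used in the checks above are unknown, so $\widehat I_n$ and $\widehat J_n$ are evaluated at the point estimates instead. The following sketch carries this out for case 1(b), treating the large simulated sample as the data at hand (hence the division by \texttt{N}); the objects \texttt{psi.hat}, \texttt{I.hat}, \texttt{J.hat} and \texttt{V.hat} are introduced here for illustration only and mirror the 1(b) construction above.

<<>>=
#Sandwich standard error for IPW version 1 with estimated PS, from one dataset
psi.hat<-mean(Z*Y/eX)-mean((1-Z)*Y/(1-eX))     #IPW point estimate of the ATE
res1<-c((Z/eX-(1-Z)/(1-eX))*Y)-psi.hat         #first score component at psi.hat
res2<-Z-eX
vx<-eX*(1-eX)
Xmat<-cbind(1,X1)
U<-cbind(res1,Xmat*res2)                       #N x 3 matrix of score contributions
I.hat<-crossprod(U)/N
J.hat<-matrix(0,3,3)
J.hat[1,1]<-1
J.hat[1,2:3]<-colMeans(Xmat*c(vx*Y*(Z/eX^2+(1-Z)/(1-eX)^2)))
J.hat[2:3,2:3]<-crossprod(Xmat*sqrt(vx))/N
V.hat<-solve(J.hat) %*% I.hat %*% t(solve(J.hat))/N
sqrt(V.hat[1,1])                               #estimated standard error of psi.hat
@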
\pagebreak

<<>>=
#IPW hat 2(a): PS known
w0<-(1-Z)/(1-eX0)
w1<-Z/eX0
w<-w0+w1
ipw.ests<-coef(lm(Y~Z,weights=w))
res1<-Y-ipw.ests[1]-ipw.ests[2]*Z
R<-w
I.n<-matrix(0,2,2)
I.n[1,1]<-mean(R^2*res1^2)
I.n[1,2]<-I.n[2,1]<-mean(R^2*Z*res1^2)
I.n[2,2]<-mean(R^2*Z^2*res1^2)
round(I.n,6)
J.n<-matrix(0,2,2)
J.n[1,1]<-mean(R)
J.n[1,2]<-mean(R*Z)
J.n[2,1]<-mean(R*Z)
J.n[2,2]<-mean(R*Z^2)
round(J.n,6)
#This should match the 2(a) variance
H.var<-solve(J.n) %*% (I.n %*% t(solve(J.n)))/n
H.var[2,2]
var(ate.ests[,3])
############################################################
#IPW hat 2(b): PS unknown
w0<-(1-Z)/(1-eX)
w1<-Z/eX
w<-w0+w1
ipw.ests<-coef(lm(Y~Z,weights=w))
res1<-Y-ipw.ests[1]-ipw.ests[2]*Z
res2<-Z-eX
R<-w
Rdot1<-((1-Z)/(1-eX))*eX-(Z/eX)*(1-eX)
Rdot2<-X1*Rdot1
I.n<-matrix(0,4,4)
I.n[1,1]<-mean(R^2*res1^2)
I.n[1,2]<-I.n[2,1]<-mean(R^2*Z*res1^2)
I.n[1,3]<-I.n[3,1]<-mean(R*res2*res1)
I.n[1,4]<-I.n[4,1]<-mean(X1*R*res2*res1)
I.n[2,2]<-mean(R^2*Z^2*res1^2)
I.n[2,3]<-I.n[3,2]<-mean(R*Z*res2*res1)
I.n[2,4]<-I.n[4,2]<-mean(X1*R*Z*res2*res1)
I.n[3,3]<-mean(res2^2)
I.n[3,4]<-I.n[4,3]<-mean(X1*res2^2)
I.n[4,4]<-mean(X1^2*res2^2)
round(I.n,6)
J.n<-matrix(0,4,4)
J.n[1,1]<-mean(R)
J.n[1,2]<-mean(R*Z)
J.n[1,3]<-mean(-Rdot1*res1)
J.n[1,4]<-mean(-Rdot2*res1)
J.n[2,1]<-mean(R*Z)
J.n[2,2]<-mean(R*Z^2)
J.n[2,3]<-mean(-Rdot1*Z*res1)
J.n[2,4]<-mean(-Rdot2*Z*res1)
J.n[3,3]<-mean(res2^2)
J.n[3,4]<-J.n[4,3]<-mean(X1*res2^2)
J.n[4,4]<-mean(X1^2*res2^2)
round(J.n,6)
#This should match the 2(b) variance
H.var<-solve(J.n) %*% (I.n %*% t(solve(J.n)))/n
H.var[2,2]
var(ate.ests[,4])
@

\pagebreak

<<>>=
#G-estimation variance 3(a): known PS
I.n<-J.n<-matrix(0,1,1)
res1<-(Y-be[2]*Z)
res2<-(Z-eX0)
I.n[1,1]<-mean(res1^2*res2^2)
J.n[1,1]<-mean(Z*res2)
#This should match the 3(a) variance
G.var<-solve(J.n) %*% (I.n %*% t(solve(J.n)))/n
G.var
var(ate.ests[,5])
############################################################
#G-estimation variance 3(b): unknown PS
I.n<-J.n<-matrix(0,3,3)
res1<-(Y-be[2]*Z)
res2<-(Z-eX)
I.n[1,1]<-mean(res1^2*res2^2)
I.n[1,2]<-I.n[2,1]<-mean(res2^2*res1)
I.n[2,2]<-mean(res2^2)
I.n[1,3]<-I.n[3,1]<-mean(X1*res2^2*res1)
I.n[2,3]<-I.n[3,2]<-mean(X1*res2^2)
I.n[3,3]<-mean(X1^2*res2^2)
I.n
vx<-eX*(1-eX)
J.n[1,1]<-mean(Z*res2)
J.n[1,2]<-mean(vx*res1)
J.n[2,2]<-mean(vx)
J.n[1,3]<-mean(vx*X1*res1)
J.n[2,3]<-J.n[3,2]<-mean(vx*X1)
J.n[3,3]<-mean(vx*X1^2)
J.n
#This should match the 3(b) variance
G.var<-solve(J.n) %*% (I.n %*% t(solve(J.n)))/n
G.var[1,1]
G.var[1,1]-G.var[1,-1] %*% solve(G.var[-1,-1]) %*% G.var[-1,1]
var(ate.ests[,6])
@

\end{document}