\documentclass[notitlepage]{article}
\usepackage{Math}
%\usepackage{../../Math598}
\usepackage{listings}
\usepackage{numprint}
\usepackage{enumerate}
\usepackage{bm}
\usepackage{bbm}
\usepackage{verbatim}
\usepackage{mathtools}
\usepackage{dsfont}
\usepackage{scalerel}
\usepackage{amsfonts,amsmath,amssymb}
\usepackage{tikz}
\usetikzlibrary{shapes,decorations,arrows,calc,arrows.meta,fit,positioning}
\usepackage{pgfplots}
\pgfplotsset{compat=1.17}
\newcommand*\circled[1]{\tikz[baseline=(char.base)]{
  \node[shape=circle,draw,inner sep=2pt] (char) {#1};}}
\usepackage{xcolor}
\newtheorem{alg}{Algorithm}
\newtheorem{ex}{Example}
\newtheorem{note}{Note}
\newenvironment{proof}{\par\noindent{\normalfont{\bf{Proof }}}}
  {\hfill \small$\blacksquare$\\[2mm]}
\setlength{\parindent}{0in}
\def\E{\mathbbmss{E}}
\def\Var{\text{Var}}
\def\Cov{\text{Cov}}
\def\Corr{\text{Corr}}
\def\bW{\mathbf{W}}
\def\bH{\mathbf{H}}
\newcommand{\bs}[1]{\bm{#1}}
\newcommand{\Rho}{\mathrm{P}}
\newcommand{\cif}{\text{ if }}
\newcommand{\Ra}{\Longrightarrow}
\def\bphi{\boldsymbol \phi}
\lstloadlanguages{R}
\definecolor{keywordcolor}{rgb}{0,0.6,0.6}
\definecolor{delimcolor}{rgb}{0.461,0.039,0.102}
\definecolor{Rcommentcolor}{rgb}{0.101,0.043,0.432}
\lstdefinestyle{Rsettings}{
  basicstyle=\ttfamily,
  breaklines=true,
  showstringspaces=false,
  keywords={if, else, function, theFunction, tmp}, % Write as many keywords
  otherkeywords={},
  commentstyle=\itshape\color{Rcommentcolor},
  keywordstyle=\color{keywordcolor},
  moredelim=[s][\color{delimcolor}]{"}{"},
}
\lstset{basicstyle=\ttfamily, numbers=none, literate={~} {$\sim$}{2}}
\def\Xb{\mathbf{X}_{\beta}}
\def\Xp{\mathbf{X}_{\psi}}
\def\beps{\boldsymbol{\epsilon}}
\def\betahat{\widehat \beta}
\def\psihat{\widehat \psi}
\begin{document}

<<>>=
library(knitr)
# global chunk options
opts_chunk$set(cache=TRUE, autodep=TRUE)
options(scipen=6)
options(repos=c(CRAN="https://cloud.r-project.org/"))
knit_hooks$set(document = function(x) {sub('\\usepackage[]{color}', '\\usepackage{xcolor}', x, fixed = TRUE)})
@

\begin{center}
{\textsclarge{Project 3: Solutions}}
\end{center}

\tikzset{
  -Latex,auto,node distance =1 cm and 1 cm,semithick,
  state/.style ={circle, draw, minimum width = 0.7 cm},
  box/.style ={rectangle, draw, minimum width = 0.7 cm, fill=lightgray},
  point/.style = {circle, draw, inner sep=0.08cm,fill,node contents={}},
  bidirected/.style={Latex-Latex,dashed},
  el/.style = {inner sep=3pt, align=left, sloped}
}

The data-generating conditional mean model for the binary treatment setting is
\[
\E_{Y|X,Z}^\calO[Y|X=x,Z=z] = 1 + x + z + x z
\]
with confounder $X \sim Normal(0.5,0.5^2)$, and the conditional model for $Z|X=x$ is $Bernoulli(e(x))$ with
\[
e(x) = \frac{\exp\{-1.5 + 2 x + x^2\}}{1+\exp\{-1.5 + 2 x + x^2 \}}.
\]
In the analysis, the propensity score is modelled parametrically with parameter $\alpha$, which is estimated via logistic regression.
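Under this data-generating model the treatment effect is linear in the confounder: subtracting the two conditional means gives
\[
\E_{Y|X,Z}^\calO[Y|X=x,Z=1] - \E_{Y|X,Z}^\calO[Y|X=x,Z=0] = (1 + x + 1 + x) - (1 + x) = 1 + x,
\]
so the target values of the treatment-effect parameters estimated below are $\psi_0 = \psi_1 = 1$.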
<<>>=
#Calculation for large N
library(mvnfast)
set.seed(22087)
N<-1000000
muX<-0.5;sigX<-0.5
X1<-rnorm(N,muX,sigX)
al<-c(-1.5,2,1)
expit<-function(x){return(1/(1+exp(-x)))}
Xa<-cbind(1,X1,X1^2)
be<-c(1,1,1,1)
sigY<-1
ps.true<-expit(Xa %*% al)
Z<-rbinom(N,1,ps.true)
Xb<-cbind(1,X1,Z,Z*X1)
Y<-Xb %*% be + rnorm(N)*sigY
par(mar=c(4,4,2,0))
hist(ps.true,breaks=seq(0,1,by=0.02),main='True propensity score',xlab=expression(e(x)))
box()
@

<<>>=
#Correct model
coef(summary(lm(Y~X1+Z+X1:Z)))
#Incorrect model
coef(summary(lm(Y~Z)))
@

\bigskip

\textbf{G-estimation}

The \textbf{G-estimating} form is
\[
\sum_{i=1}^n \begin{pmatrix} 1 \\ (z_i-e(x_i;\widehat \alpha)) \\[6pt] (z_i-e(x_i;\widehat \alpha)) x_i \end{pmatrix} (y_i - \beta_0 - z_i \psi_0 - z_i x_i \psi_1) = \begin{pmatrix} 0 \\[6pt] 0 \\[6pt] 0\end{pmatrix}
\]
that is, using the estimating (score) function
\[
\bZ_1^\top (\by - \bZ_2 \theta) = \zerovec
\]
say, where $\theta = (\beta_0, \psi_0, \psi_1)^\top$ and
\[
\bZ_1 = \begin{pmatrix} 1 & z_1 - e(x_1;\widehat \alpha) & (z_1 - e(x_1;\widehat \alpha)) x_1\\[3pt]
1 & z_2 - e(x_2;\widehat \alpha) & (z_2 - e(x_2;\widehat \alpha)) x_2 \\[3pt]
\vdots & \vdots & \vdots \\[3pt]
1 & z_n - e(x_n;\widehat \alpha) & (z_n - e(x_n;\widehat \alpha)) x_n \end{pmatrix}
\qquad
\bZ_2 = \begin{pmatrix} 1 & z_1 & z_1 x_1 \\[3pt] 1 & z_2 & z_2 x_2\\[3pt] \vdots & \vdots & \vdots \\[3pt] 1 & z_n & z_n x_n \end{pmatrix}
\]
The solution is therefore
\[
\begin{pmatrix} \widehat \beta_0 \\[6pt] \widehat \psi_0 \\[6pt] \widehat \psi_1 \end{pmatrix} = (\bZ_1^\top \bZ_2)^{-1} \bZ_1^\top \by
\]

<<>>=
eX<-fitted(glm(Z~X1+I(X1^2),family=binomial))
Zm1<-cbind(1,Z-eX,(Z-eX)*X1)
Zm2<-cbind(1,Z,Z*X1)
g.ests<-solve(t(Zm1) %*% Zm2) %*% t(Zm1) %*% Y
g.ests
@

We can incorporate the estimation of the propensity score parameters into the same estimating equation formulation by adding in the estimating equation for $\alpha$, namely, for the logistic regression model, the equations
\[
\sum_{i=1}^n \bx_i^\top (z_i - e(x_i;\alpha) )= \zerovec.
\]
We can compute an estimate of the variance-covariance matrix using the theory of estimating equations. For the $p \times 1$ system of estimating equations
\[
\sum_{i=1}^n \bU_i(\theta) = \zerovec
\]
with $\E[\bU(\theta_0)] = \zerovec$, we have that
\[
\sqrt{n}(\widehat \theta_n - \theta_0) \CiD Normal_p(\zerovec,\bV)
\]
where
\[
\bV = \mathcal{J}^{-1} \mathcal{I} \mathcal{J}^{-\top}
\]
with
\[
\mathcal{I} = \E[ \bU(\theta_0) \bU(\theta_0)^\top] \qquad \qquad \mathcal{J} = \E[\dot{\bU}(\theta_0)]
\]
both $(p \times p)$ matrices, and
\[
\dot{\bU}(\theta_0) = -\dfrac{\partial \bU(\theta)}{\partial \theta^\top} \bigg |_{\theta = \theta_0}.
\]
We proceed by computing estimates, $\widehat I_n$ and $\widehat J_n$, of the two matrices based on the observed data, and then computing
\[
\widehat \bV = \widehat J_n^{-1} \widehat I_n \widehat J_n^{-\top}
\]
to estimate the asymptotic variance; this approach is known as \textit{sandwich} or \textit{robust} variance estimation.
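To make the sandwich construction concrete before the element-by-element calculations below, the following sketch assembles $\widehat I_n$, $\widehat J_n$ and $\widehat \bV$ with matrix operations. It reuses \texttt{Y}, \texttt{Z}, \texttt{X1}, \texttt{eX}, \texttt{g.ests} and \texttt{N} from the chunks above, treats the fitted propensity score as if it were known (the case (a) form of the estimating function), and introduces the names \texttt{G}, \texttt{U}, \texttt{D}, \texttt{I.mat}, \texttt{J.mat} and \texttt{V.sand} purely for illustration; it is not part of the solution calculations that follow.

<<>>=
# Sketch only: sandwich variance for the G-estimating function above,
# with the fitted propensity score eX treated as fixed.
eps.Y <- as.vector(Y - cbind(1, Z, Z*X1) %*% g.ests)  # outcome residual
eps.Z <- as.vector(Z - eX)                            # treatment residual
G <- cbind(1, eps.Z, eps.Z*X1)   # multiplier vector in the estimating function
U <- G * eps.Y                   # n x 3 matrix of estimating function values
D <- cbind(1, Z, Z*X1)           # -d/d(beta0,psi0,psi1) of eps.Y
I.mat <- crossprod(U)/N          # estimate of E[U U^T]
J.mat <- crossprod(G, D)/N       # estimate of E[-dU/dtheta^T]
V.sand <- solve(J.mat) %*% I.mat %*% t(solve(J.mat))
V.sand                           # asymptotic variance; divide by n for a given sample size
@

This mirrors the element-by-element construction of $\widehat I_n$ and $\widehat J_n$ used in the chunks below.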
\begin{enumerate}[(a)]
\item \textbf{\textit{Known propensity score:}} If
\[
\epsilon_Y = (Y - \beta_0 - Z \psi_0 - Z X \psi_1) \qquad \qquad \epsilon_Z = (Z-e(X;\alpha))
\]
then
\[
\bU(\theta) = \begin{pmatrix} (Y - \beta_0 - Z \psi_0 - Z X \psi_1) \\[6pt] (Z-e(X; \alpha))(Y - \beta_0 - Z \psi_0 - Z X \psi_1) \\[6pt] (Z-e(X; \alpha)) X (Y - \beta_0 - Z \psi_0 - Z X \psi_1) \end{pmatrix} = \begin{pmatrix} \epsilon_Y \\[6pt] \epsilon_Z \epsilon_Y \\[6pt] \epsilon_Z X \epsilon_Y \end{pmatrix}
\]
and the derivative is
\[
\dot{\bU}(\theta) = \begin{bmatrix} 1 & Z & ZX\\[6pt] \epsilon_Z & Z \epsilon_Z & ZX \epsilon_Z \\[6pt] X \epsilon_Z & Z X \epsilon_Z & ZX^2 \epsilon_Z \end{bmatrix}
\]
and the $\calI$ and $\calJ$ matrices are $3 \times 3$.

\item \textbf{\textit{Unknown propensity score:}} If $\bX = (1,X,X^2)$, the score vector is
\[
\bU(\theta) = \begin{pmatrix} (Y - \beta_0 - Z \psi_0 - Z X \psi_1) \\[6pt] (Z-e(X; \alpha))(Y - \beta_0 - Z \psi_0 - Z X \psi_1) \\[6pt] (Z-e(X; \alpha)) X (Y - \beta_0 - Z \psi_0 - Z X \psi_1) \\[6pt] \bX^\top (Z - e(X; \alpha)) \end{pmatrix} = \begin{pmatrix} \epsilon_Y \\[6pt] \epsilon_Z \epsilon_Y \\[6pt] \epsilon_Z X \epsilon_Y \\[6pt] \bX^\top \epsilon_Z \end{pmatrix}
\]
and writing $e \equiv e(X;\alpha)$, the derivative is
\[
\dot{\bU}(\theta) = \begin{bmatrix} 1 & Z & ZX & \zerovec\\[6pt] \epsilon_Z & Z \epsilon_Z & ZX \epsilon_Z & e (1-e) \bX \epsilon_Y \\[6pt] X \epsilon_Z & Z X \epsilon_Z & ZX^2 \epsilon_Z & e (1-e) X \bX \epsilon_Y\\[6pt] 0 & 0 & 0 & e (1-e) \bX^\top \bX \end{bmatrix}
\]
and the $\calI$ and $\calJ$ matrices are $6 \times 6$.
\end{enumerate}

<<>>=
#Large sample calculation
eX<-ps.true
Zm1<-cbind(1,Z-eX,X1*(Z-eX))
Zm2<-cbind(1,Z,Z*X1)
g.ests0<-solve(t(Zm1) %*% Zm2) %*% t(Zm1) %*% Y
#G-estimation variance estimate
res1<-Y-g.ests0[1]-g.ests0[2]*Z - g.ests0[3]*Z*X1
res2<-Z-eX
I.n<-matrix(0,3,3)
I.n[1,1]<-mean(res1^2)
I.n[1,2]<-I.n[2,1]<-mean(res1^2*res2)
I.n[1,3]<-I.n[3,1]<-mean(res1^2*res2*X1)
I.n[2,2]<-mean(res1^2*res2^2)
I.n[2,3]<-I.n[3,2]<-mean(res1^2*res2^2*X1)
I.n[3,3]<-mean(res1^2*res2^2*X1^2)
J.n<-matrix(0,3,3)
J.n[1,1]<-1
J.n[1,2]<-mean(Z)
J.n[1,3]<-mean(Z*X1)
J.n[2,1]<-mean(res2)
J.n[2,2]<-mean(Z*res2)
J.n[2,3]<-mean(Z*X1*res2)
J.n[3,1]<-mean(X1*res2)
J.n[3,2]<-mean(Z*X1*res2)
J.n[3,3]<-mean(Z*X1^2*res2)
V1<-solve(J.n) %*% (I.n %*% t(solve(J.n)))
###############################################################
fit.p<-glm(Z~X1+I(X1^2),family=binomial)
eX<-fitted(fit.p)
Zm1<-cbind(1,Z-eX,X1*(Z-eX))
Zm2<-cbind(1,Z,Z*X1)
g.ests1<-solve(t(Zm1) %*% Zm2) %*% t(Zm1) %*% Y
#G-estimation variance estimate
res1<-Y-g.ests1[1]-g.ests1[2]*Z - g.ests1[3]*Z*X1
res2<-Z-eX
I.n<-matrix(0,6,6)
I.n[1,1]<-mean(res1^2)
I.n[1,2]<-I.n[2,1]<-mean(res1^2*res2)
I.n[1,3]<-I.n[3,1]<-mean(res1^2*res2*X1)
I.n[1,4]<-I.n[4,1]<-mean(res1*res2)
I.n[1,5]<-I.n[5,1]<-mean(res1*res2*X1)
I.n[1,6]<-I.n[6,1]<-mean(res1*res2*X1^2)
I.n[2,2]<-mean(res1^2*res2^2)
I.n[2,3]<-I.n[3,2]<-mean(res1^2*res2^2*X1)
I.n[2,4]<-I.n[4,2]<-mean(res1*res2^2)
I.n[2,5]<-I.n[5,2]<-mean(res1*res2^2*X1)
I.n[2,6]<-I.n[6,2]<-mean(res1*res2^2*X1^2)
I.n[3,3]<-mean(res1^2*res2^2*X1^2)
I.n[3,4]<-I.n[4,3]<-mean(res1*res2^2*X1)
I.n[3,5]<-I.n[5,3]<-mean(res1*res2^2*X1^2)
I.n[3,6]<-I.n[6,3]<-mean(res1*res2^2*X1^3)
I.n[4,4]<-mean(res2^2)
I.n[4,5]<-I.n[5,4]<-mean(X1*res2^2)
I.n[4,6]<-I.n[6,4]<-mean(X1^2*res2^2)
I.n[5,5]<-mean(X1^2*res2^2)
I.n[5,6]<-I.n[6,5]<-mean(X1^3*res2^2)
I.n[6,6]<-mean(X1^4*res2^2)
J.n<-matrix(0,6,6)
vX<-eX*(1-eX)
J.n[1,1]<-1
J.n[1,2]<-mean(Z)
J.n[1,3]<-mean(Z*X1)
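# Parameter ordering for the 6x6 matrices: (beta0, psi0, psi1, alpha0, alpha1, alpha2).
# Rows 4-6 of J.n keep their zero entries in columns 1-3 because the logistic score
# for alpha does not involve (beta0, psi0, psi1); similarly J.n[1,4:6] stays zero
# because eps_Y does not depend on alpha.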
J.n[2,1]<-mean(res2)
J.n[2,2]<-mean(Z*res2)
J.n[2,3]<-mean(Z*X1*res2)
J.n[2,4]<-mean(vX*res1)
J.n[2,5]<-mean(vX*res1*X1)
J.n[2,6]<-mean(vX*res1*X1^2)
J.n[3,1]<-mean(X1*res2)
J.n[3,2]<-mean(Z*X1*res2)
J.n[3,3]<-mean(Z*X1^2*res2)
J.n[3,4]<-mean(vX*X1*res1)
J.n[3,5]<-mean(vX*res1*X1^2)
J.n[3,6]<-mean(vX*res1*X1^3)
J.n[4,4]<-mean(vX)
J.n[4,5]<-J.n[5,4]<-mean(X1*vX)
J.n[4,6]<-J.n[6,4]<-mean(X1^2*vX)
J.n[5,5]<-mean(X1^2*vX)
J.n[5,6]<-J.n[6,5]<-mean(X1^3*vX)
J.n[6,6]<-mean(X1^4*vX)
V2<-solve(J.n) %*% (I.n %*% t(solve(J.n)))
@

<<>>=
#Monte Carlo study
nreps<-20000
n<-1000
psr.ests<-matrix(0,nrow=nreps,ncol=3)
g.ests0<-g.ests1<-al.ests<-matrix(0,nrow=nreps,ncol=3)
V1.ests<-array(0,c(nreps,3,3))
V2.ests<-array(0,c(nreps,6,6))
for(irep in 1:nreps){
  X1<-rnorm(n,muX,sigX)
  Xa<-cbind(1,X1,X1^2)
  ps.true<-expit(Xa %*% al)
  Z<-rbinom(n,1,ps.true)
  Xb<-cbind(1,X1,Z,X1*Z)
  Y<-Xb %*% be + rnorm(n)*sigY
  eX<-ps.true
  Zm1<-cbind(1,Z-eX,X1*(Z-eX))
  Zm2<-cbind(1,Z,Z*X1)
  g.ests0[irep,]<-solve(t(Zm1) %*% Zm2) %*% t(Zm1) %*% Y
  #G-estimation variance estimate
  res1<-Y-g.ests0[irep,1]-g.ests0[irep,2]*Z - g.ests0[irep,3]*Z*X1
  res2<-Z-eX
  I.n<-matrix(0,3,3)
  I.n[1,1]<-mean(res1^2)
  I.n[1,2]<-I.n[2,1]<-mean(res1^2*res2)
  I.n[1,3]<-I.n[3,1]<-mean(res1^2*res2*X1)
  I.n[2,2]<-mean(res1^2*res2^2)
  I.n[2,3]<-I.n[3,2]<-mean(res1^2*res2^2*X1)
  I.n[3,3]<-mean(res1^2*res2^2*X1^2)
  J.n<-matrix(0,3,3)
  J.n[1,1]<-1
  J.n[1,2]<-mean(Z)
  J.n[1,3]<-mean(Z*X1)
  J.n[2,1]<-mean(res2)
  J.n[2,2]<-mean(Z*res2)
  J.n[2,3]<-mean(Z*X1*res2)
  J.n[3,1]<-mean(X1*res2)
  J.n[3,2]<-mean(Z*X1*res2)
  J.n[3,3]<-mean(Z*X1^2*res2)
  V<-solve(J.n) %*% (I.n %*% t(solve(J.n)))/n
  V1.ests[irep,,]<-V
  ###############################################################
  fit.p<-glm(Z~X1+I(X1^2),family=binomial)
  eX<-fitted(fit.p)
  al.ests[irep,]<-coef(fit.p)
  Zm1<-cbind(1,Z-eX,X1*(Z-eX))
  Zm2<-cbind(1,Z,Z*X1)
  g.ests1[irep,]<-solve(t(Zm1) %*% Zm2) %*% t(Zm1) %*% Y
  #G-estimation variance estimate
  res1<-Y-g.ests1[irep,1]-g.ests1[irep,2]*Z - g.ests1[irep,3]*Z*X1
  res2<-Z-eX
  I.n<-matrix(0,6,6)
  I.n[1,1]<-mean(res1^2)
  I.n[1,2]<-I.n[2,1]<-mean(res1^2*res2)
  I.n[1,3]<-I.n[3,1]<-mean(res1^2*res2*X1)
  I.n[1,4]<-I.n[4,1]<-mean(res1*res2)
  I.n[1,5]<-I.n[5,1]<-mean(res1*res2*X1)
  I.n[1,6]<-I.n[6,1]<-mean(res1*res2*X1^2)
  I.n[2,2]<-mean(res1^2*res2^2)
  I.n[2,3]<-I.n[3,2]<-mean(res1^2*res2^2*X1)
  I.n[2,4]<-I.n[4,2]<-mean(res1*res2^2)
  I.n[2,5]<-I.n[5,2]<-mean(res1*res2^2*X1)
  I.n[2,6]<-I.n[6,2]<-mean(res1*res2^2*X1^2)
  I.n[3,3]<-mean(res1^2*res2^2*X1^2)
  I.n[3,4]<-I.n[4,3]<-mean(res1*res2^2*X1)
  I.n[3,5]<-I.n[5,3]<-mean(res1*res2^2*X1^2)
  I.n[3,6]<-I.n[6,3]<-mean(res1*res2^2*X1^3)
  I.n[4,4]<-mean(res2^2)
  I.n[4,5]<-I.n[5,4]<-mean(X1*res2^2)
  I.n[4,6]<-I.n[6,4]<-mean(X1^2*res2^2)
  I.n[5,5]<-mean(X1^2*res2^2)
  I.n[5,6]<-I.n[6,5]<-mean(X1^3*res2^2)
  I.n[6,6]<-mean(X1^4*res2^2)
  J.n<-matrix(0,6,6)
  vX<-eX*(1-eX)
  J.n[1,1]<-1
  J.n[1,2]<-mean(Z)
  J.n[1,3]<-mean(Z*X1)
  J.n[2,1]<-mean(res2)
  J.n[2,2]<-mean(Z*res2)
  J.n[2,3]<-mean(Z*X1*res2)
  J.n[2,4]<-mean(vX*res1)
  J.n[2,5]<-mean(vX*res1*X1)
  J.n[2,6]<-mean(vX*res1*X1^2)
  J.n[3,1]<-mean(X1*res2)
  J.n[3,2]<-mean(Z*X1*res2)
  J.n[3,3]<-mean(Z*X1^2*res2)
  J.n[3,4]<-mean(vX*X1*res1)
  J.n[3,5]<-mean(vX*res1*X1^2)
  J.n[3,6]<-mean(vX*res1*X1^3)
  J.n[4,4]<-mean(vX)
  J.n[4,5]<-J.n[5,4]<-mean(X1*vX)
  J.n[4,6]<-J.n[6,4]<-mean(X1^2*vX)
  J.n[5,5]<-mean(X1^2*vX)
  J.n[5,6]<-J.n[6,5]<-mean(X1^3*vX)
  J.n[6,6]<-mean(X1^4*vX)
  V<-solve(J.n) %*% (I.n %*% t(solve(J.n)))/n
  V2.ests[irep,,]<-V
}
@

<<>>=
par(mar=c(4,4,2,0))
psi.ests<-cbind(g.ests0[,2:3],g.ests1[,2:3])
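# Columns 1-2: psi estimates using the true propensity score; columns 3-4: using the
# estimated propensity score. The dashed line in the boxplot marks the true value
# psi0 = psi1 = 1.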
pn<-c(expression(hat(psi)[0]),expression(hat(psi)[1]),expression(hat(psi)[0]),expression(hat(psi)[1]))
colnames(psi.ests)<-c('Psi0-0','Psi1-0','Psi0-1','Psi1-1')
boxplot(psi.ests,names=pn,pch=19,cex=0.5)
abline(h=1,col='red',lty=2)
@

<<>>=
#Variances
apply(psi.ests,2,var)
@

<<>>=
#Case (a): known propensity score
round(apply(V1.ests,2:3,mean),6)[2:3,2:3]     #Average sandwich estimate over Monte Carlo replications
round(cov(cbind(g.ests0,al.ests)),6)[2:3,2:3] #Empirical (Monte Carlo) variance of the estimates
round(V1/n,6)[2:3,2:3]                        #Sandwich estimate from the large-sample calculation, scaled to n
#Case (b): estimated propensity score
round(apply(V2.ests,2:3,mean),6)[2:3,2:3]     #Average sandwich estimate over Monte Carlo replications
round(cov(cbind(g.ests1,al.ests)),6)[2:3,2:3] #Empirical (Monte Carlo) variance of the estimates
round(V2/n,6)[2:3,2:3]                        #Sandwich estimate from the large-sample calculation, scaled to n
@

\end{document}