\documentclass[notitlepage]{article}
\usepackage{Math}
%\usepackage{../../Math598}
\usepackage{listings}
\usepackage{numprint}
\usepackage{enumerate}
\usepackage{bm}
\usepackage{bbm}
\usepackage{verbatim}
\usepackage{mathtools}
\usepackage{dsfont}

\usepackage{amsfonts,amsmath,amssymb}

\usepackage{tikz}
\usetikzlibrary{shapes,decorations,arrows,calc,arrows.meta,fit,positioning}


\usepackage{pgfplots}
\pgfplotsset{compat=1.17}

\newcommand*\circled[1]{\tikz[baseline=(char.base)]{
            \node[shape=circle,draw,inner sep=2pt] (char) {#1};}}

\usepackage{xcolor}

\newtheorem{alg}{Algorithm}
\newtheorem{ex}{Example}
\newtheorem{note}{Note}
\newenvironment{proof}{\par\noindent{\normalfont{\bf{Proof
}}}}
{\hfill \small$\blacksquare$\\[2mm]}


\setlength{\parindent}{0in}

\def\E{\mathbbmss{E}}
\def\Var{\text{Var}}
\def\Cov{\text{Cov}}
\def\Corr{\text{Corr}}

\def\bW{\mathbf{W}}
\def\bH{\mathbf{H}}

\newcommand{\bs}[1]{\bm{#1}}
\newcommand{\Rho}{\mathrm{P}}
\newcommand{\cif}{\text{ if }}
\newcommand{\Ra}{\Longrightarrow}

\def\bphi{\boldsymbol \phi}


\lstloadlanguages{R}

\definecolor{keywordcolor}{rgb}{0,0.6,0.6}
\definecolor{delimcolor}{rgb}{0.461,0.039,0.102}
\definecolor{Rcommentcolor}{rgb}{0.101,0.043,0.432}

\lstdefinestyle{Rsettings}{
  basicstyle=\ttfamily,
  breaklines=true,
  showstringspaces=false,
  keywords={if, else, function, theFunction, tmp}, % Write as many keywords
  otherkeywords={},
  commentstyle=\itshape\color{Rcommentcolor},
  keywordstyle=\color{keywordcolor},
  moredelim=[s][\color{delimcolor}]{"}{"},
}

\lstset{basicstyle=\ttfamily, numbers=none, literate={~} {$\sim$}{2}}

\def\Xb{\mathbf{X}_{\beta}}
\def\Xp{\mathbf{X}_{\psi}}
\def\beps{\boldsymbol{\epsilon}}
\def\betahat{\widehat \beta}
\def\psihat{\widehat \psi}

\begin{document}

<<setup, cache=FALSE, echo=FALSE>>=
library(knitr)
# global chunk options
opts_chunk$set(cache=TRUE, autodep=TRUE)
options(scipen=999)
options(repos=c(CRAN="https://cloud.r-project.org/"))
knit_hooks$set(document = function(x) {sub('\\usepackage[]{color}', '\\usepackage{xcolor}', x, fixed = TRUE)})
@
\begin{center}
{\textsclarge{Causal Adjustment Methods Using The Propensity Score}}
\end{center}

\tikzset{
    -Latex,auto,node distance =1 cm and 1 cm,semithick,
    state/.style ={circle, draw, minimum width = 0.7 cm},
    box/.style ={rectangle, draw, minimum width = 0.7 cm, fill=lightgray},
    point/.style = {circle, draw, inner sep=0.08cm,fill,node contents={}},
    bidirected/.style={Latex-Latex,dashed},
    el/.style = {inner sep=3pt, align=left, sloped}

}

The most common causal inference problem is to estimate the causal effect of treatment $Z$ on outcome $Y$ in the presence of confounders $X$.  The causal effect is defined via the expectation under the experimental measure $\calE$.
\[
\mu(z) = \E_{Y|Z}^\calE [Y|Z=z] \qquad z \in \mathcal{Z}.
\]

The propensity score for binary exposure, denoted $e(X)$, is defined via the observational conditional model for $Z$ given confounders $X$, as
\[
e(x) = f_{Z|X}^\calO (1|x) = {\Pr}_{Z|X}^\calO[Z=1|X=x]
\]
with $e(X)$ being the corresponding random variable.  The propensity score is a \textit{balancing score}, that is, we have that $Z \independent X \given e(X)$.

\begin{figure}[ht]
\centering
\begin{tikzpicture}[scale=1.5]
    % x node set with absolute coordinates
    \node[state] (z) at (0,0) {${Z}$};
    \node[state] (y) at (2,0){${Y}$};
    \node[state] (x) at (1,-2) {${X}$};

    \path (x) edge (y);
    \path (x) edge (z);
    \path (z) edge (y);

    % x node set with absolute coordinates
    \node[state] (z1) at (4,0) {${Z}$};
    \node[state] (y1) at (6,0){${Y}$};
    \node[state] (x1) at (5,-2) {${X}$};
    \node[state] (b1) at (4.5,-1) {$\scriptstyle e(X)$};

    \path (x1) edge (y1);
    \path (x1) edge[line width=1pt, double distance=1pt, -{Classical TikZ Rightarrow[length=1.5mm]}] (b1);
    \path (z1) edge (y1);
    \path (b1) edge (z1);

    \node at (1,0.5) {Original DAG};
    \node at (5,0.5) {DAG with propensity score $e(X)$};


 \end{tikzpicture}
\end{figure}


Therefore, an analysis that conditions on $e(X)$ will block the biasing (backdoor, confounding) path between $Z$ and $Y$ that goes via $X$.
\medskip

Several different methods based on the propensity score can be used for causal adjustment:

\begin{itemize}

\item \textbf{Stratification :}  The propensity score space is partitioned into $J$ regions or \textit{strata}, and the `local' causal effect is estimated in each, and then combined in an average to estimate $\mu(z)$

\item \textbf{Matching :} The outcomes for individuals with different $Z$ values but equal $e(X)$ values (that is, the individuals are \textit{matched} on  their propensity score values) are compared.

\item \textbf{Regression :}  The propensity score is conditioned upon using a \textit{regression model}.

\item \textbf{Inverse Weighting :}  Using an `importance sampling' \textit{reweighting} via the propensity score of the $Y$ values observed under $\calO$, a pseudo-sample is created that can be treated as a sample collected under $\mathcal{E}$ in which exposure is balanced.

\end{itemize}

Consider the following example where we generate according to the parametric model

\begin{itemize}

\item $X \sim \textrm{Normal}_3(\mu, \Sigma)$ with
\[
\mu = \begin{bmatrix*}[r]
        1 \\
        -2 \\
        1
      \end{bmatrix*} \qquad \qquad \Sigma =
      \begin{bmatrix*}[r]
      0.25 & 0.00 & 0.00 \\
      0.00 & 0.50 & 0.00 \\
      0.00 & 0.00 & 0.75
      \end{bmatrix*}
      \begin{bmatrix*}[r]
      1.0 & 0.9 & -0.1 \\
      0.9 & 1.0 & -0.2 \\
      -0.1 & -0.2 & 1.0
      \end{bmatrix*}
      \begin{bmatrix*}[r]
      0.25 & 0.00 & 0.00 \\
      0.00 & 0.50 & 0.00 \\
      0.00 & 0.00 & 0.75
      \end{bmatrix*}
\]

\item $Z|X = x \sim Bernoulli(e(x;\alpha))$, with
\[
e(x;\alpha) = \frac{\exp\{\alpha_0 + \alpha_1 x_1 + \alpha_2 x_2 + \alpha_3 x_1 x_2 \}}{1 + \exp\{\alpha_0 + \alpha_1 x_1 + \alpha_2 x_2 + \alpha_3 x_1 x_2 \}}
\]
with $\alpha = (\alpha_0,\alpha_1,\alpha_2,\alpha_3)^\top = (6.0,-0.2,0.7,2)^\top$.

\item $Y|X=x,Z=z \sim \text{Normal}(\mu(x,z; \beta,\psi),\sigma^2)$, with
\[
\mu(x,z) = \beta_0 + \beta_1 x_1 + \beta_2 x_2 + \beta_3 x_1 x_2 + z (\psi_0 + \psi_1 x_1 + \psi_2 x_2 + \psi_3 x_1 x_2) = \mu_0(x;\beta) + z \mu_1(x;\psi)
\]
say, with $\sigma^2 = 0.1^2$, and
\[
\beta = (\beta_0,\beta_1,\beta_2,\beta_3)^\top = (10,-2.0,1.2,0.6)^\top \qquad \psi = (\psi_0,\psi_1,\psi_2,\psi_3)^\top = (1,1,1,1)^\top
\]
In this model, $X_1$ and $X_2$ are confounders, and $X_3$ is a spurious variable unrelated to $Y$ or $Z$.
\end{itemize}


<<CA1,comment='+', fig.width=7.25, fig.height=5.0, fig.align='center'>>=
#Data simulation
library(MASS);library(xtable)
set.seed(23987)
n<-1000
D<-diag(c(0.25,0.5,0.75))
Mu<-c(1,-2,-1)
Sigma<-D %*% (matrix(c(1,0.9,-0.1,0.9,1,-0.2,-0.1,-0.2,1),3,3) %*% D)
X<-mvrnorm(n,mu=Mu,Sigma)
al<-c(6.0,-0.2,0.7,2)
Xal<-cbind(1,X[,1],X[,2],X[,1]*X[,2])
expit<-function(x){return(1/(1+exp(-x)))}
ps.true<-expit(Xal %*% al)
Z<-rbinom(n,1,ps.true)
be<-c(10,-2.0,1.2,0.6)
Xb<-cbind(1,X[,1],X[,2],X[,1]*X[,2])
mu0<- Xb %*% be
psi<-c(1,1,1,1)
Xp<-cbind(1,X[,1],X[,2],X[,1]*X[,2])
mu1<- (Xp %*% psi)
eta<-mu0 + Z * mu1 ; sig<-0.1
Y<-rnorm(n,eta,sig)
X1<-X[,1];X2<-X[,2];X3<-X[,3]
true.vals<-c(be,psi)[c(1,2,3,5,4,6,7,8)]
eX<-as.vector(Z-ps.true)
@
In this linear model, we have that the average causal effect can be computed directly in terms of the model parameters:
\[
\mu(z) = \E_{Y|Z}^\calE [Y|Z=z] \equiv \E_{X}^\calO \left[\E_{Y|X,Z}^\calO [Y|X,Z=z] \right] = \E_{X}^\calO \left[\mu(X,z) \right]
\]
so in this case
\begin{align*}
\mu(z) & = \E_{X}^\calO \left[\beta_0 + \beta_1 X_1 + \beta_2 X_2 + \beta_3 X_1 X_2 + z (\psi_0 + \psi_1 X_1 + \psi_2 X_2 + \psi_3 X_1 X_2) \right]\\[6pt]
& = \beta_0 + \beta_1 \E[X_1] + \beta_2 \E[X_2] + \beta_3 \E[X_1 X_2] + z (\psi_0 + \psi_1 \E[X_1] + \psi_2 \E[X_2] + \psi_3 \E[X_1 X_2])
\end{align*}
Here we have that
\[
\E[X X^\top] = \begin{bmatrix*}[c]
                 \E[X_1^2] & \E[X_1 X_2] & \E[X_1 X_3] \\[6pt]
                 \E[X_1 X_2] & \E[X_2^2] & \E[X_2 X_3] \\[6pt]
                 \E[X_1 X_3] & \E[X_2 X_2] & \E[X_3^2]
               \end{bmatrix*} = \Sigma + \mu \mu^\top =
\begin{bmatrix*}[r]
                 \Sexpr{Sigma[1,1]} & \Sexpr{Sigma[1,2]} & \Sexpr{Sigma[1,3]} \\[6pt]
                 \Sexpr{Sigma[2,1]} & \Sexpr{Sigma[2,2]} & \Sexpr{Sigma[2,3]} \\[6pt]
                 \Sexpr{Sigma[3,1]} & \Sexpr{Sigma[3,2]} & \Sexpr{Sigma[3,3]}
               \end{bmatrix*} +
\begin{bmatrix*}[r]
                 \Sexpr{Mu[1]*Mu[1]} & \Sexpr{Mu[1]*Mu[2]} & \Sexpr{Mu[1]*Mu[3]} \\[6pt]
                 \Sexpr{Mu[2]*Mu[1]} & \Sexpr{Mu[2]*Mu[2]} & \Sexpr{Mu[2]*Mu[3]} \\[6pt]
                 \Sexpr{Mu[3]*Mu[1]} & \Sexpr{Mu[3]*Mu[2]} & \Sexpr{Mu[3]*Mu[3]}
               \end{bmatrix*}
\]
which yields
<<CA2>>=
(X.M<-Sigma + Mu %*% t(Mu))
@
so therefore $\mu(0)$ and $\mu(1)$ are
<<CA2a>>=
(mu0<-sum(be*c(1,Mu[1],Mu[2],X.M[1,2])))
(mu1<-mu0 + sum(psi*c(1,Mu[1],Mu[2],X.M[1,2])))
@
and hence the `causal effect' is
\[
\mu(1)-\mu(0) = \Sexpr{mu1}-\Sexpr{mu0} = \Sexpr{mu1-mu0}.
\]

\pagebreak

\section{Unadjusted analysis}
If we try to analyze the data as if they arose from an experimental study, as in an unadjusted analysis, it is evident that incorrect inferences are drawn.
<<CA3a,comment='+', fig.width=7.25, fig.height=4.0, fig.align='center'>>=
par(mar=c(2,2,0,0))
boxplot(Y~Z)
points(1:2,c(mu0,mu1),col='red',pch=15)
c(mean(Y[Z==0]),mean(Y[Z==1]),mean(Y[Z==1])-mean(Y[Z==0]))
@
Thus the estimated causal effect is \Sexpr{mean(Y[Z==1])-mean(Y[Z==0])}.

\section{Outcome Regression}
\label{sec:OR}

\subsection{Correct Specification}

A regression analysis using a correctly specified model returns correct parameter estimation.
<<CA3b,comment='+', fig.width=7.25, fig.height=5.0, fig.align='center'>>=
fit0<-lm(Y~X1+X2+X1:X2+Z+Z:X1+Z:X2+Z:X1:X2)
round(cbind(coef(summary(fit0)),true.vals),6)
@
From this fit, it is possible to estimate the causal quantities using an average prediction
\begin{equation}\label{eq:regest}
\widehat \mu (z) = \frac{1}{n} \sum_{i=1}^n \mu(x_i,z;\widehat \beta,\widehat \psi)
\end{equation}
<<CA3c,comment='+', fig.width=7.25, fig.height=5.0, fig.align='center'>>=
mu0.0<-mean(predict(fit0,newdata=data.frame(X1,X2,Z=0)))
mu1.0<-mean(predict(fit0,newdata=data.frame(X1,X2,Z=1)))
c(mu0.0,mu1.0,mu1.0-mu0.0)
@

\medskip

Notice, however, that what is being computed is \textbf{not the average of the predictions for the observed $z$ values} in the two cases for $z_i=1$ and $z_i=0$, which would be respectively
\[
\dfrac{\sum\limits_{i=1}^n z_i \mu(x_i,1;\widehat \beta,\widehat \psi)}{\sum\limits_{i=1}^n z_i} \qquad \qquad \dfrac{\sum\limits_{i=1}^n (1-z_i) \mu(x_i,0;\widehat \beta,\widehat \psi)}{\sum\limits_{i=1}^n (1-z_i)}.
\]
Such a procedure would not estimate the required quantities correctly:
<<CA3d,comment='+', fig.width=7.25, fig.height=5.0, fig.align='center'>>=
m0<-sum((1-Z)*predict(fit0))/sum(1-Z)
m1<-sum(Z*predict(fit0))/sum(Z)
c(m0,m1,m1-m0)
@
This is due to the confounding present in the structure of the joint distribution,

\subsection{Incorrect specification}

An incorrectly specified model leads to incorrect estimation, although the estimated causal parameters can still potentially be recovered.  For example, if the interaction terms are omitted, although it is not possible to estimate individual coefficients correctly, the causal quantities of interest are estimated reasonably well.
<<CA4,comment='+', fig.width=7.25, fig.height=5.0, fig.align='center'>>=
fit1<-lm(Y~X1+X2+Z+Z:X1+Z:X2)
round(coef(summary(fit1)),6)
mu0.1<-mean(predict(fit1,newdata=data.frame(X1,X2,Z=0)))
mu1.1<-mean(predict(fit1,newdata=data.frame(X1,X2,Z=1)))
c(mu0.1,mu1.1,mu1.1-mu0.1)
@
The omission of the \texttt{X1:X2} and \texttt{Z:X1:X2} interactions does not lead to poor results in this case because the magnitude of the effect of these terms is relatively small.

\pagebreak

If either $X_1$ or $X_2$ or both is omitted, there is no correct recovery.
<<CA5,comment='+', fig.width=7.25, fig.height=5.0, fig.align='center'>>=
fit2<-lm(Y~X1+Z+Z:X1)
mu0.2<-mean(predict(fit2,newdata=data.frame(X1,Z=0)))
mu1.2<-mean(predict(fit2,newdata=data.frame(X1,Z=1)))
c(mu0.2,mu1.2,mu1.2-mu0.2)
fit3<-lm(Y~X2+Z+Z:X2)
mu0.3<-mean(predict(fit3,newdata=data.frame(X1,Z=0)))
mu1.3<-mean(predict(fit3,newdata=data.frame(X1,Z=1)))
c(mu0.3,mu1.3,mu1.3-mu0.3)
fit4<-lm(Y~Z)
mu0.4<-mean(predict(fit4,newdata=data.frame(X1,Z=0)))
mu1.4<-mean(predict(fit4,newdata=data.frame(X1,Z=1)))
c(mu0.4,mu1.4,mu1.4-mu0.4)
@

\section{Propensity score stratification}

For propensity score stratification, we seek to first construct strata of $X$, $\mathcal{X}_j, j=1,\ldots,J$ defined by propensity score values, then estimate the causal quantities within each stratum, and then combine across strata.

\subsection{True propensity score}
First, we assume the true propensity score values are known and are given as in the simulation.
<<CA6a,comment='+', fig.width=7.25, fig.height=4.0, fig.align='center'>>=
par(mar=c(2,2,0,0))
hist(ps.true,main='',col='gray',breaks=seq(0,1,by=0.02));box()
@
We may define the strata by taking percentiles of the propensity score, for example, quintiles.
<<CA6b,comment='+', fig.width=7.25, fig.height=4.0, fig.align='center'>>=
nstrata<-5
ps.quint.true<-quantile(ps.true,probs=seq(0,1,by=1/nstrata))
ps.quint.true
ps.strat.true<-as.numeric(cut(ps.true,ps.quint.true,include.lowest=TRUE))
table(Z,ps.strat.true)
@
Stratification by the quintiles of the propensity score allows a reasonable number of both $Z=0$ and $Z=1$ cases in each of the strata.
<<CA6c,comment='+', fig.width=7.25, fig.height=4.0, fig.align='center'>>=
mu.strat.true<-matrix(0,ncol=nstrata,nrow=3)
for(j in 1:nstrata){
    mu.strat.true[1,j]<-mean(Y[Z==0 & ps.strat.true == j])
    mu.strat.true[2,j]<-mean(Y[Z==1 & ps.strat.true == j])
    mu.strat.true[3,j]<-mu.strat.true[2,j]-mu.strat.true[1,j]
}
mu.strat.true
@
We can then estimate the causal quantities by averaging within the rows of this matrix to compute
\[
\widehat \mu(z) = \sum_{j=1}^J \widehat \mu^{\mathcal{X}_j}(z) \Pr[X \in \mathcal{X}_j]
\]
<<CA6d,comment='+', fig.width=7.25, fig.height=4.0, fig.align='center'>>=
apply(mu.strat.true,1,mean)
@

\subsection{Fitted propensity score}

In practice, we do not know the true values of the propensity score, and so typically we would estimate them from the observed data.   If we assume the correct form for the model, but not the values of the parameters, we can estimate them from the observed data using a logistic regression model and the \texttt{glm} function.
<<CA7a,comment='+', fig.width=7.25, fig.height=4.0, fig.align='center'>>=
ps.fit.c<-fitted(glm(Z~X1+X2+X1:X2,family=binomial))
@
We can then proceed with an identical analysis, replacing the true PS values with the fitted ones.
<<CA7b,comment='+', fig.width=7.25, fig.height=4.0, fig.align='center'>>=
ps.quint.fit.c<-quantile(ps.fit.c,probs=seq(0,1,by=1/nstrata))
ps.quint.fit.c
ps.strat.fit.c<-as.numeric(cut(ps.fit.c,ps.quint.fit.c,include.lowest=TRUE))
table(Z,ps.strat.fit.c)
mu.strat.fit.c<-matrix(0,ncol=nstrata,nrow=3)
for(j in 1:nstrata){
    mu.strat.fit.c[1,j]<-mean(Y[Z==0 & ps.strat.fit.c == j])
    mu.strat.fit.c[2,j]<-mean(Y[Z==1 & ps.strat.fit.c == j])
    mu.strat.fit.c[3,j]<-mu.strat.fit.c[2,j]-mu.strat.fit.c[1,j]
}
mu.strat.fit.c
apply(mu.strat.fit.c,1,mean)
@
The results are broadly similar to when the true PS values are used.  However, if the PS model is misspecified, this does not hold; for example if we omit $X_2$ from the PS model, incorrect results are obtained.
<<CA7c,comment='+', fig.width=7.25, fig.height=4.0, fig.align='center'>>=
ps.fit.i<-fitted(glm(Z~X1,family=binomial))
ps.quint.fit.i<-quantile(ps.fit.i,probs=seq(0,1,by=1/nstrata))
ps.quint.fit.i
ps.strat.fit.i<-as.numeric(cut(ps.fit.i,ps.quint.fit.i,include.lowest=TRUE))
table(Z,ps.strat.fit.i)
mu.strat.fit.i<-matrix(0,ncol=nstrata,nrow=3)
for(j in 1:nstrata){
    mu.strat.fit.i[1,j]<-mean(Y[Z==0 & ps.strat.fit.i == j])
    mu.strat.fit.i[2,j]<-mean(Y[Z==1 & ps.strat.fit.i == j])
    mu.strat.fit.i[3,j]<-mu.strat.fit.i[2,j]-mu.strat.fit.i[1,j]
}
mu.strat.fit.i
apply(mu.strat.fit.i,1,mean)
@

\pagebreak

With a mis-specified PS model, we no longer have a balancing score for the known confounding variables, that is, $Z$ and $X$ are no longer independent within strata of the propensity score.  Lack of balance can be diagnosed numerically (by comparing means of the confounders for $Z=0$ and $Z=1$ within strata of the propensity score) or graphically (by comparing boxplots within strata).  For the correct model, we have
<<CA8a,comment='+', fig.width=7.25, fig.height=4.0, fig.align='center'>>=
t1<-aggregate(X1,by=list(Z,ps.strat.fit.c),FUN=mean)
t2<-aggregate(X2,by=list(Z,ps.strat.fit.c),FUN=mean)
t3<-aggregate(X3,by=list(Z,ps.strat.fit.c),FUN=mean)
dnames<-list(c("Z=0", "Z=1"),c("Q1", "Q2", "Q3","Q4","Q5"))
matrix(t1$x,nrow=2,ncol=nstrata,byrow=FALSE,dimnames = dnames )   #Summary for X1: mean
matrix(t2$x,nrow=2,ncol=nstrata,byrow=FALSE,dimnames = dnames )   #Summary for X2: mean
matrix(t3$x,nrow=2,ncol=nstrata,byrow=FALSE,dimnames = dnames )   #Summary for X3: mean
@
and the variables fitted within the PS are balanced. For the incorrect model we have
<<CA8b,comment='+', fig.width=7.25, fig.height=4.0, fig.align='center'>>=
t1<-aggregate(X1,by=list(Z,ps.strat.fit.i),FUN=mean)
t2<-aggregate(X2,by=list(Z,ps.strat.fit.i),FUN=mean)
t3<-aggregate(X3,by=list(Z,ps.strat.fit.i),FUN=mean)
dnames<-list(c("Z=0", "Z=1"),c("Q1", "Q2", "Q3","Q4","Q5"))
matrix(t1$x,nrow=2,ncol=nstrata,byrow=FALSE,dimnames = dnames )   #Summary for X1: mean
matrix(t2$x,nrow=2,ncol=nstrata,byrow=FALSE,dimnames = dnames )   #Summary for X2: mean
matrix(t3$x,nrow=2,ncol=nstrata,byrow=FALSE,dimnames = dnames )   #Summary for X3: mean
@
and now only $X_1$ appears balanced.  Therefore, the PS mechanism is producing balance amongst those variables that appear in the fitted PS model.  However, those variables that remain unbalanced may still have some confounding influence.  This analysis informs us that the functional form of the propensity score is correct for the included variables.

\pagebreak


\section{Matching}

Propensity score matching uses the propensity score to find matched (and therefore comparable) individuals in the $Z=0$ and $Z=1$ subsets.  These matched individuals can then be used to estimate the difference $\mu(1)-\mu(0)$.  A statistical analysis using the \texttt{Matching} package is straightforward.  In the first analysis, we match 1--1 for observations with $Z=0$ and $Z=1$.  There are \Sexpr{sum(Z)} cases with $Z=1$ in the simulated data, but the package re-uses the data to produce a sample of size $n$ of matched pairs.  In the function \texttt{Match}, the quantity \texttt{Tr} denotes the `treatment' $Z$, and $X$ is the variable (or variables) used for matching.

<<CA9a,comment='+', fig.width=7.5, fig.height=5.0, fig.align='center'>>=
library(Matching)
fit.match1  <- Match(Y=Y, Tr=Z, X=ps.true, M=1, estimand='ATE')
summary(fit.match1)
@
In this case the estimate is \Sexpr{fit.match1$est}, and the estimated standard error is \Sexpr{fit.match1$se}.


\medskip

It is possible also to carry out 1--$M$ matching for $M=5$ and $M=10$, say.  In this analysis, every case with $Z=1$ is matched to $M$ individuals with $Z=0$.
<<CA9b,comment='+', fig.width=7.25, fig.height=5.0, fig.align='center'>>=
fit.match2  <- Match(Y=Y, Tr=Z, X=ps.true, M=5, estimand='ATE')
summary(fit.match2)
fit.match3  <- Match(Y=Y, Tr=Z, X=ps.true, M=10, estimand='ATE')
summary(fit.match3)
@
There are many other options in the \texttt{Matching} package.


\pagebreak
\section{Propensity score regression}

In propensity score regression, a regression model is built using the propensity score as a predictor.  For example, we might fit
\[
\mu(z,x) = \E_{Y|X,Z}^\calO[Y|X=x,Z=z] = \beta_0 + \psi_0 z + \phi e(x)
\]
with the idea that conditioning on the propensity score is achieved by entering the term into the regression model.
<<CA10,comment='+', fig.width=7.25, fig.height=5.0, fig.align='center'>>=
fit.psr.1<-lm(Y~Z+ps.fit.c)
summary(fit.psr.1)
@
The estimator that follows from this procedure is a regression estimator as in equation \eqref{eq:regest}.  In a this model, we can read off the estimator of $\mu(1)-\mu(0)$ as $\widehat \psi_0$, which here is equal to \Sexpr{coef(fit.psr.1)[2]}.  As a regression model, it suffers the same problems as the regression model from Section \ref{sec:OR}, which relate to mis-specification: if $\mu(x,z)$ is mis-specified, incorrect inference will follow.

\medskip
The regression model may be extended to include other terms.  Suppose we specify
\[
\mu(x,z) = \beta_0 + z(\psi_0 + \psi_1 x_1 + \psi_2 x_2 + \psi_3 x_1 x_2) + \phi_0 e(x)
\]
<<CA11,comment='+', fig.width=7.25, fig.height=5.0, fig.align='center'>>=
fit.psr.2<-lm(Y~Z+Z:X1+Z:X2+Z:X1:X2+ps.fit.c)
round(coef(summary(fit.psr.2)),6)
mu0.psr<-mean(predict(fit.psr.2,newdata=data.frame(X1,X2,Z=0,ps.fit.c)))
mu1.psr<-mean(predict(fit.psr.2,newdata=data.frame(X1,X2,Z=1,ps.fit.c)))
c(mu0.psr,mu1.psr,mu1.psr-mu0.psr)
@
then an approximately correct conclusion is obtained for estimating $\mu(0)$ and $\mu(1)$ -- this is despite the fact that the $\psi$ parameters are not estimated correctly, that is, the estimates do not match the $\psi$ parameters in the data generating model.   Compare this with the incorrectly specified outcome regression
\[
\mu(x,z) = \beta_0 + z(\psi_0 + \psi_1 x_1 + \psi_2 x_2 + \psi_3 x_1 x_2) = m_0(\beta_0) + z m_1(x;\psi)
\]
say.  Then
<<CA12,comment='+', fig.width=7.25, fig.height=5.0, fig.align='center'>>=
fit.or2<-lm(Y~Z+Z:X1+Z:X2+Z:X1:X2)
round(coef(summary(fit.or2)),6)
mu0.or2<-mean(predict(fit.or2,newdata=data.frame(X1,X2,Z=0)))
mu1.or2<-mean(predict(fit.or2,newdata=data.frame(X1,X2,Z=1)))
c(mu0.or2,mu1.or2,mu1.or2-mu0.or2)
@
which does not yield the correct result.  We have in this model that the \textit{interaction} model
$m_1(x;\psi) \equiv \mu_1(x;\psi)$ is correctly specified in that it coincides with the true interaction model.  However we have that $m_0(\beta_0)$ does \textbf{not} match the true \textit{outcome nuisance model} $\mu_0(x;\beta) =  \beta_0 + \beta_1 x_1 + \beta_2 x_2 + \beta_3 x_1 x_2$, and as the parameter $\beta_0$ in $m_0(.)$ is estimated, we do not obtain correct inference.

\medskip
Suppose finally we fit the model that includes interactions between $e(x)$ and $x_1$ and $x_2$.
\[
\mu(x,z) = \beta_0 + z(\psi_0 + \psi_1 x_1 + \psi_2 x_2 + \psi_3 x_1 x_2) + e(x)(\phi_0+\phi_1 x_1 + \phi_2 x_2 + \phi_3 x_1 x_2)
\]
<<CA13,comment='+', fig.width=7.25, fig.height=5.0, fig.align='center'>>=
fit.psr.3<-lm(Y~Z+Z:X1+Z:X2+Z:X1:X2+ps.fit.c+ps.fit.c:X1+ps.fit.c:X2 + ps.fit.c:X1:X2)
round(coef(summary(fit.psr.3)),6)
mu0.psr3<-mean(predict(fit.psr.3,newdata=data.frame(X1,X2,Z=0,ps.fit.c)))
mu1.psr3<-mean(predict(fit.psr.3,newdata=data.frame(X1,X2,Z=1,ps.fit.c)))
c(mu0.psr3,mu1.psr3,mu1.psr3-mu0.psr3)
@
The resulting estimation of the causal quantities does not change appreciably.  However,
<<CA14,comment='+', fig.width=7.25, fig.height=5.0, fig.align='center'>>=
round(cbind(coef(summary(fit.psr.3))[c(2,4,5,8),],psi),6)
@
we observe that now the $\psi$ values are now also correctly estimated.
\pagebreak

\section{Inverse Weighting}

We now study the inverse weighting (or inverse probability weighting, IPW) approach that is based on an importance sampling (or change of measure) procedure.  We have that
\[
\mu(z) = \E_{Y|Z}^\calE[Y|Z=z] = \dfrac{\E_{X,Y,Z}^\calO \left[ \dfrac{\Ind_{\{z\}}(Z) Y}{f_{Z|X}^\calO(Z|X)} \right]}{\E_{X,Y,Z}^\calO \left[ \dfrac{\Ind_{\{z\}}(Z)}{f_{Z|X}^\calO(Z|X)} \right]}
\]
which becomes in the binary treatment case
\[
\mu(0) = \dfrac{\E_{X,Y,Z}^\calO \left[ \dfrac{(1-Z) Y}{1-e(X)} \right]}{\E_{X,Y,Z}^\calO \left[ \dfrac{(1-Z)}{1-e(X)} \right]} \qquad \qquad \mu(1) = \dfrac{\E_{X,Y,Z}^\calO \left[ \dfrac{Z Y}{e(X)} \right]}{\E_{X,Y,Z}^\calO \left[ \dfrac{Z}{e(X)} \right]}
\]

\subsection{IPW}

We deduce that suitable estimators following from the importance sampling results are
\[
\widehat \mu_{IPW} (0) = \dfrac{\dfrac{1}{n} \sum\limits_{i=1}^n \dfrac{(1-Z_i) Y_i}{1-e(X_i)} }{\dfrac{1}{n} \sum\limits_{i=1}^n \dfrac{(1-Z_i)}{1-e(X_i)}} \qquad \qquad \widehat \mu_{IPW}(1) = \dfrac{\dfrac{1}{n} \sum\limits_{i=1}^n \dfrac{Z_i Y_i}{e(X_i)}}{\dfrac{1}{n} \sum\limits_{i=1}^n \dfrac{Z_i}{e(X_i)}}
\]
which we may write
\[
\widehat \mu_{IPW} (0) = \dfrac{\sum\limits_{i=1}^n w_{0i} Y_i }{\sum\limits_{i=1}^n w_{0i}} = \sum\limits_{i=1}^n W_{0i} Y_i \qquad \qquad \widehat \mu_{IPW}(1) = \dfrac{\sum\limits_{i=1}^n w_{1i} Y_i }{\sum\limits_{i=1}^n w_{1i}} = \sum\limits_{i=1}^n W_{1i} Y_i
\]
with
\[
w_{0i} = \dfrac{(1-Z_i)}{1-e(X_i)}  \qquad \qquad w_{1i} = \dfrac{Z_i}{e(X_i)}  \qquad \qquad W_{zi} = \dfrac{w_{zi}}{\sum\limits_{j=1}^n w_{zj}} \quad z=0,1.
\]
Note that by construction,
\[
\E_{X,Y,Z}^\calO [\widehat \mu_{IPW}(z)] = \mu(z) \qquad z=0,1
\]
so the estimators are unbiased, and also that
\begin{align*}
\E_{X,Y,Z}^\calO [W_{zi}] = \E_{X,Y,Z}^\calO \left[\dfrac{\Ind_{\{z\}}(Z)}{f_{Z|X}^\calO(Z|X)}\right] & = \E_{X}^\calO \left[ \E_{Z|X}^\calO \left[\Ind_{\{z\}}(Z)\dfrac{1}{f_{Z|X}^\calO(Z|X)} \bigg | X \right] \right] \\[6pt]
& = \E_{X}^\calO \left[ \E_{Z|X}^\calO \left[f_{Z|X}^\calO(Z|X) \dfrac{1}{f_{Z|X}^\calO(Z|X)} \bigg | X \right] \right] = 1.
\end{align*}
This latter result suggests the alternative IPW estimators where we replace the denominator by its expectation, $n$, that is,
\[
\widetilde \mu_{IPW} (0) = \dfrac{1}{n} \sum\limits_{i=1}^n \dfrac{(1-Z_i) Y_i}{1-e(X_i)} \qquad \qquad \widetilde \mu_{IPW}(1) = \dfrac{1}{n} \sum\limits_{i=1}^n \dfrac{Z_i Y_i}{e(X_i)}.
\]

Using fitted PS, we can compute as follows: for $\widehat \mu(z)$, we have
<<CA14a,comment='+', fig.width=7.25, fig.height=5.0, fig.align='center'>>=
w0<-(1-Z)/(1-ps.fit.c)
W0<-w0/sum(w0)
mu.hat0<-sum(W0*Y)
w1<-Z/ps.fit.c
W1<-w1/sum(w1)
mu.hat1<-sum(W1*Y)
c(mu.hat0,mu.hat1,mu.hat1-mu.hat0)
@
and for $\widetilde \mu(z)$, we have
<<CA14b,comment='+', fig.width=7.25, fig.height=5.0, fig.align='center'>>=
mu.tilde0<-mean(w0*Y)
mu.tilde1<-mean(w1*Y)
c(mu.tilde0,mu.tilde1,mu.tilde1-mu.tilde0)
@

\textbf{Note:}  We may recover the $\widehat \mu(z)$ quantities using a weighted least squares (WLS) regression of $Y$ on $Z$
\[
\E_{Y|Z}^\calO [Y|Z=z] = \beta_0 + \psi_0 z
\]
with weights
\[
w_{0i} + w_{1i} = \dfrac{(1-Z_i)}{1-e(X_i)}  + \dfrac{Z_i}{e(X_i)} = R_i
\]
say.
<<CA14c,comment='+', fig.width=7.25, fig.height=5.0, fig.align='center'>>=
w<-w0+w1
fit.wls<-lm(Y~Z,weights=w)
round(coef(summary(fit.wls)),6)
@
The estimated coefficients are precisely $\widehat \mu(0)$ and $\widehat \mu(1) - \widehat \mu(1)$.  This result gives us further insight into how the inverse probability weighting succeeds in correcting the confounding; it creates a pseudo-sample from the distribution from the experimental measure $\calE$: that is
\begin{itemize}
  \item in the observed sample, collected under $\calO$, each data point carries a weight $1/n$.
  \item for the pseudo sample, we wish to have that each data point carries a weight $1/n$ under $\calE$, but in fact datum $i$ carries a weight that is dependent on $X_i$, that is
      \[
      f_{Z|X}^\calO (z|x_i) = \{e(x_i)\}^z \{1-e(x_i)\}^{1-z} \qquad z=0,1.
      \]
\end{itemize}
\subsection{Augmented IPW}

In augmented IPW (AIPW) estimation, we utilize the fact that if we denote
\[
\mu(x,z) = \E_{Y|X,Z}^\calE [Y|X=z,Z=z]  = \E_{Y|X,Z}^\calO [Y|X=z,Z=z]
\]
we may write
\begin{align*}
\E_{X,Y,Z}^\calO \left[ \dfrac{\Ind_{\{z\}}(Z) Y}{f_{Z|X}^\calO(Z|X)} \right] & = \E_{X,Y,Z}^\calO \left[ \dfrac{\Ind_{\{z\}}(Z) (Y-\mu(X,Z)+\mu(X,Z))}{f_{Z|X}^\calO(Z|X)} \right] \\[6pt]
& = \E_{X,Y,Z}^\calO \left[ \dfrac{\Ind_{\{z\}}(Z) (Y-\mu(X,Z))}{f_{Z|X}^\calO(Z|X)} \right] + \E_{X,Y,Z}^\calO \left[ \dfrac{\Ind_{\{z\}}(Z) \mu(X,Z)}{f_{Z|X}^\calO(Z|X)} \right]\\[6pt]
& = \E_{X,Y,Z}^\calO \left[ \dfrac{\Ind_{\{z\}}(Z) (Y-\mu(X,Z))}{f_{Z|X}^\calO(Z|X)} \right] + \E_{X}^\calO \left[ \mu(X,z) \right]\\[6pt]
\end{align*}
with the last simplification following by iterated expectation.  This result suggests the alternative estimator
\[
\widehat \mu_{AIPW}(z) = \sum_{i=1}^n W_{zi} (Y_i - \mu(X_i,z)) + \frac{1}{n} \sum_{i=1}^n \mu(X_i,z) \qquad z=0,1.
\]
By the above construction, $\widehat \mu_{AIPW}(z)$ is also an unbiased estimator of $\mu(z)$.  We may also utilize the estimator
\[
\widetilde \mu_{AIPW}(z) = \frac{1}{n} \sum_{i=1}^n w_{zi} (Y_i - \mu(X_i,z)) + \frac{1}{n} \sum_{i=1}^n \mu(X_i,z) \qquad z=0,1
\]
which is also unbiased for $\mu(z)$.

\medskip
These augmented (AIPW) estimators are preferred to the original (IPW) estimators as they can have lower variance.  To see this, note that we have that
\[
\widetilde \mu_{AIPW}(z) = \frac{1}{n} \sum_{i=1}^n (w_{zi} Y_i + (1-w_{zi}) \mu(X_i,z)).
\]
From this form, we see that $\widetilde \mu_{AIPW}(z)$ is an unbiased estimator, as by iterated expectation
\[
\E[w_{zi} Y_i + (1-w_{zi}) \mu(X_i,z)] = \E[w_{zi} \mu(X_i,z) + (1-w_{zi}) \mu(X_i,z)] = \E[\mu(X_i,z)] = \mu(z).
\]
The variance of the estimator is the variance of the individual terms in the summation divided by $n$ as the terms are independent.  In the following calculation, all expectations and variances are taken with respect to the joint observational distribution $\calO$.  We have from first principles that
\[
  \Var[W_{z} Y + (1-W_{z}) \mu(X,z)] = \Var[W_z Y] + \Var[(1-W_z) \mu(X,z)] + 2 \Cov[W_z Y, (1-W_z) \mu(X,z)].
\]
Now
\[
\Var[(1-W_z) \mu(X,z)] = \E[(1-W_z)^2 \{\mu(X,z)\}^2] - \{\E[(1-W) \mu(X,z)]\}^2 = \E[(1-W)^2 \{\mu(X,z)\}^2]
\]
as $\E[W_z] = 1$ by the earlier result, so $\E[(1-W) \mu(X,z)] = 0$ by iterated expectation.  Secondly
\[
\Cov[W_z Y, (1-W_z) \mu(X,z)] = \E[W_z (1-W_z) Y \mu(X,z)] - \E[W_z Y] \E[(1-W_z) \mu(X,z)] = \E[W_z (1-W_z) \{\mu(X,z)\}^2]
\]
again by iterated expectation, again as $\E[(1-W) \mu(X,z)] = 0$.  Therefore
\begin{align*}
  \Var[W_{z} Y + (1-W_{z}) \mu(X,z)] & = \Var[W_z Y] + \E[(1-W_z)^2 \{\mu(X,z)\}^2] + 2 \E[W_z (1-W_z) \{\mu(X,z)\}^2] \\[6pt]
  & = \Var[W_z Y] + \E[(1-W_z^2) \{\mu(X,z)\}^2 ]\\[6pt]
  & < \Var[W_z Y]
\end{align*}
as, in the second term, on computing by iterated expectation, we have that
\[
\E_{Z|X}^\calO[(1-W_z^2)] = \E_{Z|X}^\calO \left[ 1 - \left(\frac{\Ind_{\{z\}}(Z)}{f_{Z|X}^\calO(Z|X)} \right)^2 \right] = \E_{Z|X}^\calO \left[ 1 - \frac{\Ind_{\{z\}}(Z)}{\left(f_{Z|X}^\calO(Z|X)\right)^2} \right] = 1 - \frac{1}{f_{Z|X}^\calO(z|X)}
\]
that is,
\[
\E_{Z|X}^\calO[(1-W_z^2)|X] = 1 - \frac{1}{\{e(X)\}^z \{1-e(X)\}^{1-z} } < 0
\]
almost surely (that is, this quantity is a random variable defined in terms of $X$ that is negative with probability 1), and so
\[
\E[(1-W_z^2) \{\mu(X,z)\}^2 ] < 0.
\]
This result follows also for the estimator $\widetilde \mu(z)$.

\pagebreak

For the calculation with known, correctly specified $\mu(x,z)$ we have
<<CA15a,comment='+', fig.width=7.25, fig.height=5.0, fig.align='center'>>=
mu.x0<- Xb %*% be
mu.x1<- Xb %*% be + Xp %*% psi
mu.hat.a0<-sum(W0*(Y-mu.x0))+mean(mu.x0)
mu.hat.a1<-sum(W1*(Y-mu.x1))+mean(mu.x1)
c(mu.hat.a0,mu.hat.a1,mu.hat.a1-mu.hat.a0)
mu.tilde.a0<-mean(w0*(Y-mu.x0))+mean(mu.x0)
mu.tilde.a1<-mean(w1*(Y-mu.x1))+mean(mu.x1)
c(mu.tilde.a0,mu.tilde.a1,mu.tilde.a1-mu.tilde.a0)
@
In practice we need to estimate the parameters in the specification $\mu(x,z;\beta,\psi)$: under correct specification
<<CA15b,comment='+', fig.width=7.25, fig.height=5.0, fig.align='center'>>=
fit0<-lm(Y~X1+X2+X1:X2+Z+Z:X1+Z:X2+Z:X1:X2)
mu.x0<-predict(fit0,newdata=data.frame(X1,X2,Z=0))
mu.x1<-predict(fit0,newdata=data.frame(X1,X2,Z=1))
mu.hat.a0<-sum(W0*(Y-mu.x0))+mean(mu.x0)
mu.hat.a1<-sum(W1*(Y-mu.x1))+mean(mu.x1)
c(mu.hat.a0,mu.hat.a1,mu.hat.a1-mu.hat.a0)
mu.tilde.a0<-mean(w0*(Y-mu.x0))+mean(mu.x0)
mu.tilde.a1<-mean(w1*(Y-mu.x1))+mean(mu.x1)
c(mu.tilde.a0,mu.tilde.a1,mu.tilde.a1-mu.tilde.a0)
@
This realization highlights the fact that under an assumption of correct specification, we could merely perform outcome regression, which is known to optimal, so that in fact the IPW version is redundant.  Specifically, under correct specification of $\mu(X,z;\beta,\psi)$, we have by iterated expectation that
\[
\E[W_{z} (Y-\mu(X,z;\beta,\psi))] = \E[W_{z} (Y-\mu(x,z;\widehat \beta, \widehat \psi))] = 0.
\]
However, suppose that we mis-specify the conditional mean model as $m(x,z;\beta,\psi)$, say.   Then provided the propensity score model is correctly specified, we still have that
\[
\Var[W_{z} Y + (1-W_{z}) m(X,z;\widehat \beta,\widehat \psi )] < \Var[W_{z} Y]
\]
but also that
\begin{align*}
\E[W_{z} Y + (1-W_{z}) \mu(X,z;\widehat \beta, \widehat \psi)] & =  \E[W_{z} Y + (1-W_{z}) (\mu(X,z;\widehat \beta, \widehat \psi) - \mu(X,z)) + (1-W_{z}) \mu(X,z) ] \\[6pt]
& =  \E[W_{z} \mu(X,z) + (1-W_{z}) (\mu(X,z;\widehat \beta, \widehat \psi) - \mu(X,z)) + (1-W_{z}) \mu(X,z) ] \\[6pt]
& = \E[\mu(X,z)] = \mu(z)
\end{align*}
by iterated expectation, as  $\E_{Z|X}^\calO[(1-W_{z})|X] = 0$.  Therefore, we obtain an unbiased estimator even if the conditional mean model is mis-specified.  For example, if we specify
\[
m(x,z) = \beta_0 + \beta_1 x_1 + \beta_2 x_2 + \psi_0 z
\]
then the result holds.
<<CA15c,comment='+', fig.width=7.25, fig.height=5.0, fig.align='center'>>=
fit0<-lm(Y~X1+X2+Z)
mu.x0<-predict(fit0,newdata=data.frame(X1,X2,Z=0))
mu.x1<-predict(fit0,newdata=data.frame(X1,X2,Z=1))
mu.hat.a0<-sum(W0*(Y-mu.x0))+mean(mu.x0)
mu.hat.a1<-sum(W1*(Y-mu.x1))+mean(mu.x1)
c(mu.hat.a0,mu.hat.a1,mu.hat.a1-mu.hat.a0)
mu.tilde.a0<-mean(w0*(Y-mu.x0))+mean(mu.x0)
mu.tilde.a1<-mean(w1*(Y-mu.x1))+mean(mu.x1)
c(mu.tilde.a0,mu.tilde.a1,mu.tilde.a1-mu.tilde.a0)
@
Conversely, if the propensity score model is mis-specified, but the conditional mean model is correctly specified, we again obtain an unbiased estimator:  suppose that the propensity score model proposed is $f_{Z|X}^\dag(Z|X)$, and denote
\[
w_{zi}^\dag = \frac{\Ind_{\{z\}}(Z_i)}{f_{Z|X}^\dag(Z_i|X_i)}.
\]
Then
\[
\E[w_{zi}^\dag] = \E_{Z|X}^\calO \left[\frac{\Ind_{\{z\}}(Z_i)}{f_{Z|X}^\dag(Z_i|X_i)} \bigg | X_i \right] = \frac{f_{Z|X}^\calO(z|X_i)}{f_{Z|X}^\dag(z|X_i)} = r^\dag (X_i,z)
\]
say, and we still have an unbiased estimator as
\[
\E[(w_{zi}^\dag Y_i + (1-w_{zi}^\dag) \mu(X_i,z)] = \E[(w_{zi}^\dag \mu(X_i,z) + (1-w_{zi}^\dag) \mu(X_i,z)] = \mu(z).
\]
Recall that the IPW estimator is not unbiased if the PS model is mis-specified, as we would have
\[
\E_{X,Y,Z}^\calO[\widetilde \mu(z)] = \E_{X,Y,Z}^\calO \left[\frac{\Ind_{\{z\}}(Z_i)Y_i}{f_{Z|X}^\dag(Z_i|X_i)} \right] = \E_{X}^\calO \left[r^\dag (X_i,z)\mu(X_i,z) \right] \neq \mu(z).
\]
Thus the AIPW estimator is unbiased even if precisely one of the propensity or conditional mean models is mis-specified.  This property is termed \textit{double robustness}.

\bigskip
\textbf{Note:} The double robustness to mis-specification of the AIPW estimator might imply that it is superior to the singly robust outcome regression or IPW estimators.  This is not necessarily true in practice, as a badly mis-specified model for the PS or the conditional mean model may yield an AIPW estimator that is not appreciably better than either singly robust estimator.

\bigskip

If both PS and outcome regression model are mis-specified, then a biased estimator results

<<CA15d,comment='+', fig.width=7.25, fig.height=5.0, fig.align='center'>>=
fit0<-lm(Y~X1+Z)
w0.i<-(1-Z)/(1-ps.fit.i)
W0.i<-w0.i/sum(w0.i)
w1.i<-Z/ps.fit.i
W1.i<-w1.i/sum(w1.i)
mu.hat.i0<-sum(W0.i*Y)
mu.hat.i1<-sum(W1.i*Y)
c(mu.hat.i0,mu.hat.i1,mu.hat.i1-mu.hat.i0)
mu.x0<-predict(fit0,newdata=data.frame(X1,X2,Z=0))
mu.x1<-predict(fit0,newdata=data.frame(X1,X2,Z=1))
mu.hat.a0<-sum(W0.i*(Y-mu.x0))+mean(mu.x0)
mu.hat.a1<-sum(W1.i*(Y-mu.x1))+mean(mu.x1)
c(mu.hat.a0,mu.hat.a1,mu.hat.a1-mu.hat.a0)
mu.tilde.a0<-mean(w0.i*(Y-mu.x0))+mean(mu.x0)
mu.tilde.a1<-mean(w1.i*(Y-mu.x1))+mean(mu.x1)
c(mu.tilde.a0,mu.tilde.a1,mu.tilde.a1-mu.tilde.a0)
@




\section{G-estimation}

\subsection{Construction}

G--estimation can be considered a modified form of outcome regression estimation that introduces a robustness to mis-specification via PS adjustment.  Consider first the data generating model
\[
\E_{Y|X,Z}^\calO[Y | X=x,Z=z] = z \psi_0.
\]
Then the OLS estimating equation for the $\psi_0$ is
\[
\sum_{i=1}^n z_i (y_i - z_i \psi_0) = 0.
\]
Then we have that the resulting estimate for $\psi_0$ is
\[
\widehat \psi_0 = \dfrac{\sum\limits_{i=1}^n z_i y_i }{\sum\limits_{i=1}^n z_i^2 }.
\]
Suppose now that the estimating equation is modified to
\[
\sum_{i=1}^n (z_i-e(x_i)) (y_i - z_i \psi_0) = 0.
\]
Then we have that the new estimate for $\psi_0$ is
\[
\widehat \psi_0 = \dfrac{\sum\limits_{i=1}^n (z_i-e(x_i)) y_i }{\sum\limits_{i=1}^n z_i (z_i - e(x_i)) }
\]
Consider now the more general linear model
\[
\E_{Y|X,Z}^\calO[Y | X=x,Z=z] = \bx_1 \beta + z \bx_2 \psi = \mu_0(x; \beta) + z \mu_1(x; \psi)
\]
and the OLS estimating equation
\[
\sum_{i=1}^n \begin{pmatrix} \bx_{1 i}^\top \\[3pt] z \bx_{2 i}^\top \end{pmatrix} (y_i - \bx_{1 i} \beta - z \bx_{2 i} \psi ) = \begin{pmatrix} \mathbf{0} \\[3pt] \mathbf{0} \end{pmatrix}.
\]
In modified form, this becomes
\[
\sum_{i=1}^n \begin{pmatrix} \bx_{1 i}^\top \\[3pt] (z-e(x_i)) \bx_{2 i}^\top \end{pmatrix} (y_i - \bx_{1 i} \beta - z \bx_{2 i} \psi ) = \begin{pmatrix} \mathbf{0} \\[3pt] \mathbf{0} \end{pmatrix}.
\]
These equations are referred to as the \textit{G--estimating equations}. It can be shown that the estimates for $\psi$ obtained by solving these equations are realizations of unbiased estimators of the true values even if the model
\[
\mu_0(x;\beta) = \bx_{1} \beta
\]
is mis-specified, \textbf{provided that the model}
\[
\mu_1(x;\psi) = \bx_{2} \psi
\]
\textbf{is correctly specified}, \textbf{and} provided that the PS model is correctly specified.  Thus this procedure is a \textit{doubly robust} procedure if $\mu_1(x;\psi)$ is correctly specified.

\bigskip

To solve the estimating equations, we consider estimating the model
\[
\E_{Y|X,Z}^\calO[Y | X=x,Z=z] = \mu_0(x; \beta) + z \mu_1(x; \psi) + e(x) \mu_1(x;\phi)
\]
via OLS.  For example, suppose we have the linear forms
\begin{align*}
\mu_0(x; \beta) & = \beta_0\\[6pt]
\mu_1(x; \psi) & = \psi_0 + \psi_1 x_1 + \psi_2 x_2 + \psi_3 x_1 x_2 \\[6pt]
\mu_1(x; \phi) & = \phi_0 + \phi_1 x_1 + \phi_2 x_2 + \phi_3 x_1 x_2
\end{align*}
Then the OLS estimating equations take the form
\[
\sum_{i=1}^n \begin{pmatrix} \bx_{1 i}^\top \\[3pt] z_i \bx_{2 i}^\top \\ e(x_i) \bx_{2 i}^\top \end{pmatrix} (y_i - \beta_0 - z \bx_{2 i} \psi - e(x) \bx_{2 i} \phi ) = \begin{pmatrix} \mathbf{0} \\[3pt] \mathbf{0} \\[3pt] \mathbf{0}\end{pmatrix}
\]
and subtracting the third block from the second block yields
\[
\sum_{i=1}^n \begin{pmatrix} \bx_{1 i}^\top \\[3pt] (z_i -e(x_i)) \bx_{2 i}^\top \\ e(x_i) \bx_{2 i}^\top \end{pmatrix} (y_i - \beta_0 - z \bx_{2 i} \psi - e(x) \bx_{2 i} \phi ) = \begin{pmatrix} \mathbf{0} \\[3pt] \mathbf{0} \\[3pt] \mathbf{0}\end{pmatrix}.
\]
which recovers the G--estimating equations.  Hence, \textbf{G--estimation is a form of propensity score regression}.

<<CA16,comment='+', fig.width=7.25, fig.height=5.0, fig.align='center'>>=
fit.gee<-lm(Y~Z+Z:X1+Z:X2+Z:X1:X2+ps.fit.c+ps.fit.c:X1+ps.fit.c:X2+ps.fit.c:X1:X2)
round(coef(summary(fit.gee)),6)
mu0.gee<-mean(predict(fit.gee,newdata=data.frame(X1,X2,Z=0,ps.fit.c)))
mu1.gee<-mean(predict(fit.gee,newdata=data.frame(X1,X2,Z=1,ps.fit.c)))
c(mu0.gee,mu1.gee,mu1.gee-mu0.gee)
@

\subsection{The \texttt{drgee} package}
The doubly robust version of G--estimation is carried out by the \texttt{drgee} package in \texttt{R}.  This package considers the model specification
\[
\E_{Y|X,Z}^\calO[Y | X=x,Z=z] = \mu_0(x;\beta) + z \mu_1(x;\psi)
\]
for the outcome regression component, and
\[
\E_{Z|X}^\calO[Z | X=x] = g( x; \alpha)
\]
for the PS component, where $g(.)$ is an appropriate link function.  In the \texttt{drgee} function

\begin{itemize}
  \item $\mu_0(x;\beta)$ is specified via the \texttt{oformula} and \texttt{olink} arguments;
  \item $\mu_1(x;\beta)$ is specified via the \texttt{iaformula} and \texttt{olink} arguments;
  \item $g(x;\alpha)$ is specified via the \texttt{eformula} and \texttt{elink} arguments.
\end{itemize}

The argument \texttt{estimation.method} allows for estimation via three different methods:

\begin{itemize}
  \item \texttt{estimation.method = `dr'} performs doubly robust G--estimation;
  \item \texttt{estimation.method = `e'} performs singly robust G--estimation, that is, it uses the model with
  \[
  \mu_0(x) \equiv 0;
  \]
  \item \texttt{estimation.method = `o'} performs outcome regression using the model
  \[
  \mu_0(x;\beta) + z \mu_1(x;\psi).
  \]
\end{itemize}

<<CA17,comment='+', fig.width=7.25, fig.height=5.0, fig.align='center'>>=
mydata<-data.frame(X1,X2,Z,Y)
library(drgee)

#Correct specification
dr.est <- drgee(oformula = formula(Y~X1*X2), eformula = formula(Z~X1*X2),
iaformula = formula(~X1*X2), olink = "identity", elink = "logit",
data = mydata, estimation.method = "dr")
round(coef(summary(dr.est)),6)

#Mis-specification of PS model, correct specification of the OR nuisance model
dr.est <- drgee(oformula = formula(Y~X1*X2), eformula = formula(Z~X1),
iaformula = formula(~X1*X2), olink = "identity", elink = "logit",
data = mydata, estimation.method = "dr")
round(coef(summary(dr.est)),6)

#Correct specification of PS model, mis-specification of the OR nuisance model
dr.est <- drgee(oformula = formula(Y~X1), eformula = formula(Z~X1*X2),
iaformula = formula(~X1*X2), olink = "identity", elink = "logit",
data = mydata, estimation.method = "dr")
round(coef(summary(dr.est)),6)
@

To match the estimation in the earlier section, we note that \texttt{drgee} utilizes an alternative formulation and an additional estimating equation.  Using bold symbols to denote vectors and matrices, in the linear case \texttt{drgee} proceeds by solving the estimating equations
\[
\begin{pmatrix}
  \bX_\beta^\top  (\by - \bX_\beta \beta - \bz \odot \bX_\psi \psi^\prime)\\[6pt]
  \bX_\psi^\top \bz \odot (\by - \bX_\beta \beta - \bz \odot \bX_\psi \psi^\prime)\\[6pt]
  \bX_\psi^\top (\bz-\be)  \odot  (\by - \bX_\beta \beta - \bz \odot \bX_\psi \psi)
\end{pmatrix}
=\begin{pmatrix}
  0 \\[6pt]
  0 \\[6pt]
  0
\end{pmatrix}
\]
where $\psi$ is the parameter of interest, and where the symbol $\odot$ denotes \textit{elementwise} multiplication, that is for two vectors $\bx_1, \bx_2 \in \R^k$,
\[
\bx_1 \odot \bx_2  = \left[ x_{1j} x_{2j} \right]_{j=1}^k \in \R^k.
\]
\texttt{{drgee}} solves the first two equations to get $\widehat \beta$ utilizing the additional nuisance parameter $\psi^\prime$, then solves
\[
\bX_\psi^\top (\bz-\be) \odot (\by - \bX_\beta \widehat \beta - \bz \odot \bX_\psi \psi) = 0
\]
that is, yielding the solution
\[
\widehat \psi = (\bX_\psi^\top \bD  \bX_\psi)^{-1} \bX_\psi^\top (\bz-\be) \odot (\by - \bX_\beta \widehat \beta)
\]
where $\bD$ is the $n \times n$ diagonal matrix
\[
\bD = \text{diag} \left\{ (\bz-\be) \odot \bz \right\}.
\]
In contrast, the ordinary G-estimation method uses the reduced system
\[
\begin{pmatrix}
  \bX_\beta^\top  (\by - \bX_\beta \beta - \bz \odot \bX_\psi \psi)\\[6pt]
  \bX_\psi^\top (\bz-\be) \odot (\by - \bX_\beta \beta - \bz \odot \bX_\psi \psi)
\end{pmatrix}
=\begin{pmatrix}
  0 \\[6pt]
  0
\end{pmatrix}
\]
which does not have the extra nuisance parameters $\psi^\prime$.  The estimator $\widehat \psi$ takes precisely the same form except that the plug-in form of $\widehat \beta$ is different.  Under correct specification of the blip (always assumed in our analysis), the two methods are essentially equivalent.

<<CA18,comment='+', fig.width=7.25, fig.height=5.0, fig.align='center'>>=
dr.est <- drgee(oformula = formula(Y~1), eformula = formula(Z~X1*X2),
iaformula = formula(~X1*X2), olink = "identity", elink = "logit",
data = mydata, estimation.method = "dr")
pars<-dr.est$coefficients.all
psi.hat<-pars[1:4]                         #psi parameters
alpha.hat<-pars[5:8]                       #ps parameters
psi.prime.hat<-pars[9:12]                  #psi additional parameters
beta.hat<-pars[13]                         #beta.parameters
psi.hat
@

<<CA19,comment='+', fig.width=7.25, fig.height=5.0, fig.align='center'>>=
#Computing the ATE
Xm<-cbind(1,X1,X2,X1*X2)
ps.fit.drgee<-1/(1+exp(-Xm %*% alpha.hat))
mu0.drgee<-mean(beta.hat + (0-ps.fit.drgee)*Xm%*%psi.hat+ps.fit.drgee*Xm%*%psi.prime.hat)
mu1.drgee<-mean(beta.hat + (1-ps.fit.drgee)*Xm%*%psi.hat+ps.fit.drgee*Xm%*%psi.prime.hat)
c(mu0.drgee,mu1.drgee,mu1.drgee-mu0.drgee)
@


<<CA20,comment='+', fig.width=7.25, fig.height=5.0, fig.align='center'>>=
#Now to check the calculation using the theory above
psi.prime.hat
beta.hat
coef(fit0<-lm(Y~Z+Z:X1+Z:X2+Z:X1:X2))
alpha.hat
coef(fit1<-glm(Z~X1*X2,family=binomial))
eX<-fitted(fit1)
Y1<-(Z-eX)*(Y-beta.hat)
Xp<-cbind(1,X1,X2,X1*X2)
D<-diag(Z*(Z-eX))
psi.ests<-solve((t(Xp) %*% D) %*% Xp) %*% t(Xp) %*% Y1
cbind(psi.hat,psi.ests)  #Results match
@

\end{document}