\documentclass[notitlepage]{article}
\usepackage{Math}
%\usepackage{../../Math598}
\usepackage{listings}
\usepackage{numprint}
\usepackage{enumerate}
\usepackage{bm}
\usepackage{bbm}
\usepackage{verbatim}
\usepackage{mathtools}
\usepackage{dsfont}
\usepackage{scalerel}

\usepackage{amsfonts,amsmath,amssymb}

\usepackage{tikz}
\usetikzlibrary{shapes,decorations,arrows,calc,arrows.meta,fit,positioning}


\usepackage{pgfplots}
\pgfplotsset{compat=1.17}

% \circled{<text>}: typeset <text> inline inside a drawn circular TikZ node.
% The picture's baseline is set to the baseline of the node text so that the
% circled item lines up with the surrounding text.
\newcommand*\circled[1]{\tikz[baseline=(char.base)]{
            \node[shape=circle,draw,inner sep=2pt] (char) {#1};}}

\usepackage{xcolor}

% Numbered theorem-like environments; each one has its own counter.
\newtheorem{alg}{Algorithm}
\newtheorem{ex}{Example}
\newtheorem{note}{Note}
% proof environment: starts a new unindented paragraph headed by a bold
% "Proof", and ends with a small black square pushed to the right margin
% followed by a line break with 2mm of extra vertical space.
\newenvironment{proof}{\par\noindent{\normalfont{\bf{Proof
}}}}
{\hfill \small$\blacksquare$\\[2mm]}


\setlength{\parindent}{0in}

% Probability operators: \E is set with \mathbbmss; the variance, covariance
% and correlation operators are set as upright text.
\def\E{\mathbbmss{E}}
\def\Var{\text{Var}}
\def\Cov{\text{Cov}}
\def\Corr{\text{Corr}}

% Bold upright matrix symbols.
\def\bW{\mathbf{W}}
\def\bH{\mathbf{H}}

% \bs{<symbol>}: bold version of a math symbol via \bm.
\newcommand{\bs}[1]{\bm{#1}}
% Upright capital P, the word " if " for use inside math, and a long
% double right arrow.
\newcommand{\Rho}{\mathrm{P}}
\newcommand{\cif}{\text{ if }}
\newcommand{\Ra}{\Longrightarrow}

% Bold lower-case phi.
\def\bphi{\boldsymbol \phi}


% Preload the R language definition for the listings package.
\lstloadlanguages{R}

% Colours used by the Rsettings listing style below.
\definecolor{keywordcolor}{rgb}{0,0.6,0.6}
\definecolor{delimcolor}{rgb}{0.461,0.039,0.102}
\definecolor{Rcommentcolor}{rgb}{0.101,0.043,0.432}

% Listing style "Rsettings": typewriter font, automatic line breaking, no
% visible-space marks in strings, a short explicit keyword list, italic
% coloured comments, coloured keywords, and colouring of text delimited by
% double quotes.
\lstdefinestyle{Rsettings}{
  basicstyle=\ttfamily,
  breaklines=true,
  showstringspaces=false,
  keywords={if, else, function, theFunction, tmp}, % Write as many keywords
  otherkeywords={},
  commentstyle=\itshape\color{Rcommentcolor},
  keywordstyle=\color{keywordcolor},
  moredelim=[s][\color{delimcolor}]{"}{"},
}

% Global listings defaults: typewriter font, no line numbers, and print a
% literal ~ as the math symbol \sim.
\lstset{basicstyle=\ttfamily, numbers=none, literate={~} {$\sim$}{2}}

% Notation shorthands: bold X with subscript beta / psi, a bold epsilon,
% and wide-hatted beta and psi.
\def\Xb{\mathbf{X}_{\beta}}
\def\Xp{\mathbf{X}_{\psi}}
\def\beps{\boldsymbol{\epsilon}}
\def\betahat{\widehat \beta}
\def\psihat{\widehat \psi}

\begin{document}

<<setup, cache=FALSE, echo=FALSE>>=
# Attach knitr so that opts_chunk and knit_hooks are available below.
library(knitr)
# global chunk options
opts_chunk$set(cache=TRUE, autodep=TRUE)
# Penalise scientific notation so that printed numbers favour fixed notation.
options(scipen=6)
# Set the CRAN repository URL used by this R session.
options(repos=c(CRAN="https://cloud.r-project.org/"))
# Document hook: replace the literal text '\usepackage[]{color}' with
# '\usepackage{xcolor}' in what knitr passes to the hook (fixed-string match,
# first match in each element). Presumably this avoids a clash with the
# xcolor package loaded in the preamble -- confirm.
knit_hooks$set(document = function(x) {sub('\\usepackage[]{color}', '\\usepackage{xcolor}', x, fixed = TRUE)})
@
\begin{center}
{\textsclarge{The Average Treatment Effect on the Treated}}
\end{center}

\tikzset{
    -Latex,auto,node distance =1 cm and 1 cm,semithick,
    state/.style ={circle, draw, minimum width = 0.7 cm},
    box/.style ={rectangle, draw, minimum width = 0.7 cm, fill=lightgray},
    point/.style = {circle, draw, inner sep=0.08cm,fill,node contents={}},
    bidirected/.style={Latex-Latex,dashed},
    el/.style = {inner sep=3pt, align=left, sloped}

}

The \emph{average treatment effect on the treated} (ATT) is
\[
\E[Y(\rtmt) - Y(\rnotmt) \given Z=1]
\]
that is, the ATT aims to identify the causal effect of intervening to change $Z=\rnotmt$ to $Z=\rtmt$ but \emph{only} in the subpopulation of individuals who are \emph{observed} to receive treatment.  Note that
\begin{align*}
\E[Y(\rtmt) - Y(\rnotmt)] & = \E[Y(\rtmt) - Y(\rnotmt) \given Z=0] \Pr[Z=0]  +  \E[Y(\rtmt) - Y(\rnotmt) \given Z=1] \Pr[Z=1]
\end{align*}

In this calculation, we imagine

\begin{itemize}
  \item the observational distribution $f_{X,Y,Z}^\calO$ generating the observed data $\{(x_i,y_i,z_i), i=1,\ldots,n\}$

  \item in the subgroup observed to have $Z=1$, we then consider a second (hypothetical) experimental intervention to change $Z$ to $\rz$ which over-rides the original $Z$ if $\rz = \rnotmt$,

  \item we then consider comparison of outcomes between the two hypothetical subgroups indexed by $\rz$.

\end{itemize}



\tikzset{
    -Latex,auto,node distance =1 cm and 1 cm,semithick,
    state/.style ={circle, draw, minimum width = 0.7 cm},
    box/.style ={rectangle, draw, minimum width = 0.7 cm, fill=lightgray},
    cbox/.style ={rounded rectangle, draw, minimum width = 0.7 cm},
    point/.style = {circle, draw, inner sep=0.08cm,fill,node contents={}},
    bidirected/.style={Latex-Latex,dashed},
    el/.style = {inner sep=3pt, align=left, sloped}

}

  \begin{figure}[ht]
\centering
\begin{tikzpicture}[scale=1.5]
    % x node set with absolute coordinates
    \node[state] (x) at (-1,0) {${X}$};
    \node[cbox] (yz) at (1,1) {${\{Y(\rnotmt), Y(\rtmt)\}}$};
    \node[state] (z) at (1,-1) {${Z}$};
    \node[state] (y) at (3,0) {${Y}$};

    % Directed edge
    \path (x) edge (z);
    \path (x) edge (yz);
    \path (yz) edge (y);
    \path (z) edge (y);

    \node at (1,-2) {$f_X(x) \: f_{Z|X}(z|x) \: f_{Y(\rnotmt),Y(\rtmt)|X}(y_0,y_1|x) \: f_{Y|Z,Y(\rnotmt),Y(\rtmt)}(y|z,y_0,y_1)$};

 \end{tikzpicture}
\end{figure}

For the ATT, we can represent the quantity of interest using a modified DAG that proposes a second hypothetical binary treatment, $A$.  We allow $Z$ to cause $A$, and then allow $Z$ to act as a selection mechanism, but ensure that we have the conditional independence of $X$ and $A$ given $Z$, $X \independent A \given Z$,

\begin{figure}[ht]
\centering
\begin{tikzpicture}[scale=1.5]
    % x node set with absolute coordinates
    \node[state] (x) at (-1,0) {${X}$};
    \node[cbox] (yz) at (1,1) {${\{Y(\rnotmt), Y(\rtmt)\}}$};
    \node[state] (z) at (0.25,-1) {${Z}$};
    \node[state] (a) at (1.75,-1) {${A}$};
    \node[state] (y) at (3,0) {${Y}$};

    % Directed edge
    \path (x) edge (z);
    \path (x) edge (yz);
    \path (yz) edge (y);
    \path (z) edge (a);
    \path (a) edge (y);

 \end{tikzpicture}
\end{figure}
\[
f_X(x) \: f_{Z|X}(z|x) \: f_{Y(\rnotmt),Y(\rtmt)|X}(y_0,y_1|x) \: f_{A|Z}(a|z) \: f_{Y|A,Y(\rnotmt),Y(\rtmt)}(y|a,y_0,y_1)
\]
This is the new `experimental' distribution $\mathcal{E}$.
\[
f_{Y|A,Y(\rnotmt),Y(\rtmt)}^\calE(y|a,y_0,y_1) = \left\{
\begin{array}{cc}
  1 & y=(1-a)y_0 + a y_1 \\[6pt]
  0 & \text{otherwise}
\end{array}
\right. .
\]
As $A$ is binary, the model $f_{A|Z}(a|z)$ must take the form
\[
f_{A|Z}^\calE(a|z) = p_z^a (1-p_z)^{1-a} \qquad a,z \in \{0,1\}
\]
for $0 \leq p_z \leq 1$ for $z = 0,1$. We can then express the ATT via the new DAG as
\[
\E_{Y|A,Z}^\calE [Y|A=\rtmt,Z=1] - \E_{Y|A,Z}^\calE [Y|A=\rnotmt,Z=1].
\]
We have from the DAG that
\[
f_{Y |A,X,Z}^\calE (y |a,x,z) \equiv f_{Y |A,X}^\calE (y |a,x)
\]
and hence as before
\begin{align*}
f_{Y |A,X}^\calE (y |a,x) & = \displaystyle \int f_{Y|A,Y(\rnotmt),Y(\rtmt)}^\calE (y|a,y_0,y_1) f_{Y(\rnotmt),Y(\rtmt)|A,X}^\calE (y_0,y_1|a,x) \ d y_0 \ d y_1 = f_{Y(a)|X}^\calE (y|x).
\end{align*}
Also from the DAG, $X \independent A \given Z$, so for all $a,x,z$
\[
f_{X|A,Z}^\calE (x|a,z) \equiv  f_{X|Z}^\calE (x|z).
\]



For $\ra=\rnotmt,\rtmt$, we therefore have
\begin{align*}
\E_{Y|A,Z}^\calE [Y \mid  A=\ra, Z=z] =& \iint y \: f_{Y\mid A, X}^\calE (y\mid \ra, x) f_{X\mid Z}^\calE (x\mid z) \: dy \: dx.
\end{align*}
Note that in this integral there is a potential \emph{incompatibility} in the conditioning between
\[
f_{Y\mid A, X}^\calE (y\mid \ra, x) \qquad \textrm{and} \qquad
f_{X\mid Z}^\calE (x\mid z)
\]
when we try to write the integral in terms of the data generating mechanism.  As before, choosing the form of the outcome conditional model
\[
f_{Y\mid A, X}^\calE (y\mid \ra, x)
\]
is to be avoided if possible due to the danger of mis-specification.  If the choice is made correctly, then it can form the basis of a regression estimator.

\bigskip

We seek to resolve the incompatibility using the importance sampling trick, and write the expectation with respect to the observational model
\[
f_{Y|X,Z}^\calO (y|x,z) f_{Z|X}^\calO (z|x) f_X^\calO(x).
\]
First note that
\[
f_{X\mid Z}^\calE (x\mid z) = \frac{f_{Z\mid X}^\calE (z\mid x) f_X^\calE(x)}{f_Z^\calE(z)}
\]
so the integral can be rewritten
\[
\frac{1}{f_Z^\calE(z)} \iint y \: f_{Y\mid A, X}^\calE (y\mid \ra, x) f_{Z\mid X}^\calE (z\mid x) f_X^\calE(x) \: dy \: dx.
\]
For $\ra=\rnotmt,\rtmt$, we can re-write the integrand using the importance sampling trick as
\[
y \: f_{Y\mid A, X}^\calE (y\mid \ra, x) \frac{f_{Z\mid X}^\calE (z\mid x)}{f_{Z\mid X}^\calE (\ra \mid x)} \: f_{Z\mid X}^\calE (\ra \mid x) f_X^\calE(x)
\]
which can be rearranged to
\[
\left\{ y \: \frac{f_{Z\mid X}^\calE (z\mid x)}{f_{Z\mid X}^\calE (\ra \mid x)} \right\} \: f_{Y\mid A, X}^\calE (y\mid \ra, x) \: f_{Z\mid X}^\calE (\ra \mid x) f_X^\calE(x).
\]




Comparing the observational and experimental DAGs, we see that for all $x$ and $z$
\[
f_{Z\mid X}^\calE (z\mid x) \equiv f_{Z\mid X}^\calO (z\mid x)
\qquad
f_X^\calE (x) \equiv  f_X^\calO (x) \qquad  f_Z^\calE (z) \equiv  f_Z^\calO(z).
\]
Also, we have for any $t$ and $y$ that
\[
f_{Y\mid A, X}^\calE (y\mid t , x) \equiv f_{Y\mid X,Z}^\calO (y\mid x,t).
\]
Therefore we have
\[
f_{Y\mid A, X}^\calE (y\mid \ra, x) \: f_{Z\mid X}^\calE (\ra \mid x) f_X^\calE(x) \equiv  f_{Y\mid X,Z}^\calO (y\mid x,\ra ) \: f_{Z\mid X}^\calO (\ra \mid x) f_X^\calO(x).
\]


Thus
\begin{align*}
\E_{Y|A,Z}^\calE [Y \mid  A=\ra, Z=z] & = \frac{1}{f_Z^\calO (z)} \iint \left\{ y \frac{f_{Z\mid X}^\calO (z\mid x)}{f_{Z\mid X}^\calO (\ra \mid x)} \right\} \: f_{X,Y,Z}^\calO (x,y,\ra) \: dy \: dx \\[6pt]
& \quad = \frac{1}{f_Z^\calO (z)} \iiint \left\{ \Ind_{\{\ra\}}(t) y \frac{f_{Z\mid X}^\calO (z\mid x)}{f_{Z\mid X}^\calO (t \mid x)} \right\} \: f_{X,Y,Z}^\calO (x,y,t) \: dy \: dx \: dt \\[6pt]
& \quad = \frac{1}{f_Z^\calO(z)} \E_{X,Y,Z}^\calO \left[ \Ind_{\{\ra\}}(Z) Y \: \dfrac{f_{Z\mid X}^\calO (z\mid X)}{f_{Z\mid X}^\calO (Z \mid X)} \right].
\end{align*}

For the ATT, we are interested only in $z=1$. The moment-based estimator is therefore
\[
\widehat \E_{Y|A,Z}^\calE [Y \mid  A=\ra, Z=1] = \dfrac{\sum\limits_{i=1}^n \Ind_{\{\ra\}}(Z_i) w_1(X_i,Z_i) Y_i }{\sum\limits_{i=1}^n \Ind_{\{1\}}(Z_i) }
\qquad
\textrm{where}
\qquad
w_z(X_i,Z_i) = \dfrac{f_{Z\mid X}^\calO (z\mid X_i)}{f_{Z\mid X}^\calO (Z_i \mid X_i)}
\]


\begin{itemize}
\item When $\ra = \rtmt$,
\[
\Ind_{\{\ra\}}(Z_i) w_1(X_i,Z_i) = \Ind_{\{\rtmt\}}(Z_i) = Z_i \quad \textrm{w.p. 1}
\]
as the weight is identically 1, so therefore
\[
\widehat \E_{Y|A,Z}^\calE [Y \mid  A=\rtmt, Z=1] = \dfrac{\sum\limits_{i=1}^n Z_i Y_i }{\sum\limits_{i=1}^n Z_i }
\]
that is, the mean in the treated group.


\item When $\ra = \rnotmt$,
\begin{align*}
\Ind_{\{\ra\}}(Z_i) w_1(X_i,Z_i) & = \Ind_{\{\rnotmt\}}(Z_i) \dfrac{f_{Z\mid X}^\calO (1 \mid X_i)}{f_{Z \mid X}^\calO (Z_i \mid X_i)}=  (1-Z_i) \dfrac{f_{Z\mid X}^\calO (1 \mid X_i)}{f_{Z \mid X}^\calO (\rnotmt \mid X_i)}.
\end{align*}
Therefore
\[
\widehat \E_{Y|A,Z}^\calE [Y \mid  A=\rnotmt, Z=1] = \dfrac{\sum\limits_{i=1}^n (1-Z_i) w(X_i) Y_i }{\sum\limits_{i=1}^n Z_i }
\]
where
\[
w(X_i) = \dfrac{f_{Z\mid X}^\calO (1 \mid X_i)}{f_{Z \mid X}^\calO (\rnotmt \mid X_i)} = \frac{e(X_i)}{1-e(X_i)}
\]
That is, this estimator is a weighted sum of contributions from the \emph{untreated} individuals.
\end{itemize}
Thus the estimator for the ATT is
\begin{equation}\label{eq:ATTest}
\widehat \E_{Y|A,Z}^\calE [Y \mid  A=\rtmt, Z=1] - \widehat \E_{Y|A,Z}^\calE [Y \mid  A=\rnotmt, Z=1] = \dfrac{\sum\limits_{i=1}^n (Z_i-(1-Z_i) w(X_i)) Y_i  }{\sum\limits_{i=1}^n Z_i }.
\end{equation}
Under the standard assumptions, this estimator is consistent for the ATT and asymptotically normally distributed if
\[
e(x) = f_{Z \mid X}^\calO (1 \mid x)
\]
is correctly specified; that is, the estimator is \emph{singly robust}.

\pagebreak

\textbf{Example:} In this simulation study, we have

\begin{itemize}

\item $X \sim Uniform(0,10)$

\item $Z|X=x \sim Bernoulli(e(x))$, with
\[
e(x) = \frac{\exp\{-3 + 0.2 x\}}{1+\exp\{-3 + 0.2 x\}}
\]

\item $Y | X=x, Z=z \sim Normal(\mu(x,z),1)$ with
\[
\mu(x,z) = x - \frac{1}{2} x \log x + \frac{5x}{1+x} + z (2 + 0.25 x)
\]
\end{itemize}
Thus the ATT is
\[
2 + 0.25 \mu_X(1)
\]
where
\[
\mu_X(1) = \int x f_{X|Z}^\calO(x|1) \ dx = \frac{\displaystyle \iint \Ind_{\{1\}}(z) x f_{X,Z}^\calO(x,z) \ dx \ dz}{\displaystyle \iint \Ind_{\{1\}}(z) f_{X,Z}^\calO(x,z) \ dx \ dz}
\]
which can be estimated by the sample mean in the subset of observations with $Z=1$.

<<C1,comment='+', fig.width=7.5, fig.height=4.0, fig.align='center'>>=
# Fix the RNG seed so that the simulated data are reproducible.
set.seed(2984)
# Coefficients (intercept, slope) of the logistic propensity model e(x).
al<-c(-3,0.2)
# Treatment-effect coefficients: treatment shifts the outcome mean by
# psi[1] + psi[2]*x.
psi<-c(2,0.25)
# Treatment-free part of the outcome mean: x - 0.5*x*log(x) + 5*x/(1+x).
h.func<-function(xv) {return(xv-0.5*xv*log(xv)+5*xv/(1+xv))}
#Large sample Monte Carlo calculation of true ATT
N <- 10000000
X<-runif(N,0,10)
# Propensity score for each unit: inverse logit of al[1] + al[2]*X.
pi.vec<-1/(1+exp(-cbind(1,X) %*% al))
Z<-rbinom(N,1,pi.vec)
# ATT = psi[1] + psi[2]*E[X | Z=1]; the conditional mean is approximated by
# the sample mean of X among the treated in this large sample.
att <- psi[1] + psi[2]*mean(X[Z==1])
att
# Outcomes for the large sample: mean h(X) + Z*(psi[1]+psi[2]*X), unit-variance
# Normal noise.
Y<-h.func(X)+Z*(psi[1]+psi[2]*X)+rnorm(N)
@

Under correct specification, a regression estimator may be used
<<C2,comment='+', fig.width=7.5, fig.height=4.0, fig.align='center'>>=
# Regression estimator under the correct outcome model: the treatment-free
# mean h(X) is supplied as a known offset and the intercept is removed, so the
# only fitted coefficients are those of Z and Z:X (in that order).
fit0 <- lm(Y ~ -1 + offset(h.func(X)) + Z + Z:X)
psi.hat <- coef(fit0)
# Plug-in ATT: main effect plus interaction times the treated-group mean of X.
psi.hat[1] + psi.hat[2] * mean(X[Z == 1])
@
However, under mis-specification, the incorrect answer is obtained:
<<C3,comment='+', fig.width=7.5, fig.height=4.0, fig.align='center'>>=
# Mis-specified outcome regression: the treatment-free mean h(X) is replaced
# by a straight line in X.
fit1<-lm(Y~X+Z+Z:X)
psi.hat<-coef(fit1)
# Coefficients are ordered (Intercept), X, Z, X-by-Z interaction, so the
# treatment main effect and the interaction are elements 3 and 4; element 2 is
# the slope on X and is not part of the treatment effect.
psi.hat[3]+psi.hat[4]*mean(X[Z==1])
@

The estimator in \eqref{eq:ATTest} is computed as follows, and gives the correct answer:
<<C4,comment='+', fig.width=7.5, fig.height=4.0, fig.align='center'>>=
# Estimated propensity scores from a logistic regression of Z on X.
eX <- fitted(glm(Z ~ X, family = binomial))
# Weights: 1 for treated units, eX/(1-eX) for untreated units.
w.hat <- Z + (1 - Z) * eX / (1 - eX)
# Singly robust ATT estimator. Because Z is binary, (1-Z)*w.hat reduces to
# (1-Z)*eX/(1-eX), the weight applied to the untreated outcomes.
att.hat <- sum((Z - (1 - Z) * w.hat) * Y) / sum(Z)
att.hat
@


\textbf{Double robustness:} To achieve \emph{double robustness}, we augment the estimand for $\ra = \rnotmt$ as follows:
\[
\E[Y(\rnotmt)\mid Z=1] =  \E[Y(\rnotmt) - \mu(X,\rnotmt) | Z=1] +  \E[\mu(X,\rnotmt) | Z=1]
\]
where
\[
\mu(x,z) = \E[Y\mid X=x,Z=z]
\]
is the modelled conditional mean for $Y$.  To estimate the first term, we use
\[
\widehat{\E}[Y(\rnotmt)-\mu(X,\rnotmt)\mid Z=1] = \dfrac{\sum\limits_{i=1}^n (1-Z_i) w(X_i) (Y_i-\mu(X_i,\rnotmt))}{\sum\limits_{i=1}^n Z_i }
\]
as in the singly robust case. For the second term, we have
\[
\widehat{\E}[\mu(X,\rnotmt)\mid Z=1] = \dfrac{\sum\limits_{i=1}^n Z_i \mu(X_i,0)}{\sum\limits_{i=1}^n Z_i }
\]


Therefore
\begin{align*}
  \widehat{\E}[Y(\rnotmt)&\mid Z=1] = \dfrac{ \sum\limits_{i=1}^n (1-Z_i) w(X_i) (Y_i-\mu(X_i,\rnotmt)) + Z_i \mu(X_i,\rnotmt)}{\sum\limits_{i=1}^n Z_i}
\end{align*}
which yields the augmented ATT estimator
\begin{equation}\label{eq:ATT-DRest}
\dfrac{\sum\limits_{i=1}^n (Z_i-(1-Z_i) w(X_i))  (Y_i-\mu(X_i,\rnotmt)) }{\sum\limits_{i=1}^n Z_i }.
\end{equation}

<<C5,comment='+', fig.width=7.5, fig.height=4.0, fig.align='center'>>=
# Doubly robust estimator 1: weighted least squares with working outcome model
# mu(x,z) = beta0 + z*(psi0 + psi1*x). Coefficients: (Intercept), Z, Z:X.
w.mod <- lm(Y ~ Z + Z:X, weights = w.hat)
att.dr1 <- coef(w.mod)[2] + coef(w.mod)[3] * mean(X[Z == 1])
att.dr1
# Doubly robust estimator 2: working outcome model
# mu(x,z) = beta0 + beta1*x + z*(psi0 + psi1*x). Coefficients: (Intercept), Z,
# X, Z:X, so the treatment terms are elements 2 and 4.
w.mod2 <- lm(Y ~ Z * X, weights = w.hat)
att.dr2 <- coef(w.mod2)[2] + coef(w.mod2)[4] * mean(X[Z == 1])
att.dr2
@

\textbf{Monte Carlo study: } we carry out a Monte Carlo study with 10000 replicates with $n=1000$:
<<C6,comment='+', fig.width=7.5, fig.height=4.0, fig.align='center'>>=
# Monte Carlo comparison of four ATT estimators.
# Each row of ests.mat holds one replicate; the columns are the regression,
# singly robust and two doubly robust estimates, in that order.
nreps<-10000
ests.mat<-matrix(0,nrow=nreps,ncol=4)
n<-1000
for(irep in 1:nreps) {
    # Simulate one data set of size n from the data generating model.
    X<-runif(n,0,10)
    eX0<-1/(1+exp(-cbind(1,X) %*% al))
    Z<-rbinom(n,1,eX0)
    Y<-psi[1]*Z+psi[2]*Z*X+h.func(X)+rnorm(n)
    # Correctly specified outcome regression with h(X) as a known offset.
    fit0<-lm(Y~-1+offset(h.func(X))+Z+Z:X)
    psi.hat<-coef(fit0)
    ests.mat[irep,1]<-psi.hat[1]+psi.hat[2]*mean(X[Z==1])           #Regression estimator
    # Estimated propensity scores and the weights built from them
    # (1 for treated, eX/(1-eX) for untreated).
    eX<-fitted(glm(Z~X,family=binomial))
    w.hat<-Z + (1-Z)*eX/(1-eX)
    ests.mat[irep,2]<-sum((Z-(1-Z)*w.hat)*Y)/sum(Z)                 #Singly robust ATT estimator
    # Weighted regressions; coefficient order is (Intercept), Z, Z:X for w.mod
    # and (Intercept), Z, X, Z:X for w.mod2.
    w.mod <- lm(Y~Z+Z:X,weights=w.hat)
    ests.mat[irep,3]<-coef(w.mod)[2]+coef(w.mod)[3]*mean(X[Z==1])   #Doubly robust ATT estimator 1
    w.mod2 <- lm(Y~Z*X,weights=w.hat)
    ests.mat[irep,4]<-coef(w.mod2)[2]+coef(w.mod2)[4]*mean(X[Z==1]) #Doubly robust ATT estimator 2

}
@
<<C7,comment='+', fig.width=7.5, fig.height=5.0, fig.align='center'>>=
# Labels for the four columns of ests.mat.
nv <- c('Regression', 'ATT', 'ATT-DR1', 'ATT-DR-2')
# Plot margins in lines of text: bottom, left, top, right.
par(mar = c(3, 3, 2, 0))
# Box plots of the estimates across replicates, with a dashed red reference
# line at the large-sample Monte Carlo value of the ATT.
boxplot(ests.mat, names = nv, pch = 19, cex = 0.5)
abline(h = att, col = 'red', lty = 2)
# Variance of each estimator across replicates, scaled by the sample size n.
n * apply(ests.mat, MARGIN = 2, FUN = var)
@
\end{document}