\documentclass[notitlepage]{article}
% Course style file; provides the notation macros used in the body.
\usepackage{Math}
%\usepackage{../../Math598}
\usepackage{listings}
\usepackage{numprint}
\usepackage{enumerate}
\usepackage{bm}
\usepackage{bbm}
\usepackage{verbatim}
\usepackage{mathtools}
\usepackage{dsfont}
\usepackage{scalerel}
\usepackage{amsfonts,amsmath,amssymb}
\usepackage{tikz}
\usetikzlibrary{shapes,decorations,arrows,calc,arrows.meta,fit,positioning}
\usepackage{pgfplots}
\pgfplotsset{compat=1.17}
% Circled character, vertically aligned with the surrounding text baseline.
\newcommand*\circled[1]{\tikz[baseline=(char.base)]{ \node[shape=circle,draw,inner sep=2pt] (char) {#1};}}
\usepackage{xcolor}

% Theorem-like environments.
\newtheorem{alg}{Algorithm}
\newtheorem{ex}{Example}
\newtheorem{note}{Note}
\newenvironment{proof}{\par\noindent{\normalfont\bfseries Proof }}
  {\hfill \small$\blacksquare$\\[2mm]}
\setlength{\parindent}{0in}

% Notation shortcuts.
\def\E{\mathbbmss{E}}
\def\Var{\text{Var}}
\def\Cov{\text{Cov}}
\def\Corr{\text{Corr}}
\def\bW{\mathbf{W}}
\def\bH{\mathbf{H}}
\newcommand{\bs}[1]{\bm{#1}}
\newcommand{\Rho}{\mathrm{P}}
\newcommand{\cif}{\text{ if }}
\newcommand{\Ra}{\Longrightarrow}
\def\bphi{\boldsymbol \phi}

% Listings set-up for R code.
\lstloadlanguages{R}
\definecolor{keywordcolor}{rgb}{0,0.6,0.6}
\definecolor{delimcolor}{rgb}{0.461,0.039,0.102}
\definecolor{Rcommentcolor}{rgb}{0.101,0.043,0.432}
\lstdefinestyle{Rsettings}{
  basicstyle=\ttfamily,
  breaklines=true,
  showstringspaces=false,
  keywords={if, else, function, theFunction, tmp}, % Write as many keywords
  otherkeywords={},
  commentstyle=\itshape\color{Rcommentcolor},
  keywordstyle=\color{keywordcolor},
  moredelim=[s][\color{delimcolor}]{"}{"},
}
\lstset{basicstyle=\ttfamily, numbers=none, literate={~} {$\sim$}{2}}

\def\Xb{\mathbf{X}_{\beta}}
\def\Xp{\mathbf{X}_{\psi}}
\def\beps{\boldsymbol{\epsilon}}
\def\betahat{\widehat \beta}
\def\psihat{\widehat \psi}

\begin{document}

% NOTE(review): chunk header restored to valid knitr syntax; any original
% label/options (e.g. include=FALSE) were lost and should be confirmed.
<<>>=
library(knitr)
# global chunk options
opts_chunk$set(cache=TRUE, autodep=TRUE)
options(scipen=6)
options(repos=c(CRAN="https://cloud.r-project.org/"))
# Replace the \usepackage[]{color} line that knitr injects with xcolor.
knit_hooks$set(document = function(x) {sub('\\usepackage[]{color}',
'\\usepackage{xcolor}', x, fixed = TRUE)})
@

\begin{center}
{\textsclarge{The Average Treatment Effect on the Treated}}
\end{center}

\tikzset{
    -Latex,auto,node distance =1 cm and 1 cm,semithick,
    state/.style ={circle, draw, minimum width = 0.7 cm},
    box/.style ={rectangle, draw, minimum width = 0.7 cm, fill=lightgray},
    point/.style = {circle, draw, inner sep=0.08cm,fill,node contents={}},
    bidirected/.style={Latex-Latex,dashed},
    el/.style = {inner sep=3pt, align=left, sloped}
}
The \emph{average treatment effect on the treated} (ATT) is
\[
\E[Y(\rtmt) - Y(\rnotmt) \given Z=1]
\]
that is, the ATT aims to identify the causal effect of intervening to change $Z=\rnotmt$ to $Z=\rtmt$ but \emph{only} in the subpopulation of individuals who are \emph{observed} to receive treatment.
Note that
\begin{align*}
\E[Y(\rtmt) - Y(\rnotmt)] & = \E[Y(\rtmt) - Y(\rnotmt) \given Z=0] \Pr[Z=0] + \E[Y(\rtmt) - Y(\rnotmt) \given Z=1] \Pr[Z=1].
\end{align*}
In this calculation, we imagine
\begin{itemize}
\item the observational distribution $f_{X,Y,Z}^\calO$ generating the observed data $\{(x_i,y_i,z_i), i=1,\ldots,n\}$,
\item in the subgroup observed to have $Z=1$, we then consider a second (hypothetical) experimental intervention to change $Z$ to $\rz$ which over-rides the original $Z$ if $\rz = \rnotmt$,
\item we then consider comparison of outcomes between the two hypothetical subgroups indexed by $\rz$.
\end{itemize}

\tikzset{
    -Latex,auto,node distance =1 cm and 1 cm,semithick,
    state/.style ={circle, draw, minimum width = 0.7 cm},
    box/.style ={rectangle, draw, minimum width = 0.7 cm, fill=lightgray},
    cbox/.style ={rounded rectangle, draw, minimum width = 0.7 cm},
    point/.style = {circle, draw, inner sep=0.08cm,fill,node contents={}},
    bidirected/.style={Latex-Latex,dashed},
    el/.style = {inner sep=3pt, align=left, sloped}
}
\begin{figure}[ht]
\centering
\begin{tikzpicture}[scale=1.5]
    % x node set with absolute coordinates
    \node[state] (x) at (-1,0) {${X}$};
    \node[cbox] (yz) at (1,1) {${\{Y(\rnotmt), Y(\rtmt)\}}$};
    \node[state] (z) at (1,-1) {${Z}$};
    \node[state] (y) at (3,0) {${Y}$};
    % Directed edge
    \path (x) edge (z);
    \path (x) edge (yz);
    \path (yz) edge (y);
    \path (z) edge (y);
    \node at (1,-2) {$f_X(x) \: f_{Z|X}(z|x) \: f_{Y(\rnotmt),Y(\rtmt)|X}(y_0,y_1|x) \: f_{Y|Z,Y(\rnotmt),Y(\rtmt)}(y|z,y_0,y_1)$};
\end{tikzpicture}
\end{figure}
For the ATT, we can represent the quantity of interest using a modified DAG that proposes a second hypothetical binary treatment, $A$.
We allow $Z$ to cause $A$, and then allow $Z$ to act as a selection mechanism, but ensure that we have the conditional independence of $X$ and $A$ given $Z$, $X \independent A \given Z$.
\begin{figure}[ht]
\centering
\begin{tikzpicture}[scale=1.5]
    % x node set with absolute coordinates
    \node[state] (x) at (-1,0) {${X}$};
    \node[cbox] (yz) at (1,1) {${\{Y(\rnotmt), Y(\rtmt)\}}$};
    \node[state] (z) at (0.25,-1) {${Z}$};
    \node[state] (a) at (1.75,-1) {${A}$};
    \node[state] (y) at (3,0) {${Y}$};
    % Directed edge
    \path (x) edge (z);
    \path (x) edge (yz);
    \path (yz) edge (y);
    \path (z) edge (a);
    \path (a) edge (y);
\end{tikzpicture}
\end{figure}
The implied joint density is
\[
f_X(x) \: f_{Z|X}(z|x) \: f_{Y(\rnotmt),Y(\rtmt)|X}(y_0,y_1|x) \: f_{A|Z}(a|z) \: f_{Y|A,Y(\rnotmt),Y(\rtmt)}(y|a,y_0,y_1).
\]
This is the new `experimental' distribution $\mathcal{E}$.
\[
f_{Y|A,Y(\rnotmt),Y(\rtmt)}^\calE(y|a,y_0,y_1) = \left\{ \begin{array}{cc} 1 & y=(1-a)y_0 + a y_1 \\[6pt] 0 & \text{otherwise} \end{array} \right. .
\]
As $A$ is binary, the model $f_{A|Z}(a|z)$ must take the form
\[
f_{A|Z}^\calE(a|z) = p_z^a (1-p_z)^{1-a} \qquad a,z \in \{0,1\}
\]
with $0 \leq p_z \leq 1$ for $z = 0,1$.
We can then express the ATT via the new DAG as
\[
\E_{Y|A,Z}^\calE [Y|A=\rtmt,Z=1] - \E_{Y|A,Z}^\calE [Y|A=\rnotmt,Z=1].
\]
We have from the DAG that
\[
f_{Y |A,X,Z}^\calE (y |a,x,z) \equiv f_{Y |A,X}^\calE (y |a,x)
\]
and hence as before
\begin{align*}
f_{Y |A,X}^\calE (y |a,x) & = \iint f_{Y|A,Y(\rnotmt),Y(\rtmt)}^\calE (y|a,y_0,y_1) f_{Y(\rnotmt),Y(\rtmt)|A,X}^\calE (y_0,y_1|a,x) \ d y_0 \ d y_1 = f_{Y(a)|X}^\calE (y|x).
\end{align*}
Also from the DAG, $X \independent A \given Z$, so for all $a,x,z$
\[
f_{X|A,Z}^\calE (x|a,z) \equiv f_{X|Z}^\calE (x|z).
\]
For $\ra=\rnotmt,\rtmt$, we therefore have
\begin{align*}
\E_{Y|A,Z}^\calE [Y \mid A=\ra, Z=z] & = \iint y \: f_{Y\mid A, X}^\calE (y\mid \ra, x) f_{X\mid Z}^\calE (x\mid z) \: dy \: dx.
\end{align*}
Note that in this integral there is a potential \emph{incompatibility} in the conditioning between
\[
f_{Y\mid A, X}^\calE (y\mid \ra, x) \qquad \textrm{and} \qquad f_{X\mid Z}^\calE (x\mid z)
\]
when we try to write the integral in terms of the data generating mechanism.
As before, choosing the form of the outcome conditional model
\[
f_{Y\mid A, X}^\calE (y\mid \ra, x)
\]
is to be avoided if possible due to the danger of mis-specification.
If the choice is made correctly, then it can form the basis of a regression estimator.
\bigskip
We seek to resolve the incompatibility using the importance sampling trick, and write the expectation with respect to the observational model
\[
f_{Y|X,Z}^\calO (y|x,z) f_{Z|X}^\calO (z|x) f_X^\calO(x).
\] First note that \[ f_{X\mid Z}^\calE (x\mid z) = \frac{f_{Z\mid X}^\calE (z\mid x) f_X^\calE(x)}{f_Z^\calE(z)} \] so the integral can be rewritten \[ \frac{1}{f_Z^\calE(z)} \iint y \: f_{Y\mid A, X}^\calE (y\mid \ra, x) f_{Z\mid X}^\calE (z\mid x) f_X^\calE(x) \: dy \: dx. \] For $\rz=0,1$, we can re-write the integrand using the importance sampling trick as \[ y \: f_{Y\mid A, X}^\calE (y\mid \ra, x) \frac{f_{Z\mid X}^\calE (z\mid x)}{f_{Z\mid X}^\calE (\ra \mid x)} \: f_{Z\mid X}^\calE (\ra \mid x) f_X^\calE(x) \] which can be rearranged to \[ \left\{ y \: \frac{f_{Z\mid X}^\calE (z\mid x)}{f_{Z\mid X}^\calE (\ra \mid x)} \right\} \: f_{Y\mid A, X}^\calE (y\mid \ra, x) \: f_{Z\mid X}^\calE (\ra \mid x) f_X^\calE(x), \] Comparing the observational and experimental DAGs, we see that for all $x$ and $z$ \[ f_{Z\mid X}^\calE (z\mid x) \equiv f_{Z\mid X}^\calO (z\mid x) \qquad f_X^\calE (x) \equiv f_X^\calO (x) \qquad f_Z^\calE (z) \equiv f_Z^\calO(z). \] Also, we have for any $t$ and $y$ that \[ f_{Y\mid A, X}^\calE (y\mid t , x) \equiv f_{Y\mid X,Z}^\calO (y\mid x,t). \] Therefore we have \[ f_{Y\mid A, X}^\calE (y\mid \ra, x) \: f_{Z\mid X}^\calE (\ra \mid x) f_X^\calE(x) \equiv f_{Y\mid X,Z}^\calO (y\mid x,\ra ) \: f_{Z\mid X}^\calO (\ra \mid x) f_X^\calO(x). \] Thus \begin{align*} \E_{Y|A,Z}^\calE [Y \mid A=\ra, Z=z] & = \frac{1}{f_Z^\calO (z)} \iint \left\{ y \frac{f_{Z\mid X}^\calO (z\mid x)}{f_{Z\mid X}^\calO (\ra \mid x)} \right\} \: f_{X,Y,Z}^\calO (x,y,\ra) \: dy \: dx \\[6pt] & \quad = \frac{1}{f_Z^\calO (z)} \iiint \left\{ \Ind_{\{\ra\}}(t) y \frac{f_{Z\mid X}^\calO (z\mid x)}{f_{Z\mid X}^\calO (t \mid x)} \right\} \: f_{X,Y,Z}^\calO (x,y,t) \: dy \: dx \: dt \\[6pt] & \quad = \frac{1}{f_Z^\calO(z)} \E_{X,Y,Z}^\calO \left[ \Ind_{\{\ra\}}(Z) Y \: \dfrac{f_{Z\mid X}^\calO (z\mid X)}{f_{Z\mid X}^\calO (Z \mid X)} \right]. \end{align*} For the ATT, we are interested only in $z=1$. 
The moment-based estimator is therefore
\[
\widehat \E_{Y|A,Z}^\calE [Y \mid A=\ra, Z=1] = \dfrac{\sum\limits_{i=1}^n \Ind_{\{\ra\}}(Z_i) w_1(X_i,Z_i) Y_i }{\sum\limits_{i=1}^n \Ind_{\{1\}}(Z_i) } \qquad \textrm{where} \qquad w_z(X_i,Z_i) = \dfrac{f_{Z\mid X}^\calO (z\mid X_i)}{f_{Z\mid X}^\calO (Z_i \mid X_i)}.
\]
\begin{itemize}
\item When $\ra = \rtmt$,
\[
\Ind_{\{\ra\}}(Z_i) w_1(X_i,Z_i) = \Ind_{\{\rtmt\}}(Z_i) = Z_i \quad \textrm{w.p.\ 1}
\]
as the weight is identically 1, so
\[
\widehat \E_{Y|A,Z}^\calE [Y \mid A=\rtmt, Z=1] = \dfrac{\sum\limits_{i=1}^n Z_i Y_i }{\sum\limits_{i=1}^n Z_i }
\]
that is, the mean in the treated group.
\item When $\ra = \rnotmt$,
\begin{align*}
\Ind_{\{\ra\}}(Z_i) w_1(X_i,Z_i) & = \Ind_{\{\rnotmt\}}(Z_i) \dfrac{f_{Z\mid X}^\calO (1 \mid X_i)}{f_{Z \mid X}^\calO (Z_i \mid X_i)}= (1-Z_i) \dfrac{f_{Z\mid X}^\calO (1 \mid X_i)}{f_{Z \mid X}^\calO (\rnotmt \mid X_i)}.
\end{align*}
Therefore
\[
\widehat \E_{Y|A,Z}^\calE [Y \mid A=\rnotmt, Z=1] = \dfrac{\sum\limits_{i=1}^n (1-Z_i) w(X_i) Y_i }{\sum\limits_{i=1}^n Z_i }
\]
where
\[
w(X_i) = \dfrac{f_{Z\mid X}^\calO (1 \mid X_i)}{f_{Z \mid X}^\calO (\rnotmt \mid X_i)} = \frac{e(X_i)}{1-e(X_i)}.
\]
That is, this estimator is a weighted sum of contributions from the \emph{untreated} individuals.
\end{itemize}
Thus the estimator for the ATT is
\begin{equation}\label{eq:ATTest}
\widehat \E_{Y|A,Z}^\calE [Y \mid A=\rtmt, Z=1] - \widehat \E_{Y|A,Z}^\calE [Y \mid A=\rnotmt, Z=1] = \dfrac{\sum\limits_{i=1}^n (Z_i-(1-Z_i) w(X_i)) Y_i }{\sum\limits_{i=1}^n Z_i }.
\end{equation}
Under the standard assumptions, this estimator is consistent for the ATT and asymptotically normally distributed if the propensity score model
\[
e(x) = f_{Z \mid X}^\calO (1 \mid x)
\]
is correctly specified; that is, the estimator is \emph{singly robust}.
\pagebreak
\textbf{Example:} In this simulation study, we have
\begin{itemize}
\item $X \sim \text{Uniform}(0,10)$
\item $Z|X=x \sim \text{Bernoulli}(e(x))$, with
\[
e(x) = \frac{\exp\{-3 + 0.2 x\}}{1+\exp\{-3 + 0.2 x\}}
\]
\item $Y | X=x, Z=z \sim \text{Normal}(\mu(x,z),1)$ with
\[
\mu(x,z) = x - \frac{1}{2} x \log x + \frac{5x}{1+x} + z (2 + 0.25 x)
\]
\end{itemize}
Thus the ATT is
\[
2 + 0.25 \mu_X(1)
\]
where
\[
\mu_X(1) = \int x f_{X|Z}^\calO(x|1) \ dx = \frac{\displaystyle \iint \Ind_{\{1\}}(z) x f_{X,Z}^\calO(x,z) \ dx \ dz}{\displaystyle \iint \Ind_{\{1\}}(z) f_{X,Z}^\calO(x,z) \ dx \ dz}
\]
which can be estimated by the sample mean in the subset of observations with $Z=1$.

% NOTE(review): chunk headers restored to valid knitr syntax (<<>>=); any
% original chunk labels/options were lost and should be confirmed.
<<>>=
set.seed(2984)
al<-c(-3,0.2)    # propensity score coefficients: logit e(x) = al[1] + al[2]*x
psi<-c(2,0.25)   # treatment effect coefficients: psi[1] + psi[2]*x
# Treatment-free part of the outcome mean
h.func<-function(xv) {return(xv-0.5*xv*log(xv)+5*xv/(1+xv))}
#Large sample Monte Carlo calculation of true ATT
N <- 10000000
X<-runif(N,0,10)
pi.vec<-1/(1+exp(-cbind(1,X) %*% al))
Z<-rbinom(N,1,pi.vec)
att <- psi[1] + psi[2]*mean(X[Z==1])
att
Y<-h.func(X)+Z*(psi[1]+psi[2]*X)+rnorm(N)
@

Under correct specification, a regression estimator may be used:
<<>>=
# No intercept and a known offset, so the coefficients are (Z, Z:X).
fit0<-lm(Y~-1+offset(h.func(X))+Z+Z:X)
psi.hat<-coef(fit0)
psi.hat[1]+psi.hat[2]*mean(X[Z==1])
@

However, under mis-specification, the incorrect answer is obtained:
<<>>=
# Coefficients are ordered (Intercept), X, Z, Z-by-X interaction, so the
# treatment-effect terms are elements 3 and 4 (element 2 is the X slope).
fit1<-lm(Y~X+Z+Z:X)
psi.hat<-coef(fit1)
psi.hat[3]+psi.hat[4]*mean(X[Z==1])
@

The estimator in~\eqref{eq:ATTest} is computed as follows, and gives the correct answer:
<<>>=
eX<-fitted(glm(Z~X,family=binomial))   # estimated propensity score
# Weight 1 for treated units, odds e(X)/(1-e(X)) for untreated units
w.hat<-Z + (1-Z)*eX/(1-eX)
att.hat<-sum((Z-(1-Z)*w.hat)*Y)/sum(Z)
att.hat
@

\textbf{Double robustness:} To achieve \emph{double robustness}, we augment the estimand for $\ra = \rnotmt$ as follows:
\[
\E[Y(\rnotmt)\mid Z=1] = \E[Y(\rnotmt) - \mu(X,\rnotmt) \mid Z=1] + \E[\mu(X,\rnotmt) \mid Z=1]
\]
where
\[
\mu(x,z) = \E[Y\mid X=x,Z=z]
\]
is the modelled conditional mean for $Y$.
To estimate the first term, we use
\[
\widehat{\E}[Y(\rnotmt)-\mu(X,\rnotmt)\mid Z=1] = \dfrac{\sum\limits_{i=1}^n (1-Z_i) w(X_i) (Y_i-\mu(X_i,\rnotmt))}{\sum\limits_{i=1}^n Z_i }
\]
as in the singly robust case.
For the second term, we have
\[
\widehat{\E}[\mu(X,\rnotmt)\mid Z=1] = \dfrac{\sum\limits_{i=1}^n Z_i \mu(X_i,\rnotmt)}{\sum\limits_{i=1}^n Z_i }.
\]
Therefore
\begin{align*}
\widehat{\E}[Y(\rnotmt)\mid Z=1] & = \dfrac{ \sum\limits_{i=1}^n (1-Z_i) w(X_i) (Y_i-\mu(X_i,\rnotmt)) + Z_i \mu(X_i,\rnotmt)}{\sum\limits_{i=1}^n Z_i}
\end{align*}
which yields the augmented ATT estimator
\begin{equation}\label{eq:ATT-DRest}
\dfrac{\sum\limits_{i=1}^n (Z_i-(1-Z_i) w(X_i)) (Y_i-\mu(X_i,\rnotmt)) }{\sum\limits_{i=1}^n Z_i }.
\end{equation}

% NOTE(review): chunk headers restored to valid knitr syntax (<<>>=); any
% original chunk labels/options were lost and should be confirmed.
<<>>=
#DR estimator with mu(x,z) = beta0 + Z (psi[1]+psi[2]*X)
# Coefficients are (Intercept), Z, Z:X
w.mod <- lm(Y~Z+Z:X,weights=w.hat)
att.dr1<-coef(w.mod)[2]+coef(w.mod)[3]*mean(X[Z==1])
att.dr1
#DR estimator with mu(x,z) = beta0 + beta1 X + Z (psi[1]+psi[2]*X)
# Coefficients are (Intercept), Z, X, Z:X
w.mod2 <- lm(Y~Z*X,weights=w.hat)
att.dr2<-coef(w.mod2)[2]+coef(w.mod2)[4]*mean(X[Z==1])
att.dr2
@

\textbf{Monte Carlo study:} We carry out a Monte Carlo study with 10000 replicates, each with sample size $n=1000$:
<<>>=
nreps<-10000
# One row per replicate; columns hold the four estimators compared below
ests.mat<-matrix(0,nrow=nreps,ncol=4)
n<-1000
for(irep in seq_len(nreps)) {
    # Simulate one data set from the same mechanism as the example
    X<-runif(n,0,10)
    eX0<-1/(1+exp(-cbind(1,X) %*% al))
    Z<-rbinom(n,1,eX0)
    Y<-psi[1]*Z+psi[2]*Z*X+h.func(X)+rnorm(n)

    fit0<-lm(Y~-1+offset(h.func(X))+Z+Z:X)
    psi.hat<-coef(fit0)
    ests.mat[irep,1]<-psi.hat[1]+psi.hat[2]*mean(X[Z==1])                #Regression estimator

    eX<-fitted(glm(Z~X,family=binomial))
    w.hat<-Z + (1-Z)*eX/(1-eX)
    ests.mat[irep,2]<-sum((Z-(1-Z)*w.hat)*Y)/sum(Z)                      #Singly robust ATT estimator

    w.mod <- lm(Y~Z+Z:X,weights=w.hat)
    ests.mat[irep,3]<-coef(w.mod)[2]+coef(w.mod)[3]*mean(X[Z==1])        #Doubly robust ATT estimator 1

    w.mod2 <- lm(Y~Z*X,weights=w.hat)
    ests.mat[irep,4]<-coef(w.mod2)[2]+coef(w.mod2)[4]*mean(X[Z==1])      #Doubly robust ATT estimator 2
}
@

<<>>=
nv<-c('Regression','ATT','ATT-DR1','ATT-DR-2')
par(mar=c(3,3,2,0))
boxplot(ests.mat,names=nv,pch=19,cex=0.5)
abline(h=att,col='red',lty=2)   # large-sample ATT value computed in the earlier chunk
apply(ests.mat,2,var)*n         #Variance
@

\end{document}