\documentclass[notitlepage]{article}
\usepackage{Math}
%\usepackage{../../Math598}
\usepackage{listings}
\usepackage{numprint}
\usepackage{enumerate}
\usepackage{bm}
\usepackage{bbm}
\usepackage{verbatim}
\usepackage{mathtools}
\usepackage{dsfont}
\usepackage{amsfonts,amsmath,amssymb}
\usepackage{tikz}
\usetikzlibrary{shapes,decorations,arrows,calc,arrows.meta,fit,positioning}
\usepackage{pgfplots}
\pgfplotsset{compat=1.17}
\newcommand*\circled[1]{\tikz[baseline=(char.base)]{
\node[shape=circle,draw,inner sep=2pt] (char) {#1};}}
\usepackage{xcolor}
\newtheorem{alg}{Algorithm}
\newtheorem{ex}{Example}
\newtheorem{note}{Note}
\newenvironment{proof}{\par\noindent{\normalfont{\bf{Proof }}}}
{\hfill \small$\blacksquare$\\[2mm]}
\setlength{\parindent}{0in}
\def\E{\mathbbmss{E}}
\def\Var{\text{Var}}
\def\Cov{\text{Cov}}
\def\Corr{\text{Corr}}
\def\bW{\mathbf{W}}
\def\bH{\mathbf{H}}
\newcommand{\bs}[1]{\bm{#1}}
\newcommand{\Rho}{\mathrm{P}}
\newcommand{\cif}{\text{ if }}
\newcommand{\Ra}{\Longrightarrow}
\def\bphi{\boldsymbol \phi}
\lstloadlanguages{R}
\definecolor{keywordcolor}{rgb}{0,0.6,0.6}
\definecolor{delimcolor}{rgb}{0.461,0.039,0.102}
\definecolor{Rcommentcolor}{rgb}{0.101,0.043,0.432}
\lstdefinestyle{Rsettings}{
basicstyle=\ttfamily,
breaklines=true,
showstringspaces=false,
keywords={if, else, function, theFunction, tmp}, % Write as many keywords
otherkeywords={},
commentstyle=\itshape\color{Rcommentcolor},
keywordstyle=\color{keywordcolor},
moredelim=[s][\color{delimcolor}]{"}{"},
}
\lstset{basicstyle=\ttfamily, numbers=none, literate={~} {$\sim$}{2}}
\def\Xb{\mathbf{X}_{\beta}}
\def\Xp{\mathbf{X}_{\psi}}
\def\beps{\boldsymbol{\epsilon}}
\def\betahat{\widehat \beta}
\def\psihat{\widehat \psi}
\begin{document}

<<>>=
library(knitr)
# global chunk options
opts_chunk$set(cache=TRUE, autodep=TRUE)
options(scipen=999)
options(repos=c(CRAN="https://cloud.r-project.org/"))
@

\begin{center}
{\textsclarge{Causal Adjustment Methods Using The Propensity Score}}
\end{center}

\tikzset{
-Latex,auto,node distance =1 cm and 1 cm,semithick,
state/.style ={circle, draw, minimum width = 0.7 cm},
box/.style ={rectangle, draw, minimum width = 0.7 cm, fill=lightgray},
point/.style = {circle, draw, inner sep=0.08cm,fill,node contents={}},
bidirected/.style={Latex-Latex,dashed},
el/.style = {inner sep=3pt, align=left, sloped}
}

The most common causal inference problem is to estimate the causal effect of treatment $Z$ on outcome $Y$ in the presence of confounders $X$. The causal effect is defined via the expectation under the experimental measure $\calE$,
\[
\mu(z) = \E_{Y|Z}^\calE [Y|Z=z] \qquad z \in \mathcal{Z}.
\]
The propensity score for a binary exposure, denoted $e(X)$, is defined via the observational conditional model for $Z$ given confounders $X$ as
\[
e(x) = f_{Z|X}^\calO (1|x) = {\Pr}_{Z|X}^\calO[Z=1|X=x]
\]
with $e(X)$ being the corresponding random variable. The propensity score is a \textit{balancing score}, that is, we have that $Z \independent X \given e(X)$. In the simulations below, we set $X = (X_1,X_2)$, but also introduce a third predictor $X_3$ which is not a confounder, but instead is purely a cause (or predictor) of $Z$.
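The balancing property can be verified directly by a standard iterated expectation argument: since $e(X)$ is a function of $X$, for any value $e$ in the support of $e(X)$,
\[
{\Pr}_{Z|X,e(X)}^\calO[Z=1|X=x,e(X)=e] = {\Pr}_{Z|X}^\calO[Z=1|X=x] = e(x) = e,
\qquad
{\Pr}_{Z|e(X)}^\calO[Z=1|e(X)=e] = \E_{X|e(X)}^\calO\left[e(X)|e(X)=e\right] = e,
\]
so the conditional distribution of $Z$ given $(X,e(X))$ depends on $X$ only through $e(X)$, which is precisely the statement $Z \independent X \given e(X)$.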
\begin{figure}[ht]
\centering
\begin{tikzpicture}[scale=1.5]
% x node set with absolute coordinates
\node[state] (z) at (0,0) {${Z}$};
\node[state] (y) at (2,0){${Y}$};
\node[state] (x) at (1,-2) {${X}$};
\node[state] (x3) at (-1,-2) {${X_3}$};
\path (x) edge (y);
\path (x) edge (z);
\path (z) edge (y);
\path (x3) edge (z);
% x node set with absolute coordinates
\node[state] (z1) at (4,0) {${Z}$};
\node[state] (y1) at (6,0){${Y}$};
\node[state] (x1) at (5,-2) {${X}$};
\node[state] (b1) at (4.5,-1) {$\scriptstyle e(X)$};
\node[state] (x31) at (3,-2) {${X_3}$};
\path (x1) edge (y1);
\path (x1) edge[line width=1pt, double distance=1pt, -{Classical TikZ Rightarrow[length=1.5mm]}] (b1);
\path (z1) edge (y1);
\path (b1) edge (z1);
\path (x31) edge (z1);
\node at (1,0.5) {Original DAG};
\node at (5,0.5) {DAG with propensity score $e(X)$};
\end{tikzpicture}
\end{figure}

Consider the following example where we generate according to the parametric model
\begin{itemize}
\item $(X_1,X_2,X_3)^\top \sim \textrm{Normal}_3(\mu, \Sigma)$ with
\[
\mu = \begin{bmatrix*}[r] 1 \\ -2 \\ -1 \end{bmatrix*} \qquad \qquad
\Sigma = \begin{bmatrix*}[r] 0.25 & 0.00 & 0.00 \\ 0.00 & 0.50 & 0.00 \\ 0.00 & 0.00 & 0.75 \end{bmatrix*}
\begin{bmatrix*}[r] 1.0 & 0.9 & 0.0 \\ 0.9 & 1.0 & 0.0 \\ 0.0 & 0.0 & 1.0 \end{bmatrix*}
\begin{bmatrix*}[r] 0.25 & 0.00 & 0.00 \\ 0.00 & 0.50 & 0.00 \\ 0.00 & 0.00 & 0.75 \end{bmatrix*}
\]
\item $Z|X = x \sim \text{Bernoulli}(e(x;\alpha))$, with
\[
e(x;\alpha) = \frac{\exp\{\alpha_0 + \alpha_1 x_1 + \alpha_2 x_2 + \alpha_3 x_3 + \alpha_4 x_1 x_2 \}}{1 + \exp\{\alpha_0 + \alpha_1 x_1 + \alpha_2 x_2 + \alpha_3 x_3 + \alpha_4 x_1 x_2 \}}
\]
with $\alpha = (\alpha_0,\alpha_1,\alpha_2,\alpha_3,\alpha_4)^\top = (11.5,-0.2,2.7,2,2)^\top$.
\item $Y|X=x,Z=z \sim \text{Normal}(\mu(x,z; \beta,\psi),\sigma^2)$, with
\[
\mu(x,z;\beta,\psi) = \beta_0 + \beta_1 x_1 + \beta_2 x_2 + \beta_3 x_1 x_2 + z (\psi_0 + \psi_1 x_1 + \psi_2 x_2 + \psi_3 x_1 x_2) = \mu_0(x;\beta) + z \mu_1(x;\psi)
\]
say, with $\sigma^2 = 0.1^2$, and
\[
\beta = (\beta_0,\beta_1,\beta_2,\beta_3)^\top = (10,-2.0,1.2,0.6)^\top \qquad
\psi = (\psi_0,\psi_1,\psi_2,\psi_3)^\top = (1,1,1,1)^\top.
\]
In this model, $X_1$ and $X_2$ are confounders, and $X_3$ is a predictor of $Z$ alone (and therefore is an \textit{instrument}); $X_3$ is independent of $(X_1,X_2)$.
\end{itemize}

<<>>=
#Data simulation
library(mvnfast);library(xtable)
set.seed(23987)
n<-5000
D<-diag(c(0.25,0.5,0.75))
Mu<-c(1,-2,-1)
Sigma<-D %*% (matrix(c(1,0.9,0.0,0.9,1,0.0,0,0,1),3,3) %*% D)
X<-rmvn(n,mu=Mu,Sigma)
al<-c(11.5,-0.2,2.7,2,2)
Xal<-cbind(1,X[,1],X[,2],X[,3],X[,1]*X[,2])
expit<-function(x){return(1/(1+exp(-x)))}
ps.true<-expit(Xal %*% al)
Z<-rbinom(n,1,ps.true)
be<-c(10,-2.0,1.2,0.6)
Xb<-cbind(1,X[,1],X[,2],X[,1]*X[,2])
mu0<- Xb %*% be
psi<-c(1,1,1,1)
Xp<-cbind(1,X[,1],X[,2],X[,1]*X[,2])
mu1<- (Xp %*% psi)
eta<-mu0 + Z * mu1 ; sig<-0.1
Y<-rnorm(n,eta,sig)
X1<-X[,1];X2<-X[,2];X3<-X[,3]
true.vals<-c(be,psi)[c(1,2,3,5,4,6,7,8)]
eX<-as.vector(Z-ps.true)
@

The average treatment effect (ATE) can be computed directly in terms of the model parameters:
\[
\mu(z) = \E_{Y|Z}^\calE [Y|Z=z] \equiv \E_{X}^\calO \left[\E_{Y|X,Z}^\calO [Y|X,Z=z] \right] = \E_{X}^\calO \left[\mu(X,z) \right]
\]
so in this case
\begin{align*}
\mu(z) & = \E_{X}^\calO \left[\beta_0 + \beta_1 X_1 + \beta_2 X_2 + \beta_3 X_1 X_2 + z (\psi_0 + \psi_1 X_1 + \psi_2 X_2 + \psi_3 X_1 X_2) \right]\\[6pt]
& = \beta_0 + \beta_1 \E[X_1] + \beta_2 \E[X_2] + \beta_3 \E[X_1 X_2] + z (\psi_0 + \psi_1 \E[X_1] + \psi_2 \E[X_2] + \psi_3 \E[X_1 X_2])
\end{align*}
Here we have that
\[
\E[X X^\top] = \begin{bmatrix*}[c] \E[X_1^2] & \E[X_1 X_2] & \E[X_1 X_3] \\[6pt] \E[X_1 X_2] & \E[X_2^2] & \E[X_2 X_3] \\[6pt] \E[X_1 X_3] & \E[X_2 X_3] & \E[X_3^2] \end{bmatrix*} = \Sigma + \mu \mu^\top =
\begin{bmatrix*}[r] \Sexpr{Sigma[1,1]} & \Sexpr{Sigma[1,2]} & \Sexpr{Sigma[1,3]} \\[6pt] \Sexpr{Sigma[2,1]} & \Sexpr{Sigma[2,2]} & \Sexpr{Sigma[2,3]} \\[6pt] \Sexpr{Sigma[3,1]} & \Sexpr{Sigma[3,2]} & \Sexpr{Sigma[3,3]} \end{bmatrix*} +
\begin{bmatrix*}[r] \Sexpr{Mu[1]*Mu[1]} & \Sexpr{Mu[1]*Mu[2]} & \Sexpr{Mu[1]*Mu[3]} \\[6pt] \Sexpr{Mu[2]*Mu[1]} & \Sexpr{Mu[2]*Mu[2]} & \Sexpr{Mu[2]*Mu[3]} \\[6pt] \Sexpr{Mu[3]*Mu[1]} & \Sexpr{Mu[3]*Mu[2]} & \Sexpr{Mu[3]*Mu[3]} \end{bmatrix*}
\]
which yields
<<>>=
(X.M<-Sigma + Mu %*% t(Mu))
@
and therefore $\mu(0)$ and $\mu(1)$ are
<<>>=
(mu0<-sum(be*c(1,Mu[1],Mu[2],X.M[1,2])))
(mu1<-mu0 + sum(psi*c(1,Mu[1],Mu[2],X.M[1,2])))
@
and hence the `causal effect' is
\[
\mu(1)-\mu(0) = \Sexpr{mu1}-\Sexpr{mu0} = \Sexpr{mu1-mu0}.
\]
<<>>=
par(mar=c(3,3,3,3))
pairs(cbind(X,Y),pch=19,cex=0.75,
  labels=c(expression(X[1]),expression(X[2]),expression(X[3]),expression(Y)))
@
A correctly specified linear regression model recovers the correct coefficients. The estimated $\psi$ values are given in the following output:
<<>>=
round(coef(summary(lm(Y~X1+X2+X1:X2+Z+Z:X1+Z:X2+Z:X1:X2)))[c(4,6:8),],6)
@
We now study the properties of the propensity score in this case. We need to inspect
\begin{itemize}
\item propensity score \textbf{overlap}: assess whether the propensity score distributions for the two treatment groups overlap sufficiently to allow comparison within propensity score strata;
\item \textbf{balance}: assess whether, within strata of the propensity score, the distributions of $X_1$ and $X_2$ are the same in the $Z=0$ and $Z=1$ groups.
\end{itemize}
Here we define the strata by partitioning the range of the propensity score into intervals of width 0.1; an equal-count alternative based on quantiles is sketched below. There is a reasonable spread of data across these strata.
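As an aside, fixed-width intervals are only one possible choice: an alternative is to stratify on quantiles of the propensity score, so that the strata contain roughly equal numbers of observations. The following is a minimal sketch, assuming the objects \texttt{ps.true} and \texttt{Z} created in the simulation chunk above; the objects \texttt{qs} and \texttt{ps.cat.q} are introduced purely for illustration and are not used subsequently.
<<>>=
# Quintile-based strata of the true propensity score (equal-count alternative);
# unique() guards against duplicate break points
qs <- unique(quantile(ps.true, probs=seq(0, 1, by=0.2)))
ps.cat.q <- cut(ps.true, breaks=qs, include.lowest=TRUE, labels=FALSE)
table(Z, ps.cat.q)
@
We continue with the width-0.1 strata in what follows.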
<<>>=
ps.cat<-cut(ps.true,seq(0,1,by=0.1),labels=FALSE);table(Z,ps.cat)
@

<<>>=
par(mar=c(4,4,2,0))
hist(ps.true,breaks=seq(0,1,by=0.02),main='Propensity score values',
  ylim=range(0,300),xlab=expression(e(X)));box()
p1 <- hist(ps.true[Z==0],breaks=seq(0,1,by=0.02),plot=FALSE)
p2 <- hist(ps.true[Z==1],breaks=seq(0,1,by=0.02),plot=FALSE)
par(mar=c(4,4,2,0))
plot( p1, col=rgb(0,0,1,1/4), xlim=c(0,1),
  xlab=expression(e(X)),main='PS Overlap',ylim=range(0,300));box()
plot( p2, col=rgb(1,0,0,1/4), xlim=c(0,1),add=T)
legend(0.6,300,c('Z=0','Z=1'),fill=c(rgb(0,0,1,1/4),rgb(1,0,0,1/4)))
@

<<>>=
par(mar=c(4,4,2,0),mfrow=c(4,3))
for(j in 1:10){
  boxplot(X1[ps.cat==j]~Z[ps.cat==j],ylim=range(X1),
    xlab=paste('Stratum',j),ylab=expression(X[1]))
}
par(mar=c(4,4,2,0),mfrow=c(4,3))
for(j in 1:10){
  boxplot(X2[ps.cat==j]~Z[ps.cat==j],ylim=range(X2),
    xlab=paste('Stratum',j),ylab=expression(X[2]))
}
@

There is reasonable propensity score overlap for the two treatment groups.

\bigskip

If we instead consider fitted values of the propensity score, obtained using logistic regression, the overlap is essentially unaffected.

<<>>=
ps.fit1<-fitted(fit1<-glm(Z~X1+X2+X3+X1:X2,family=binomial))
ps.cat1<-cut(ps.fit1,seq(0,1,by=0.1),labels=FALSE)
round(coef(summary(fit1)),6)
@

<<>>=
par(mar=c(4,4,2,0))
p1 <- hist(ps.fit1[Z==0],breaks=seq(0,1,by=0.02),plot=FALSE)
p2 <- hist(ps.fit1[Z==1],breaks=seq(0,1,by=0.02),plot=FALSE)
par(mar=c(4,4,2,0))
plot( p1, col=rgb(0,0,1,1/4), xlim=c(0,1),
  xlab=expression(e(X)),main='Fitted PS Overlap',ylim=range(0,300));box()
plot( p2, col=rgb(1,0,0,1/4), xlim=c(0,1),add=T)
legend(0.6,300,c('Z=0','Z=1'),fill=c(rgb(0,0,1,1/4),rgb(1,0,0,1/4)))
@

However, theory suggests that we can \textbf{omit} $X_3$ from the propensity score model as $X_3$ is not a confounder. This changes the fitted propensity score distribution considerably.

<<>>=
ps.fit2<-fitted(fit2<-glm(Z~X1+X2+X1:X2,family=binomial))
round(coef(summary(fit2)),6)
@

<<>>=
par(mar=c(4,4,2,0))
p1 <- hist(ps.fit2[Z==0],breaks=seq(0,1,by=0.02),plot=FALSE)
p2 <- hist(ps.fit2[Z==1],breaks=seq(0,1,by=0.02),plot=FALSE)
par(mar=c(4,4,2,0))
plot( p1, col=rgb(0,0,1,1/4), xlim=c(0,1),
  xlab=expression(e(X)),main='Fitted PS Overlap (confounders only)',ylim=range(0,300));box()
plot( p2, col=rgb(1,0,0,1/4), xlim=c(0,1),add=T)
legend(0.6,300,c('Z=0','Z=1'),fill=c(rgb(0,0,1,1/4),rgb(1,0,0,1/4)))
@

The overlap is much better. Balance is still largely induced in the confounder distributions, although the strata near the ends of the range are almost empty.

<<>>=
ps.cat2<-cut(ps.fit2,seq(0,1,by=0.1),labels=FALSE)
table(Z,ps.cat2)
@

<<>>=
par(mar=c(4,4,2,0),mfrow=c(4,3))
for(j in 1:10){
  if(sum(ps.cat2==j & Z==1)*sum(ps.cat2==j & Z==0) == 0) next
  boxplot(X1[ps.cat2==j]~Z[ps.cat2==j],ylim=range(X1),
    xlab=paste('Stratum',j),ylab=expression(X[1]))
}
par(mar=c(4,4,2,0),mfrow=c(4,3))
for(j in 1:10){
  if(sum(ps.cat2==j & Z==1)*sum(ps.cat2==j & Z==0) == 0) next
  boxplot(X2[ps.cat2==j]~Z[ps.cat2==j],ylim=range(X2),
    xlab=paste('Stratum',j),ylab=expression(X[2]))
}
@

Evidently, the overlap is reasonably good, but the boxplots imply a slight lack of balance. A numerical summary via within-stratum standardized mean differences is sketched below, and we can also investigate further using histograms.
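The following is a minimal sketch of such a check, assuming the objects \texttt{Z}, \texttt{X1}, \texttt{X2} and \texttt{ps.cat2} created above; the function \texttt{smd} and the table \texttt{smd.tab} are introduced purely for illustration and are not used subsequently. Values near zero indicate balance within a stratum.
<<>>=
# Within-stratum standardized mean differences for X1 and X2 between the
# Z=1 and Z=0 groups; strata where either group has at most one observation
# are left as NA.  A pooled standard deviation puts all strata on a common scale.
smd <- function(x, z){
  (mean(x[z==1]) - mean(x[z==0]))/sqrt((var(x[z==1]) + var(x[z==0]))/2)
}
smd.tab <- matrix(NA, 10, 2, dimnames=list(paste('Stratum', 1:10), c('X1', 'X2')))
for(j in 1:10){
  in.j <- (ps.cat2 == j)
  if(sum(in.j & Z==1) > 1 & sum(in.j & Z==0) > 1){
    smd.tab[j, 1] <- smd(X1[in.j], Z[in.j])
    smd.tab[j, 2] <- smd(X2[in.j], Z[in.j])
  }
}
round(smd.tab, 3)
@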
<<>>=
par(mar=c(4,4,2,0),mfrow=c(3,3))
for(j in 1:10){
  if(sum(ps.cat2==j & Z==1)*sum(ps.cat2==j & Z==0) == 0) next
  p1 <- hist(X1[ps.cat2==j & Z==0 & X1 > 0 & X1 < 2],breaks=seq(0,2,by=0.05),plot=FALSE)
  p2 <- hist(X1[ps.cat2==j & Z==1 & X1 > 0 & X1 < 2],breaks=seq(0,2,by=0.05),plot=FALSE)
  par(mar=c(4,4,2,0))
  plot( p1, col=rgb(0,0,1,1/4), xlim=c(0,2), xlab=expression(X[1]),main=paste('Stratum',j));box()
  plot( p2, col=rgb(1,0,0,1/4), xlim=c(0,2),add=T)
  legend('topleft',c('Z=0','Z=1'),fill=c(rgb(0,0,1,1/4),rgb(1,0,0,1/4)))
}
par(mar=c(4,4,2,0),mfrow=c(3,3))
for(j in 1:10){
  if(sum(ps.cat2==j & Z==1)*sum(ps.cat2==j & Z==0) == 0) next
  p1 <- hist(X2[ps.cat2==j & Z==0 & X2 > -4 & X2 < 2],breaks=seq(-4,2,by=0.1),plot=FALSE)
  p2 <- hist(X2[ps.cat2==j & Z==1 & X2 > -4 & X2 < 2],breaks=seq(-4,2,by=0.1),plot=FALSE)
  par(mar=c(4,4,2,0))
  plot( p1, col=rgb(0,0,1,1/4), xlim=c(-4,2), xlab=expression(X[2]),main=paste('Stratum',j));box()
  plot( p2, col=rgb(1,0,0,1/4), xlim=c(-4,2),add=T)
  legend('topleft',c('Z=0','Z=1'),fill=c(rgb(0,0,1,1/4),rgb(1,0,0,1/4)))
}
@

The apparent lack of balance is perhaps due to the fairly coarse propensity score categories.

\bigskip

The final question is whether we can recover good estimates using statistical approaches based on the propensity score. We first attempt stratification-based estimation of the ATE.

<<>>=
#True ps
local.ate<-rep(0,10)
for(j in 1:10){
  local.ate[j]<-mean(Y[ps.cat==j & Z == 1]) - mean(Y[ps.cat==j & Z == 0])
}
w0<-as.numeric(table(ps.cat)/n)
ate0<-sum(local.ate*w0)
ate0
#PS fit
local.ate1<-rep(0,10)
for(j in 1:10){
  local.ate1[j]<-mean(Y[ps.cat1==j & Z == 1]) - mean(Y[ps.cat1==j & Z == 0])
}
w1<-as.numeric(table(ps.cat1)/n)
ate1<-sum(local.ate1*w1)
ate1
#PS fit with confounders only
local.ate2<-rep(0,10)
for(j in 1:10){
  #Watch for empty strata
  if(sum(ps.cat2==j & Z==1)*sum(ps.cat2==j & Z==0) == 0){
    #Empty !
    local.ate2[j]<-0
  }else{
    local.ate2[j]<-mean(Y[ps.cat2==j & Z == 1]) - mean(Y[ps.cat2==j & Z == 0])
  }
}
w2<-as.numeric(tabulate(ps.cat2,10)/n)
ate2<-sum(local.ate2*w2)
ate2
@

These raw estimates seem satisfactory and quite close to the true value of \Sexpr{mu1-mu0}, although further computation would be needed to assess standard errors.

\bigskip

For illustration, we demonstrate the results obtained when the propensity score model does not contain all the confounders.

<<>>=
ps.fit3<-fitted(fit3<-glm(Z~X1,family=binomial))
round(coef(summary(fit3)),6)
@

<<>>=
par(mar=c(4,4,2,0))
p1 <- hist(ps.fit3[Z==0],breaks=seq(0,1,by=0.02),plot=FALSE)
p2 <- hist(ps.fit3[Z==1],breaks=seq(0,1,by=0.02),plot=FALSE)
par(mar=c(4,4,2,0))
plot( p1, col=rgb(0,0,1,1/4), xlim=c(0,1),
  xlab=expression(e(X)),main='Fitted PS Overlap (X1 only)',ylim=range(0,600));box()
plot( p2, col=rgb(1,0,0,1/4), xlim=c(0,1),add=T)
legend(0.8,600,c('Z=0','Z=1'),fill=c(rgb(0,0,1,1/4),rgb(1,0,0,1/4)))
@

<<>>=
ps.cat3<-cut(ps.fit3,seq(0,1,by=0.1),labels=FALSE)
table(Z,ps.cat3)
@

<<>>=
par(mar=c(4,4,1,0),mfrow=c(2,2))
for(j in 1:10){
  if(sum(ps.cat3==j & Z==1)*sum(ps.cat3==j & Z==0) == 0) next
  boxplot(X1[ps.cat3==j]~Z[ps.cat3==j],ylim=range(X1),
    xlab=paste('Stratum',j),ylab=expression(X[1]))
}
par(mar=c(4,4,1,0),mfrow=c(2,2))
for(j in 1:10){
  if(sum(ps.cat3==j & Z==1)*sum(ps.cat3==j & Z==0) == 0) next
  boxplot(X2[ps.cat3==j]~Z[ps.cat3==j],ylim=range(X2),
    xlab=paste('Stratum',j),ylab=expression(X[2]))
}
@

<<>>=
#PS containing X1 only
local.ate3<-rep(0,10)
for(j in 1:10){
  #Watch for empty strata
  if(sum(ps.cat3==j & Z==1)*sum(ps.cat3==j & Z==0) == 0){
    #Empty !
    local.ate3[j]<-0
  }else{
    local.ate3[j]<-mean(Y[ps.cat3==j & Z == 1]) - mean(Y[ps.cat3==j & Z == 0])
  }
}
w3<-as.numeric(tabulate(ps.cat3,10)/n)
ate3<-sum(local.ate3*w3)
ate3
@

This point estimate is clearly far away from the true value of \Sexpr{mu1-mu0}.

\pagebreak

We can now carry out a Monte Carlo study of the performance of the different estimation procedures: for 5000 replications with data sets of size $n=2000$, the following results compare four different specifications. Using the propensity score regression model
\[
\beta_0 + z (\psi_0 + \psi_1 x_1 + \psi_2 x_2 + \psi_3 x_1 x_2) + e(x) (\phi_0 + \phi_1 x_1 + \phi_2 x_2 + \phi_3 x_1 x_2)
\]
we use four methods for constructing the propensity score:
\begin{itemize}
\item M1: $e(x_1,x_2,x_3;\alpha)$ computed using the data generating $\alpha$ values;
\item M2: $e(x_1,x_2,x_3;\widehat \alpha)$ computed using fitted $\alpha$ values;
\item M3: $e(x_1,x_2;\widehat \alpha)$ computed using fitted $\alpha$ values omitting $X_3$;
\item M4: $e(x_1;\widehat \alpha)$ computed using fitted $\alpha$ values for a model only utilizing $X_1$.
\end{itemize}
The first three methods should give unbiased estimation, whereas the fourth should result in bias.
<<>>=
set.seed(2987)
n<-2000
nreps<-5000
ests<-array(0,c(nreps,4,4))
for(irep in 1:nreps){
  X<-rmvn(n,mu=Mu,Sigma)
  Xal<-cbind(1,X[,1],X[,2],X[,3],X[,1]*X[,2])
  ps.true<-expit(Xal %*% al)
  Z<-rbinom(n,1,ps.true)
  Xb<-cbind(1,X[,1],X[,2],X[,1]*X[,2])
  mu0<- Xb %*% be
  Xp<-cbind(1,X[,1],X[,2],X[,1]*X[,2])
  mu1<- (Xp %*% psi)
  eta<-mu0 + Z * mu1 ; sig<-0.1
  Y<-rnorm(n,eta,sig)
  X1<-X[,1];X2<-X[,2];X3<-X[,3]
  true.vals<-c(be,psi)[c(1,2,3,5,4,6,7,8)]
  #True PS
  eX<-ps.true
  ests[irep,1,]<-coef(lm(Y~(Z+Z:X1+Z:X2+Z:X1:X2)+(eX+eX:X1+eX:X2+eX:X1:X2)))[c(2,4,5,8)]
  #Fitted PS
  eX<-fitted(glm(Z~X1+X2+X3+X1:X2,family=binomial))
  ests[irep,2,]<-coef(lm(Y~(Z+Z:X1+Z:X2+Z:X1:X2)+(eX+eX:X1+eX:X2+eX:X1:X2)))[c(2,4,5,8)]
  #Fitted PS confounders only
  eX<-fitted(glm(Z~X1+X2+X1:X2,family=binomial))
  ests[irep,3,]<-coef(lm(Y~(Z+Z:X1+Z:X2+Z:X1:X2)+(eX+eX:X1+eX:X2+eX:X1:X2)))[c(2,4,5,8)]
  #Fitted PS mis-specified
  eX<-fitted(glm(Z~X1,family=binomial))
  ests[irep,4,]<-coef(lm(Y~(Z+Z:X1+Z:X2+Z:X1:X2)+(eX+eX:X1+eX:X2+eX:X1:X2)))[c(2,4,5,8)]
}
@
We report boxplots of the estimates over the replications to approximate the sampling distribution, and then numerical summaries of
\begin{itemize}
\item $\sqrt{n}$ times the \textit{bias} of the estimator
\item $n$ times the \textit{variance} of the estimator
\item $n$ times the \textit{mean-squared error} of the estimator.
\end{itemize}

<<>>=
par(mar=c(4,4,2,0))
nvec<-c('M1','M2','M3','M4')
boxplot(ests[,,1],names=nvec,pch=19,cex=0.75);abline(h=1,lty=2,col='red')
title(expression(psi[0]))
bias1<-apply(ests[,,1],2,mean)-psi[1]
var1<-apply(ests[,,1],2,var)
mse1<-(bias1^2+var1)
print('psi0')
res1<-cbind(sqrt(n)*bias1,n*var1,n*mse1)
colnames(res1)<-c('Bias','Var.','MSE')
rownames(res1)<-nvec
res1
boxplot(ests[,,2],names=nvec,pch=19,cex=0.75);abline(h=1,lty=2,col='red')
title(expression(psi[1]))
bias2<-apply(ests[,,2],2,mean)-psi[2]
var2<-apply(ests[,,2],2,var)
mse2<-(bias2^2+var2)
print('psi1')
res2<-cbind(sqrt(n)*bias2,n*var2,n*mse2)
colnames(res2)<-c('Bias','Var.','MSE')
rownames(res2)<-nvec
res2
boxplot(ests[,,3],names=nvec,pch=19,cex=0.75);abline(h=1,lty=2,col='red')
title(expression(psi[2]))
bias3<-apply(ests[,,3],2,mean)-psi[3]
var3<-apply(ests[,,3],2,var)
mse3<-(bias3^2+var3)
print('psi2')
res3<-cbind(sqrt(n)*bias3,n*var3,n*mse3)
colnames(res3)<-c('Bias','Var.','MSE')
rownames(res3)<-nvec
res3
boxplot(ests[,,4],names=nvec,pch=19,cex=0.75);abline(h=1,lty=2,col='red')
title(expression(psi[3]))
bias4<-apply(ests[,,4],2,mean)-psi[4]
var4<-apply(ests[,,4],2,var)
mse4<-(bias4^2+var4)
print('psi3')
res4<-cbind(sqrt(n)*bias4,n*var4,n*mse4)
colnames(res4)<-c('Bias','Var.','MSE')
rownames(res4)<-nvec
res4
@

The results are largely as expected: the methods that \emph{estimate} the $\alpha$ parameters yield lower variances; method M3, which uses the confounders only, has a lower variance than method M2, which uses the true data-generating propensity score specification; and method M4 produces a biased yet low-variance estimator.

\end{document}