\documentclass[notitlepage]{article}
\usepackage{Math}
\usepackage{listings}
\usepackage{numprint}
\usepackage{enumerate}
\usepackage{bbm}
\usepackage{bm}
\usepackage[all]{xy}
\usepackage{verbatim}
\usepackage{mathtools}
\usepackage{dsfont}

\DeclareMathAlphabet{\mathpzc}{OT1}{pzc}{m}{it}

\makeatletter
\newcommand{\raisemath}[1]{\mathpalette{\raisem@th{#1}}}
\newcommand{\raisem@th}[3]{\raisebox{#1}{$#2#3$}}
\makeatother

\usepackage{amsfonts,amsmath,amssymb}

\newtheorem{alg}{Algorithm}
\newtheorem{ex}{Example}
\newtheorem{note}{Note}
\newenvironment{proof}{\par\noindent{\normalfont{\bf{Proof
}}}}
{\hfill \small$\blacksquare$\\[2mm]}


\setlength{\parindent}{0in}

\def\E{\mathbb{E}}
\def\Var{\text{Var}}
\def\Cov{\text{Cov}}
\def\Corr{\text{Corr}}

\def\bW{\mathbf{W}}
\def\bH{\mathbf{H}}

\newcommand{\bs}[1]{\bm{#1}}
\newcommand{\Rho}{\mathrm{P}}
\newcommand{\cif}{\text{ if }}
\newcommand{\Ra}{\Longrightarrow}
\newcommand{\ra}{\longrightarrow}

\def\bphi{\boldsymbol \phi}


\lstloadlanguages{R}

\definecolor{keywordcolor}{rgb}{0,0.6,0.6}
\definecolor{delimcolor}{rgb}{0.461,0.039,0.102}
\definecolor{Rcommentcolor}{rgb}{0.101,0.043,0.432}

\lstdefinestyle{Rsettings}{
  basicstyle=\ttfamily,
  breaklines=true,
  showstringspaces=false,
  keywords={if, else, function, theFunction, tmp}, % Write as many keywords
  otherkeywords={},
  commentstyle=\itshape\color{Rcommentcolor},
  keywordstyle=\color{keywordcolor},
  moredelim=[s][\color{delimcolor}]{"}{"},
}

\lstset{basicstyle=\ttfamily, numbers=none, literate={~} {$\sim$}{2}}

\def\Xb{\mathbf{X}_{\beta}}
\def\Xp{\mathbf{X}_{\psi}}
\def\beps{\boldsymbol{\epsilon}}
\def\betahat{\widehat \beta}
\def\psihat{\widehat \psi}

\begin{document}

<<setup, cache=FALSE, echo=FALSE>>=
library(knitr)
# global chunk options
opts_chunk$set(cache=TRUE, autodep=TRUE)
options(scipen=6)
options(repos=c(CRAN="https://cloud.r-project.org/"))
@
\begin{center}
{\textsclarge{Example: Analysis of the Nhanes data}}
\end{center}

We study the publicly available (U.S.) National Health and Nutrition Examination Survey (NHANES). For this, we may use the libraries \texttt{NHANES}, \texttt{tableone}, and \texttt{Matching} in \texttt{R}. We will focus our analysis on the question of whether current smoking affects average systolic blood pressure. The variables we will need are:

\begin{itemize}

\item \texttt{BPSysAve} (systolic blood pressure, response),
\item \texttt{SmokeNow} (smoking status: 0--No, 1--Yes),

\item \texttt{Gender},

\item \texttt{Age},

\item \texttt{Race3},

\item \texttt{Education},

\item \texttt{MaritalStatus}, and

\item \texttt{Poverty}

\end{itemize}

where the first two are the outcome and exposure of interest and the remaining are potential confounders. Additionally, we will restrict our attention to 1377 adults ($>$ 17 years old) in the second wave of the survey.

<<NH0,comment='+', fig.width=7.25, fig.height=5.5, fig.align='center'>>=
library(NHANES); library(tableone); library(Matching) #Must be pre-loaded in R
NHANES$SmokeNow <- as.numeric(NHANES$SmokeNow)-1
small.nhanes <- na.omit(NHANES[NHANES$SurveyYr=="2011_12"
                      & NHANES$Age > 17,c(3,4,8:11,13,25,61)])
#dim(small.nhanes) ## 1377
vars <- c("Gender", "Age", "Race3", "Education", "MaritalStatus", "Poverty")
fit0<-lm(BPSysAve~SmokeNow,data=small.nhanes)
round(coef(summary(fit0)),5)
fmod<-formula(BPSysAve~SmokeNow+Gender+Age+Race3+Education+MaritalStatus+HHIncome+Poverty)
fit1<-lm(fmod,data=small.nhanes)
round(coef(summary(fit1))[1:2,],5)
@

\textbf{Checking confounder balance:} In PS--based methods, the goal of the treatment model is to eliminate imbalance in the distribution of covariates between treated and untreated subjects; achieving balance on other covariates (particularly strong predictors of treatment) is unhelpful.  The goal is \textit{not} to build an excellent predictive model for the treatment.  Common measures of balance include:
\begin{itemize}
    \item Standardized mean difference (SMD) or proportion:
    \[
    \frac{\bar{x}^{1,w}-\bar{x}^{0,w}}{\sqrt{(v^{1,w}+v^{0,w})/2}}
    \qquad \text{where} \qquad \bar{x}^{z,w} = \frac{1}{n} \sum_{i=1}^n \frac{\Ind_{z}(z_i) x_i}{f_{Z|X}^\Obs(z_i|x_i)},
    \]
        i.e.~the weighted sample mean of variable $X$ among those with treatment value $z$, and similarly $v^{z,w}$ is the weighted variance estimate. For all methods of analysis other than IPW, the weights are taken to be 1 for all subjects.  An SMD of 0.1 or less typically indicates reasonable balance.

    \item Visual examination of weighted empirical CDFs among the treated and untreated groups.
\end{itemize}
In the NHANES example, there is clear age imbalance between the two smoking groups.  This implies that if \texttt{Age} is also a possible predictor of the outcome, then it is a possible confounder.

<<NH1,comment='+', fig.width=6.25, fig.height=4.5, fig.align='center'>>=
tabUnmatched<-CreateTableOne(vars=vars,strata="SmokeNow",data=small.nhanes,test=FALSE)
print(tabUnmatched, smd = TRUE)
Smoke<-small.nhanes$SmokeNow;age0<-small.nhanes$Age[Smoke==0];age1<-small.nhanes$Age[Smoke==1]
ecdf0 <- ecdf(age0); ecdf1 <- ecdf(age1)
@
<<NH2,comment='+', fig.width=6.25, fig.height=4.25, fig.align='center',echo=FALSE>>=
par(mar=c(2,2,2,0))
plot(ecdf0, verticals=TRUE, do.points=FALSE,main='Empirical cdfs for Age',col='blue')
plot(ecdf1, verticals=TRUE, do.points=FALSE, add=TRUE, col='red')
legend(20,0.9,c('Smoke = 0', 'Smoke = 1'),col=c('blue','red'),lty=1)
@


\textbf{Building the propensity score:}  We attempt to build and assess a propensity score model using the available covariates:

<<NH4,comment='+', fig.width=6.25, fig.height=4.25, fig.align='center'>>=
fmods<-formula(SmokeNow~Gender+Age+Race3+Education+MaritalStatus+HHIncome+Poverty)
ps.mod <- glm(fmods,data=small.nhanes,family="binomial")
ps.lr <- predict(ps.mod,type="response")
ps0<-ps.lr[Smoke==0];ps1<-ps.lr[Smoke==1]
quints <- c(0,quantile(ps.lr,seq(.2,1,.2)))
rbind(table(cut(ps.lr[Smoke==0],quints)),table(cut(ps.lr[Smoke==1],quints)))
@
The table indicates that there is good representation of the exposed and unexposed groups in the five quintile-based strata.  We now verify by looking for overlap of the propensity score histograms.

<<NH4a,comment='+', fig.width=6.25, fig.height=4.25, fig.align='center'>>=
par(mar=c(2,2,2,0))
hist(ps0, col=rgb(0,0,1,0.5), breaks=seq(0,1,by=0.05), ylim=c(0,150),
        main="Propensity Score overlap", xlab="PS")
hist(ps1, col=rgb(1,0,0,0.5), breaks=seq(0,1,by=0.05), add=T);box()
legend(0.6,150,c('Smoke = 0', 'Smoke = 1'),col=c(rgb(0,0,1,0.5),rgb(1,0,0,0.5)),lty=1)
boxplot(ps0,ps1,ylab="PS",xlab="Treatment Group",names=c('Smoke=0','Smoke=1'),
            col=c(rgb(0,0,1,0.5),rgb(1,0,0,0.5)));abline(h=quints[2:5],col="red")
@
We can therefore proceed to check for balance knowing we have sufficient numbers of smokers and non-smokers in each quintile
to ensure the stratum-specific estimates are not too unstable.

<<NH5,comment='+', fig.width=6.25, fig.height=7.0, fig.align='center'>>=
Pcat<-as.numeric(cut(ps.lr,quints,include.lowest=T))
W<-(Smoke==0)/(1-ps.lr)+(Smoke==1)/ps.lr
smd.mat<-ExtractSmd(tabUnmatched)
for(k in 1:5){
    nhanesQ<-small.nhanes[Pcat == k,]
    tabQs <- CreateTableOne(vars = vars, strata = "SmokeNow", data = nhanesQ, test = FALSE)
    smd.mat<-cbind(smd.mat,ExtractSmd(tabQs))
}
colnames(smd.mat)<-c('Original','Q1','Q2','Q3','Q4','Q5')
round(smd.mat,4)
@
Balance does not appear to have been achieved: we have SMDs $>$ 0.1 for at least three quintiles for all variables, and the empirical CDFs of age do not overlap in several quintiles.

<<NH6,comment='+', fig.width=7.25, fig.height=8.5, fig.align='center',echo=TRUE>>=
par(mar=c(2,3,4,0),mfrow=c(3,2),oma=c(0,0,2,0))
for(k in 1:5){
    age0 <- small.nhanes$Age[Smoke==0 & Pcat==k]
    age1 <- small.nhanes$Age[Smoke==1 & Pcat==k]
    ecdf0 <- ecdf(age0)
    ecdf1 <- ecdf(age1)
    plot(ecdf0, verticals=TRUE, do.points=FALSE,
        main=substitute(paste('Quintile ',k),list(k=k)),col='blue')
    plot(ecdf1, verticals=TRUE, do.points=FALSE, add=TRUE, col='red')
}
plot(age0,type='n',ylim=range(0,1),axes=FALSE)
title("ECDFs for Age by PS quintile",outer = TRUE)
legend(30,0.75,c('Smoke = 0', 'Smoke = 1'),col=c('blue','red'),lty=1)
@

We may try to find evidence in smaller PS strata, such as those formed by deciles of the propensity score.  However, here this does not assist with providing evidence of balance.

<<NH7,comment='+', fig.width=7.25, fig.height=8.5, fig.align='center',echo=FALSE>>=
dec <- c(0,quantile(ps.lr,seq(0.1,1,.1)))
Pcat10 <- cut(ps.lr,dec,labels=1:10)
SMD.10.table <- ExtractSmd(tabUnmatched)
for(k in 1:10) {
   tabPSdec <- CreateTableOne(vars = vars, strata = "SmokeNow",
            data = small.nhanes[Pcat10==k,], test = FALSE)
   SMD.10.table <- cbind(SMD.10.table,ExtractSmd(tabPSdec))
}
colnames(SMD.10.table)<-c('Original','Q1','Q2','Q3','Q4','Q5','Q6','Q7','Q8','Q9','Q10')
@

\bigskip

\textbf{Matching:}  We now attempt matching using the propensity score.  The function \texttt{MatchBalance} from the \texttt{Matching} library provides many more details than \texttt{CreateTableOne}, including:
\begin{itemize}
  \item mean, median, and maximum difference in empirical CDF plots,
  \item mean, median, and maximum difference in empirical QQ plots,
  \item Kolmogorov-Smirnov statistics,
  \item ratio of variances,
  \item p-value for t-test.
\end{itemize}
Results shown in Appendix.  Note that SMDs reported are multiplied by 100.


<<NH8,comment='+', fig.width=7.25, fig.height=8.5, fig.align='center'>>=
small.nhanes$ps.lr<-ps.lr
ps.lr.match <- Match(Tr=small.nhanes$SmokeNow,X=small.nhanes$ps.lr,estimand="ATE",ties=FALSE)
matched.samp <- small.nhanes[c(ps.lr.match$index.control,ps.lr.match$index.treated),]
table(table(c(ps.lr.match$index.control, ps.lr.match$index.treated)))
tabMatched<-CreateTableOne(vars = vars, strata = "SmokeNow",data = matched.samp, test = FALSE)
@
<<NH9,comment='+', fig.width=6.25, fig.height=4.25, fig.align='center',echo=FALSE>>=
age0 <- matched.samp$Age[matched.samp$SmokeNow==0]
age1 <- matched.samp$Age[matched.samp$SmokeNow==1]
ecdf0 <- ecdf(age0)
ecdf1 <- ecdf(age1)
par(mar=c(2,2,2,0))
plot(ecdf0, verticals=TRUE, do.points=FALSE,main='Empirical cdfs for Age in matched sample',col='blue')
plot(ecdf1, verticals=TRUE, do.points=FALSE, add=TRUE, col='red')
legend(20,0.9,c('Smoke = 0', 'Smoke = 1'),col=c('blue','red'),lty=1)
@

\medskip
Matching seems to have matched the age profiles for the smoking and non-smoking groups.

<<NH9a,comment='+', fig.width=6.25, fig.height=4.25, fig.align='center',echo=FALSE>>=
smd.mat<-cbind(smd.mat,ExtractSmd(tabMatched))
colnames(smd.mat)<-c('Original','Q1','Q2','Q3','Q4','Q5',"Match")
round(smd.mat[,c(1,7)],4)
@

\bigskip

\textbf{Inverse Weighting:}  We construct an inverse weighted sample using the \texttt{survey} and \texttt{Hmisc} packages.
<<NH10,comment='+', fig.width=6.25, fig.height=4.25, fig.align='center'>>=
library(survey);library(Hmisc);library(htmlwidgets) #Must be pre-loaded in R
ps.lr.weight <- small.nhanes$SmokeNow/ps.lr + (1-small.nhanes$SmokeNow)/(1-ps.lr)
nhanes.IPW.lr <- svydesign(ids=~0, data=small.nhanes, weights=ps.lr.weight)
tabIPW<-svyCreateTableOne(vars = vars, strata = "SmokeNow",data = nhanes.IPW.lr, test = FALSE)
@

The summary of the reweighted samples are given in the Appendix; it is clear that in the weighted (pseudo) data balance holds well.

<<NH10a,comment='+', fig.width=6.25, fig.height=4.0, fig.align='center',echo=FALSE>>=
temp0 <- Ecdf(small.nhanes$Age[Smoke==0],weights=ps.lr.weight[Smoke==0],pl=FALSE)
temp1 <- Ecdf(small.nhanes$Age[Smoke==1],weights=ps.lr.weight[Smoke==1],pl=FALSE)
par(mar=c(2,2,2,0))
plot(temp0$x,temp0$y,ylab="ECDF(Age)",xlab="Age",
        main='Empirical cdfs for Age in weighted sample',col='blue',type="s",lwd=1)
lines(temp1$x,temp1$y,col="red",lwd=1,type='s')
smd.mat<-cbind(smd.mat,ExtractSmd(tabIPW))
colnames(smd.mat)<-c('Original','Q1','Q2','Q3','Q4','Q5','Match','IPW')
round(smd.mat[,c(1,8)],4)
@
Here the matched and inverse weighted samples exhibit good balance.

\bigskip

\textbf{Summary: }
\begin{itemize}
\item Creating or restoring confounder balance is essential to estimating a causal effect.
\item It can be hard to assess overlap or achieve balance in high dimensions.
\item The propensity score, a scalar summary of confounding variables, simplifies this task.
\item However:
\begin{itemize}
\item fitting a model for treatment does not guarantee balance,
\item fitting a model that predicts treatment with a high degree of precision can be unhelpful.
\end{itemize}
\end{itemize}

\pagebreak

\begin{center}
\textsclarge{Estimating the ATE} 
\end{center}


We proceed now to estimating the average treatment effect (ATE), using:
\begin{itemize}
\item outcome regression: 
\item PS stratification,
\item PS matching,
\item PS regression,
\item IPW \& AIPW
\end{itemize}
We will use the PS estimated via logistic regression, as this provided the best balance.  Note that the IPW and AIPW estimators can suffer from the fact that the inverse weights based on the propensity score can get large if the propensity score is near zero or one.  For these estimators, weight truncation can be used, where propensity score values more extreme than a given threshold (0.001 or 0.999 say) are set equal to these thresholds.



\begin{itemize}

\item \textbf{Outcome regression, unadjusted:}
<<NH11,comment='+', fig.width=7.25, fig.height=4.5, fig.align='center'>>=
coef(summary(lm(BPSysAve~SmokeNow,data=small.nhanes)))[2,]
@

\item \textbf{Outcome regression no interaction:}  We use the model
\[
\texttt{BPSysAve $\sim$ SmokeNow+Gender+Age+Race3+Education+MaritalStatus+HHIncome+Poverty}
\]
<<NH12,comment='+', fig.width=7.25, fig.height=4.5, fig.align='center'>>=
fmod<-formula(BPSysAve~SmokeNow+Gender+Age+Race3+Education+MaritalStatus+HHIncome+Poverty)
coef(lm(fmod,data=small.nhanes))[2]
@

<<NH12a,comment='+', fig.width=7.25, fig.height=4.5, fig.align='center'>>=
nhanes.allsmoke <- small.nhanes
nhanes.allsmoke$SmokeNow <- 1
nhanes.nosmoke <- small.nhanes
nhanes.nosmoke$SmokeNow <- 0
mod1.lm <- lm(fmod,data=small.nhanes)
APO.lm.1 <- mean(predict(mod1.lm,nhanes.allsmoke))
APO.lm.0 <- mean(predict(mod1.lm,nhanes.nosmoke))
APO.lm.1 - APO.lm.0  #Note: same as for previous answer regression 
@
The naive conditional effect estimate is more than 3 times greater than its confounder-adjusted counterpart.

\item \textbf{Outcome regression with interactions:}  We now add interaction terms
\begin{align*}
\texttt{BPSysAve} \sim & \texttt{SmokeNow+Gender+Age+Race3+Education+MaritalStatus+HHIncome+Poverty}\\
& \texttt{+SmokeNow:HHIncome+SmokeNow:Gender+SmokeNow:Age}
\end{align*}

<<NH13,comment='+', fig.width=7.25, fig.height=4.5, fig.align='center'>>=
fmodi<-formula(BPSysAve~SmokeNow+Gender+Age+Race3+Education+
        MaritalStatus+HHIncome+Poverty+SmokeNow:HHIncome+SmokeNow:Gender+SmokeNow:Age)
mod1.lmX <- lm(fmodi,data=small.nhanes)
coef(summary(mod1.lmX))[c(2,10,11,12),]
APO.lmX.1 <- mean(predict(mod1.lmX,nhanes.allsmoke))
APO.lmX.0 <- mean(predict(mod1.lmX,nhanes.nosmoke))
APO.lmX.1 - APO.lmX.0
@
Here the coefficients alone \textbf{do not} provide the ATE estimate, and we need to use the \texttt{predict} method to compute the ATE estimate as
\[
\frac{1}{n} \sum_{i=1}^n \mu(x_i,1;\widehat \beta) - \frac{1}{n} \sum_{i=1}^n \mu(x_i,0;\widehat \beta)
\]

\item \textbf{PS stratification:}  We first fit use the propensity score model based on the logistic regression with all possible confounders included
\[
\texttt{BPSysAve $\sim$ Gender+Age+Race3+Education+MaritalStatus+HHIncome+Poverty}
\]
and then used the stratification based on PS quintiles.

<<NH14,comment='+', fig.width=7.25, fig.height=4.5, fig.align='center'>>=
fmods<-formula(SmokeNow~Gender+Age+Race3+Education+MaritalStatus+HHIncome+Poverty)
ps.mod <- glm(fmods,data=small.nhanes,family="binomial")
ps.lr <- predict(ps.mod,type="response")
Y<-small.nhanes$BPSysAve
ps.lr.quints <- cut(ps.lr,quints,labels=1:5)
p.strat <- table(ps.lr.quints)/length(ps.lr.quints)
ATE.strat <- rep(NA,5)
for(j in 1:5) {
    ATE.strat[j]<-mean(Y[Smoke==1 & ps.lr.quints==j])-mean(Y[Smoke==0 & ps.lr.quints==j])
}
sum(ATE.strat*p.strat)
@

\item \textbf{PS matching:} We now use the same PS model for matching.
<<NH15,comment='+', fig.width=7.25, fig.height=4.5, fig.align='center'>>=
ps.lr.match <- Match(Tr=small.nhanes$SmokeNow,
      X=small.nhanes$ps.lr,estimand="ATE",ties=FALSE)
matched.samp <- small.nhanes[c(ps.lr.match$index.control,ps.lr.match$index.treated),]
mean(matched.samp$BPSysAve[matched.samp$SmokeNow == 1]) -
    mean(matched.samp$BPSysAve[matched.samp$SmokeNow == 0])
@

\item \textbf{PS regression:}  We now attempt three propensity score regression analyses based on increasingly complex specifications
\[
\begin{array}{lll}
\text{PSR 1} & Y & = \ \beta_0 + \beta_1 e(X;\widehat \alpha) + \theta_1 Z + \epsilon \\[6pt]
\text{PSR 2} & Y & = \ \beta_0 + \beta_1 e(X;\widehat \alpha) + \beta_2 \{e(X;\widehat \alpha) \}^2 + \theta_1 Z  + \theta_2 Z e(X;\widehat \alpha) + \theta_3 Z \{e(X;\widehat \alpha)\} +    \epsilon \\[6pt]
\text{PSR 3} & Y & = \ s(e(X;\widehat \alpha)) + \theta_1 Z +\theta_2 Z s(e(X;\widehat \alpha)) +   \epsilon 
\end{array}
\]
where $s(.)$ is a flexible spline function. 

<<NH16,comment='+', fig.width=7.25, fig.height=4.5, fig.align='center'>>=
library(splines)
mod1.PSlm1 <- lm(BPSysAve~SmokeNow+ps.lr,data=small.nhanes)
APO.PSlm1.1 <- mean(predict(mod1.PSlm1,nhanes.allsmoke))
APO.PSlm1.0 <- mean(predict(mod1.PSlm1,nhanes.nosmoke))
APO.PSlm1.1 - APO.PSlm1.0

mod1.PSlm2 <- lm(BPSysAve~(1+SmokeNow)*(ps.lr+I(ps.lr^2)),data=small.nhanes)
APO.PSlm2.1 <- mean(predict(mod1.PSlm2,nhanes.allsmoke))
APO.PSlm2.0 <- mean(predict(mod1.PSlm2,nhanes.nosmoke))
APO.PSlm2.1 - APO.PSlm2.0

mod1.PSlm3 <- lm(BPSysAve~(1+SmokeNow)*bs(ps.lr,df=4,Boundary.knots = range(0,1)),
                    data=small.nhanes)
APO.PSlm3.1 <- mean(predict(mod1.PSlm3,nhanes.allsmoke))
APO.PSlm3.0 <- mean(predict(mod1.PSlm3,nhanes.nosmoke))
APO.PSlm3.1 - APO.PSlm3.0
@


\item \textbf{IPW:} The IPW estimator can be computed using a direct calculation
\[
\frac{1}{n} \sum_{i=1}^n \frac{Z_i Y_i}{e(X_i;\widehat \alpha)} - \frac{1}{n} \sum_{i=1}^n \frac{(1-Z_i) Y_i}{1-e(X_i;\widehat \alpha)}
\]
or in its standardized weight form via weighted least squares.

<<NH17,comment='+', fig.width=7.25, fig.height=4.5, fig.align='center'>>=
ps.lr.weight <- Smoke/ps.lr + (1-Smoke)/(1-ps.lr)
mean(Smoke*Y*ps.lr.weight) -mean((1-Smoke)*Y*ps.lr.weight)  #IPW0
coef(lm(Y ~ Smoke, weights = ps.lr.weight))[2]              #IPW
@

\item \textbf{AIPW:} The augmented IPW estimator adds a proposed mean model for the APOs, that is
\begin{align*}
\widehat \mu(1) & =  \dfrac{1}{n} \sum_{i=1}^n \dfrac{Z_i (Y_i-\mu(X_i,1;\widehat \beta))}{e(X_i;\widehat \alpha)} + \dfrac{1}{n} \sum_{i=1}^n \mu(X_i,1;\widehat \beta) \\[6pt]
\widehat \mu(0) & =  \dfrac{1}{n} \sum_{i=1}^n \dfrac{(1-Z_i) (Y_i-\mu(X_i,0;\widehat \beta))}{1-e(X_i;\widehat \alpha)} + \dfrac{1}{n} \sum_{i=1}^n \mu(X_i,0;\widehat \beta)
\end{align*}
or computed via an augmented outcome regression with the two additional predictors
\[
\dfrac{Z_i}{e(X_i;\widehat \alpha)} \qquad \dfrac{(1-Z_i)}{1-e(X_i;\widehat \alpha)}.
\]
 
<<NH18,comment='+', fig.width=7.25, fig.height=4.5, fig.align='center'>>=
mod1 <- lm(fmodi,data=small.nhanes)
m1<-predict(mod1,nhanes.allsmoke)
m0<-predict(mod1,nhanes.nosmoke)
ps.trunc<-pmin(pmax(0.001,ps.lr),0.999)
APO.1<-mean(Smoke*(Y-m1)/ps.trunc)+mean(m1)
APO.0<-mean((1-Smoke)*(Y-m0)/(1-ps.trunc))+mean(m0)
APO.1-APO.0 

small.nhanes$ps <- ps.lr
fmodia<-formula(BPSysAve~SmokeNow+Gender+Age+Race3+Education+MaritalStatus+HHIncome+Poverty+
                SmokeNow:HHIncome+SmokeNow:Gender+SmokeNow:Age+
                I(SmokeNow/ps)+I((1-SmokeNow)/(1-ps)))
mod1.lmX <- lm(fmodia,data=small.nhanes)
nhanes.allsmoke <- nhanes.nosmoke <- small.nhanes; 
nhanes.allsmoke$SmokeNow <- 1
nhanes.nosmoke$SmokeNow <- 0
mean(predict(mod1.lmX,nhanes.allsmoke))-mean(predict(mod1.lmX,nhanes.nosmoke))
@
\end{itemize}



{\textbf{Additional considerations:}
\begin{itemize}
\item All of the PS approaches considered rely on substitution estimators.
\begin{itemize}
\item In PS regression, we plug in an estimated PS as a covariate.
\item In IPW, we plug in estimated weights.
\end{itemize}
\item We need to account for this when estimating standard errors and/or confidence intervals.
\item Analytically derived asymptotic variances can be used, but are not provided in many standard software packages.
\item The easiest approach is to bootstrap; however, that the bootstrap is \textit{not} in general valid for matching.
\end{itemize}

\pagebreak

\begin{center}
{\textsclarge{Bootstrap analysis}}
\end{center}

1000 bootstrap samples are used to approximate the sampling distribution of the various estimators.

<<NH19,comment='+', fig.width=7.25, fig.height=4.5, fig.align='center'>>=
nboot<-1000
set.seed(45)
ests.mat<-matrix(0,nrow=nboot,ncol=10)
n<-nrow(small.nhanes)
library(splines)
for(ib in 1:nboot){
    ind<-sample(1:n,size=n,rep=T)
    boot.nhanes<-small.nhanes[ind,]
    boot.allsmoke <- boot.nosmoke <- boot.nhanes
    boot.allsmoke$SmokeNow <- 1
    boot.nosmoke$SmokeNow <- 0

    ests.mat[ib,1]<-coef(lm(BPSysAve~SmokeNow,data=boot.nhanes))[2]
    ests.mat[ib,2]<-coef(lm(fmod,data=boot.nhanes))[2]

    mod1.lmX <- lm(fmodi,data=boot.nhanes)
    APO.lmX.1 <- mean(predict(mod1.lmX,boot.allsmoke))
    APO.lmX.0 <- mean(predict(mod1.lmX,boot.nosmoke))
    ests.mat[ib,3]<-APO.lmX.1 - APO.lmX.0

    ps.mod <- glm(fmods,data=boot.nhanes,family="binomial")
    ps.lr <- predict(ps.mod,type="response")
    ps0<-ps.lr[boot.nhanes$SmokeNow==0];ps1<-ps.lr[boot.nhanes$SmokeNow==1]
    quints <- c(0,quantile(ps.lr,seq(.2,1,.2)))
    Y<-boot.nhanes$BPSysAve
    ps.lr.quints <- cut(ps.lr,quints,labels=1:5)
    p.strat <- table(ps.lr.quints)/length(ps.lr.quints)
    ATE.strat <- rep(NA,5)
    for(j in 1:5) {
        ATE.strat[j] <- mean(Y[boot.nhanes$SmokeNow == 1 & ps.lr.quints==j])-
                        mean(Y[boot.nhanes$SmokeNow == 0 & ps.lr.quints==j])
    }
    ests.mat[ib,4]<-sum(ATE.strat*p.strat)

    boot.nhanes$ps.lr<-ps.lr
    ps.lr.match <- Match(Tr=boot.nhanes$SmokeNow,X=boot.nhanes$ps.lr,estimand="ATE",ties=FALSE)
    matched.samp <- boot.nhanes[c(ps.lr.match$index.control,ps.lr.match$index.treated),]
    ests.mat[ib,5]<-mean(matched.samp$BPSysAve[matched.samp$SmokeNow == 1]) -
                    mean(matched.samp$BPSysAve[matched.samp$SmokeNow == 0])

    mod1.PSlm1 <- lm(BPSysAve~SmokeNow+ps.lr,data=boot.nhanes)
    APO.PSlm1.1 <- mean(predict(mod1.PSlm1,boot.allsmoke))
    APO.PSlm1.0 <- mean(predict(mod1.PSlm1,boot.nosmoke))
    ests.mat[ib,6]<-APO.PSlm1.1 - APO.PSlm1.0

    mod1.PSlm2 <- lm(BPSysAve~(1+SmokeNow)*(ps.lr+I(ps.lr^2)),data=boot.nhanes)
    APO.PSlm2.1 <- mean(predict(mod1.PSlm2,boot.allsmoke))
    APO.PSlm2.0 <- mean(predict(mod1.PSlm2,boot.nosmoke))
    ests.mat[ib,7]<-APO.PSlm2.1 - APO.PSlm2.0

    mod1.PSlm3 <- lm(BPSysAve~(1+SmokeNow)*bs(ps.lr,df=4,Boundary.knots = range(0,1)),
                        data=boot.nhanes)
    APO.PSlm3.1 <- mean(predict(mod1.PSlm3,boot.allsmoke))
    APO.PSlm3.0 <- mean(predict(mod1.PSlm3,boot.nosmoke))
    ests.mat[ib,8]<-APO.PSlm3.1 - APO.PSlm3.0

    ps.lr.weight <- boot.nhanes$SmokeNow/ps.lr + (1-boot.nhanes$SmokeNow)/(1-ps.lr)
    ests.mat[ib,9]<-coef(lm(Y ~ boot.nhanes$SmokeNow, weights = ps.lr.weight))[2]

    boot.allsmoke <- boot.nosmoke <- boot.nhanes
    boot.allsmoke$SmokeNow <- 1
    boot.nosmoke$SmokeNow <- 0
    
    mod1 <- lm(fmodi,data=boot.nhanes)
    m1<-predict(mod1,boot.allsmoke)
    m0<-predict(mod1,boot.nosmoke)
    Smoke<-boot.nhanes$SmokeNow
    ps.trunc<-pmin(pmax(0.001,ps.lr),0.999)
    APO.1<-mean(Smoke*(Y-m1)/ps.trunc)+mean(m1)
    APO.0<-mean((1-Smoke)*(Y-m0)/(1-ps.trunc))+mean(m0)
    ests.mat[ib,10]<-APO.1-APO.0
}
@

<<NH20,comment='+', fig.width=6.25, fig.height=5.0, fig.align='center',echo=FALSE>>=
par(mar=c(4,2,1,0))
lvec<-c('Reg.','Reg. 2','Reg. 3','Strat.','Match.','PSR 1','PSR 2','PSR 3','IPW','AIPW')
boxplot(ests.mat,names=lvec,pch=19,cex=0.5,ylim=range(-10,5),cex.axis=0.75)
@
The boxplot demonstrates largely similar estimates from quite different analysis methods.  The estimates and standard errors are as follows.

<<NH21,comment='+', fig.width=7.25, fig.height=5.5, fig.align='center',echo=FALSE>>=
ATE<-apply(ests.mat,2,mean)
ATE.se<-sqrt(apply(ests.mat,2,var))
ATE.res<-cbind(ATE,ATE.se)
rownames(ATE.res)<-lvec
colnames(ATE.res)<-c('Est.','s.e.')
ATE.res
@

\pagebreak


\begin{center}
{\textsclarge{Appendix: summary of matched and weighted samples}}
\end{center}

\textbf{Matched sample summary:}
<<NH22,comment='+', fig.width=7.25, fig.height=5.5, fig.align='center'>>=
MatchBalance(SmokeNow~Gender+Age+Education+MaritalStatus+Poverty,
                   data=small.nhanes,match.out=ps.lr.match)
@

\pagebreak

\textbf{Weighted sample summary:}

<<NH23,comment='+', fig.width=7.25, fig.height=5.5, fig.align='center'>>=
print(tabIPW, smd = TRUE)
@
\end{document}