\documentclass[notitlepage]{article}
\usepackage{Math}
%\usepackage{../../Math598}
\usepackage{listings}
\usepackage{numprint}
\usepackage{enumerate}
\usepackage{bm}
\usepackage{bbm}
\usepackage{verbatim}
\usepackage{mathtools}
\usepackage{dsfont}
\usepackage{scalerel}
\usepackage{amsfonts,amsmath,amssymb}
\usepackage{tikz}
\usetikzlibrary{shapes,decorations,arrows,calc,arrows.meta,fit,positioning}
\usepackage{pgfplots}
\pgfplotsset{compat=1.17}
\newcommand*\circled[1]{\tikz[baseline=(char.base)]{
  \node[shape=circle,draw,inner sep=2pt] (char) {#1};}}
\usepackage{xcolor}
\newtheorem{alg}{Algorithm}
\newtheorem{ex}{Example}
\newtheorem{note}{Note}
\newenvironment{proof}{\par\noindent{\normalfont{\bf{Proof }}}}{\hfill \small$\blacksquare$\\[2mm]}
\setlength{\parindent}{0in}
\def\E{\mathbbmss{E}}
\def\Var{\text{Var}}
\def\Cov{\text{Cov}}
\def\Corr{\text{Corr}}
\def\bW{\mathbf{W}}
\def\bH{\mathbf{H}}
\newcommand{\bs}[1]{\bm{#1}}
\newcommand{\Rho}{\mathrm{P}}
\newcommand{\cif}{\text{ if }}
\newcommand{\Ra}{\Longrightarrow}
\def\bphi{\boldsymbol \phi}
\lstloadlanguages{R}
\definecolor{keywordcolor}{rgb}{0,0.6,0.6}
\definecolor{delimcolor}{rgb}{0.461,0.039,0.102}
\definecolor{Rcommentcolor}{rgb}{0.101,0.043,0.432}
\lstdefinestyle{Rsettings}{
  basicstyle=\ttfamily,
  breaklines=true,
  showstringspaces=false,
  keywords={if, else, function, theFunction, tmp}, % Write as many keywords
  otherkeywords={},
  commentstyle=\itshape\color{Rcommentcolor},
  keywordstyle=\color{keywordcolor},
  moredelim=[s][\color{delimcolor}]{"}{"},
}
\lstset{basicstyle=\ttfamily, numbers=none, literate={~} {$\sim$}{2}}
\def\Xb{\mathbf{X}_{\beta}}
\def\Xp{\mathbf{X}_{\psi}}
\def\beps{\boldsymbol{\epsilon}}
\def\betahat{\widehat \beta}
\def\psihat{\widehat \psi}
\begin{document}

<<>>=
library(knitr)
# global chunk options
opts_chunk$set(cache=TRUE, autodep=TRUE)
options(scipen=6)
options(repos=c(CRAN="https://cloud.r-project.org/"))
knit_hooks$set(document = function(x) {sub('\\usepackage[]{color}',
  '\\usepackage{xcolor}', x, fixed = TRUE)})
@

\begin{center}
{\textsclarge{Causal adjustment for the Log-Linear Model}}
\end{center}

\tikzset{
  -Latex,auto,node distance =1 cm and 1 cm,semithick,
  state/.style ={circle, draw, minimum width = 0.7 cm},
  box/.style ={rectangle, draw, minimum width = 0.7 cm, fill=lightgray},
  point/.style = {circle, draw, inner sep=0.08cm,fill,node contents={}},
  bidirected/.style={Latex-Latex,dashed},
  el/.style = {inner sep=3pt, align=left, sloped}
}

In the following,
\begin{itemize}
\item $X$ is the random vector of confounders, and $x$ its observed values,
\item $\bx$ is the vector of functions of $x$ that form the linear predictor,
\item $\bX$ is the random variable version of $\bx$.
\end{itemize}

Suppose that a \emph{log-linear} model is deemed appropriate:
\[
\log \mu(x,z;\beta,\psi) = \bx_\beta \beta + z \bx_\psi \psi.
\]
Regarding this as the structural model, we then have that
\[
\E[Y(\rz)] \equiv \E_{Y|Z}^\calE[Y|Z=\rz] = \E_X^\calE [\exp\{\bX_\beta \beta + \rz \bX_\psi \psi\}]
\]
which suggests the plug-in estimator
\[
\widetilde \mu(\rz) = \frac{1}{n} \sum_{i=1}^n \exp\{\bx_{\beta i} \widehat \beta + \rz \bx_{\psi i} \widehat \psi\}
\]
where $(\widehat \beta, \widehat \psi)$ are estimated from a correctly specified model. The above model assumes that
\[
\mu(x,\rtmt;\beta,\psi) = \mu(x,\rnotmt;\beta,\psi) \exp\{ \bx_\psi \psi\}
\]
with the effect of $Z$ represented \emph{conditional} on $X$; \emph{marginally}, $\psi$ alone does not capture the effect of treatment.
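
To illustrate this last point numerically, the following is a minimal sketch (with hypothetical parameter values, unrelated to the simulation study below): when $\bx_\psi$ involves covariates, the conditional multiplicative effect $\exp\{\bx_\psi \psi\}$ varies with $x$, and the marginal effect $\E[Y(\rtmt)]/\E[Y(\rnotmt)]$, which averages the conditional means over $X$, does not coincide with the conditional effect at any fixed $x$.

<<>>=
#Minimal sketch (hypothetical parameter values, separate from the
#simulation study below): conditional vs marginal multiplicative effects
set.seed(1)
x<-rnorm(100000)
bev<-c(0.5,1); psv<-c(0.5,0.25)     #hypothetical beta and psi values
m0<-exp(bev[1]+bev[2]*x)            #E[Y(0) | x]
m1<-m0*exp(psv[1]+psv[2]*x)         #E[Y(1) | x] = E[Y(0) | x] exp(x_psi psi)
mean(m1)/mean(m0)                   #marginal effect E[Y(1)]/E[Y(0)]
exp(psv[1])                         #conditional effect at x = 0
@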
However, if we can estimate $\beta$ and $\psi$ consistently, then we can recover APO and ATE estimators from them. By standard regression arguments, \emph{correct specification} of the outcome model is needed for this.

<<>>=
set.seed(1293)
library(mvnfast)
expit<-function(x){return(1/(1+exp(-x)))}
muX<-c(-1,0.5,1)
Sig<-0.1*0.9^abs(outer(1:3,1:3,'-'))
al<-c(2,1,-2,0.75)
N<-1000000
X<-rmvn(N,muX,Sig)
Xa<-cbind(1,X[,1],X[,2],X[,1]*X[,2])
eX<-expit(Xa %*% al)
Z<-rbinom(N,1,eX)
Xb<-cbind(1,X[,1],X[,2],Z)
X1<-X[,1];X2<-X[,2]
be<-c(3,2,-2); psi<-1
muY<-exp(Xb %*% c(be,psi))
Y<-rpois(N,muY)
mu0<-exp(cbind(1,X[,1],X[,2],0) %*% c(be,psi))
mu1<-exp(cbind(1,X[,1],X[,2],1) %*% c(be,psi))
#True effects (approximated via a large-sample Monte Carlo estimate)
true.ate1<-mean(mu1)-mean(mu0)  #Additive ATE
true.ate2<-mean(mu1)/mean(mu0)  #Multiplicative ATE
c(true.ate1,true.ate2)
@

<<>>=
#Correctly specified model
fit0<-glm(Y~X1+X2+Z,family=poisson)
fit.mu0<-predict(fit0,newdata=data.frame(X1=X1,X2=X2,Z=0),type='response')
fit.mu1<-predict(fit0,newdata=data.frame(X1=X1,X2=X2,Z=1),type='response')
ate1.est<-mean(fit.mu1)-mean(fit.mu0)
ate2.est<-mean(fit.mu1)/mean(fit.mu0)
c(ate1.est,ate2.est)
@

<<>>=
#Mis-specified model
fit1<-glm(Y~X1+Z,family=poisson)
fit.mu0<-predict(fit1,newdata=data.frame(X1=X1,Z=0),type='response')
fit.mu1<-predict(fit1,newdata=data.frame(X1=X1,Z=1),type='response')
ate1.est<-mean(fit.mu1)-mean(fit.mu0)
ate2.est<-mean(fit.mu1)/mean(fit.mu0)
c(ate1.est,ate2.est)
@

For the log-linear model
\[
\log \mu(x,z;\beta,\psi) = \bx_\beta \beta + z \bx_\psi \psi
\]
the standard estimating equation takes the form
\[
\sum_{i=1}^n \begin{pmatrix} \bx_{\beta i}^\top \\[6pt] z_i \bx_{\psi i}^\top \end{pmatrix} (y_i - \exp\{\bx_{\beta i} \beta + z_i \bx_{\psi i} \psi\}) = \zerovec.
\]
We might try to make inference robust to mis-specification using a \emph{G-estimation}-like strategy, and modify the estimating equation to be
\[
\sum_{i=1}^n \begin{pmatrix} \bx_{\beta i}^\top \\[6pt] (z_i-e(x_i)) \bx_{\psi i}^\top \end{pmatrix} (y_i - \exp\{\bx_{\beta i} \beta + z_i \bx_{\psi i} \psi\}) = \zerovec
\]
where $e(x_i)$ is the propensity score. In the \emph{log-linear} case, however, this modification alone is not sufficient, and we must take the second estimating function to be
\[
\varphi(X,Y,Z) = (Z-e(X))\bX_{\psi}^\top \exp\{-Z \bX_\psi \psi\} (Y- \exp\{\bX_{\beta } \beta + Z \bX_{\psi} \psi\}).
\]
The factor $\exp\{-Z \bX_\psi \psi\}$ removes the multiplicative treatment effect from the outcome: writing $\mu_0(X) = \E[Y(\rnotmt)\,|\,X]$ for the true untreated mean, the structural assumption gives $\exp\{-Z \bX_\psi \psi\}\,\E[Y\,|\,X,Z] = \mu_0(X)$ at the true $\psi$, which does not depend on $Z$, and hence
\[
\E[\varphi(X,Y,Z)\,|\,X] = \E[Z-e(X)\,|\,X] \, \bX_\psi^\top \bigl(\mu_0(X) - \exp\{\bX_\beta \beta\}\bigr) = \zerovec
\]
whenever the propensity score is correctly specified, irrespective of whether the baseline term $\exp\{\bX_\beta \beta\}$ is correctly modelled. Thus for consistent estimation, we need to solve
\begin{equation}\label{eq:G-LL}
\sum_{i=1}^n \begin{pmatrix} \bx_{\beta i}^\top \\[6pt] (z_i-e(x_i)) \bx_{\psi i}^\top \exp\{-z_i \bx_{\psi i} \psi \} \end{pmatrix} (y_i - \exp\{\bx_{\beta i} \beta + z_i \bx_{\psi i} \psi\}) = \zerovec.
\end{equation}
In the following simulation, we estimate $\psi$ where the data generating model has
\[
\log \mu(x,z;\beta,\psi) = \beta_0 + \beta_1 x_1 + \beta_2 x_2 + z \psi
\]
for $Y \sim \text{Poisson}(\mu(x,z;\beta,\psi))$, and
\[
\text{logit}(e(x)) = \alpha_0 + \alpha_1 x_1 + \alpha_2 x_2 + \alpha_3 x_1 x_2
\]
using five analyses:
\begin{enumerate}[1.]
\item Correctly specified outcome regression;
\item Incorrectly specified outcome regression, with model
\[
\log \mu(x,z;\beta,\psi) = \beta_0 + \beta_1 x_1 + z \psi;
\]
\item G-estimation with the correctly specified outcome regression from 1.\ and a correctly specified propensity score model;
\item G-estimation with the incorrectly specified outcome regression from 2.\ and a correctly specified propensity score model;
\item G-estimation using the \texttt{drgee} package.
\end{enumerate}

G-estimation using equation \eqref{eq:G-LL} can be carried out using the package \texttt{nleqslv} in \texttt{R}, which is a generic non-linear equation solver.
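As a brief orientation to the solver, the following minimal sketch (on hypothetical toy data, separate from the simulation) shows that \texttt{nleqslv} finds a root of a vector-valued function of the parameter; passing it the ordinary Poisson score equations reproduces the corresponding \texttt{glm} fit.

<<>>=
library(nleqslv)
#Toy illustration (hypothetical data): solving the Poisson score equations
#t(X) (y - exp(X b)) = 0 with nleqslv reproduces the glm() fit
set.seed(42)
xt<-rnorm(200); zt<-rbinom(200,1,0.5)
yt<-rpois(200,exp(0.5+xt-0.5*zt))
Xm<-cbind(1,xt,zt)
score<-function(b,Xm,y){return(as.vector(t(Xm) %*% (y-exp(Xm %*% b))))}
nleqslv(rep(0,3),score,Xm=Xm,y=yt)$x   #root of the score equations
coef(glm(yt~xt+zt,family=poisson))     #should agree
@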
<<>>=
library(nleqslv)
library(drgee)
loglin<-function(xv,Xma,Xmb,zv,yv){
  #Joint estimating equations for the propensity and outcome parameters:
  #xv contains the propensity parameters followed by the outcome
  #parameters, with psi as the final element
  alv<-xv[1:ncol(Xma)]
  bev<-xv[(ncol(Xma)+1):(ncol(Xma)+ncol(Xmb))]
  eXv<-expit(Xma %*% alv)
  muv<-exp(Xmb %*% bev)
  #Replace the Z column of the outcome design matrix by
  #(z - e(x)) exp(-z psi) to form the modified psi estimating function
  Xbd<-Xmb
  Xbd[,ncol(Xmb)]<-(zv-eXv)*exp(-zv*bev[length(bev)])
  r1<-t(Xbd) %*% (yv - muv)   #outcome equations (beta and psi)
  r2<-t(Xma) %*% (zv - eXv)   #propensity score equations (alpha)
  return(c(r1,r2))
}
nreps<-1000
n<-1000
psi.ests<-matrix(0,nrow=nreps,ncol=5)
for(irep in 1:nreps){
  X<-rmvn(n,muX,Sig)
  Xa<-cbind(1,X[,1],X[,2],X[,1]*X[,2])
  eX<-expit(Xa %*% al)
  Z<-rbinom(n,1,eX)
  Xb<-cbind(1,X[,1],X[,2],Z)
  X1<-X[,1];X2<-X[,2]
  muY<-exp(Xb %*% c(be,psi))
  Y<-rpois(n,muY)
  #1. Correctly specified
  fit1<-glm(Y~X1+X2+Z+X1:X2,family=poisson)
  psi.ests[irep,1]<-coef(fit1)[4]
  #2. Mis-specified
  fit1<-glm(Y~X1+Z,family=poisson)
  psi.ests[irep,2]<-coef(fit1)[3]
  #3. G-estimation - correct specification
  xs<-rep(0,length(al)+length(be)+1)
  fit.ll<-nleqslv(xs,fn=loglin,Xma=Xa,Xmb=Xb,zv=Z,yv=Y,
                  control=list(ftol=1e-10,maxit=500))
  psi.ests[irep,3]<-fit.ll$x[length(fit.ll$x)]
  #4. G-estimation - incorrect specification
  Xa0<-Xa
  Xb0<-cbind(1,X1,Z)
  xs<-rep(0,ncol(Xa0)+ncol(Xb0))
  fit.ll<-nleqslv(xs,fn=loglin,Xma=Xa0,Xmb=Xb0,zv=Z,yv=Y,
                  control=list(ftol=1e-10,maxit=500))
  psi.ests[irep,4]<-fit.ll$x[length(fit.ll$x)]
  #5. Doubly robust G-estimation via drgee, with the mis-specified
  #outcome model from 2. and the correct propensity score model
  dr.est <- drgee(oformula = formula(Y~X1), eformula = formula(Z~X1*X2),
                  iaformula = formula(~1), olink = "log", elink = "logit",
                  estimation.method = "dr")
  psi.ests[irep,5]<-coef(dr.est)[1]
}
@

<<>>=
par(mar=c(2,2,2,0))
nv<-c('Correct','Incorrect','G-C','G-I','drgee')
boxplot(psi.ests,names=nv,pch=19,cex=0.7);abline(h=psi,col='red',lty=2)
bias.est<-apply(psi.ests-psi,2,mean)*sqrt(n)
var.est<-apply(psi.ests,2,var)*n
var.est  #Estimator variance (scaled by n)
mse.est<-bias.est^2+var.est
mse.est  #Estimator MSE (scaled by n)
@

In this example, the G-estimation methods provide unbiased estimation of the $\psi$ parameter, with estimator variance that is not inferior to that of the correctly specified outcome regression.

\bigskip

\textbf{Note:} Although we can recover unbiased estimates of $\psi$ in this analysis, we cannot in general recover estimates of the ATE using G-estimation if the outcome mean model is mis-specified, as it is not possible to recover consistent estimates of $\beta$. We can, however, achieve consistent estimation of the ATE using inverse probability weighting.

<<>>=
ipw.ests<-matrix(0,nrow=nreps,ncol=2)
for(irep in 1:nreps){
  X<-rmvn(n,muX,Sig)
  Xa<-cbind(1,X[,1],X[,2],X[,1]*X[,2])
  eX<-expit(Xa %*% al)
  Z<-rbinom(n,1,eX)
  Xb<-cbind(1,X[,1],X[,2],Z)
  X1<-X[,1];X2<-X[,2]
  muY<-exp(Xb %*% c(be,psi))
  Y<-rpois(n,muY)
  eX<-fitted(glm(Z~X1*X2,family=binomial))
  w0<-(1-Z)/(1-eX); w1<-Z/eX
  W0<-w0/sum(w0); W1<-w1/sum(w1)
  ipw.ests[irep,1]<-mean(w1*Y)-mean(w0*Y)  #IPW estimator, raw weights
  ipw.ests[irep,2]<-sum(W1*Y)-sum(W0*Y)    #IPW estimator, normalized weights
}
@

<<>>=
par(mar=c(2,2,2,0))
nv<-c(expression(tilde(delta)),expression(hat(delta)))
boxplot(ipw.ests,names=nv,pch=19,cex=0.7);abline(h=true.ate1,col='red',lty=2)
bias.est<-apply(ipw.ests-true.ate1,2,mean)*sqrt(n)
var.est<-apply(ipw.ests,2,var)*n
var.est  #Estimator variance (scaled by n)
mse.est<-bias.est^2+var.est
mse.est  #Estimator MSE (scaled by n)
@

We can compute sandwich estimates of the asymptotic variance using the methods described previously.
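For the joint system \eqref{eq:G-LL}, a minimal sketch of such a computation is given below: it assembles the usual $A^{-1} B A^{-\top}/n$ sandwich form from the per-observation estimating functions, re-using the \texttt{loglin} function and the last simulated data set from above, with the Jacobian $A$ obtained numerically via the \texttt{numDeriv} package (this sketch, including the numerical differentiation step, is illustrative rather than part of the original analysis).

<<>>=
#Minimal sketch (illustrative): sandwich variance for the joint
#G-estimation system, using loglin() and the last simulated data set
library(numDeriv)
Xb<-cbind(1,X1,X2,Z)   #outcome design matrix, correctly specified model
th<-nleqslv(rep(0,ncol(Xa)+ncol(Xb)),loglin,Xma=Xa,Xmb=Xb,zv=Z,yv=Y,
            control=list(ftol=1e-10,maxit=500))$x
phi.mat<-function(thv){ #n x p matrix of per-observation estimating functions
  t(sapply(1:n,function(i){
    loglin(thv,Xa[i,,drop=FALSE],Xb[i,,drop=FALSE],Z[i],Y[i])}))
}
A<-jacobian(function(thv){colMeans(phi.mat(thv))},th)  #A = E[d phi/d theta]
B<-crossprod(phi.mat(th))/n                            #B = E[phi phi^T]
V<-solve(A) %*% B %*% t(solve(A))/n   #sandwich covariance of the estimates
sqrt(diag(V))                         #standard errors; psi is the last element
@
\end{document}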