\documentclass[notitlepage]{article}
\usepackage{Math}
\usepackage{listings}
\usepackage{numprint}
\usepackage{enumerate}
\usepackage{bbm}
\usepackage[all]{xy}
\usepackage{verbatim}
\usepackage{mathtools}
\usepackage{dsfont}
\DeclareMathAlphabet{\mathpzc}{OT1}{pzc}{m}{it}
\makeatletter
\newcommand{\raisemath}[1]{\mathpalette{\raisem@th{#1}}}
\newcommand{\raisem@th}[3]{\raisebox{#1}{$#2#3$}}
\makeatother
\usepackage{amsfonts,amsmath,amssymb}
\newtheorem{alg}{Algorithm}
\newtheorem{ex}{Example}
\newtheorem{note}{Note}
\newenvironment{proof}{\par\noindent{\normalfont{\bf{Proof }}}}
{\hfill \small$\blacksquare$\\[2mm]}
\setlength{\parindent}{0in}
\def\E{\mathbbmss{E}}
\def\Var{\text{Var}}
\def\Cov{\text{Cov}}
\def\Corr{\text{Corr}}
\def\bW{\mathbf{W}}
\def\bH{\mathbf{H}}
\newcommand{\bs}[1]{\bm{#1}}
\newcommand{\Rho}{\mathrm{P}}
\newcommand{\cif}{\text{ if }}
\newcommand{\Ra}{\Longrightarrow}
\newcommand{\ra}{\longrightarrow}
\def\bphi{\boldsymbol \phi}
\lstloadlanguages{R}
\definecolor{keywordcolor}{rgb}{0,0.6,0.6}
\definecolor{delimcolor}{rgb}{0.461,0.039,0.102}
\definecolor{Rcommentcolor}{rgb}{0.101,0.043,0.432}
\lstdefinestyle{Rsettings}{
  basicstyle=\ttfamily,
  breaklines=true,
  showstringspaces=false,
  keywords={if, else, function, theFunction, tmp},
  otherkeywords={},
  commentstyle=\itshape\color{Rcommentcolor},
  keywordstyle=\color{keywordcolor},
  moredelim=[s][\color{delimcolor}]{"}{"},
}
\lstset{basicstyle=\ttfamily, numbers=none, literate={~} {$\sim$}{2}}
\def\Xb{\mathbf{X}_{\beta}}
\def\Xp{\mathbf{X}_{\psi}}
\def\beps{\boldsymbol{\epsilon}}
\def\betahat{\widehat \beta}
\def\psihat{\widehat \psi}
\usepackage{tikz}
\usetikzlibrary{shapes,decorations,arrows,calc,arrows.meta,fit,positioning}
\usepackage{pgfplots}
\pgfplotsset{compat=1.17}
\begin{document}

<<>>=
library(knitr)
# global chunk options
opts_chunk$set(cache=TRUE, autodep=TRUE)
options(scipen=999)
options(repos=c(CRAN="https://cloud.r-project.org/"))
knit_hooks$set(document = function(x) {sub('\\usepackage[]{color}',
  '\\usepackage{xcolor}', x, fixed = TRUE)})
@

\begin{center}
{\textsclarge{Longitudinal exposure and the Marginal Structural Model}}
\end{center}

Suppose we wish to study the effect of a longitudinal exposure pattern, over $K$ intervals, on an outcome. For $i=1,\ldots,n$ we observe
\begin{itemize}
\item Exposures $Z_{1i},\ldots,Z_{Ki}$;
\item Confounders $X_{1i},\ldots,X_{Ki}$;
\item Outcome $Y_i$.
\end{itemize}
For $K=2$, we might consider the following DAG structure:

\tikzset{
  -Latex,auto,node distance =1 cm and 1 cm,semithick,
  state/.style ={circle, draw, minimum width = 0.7 cm},
  box/.style ={rectangle, draw, minimum width = 0.7 cm, fill=lightgray},
  cbox/.style ={rounded rectangle, draw, minimum width = 0.7 cm, fill=lightgray},
  point/.style = {circle, draw, inner sep=0.08cm,fill,node contents={}},
  bidirected/.style={Latex-Latex,dashed},
  el/.style = {inner sep=3pt, align=left, sloped}
}
\begin{figure}[ht]
\centering
\begin{tikzpicture}[scale=1.5]
% nodes set with absolute coordinates
\node[state] (x1) at (-2,1) {${X_1}$};
\node[state] (z1) at (-1,0) {${Z_1}$};
\node[state] (x2) at (0,1) {${X_2}$};
\node[state] (z2) at (1,0) {${Z_2}$};
\node[state] (y) at (2,1) {${Y}$};
% directed edges
\path (x1) edge (z1);
\path (x1) edge (x2);
\path (x1) edge [bend left=45] (y);
\path (x2) edge (z2);
\path (z1) edge (x2);
\path (x1) edge (z2);
\path (x2) edge (y);
\path (z1) edge (z2);
\path (z1) edge (y);
\path (z2) edge (y);
\end{tikzpicture}
\caption{Observational DAG}\label{fig:ObsDAG}
\end{figure}

The full probability model for this DAG for observational distribution $\Obs$ factorizes as
\[
f_{X_1}^{\Obs}(x_1) f_{Z_1|X_1}^{\Obs}(z_1|x_1) f_{X_2|X_1,Z_1}^{\Obs}(x_2|x_1,z_1) f_{Z_2|X_1,X_2,Z_1}^{\Obs}(z_2|x_1,x_2,z_1) f_{Y|X_1,X_2,Z_1,Z_2}^{\Obs}(y|x_1,x_2,z_1,z_2).
\]
To consider the causal effect of an exposure pattern $Z=z$, where $Z=(Z_1,Z_2)$ and $z=(z_1,z_2)$, we first consider the following DAG:

\begin{figure}[ht]
\centering
\begin{tikzpicture}[scale=1.5]
% nodes set with absolute coordinates
\node[state] (x1) at (-2,1) {${X_1}$};
\node[state] (z1) at (-1,0) {${Z_1}$};
\node[state] (x2) at (0,1) {${X_2}$};
\node[state] (z2) at (1,0) {${Z_2}$};
\node[state] (y) at (2,1) {${Y}$};
% directed edges (exposures no longer depend on confounders)
\path (x1) edge (x2);
\path (x1) edge [bend left=45] (y);
\path (z1) edge (x2);
\path (x2) edge (y);
\path (z1) edge (z2);
\path (z1) edge (y);
\path (z2) edge (y);
\end{tikzpicture}
\caption{Experimental DAG}\label{fig:ExpDAG}
\end{figure}

The full probability model for this DAG for experimental distribution $\Exp$ factorizes as
\[
f_{X_1}^{\Exp}(x_1) f_{Z_1}^{\Exp}(z_1) f_{X_2|X_1,Z_1}^{\Exp}(x_2|x_1,z_1) f_{Z_2|Z_1}^{\Exp}(z_2|z_1) f_{Y|X_1,X_2,Z_1,Z_2}^{\Exp}(y|x_1,x_2,z_1,z_2).
\]
Therefore, the ratio of $f^\Exp$ to $f^\Obs$ becomes
\[
\dfrac{f_{X_1}^{\Exp}(x_1) f_{Z_1}^{\Exp}(z_1) f_{X_2|X_1,Z_1}^{\Exp}(x_2|x_1,z_1) f_{Z_2|Z_1}^{\Exp}(z_2|z_1) f_{Y|X_1,X_2,Z_1,Z_2}^{\Exp}(y|x_1,x_2,z_1,z_2)}{f_{X_1}^{\Obs}(x_1) f_{Z_1|X_1}^{\Obs}(z_1|x_1) f_{X_2|X_1,Z_1}^{\Obs}(x_2|x_1,z_1) f_{Z_2|X_1,X_2,Z_1}^{\Obs}(z_2|x_1,x_2,z_1) f_{Y|X_1,X_2,Z_1,Z_2}^{\Obs}(y|x_1,x_2,z_1,z_2)}
\]
which, under the usual assumptions (the components not involving the exposure assignment are identical under $\Obs$ and $\Exp$, so the $X_1$, $X_2$ and $Y$ terms cancel), becomes
\[
\dfrac{f_{Z_1}^{\Exp}(z_1) f_{Z_2|Z_1}^{\Exp}(z_2|z_1)}{f_{Z_1|X_1}^{\Obs}(z_1|x_1) f_{Z_2|X_1,X_2,Z_1}^{\Obs}(z_2|x_1,x_2,z_1)} = \dfrac{f_{Z_1,Z_2}^\Exp(z_1,z_2)}{f_{Z_1,Z_2|X_1,X_2}^{\Obs}(z_1,z_2|x_1,x_2)} \equiv \dfrac{f_{Z}^\Exp(z)}{f_{Z|X}^{\Obs}(z|x)}
\]
say.
\medskip

Using the inverse weighting (importance sampling) argument, we can write in the usual way
\begin{align*}
\mu(z) \equiv \E_{Y|Z}^\Exp [ Y | Z=z ] & = \dfrac{\displaystyle{\iiint}\Ind_{z}(t) \ y \ f_{Y|X,Z}^\Exp(y|x,t) f_{X,Z}^\Exp(x,t) \ dy \ dx \ dt}{\displaystyle{\iint} \Ind_{z}(t) f_{X,Z}^\Exp(x,t) \ dx \ dt} \\[6pt]
& = \dfrac{\displaystyle{\iiint}\Ind_{z}(t) \ y \ \dfrac{f_{Z}^\Exp(t)}{f_{Z|X}^\Obs(t|x)} f_{X,Z}^\Obs(x,t) \ dy \ dx \ dt}{\displaystyle{\iint} \Ind_{z}(t) \dfrac{f_{Z}^\Exp(t)}{f_{Z|X}^\Obs(t|x) } f_{X,Z}^\Obs(x,t) \ dx \ dt}
\end{align*}
and as the term $f_{Z}^\Exp(z)$ is a constant (it does not depend on $x$ or $y$) that can be extracted from the two integrals, we have that
\begin{equation}
\label{eq:mu-1}
\mu(z) = \dfrac{\E_{X,Y,Z}^\Obs \left[ \dfrac{\Ind_{z}(Z)}{f_{Z|X}^\Obs(Z|X)} Y \right]}{\E_{X,Z}^\Obs \left[ \dfrac{\Ind_{z}(Z)}{f_{Z|X}^\Obs(Z|X)}\right]}
\end{equation}
where $\mu(z) = \mu(z_1,z_2)$, $\Ind_{z}(Z) = \prod\limits_{k=1}^2 \Ind_{z_k}(Z_k)$, and
\[
f_{Z|X}^\Obs(z|x) = f_{Z_1|X_1}^{\Obs}(z_1|x_1) f_{Z_2|X_1,X_2,Z_1}^{\Obs}(z_2|x_1,x_2,z_1).
\]
As usual, we have that
\[
\E_{X,Z}^\Obs \left[ \dfrac{\Ind_{z}(Z)}{f_{Z|X}^\Obs(Z|X)}\right] = \E_{X}^\Obs \left[ \E_{Z|X}^\Obs \left[ \dfrac{\Ind_{z}(Z)}{f_{Z|X}^\Obs(Z|X)} \bigg | X \right] \right ] = \E_{X}^\Obs \left[ 1 \right] = 1
\]
as
\[
\E_{Z|X}^\Obs \left[ \Ind_{z}(Z) | X \right] = f_{Z|X}^\Obs(z|X),
\]
so we also may write
\begin{equation}
\label{eq:mu-2}
\mu(z) = \E_{X,Y,Z}^\Obs \left[ \dfrac{\Ind_{z}(Z)}{f_{Z|X}^\Obs(Z|X)} Y \right].
\end{equation}
As in the single time point case, the moment-based estimators corresponding to \eqref{eq:mu-1} and \eqref{eq:mu-2} are
\[
\widehat \mu(z) = \dfrac{\sum\limits_{i=1}^n W_{zi} Y_i }{\sum\limits_{i=1}^n W_{zi}} \qquad \qquad
\widetilde \mu(z) = \frac{1}{n} \sum_{i=1}^n W_{zi} Y_i
\qquad \text{where} \qquad
W_{zi} = \dfrac{\Ind_{z}(Z_i)}{f_{Z|X}^\Obs(Z_i|X_i)}.
\]
\medskip

\textbf{Simulated Data Set:} Consider the following data generation mechanism:
\begin{itemize}
\item $X_1 \sim \text{Normal}(1,1)$;
\item $Z_1|X_1 = x_1 \sim \text{Bernoulli}(g(\bx_1 \alpha_1))$, with $\bx_1 = (1,x_1)$ and $\alpha_1 = (1,-0.5)^\top$;
\item $X_2|X_1 = x_1, Z_1 = z_1 \sim \text{Normal}(x_1 + 2 z_1,1)$;
\item $Z_2|X_1 = x_1, Z_1 = z_1 ,X_2 = x_2 \sim \text{Bernoulli}(g(\bx_2 \alpha_2))$, with $\bx_2 = (1,x_1,z_1,x_2)$ and $\alpha_2 = (1.0,0.1,0.5,-0.5)^\top$;
\item $Y|X_1 = x_1, Z_1 = z_1 ,X_2 = x_2, Z_2 = z_2 \sim \text{Normal}(x_1 + z_1 + x_2 + z_2,3^2)$;
\end{itemize}
where $g(t) = \exp\{t\}/(1+\exp\{t\})$ is the logistic (expit) function.
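Before computing the causal quantities, the identity $\E_{X,Z}^\Obs\left[\Ind_{z}(Z)/f_{Z|X}^\Obs(Z|X)\right]=1$ derived above can be checked numerically under this mechanism; the following self-contained sketch (with an arbitrary seed and sample size) computes the sample mean of the weight for each of the four exposure patterns, all of which should be close to 1 up to Monte Carlo error.
<<>>=
# Monte Carlo check that E[ I_z(Z)/f(Z|X) ] = 1 for each exposure pattern z
set.seed(56); m<-100000
expit<-function(x){return(1/(1+exp(-x)))}
X1<-rnorm(m,1,1)
ps.1<-expit(1.0-0.5*X1)               #true f(Z1=1|X1)
Z1<-rbinom(m,1,ps.1)
X2<-rnorm(m,X1+2*Z1,1)
ps.2<-expit(1.0+0.1*X1+0.5*Z1-0.5*X2) #true f(Z2=1|X1,Z1,X2)
Z2<-rbinom(m,1,ps.2)
f.ZX<-(ps.1^Z1*(1-ps.1)^(1-Z1))*(ps.2^Z2*(1-ps.2)^(1-Z2))
c(mean((1-Z1)*(1-Z2)/f.ZX),mean(Z1*(1-Z2)/f.ZX),
  mean((1-Z1)*Z2/f.ZX),mean(Z1*Z2/f.ZX))  #all four should be near 1
@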
In this model, we have that
\[
\E_{Y|X_1,Z_1,X_2,Z_2}^\Exp[Y|X_1 = x_1, Z_1 = z_1 ,X_2 = x_2, Z_2 = z_2] = x_1 + z_1 + x_2 + z_2
\]
so that the causal quantity of interest is based on
\begin{align*}
\mu(z_1,z_2) = \E_{X_1,X_2|Z}^\Exp[X_1 + X_2 |Z=z] + z_1 + z_2 & = \E_{X_1}^\Exp[X_1] + \E_{X_1}^\Exp [ \E_{X_2|X_1,Z}^\Exp[X_2|X_1,Z=z]] + z_1 + z_2\\[6pt]
& = 1 + (1+2 z_1) + z_1 + z_2 = 2 + 3z_1 + z_2.
\end{align*}
Hence
\[
\mu(0,0) = 2 \qquad \mu(1,0) = 5 \qquad \mu(0,1) = 3 \qquad \mu(1,1) = 6.
\]
<<>>=
set.seed(34);n<-10000
expit<-function(x){return(1/(1+exp(-x)))}
X1<-rnorm(n,1,1)
al1<-c(1.0,-0.5); al2<-c(1.0,0.1,0.5,-0.5)
eta1<-al1[1]+al1[2]*X1
ps.1<-expit(eta1)
Z1<-rbinom(n,1,ps.1)
X2<-rnorm(n,X1+2*Z1,1)
eta2<-al2[1]+al2[2]*X1+al2[3]*Z1+al2[4]*X2
ps.2<-expit(eta2)
Z2<-rbinom(n,1,ps.2)
eps<-rnorm(n,0,3)
Y<-X1+X2+Z1+Z2+eps
mu.00<-mean(X1)+mean(X2)
mu.10<-mean(X1)+mean(X2)+1
mu.01<-mean(X1)+mean(X2)+1
mu.11<-mean(X1)+mean(X2)+2
c(mu.00,mu.10,mu.01,mu.11) #naive plug-in values using observational means
@
These naive plug-in values, which use the observational sample means of $X_1$ and $X_2$ in place of their experimental counterparts, are quite different from the causal quantities, as expected due to confounding. In the single timepoint case, we can recover the quantities of interest via a regression of $Y$ on $(X,Z)$. However, here regression does not give the correct result.
<<>>=
fit1<-lm(Y~X1+X2+Z1+Z2);round(coef(summary(fit1)),6)
mu.00.hat<-mean(predict(fit1,newdata=data.frame(X1,X2,Z1=0,Z2=0)))
mu.10.hat<-mean(predict(fit1,newdata=data.frame(X1,X2,Z1=1,Z2=0)))
mu.01.hat<-mean(predict(fit1,newdata=data.frame(X1,X2,Z1=0,Z2=1)))
mu.11.hat<-mean(predict(fit1,newdata=data.frame(X1,X2,Z1=1,Z2=1)))
c(mu.00.hat,mu.10.hat,mu.01.hat,mu.11.hat)
@
The usual regression approach does not yield the correct result here because the effect of the exposure at the first time interval is partly mediated (that is, the path $Z_1 \rightarrow X_2 \rightarrow Y$); conditioning on $X_2$, which would be necessary for a correctly specified regression model, has the effect of blocking the mediating path. Note that whereas the coefficient attached to $Z_2$ recovers the causal effect of $Z_2$ (true value 1, estimated value \Sexpr{round(coef(summary(fit1)),6)[5,1]}), the coefficient attached to $Z_1$ does not recover the total causal effect of $Z_1$: it estimates the direct (conditional) effect, 1, rather than the total effect, 3 (estimated value \Sexpr{round(coef(summary(fit1)),6)[4,1]}).

\bigskip

\textbf{Inverse weighting:} Inverse weighting proceeds using the weights based on
\[
W_z = \frac{\Ind_{z_1}(Z_1)}{e(X_1)^{z_1} (1-e(X_1))^{1-z_1}} \times \frac{\Ind_{z_2}(Z_2)}{e(X_1,Z_1,X_2)^{z_2} (1-e(X_1,Z_1,X_2))^{1-z_2}},
\]
where $e(x_1) = \Pr(Z_1 = 1|X_1 = x_1)$ and $e(x_1,z_1,x_2) = \Pr(Z_2 = 1|X_1 = x_1, Z_1 = z_1, X_2 = x_2)$ are the two propensity scores, and obtains the correct results. For the version given by \eqref{eq:mu-2} we have
<<>>=
W<-(ps.1^Z1*(1-ps.1)^(1-Z1))*(ps.2^Z2*(1-ps.2)^(1-Z2)) #f(Z|X)
W00<-(1-Z1)*(1-Z2)/W
W10<-(Z1)*(1-Z2)/W
W01<-(1-Z1)*(Z2)/W
W11<-(Z1)*(Z2)/W
mu.00.tilde<-mean(W00*Y)
mu.10.tilde<-mean(W10*Y)
mu.01.tilde<-mean(W01*Y)
mu.11.tilde<-mean(W11*Y)
c(mu.00.tilde,mu.10.tilde,mu.01.tilde,mu.11.tilde)
@
whereas for the version given by \eqref{eq:mu-1} we have
<<>>=
W00.star<-W00/sum(W00)
W10.star<-W10/sum(W10)
W01.star<-W01/sum(W01)
W11.star<-W11/sum(W11)
mu.00.hat<-sum(W00.star*Y)
mu.10.hat<-sum(W10.star*Y)
mu.01.hat<-sum(W01.star*Y)
mu.11.hat<-sum(W11.star*Y)
c(mu.00.hat,mu.10.hat,mu.01.hat,mu.11.hat)
@

\textbf{Simulation Study:} We can now replicate this study over 2000 replications, with a smaller sample size of $n=1000$.
<<>>=
set.seed(34);n<-1000;nreps<-2000
al1<-c(1.0,-0.5);al2<-c(1.0,0.1,0.5,-0.5)
ests.mat<-array(0,c(nreps,3,4))
for(irep in 1:nreps){
  X1<-rnorm(n,1,1)
  eta1<-al1[1]+al1[2]*X1
  ps.1<-expit(eta1)
  Z1<-rbinom(n,1,ps.1)
  X2<-rnorm(n,X1+2*Z1,1)
  eta2<-al2[1]+al2[2]*X1+al2[3]*Z1+al2[4]*X2
  ps.2<-expit(eta2)
  Z2<-rbinom(n,1,ps.2)
  eps<-rnorm(n,0,3)
  Y<-X1+X2+Z1+Z2+eps
  #outcome regression (OR) estimator
  fit1<-lm(Y~X1+X2+Z1+Z2)
  mu.00.hat<-mean(predict(fit1,newdata=data.frame(X1,X2,Z1=0,Z2=0)))
  mu.10.hat<-mean(predict(fit1,newdata=data.frame(X1,X2,Z1=1,Z2=0)))
  mu.01.hat<-mean(predict(fit1,newdata=data.frame(X1,X2,Z1=0,Z2=1)))
  mu.11.hat<-mean(predict(fit1,newdata=data.frame(X1,X2,Z1=1,Z2=1)))
  ests.mat[irep,1,]<-c(mu.00.hat,mu.10.hat,mu.01.hat,mu.11.hat)
  #unnormalized IPW estimator
  W<-(ps.1^Z1*(1-ps.1)^(1-Z1))*(ps.2^Z2*(1-ps.2)^(1-Z2))
  W00<-(1-Z1)*(1-Z2)/W
  W10<-(Z1)*(1-Z2)/W
  W01<-(1-Z1)*(Z2)/W
  W11<-(Z1)*(Z2)/W
  mu.00.tilde<-mean(W00*Y)
  mu.10.tilde<-mean(W10*Y)
  mu.01.tilde<-mean(W01*Y)
  mu.11.tilde<-mean(W11*Y)
  ests.mat[irep,2,]<-c(mu.00.tilde,mu.10.tilde,mu.01.tilde,mu.11.tilde)
  #normalized IPW estimator
  W00.star<-W00/sum(W00)
  W10.star<-W10/sum(W10)
  W01.star<-W01/sum(W01)
  W11.star<-W11/sum(W11)
  mu.00.hat<-sum(W00.star*Y)
  mu.10.hat<-sum(W10.star*Y)
  mu.01.hat<-sum(W01.star*Y)
  mu.11.hat<-sum(W11.star*Y)
  ests.mat[irep,3,]<-c(mu.00.hat,mu.10.hat,mu.01.hat,mu.11.hat)
}
true.mu<-c(2,5,3,6);par(mar=c(3,2,2,2),mfrow=c(2,2))
nvec<-c("OR",expression(tilde(mu)(z)),expression(hat(mu)(z)))
boxplot(ests.mat[,,1],names=nvec,pch=19,cex=0.6,col='gray',main=expression(mu(0,0)))
abline(h=true.mu[1],col='red')
boxplot(ests.mat[,,2],names=nvec,pch=19,cex=0.6,col='gray',main=expression(mu(1,0)))
abline(h=true.mu[2],col='red')
boxplot(ests.mat[,,3],names=nvec,pch=19,cex=0.6,col='gray',main=expression(mu(0,1)))
abline(h=true.mu[3],col='red')
boxplot(ests.mat[,,4],names=nvec,pch=19,cex=0.6,col='gray',main=expression(mu(1,1)))
abline(h=true.mu[4],col='red')
@
The outcome regression estimator (OR) is evidently biased, whereas the two inverse weighting estimators are unbiased.

Consider now a second specification, identical to the first except for the component that specifies that
\[
\E_{Y|X_1,Z_1,X_2,Z_2}^\Exp[Y|X_1 = x_1, Z_1 = z_1 ,X_2 = x_2, Z_2 = z_2] = x_1 + z_1 + x_2 + z_2 + x_1 z_1,
\]
that is, with an interaction between $Z_1$ and $X_1$. The causal quantity of interest is now given by
\begin{align*}
\mu(z_1,z_2) = \E_{X_1,X_2|Z}^\Exp[X_1 + X_2 + X_1 z_1 |Z=z] + z_1 + z_2 & = \E_{X_1}^\Exp[X_1](1+z_1) + \E_{X_1}^\Exp [ \E_{X_2|X_1,Z}^\Exp[X_2|X_1,Z=z]] + z_1 + z_2\\[6pt]
& = 1 (1+z_1) + (1+2 z_1) + z_1 + z_2 = 2 + 4z_1 + z_2.
\end{align*}
Hence
\[
\mu(0,0) = 2 \qquad \mu(1,0) = 6 \qquad \mu(0,1) = 3 \qquad \mu(1,1) = 7.
\]
<<>>=
set.seed(34)
n<-1000;nreps<-2000
al1<-c(1.0,-0.5);al2<-c(1.0,0.1,0.5,-0.5)
ests.mat<-array(0,c(nreps,3,4))
for(irep in 1:nreps){
  X1<-rnorm(n,1,1)
  eta1<-al1[1]+al1[2]*X1
  ps.1<-expit(eta1)
  Z1<-rbinom(n,1,ps.1)
  X2<-rnorm(n,X1+2*Z1,1)
  eta2<-al2[1]+al2[2]*X1+al2[3]*Z1+al2[4]*X2
  ps.2<-expit(eta2)
  Z2<-rbinom(n,1,ps.2)
  eps<-rnorm(n,0,3)
  Y<-X1+X2+Z1+Z2+X1*Z1+eps  #outcome model with X1:Z1 interaction
  fit1<-lm(Y~X1+X2+Z1+Z2+X1:Z1)
  mu.00.hat<-mean(predict(fit1,newdata=data.frame(X1,X2,Z1=0,Z2=0)))
  mu.10.hat<-mean(predict(fit1,newdata=data.frame(X1,X2,Z1=1,Z2=0)))
  mu.01.hat<-mean(predict(fit1,newdata=data.frame(X1,X2,Z1=0,Z2=1)))
  mu.11.hat<-mean(predict(fit1,newdata=data.frame(X1,X2,Z1=1,Z2=1)))
  ests.mat[irep,1,]<-c(mu.00.hat,mu.10.hat,mu.01.hat,mu.11.hat)
  W<-(ps.1^Z1*(1-ps.1)^(1-Z1))*(ps.2^Z2*(1-ps.2)^(1-Z2))
  W00<-(1-Z1)*(1-Z2)/W
  W10<-(Z1)*(1-Z2)/W
  W01<-(1-Z1)*(Z2)/W
  W11<-(Z1)*(Z2)/W
  mu.00.tilde<-mean(W00*Y)
  mu.10.tilde<-mean(W10*Y)
  mu.01.tilde<-mean(W01*Y)
  mu.11.tilde<-mean(W11*Y)
  ests.mat[irep,2,]<-c(mu.00.tilde,mu.10.tilde,mu.01.tilde,mu.11.tilde)
  W00.star<-W00/sum(W00)
  W10.star<-W10/sum(W10)
  W01.star<-W01/sum(W01)
  W11.star<-W11/sum(W11)
  mu.00.hat<-sum(W00.star*Y)
  mu.10.hat<-sum(W10.star*Y)
  mu.01.hat<-sum(W01.star*Y)
  mu.11.hat<-sum(W11.star*Y)
  ests.mat[irep,3,]<-c(mu.00.hat,mu.10.hat,mu.01.hat,mu.11.hat)
}
true.mu<-c(2,6,3,7)
par(mar=c(3,2,2,2),mfrow=c(2,2))
nvec<-c("OR",expression(tilde(mu)(z)),expression(hat(mu)(z)))
boxplot(ests.mat[,,1],names=nvec,pch=19,cex=0.6,col='gray',main=expression(mu(0,0)))
abline(h=true.mu[1],col='red')
boxplot(ests.mat[,,2],names=nvec,pch=19,cex=0.6,col='gray',main=expression(mu(1,0)))
abline(h=true.mu[2],col='red')
boxplot(ests.mat[,,3],names=nvec,pch=19,cex=0.6,col='gray',main=expression(mu(0,1)))
abline(h=true.mu[3],col='red')
boxplot(ests.mat[,,4],names=nvec,pch=19,cex=0.6,col='gray',main=expression(mu(1,1)))
abline(h=true.mu[4],col='red')
@
Again, the outcome regression estimator (OR) is evidently biased, whereas the two inverse weighting estimators are unbiased.

Consider now a third specification, identical to the first except for the component that specifies that
\[
\E_{Y|X_1,Z_1,X_2,Z_2}^\Exp[Y|X_1 = x_1, Z_1 = z_1 ,X_2 = x_2, Z_2 = z_2] = x_1 + z_1 + x_2 + z_2 + x_2 z_2,
\]
with an interaction between $Z_2$ and $X_2$. The causal quantity of interest is now given by
\begin{align*}
\mu(z_1,z_2) = \E_{X_1,X_2|Z}^\Exp[X_1 + X_2 + X_2 z_2 |Z=z] + z_1 + z_2 & = \E_{X_1}^\Exp[X_1]+ \E_{X_1}^\Exp [ \E_{X_2|X_1,Z}^\Exp[X_2 (1+z_2) |X_1,Z=z]] + z_1 + z_2\\[6pt]
& = 1 + (1+2 z_1)(1+z_2) + z_1 + z_2 = 2 + 3z_1 + 2 z_2 + 2 z_1 z_2.
\end{align*}
Hence
\[
\mu(0,0) = 2 \qquad \mu(1,0) = 5 \qquad \mu(0,1) = 4 \qquad \mu(1,1) = 9.
\]
<<>>=
set.seed(34);n<-1000;nreps<-2000
al1<-c(1.0,-0.5);al2<-c(1.0,0.1,0.5,-0.5)
ests.mat<-array(0,c(nreps,3,4))
for(irep in 1:nreps){
  X1<-rnorm(n,1,1)
  eta1<-al1[1]+al1[2]*X1
  ps.1<-expit(eta1)
  Z1<-rbinom(n,1,ps.1)
  X2<-rnorm(n,X1+2*Z1,1)
  eta2<-al2[1]+al2[2]*X1+al2[3]*Z1+al2[4]*X2
  ps.2<-expit(eta2)
  Z2<-rbinom(n,1,ps.2)
  eps<-rnorm(n,0,3)
  Y<-X1+X2+Z1+Z2+X2*Z2+eps  #outcome model with X2:Z2 interaction
  fit1<-lm(Y~X1+X2+Z1+Z2+X2:Z2)
  mu.00.hat<-mean(predict(fit1,newdata=data.frame(X1,X2,Z1=0,Z2=0)))
  mu.10.hat<-mean(predict(fit1,newdata=data.frame(X1,X2,Z1=1,Z2=0)))
  mu.01.hat<-mean(predict(fit1,newdata=data.frame(X1,X2,Z1=0,Z2=1)))
  mu.11.hat<-mean(predict(fit1,newdata=data.frame(X1,X2,Z1=1,Z2=1)))
  ests.mat[irep,1,]<-c(mu.00.hat,mu.10.hat,mu.01.hat,mu.11.hat)
  W<-(ps.1^Z1*(1-ps.1)^(1-Z1))*(ps.2^Z2*(1-ps.2)^(1-Z2))
  W00<-(1-Z1)*(1-Z2)/W
  W10<-(Z1)*(1-Z2)/W
  W01<-(1-Z1)*(Z2)/W
  W11<-(Z1)*(Z2)/W
  mu.00.tilde<-mean(W00*Y); mu.10.tilde<-mean(W10*Y)
  mu.01.tilde<-mean(W01*Y); mu.11.tilde<-mean(W11*Y)
  ests.mat[irep,2,]<-c(mu.00.tilde,mu.10.tilde,mu.01.tilde,mu.11.tilde)
  W00.star<-W00/sum(W00)
  W10.star<-W10/sum(W10)
  W01.star<-W01/sum(W01)
  W11.star<-W11/sum(W11)
  mu.00.hat<-sum(W00.star*Y); mu.10.hat<-sum(W10.star*Y)
  mu.01.hat<-sum(W01.star*Y); mu.11.hat<-sum(W11.star*Y)
  ests.mat[irep,3,]<-c(mu.00.hat,mu.10.hat,mu.01.hat,mu.11.hat)
}
true.mu<-c(2,5,4,9);par(mar=c(3,2,2,2),mfrow=c(2,2))
boxplot(ests.mat[,,1],names=nvec,pch=19,cex=0.6,col='gray',main=expression(mu(0,0)))
abline(h=true.mu[1],col='red')
boxplot(ests.mat[,,2],names=nvec,pch=19,cex=0.6,col='gray',main=expression(mu(1,0)))
abline(h=true.mu[2],col='red')
boxplot(ests.mat[,,3],names=nvec,pch=19,cex=0.6,col='gray',main=expression(mu(0,1)))
abline(h=true.mu[3],col='red')
boxplot(ests.mat[,,4],names=nvec,pch=19,cex=0.6,col='gray',main=expression(mu(1,1)))
abline(h=true.mu[4],col='red')
@
Again, the outcome regression estimator (OR) is evidently biased, whereas the two inverse weighting estimators are unbiased.

\textbf{Weighted regression:} It is also possible to compute the expectation required for $\mu(z)$ using the iterated form
\begin{equation}
\label{eq:mu-reg}
\E_{X,Y,Z}^\Obs \left[ \dfrac{\Ind_{z}(Z)}{f_{Z|X}^\Obs(Z|X)} Y \right] = \E_{X,Z}^\Obs \left[ \dfrac{\Ind_{z}(Z)}{f_{Z|X}^\Obs(Z|X)} \E_{Y|X,Z}^\Obs[Y|X,Z] \right]
\end{equation}
after proposing a model for $\E_{Y|X,Z}^\Obs[Y|X,Z]$. Consider the first simulation with
\begin{equation}\label{eq:regmodel1}
\E_{Y|X_1,Z_1,X_2,Z_2}^\Obs[Y|X_1 = x_1, Z_1 = z_1 ,X_2 = x_2, Z_2 = z_2] = x_1 + z_1 + x_2 + z_2.
\end{equation}
We demonstrated by the earlier calculation that conditioning on $X_2$ in this model does not yield unbiased estimation of $\mu(z)$, as the indirect path from $Z_1$ to $Y$ is blocked. Instead, consider fitting the marginal model
\begin{equation}\label{eq:msm1}
\E_{Y|Z_1,Z_2}^\Exp[Y|Z_1 = z_1, Z_2 = z_2] = \beta_0 + \psi_1 z_1 + \psi_2 z_2 = m(z_1,z_2;\beta,\psi),
\end{equation}
say, to the observed data via weighted least squares, with weight $W_i = 1/f_{Z|X}^\Obs(Z_i|X_i)$ (equivalently, $\sum_{z} W_{zi}$) attached to the $i$th individual. In the reweighted pseudo-sample, the confounding between $X$ and $Z$ is removed by construction.
Consequently, as there is no confounding in the pseudo-sample, the estimators of the coefficients $\psi_1$ and $\psi_2$ in the model given by \eqref{eq:msm1} are unbiased estimators of the true coefficients in the marginalized version of \eqref{eq:regmodel1}; after marginalization, we saw previously that
\begin{equation}\label{eq:msm1_true}
\E_{Y|Z_1,Z_2}^\Exp[Y|Z_1 = z_1, Z_2 = z_2] = 2 + 3 z_1 + z_2.
\end{equation}
The model in \eqref{eq:msm1} is termed a \textit{marginal structural model}.
<<>>=
set.seed(34);n<-1000;nreps<-2000
al1<-c(1.0,-0.5);al2<-c(1.0,0.1,0.5,-0.5)
ests.mat<-array(0,c(nreps,4,4))
psi.ests<-matrix(0,nrow=nreps,ncol=3)
for(irep in 1:nreps){
  X1<-rnorm(n,1,1)
  eta1<-al1[1]+al1[2]*X1
  ps.1<-expit(eta1)
  Z1<-rbinom(n,1,ps.1)
  X2<-rnorm(n,X1+2*Z1,1)
  eta2<-al2[1]+al2[2]*X1+al2[3]*Z1+al2[4]*X2
  ps.2<-expit(eta2)
  Z2<-rbinom(n,1,ps.2)
  eps<-rnorm(n,0,3)
  Y<-X1+X2+Z1+Z2+eps
  fit1<-lm(Y~X1+X2+Z1+Z2)
  mu.00.hat<-mean(predict(fit1,newdata=data.frame(X1,X2,Z1=0,Z2=0)))
  mu.10.hat<-mean(predict(fit1,newdata=data.frame(X1,X2,Z1=1,Z2=0)))
  mu.01.hat<-mean(predict(fit1,newdata=data.frame(X1,X2,Z1=0,Z2=1)))
  mu.11.hat<-mean(predict(fit1,newdata=data.frame(X1,X2,Z1=1,Z2=1)))
  ests.mat[irep,1,]<-c(mu.00.hat,mu.10.hat,mu.01.hat,mu.11.hat)
  W<-(ps.1^Z1*(1-ps.1)^(1-Z1))*(ps.2^Z2*(1-ps.2)^(1-Z2))
  W00<-(1-Z1)*(1-Z2)/W
  W10<-(Z1)*(1-Z2)/W
  W01<-(1-Z1)*(Z2)/W
  W11<-(Z1)*(Z2)/W
  mu.00.tilde<-mean(W00*Y)
  mu.10.tilde<-mean(W10*Y)
  mu.01.tilde<-mean(W01*Y)
  mu.11.tilde<-mean(W11*Y)
  ests.mat[irep,2,]<-c(mu.00.tilde,mu.10.tilde,mu.01.tilde,mu.11.tilde)
  W00.star<-W00/sum(W00)
  W10.star<-W10/sum(W10)
  W01.star<-W01/sum(W01)
  W11.star<-W11/sum(W11)
  mu.00.hat<-sum(W00.star*Y); mu.10.hat<-sum(W10.star*Y)
  mu.01.hat<-sum(W01.star*Y); mu.11.hat<-sum(W11.star*Y)
  ests.mat[irep,3,]<-c(mu.00.hat,mu.10.hat,mu.01.hat,mu.11.hat)
  #marginal structural model fitted by weighted least squares
  Wvec<-W00+W01+W10+W11   #equals 1/f(Z|X) for each individual
  fit2<-lm(Y~Z1+Z2, weights=Wvec)
  psi.ests[irep,]<-coef(fit2)
  mu.00.reg<-predict(fit2,newdata=data.frame(Z1=0,Z2=0))
  mu.10.reg<-predict(fit2,newdata=data.frame(Z1=1,Z2=0))
  mu.01.reg<-predict(fit2,newdata=data.frame(Z1=0,Z2=1))
  mu.11.reg<-predict(fit2,newdata=data.frame(Z1=1,Z2=1))
  mu.00.rhat<-mean(mu.00.reg); mu.10.rhat<-mean(mu.10.reg)
  mu.01.rhat<-mean(mu.01.reg); mu.11.rhat<-mean(mu.11.reg)
  ests.mat[irep,4,]<-c(mu.00.rhat,mu.10.rhat,mu.01.rhat,mu.11.rhat)
}
true.mu<-c(2,5,3,6)
nvec<-c("OR",expression(tilde(mu)(z)),expression(hat(mu)(z)),expression(hat(mu)[R](z)))
par(mar=c(3,2,2,2),mfrow=c(2,2))
boxplot(ests.mat[,,1],names=nvec,pch=19,cex=0.6,col='gray',main=expression(mu(0,0)))
abline(h=true.mu[1],col='red')
boxplot(ests.mat[,,2],names=nvec,pch=19,cex=0.6,col='gray',main=expression(mu(1,0)))
abline(h=true.mu[2],col='red')
boxplot(ests.mat[,,3],names=nvec,pch=19,cex=0.6,col='gray',main=expression(mu(0,1)))
abline(h=true.mu[3],col='red')
boxplot(ests.mat[,,4],names=nvec,pch=19,cex=0.6,col='gray',main=expression(mu(1,1)))
abline(h=true.mu[4],col='red')
apply(psi.ests,2,mean)
@
\pagebreak
This approach can also be used when there is an interaction between $Z$ and $X$ in the data generating model. Specifically, consider the simulation with
\begin{equation}\label{eq:regmodel2}
\E_{Y|X_1,Z_1,X_2,Z_2}^\Obs[Y|X_1 = x_1, Z_1 = z_1 ,X_2 = x_2, Z_2 = z_2] = x_1 + z_1 + x_2 + z_2 + x_2 z_2.
\end{equation}
After marginalization, we saw previously that
\begin{equation}\label{eq:msm2}
\E_{Y|Z_1,Z_2}^\Exp[Y|Z_1 = z_1, Z_2 = z_2] = 2 + 3z_1 + 2 z_2 + 2 z_1 z_2,
\end{equation}
and fitting this `marginal' model yields the correct answer.
<<>>=
set.seed(34)
n<-1000;nreps<-2000
al1<-c(1.0,-0.5);al2<-c(1.0,0.1,0.5,-0.5)
ests.mat<-array(0,c(nreps,4,4))
psi.ests<-matrix(0,nrow=nreps,ncol=4)
for(irep in 1:nreps){
  X1<-rnorm(n,1,1)
  eta1<-al1[1]+al1[2]*X1
  ps.1<-expit(eta1)
  Z1<-rbinom(n,1,ps.1)
  X2<-rnorm(n,X1+2*Z1,1)
  eta2<-al2[1]+al2[2]*X1+al2[3]*Z1+al2[4]*X2
  ps.2<-expit(eta2)
  Z2<-rbinom(n,1,ps.2)
  eps<-rnorm(n,0,3)
  Y<-X1+X2+Z1+Z2+Z2*X2+eps
  fit1<-lm(Y~X1+X2+Z1+Z2+Z2:X2)
  mu.00.hat<-mean(predict(fit1,newdata=data.frame(X1,X2,Z1=0,Z2=0)))
  mu.10.hat<-mean(predict(fit1,newdata=data.frame(X1,X2,Z1=1,Z2=0)))
  mu.01.hat<-mean(predict(fit1,newdata=data.frame(X1,X2,Z1=0,Z2=1)))
  mu.11.hat<-mean(predict(fit1,newdata=data.frame(X1,X2,Z1=1,Z2=1)))
  ests.mat[irep,1,]<-c(mu.00.hat,mu.10.hat,mu.01.hat,mu.11.hat)
  W<-(ps.1^Z1*(1-ps.1)^(1-Z1))*(ps.2^Z2*(1-ps.2)^(1-Z2))
  W00<-(1-Z1)*(1-Z2)/W
  W10<-(Z1)*(1-Z2)/W
  W01<-(1-Z1)*(Z2)/W
  W11<-(Z1)*(Z2)/W
  mu.00.tilde<-mean(W00*Y); mu.10.tilde<-mean(W10*Y)
  mu.01.tilde<-mean(W01*Y); mu.11.tilde<-mean(W11*Y)
  ests.mat[irep,2,]<-c(mu.00.tilde,mu.10.tilde,mu.01.tilde,mu.11.tilde)
  W00.star<-W00/sum(W00)
  W10.star<-W10/sum(W10)
  W01.star<-W01/sum(W01)
  W11.star<-W11/sum(W11)
  mu.00.hat<-sum(W00.star*Y); mu.10.hat<-sum(W10.star*Y)
  mu.01.hat<-sum(W01.star*Y); mu.11.hat<-sum(W11.star*Y)
  ests.mat[irep,3,]<-c(mu.00.hat,mu.10.hat,mu.01.hat,mu.11.hat)
  #marginal structural model with interaction, fitted by weighted least squares
  Wvec<-W00+W01+W10+W11
  fit2<-lm(Y~Z1+Z2+Z1:Z2, weights=Wvec)
  psi.ests[irep,]<-coef(fit2)
  mu.00.reg<-predict(fit2,newdata=data.frame(Z1=0,Z2=0))
  mu.10.reg<-predict(fit2,newdata=data.frame(Z1=1,Z2=0))
  mu.01.reg<-predict(fit2,newdata=data.frame(Z1=0,Z2=1))
  mu.11.reg<-predict(fit2,newdata=data.frame(Z1=1,Z2=1))
  mu.00.rhat<-mean(mu.00.reg); mu.10.rhat<-mean(mu.10.reg)
  mu.01.rhat<-mean(mu.01.reg); mu.11.rhat<-mean(mu.11.reg)
  ests.mat[irep,4,]<-c(mu.00.rhat,mu.10.rhat,mu.01.rhat,mu.11.rhat)
}
true.mu<-c(2,5,4,9)
nvec<-c("OR",expression(tilde(mu)(z)),expression(hat(mu)(z)),expression(hat(mu)[R](z)))
par(mar=c(3,2,2,2),mfrow=c(2,2))
boxplot(ests.mat[,,1],names=nvec,pch=19,cex=0.6,col='gray',main=expression(mu(0,0)))
abline(h=true.mu[1],col='red')
boxplot(ests.mat[,,2],names=nvec,pch=19,cex=0.6,col='gray',main=expression(mu(1,0)))
abline(h=true.mu[2],col='red')
boxplot(ests.mat[,,3],names=nvec,pch=19,cex=0.6,col='gray',main=expression(mu(0,1)))
abline(h=true.mu[3],col='red')
boxplot(ests.mat[,,4],names=nvec,pch=19,cex=0.6,col='gray',main=expression(mu(1,1)))
abline(h=true.mu[4],col='red')
apply(psi.ests,2,mean)
apply(psi.ests,2,var)
@
Note that the marginal model needs to be correctly specified in order for the inferences to be correct.

\pagebreak

\textbf{Stabilized weights:} It is possible to utilize so-called \textit{stabilized} weights in the marginal structural model. In this version of the analysis, there is conditioning on the baseline confounder $X_1$; the corresponding DAG makes it clear that conditioning on $X_1$ does not block any paths other than the confounding (back-door) path $Z_1 \longleftarrow X_1 \longrightarrow Y$.
\begin{figure}[ht]
\centering
\begin{tikzpicture}[scale=1.5]
% nodes set with absolute coordinates
\node[state] (x1) at (-2,1) {${X_1}$};
\node[state] (z1) at (-1,0) {${Z_1}$};
\node[state] (x2) at (0,1) {${X_2}$};
\node[state] (z2) at (1,0) {${Z_2}$};
\node[state] (y) at (2,1) {${Y}$};
% directed edges (X2 -> Z2 removed; exposures still depend on X1)
\path (x1) edge (z1);
\path (x1) edge (x2);
\path (x1) edge [bend left=45] (y);
\path (z1) edge (x2);
\path (x1) edge (z2);
\path (x2) edge (y);
\path (z1) edge (z2);
\path (z1) edge (y);
\path (z2) edge (y);
\end{tikzpicture}
\end{figure}

Because of the universal conditioning on $X_1$, the corresponding causal quantities to be considered are based on
\[
\mu_S(z,x_1) = \E_{Y|X_1,Z}^\Exp[Y|X_1=x_1, Z=z].
\]
We now have, by the usual arguments,
\begin{align*}
\mu_S(z,x_1) & = \dfrac{\displaystyle{\iiint}\Ind_{z}(t) \ y \ \dfrac{f_{Z|X_1}^\Exp(t|x_1)}{f_{Z|X_1,X_2}^\Obs(t|x_1,x_2)} f_{X_1,X_2,Z}^\Obs(x_1,x_2,t) \ dy \ dx_2 \ dt}{\displaystyle{\iint} \Ind_{z}(t) \dfrac{f_{Z|X_1}^\Exp(t|x_1)}{f_{Z|X_1,X_2}^\Obs(t|x_1,x_2) } f_{X_1,X_2,Z}^\Obs(x_1,x_2,t) \ d x_2 \ dt}\\[6pt]
& \equiv \dfrac{\displaystyle{\iiiint}\Ind_{z}(t) \Ind_{x_1}(u) \ y \ \dfrac{f_{Z|X_1}^\Exp(t|u)}{f_{Z|X_1,X_2}^\Obs(t|u,x_2)} f_{X_1,X_2,Z}^\Obs(u,x_2,t) \ dy \ d u \ d x_2 \ dt}{\displaystyle{\iiint} \Ind_{z}(t) \Ind_{x_1}(u) \dfrac{f_{Z|X_1}^\Exp(t|u)}{f_{Z|X_1,X_2}^\Obs(t|u,x_2) } f_{X_1,X_2,Z}^\Obs(u,x_2,t) \ d u \ d x_2 \ dt}
\end{align*}
Inference would then proceed in the usual way, based on sample averages: for example,
\[
\widehat \mu_S(z,x_1) = \dfrac{\dfrac{1}{n} \dsum\limits_{i=1}^n \Ind_{z}(Z_i) \Ind_{x_1}(X_{i1}) \dfrac{f_{Z|X_1}^\Exp(Z_i|X_{i1})}{f_{Z|X}^\Obs(Z_i|X_i)} Y_i}{\dfrac{1}{n} \dsum\limits_{i=1}^n \Ind_{z}(Z_i) \Ind_{x_1}(X_{i1}) \dfrac{f_{Z|X_1}^\Exp(Z_i|X_{i1})}{f_{Z|X}^\Obs(Z_i|X_i)}}
\]
However, if $X_1$ is high-dimensional and/or continuous-valued, the number of non-zero terms in these sums may be very small (zero, in the continuous case), and so typically regression methods are favoured to introduce the conditioning on $X_1$; we may again pick a `working' marginal structural (regression) model and solve using weighted least squares with weights
\begin{equation}\label{eq:sw1}
\Ind_{z}(Z_i) \Ind_{x_1}(X_{i1}) \dfrac{f_{Z|X_1}^\Exp(Z_i|X_{i1})}{f_{Z|X}^\Obs(Z_i|X_i)}.
\end{equation}
Note that for the $i$th data point, we might propose a marginal model
\begin{equation}\label{eq:msm3}
\E_{Y|X_1,Z_1,Z_2}^\Exp[Y|X_1 = x_{i1}, Z_1 = z_{i1}, Z_2 = z_{i2}] = \beta_0 + \beta_1 x_{i1} + \psi_1 z_{i1} + \psi_2 z_{i2}
\end{equation}
evaluated at each individual's own baseline value $x_{i1}$, so that the indicator function $\Ind_{x_1}(X_{i1})$ in \eqref{eq:sw1} always takes the value 1.
\medskip

From \eqref{eq:sw1}, it is clear that a model for $f_{Z|X_1}^\Exp(Z_i|X_{i1})$ is needed. Also, we have that
\begin{equation}\label{eq:sw2}
\dfrac{f_{Z|X_1}^\Exp(Z|X_{1})}{f_{Z|X}^\Obs(Z|X)} = \dfrac{f_{Z_1|X_1}^\Exp(Z_{1}|X_{1}) \times f_{Z_2|X_1,Z_1}^\Exp(Z_{2}|X_{1},Z_1) }{f_{Z_1|X_1}^\Obs(Z_{1}|X_{1}) \times f_{Z_2|X_1,Z_1,X_2}^\Obs(Z_{2}|X_{1},Z_1,X_2)} \equiv \dfrac{f_{Z_2|X_1,Z_1}^\Exp(Z_{2}|X_{1},Z_1) }{f_{Z_2|X_1,Z_1,X_2}^\Obs(Z_{2}|X_{1},Z_1,X_2)}
\end{equation}
since, as can be seen from the DAG, $f_{Z_1|X_1}^\Exp(Z_{1}|X_{1}) = f_{Z_1|X_1}^\Obs(Z_{1}|X_{1})$, so these terms cancel.
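In the simulation below, the ratio in \eqref{eq:sw2} is estimated by fitting two logistic regressions for $Z_2$, one with and one without $X_2$ in the linear predictor. As a minimal sketch of this single step (reusing the variables left in the workspace by the most recent simulation chunk, and labelling the stabilized weight \texttt{W.s}):
<<>>=
# Sketch: estimating the stabilized weight ratio in the display above
ps.num<-fitted(glm(Z2~X1+Z1,family=binomial))     #numerator model f(Z2|X1,Z1)
ps.den<-fitted(glm(Z2~X1+X2+Z1,family=binomial))  #denominator model f(Z2|X1,Z1,X2)
W.s<-(ps.num^Z2*(1-ps.num)^(1-Z2))/(ps.den^Z2*(1-ps.den)^(1-Z2))
summary(W.s) #stabilized weights typically concentrate near 1, unlike 1/f(Z|X)
@
The full simulation, which adds this stabilized-weight marginal structural model fit ($\widehat{\mu}_S$) to the earlier comparisons, is as follows.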
<<>>=
set.seed(34)
n<-1000;nreps<-2000
al1<-c(1.0,-0.5);al2<-c(1.0,0.1,0.5,-0.5)
ests.mat<-array(0,c(nreps,5,4))
psi.ests<-matrix(0,nrow=nreps,ncol=4)
psi.ests.s<-matrix(0,nrow=nreps,ncol=5)
for(irep in 1:nreps){
  X1<-rnorm(n,1,1)
  eta1<-al1[1]+al1[2]*X1
  ps.1<-expit(eta1)
  Z1<-rbinom(n,1,ps.1)
  X2<-rnorm(n,X1+2*Z1,1)
  eta2<-al2[1]+al2[2]*X1+al2[3]*Z1+al2[4]*X2
  ps.2<-expit(eta2)
  Z2<-rbinom(n,1,ps.2)
  eps<-rnorm(n,0,3)
  Y<-X1+X2+Z1+Z2+eps
  fit1<-lm(Y~X1+X2+Z1+Z2)
  mu.00.hat<-mean(predict(fit1,newdata=data.frame(X1,X2,Z1=0,Z2=0)))
  mu.10.hat<-mean(predict(fit1,newdata=data.frame(X1,X2,Z1=1,Z2=0)))
  mu.01.hat<-mean(predict(fit1,newdata=data.frame(X1,X2,Z1=0,Z2=1)))
  mu.11.hat<-mean(predict(fit1,newdata=data.frame(X1,X2,Z1=1,Z2=1)))
  ests.mat[irep,1,]<-c(mu.00.hat,mu.10.hat,mu.01.hat,mu.11.hat)
  W<-(ps.1^Z1*(1-ps.1)^(1-Z1))*(ps.2^Z2*(1-ps.2)^(1-Z2))
  W00<-(1-Z1)*(1-Z2)/W
  W10<-(Z1)*(1-Z2)/W
  W01<-(1-Z1)*(Z2)/W
  W11<-(Z1)*(Z2)/W
  mu.00.tilde<-mean(W00*Y)
  mu.10.tilde<-mean(W10*Y)
  mu.01.tilde<-mean(W01*Y)
  mu.11.tilde<-mean(W11*Y)
  ests.mat[irep,2,]<-c(mu.00.tilde,mu.10.tilde,mu.01.tilde,mu.11.tilde)
  W00.star<-W00/sum(W00)
  W10.star<-W10/sum(W10)
  W01.star<-W01/sum(W01)
  W11.star<-W11/sum(W11)
  mu.00.hat<-sum(W00.star*Y)
  mu.10.hat<-sum(W10.star*Y)
  mu.01.hat<-sum(W01.star*Y)
  mu.11.hat<-sum(W11.star*Y)
  ests.mat[irep,3,]<-c(mu.00.hat,mu.10.hat,mu.01.hat,mu.11.hat)
  #marginal structural model with unstabilized weights
  Wvec<-W00+W01+W10+W11
  fit2<-lm(Y~Z1+Z2+Z1:Z2, weights=Wvec)
  psi.ests[irep,]<-coef(fit2)
  mu.00.reg<-predict(fit2,newdata=data.frame(Z1=0,Z2=0))
  mu.10.reg<-predict(fit2,newdata=data.frame(Z1=1,Z2=0))
  mu.01.reg<-predict(fit2,newdata=data.frame(Z1=0,Z2=1))
  mu.11.reg<-predict(fit2,newdata=data.frame(Z1=1,Z2=1))
  mu.00.rhat<-mean(mu.00.reg); mu.10.rhat<-mean(mu.10.reg)
  mu.01.rhat<-mean(mu.01.reg); mu.11.rhat<-mean(mu.11.reg)
  ests.mat[irep,4,]<-c(mu.00.rhat,mu.10.rhat,mu.01.rhat,mu.11.rhat)
  #marginal structural model with stabilized weights, conditioning on X1
  ps.num<-fitted(glm(Z2~X1+Z1,family=binomial))
  ps.den<-fitted(glm(Z2~X1+X2+Z1,family=binomial))
  W.s<-(ps.num^Z2*(1-ps.num)^(1-Z2))/(ps.den^Z2*(1-ps.den)^(1-Z2))
  fit3<-lm(Y~X1+Z1+Z2+Z1:Z2, weights=W.s)
  psi.ests.s[irep,]<-coef(fit3)
  mu.00.reg<-predict(fit3,newdata=data.frame(X1,Z1=0,Z2=0))
  mu.10.reg<-predict(fit3,newdata=data.frame(X1,Z1=1,Z2=0))
  mu.01.reg<-predict(fit3,newdata=data.frame(X1,Z1=0,Z2=1))
  mu.11.reg<-predict(fit3,newdata=data.frame(X1,Z1=1,Z2=1))
  mu.00.rhat<-mean(mu.00.reg); mu.10.rhat<-mean(mu.10.reg)
  mu.01.rhat<-mean(mu.01.reg); mu.11.rhat<-mean(mu.11.reg)
  ests.mat[irep,5,]<-c(mu.00.rhat,mu.10.rhat,mu.01.rhat,mu.11.rhat)
}
true.mu<-c(2,5,3,6)
nvec<-c("OR",expression(tilde(mu)(z)),expression(hat(mu)(z)),expression(hat(mu)[R](z)),
        expression(hat(mu)[S](z)))
par(mar=c(3,2,2,2),mfrow=c(2,2))
boxplot(ests.mat[,,1],names=nvec,pch=19,cex=0.6,col='gray',main=expression(mu(0,0)))
abline(h=true.mu[1],col='red')
boxplot(ests.mat[,,2],names=nvec,pch=19,cex=0.6,col='gray',main=expression(mu(1,0)))
abline(h=true.mu[2],col='red')
boxplot(ests.mat[,,3],names=nvec,pch=19,cex=0.6,col='gray',main=expression(mu(0,1)))
abline(h=true.mu[3],col='red')
boxplot(ests.mat[,,4],names=nvec,pch=19,cex=0.6,col='gray',main=expression(mu(1,1)))
abline(h=true.mu[4],col='red')
apply(psi.ests,2,mean)
apply(psi.ests.s,2,mean)
@

\bigskip

\textbf{General $K$ intervals:} Let $\calH_k$ denote the `history' set of the random variables up to interval $k=1,\ldots,K$, that is,
\[
\calH_k = \left\{X_1,\ldots,X_k,Z_1,\ldots,Z_k \right\},
\]
and let $\calH_k^{-}$ denote $\calH_k \setminus \{Z_k\}$. If $K=2$ we have
\[
\calH_1 = \{X_1,Z_1\} \qquad \calH_2 = \{X_1,X_2,Z_1,Z_2\}
\]
and
\[
\calH_1^{-} = \{X_1\} \qquad \calH_2^{-} = \{X_1,X_2,Z_1\},
\]
with the corresponding observed histories being denoted $\calh_k$ and $\calh_k^{-}$ respectively.
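In this notation, the unstabilized weight used throughout extends directly to general $K$: since $f_{Z|X}^\Obs$ factorizes over the intervals exactly as in the $K=2$ case, a sketch of the general form is
\[
W_{zi} = \prod_{k=1}^{K} \dfrac{\Ind_{z_k}(Z_{ki})}{f_{Z_k|\calH_k^{-}}^{\Obs}(Z_{ki}|\calh_{ki}^{-})},
\]
while, following \eqref{eq:sw1} and \eqref{eq:sw2}, the stabilized weight for a marginal structural model conditioning on $X_1$ replaces each numerator indicator by $f_{Z_k|X_1,Z_1,\ldots,Z_{k-1}}^{\Exp}(Z_{ki}|x_{i1},z_{i1},\ldots,z_{i,k-1})$, so that only the components of the denominator involving the time-varying confounders $X_2,\ldots,X_K$ fail to cancel.

\end{document}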