\documentclass[notitlepage]{article}
\usepackage{Math598}
\usepackage{listings}
\usepackage{numprint}
\usepackage{enumerate}
\usepackage{bbm}
\usepackage{amsfonts,amsmath}
\usepackage{tikz}
\usetikzlibrary{shapes,decorations,arrows,calc,arrows.meta,fit,positioning}
\def\E{\Expect}
\lstloadlanguages{R}
\definecolor{keywordcolor}{rgb}{0,0.6,0.6}
\definecolor{delimcolor}{rgb}{0.461,0.039,0.102}
\definecolor{Rcommentcolor}{rgb}{0.101,0.043,0.432}
\lstdefinestyle{Rsettings}{
  basicstyle=\ttfamily,
  breaklines=true,
  showstringspaces=false,
  keywords={if, else, function, theFunction, tmp}, % Write as many keywords
  otherkeywords={},
  commentstyle=\itshape\color{Rcommentcolor},
  keywordstyle=\color{keywordcolor},
  moredelim=[s][\color{delimcolor}]{"}{"},
}
\lstset{basicstyle=\ttfamily, numbers=none, literate={~} {$\sim$}{2}}
\parindent0in
\begin{document}

\begin{center}
{\textsclarge{Backdoor path with a collider and unmeasured confounding}}
\end{center}

<<>>=
library(knitr)
# global chunk options
opts_chunk$set(cache=TRUE, autodep=TRUE)
#options(scipen=999)
options(repos=c(CRAN="https://cloud.r-project.org/"))
inline_hook <- function(x) {
  if (is.numeric(x)) {
    # ifelse does a vectorized comparison but evaluates both branches on the
    # whole vector, so coerce to integer in the first branch to avoid a
    # sprintf format error when x contains non-integer values.
    # If integer, print without decimals; otherwise print six decimal places.
    res <- ifelse(x == round(x),
                  sprintf("%6i", as.integer(round(x))),
                  sprintf("%.6f", x))
    paste(res, collapse = ", ")
  }
}
knit_hooks$set(inline = inline_hook)
@

\tikzset{
  -Latex,auto,node distance =1 cm and 1 cm,semithick,
  state/.style ={circle, draw, minimum width = 0.7 cm},
  box/.style ={rectangle, draw, minimum width = 0.7 cm, fill=lightgray},
  point/.style = {circle, draw, inner sep=0.08cm,fill,node contents={}},
  bidirected/.style={Latex-Latex,dashed},
  el/.style = {inner sep=3pt, align=left, sloped}
}

\begin{figure}[ht]
\centering
\begin{tikzpicture}[scale=1.5]
% x node set with absolute coordinates
\node[state] (x) at (1,0) {${X}$};
\node[state] (y) at (0,0) {${Y}$};
\node[state] (z) at (-1,0) {${Z}$};
\node[state] (u) at (0,1) {${U}$};
\path (z) edge (y);
\path (y) edge (x);
\path (u) edge (z);
\path (u) edge (x);
\end{tikzpicture}
\end{figure}

The corresponding probability model factorizes as
\[
f_{U,X,Y,Z}(u,x,y,z) = f_{U}(u) f_{Z|U}(z|u) f_{Y|Z}(y|z) f_{X|U,Y}(x|u,y).
\]
In this graph, there are two paths from $Z$ to $Y$:
\begin{itemize}
\item Path $(Z,Y)$: this is a directed path;
\item Path $(Z,U,X,Y)$: this path is not directed, and since it begins with an arrow into $Z$ it is a backdoor path.
\end{itemize}
However, the second path is blocked at the collider $X$, so there is no open backdoor path, and the effect of $Z$ on $Y$ is transmitted only along the first path.

<<>>=
set.seed(2384)
n<-10000
U<-rnorm(n,10,1)
Z<-rnorm(n,2*U+1,1)
Y<-rnorm(n,-Z+3,1)
X<-rnorm(n,Y+U,1)
data1<-data.frame(U,X,Y,Z);pairs(data1,pch=19,cex=0.5)
@

If we regress $Y$ on $Z$, then the correct relationship (a coefficient of $-1$ for $Z$) is recovered.
<<>>=
round(coef(summary(lm(Y~Z))),6)
@
However, if we condition on $X$ in the regression, we see that bias is introduced in the estimation of the $Z$ coefficient.
<<>>=
round(coef(summary(lm(Y~Z+X))),6)
@
If we condition on $U$ only, then the direct effect of $Z$ on $Y$ is correctly captured, as the path is still blocked at $X$.
<<>>=
round(coef(summary(lm(Y~Z+U))),6)
@
If we condition on both $U$ and $X$, then the direct effect of $Z$ on $Y$ is again not captured.
<<>>=
round(coef(summary(lm(Y~Z+X+U))),6)
@
This is due to \textit{selection bias}: conditioning on a descendant of $Y$ will lead to bias in most circumstances.
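
The size of this bias can be checked directly against the data-generating model. As a short added check (a sketch, not part of the original analysis; the matrix \texttt{B} of structural coefficients and the objects below are introduced only for illustration), we can build the population covariance matrix implied by the structural equations, $\Sigma = (I-B)^{-1}\Psi(I-B)^{-\top}$ with $\Psi = I$ since $\mathrm{Var}(U)$ and all error variances equal one, and read off the population least-squares coefficients: regressing $Y$ on $Z$ alone gives $-1$, whereas regressing $Y$ on $(Z,X)$ gives $-8/11 \approx -0.727$ for the $Z$ coefficient, which should match the biased estimate above up to simulation error.

<<>>=
# Added check (not part of the original analysis): population covariances
# implied by the first structural model; intercepts do not affect covariances.
vars <- c("U","Z","Y","X")
B <- matrix(0, 4, 4, dimnames = list(vars, vars))
B["Z","U"] <- 2      # Z = 2U + 1 + error
B["Y","Z"] <- -1     # Y = -Z + 3 + error
B["X","Y"] <- 1      # X = Y + U + error
B["X","U"] <- 1
IB <- diag(4) - B
Sigma <- solve(IB) %*% diag(4) %*% t(solve(IB))   # (I-B)^{-1} Psi (I-B)^{-T}, Psi = I
dimnames(Sigma) <- list(vars, vars)
# population coefficient of Z when Y is regressed on Z alone
round(Sigma["Z","Y"]/Sigma["Z","Z"], 6)
# population coefficients when Y is regressed on (Z, X)
round(solve(Sigma[c("Z","X"),c("Z","X")], Sigma[c("Z","X"),"Y"]), 6)
@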
Consider the following simple chain graph.
\begin{figure}[ht]
\centering
\begin{tikzpicture}[scale=1.5]
% x node set with absolute coordinates
\node[state] (x) at (1,0) {${X}$};
\node[state] (y) at (0,0) {${Y}$};
\node[state] (z) at (-1,0) {${Z}$};
\path (z) edge (y);
\path (y) edge (x);
\end{tikzpicture}
\end{figure}
The corresponding probability model factorizes (by the chain rule) as
\[
f_{X,Y,Z}(x,y,z) = f_{Z}(z) f_{Y|Z}(y|z) f_{X|Y,Z}(x|y,z),
\]
where, for this chain graph, $f_{X|Y,Z}(x|y,z) = f_{X|Y}(x|y)$; we keep the general form in what follows. Integrating $x$ out of the joint density leaves
\[
f_{Y,Z}(y,z) = f_{Z}(z) f_{Y|Z}(y|z),
\]
so the marginal $(Z,Y)$ relationship is unchanged. However, we have that
\[
f_{Y|X,Z}(y|x,z) = \frac{f_{X,Y,Z}(x,y,z)}{f_{X,Z}(x,z)} = \frac{f_{Z}(z) f_{Y|Z}(y|z) f_{X|Y,Z}(x|y,z)}{f_Z(z)f_{X|Z}(x|z)} = \frac{f_{X|Y,Z}(x|y,z)}{f_{X|Z}(x|z)} f_{Y|Z}(y|z),
\]
and in general
\[
\frac{f_{X|Y,Z}(x|y,z)}{f_{X|Z}(x|z)} \neq 1,
\]
so
\[
f_{Y|X,Z}(y|x,z) \neq f_{Y|Z}(y|z).
\]

\pagebreak

Notice, however, that we can change the data-generating model (keeping the structure of the first graph) so that the analyses essentially agree. If the conditional model for $Y$ given $Z$ is instead
\[
Y = -0.5 Z + 3 + \epsilon
\]
and then
\[
X = 0.75 Y + U + \varepsilon,
\]
then the effect of conditioning changes.

<<>>=
set.seed(2384)
n<-10000
U<-rnorm(n,10,1)
Z<-rnorm(n,2*U+1,1)
Y<-rnorm(n,-0.5*Z+3,1)
X<-rnorm(n,0.75*Y+U,1)
data2<-data.frame(U,X,Y,Z);pairs(data2,pch=19,cex=0.5)
@

If we regress $Y$ on $Z$, then the \textbf{correct} relationship (a coefficient of $-0.5$ for $Z$) is recovered.
<<>>=
round(coef(summary(lm(Y~Z))),6)
@
Now, if we condition on $X$ in the regression, we see that the bias is \textbf{negligible}, even though there is an open, biasing path.
<<>>=
round(coef(summary(lm(Y~Z+X))),6)
@
If we condition on $U$ only, then the direct effect of $Z$ on $Y$ is \textbf{correctly captured}, as the path is still blocked at $X$.
<<>>=
round(coef(summary(lm(Y~Z+U))),6)
@
However, if we condition on both $U$ and $X$, then the direct effect of $Z$ on $Y$ is \textbf{not} captured.
<<>>=
round(coef(summary(lm(Y~Z+X+U))),6)
@
As a final summary, we can inspect the \textbf{inverse} of the sample correlation matrices:
<<>>=
round(solve(cor(data1)),6)
@
<<>>=
round(solve(cor(data2)),6)
@
Note that in both cases the entry relating $X$ and $Z$ is almost zero. This indicates that, conditional on the other variables, $X$ and $Z$ are uncorrelated (in fact independent here, since the variables are jointly Gaussian). Up to sign and rescaling, this entry gives the \textbf{partial correlation} between the two variables given the rest.
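
To make this explicit (a short added sketch, not part of the original analysis), the partial correlation between two variables given the rest can be computed from the inverse correlation matrix $\Omega = (\omega_{ij})$ as $-\omega_{ij}/\sqrt{\omega_{ii}\,\omega_{jj}}$; the helper function \texttt{partial.cor} below is introduced only for this illustration. For both simulated data sets, the $(X,Z)$ partial correlation is essentially zero, consistent with $X \perp Z \mid U, Y$ in both data-generating models.

<<>>=
# Added sketch (not part of the original analysis): partial correlations
# from the inverse correlation matrix Omega, using
#   pcor(i, j | rest) = -Omega[i,j] / sqrt(Omega[i,i] * Omega[j,j])
partial.cor <- function(dat){
  Om <- solve(cor(dat))
  P <- -Om / sqrt(diag(Om) %o% diag(Om))
  diag(P) <- 1
  P
}
round(partial.cor(data1)["X","Z"], 6)
round(partial.cor(data2)["X","Z"], 6)
@

\end{document}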