\documentclass[notitlepage,11pt]{article}
\usepackage{Math598}
\usepackage{listings}
\usepackage{numprint}
\usepackage{enumitem}
\usepackage{bbm}
\usepackage{amsfonts,amsmath}
\usepackage{hyperref}
\usepackage{tikz}
\usetikzlibrary{shapes,decorations,arrows,calc,arrows.meta,fit,positioning}
\def\E{\Expect}
\renewcommand{\independent}{\perp\mkern-9.5mu\perp}
\renewcommand{\notindependent}{\slashed{\independent}}
\usepackage{scalerel}
\usepackage{centernot}
\usepackage{slashed}
\def\given{ \ | \ }
\lstloadlanguages{R}
\definecolor{keywordcolor}{rgb}{0,0.6,0.6}
\definecolor{delimcolor}{rgb}{0.461,0.039,0.102}
\definecolor{Rcommentcolor}{rgb}{0.101,0.043,0.432}
\lstdefinestyle{Rsettings}{
basicstyle=\ttfamily,
breaklines=true,
showstringspaces=false,
keywords={if, else, function, theFunction, tmp}, % Write as many keywords
otherkeywords={},
commentstyle=\itshape\color{Rcommentcolor},
keywordstyle=\color{keywordcolor},
moredelim=[s][\color{delimcolor}]{"}{"},
}
\lstset{basicstyle=\ttfamily, numbers=none, literate={~} {$\sim$}{2}}
\usepackage{titlesec}
\titleformat{\section}
{\large\normalfont\scshape}{\thesection.}{1em}{}
\parindent0in

\begin{document}

\begin{center}
{\textsclarge{Using Causal Graphs In Epidemiological Research}}

\medskip

{\textsclarge{Simulations and Examples}}
\end{center}

<<>>=
library(knitr)
# global chunk options
opts_chunk$set(cache=TRUE, autodep=TRUE)
#options(scipen=999)
options(repos=c(CRAN="https://cloud.r-project.org/"))
@

\tikzset{
-Latex,auto,node distance =1 cm and 1 cm,semithick,
state/.style ={circle, draw, minimum width = 0.7 cm},
box/.style ={rectangle, draw, minimum width = 0.7 cm, fill=lightgray},
point/.style = {circle, draw, inner sep=0.08cm,fill,node contents={}},
bidirected/.style={Latex-Latex,dashed},
el/.style = {inner sep=3pt, align=left, sloped}
}

\section{Conditional independence and dependence}

In this example, we illustrate the difference between conditional independence and conditional dependence. Suppose we have the simple graph
\begin{figure}[ht]
\centering
\begin{tikzpicture}[scale=1.5]
% x node set with absolute coordinates
\node[state] (x) at (0,1) {${X}$};
\node[state] (y) at (-1,0) {${Y}$};
\node[state] (z) at (1,0) {${Z}$};
\path (x) edge (y);
\path (x) edge (z);
\path (y) edge[bidirected] (z);
\end{tikzpicture}
\end{figure}
where the `bidirected' edge between $Y$ and $Z$ indicates a general dependence between those variables. We can read off the joint distribution
\[
f_X(x) f_{Y,Z|X}(y,z|x).
\]
We generate data under the assumption that the variables are jointly Normally distributed, with
\begin{align*}
X & \sim Normal(10,5^2) \\[6pt]
(Y,Z) | X = x & \sim Normal \left(\begin{pmatrix} x \\ x \end{pmatrix}, \begin{pmatrix} 1 & -0.9 \\ -0.9 & 1 \end{pmatrix} \right)
\end{align*}
so that, given $X=x$, $Y$ and $Z$ are \textit{dependent}, each with mean $x$ but with correlation $-0.9$.
<<>>=
library(MASS)
set.seed(2111)                         #Set the random number generator seed value
n<-10000                               #Set the sample size
X<-rnorm(n,10,5)                       #Generate the X random variables
Sig.YZ<-matrix(c(1,-0.9,-0.9,1),2,2)   #Conditional variance-covariance for Y,Z given X
YZ<-mvrnorm(n,mu=c(0,0),Sigma=Sig.YZ)  #Generate the Y,Z variables
Y<-X+YZ[,1];Z<-X+YZ[,2]                #Change the mean according to X
cor(cbind(X,Y,Z))                      #Compute the unconditional correlation
@
We see that marginally all the variables are positively correlated.
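To check the conditional behaviour directly, we can compute the sample partial correlation of $Y$ and $Z$ given $X$ from the residuals of the separate regressions of $Y$ and $Z$ on $X$; this is a minimal sketch, and should recover a value close to the conditional correlation $-0.9$.
<<>>=
rY<-resid(lm(Y~X))  #Residuals of Y after removing the linear effect of X
rZ<-resid(lm(Z~X))  #Residuals of Z after removing the linear effect of X
cor(rY,rZ)          #Partial correlation of Y and Z given X: approximately -0.9
@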
We can see this further in a scatterplot matrix.
<<>>=
par(mar=c(4,4,1,0),pty='s')        #Set up the plotting margins
pairs(cbind(X,Y,Z),pch=19,cex=0.6) #Scatterplot matrix
@
There is evidently strong positive correlation between each pair of variables, including $Y$ and $Z$; this may be surprising, as conditionally they are negatively correlated.
<<>>=
par(mar=c(4,4,1,0),pty='s') #Set up the plotting margins
plot(Z,Y,pch=19,cex=0.6)
@

\medskip

We now examine three subsets defined by different ranges of $X$:
<<>>=
Y1<-Y[X>4.5 & X < 5.5];Z1<-Z[X>4.5 & X < 5.5];     #First subset analysis
Y2<-Y[X>9.5 & X < 10.5];Z2<-Z[X>9.5 & X < 10.5];   #Second subset analysis
Y3<-Y[X>14.5 & X < 15.5];Z3<-Z[X>14.5 & X < 15.5]; #Third subset analysis
par(mar=c(4,2,1,0),pty='s',mfrow=c(2,2))           #Set up the plotting margins
plot(Z1,Y1,pch=19,cex=0.6)
plot(Z2,Y2,pch=19,cex=0.6)
plot(Z3,Y3,pch=19,cex=0.6)
@
In each subset, we see the negative correlation from the original conditional specification. We can check this by regressing $Y$ on $Z$ for each subset.
<<>>=
coef(summary(lm(Y1~Z1)))                   #Group 1 regression
coef(summary(lm(Y2~Z2)))                   #Group 2 regression
coef(summary(lm(Y3~Z3)))                   #Group 3 regression
coef(summary(lm(c(Y1,Y2,Y3)~c(Z1,Z2,Z3)))) #Pooled regression
@
<<>>=
par(mar=c(4,4,1,0),pty='s')
plot(Z,Y,type='n')
points(Z1,Y1,pch=19,cex=0.6,col='red')
points(Z2,Y2,pch=19,cex=0.6,col='blue')
points(Z3,Y3,pch=19,cex=0.6,col='green')
abline(lm(c(Y1,Y2,Y3)~c(Z1,Z2,Z3)),lty=2)
abline(lm(Y1~Z1),col='red')
abline(lm(Y2~Z2),col='blue')
abline(lm(Y3~Z3),col='green')
legend(10,max(Y),c('Group 1','Group 2','Group 3','Pooled'),
col=c('red','blue','green','black'),lty=c(1,1,1,2))
@
We now repeat the analysis, but assume, as per the graph
\begin{figure}[ht]
\centering
\begin{tikzpicture}[scale=1.5]
% x node set with absolute coordinates
\node[state] (x) at (0,1) {${X}$};
\node[state] (y) at (-1,0) {${Y}$};
\node[state] (z) at (1,0) {${Z}$};
\path (x) edge (y);
\path (x) edge (z);
\end{tikzpicture}
\end{figure}
that $Y$ and $Z$ are \textbf{conditionally independent} given $X$.
<<>>=
set.seed(2111)                         #Set the random number generator seed value
n<-10000                               #Set the sample size
X<-rnorm(n,10,5)                       #Generate the X random variables
Sig.YZ<-matrix(c(1,0,0,1),2,2)         #Conditional independence for Y,Z given X
YZ<-mvrnorm(n,mu=c(0,0),Sigma=Sig.YZ)  #Generate the Y,Z variables
Y<-X+YZ[,1];Z<-X+YZ[,2]                #Change the mean according to X
cor(cbind(X,Y,Z))                      #Compute the unconditional correlation
@
We see that marginally all the variables are positively correlated.
<<>>=
par(mar=c(4,4,1,0),pty='s')        #Set up the plotting margins
pairs(cbind(X,Y,Z),pch=19,cex=0.6) #Scatterplot matrix
@
There is evidently strong positive correlation between each pair of variables, including $Y$ and $Z$.
<<>>=
par(mar=c(4,4,1,0),pty='s') #Set up the plotting margins
plot(Z,Y,pch=19,cex=0.6)
@

\medskip

We now examine three subsets defined by different ranges of $X$:
<<>>=
Y1<-Y[X>4.5 & X < 5.5];Z1<-Z[X>4.5 & X < 5.5];     #First subset analysis
Y2<-Y[X>9.5 & X < 10.5];Z2<-Z[X>9.5 & X < 10.5];   #Second subset analysis
Y3<-Y[X>14.5 & X < 15.5];Z3<-Z[X>14.5 & X < 15.5]; #Third subset analysis
par(mar=c(4,2,1,0),pty='s',mfrow=c(2,2))           #Set up the plotting margins
plot(Z1,Y1,pch=19,cex=0.6)
plot(Z2,Y2,pch=19,cex=0.6)
plot(Z3,Y3,pch=19,cex=0.6)
@
In each subset, we now see the negligible correlation implied by the conditional specification.
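As before, a residual-based check of the partial correlation of $Y$ and $Z$ given $X$ (again a minimal sketch) should now return a value close to zero.
<<>>=
cor(resid(lm(Y~X)),resid(lm(Z~X))) #Partial correlation of Y and Z given X: approximately 0
@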
We can also check this by regressing $Y$ on $Z$ for each subset.
<<>>=
coef(summary(lm(Y1~Z1)))                   #Group 1 regression
coef(summary(lm(Y2~Z2)))                   #Group 2 regression
coef(summary(lm(Y3~Z3)))                   #Group 3 regression
coef(summary(lm(c(Y1,Y2,Y3)~c(Z1,Z2,Z3)))) #Pooled regression
@
The slope coefficient is essentially zero in each case, confirming conditional independence, and yet marginally $Y$ and $Z$ are strongly positively correlated:
<<>>=
par(mar=c(4,4,1,0),pty='s')
plot(Z,Y,type='n')
points(Z1,Y1,pch=19,cex=0.6,col='red')
points(Z2,Y2,pch=19,cex=0.6,col='blue')
points(Z3,Y3,pch=19,cex=0.6,col='green')
abline(lm(c(Y1,Y2,Y3)~c(Z1,Z2,Z3)),lty=2)
abline(lm(Y1~Z1),col='red')
abline(lm(Y2~Z2),col='blue')
abline(lm(Y3~Z3),col='green')
legend(10,max(Y),c('Group 1','Group 2','Group 3','Pooled'),
col=c('red','blue','green','black'),lty=c(1,1,1,2))
@

\pagebreak

\section{Colliders}

\tikzset{
-Latex,auto,node distance =1 cm and 1 cm,semithick,
state/.style ={circle, draw, minimum width = 0.7 cm},
box/.style ={rectangle, draw, minimum width = 0.7 cm, fill=lightgray},
point/.style = {circle, draw, inner sep=0.08cm,fill,node contents={}},
bidirected/.style={Latex-Latex,dashed},
el/.style = {inner sep=3pt, align=left, sloped}
}

The following graph
\begin{figure}[ht]
\centering
\begin{tikzpicture}[scale=1.5]
% x node set with absolute coordinates
\node[state] (x) at (-1,0) {${X}$};
\node[state] (y) at (1,0) {${Y}$};
\node[state] (z) at (0,-1) {${Z}$};
\node[state] (u) at (0,0) {${U}$};
\path (x) edge (z);
\path (y) edge (z);
\path (u) edge (z);
\end{tikzpicture}
\end{figure}
\noindent encodes the dependencies between the four random variables $(X,Y,Z,U)$: the joint density can be represented as
\[
f_{X,Y,Z,U}(x,y,z,u) = f_X(x) f_Y(y) f_{U}(u) f_{Z|X,Y,U}(z|x,y,u)
\]
that is, $X,Y,U$ are mutually independent. Suppose that $X$ and $Y$ are distributed as standard Normal random variables, $U \sim \text{Normal}(0,0.1^2)$, and $Z = X + Y + U$.
<<>>=
set.seed(2101)      #Set the random number generator seed value
n<-10000            #Set the sample size
X<-rnorm(n,0,1)     #Generate the X random variables
Y<-rnorm(n,0,1)     #Generate the Y random variables
U<-rnorm(n,0,0.1)   #Generate the U random variables
Z<-X+Y+U
par(mar=c(3,2,1,0)) #Set up the plotting margins
pairs(cbind(X,Y,Z),pch=19,cex=0.6)
@
If we condition on the value of the collider $Z$, and inspect the joint density of $X$ and $Y$ given $Z$, we see that $X$ and $Y$ are conditionally \textbf{dependent}.
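A direct way to see this induced dependence is via regression; the following is a sketch using the simulated data. Since $Z = X + Y + U$ with $U$ of small variance, conditioning on $Z$ makes $X$ strongly negatively predictive of $Y$, even though $X$ and $Y$ are marginally independent.
<<>>=
summary(lm(Y~X))$coef   #Marginally: coefficient of X approximately 0
summary(lm(Y~X+Z))$coef #Conditioning on Z: coefficient of X approximately -1
@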
We can also see the dependence by restricting attention to narrow ranges of $Z$:
<<>>=
par(mar=c(3,2,1,0))                                #Set up the plotting margins
X1<-X[Z>-2.5 & Z < -1.5];Y1<-Y[Z>-2.5 & Z < -1.5]; #First subset analysis
X2<-X[Z>-0.5 & Z < 0.5];Y2<-Y[Z>-0.5 & Z < 0.5];   #Second subset analysis
X3<-X[Z>0.5 & Z < 1.5];Y3<-Y[Z>0.5 & Z < 1.5];     #Third subset analysis
par(mar=c(4,3,1,0),pty='s',mfrow=c(2,2))           #Set up the plotting margins
plot(X1,Y1,pch=19,cex=0.6)
plot(X2,Y2,pch=19,cex=0.6)
plot(X3,Y3,pch=19,cex=0.6)
cor(X1,Y1)
cor(X2,Y2)
cor(X3,Y3)
@

\pagebreak

\section{d-separation}

\tikzset{
-Latex,auto,node distance =1 cm and 1 cm,semithick,
state/.style ={circle, draw, minimum width = 0.7 cm},
box/.style ={rectangle, draw, minimum width = 0.7 cm, fill=lightgray},
point/.style = {circle, draw, inner sep=0.08cm,fill,node contents={}},
bidirected/.style={Latex-Latex,dashed},
el/.style = {inner sep=3pt, align=left, sloped}
}

\textbf{Example 1:} Consider the graph
\begin{figure}[ht]
\centering
\begin{tikzpicture}[scale=1.5]
% x node set with absolute coordinates
\node[state] (x1) at (0,2) {${X_1}$};
\node[state] (x2) at (1,1) {${X_2}$};
\node[state] (x3) at (-1,1) {${X_3}$};
\node[state] (x4) at (0,0) {${X_4}$};
\node[state] (x5) at (0,-1.25) {${X_5}$};
\path (x1) edge (x2);
\path (x1) edge (x3);
\path (x2) edge (x4);
\path (x3) edge (x4);
\path (x4) edge (x5);
\end{tikzpicture}
\end{figure}
The implied probability distribution is
\[
f_{X_1,X_2,X_3,X_4,X_5}(x_1,x_2,x_3,x_4,x_5) = f_{X_1}(x_1) f_{X_2|X_1}(x_2|x_1) f_{X_3|X_1}(x_3|x_1) f_{X_4|X_2,X_3}(x_4|x_2,x_3) f_{X_5|X_4}(x_5|x_4)
\]
There are two paths connecting $X_2$ and $X_3$, namely
\[
(X_2,X_1,X_3) \qquad (X_2,X_4,X_3)
\]
\begin{itemize}
\item the set $S \equiv \{X_1\}$ d-separates $X_2$ and $X_3$ as conditioning on $X_1$ blocks the path $(X_2,X_1,X_3)$; we do not need to consider the second path here, as that is already blocked by the collider $X_4$;

\medskip

\item the set $S \equiv \{X_1,X_4\}$ does not d-separate $X_2$ and $X_3$ as conditioning on the collider $X_4$ opens the path;

\medskip

\item the set $S \equiv \{X_1,X_5\}$ does not d-separate $X_2$ and $X_3$ as conditioning on $X_5$, a descendant of $X_4$, opens the path at the collider.
\end{itemize}
To illustrate these results, we perform a simulation:
<<>>=
set.seed(43)
n<-1000
X1<-rnorm(n,1,1)
X2<-rnorm(n,X1,1)
X3<-rnorm(n,X1,1)
X4<-rnorm(n,X2+X3,1)
X5<-rnorm(n,X4,1)
cor(cbind(X1,X2,X3,X4,X5))
@
We can check the d-separation here using regression. We condition on the relevant variables by including them as predictors.
\begin{itemize}[itemsep=0.1in]
\item $S \equiv \{X_1\}$ d-separates $X_2$ and $X_3$
<<>>=
summary(lm(X2~X3))$coef
summary(lm(X3~X2))$coef
@
The two variables $X_2$ and $X_3$ are dependent, as the slope coefficient is non-zero. We now include $X_1$ as a predictor.
<<>>=
summary(lm(X2~X3+X1))$coef
summary(lm(X3~X2+X1))$coef
@
In each case, after including $X_1$, the coefficient of the other regressor becomes close to zero.

\medskip

\item the set $S \equiv \{X_1,X_4\}$ does not d-separate $X_2$ and $X_3$
<<>>=
summary(lm(X2~X3+X1+X4))$coef
summary(lm(X3~X2+X1+X4))$coef
@
After including $X_1$ and $X_4$, the coefficient of $X_3$ in the first analysis, and of $X_2$ in the second analysis, becomes non-zero.

\medskip

\item the set $S \equiv \{X_1,X_5\}$ does not d-separate $X_2$ and $X_3$.
<<>>=
summary(lm(X2~X3+X1+X5))$coef
summary(lm(X3~X2+X1+X5))$coef
@
After including $X_1$ and $X_5$, the coefficient of $X_3$ in the first analysis, and of $X_2$ in the second analysis, becomes non-zero.
\end{itemize}
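These d-separation statements can also be checked algorithmically. The following is a sketch that assumes the \texttt{dagitty} package (available on CRAN) is installed.
<<>>=
library(dagitty)
g1 <- dagitty("dag{ X1 -> X2 ; X1 -> X3 ; X2 -> X4 ; X3 -> X4 ; X4 -> X5 }")
dseparated(g1,"X2","X3","X1")         #TRUE: {X1} d-separates X2 and X3
dseparated(g1,"X2","X3",c("X1","X4")) #FALSE: conditioning on the collider X4 opens the path
dseparated(g1,"X2","X3",c("X1","X5")) #FALSE: X5 is a descendant of the collider X4
@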
\textbf{Example 2:} Consider the graph
\begin{figure}[ht]
\centering
\begin{tikzpicture}[scale=1.5]
% x node set with absolute coordinates
\node[state] (x1) at (0,2) {${X_1}$};
\node[state] (x2) at (1,1) {${X_2}$};
\node[state] (x3) at (-1,1) {${X_3}$};
\node[state] (x4) at (0,0) {${X_4}$};
\node[state] (x5) at (0,-1.25) {${X_5}$};
\path (x1) edge (x2);
\path (x1) edge (x3);
\path (x2) edge (x4);
\path (x4) edge (x3);
\path (x4) edge (x5);
\end{tikzpicture}
\end{figure}
The implied probability distribution is
\[
f_{X_1,X_2,X_3,X_4,X_5}(x_1,x_2,x_3,x_4,x_5) = f_{X_1}(x_1) f_{X_2|X_1}(x_2|x_1) f_{X_4|X_2}(x_4|x_2) f_{X_3|X_1,X_4}(x_3|x_1,x_4) f_{X_5|X_4}(x_5|x_4)
\]
The same two paths connecting $X_2$ and $X_3$ are present, but
\begin{itemize}
\item the set $S \equiv \{X_1\}$ does not d-separate $X_2$ and $X_3$: conditioning on $X_1$ blocks the path $(X_2,X_1,X_3)$, but the path $(X_2,X_4,X_3)$ is a chain with no collider, and remains open;

\medskip

\item the set $S \equiv \{X_1,X_4\}$ d-separates $X_2$ and $X_3$ as conditioning on $X_1$ and $X_4$ blocks both paths;

\medskip

\item the set $S \equiv \{X_1,X_5\}$ does not d-separate $X_2$ and $X_3$ as the second path remains open at $X_4$.
\end{itemize}
We can check this in simulation:
<<>>=
X1<-rnorm(n,1,1)
X2<-rnorm(n,X1,1)
X4<-rnorm(n,X2,1)
X3<-rnorm(n,X1+X4,1)
X5<-rnorm(n,X4,1)
@
Again, we can check for d-separation in this case using regression.
\begin{itemize}[itemsep=0.1in]
\item $S \equiv \{X_1\}$ does not d-separate $X_2$ and $X_3$
<<>>=
summary(lm(X2~X3+X1))$coef
summary(lm(X3~X2+X1))$coef
@
In each case, after including $X_1$, the coefficient of the other regressor is non-zero.

\item the set $S \equiv \{X_1,X_4\}$ d-separates $X_2$ and $X_3$
<<>>=
summary(lm(X2~X3+X1+X4))$coef
summary(lm(X3~X2+X1+X4))$coef
@
After including $X_1$ and $X_4$, the coefficient of $X_3$ in the first analysis, and of $X_2$ in the second analysis, becomes essentially zero.

\item the set $S \equiv \{X_1,X_5\}$ does not d-separate $X_2$ and $X_3$.
<<>>=
summary(lm(X2~X3+X1+X5))$coef
summary(lm(X3~X2+X1+X5))$coef
@
After including $X_1$ and $X_5$, the coefficient of $X_3$ in the first analysis, and of $X_2$ in the second analysis, becomes non-zero.
\end{itemize}
In general, when considering d-separation of two variables $X$ and $Y$ by a set of variables $S$, we need to consider \textbf{all paths} between $X$ and $Y$, not merely those involving $S$; in the first example, however, $X_4$ is a collider, so we did not need to consider the second path when taking $S \equiv \{X_1\}$.

\bigskip

\textbf{Example 3:} Consider the DAG
\begin{figure}[h]
\centering
\begin{tikzpicture}[scale=1.5]
% x node set with absolute coordinates
\node[state] (x1) at (0,0) {${X_1}$};
\node[state] (x2) at (2,0) {${X_2}$};
\node[state] (z) at (-1,-2) {${Z}$};
\node[state] (w) at (1,-1) {${W}$};
\node[state] (y) at (3,-2) {${Y}$};
\path (x1) edge (z);
\path (x1) edge (w);
\path (x2) edge (w);
\path (x2) edge (y);
\path (w) edge (y);
\end{tikzpicture}
\end{figure}
\noindent which implies the structural decomposition
\[
f_{X_1}(x_1) f_{X_2}(x_2) f_{W|X_1,X_2}(w|x_1,x_2) f_{Z|X_1}(z|x_1) f_{Y|X_2,W}(y|x_2,w).
\]
For example, we might have
\begin{align*}
X_1 & \sim Normal(1,1) \\[6pt]
X_2 & \sim Normal(2,1) \\[6pt]
W|X_1=x_1,X_2=x_2 & \sim Normal(x_1+x_2,1)\\[6pt]
Z|X_1=x_1 & \sim Normal(2+2 x_1, 1)\\[6pt]
Y|X_2=x_2,W=w & \sim Normal(x_2+w,1)
\end{align*}
In this DAG, there are no directed paths from $Z$ to $Y$; thus, $Z$ is \textbf{not} a cause of $Y$.
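Using the \texttt{dagitty} package loaded above, we can list every path between $Z$ and $Y$ in this DAG and check whether it is open; this is a sketch, with the graph object \texttt{g3} defined here for later reuse.
<<>>=
g3 <- dagitty("dag{ X1 -> Z ; X1 -> W ; X2 -> W ; X2 -> Y ; W -> Y }")
paths(g3,"Z","Y") #Two undirected paths; one is open, one is blocked at the collider W
@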
If we unpick the structural specifications, we have either that
\[
Y = X_2 + W + \epsilon
\]
or
\[
Y = X_2 + (X_1 + X_2) + \varepsilon = X_1 + 2 X_2 + \varepsilon.
\]
Thus, manipulating $Z$ by intervention has no impact on $Y$. There are, however, two undirected paths
\begin{align*}
\text{Path 1: } \ & \ Z \leftarrow X_1 \rightarrow W \leftarrow X_2 \rightarrow Y\\[6pt]
\text{Path 2: } \ & \ Z \leftarrow X_1 \rightarrow W \rightarrow Y
\end{align*}
which have the potential to introduce bias into estimation of the (zero) causal effect.
<<>>=
set.seed(43)
n<-1000
X1<-rnorm(n,1,1)
X2<-rnorm(n,2,1)
W<-rnorm(n,X1+X2,1)
Z<-rnorm(n,2+2*X1,1)
Y<-rnorm(n,X2+W,1)
cor(cbind(X1,X2,W,Z,Y))
@
If we attempt an uncorrected analysis, and simply regress $Y$ on $Z$, the results imply that $Z$ does influence $Y$.
<<>>=
summary(lm(Y~Z))$coef
@
Therefore, we need to perform some form of adjustment that blocks the open backdoor path by conditioning. Now, $W$ is a collider on Path 1, so Path 1 is blocked without any conditioning; Path 2, however, is open, and confounds the effect of $Z$ on $Y$, so that
\[
Z \notindependent Y.
\]
Conditioning on $W$ blocks Path 2 at the chain node $W$, but \emph{opens} Path 1 at the collider, so again
\[
Z \notindependent Y \given W.
\]
Therefore, to block both paths, we need to condition on
\[
S \equiv \{W,X_2\} \qquad \text{or} \qquad S \equiv \{X_1\}.
\]
We consider five models
\begin{align*}
M_0 & \quad Y = Z + \epsilon\\[6pt]
M_1 & \quad Y = Z + W + \epsilon\\[6pt]
M_2 & \quad Y = Z + W + X_2 + \epsilon\\[6pt]
M_3 & \quad Y = Z + X_1 + \epsilon\\[6pt]
M_4 & \quad Y = Z + W + X_1 + X_2 + \epsilon
\end{align*}
Models $M_2, M_3$ and $M_4$ should yield unbiased estimators of the (null) effect of $Z$ on $Y$.
<<>>=
summary(lm(Y~Z))$coef         #M0 - uncorrected
summary(lm(Y~Z+W))$coef       #M1 - conditioned on W
summary(lm(Y~Z+W+X2))$coef    #M2 - conditioned on (W,X2)
summary(lm(Y~Z+X1))$coef      #M3 - conditioned on X1
summary(lm(Y~Z+W+X1+X2))$coef #M4 - conditioned on (W,X1,X2)
@
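The sufficient adjustment sets can also be computed algorithmically; this sketch again uses the \texttt{dagitty} graph \texttt{g3} defined above.
<<>>=
adjustmentSets(g3,exposure="Z",outcome="Y") #Minimal sufficient sets: {X1} and {W,X2}
@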
We now run a simulation study, replicating 1000 times for $n=500$.
<<>>=
nreps<-1000
n<-500
ests<-matrix(0,nrow=nreps,ncol=5)
for(irep in 1:nreps){
X1<-rnorm(n,1,1)
X2<-rnorm(n,2,1)
W<-rnorm(n,X1+X2,1)
Z<-rnorm(n,2+2*X1,1)
Y<-rnorm(n,X2+W,1)
ests[irep,1]<-coef(lm(Y~Z))[2]         #M0 - uncorrected
ests[irep,2]<-coef(lm(Y~Z+W))[2]       #M1 - conditioned on W
ests[irep,3]<-coef(lm(Y~Z+W+X2))[2]    #M2 - conditioned on (W,X2)
ests[irep,4]<-coef(lm(Y~Z+X1))[2]      #M3 - conditioned on X1
ests[irep,5]<-coef(lm(Y~Z+W+X1+X2))[2] #M4 - conditioned on (W,X1,X2)
}
Bias<-apply(ests,2,mean)*sqrt(n)
MSE<-apply(ests^2,2,mean)*n
names(Bias)<-names(MSE)<-c('M0','M1','M2','M3','M4')
Bias #Bias
MSE  #Mean-square error
@
Therefore we have the following results:
\begin{table}[h]
\centering
\begin{tabular}{clrr}
Model & & $\sqrt{n} \times \text{Bias}$ & $n \times \text{MSE}$\\[1pt]
\hline \\[-6pt]
$M_0$ & $Y = Z + \epsilon$ & \Sexpr{sprintf("%.4f",Bias[1])} & \Sexpr{sprintf("%.4f",MSE[1])}\\[6pt]
$M_1$ & $Y = Z + W + \epsilon$ & \Sexpr{sprintf("%.4f",Bias[2])} & \Sexpr{sprintf("%.4f",MSE[2])}\\[6pt]
$M_2$ & $Y = Z + W + X_2 + \epsilon$ & \Sexpr{sprintf("%.4f",Bias[3])} & \Sexpr{sprintf("%.4f",MSE[3])}\\[6pt]
$M_3$ & $Y = Z + X_1 + \epsilon$ & \Sexpr{sprintf("%.4f",Bias[4])} & \Sexpr{sprintf("%.4f",MSE[4])}\\[6pt]
$M_4$ & $Y = Z + W + X_1 + X_2 + \epsilon$ & \Sexpr{sprintf("%.4f",Bias[5])} & \Sexpr{sprintf("%.4f",MSE[5])}\\
\hline
\end{tabular}
\end{table}

\textbf{Example 4:} Consider the modified DAG
\begin{figure}[h]
\centering
\begin{tikzpicture}[scale=1.5]
% x node set with absolute coordinates
\node[state] (x1) at (0,0) {${X_1}$};
\node[state] (x2) at (2,0) {${X_2}$};
\node[state] (z) at (-1,-2) {${Z}$};
\node[state] (w) at (1,-1) {${W}$};
\node[state] (y) at (3,-2) {${Y}$};
\path (x1) edge (z);
\path (x1) edge (w);
\path (x2) edge (w);
\path (x2) edge (y);
\path (w) edge (y);
\path (z) edge (y);
\end{tikzpicture}
\end{figure}
\noindent which implies the structural decomposition
\[
f_{X_1}(x_1) f_{X_2}(x_2) f_{W|X_1,X_2}(w|x_1,x_2) f_{Z|X_1}(z|x_1) f_{Y|X_2,W,Z}(y|x_2,w,z).
\]
For example, we might have
\begin{align*}
X_1 & \sim Normal(1,1) \\[6pt]
X_2 & \sim Normal(2,1) \\[6pt]
W|X_1=x_1,X_2=x_2 & \sim Normal(x_1+x_2,1)\\[6pt]
Z|X_1=x_1 & \sim Normal(2+2 x_1, 1)\\[6pt]
Y|X_2=x_2,W=w,Z=z & \sim Normal(x_2+w+z,1)
\end{align*}
In this DAG, there is one directed path from $Z$ to $Y$; thus, $Z$ \textbf{is} a cause of $Y$. From the structural specifications, we have either that
\[
Y = X_2 + W + Z + \epsilon
\]
or
\[
Y = X_2 + (X_1 + X_2) + Z + \varepsilon = X_1 + 2 X_2 + Z + \varepsilon
\]
so that changing $Z$ by one unit by intervention changes the expected value of $Y$ by one unit. The same confounding paths are present as in Example 3. We again consider the five models; models $M_2, M_3$ and $M_4$ should yield unbiased estimators of the effect of $Z$ on $Y$, namely the coefficient $1$.
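Before the replication study, a single large-sample check (a sketch mirroring the Example 3 analysis; the sample size of 10000 is an arbitrary choice) confirms the bias of $M_0$ and the validity of $M_3$; the population slope in $M_0$ can be computed directly as $7/5=1.4$.
<<>>=
set.seed(43)
n<-10000
X1<-rnorm(n,1,1)
X2<-rnorm(n,2,1)
W<-rnorm(n,X1+X2,1)
Z<-rnorm(n,2+2*X1,1)
Y<-rnorm(n,X2+W+Z,1)
summary(lm(Y~Z))$coef    #M0 - uncorrected: biased upward, approximately 1.4
summary(lm(Y~Z+X1))$coef #M3 - conditioned on X1: coefficient of Z approximately 1
@
The full replication study proceeds as before.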
<<>>=
nreps<-1000
n<-500
ests<-matrix(0,nrow=nreps,ncol=5)
for(irep in 1:nreps){
X1<-rnorm(n,1,1)
X2<-rnorm(n,2,1)
W<-rnorm(n,X1+X2,1)
Z<-rnorm(n,2+2*X1,1)
Y<-rnorm(n,X2+W+Z,1)
ests[irep,1]<-coef(lm(Y~Z))[2]         #M0 - uncorrected
ests[irep,2]<-coef(lm(Y~Z+W))[2]       #M1 - conditioned on W
ests[irep,3]<-coef(lm(Y~Z+W+X2))[2]    #M2 - conditioned on (W,X2)
ests[irep,4]<-coef(lm(Y~Z+X1))[2]      #M3 - conditioned on X1
ests[irep,5]<-coef(lm(Y~Z+W+X1+X2))[2] #M4 - conditioned on (W,X1,X2)
}
Bias<-apply(ests-1,2,mean)*sqrt(n)
MSE<-apply((ests-1)^2,2,mean)*n
names(Bias)<-names(MSE)<-c('M0','M1','M2','M3','M4')
Bias #Bias
MSE  #Mean-square error
@
We have the following results:
\begin{table}[h]
\centering
\begin{tabular}{clrr}
Model & & $\sqrt{n} \times \text{Bias}$ & $n \times \text{MSE}$\\[1pt]
\hline \\[-6pt]
$M_0$ & $Y = Z + \epsilon$ & \Sexpr{sprintf("%.4f",Bias[1])} & \Sexpr{sprintf("%.4f",MSE[1])}\\[6pt]
$M_1$ & $Y = Z + W + \epsilon$ & \Sexpr{sprintf("%.4f",Bias[2])} & \Sexpr{sprintf("%.4f",MSE[2])}\\[6pt]
$M_2$ & $Y = Z + W + X_2 + \epsilon$ & \Sexpr{sprintf("%.4f",Bias[3])} & \Sexpr{sprintf("%.4f",MSE[3])}\\[6pt]
$M_3$ & $Y = Z + X_1 + \epsilon$ & \Sexpr{sprintf("%.4f",Bias[4])} & \Sexpr{sprintf("%.4f",MSE[4])}\\[6pt]
$M_4$ & $Y = Z + W + X_1 + X_2 + \epsilon$ & \Sexpr{sprintf("%.4f",Bias[5])} & \Sexpr{sprintf("%.4f",MSE[5])}\\
\hline
\end{tabular}
\end{table}

\pagebreak

\textbf{Example 5:} Consider the following DAG
\begin{figure}[h]
\centering
\begin{tikzpicture}[scale=1.5]
% x node set with absolute coordinates
\node[state] (x1) at (0,1) {${X_1}$};
\node[state] (x2) at (0,-1) {${X_2}$};
\node[state] (z) at (-1,0) {${Z}$};
\node[state] (y) at (1,0) {${Y}$};
\path (x1) edge (z);
\path (x2) edge (z);
\path (x1) edge (y);
\path (x2) edge (y);
\path (z) edge (y);
\end{tikzpicture}
\end{figure}
\noindent which implies the structural decomposition
\[
f_{X_1}(x_1) f_{X_2}(x_2) f_{Z|X_1,X_2}(z|x_1,x_2) f_{Y|X_1,X_2,Z}(y|x_1,x_2,z).
\]
Suppose that
\begin{align*}
X_1 & \sim Normal(1,1) \\[6pt]
X_2 & \sim Normal(2,1) \\[6pt]
Z|X_1=x_1,X_2=x_2 & \sim Normal(x_1+x_2,1)\\[6pt]
Y|X_1=x_1,X_2=x_2,Z=z & \sim Normal(x_1-x_2+z,1)
\end{align*}
In this DAG, there is one directed path from $Z$ to $Y$; thus, $Z$ \textbf{is} a cause of $Y$. From the structural specifications, we have that
\[
Y = X_1 - X_2 + Z + \epsilon
\]
so that changing $Z$ by one unit by intervention changes the expected value of $Y$ by one unit; substituting for $Z$, we also have
\[
Y = X_1 - X_2 + (X_1+X_2) + \varepsilon = 2 X_1 + \varepsilon.
\]
There are two confounding paths
\begin{align*}
\text{Path 1: } \ & \ Z \leftarrow X_1 \rightarrow Y\\[6pt]
\text{Path 2: } \ & \ Z \leftarrow X_2 \rightarrow Y.
\end{align*}
We consider four models
\begin{align*}
M_0 & \quad Y = Z + \epsilon\\[6pt]
M_1 & \quad Y = Z + X_1 + \epsilon\\[6pt]
M_2 & \quad Y = Z + X_2 + \epsilon\\[6pt]
M_3 & \quad Y = Z + X_1 + X_2 + \epsilon
\end{align*}
In a simulation study:
<<>>=
nreps<-1000
n<-500
ests<-matrix(0,nrow=nreps,ncol=4)
for(irep in 1:nreps){
X1<-rnorm(n,1,1)
X2<-rnorm(n,2,1)
Z<-rnorm(n,X1+X2,1)
Y<-rnorm(n,X1-X2+Z,1)
ests[irep,1]<-coef(lm(Y~Z))[2]       #M0 - uncorrected
ests[irep,2]<-coef(lm(Y~Z+X1))[2]    #M1 - conditioned on X1
ests[irep,3]<-coef(lm(Y~Z+X2))[2]    #M2 - conditioned on X2
ests[irep,4]<-coef(lm(Y~Z+X1+X2))[2] #M3 - conditioned on (X1,X2)
}
Bias<-apply(ests-1,2,mean)*sqrt(n)
MSE<-apply((ests-1)^2,2,mean)*n
names(Bias)<-names(MSE)<-c('M0','M1','M2','M3')
@
We have the following results:
\begin{table}[h]
\centering
\begin{tabular}{clrr}
Model & & $\sqrt{n} \times \text{Bias}$ & $n \times \text{MSE}$\\[1pt]
\hline \\[-6pt]
$M_0$ & $Y = Z + \epsilon$ & \Sexpr{sprintf("%.4f",Bias[1])} & \Sexpr{sprintf("%.4f",MSE[1])}\\[6pt]
$M_1$ & $Y = Z + X_1 + \epsilon$ & \Sexpr{sprintf("%.4f",Bias[2])} & \Sexpr{sprintf("%.4f",MSE[2])}\\[6pt]
$M_2$ & $Y = Z + X_2 + \epsilon$ & \Sexpr{sprintf("%.4f",Bias[3])} & \Sexpr{sprintf("%.4f",MSE[3])}\\[6pt]
$M_3$ & $Y = Z + X_1 + X_2 + \epsilon$ & \Sexpr{sprintf("%.4f",Bias[4])} & \Sexpr{sprintf("%.4f",MSE[4])}\\[6pt]
\hline
\end{tabular}
\end{table}
In this case, the \textbf{unadjusted} model also produces an unbiased estimator of the causal effect of $Z$ on $Y$. This is a consequence of the \textbf{specific} data generating process chosen: in the outcome model, the random variables $X_1-X_2$ and $Z$ are \textbf{uncorrelated}, as
\begin{align*}
\Cov[(X_1-X_2),Z] & = \Cov[(X_1-X_2),(X_1+X_2+\varepsilon)] \\[6pt]
& = \Cov[(X_1-X_2),(X_1+X_2)] + \Cov[(X_1-X_2),\varepsilon] \\[6pt]
& = \E[(X_1^2-X_2^2)] - \E[(X_1-X_2)] \E[(X_1+X_2)] + 0 & \text{as } \varepsilon \independent (X_1,X_2)\\[6pt]
& = \left\{(1+1)-(4+1)\right\} - (1-2)(1+2) = -3 + 3 = 0
\end{align*}
since if $X \sim Normal(\mu,\sigma^2)$, then $\E[X^2] = \mu^2 + \sigma^2$. If the simulation model is varied, $M_0$ \textbf{no longer} produces an unbiased result.
<<>>=
ests<-matrix(0,nrow=nreps,ncol=4)
for(irep in 1:nreps){
X1<-rnorm(n,1,1)
X2<-rnorm(n,2,1)
Z<-rnorm(n,X1+2*X2,1) #Model changed here!
Y<-rnorm(n,X1-X2+Z,1)
ests[irep,1]<-coef(lm(Y~Z))[2]       #M0 - uncorrected
ests[irep,2]<-coef(lm(Y~Z+X1))[2]    #M1 - conditioned on X1
ests[irep,3]<-coef(lm(Y~Z+X2))[2]    #M2 - conditioned on X2
ests[irep,4]<-coef(lm(Y~Z+X1+X2))[2] #M3 - conditioned on (X1,X2)
}
Bias<-apply(ests-1,2,mean)*sqrt(n)
MSE<-apply((ests-1)^2,2,mean)*n
names(Bias)<-names(MSE)<-c('M0','M1','M2','M3')
@
\begin{table}[h]
\centering
\begin{tabular}{clrr}
Model & & $\sqrt{n} \times \text{Bias}$ & $n \times \text{MSE}$\\[1pt]
\hline \\[-6pt]
$M_0$ & $Y = Z + \epsilon$ & \Sexpr{sprintf("%.4f",Bias[1])} & \Sexpr{sprintf("%.4f",MSE[1])}\\[6pt]
$M_1$ & $Y = Z + X_1 + \epsilon$ & \Sexpr{sprintf("%.4f",Bias[2])} & \Sexpr{sprintf("%.4f",MSE[2])}\\[6pt]
$M_2$ & $Y = Z + X_2 + \epsilon$ & \Sexpr{sprintf("%.4f",Bias[3])} & \Sexpr{sprintf("%.4f",MSE[3])}\\[6pt]
$M_3$ & $Y = Z + X_1 + X_2 + \epsilon$ & \Sexpr{sprintf("%.4f",Bias[4])} & \Sexpr{sprintf("%.4f",MSE[4])}\\[6pt]
\hline
\end{tabular}
\end{table}
Here, from the structural specifications, we again have that
\[
Y = X_1 - X_2 + Z + \epsilon
\]
so that changing $Z$ by one unit by intervention changes the expected value of $Y$ by one unit, but now
\[
Y = X_1 - X_2 + (X_1+2 X_2) + \varepsilon = 2 X_1 + X_2 + \varepsilon.
\]
In this case,
\[
\Cov[(X_1-X_2),Z] = \Cov[(X_1-X_2),(X_1+2X_2)] = 1 - 2 = -1 \neq 0
\]
so the cancellation no longer occurs, and the unadjusted model $M_0$ yields a biased estimator.

\section{Selection Bias}

The vignette for the \texttt{R} package \texttt{ggdag} contains two examples on selection bias
\begin{center}
\url{https://cran.r-project.org/web/packages/ggdag/vignettes/bias-structures.html}
\end{center}
that inspire the numerical examples below.

\bigskip

\textbf{Example 1:} Consider the DAG for binary variables
\begin{itemize}
\item $S$ -- smoking status
\item $G$ -- glioma
\item $H$ -- hospitalization
\item $B$ -- broken bone
\item $R$ -- tendency for reckless behaviour
\end{itemize}
We wish to study the association between $S$ and $G$ in a hospitalized cohort.
\begin{figure}[ht]
\centering
\begin{tikzpicture}[scale=1.5]
% x node set with absolute coordinates
\node[state] (h) at (1,1) {${H}$};
\node[state] (g) at (0,0) {${G}$};
\node[state] (b) at (2,0) {${B}$};
\node[state] (r) at (3,-1) {${R}$};
\node[state] (s) at (4,0) {${S}$};
\path (g) edge (h);
\path (b) edge (h);
\path (r) edge (b);
\path (r) edge (s);
\end{tikzpicture}
\end{figure}
The corresponding probability model factorizes as
\[
f_{R,B,S,G,H}(r,b,s,g,h) = f_{R}(r) f_G(g) f_{B|R}(b|r) f_{S|R}(s|r) f_{H|B,G}(h|b,g)
\]
In this graph, we have one path from $S$ to $G$:
\begin{itemize}
\item this is an undirected path,
\item the path is blocked at the collider $H$.
\end{itemize}
Therefore $G \independent S$, but conditioning on $H$ renders $G$ and $S$ \textbf{dependent}. That is, if we carry out a study in the general population, no association between $S$ and $G$ will be detected; however, if we only look at hospitalized subjects, an association will be detected.
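These statements can be verified algorithmically; this is a sketch that again assumes the \texttt{dagitty} package loaded earlier.
<<>>=
gsel <- dagitty("dag{ G -> H ; B -> H ; R -> B ; R -> S }")
dseparated(gsel,"G","S")     #TRUE: the path G -> H <- B <- R -> S is blocked at H
dseparated(gsel,"G","S","H") #FALSE: conditioning on the collider H opens the path
@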
<<>>=
set.seed(2384)
n<-1000000                   #Large population
R<-rbinom(n,1,0.2)           #Simulate R
G<-rbinom(n,1,0.02)          #Simulate G
pS<-c(0.05,0.25)             #Simulate S, with different probs for R=0 and R=1
S<-rbinom(n,1,pS[R+1])
pB<-c(0.01,0.10)             #Simulate B, with different probs for R=0 and R=1
B<-rbinom(n,1,pB[R+1])
pH<-c(0.01,0.9,0.05,0.95)    #Simulate H, with different probs for different values
H<-rbinom(n,1,pH[B+G+B*G+1]) #of the quantity B+G+B*G
cor(G,S)                     #No association in general population
cor(G[H==1],S[H==1])         #Negative correlation in hospitalized individuals
cor(G[H==0],S[H==0])         #No appreciable association in non-hospitalized individuals
@
We can explore the same results using regression:
<<>>=
my.data<-data.frame(R,G,S,B,H)
summary(lm(G~S,data=my.data))$coef #Analysis in general population
@
From the analysis in the general population, ignoring hospitalization status, there is no association between Smoking and Glioma. However, performing an analysis stratified by hospitalization status yields different results.
<<>>=
summary(lm(G~S,subset=(H==1),data=my.data))$coef #Analysis in hospitalized
summary(lm(G~S,subset=(H==0),data=my.data))$coef #Analysis in non-hospitalized
@
Thus, from the analysis of hospitalized subjects, it appears (spuriously) that Smoking is protective against Glioma.

\pagebreak

\textbf{Example 2:} Consider the DAG for variables
\begin{itemize}
\item $C$ -- CD4 count (continuous)
\item $D$ -- assignment of new HIV drug (binary)
\item $H$ -- underlying HIV severity (binary)
\item $S$ -- symptoms (binary)
\item $U$ -- follow-up indicator (binary, $U=1$ implies that the participant stays in the trial)
\end{itemize}
We wish to examine the impact of $D$ on $C$ by studying patients who stay in the trial. Suppose the relevant DAG is
\begin{figure}[ht]
\centering
\begin{tikzpicture}[scale=1.5]
% x node set with absolute coordinates
\node[state] (c) at (0,0) {${C}$};
\node[state] (h) at (1,0) {${H}$};
\node[state] (s) at (2,0) {${S}$};
\node[state] (u) at (3,1) {${U}$};
\node[state] (d) at (3,-1) {${D}$};
\path (h) edge (c);
\path (h) edge (s);
\path (s) edge (u);
\path (d) edge (s);
\end{tikzpicture}
\end{figure}
The corresponding probability model factorizes as
\[
f_{H,D,C,S,U}(h,d,c,s,u) = f_{H}(h) f_D(d) f_{C|H}(c|h) f_{S|H,D}(s|h,d) f_{U|S}(u|s)
\]
In this graph, we have one path from $D$ to $C$: this is an undirected path, but it is blocked at the collider $S$. Therefore $D \independent C$. However, conditioning on $U$, which is a descendant of $S$, renders $D$ and $C$ \textbf{dependent}. Therefore, regressing $C$ on $D$ in the subgroup where $U = 1$ (that is, individuals who are followed up in the trial) will indicate an association.
<<>>=
set.seed(2384)
n<-1000000                   #Large population
H<-rbinom(n,1,0.2)           #Simulate H
D<-rbinom(n,1,0.5)           #Simulate D
C<-rnorm(n,100-20*H,10)      #Simulate C
pS<-c(0.01,0.9,0.05,0.95)    #Simulate S, with different probs for different values
S<-rbinom(n,1,pS[H+D+H*D+1]) #of the quantity H+D+H*D
pU<-c(0.9,0.3)               #Simulate U
U<-rbinom(n,1,pU[S+1])
coef(summary(lm(C~D)))
coef(summary(lm(C~D, subset=(U==1))))
coef(summary(lm(C~D, subset=(U==0))))
@

\section{M-bias}

\tikzset{
-Latex,auto,node distance =1 cm and 1 cm,semithick,
state/.style ={circle, draw, minimum width = 0.7 cm},
box/.style ={rectangle, draw, minimum width = 0.7 cm, fill=lightgray},
point/.style = {circle, draw, inner sep=0.08cm,fill,node contents={}},
bidirected/.style={Latex-Latex,dashed},
el/.style = {inner sep=3pt, align=left, sloped}
}

Consider the DAG with two unmeasured confounders:

\medskip

\begin{figure}[ht]
\centering
\begin{tikzpicture}[scale=1.5]
% x node set with absolute coordinates
\node[state, fill=red] (u1) at (0,0) {${U_1}$};
\node[state, fill=red] (u2) at (2,0) {${U_2}$};
\node[state] (z) at (-1,-2) {${Z}$};
\node[state] (x) at (1,-1) {${X}$};
\node[state] (y) at (3,-2) {${Y}$};
\path[red] (u1) edge (z);
\path[red] (u1) edge (x);
\path[red] (u2) edge (x);
\path[red] (u2) edge (y);
\end{tikzpicture}
\end{figure}
We have that $Z$ and $Y$ are \emph{independent}: the (true but hidden) path between them, $Z \leftarrow U_1 \rightarrow X \leftarrow U_2 \rightarrow Y$, is blocked at the collider $X$. Note, however, that $X$ is marginally dependent on each of $Z$ and $Y$ through the unmeasured confounders.
<<>>=
set.seed(43)
n<-1000
U1<-rnorm(n,1,1)
U2<-rnorm(n,2,1)
X<-rnorm(n,U1+U2,1)
Z<-rnorm(n,2+2*U1,1)
Y<-rnorm(n,3-U2,1)
cor(cbind(U1,U2,X,Z,Y))
summary(lm(Y~Z))$coef
@
It is evident that $Z$ and $Y$ are not associated; they are uncorrelated, and the estimated coefficient of $Z$ in the regression of $Y$ on $Z$ is close to zero.

\medskip

In an analysis, however, suppose we condition on $X$: the relevant DAG, with the analyst's assumed model in black and the hidden structure in red, is presented below.
\begin{figure}[ht]
\centering
\begin{tikzpicture}[scale=1.5]
% x node set with absolute coordinates
\node[state, fill=red] (u1) at (0,0) {${U_1}$};
\node[state, fill=red] (u2) at (2,0) {${U_2}$};
\node[state] (z) at (-1,-2) {${Z}$};
\node[state] (x) at (1,-1) {${X}$};
\node[state] (y) at (3,-2) {${Y}$};
\path[red] (u1) edge (z);
\path[red] (u1) edge (x);
\path[red] (u2) edge (x);
\path[red] (u2) edge (y);
\path (x) edge (z);
\path (x) edge (y);
\end{tikzpicture}
\end{figure}
In the modelled DAG, $Y \independent Z \given X$; however, conditioning on $X$ \emph{opens} the \emph{hidden} path through $U_1$ and $U_2$, so there is now an open biasing path.
<<>>=
summary(lm(Y~Z+X))$coef
@
Inclusion of $X$ in the regression induces bias into the estimation.

\end{document}