\documentclass[notitlepage,11pt]{article}
\usepackage{Math598}
\usepackage{listings}
\usepackage{numprint}
\usepackage{enumitem}
\usepackage{bbm}
\usepackage{amsfonts,amsmath}
\usepackage{hyperref}
\usepackage{tikz}
\usetikzlibrary{shapes,decorations,arrows,calc,arrows.meta,fit,positioning}
\def\E{\Expect}
\renewcommand{\independent}{\perp\mkern-9.5mu\perp}
\renewcommand{\notindependent}{\slashed{\independent}}
\usepackage{scalerel}
\usepackage{centernot}
\usepackage{slashed}
\def\given{ \ | \ }
\lstloadlanguages{R}
\definecolor{keywordcolor}{rgb}{0,0.6,0.6}
\definecolor{delimcolor}{rgb}{0.461,0.039,0.102}
\definecolor{Rcommentcolor}{rgb}{0.101,0.043,0.432}
\lstdefinestyle{Rsettings}{
basicstyle=\ttfamily,
breaklines=true,
showstringspaces=false,
keywords={if, else, function, theFunction, tmp}, % Write as many keywords
otherkeywords={},
commentstyle=\itshape\color{Rcommentcolor},
keywordstyle=\color{keywordcolor},
moredelim=[s][\color{delimcolor}]{"}{"},
}
\lstset{basicstyle=\ttfamily, numbers=none, literate={~} {$\sim$}{2}}
\usepackage{titlesec}
\titleformat{\section}
{\large\normalfont\scshape}{\thesection.}{1em}{}
\parindent0in

\begin{document}

\begin{center}
{\textsclarge{Using Causal Graphs In Epidemiological Research}}

\medskip

{\textsclarge{Simulations and Examples}}
\end{center}

<<>>=
library(knitr)
# global chunk options
opts_chunk$set(cache=TRUE, autodep=TRUE)
#options(scipen=999)
options(repos=c(CRAN="https://cloud.r-project.org/"))
@

\tikzset{
-Latex,auto,node distance =1 cm and 1 cm,semithick,
state/.style ={circle, draw, minimum width = 0.7 cm},
box/.style ={rectangle, draw, minimum width = 0.7 cm, fill=lightgray},
point/.style = {circle, draw, inner sep=0.08cm,fill,node contents={}},
bidirected/.style={Latex-Latex,dashed},
el/.style = {inner sep=3pt, align=left, sloped}
}

\section{Conditional independence and dependence}

In this example, we illustrate the difference between conditional independence and conditional dependence. Suppose we have the simple graph
\begin{figure}[ht]
\centering
\begin{tikzpicture}[scale=1.5]
% x node set with absolute coordinates
\node[state] (x) at (0,1) {${X}$};
\node[state] (y) at (-1,0) {${Y}$};
\node[state] (z) at (1,0) {${Z}$};
\path (x) edge (y);
\path (x) edge (z);
\path (y) edge[bidirected] (z);
\end{tikzpicture}
\end{figure}
where the `bidirected' edge between $Y$ and $Z$ indicates a general dependence between those variables. We can read off the joint distribution
\[
f_X(x) f_{Y,Z|X}(y,z|x).
\]
We generate data under the assumption that the variables are jointly Normally distributed, with
\begin{align*}
X & \sim Normal(10,5^2) \\[6pt]
(Y,Z) | X = x & \sim Normal \left(\begin{pmatrix} x \\ x \end{pmatrix}, \begin{pmatrix} 1 & -0.9 \\ -0.9 & 1 \end{pmatrix} \right)
\end{align*}
so that, given $X=x$, $Y$ and $Z$ are \textit{dependent}, each with mean $x$ but with correlation $-0.9$.
<<>>=
library(MASS)
set.seed(2111)                         #Set the random number generator seed value
n<-10000                               #Set the sample size
X<-rnorm(n,10,5)                       #Generate the X random variables
Sig.YZ<-matrix(c(1,-0.9,-0.9,1),2,2)   #Conditional variance-covariance for Y,Z given X
YZ<-mvrnorm(n,mu=c(0,0),Sigma=Sig.YZ)  #Generate the Y,Z variables
Y<-X+YZ[,1];Z<-X+YZ[,2]                #Change the mean according to X
cor(cbind(X,Y,Z))                      #Compute the unconditional correlation
@
We see that marginally all the variables are positively correlated.
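To check the conditional behaviour directly, we can compute the sample partial correlation of $Y$ and $Z$ given $X$ from the residuals of the separate regressions of $Y$ and $Z$ on $X$; this is a minimal sketch, and should recover a value close to the conditional correlation $-0.9$.
<<>>=
rY<-resid(lm(Y~X))  #Residuals of Y after removing the linear effect of X
rZ<-resid(lm(Z~X))  #Residuals of Z after removing the linear effect of X
cor(rY,rZ)          #Partial correlation of Y and Z given X: approximately -0.9
@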
We can see this further in a scatterplot matrix.
<<>>=
par(mar=c(4,4,1,0),pty='s')        #Set up the plotting margins
pairs(cbind(X,Y,Z),pch=19,cex=0.6) #Scatterplot matrix
@
There is evidently strong positive correlation between each pair of variables, including $Y$ and $Z$; this may be surprising, as conditionally they are negatively correlated.
<<>>=
par(mar=c(4,4,1,0),pty='s') #Set up the plotting margins
plot(Z,Y,pch=19,cex=0.6)
@

\medskip

We now examine three subsets defined by different ranges of $X$:
<<>>=
Y1<-Y[X>4.5 & X < 5.5];Z1<-Z[X>4.5 & X < 5.5];     #First subset analysis
Y2<-Y[X>9.5 & X < 10.5];Z2<-Z[X>9.5 & X < 10.5];   #Second subset analysis
Y3<-Y[X>14.5 & X < 15.5];Z3<-Z[X>14.5 & X < 15.5]; #Third subset analysis
par(mar=c(4,2,1,0),pty='s',mfrow=c(2,2))           #Set up the plotting margins
plot(Z1,Y1,pch=19,cex=0.6)
plot(Z2,Y2,pch=19,cex=0.6)
plot(Z3,Y3,pch=19,cex=0.6)
@
In each subset, we see the negative correlation from the original conditional specification. We can check this by regressing $Y$ on $Z$ for each subset.
<<>>=
coef(summary(lm(Y1~Z1)))                   #Group 1 regression
coef(summary(lm(Y2~Z2)))                   #Group 2 regression
coef(summary(lm(Y3~Z3)))                   #Group 3 regression
coef(summary(lm(c(Y1,Y2,Y3)~c(Z1,Z2,Z3)))) #Pooled regression
@
<<>>=
par(mar=c(4,4,1,0),pty='s')
plot(Z,Y,type='n')
points(Z1,Y1,pch=19,cex=0.6,col='red')
points(Z2,Y2,pch=19,cex=0.6,col='blue')
points(Z3,Y3,pch=19,cex=0.6,col='green')
abline(lm(c(Y1,Y2,Y3)~c(Z1,Z2,Z3)),lty=2)
abline(lm(Y1~Z1),col='red')
abline(lm(Y2~Z2),col='blue')
abline(lm(Y3~Z3),col='green')
legend(10,max(Y),c('Group 1','Group 2','Group 3','Pooled'),
col=c('red','blue','green','black'),lty=c(1,1,1,2))
@
We now repeat the analysis, but assume, as per the graph
\begin{figure}[ht]
\centering
\begin{tikzpicture}[scale=1.5]
% x node set with absolute coordinates
\node[state] (x) at (0,1) {${X}$};
\node[state] (y) at (-1,0) {${Y}$};
\node[state] (z) at (1,0) {${Z}$};
\path (x) edge (y);
\path (x) edge (z);
\end{tikzpicture}
\end{figure}
that $Y$ and $Z$ are \textbf{conditionally independent} given $X$.
<<>>=
set.seed(2111)                         #Set the random number generator seed value
n<-10000                               #Set the sample size
X<-rnorm(n,10,5)                       #Generate the X random variables
Sig.YZ<-matrix(c(1,0,0,1),2,2)         #Conditional independence for Y,Z given X
YZ<-mvrnorm(n,mu=c(0,0),Sigma=Sig.YZ)  #Generate the Y,Z variables
Y<-X+YZ[,1];Z<-X+YZ[,2]                #Change the mean according to X
cor(cbind(X,Y,Z))                      #Compute the unconditional correlation
@
We see that marginally all the variables are positively correlated.
<<>>=
par(mar=c(4,4,1,0),pty='s')        #Set up the plotting margins
pairs(cbind(X,Y,Z),pch=19,cex=0.6) #Scatterplot matrix
@
There is evidently strong positive correlation between each pair of variables, including $Y$ and $Z$.
<<>>=
par(mar=c(4,4,1,0),pty='s') #Set up the plotting margins
plot(Z,Y,pch=19,cex=0.6)
@

\medskip

We now examine three subsets defined by different ranges of $X$:
<<>>=
Y1<-Y[X>4.5 & X < 5.5];Z1<-Z[X>4.5 & X < 5.5];     #First subset analysis
Y2<-Y[X>9.5 & X < 10.5];Z2<-Z[X>9.5 & X < 10.5];   #Second subset analysis
Y3<-Y[X>14.5 & X < 15.5];Z3<-Z[X>14.5 & X < 15.5]; #Third subset analysis
par(mar=c(4,2,1,0),pty='s',mfrow=c(2,2))           #Set up the plotting margins
plot(Z1,Y1,pch=19,cex=0.6)
plot(Z2,Y2,pch=19,cex=0.6)
plot(Z3,Y3,pch=19,cex=0.6)
@
In each subset, we now see the negligible correlation implied by the conditional specification.
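As before, a residual-based check of the partial correlation of $Y$ and $Z$ given $X$ (again a minimal sketch) should now return a value close to zero.
<<>>=
cor(resid(lm(Y~X)),resid(lm(Z~X))) #Partial correlation of Y and Z given X: approximately 0
@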
We can also check this by regressing $Y$ on $Z$ for each subset.
<<>>=
coef(summary(lm(Y1~Z1)))                   #Group 1 regression
coef(summary(lm(Y2~Z2)))                   #Group 2 regression
coef(summary(lm(Y3~Z3)))                   #Group 3 regression
coef(summary(lm(c(Y1,Y2,Y3)~c(Z1,Z2,Z3)))) #Pooled regression
@
The slope coefficient is essentially zero in each case, confirming conditional independence, and yet marginally $Y$ and $Z$ are strongly positively correlated:
<<>>=
par(mar=c(4,4,1,0),pty='s')
plot(Z,Y,type='n')
points(Z1,Y1,pch=19,cex=0.6,col='red')
points(Z2,Y2,pch=19,cex=0.6,col='blue')
points(Z3,Y3,pch=19,cex=0.6,col='green')
abline(lm(c(Y1,Y2,Y3)~c(Z1,Z2,Z3)),lty=2)
abline(lm(Y1~Z1),col='red')
abline(lm(Y2~Z2),col='blue')
abline(lm(Y3~Z3),col='green')
legend(10,max(Y),c('Group 1','Group 2','Group 3','Pooled'),
col=c('red','blue','green','black'),lty=c(1,1,1,2))
@

\pagebreak

\section{Colliders}

\tikzset{
-Latex,auto,node distance =1 cm and 1 cm,semithick,
state/.style ={circle, draw, minimum width = 0.7 cm},
box/.style ={rectangle, draw, minimum width = 0.7 cm, fill=lightgray},
point/.style = {circle, draw, inner sep=0.08cm,fill,node contents={}},
bidirected/.style={Latex-Latex,dashed},
el/.style = {inner sep=3pt, align=left, sloped}
}

The following graph
\begin{figure}[ht]
\centering
\begin{tikzpicture}[scale=1.5]
% x node set with absolute coordinates
\node[state] (x) at (-1,0) {${X}$};
\node[state] (y) at (1,0) {${Y}$};
\node[state] (z) at (0,-1) {${Z}$};
\node[state] (u) at (0,0) {${U}$};
\path (x) edge (z);
\path (y) edge (z);
\path (u) edge (z);
\end{tikzpicture}
\end{figure}
\noindent encodes the dependencies between the four random variables $(X,Y,Z,U)$: the joint density can be represented as
\[
f_{X,Y,Z,U}(x,y,z,u) = f_X(x) f_Y(y) f_{U}(u) f_{Z|X,Y,U}(z|x,y,u)
\]
that is, $X,Y,U$ are mutually independent. Suppose that $X$ and $Y$ are distributed as standard Normal random variables, $U \sim \text{Normal}(0,0.1^2)$, and $Z = X + Y + U$.
<<>>=
set.seed(2101)      #Set the random number generator seed value
n<-10000            #Set the sample size
X<-rnorm(n,0,1)     #Generate the X random variables
Y<-rnorm(n,0,1)     #Generate the Y random variables
U<-rnorm(n,0,0.1)   #Generate the U random variables
Z<-X+Y+U
par(mar=c(3,2,1,0)) #Set up the plotting margins
pairs(cbind(X,Y,Z),pch=19,cex=0.6)
@
If we condition on the value of the collider $Z$, and inspect the joint density of $X$ and $Y$ given $Z$, we see that $X$ and $Y$ are conditionally \textbf{dependent}.
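A direct way to see this induced dependence is via regression; the following is a sketch using the simulated data. Since $Z = X + Y + U$ with $U$ of small variance, conditioning on $Z$ makes $X$ strongly negatively predictive of $Y$, even though $X$ and $Y$ are marginally independent.
<<>>=
summary(lm(Y~X))$coef   #Marginally: coefficient of X approximately 0
summary(lm(Y~X+Z))$coef #Conditioning on Z: coefficient of X approximately -1
@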
We can also see the dependence by restricting attention to narrow ranges of $Z$:
<<>>=
par(mar=c(3,2,1,0))                                #Set up the plotting margins
X1<-X[Z>-2.5 & Z < -1.5];Y1<-Y[Z>-2.5 & Z < -1.5]; #First subset analysis
X2<-X[Z>-0.5 & Z < 0.5];Y2<-Y[Z>-0.5 & Z < 0.5];   #Second subset analysis
X3<-X[Z>0.5 & Z < 1.5];Y3<-Y[Z>0.5 & Z < 1.5];     #Third subset analysis
par(mar=c(4,3,1,0),pty='s',mfrow=c(2,2))           #Set up the plotting margins
plot(X1,Y1,pch=19,cex=0.6)
plot(X2,Y2,pch=19,cex=0.6)
plot(X3,Y3,pch=19,cex=0.6)
cor(X1,Y1)
cor(X2,Y2)
cor(X3,Y3)
@

\pagebreak

\section{d-separation}

\tikzset{
-Latex,auto,node distance =1 cm and 1 cm,semithick,
state/.style ={circle, draw, minimum width = 0.7 cm},
box/.style ={rectangle, draw, minimum width = 0.7 cm, fill=lightgray},
point/.style = {circle, draw, inner sep=0.08cm,fill,node contents={}},
bidirected/.style={Latex-Latex,dashed},
el/.style = {inner sep=3pt, align=left, sloped}
}

\textbf{Example 1:} Consider the graph
\begin{figure}[ht]
\centering
\begin{tikzpicture}[scale=1.5]
% x node set with absolute coordinates
\node[state] (x1) at (0,2) {${X_1}$};
\node[state] (x2) at (1,1) {${X_2}$};
\node[state] (x3) at (-1,1) {${X_3}$};
\node[state] (x4) at (0,0) {${X_4}$};
\node[state] (x5) at (0,-1.25) {${X_5}$};
\path (x1) edge (x2);
\path (x1) edge (x3);
\path (x2) edge (x4);
\path (x3) edge (x4);
\path (x4) edge (x5);
\end{tikzpicture}
\end{figure}
The implied probability distribution is
\[
f_{X_1,X_2,X_3,X_4,X_5}(x_1,x_2,x_3,x_4,x_5) = f_{X_1}(x_1) f_{X_2|X_1}(x_2|x_1) f_{X_3|X_1}(x_3|x_1) f_{X_4|X_2,X_3}(x_4|x_2,x_3) f_{X_5|X_4}(x_5|x_4)
\]
There are two paths connecting $X_2$ and $X_3$, namely
\[
(X_2,X_1,X_3) \qquad (X_2,X_4,X_3)
\]
\begin{itemize}
\item the set $S \equiv \{X_1\}$ d-separates $X_2$ and $X_3$ as conditioning on $X_1$ blocks the path $(X_2,X_1,X_3)$; we do not need to consider the second path here, as that is already blocked by the collider $X_4$;

\medskip

\item the set $S \equiv \{X_1,X_4\}$ does not d-separate $X_2$ and $X_3$ as conditioning on the collider $X_4$ opens the path;

\medskip

\item the set $S \equiv \{X_1,X_5\}$ does not d-separate $X_2$ and $X_3$ as conditioning on $X_5$, a descendant of $X_4$, opens the path at the collider.
\end{itemize}
To illustrate these results, we perform a simulation:
<<>>=
set.seed(43)
n<-1000
X1<-rnorm(n,1,1)
X2<-rnorm(n,X1,1)
X3<-rnorm(n,X1,1)
X4<-rnorm(n,X2+X3,1)
X5<-rnorm(n,X4,1)
cor(cbind(X1,X2,X3,X4,X5))
@
We can check the d-separation here using regression. We condition on the relevant variables by including them as predictors.
\begin{itemize}[itemsep=0.1in]
\item $S \equiv \{X_1\}$ d-separates $X_2$ and $X_3$
<<>>=
summary(lm(X2~X3))$coef
summary(lm(X3~X2))$coef
@
The two variables $X_2$ and $X_3$ are dependent, as the slope coefficient is non-zero. We now include $X_1$ as a predictor.
<<>>=
summary(lm(X2~X3+X1))$coef
summary(lm(X3~X2+X1))$coef
@
In each case, after including $X_1$, the coefficient of the other regressor becomes close to zero.

\medskip

\item the set $S \equiv \{X_1,X_4\}$ does not d-separate $X_2$ and $X_3$
<<>>=
summary(lm(X2~X3+X1+X4))$coef
summary(lm(X3~X2+X1+X4))$coef
@
After including $X_1$ and $X_4$, the coefficient of $X_3$ in the first analysis, and of $X_2$ in the second analysis, becomes non-zero.

\medskip

\item the set $S \equiv \{X_1,X_5\}$ does not d-separate $X_2$ and $X_3$.
<<>>=
summary(lm(X2~X3+X1+X5))$coef
summary(lm(X3~X2+X1+X5))$coef
@
After including $X_1$ and $X_5$, the coefficient of $X_3$ in the first analysis, and of $X_2$ in the second analysis, becomes non-zero.
\end{itemize}
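These d-separation statements can also be checked algorithmically. The following is a sketch that assumes the \texttt{dagitty} package (available on CRAN) is installed.
<<>>=
library(dagitty)
g1 <- dagitty("dag{ X1 -> X2 ; X1 -> X3 ; X2 -> X4 ; X3 -> X4 ; X4 -> X5 }")
dseparated(g1,"X2","X3","X1")         #TRUE: {X1} d-separates X2 and X3
dseparated(g1,"X2","X3",c("X1","X4")) #FALSE: conditioning on the collider X4 opens the path
dseparated(g1,"X2","X3",c("X1","X5")) #FALSE: X5 is a descendant of the collider X4
@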
\textbf{Example 2:} Consider the graph
\begin{figure}[ht]
\centering
\begin{tikzpicture}[scale=1.5]
% x node set with absolute coordinates
\node[state] (x1) at (0,2) {${X_1}$};
\node[state] (x2) at (1,1) {${X_2}$};
\node[state] (x3) at (-1,1) {${X_3}$};
\node[state] (x4) at (0,0) {${X_4}$};
\node[state] (x5) at (0,-1.25) {${X_5}$};
\path (x1) edge (x2);
\path (x1) edge (x3);
\path (x2) edge (x4);
\path (x4) edge (x3);
\path (x4) edge (x5);
\end{tikzpicture}
\end{figure}
The implied probability distribution is
\[
f_{X_1,X_2,X_3,X_4,X_5}(x_1,x_2,x_3,x_4,x_5) = f_{X_1}(x_1) f_{X_2|X_1}(x_2|x_1) f_{X_4|X_2}(x_4|x_2) f_{X_3|X_1,X_4}(x_3|x_1,x_4) f_{X_5|X_4}(x_5|x_4)
\]
The same two paths connecting $X_2$ and $X_3$ are present, but
\begin{itemize}
\item the set $S \equiv \{X_1\}$ does not d-separate $X_2$ and $X_3$: conditioning on $X_1$ blocks the path $(X_2,X_1,X_3)$, but the path $(X_2,X_4,X_3)$ is a chain with no collider, and remains open;

\medskip

\item the set $S \equiv \{X_1,X_4\}$ d-separates $X_2$ and $X_3$ as conditioning on $X_1$ and $X_4$ blocks both paths;

\medskip

\item the set $S \equiv \{X_1,X_5\}$ does not d-separate $X_2$ and $X_3$ as the second path remains open at $X_4$.
\end{itemize}
We can check this in simulation:
<<>>=
X1<-rnorm(n,1,1)
X2<-rnorm(n,X1,1)
X4<-rnorm(n,X2,1)
X3<-rnorm(n,X1+X4,1)
X5<-rnorm(n,X4,1)
@
Again, we can check for d-separation in this case using regression.
\begin{itemize}[itemsep=0.1in]
\item $S \equiv \{X_1\}$ does not d-separate $X_2$ and $X_3$
<<>>=
summary(lm(X2~X3+X1))$coef
summary(lm(X3~X2+X1))$coef
@
In each case, after including $X_1$, the coefficient of the other regressor is non-zero.

\item the set $S \equiv \{X_1,X_4\}$ d-separates $X_2$ and $X_3$
<<>>=
summary(lm(X2~X3+X1+X4))$coef
summary(lm(X3~X2+X1+X4))$coef
@
After including $X_1$ and $X_4$, the coefficient of $X_3$ in the first analysis, and of $X_2$ in the second analysis, becomes essentially zero.

\item the set $S \equiv \{X_1,X_5\}$ does not d-separate $X_2$ and $X_3$.
<<>>=
summary(lm(X2~X3+X1+X5))$coef
summary(lm(X3~X2+X1+X5))$coef
@
After including $X_1$ and $X_5$, the coefficient of $X_3$ in the first analysis, and of $X_2$ in the second analysis, becomes non-zero.
\end{itemize}
In general, when considering d-separation of two variables $X$ and $Y$ by a set of variables $S$, we need to consider \textbf{all paths} between $X$ and $Y$, not merely those involving $S$; in the first example, however, $X_4$ is a collider, so we did not need to consider the second path when taking $S \equiv \{X_1\}$.

\bigskip

\textbf{Example 3:} Consider the DAG
\begin{figure}[h]
\centering
\begin{tikzpicture}[scale=1.5]
% x node set with absolute coordinates
\node[state] (x1) at (0,0) {${X_1}$};
\node[state] (x2) at (2,0) {${X_2}$};
\node[state] (z) at (-1,-2) {${Z}$};
\node[state] (w) at (1,-1) {${W}$};
\node[state] (y) at (3,-2) {${Y}$};
\path (x1) edge (z);
\path (x1) edge (w);
\path (x2) edge (w);
\path (x2) edge (y);
\path (w) edge (y);
\end{tikzpicture}
\end{figure}
\noindent which implies the structural decomposition
\[
f_{X_1}(x_1) f_{X_2}(x_2) f_{W|X_1,X_2}(w|x_1,x_2) f_{Z|X_1}(z|x_1) f_{Y|X_2,W}(y|x_2,w).
\]
For example, we might have
\begin{align*}
X_1 & \sim Normal(1,1) \\[6pt]
X_2 & \sim Normal(2,1) \\[6pt]
W|X_1=x_1,X_2=x_2 & \sim Normal(x_1+x_2,1)\\[6pt]
Z|X_1=x_1 & \sim Normal(2+2 x_1, 1)\\[6pt]
Y|X_2=x_2,W=w & \sim Normal(x_2+w,1)
\end{align*}
In this DAG, there are no directed paths from $Z$ to $Y$; thus, $Z$ is \textbf{not} a cause of $Y$.
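Using the \texttt{dagitty} package loaded above, we can list every path between $Z$ and $Y$ in this DAG and check whether it is open; this is a sketch, with the graph object \texttt{g3} defined here for later reuse.
<<>>=
g3 <- dagitty("dag{ X1 -> Z ; X1 -> W ; X2 -> W ; X2 -> Y ; W -> Y }")
paths(g3,"Z","Y") #Two undirected paths; one is open, one is blocked at the collider W
@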
If we unpick the structural specifications, we have either that
\[
Y = X_2 + W + \epsilon
\]
or
\[
Y = X_2 + (X_1 + X_2) + \varepsilon = X_1 + 2 X_2 + \varepsilon.
\]
Thus, manipulating $Z$ by intervention has no impact on $Y$. There are, however, two undirected paths
\begin{align*}
\text{Path 1: } \ & \ Z \leftarrow X_1 \rightarrow W \leftarrow X_2 \rightarrow Y\\[6pt]
\text{Path 2: } \ & \ Z \leftarrow X_1 \rightarrow W \rightarrow Y
\end{align*}
which have the potential to introduce bias into estimation of the (zero) causal effect.
<<>>=
set.seed(43)
n<-1000
X1<-rnorm(n,1,1)
X2<-rnorm(n,2,1)
W<-rnorm(n,X1+X2,1)
Z<-rnorm(n,2+2*X1,1)
Y<-rnorm(n,X2+W,1)
cor(cbind(X1,X2,W,Z,Y))
@
If we attempt an uncorrected analysis, and simply regress $Y$ on $Z$, the results imply that $Z$ does influence $Y$.
<<>>=
summary(lm(Y~Z))$coef
@
Therefore, we need to perform some form of adjustment that blocks the open backdoor path by conditioning. Now, $W$ is a collider on Path 1, so Path 1 is blocked without any conditioning; Path 2, however, is open, and confounds the effect of $Z$ on $Y$, so that
\[
Z \notindependent Y.
\]
Conditioning on $W$ blocks Path 2 at the chain node $W$, but \emph{opens} Path 1 at the collider, so again
\[
Z \notindependent Y \given W.
\]
Therefore, to block both paths, we need to condition on
\[
S \equiv \{W,X_2\} \qquad \text{or} \qquad S \equiv \{X_1\}.
\]
We consider five models
\begin{align*}
M_0 & \quad Y = Z + \epsilon\\[6pt]
M_1 & \quad Y = Z + W + \epsilon\\[6pt]
M_2 & \quad Y = Z + W + X_2 + \epsilon\\[6pt]
M_3 & \quad Y = Z + X_1 + \epsilon\\[6pt]
M_4 & \quad Y = Z + W + X_1 + X_2 + \epsilon
\end{align*}
Models $M_2, M_3$ and $M_4$ should yield unbiased estimators of the (null) effect of $Z$ on $Y$.
<<>>=
summary(lm(Y~Z))$coef         #M0 - uncorrected
summary(lm(Y~Z+W))$coef       #M1 - conditioned on W
summary(lm(Y~Z+W+X2))$coef    #M2 - conditioned on (W,X2)
summary(lm(Y~Z+X1))$coef      #M3 - conditioned on X1
summary(lm(Y~Z+W+X1+X2))$coef #M4 - conditioned on (W,X1,X2)
@
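The sufficient adjustment sets can also be computed algorithmically; this sketch again uses the \texttt{dagitty} graph \texttt{g3} defined above.
<<>>=
adjustmentSets(g3,exposure="Z",outcome="Y") #Minimal sufficient sets: {X1} and {W,X2}
@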
We now run a simulation study, replicating 1000 times for $n=500$.
<<>>=
nreps<-1000
n<-500
ests<-matrix(0,nrow=nreps,ncol=5)
for(irep in 1:nreps){
X1<-rnorm(n,1,1)
X2<-rnorm(n,2,1)
W<-rnorm(n,X1+X2,1)
Z<-rnorm(n,2+2*X1,1)
Y<-rnorm(n,X2+W,1)
ests[irep,1]<-coef(lm(Y~Z))[2]         #M0 - uncorrected
ests[irep,2]<-coef(lm(Y~Z+W))[2]       #M1 - conditioned on W
ests[irep,3]<-coef(lm(Y~Z+W+X2))[2]    #M2 - conditioned on (W,X2)
ests[irep,4]<-coef(lm(Y~Z+X1))[2]      #M3 - conditioned on X1
ests[irep,5]<-coef(lm(Y~Z+W+X1+X2))[2] #M4 - conditioned on (W,X1,X2)
}
Bias<-apply(ests,2,mean)*sqrt(n)
MSE<-apply(ests^2,2,mean)*n
names(Bias)<-names(MSE)<-c('M0','M1','M2','M3','M4')
Bias #Bias
MSE  #Mean-square error
@
Therefore we have the following results:
\begin{table}[h]
\centering
\begin{tabular}{clrr}
Model & & $\sqrt{n} \times \text{Bias}$ & $n \times \text{MSE}$\\[1pt]
\hline \\[-6pt]
$M_0$ & $Y = Z + \epsilon$ & \Sexpr{sprintf("%.4f",Bias[1])} & \Sexpr{sprintf("%.4f",MSE[1])}\\[6pt]
$M_1$ & $Y = Z + W + \epsilon$ & \Sexpr{sprintf("%.4f",Bias[2])} & \Sexpr{sprintf("%.4f",MSE[2])}\\[6pt]
$M_2$ & $Y = Z + W + X_2 + \epsilon$ & \Sexpr{sprintf("%.4f",Bias[3])} & \Sexpr{sprintf("%.4f",MSE[3])}\\[6pt]
$M_3$ & $Y = Z + X_1 + \epsilon$ & \Sexpr{sprintf("%.4f",Bias[4])} & \Sexpr{sprintf("%.4f",MSE[4])}\\[6pt]
$M_4$ & $Y = Z + W + X_1 + X_2 + \epsilon$ & \Sexpr{sprintf("%.4f",Bias[5])} & \Sexpr{sprintf("%.4f",MSE[5])}\\
\hline
\end{tabular}
\end{table}

\textbf{Example 4:} Consider the modified DAG
\begin{figure}[h]
\centering
\begin{tikzpicture}[scale=1.5]
% x node set with absolute coordinates
\node[state] (x1) at (0,0) {${X_1}$};
\node[state] (x2) at (2,0) {${X_2}$};
\node[state] (z) at (-1,-2) {${Z}$};
\node[state] (w) at (1,-1) {${W}$};
\node[state] (y) at (3,-2) {${Y}$};
\path (x1) edge (z);
\path (x1) edge (w);
\path (x2) edge (w);
\path (x2) edge (y);
\path (w) edge (y);
\path (z) edge (y);
\end{tikzpicture}
\end{figure}
\noindent which implies the structural decomposition
\[
f_{X_1}(x_1) f_{X_2}(x_2) f_{W|X_1,X_2}(w|x_1,x_2) f_{Z|X_1}(z|x_1) f_{Y|X_2,W,Z}(y|x_2,w,z).
\]
For example, we might have
\begin{align*}
X_1 & \sim Normal(1,1) \\[6pt]
X_2 & \sim Normal(2,1) \\[6pt]
W|X_1=x_1,X_2=x_2 & \sim Normal(x_1+x_2,1)\\[6pt]
Z|X_1=x_1 & \sim Normal(2+2 x_1, 1)\\[6pt]
Y|X_2=x_2,W=w,Z=z & \sim Normal(x_2+w+z,1)
\end{align*}
In this DAG, there is one directed path from $Z$ to $Y$; thus, $Z$ \textbf{is} a cause of $Y$. From the structural specifications, we have either that
\[
Y = X_2 + W + Z + \epsilon
\]
or
\[
Y = X_2 + (X_1 + X_2) + Z + \varepsilon = X_1 + 2 X_2 + Z + \varepsilon
\]
so that changing $Z$ by one unit by intervention changes the expected value of $Y$ by one unit. The same confounding paths are present as in Example 3. We again consider the five models; models $M_2, M_3$ and $M_4$ should yield unbiased estimators of the effect of $Z$ on $Y$, namely the coefficient $1$.
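Before the replication study, a single large-sample check (a sketch mirroring the Example 3 analysis; the sample size of 10000 is an arbitrary choice) confirms the bias of $M_0$ and the validity of $M_3$; the population slope in $M_0$ can be computed directly as $7/5=1.4$.
<<>>=
set.seed(43)
n<-10000
X1<-rnorm(n,1,1)
X2<-rnorm(n,2,1)
W<-rnorm(n,X1+X2,1)
Z<-rnorm(n,2+2*X1,1)
Y<-rnorm(n,X2+W+Z,1)
summary(lm(Y~Z))$coef    #M0 - uncorrected: biased upward, approximately 1.4
summary(lm(Y~Z+X1))$coef #M3 - conditioned on X1: coefficient of Z approximately 1
@
The full replication study proceeds as before.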
<<>>=
nreps<-1000
n<-500
ests<-matrix(0,nrow=nreps,ncol=5)
for(irep in 1:nreps){
X1<-rnorm(n,1,1)
X2<-rnorm(n,2,1)
W<-rnorm(n,X1+X2,1)
Z<-rnorm(n,2+2*X1,1)
Y<-rnorm(n,X2+W+Z,1)
ests[irep,1]<-coef(lm(Y~Z))[2]         #M0 - uncorrected
ests[irep,2]<-coef(lm(Y~Z+W))[2]       #M1 - conditioned on W
ests[irep,3]<-coef(lm(Y~Z+W+X2))[2]    #M2 - conditioned on (W,X2)
ests[irep,4]<-coef(lm(Y~Z+X1))[2]      #M3 - conditioned on X1
ests[irep,5]<-coef(lm(Y~Z+W+X1+X2))[2] #M4 - conditioned on (W,X1,X2)
}
Bias<-apply(ests-1,2,mean)*sqrt(n)
MSE<-apply((ests-1)^2,2,mean)*n
names(Bias)<-names(MSE)<-c('M0','M1','M2','M3','M4')
Bias #Bias
MSE  #Mean-square error
@
We have the following results:
\begin{table}[h]
\centering
\begin{tabular}{clrr}
Model & & $\sqrt{n} \times \text{Bias}$ & $n \times \text{MSE}$\\[1pt]
\hline \\[-6pt]
$M_0$ & $Y = Z + \epsilon$ & \Sexpr{sprintf("%.4f",Bias[1])} & \Sexpr{sprintf("%.4f",MSE[1])}\\[6pt]
$M_1$ & $Y = Z + W + \epsilon$ & \Sexpr{sprintf("%.4f",Bias[2])} & \Sexpr{sprintf("%.4f",MSE[2])}\\[6pt]
$M_2$ & $Y = Z + W + X_2 + \epsilon$ & \Sexpr{sprintf("%.4f",Bias[3])} & \Sexpr{sprintf("%.4f",MSE[3])}\\[6pt]
$M_3$ & $Y = Z + X_1 + \epsilon$ & \Sexpr{sprintf("%.4f",Bias[4])} & \Sexpr{sprintf("%.4f",MSE[4])}\\[6pt]
$M_4$ & $Y = Z + W + X_1 + X_2 + \epsilon$ & \Sexpr{sprintf("%.4f",Bias[5])} & \Sexpr{sprintf("%.4f",MSE[5])}\\
\hline
\end{tabular}
\end{table}

\pagebreak

\textbf{Example 5:} Consider the following DAG
\begin{figure}[h]
\centering
\begin{tikzpicture}[scale=1.5]
% x node set with absolute coordinates
\node[state] (x1) at (0,1) {${X_1}$};
\node[state] (x2) at (0,-1) {${X_2}$};
\node[state] (z) at (-1,0) {${Z}$};
\node[state] (y) at (1,0) {${Y}$};
\path (x1) edge (z);
\path (x2) edge (z);
\path (x1) edge (y);
\path (x2) edge (y);
\path (z) edge (y);
\end{tikzpicture}
\end{figure}
\noindent which implies the structural decomposition
\[
f_{X_1}(x_1) f_{X_2}(x_2) f_{Z|X_1,X_2}(z|x_1,x_2) f_{Y|X_1,X_2,Z}(y|x_1,x_2,z).
\]
Suppose that
\begin{align*}
X_1 & \sim Normal(1,1) \\[6pt]
X_2 & \sim Normal(2,1) \\[6pt]
Z|X_1=x_1,X_2=x_2 & \sim Normal(x_1+x_2,1)\\[6pt]
Y|X_1=x_1,X_2=x_2,Z=z & \sim Normal(x_1-x_2+z,1)
\end{align*}
In this DAG, there is one directed path from $Z$ to $Y$; thus, $Z$ \textbf{is} a cause of $Y$. From the structural specifications, we have that
\[
Y = X_1 - X_2 + Z + \epsilon
\]
so that changing $Z$ by one unit by intervention changes the expected value of $Y$ by one unit; substituting for $Z$, we also have
\[
Y = X_1 - X_2 + (X_1+X_2) + \varepsilon = 2 X_1 + \varepsilon.
\]
There are two confounding paths
\begin{align*}
\text{Path 1: } \ & \ Z \leftarrow X_1 \rightarrow Y\\[6pt]
\text{Path 2: } \ & \ Z \leftarrow X_2 \rightarrow Y.
\end{align*}
We consider four models
\begin{align*}
M_0 & \quad Y = Z + \epsilon\\[6pt]
M_1 & \quad Y = Z + X_1 + \epsilon\\[6pt]
M_2 & \quad Y = Z + X_2 + \epsilon\\[6pt]
M_3 & \quad Y = Z + X_1 + X_2 + \epsilon
\end{align*}
In a simulation study:
<<>>=
nreps<-1000
n<-500
ests<-matrix(0,nrow=nreps,ncol=4)
for(irep in 1:nreps){
X1<-rnorm(n,1,1)
X2<-rnorm(n,2,1)
Z<-rnorm(n,X1+X2,1)
Y<-rnorm(n,X1-X2+Z,1)
ests[irep,1]<-coef(lm(Y~Z))[2]       #M0 - uncorrected
ests[irep,2]<-coef(lm(Y~Z+X1))[2]    #M1 - conditioned on X1
ests[irep,3]<-coef(lm(Y~Z+X2))[2]    #M2 - conditioned on X2
ests[irep,4]<-coef(lm(Y~Z+X1+X2))[2] #M3 - conditioned on (X1,X2)
}
Bias<-apply(ests-1,2,mean)*sqrt(n)
MSE<-apply((ests-1)^2,2,mean)*n
names(Bias)<-names(MSE)<-c('M0','M1','M2','M3')
@
We have the following results:
\begin{table}[h]
\centering
\begin{tabular}{clrr}
Model & & $\sqrt{n} \times \text{Bias}$ & $n \times \text{MSE}$\\[1pt]
\hline \\[-6pt]
$M_0$ & $Y = Z + \epsilon$ & \Sexpr{sprintf("%.4f",Bias[1])} & \Sexpr{sprintf("%.4f",MSE[1])}\\[6pt]
$M_1$ & $Y = Z + X_1 + \epsilon$ & \Sexpr{sprintf("%.4f",Bias[2])} & \Sexpr{sprintf("%.4f",MSE[2])}\\[6pt]
$M_2$ & $Y = Z + X_2 + \epsilon$ & \Sexpr{sprintf("%.4f",Bias[3])} & \Sexpr{sprintf("%.4f",MSE[3])}\\[6pt]
$M_3$ & $Y = Z + X_1 + X_2 + \epsilon$ & \Sexpr{sprintf("%.4f",Bias[4])} & \Sexpr{sprintf("%.4f",MSE[4])}\\[6pt]
\hline
\end{tabular}
\end{table}
In this case, the \textbf{unadjusted} model also produces an unbiased estimator of the causal effect of $Z$ on $Y$. This is a consequence of the \textbf{specific} data generating process chosen: in the outcome model, the random variables $X_1-X_2$ and $Z$ are \textbf{uncorrelated}, as
\begin{align*}
\Cov[(X_1-X_2),Z] & = \Cov[(X_1-X_2),(X_1+X_2+\varepsilon)] \\[6pt]
& = \Cov[(X_1-X_2),(X_1+X_2)] + \Cov[(X_1-X_2),\varepsilon] \\[6pt]
& = \E[(X_1^2-X_2^2)] - \E[(X_1-X_2)] \E[(X_1+X_2)] + 0 & \text{as } \varepsilon \independent (X_1,X_2)\\[6pt]
& = \left\{(1+1)-(4+1)\right\} - (1-2)(1+2) = -3 + 3 = 0
\end{align*}
since if $X \sim Normal(\mu,\sigma^2)$, then $\E[X^2] = \mu^2 + \sigma^2$. If the simulation model is varied, $M_0$ \textbf{no longer} produces an unbiased result.
<<>>=
ests<-matrix(0,nrow=nreps,ncol=4)
for(irep in 1:nreps){
X1<-rnorm(n,1,1)
X2<-rnorm(n,2,1)
Z<-rnorm(n,X1+2*X2,1) #Model changed here!
Y<-rnorm(n,X1-X2+Z,1)
ests[irep,1]<-coef(lm(Y~Z))[2]       #M0 - uncorrected
ests[irep,2]<-coef(lm(Y~Z+X1))[2]    #M1 - conditioned on X1
ests[irep,3]<-coef(lm(Y~Z+X2))[2]    #M2 - conditioned on X2
ests[irep,4]<-coef(lm(Y~Z+X1+X2))[2] #M3 - conditioned on (X1,X2)
}
Bias<-apply(ests-1,2,mean)*sqrt(n)
MSE<-apply((ests-1)^2,2,mean)*n
names(Bias)<-names(MSE)<-c('M0','M1','M2','M3')
@
\begin{table}[h]
\centering
\begin{tabular}{clrr}
Model & & $\sqrt{n} \times \text{Bias}$ & $n \times \text{MSE}$\\[1pt]
\hline \\[-6pt]
$M_0$ & $Y = Z + \epsilon$ & \Sexpr{sprintf("%.4f",Bias[1])} & \Sexpr{sprintf("%.4f",MSE[1])}\\[6pt]
$M_1$ & $Y = Z + X_1 + \epsilon$ & \Sexpr{sprintf("%.4f",Bias[2])} & \Sexpr{sprintf("%.4f",MSE[2])}\\[6pt]
$M_2$ & $Y = Z + X_2 + \epsilon$ & \Sexpr{sprintf("%.4f",Bias[3])} & \Sexpr{sprintf("%.4f",MSE[3])}\\[6pt]
$M_3$ & $Y = Z + X_1 + X_2 + \epsilon$ & \Sexpr{sprintf("%.4f",Bias[4])} & \Sexpr{sprintf("%.4f",MSE[4])}\\[6pt]
\hline
\end{tabular}
\end{table}
Here, from the structural specifications, we again have that
\[
Y = X_1 - X_2 + Z + \epsilon
\]
so that changing $Z$ by one unit by intervention changes the expected value of $Y$ by one unit, but now
\[
Y = X_1 - X_2 + (X_1+2 X_2) + \varepsilon = 2 X_1 + X_2 + \varepsilon.
\]
In this case,
\[
\Cov[(X_1-X_2),Z] = \Cov[(X_1-X_2),(X_1+2X_2)] = 1 - 2 = -1 \neq 0
\]
so the cancellation no longer occurs, and the unadjusted model $M_0$ yields a biased estimator.

\section{Selection Bias}

The vignette for the \texttt{R} package \texttt{ggdag} contains two examples on selection bias
\begin{center}
\url{https://cran.r-project.org/web/packages/ggdag/vignettes/bias-structures.html}
\end{center}
that inspire the numerical examples below.

\bigskip

\textbf{Example 1:} Consider the DAG for binary variables
\begin{itemize}
\item $S$ -- smoking status
\item $G$ -- glioma
\item $H$ -- hospitalization
\item $B$ -- broken bone
\item $R$ -- tendency for reckless behaviour
\end{itemize}
We wish to study the association between $S$ and $G$ in a hospitalized cohort.
\begin{figure}[ht]
\centering
\begin{tikzpicture}[scale=1.5]
% x node set with absolute coordinates
\node[state] (h) at (1,1) {${H}$};
\node[state] (g) at (0,0) {${G}$};
\node[state] (b) at (2,0) {${B}$};
\node[state] (r) at (3,-1) {${R}$};
\node[state] (s) at (4,0) {${S}$};
\path (g) edge (h);
\path (b) edge (h);
\path (r) edge (b);
\path (r) edge (s);
\end{tikzpicture}
\end{figure}
The corresponding probability model factorizes as
\[
f_{R,B,S,G,H}(r,b,s,g,h) = f_{R}(r) f_G(g) f_{B|R}(b|r) f_{S|R}(s|r) f_{H|B,G}(h|b,g)
\]
In this graph, we have one path from $S$ to $G$:
\begin{itemize}
\item this is an undirected path,
\item the path is blocked at the collider $H$.
\end{itemize}
Therefore $G \independent S$, but conditioning on $H$ renders $G$ and $S$ \textbf{dependent}. That is, if we carry out a study in the general population, no association between $S$ and $G$ will be detected; however, if we only look at hospitalized subjects, an association will be detected.
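These statements can be verified algorithmically; this is a sketch that again assumes the \texttt{dagitty} package loaded earlier.
<<>>=
gsel <- dagitty("dag{ G -> H ; B -> H ; R -> B ; R -> S }")
dseparated(gsel,"G","S")     #TRUE: the path G -> H <- B <- R -> S is blocked at H
dseparated(gsel,"G","S","H") #FALSE: conditioning on the collider H opens the path
@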
<<>>=
set.seed(2384)
n<-1000000                   #Large population
R<-rbinom(n,1,0.2)           #Simulate R
G<-rbinom(n,1,0.02)          #Simulate G
pS<-c(0.05,0.25)             #Simulate S, with different probs for R=0 and R=1
S<-rbinom(n,1,pS[R+1])
pB<-c(0.01,0.10)             #Simulate B, with different probs for R=0 and R=1
B<-rbinom(n,1,pB[R+1])
pH<-c(0.01,0.9,0.05,0.95)    #Simulate H, with different probs for different values
H<-rbinom(n,1,pH[B+G+B*G+1]) #of the quantity B+G+B*G
cor(G,S)                     #No association in general population
cor(G[H==1],S[H==1])         #Negative correlation in hospitalized individuals
cor(G[H==0],S[H==0])         #No appreciable association in non-hospitalized individuals
@
We can explore the same results using regression:
<<>>=
my.data<-data.frame(R,G,S,B,H)
summary(lm(G~S,data=my.data))$coef #Analysis in general population
@
From the analysis in the general population, ignoring hospitalization status, there is no association between Smoking and Glioma. However, performing an analysis stratified by hospitalization status yields different results.
<<>>=
summary(lm(G~S,subset=(H==1),data=my.data))$coef #Analysis in hospitalized
summary(lm(G~S,subset=(H==0),data=my.data))$coef #Analysis in non-hospitalized
@
Thus, from the analysis of hospitalized subjects, it appears (spuriously) that Smoking is protective against Glioma.

\pagebreak

\textbf{Example 2:} Consider the DAG for variables
\begin{itemize}
\item $C$ -- CD4 count (continuous)
\item $D$ -- assignment of new HIV drug (binary)
\item $H$ -- underlying HIV severity (binary)
\item $S$ -- symptoms (binary)
\item $U$ -- follow-up indicator (binary, $U=1$ implies that the participant stays in the trial)
\end{itemize}
We wish to examine the impact of $D$ on $C$ by studying patients who stay in the trial. Suppose the relevant DAG is
\begin{figure}[ht]
\centering
\begin{tikzpicture}[scale=1.5]
% x node set with absolute coordinates
\node[state] (c) at (0,0) {${C}$};
\node[state] (h) at (1,0) {${H}$};
\node[state] (s) at (2,0) {${S}$};
\node[state] (u) at (3,1) {${U}$};
\node[state] (d) at (3,-1) {${D}$};
\path (h) edge (c);
\path (h) edge (s);
\path (s) edge (u);
\path (d) edge (s);
\end{tikzpicture}
\end{figure}
The corresponding probability model factorizes as
\[
f_{H,D,C,S,U}(h,d,c,s,u) = f_{H}(h) f_D(d) f_{C|H}(c|h) f_{S|H,D}(s|h,d) f_{U|S}(u|s)
\]
In this graph, we have one path from $D$ to $C$: this is an undirected path, but it is blocked at the collider $S$. Therefore $D \independent C$. However, conditioning on $U$, which is a descendant of $S$, renders $D$ and $C$ \textbf{dependent}. Therefore, regressing $C$ on $D$ in the subgroup where $U = 1$ (that is, individuals who are followed up in the trial) will indicate an association.
<<>>=
set.seed(2384)
n<-1000000                   #Large population
H<-rbinom(n,1,0.2)           #Simulate H
D<-rbinom(n,1,0.5)           #Simulate D
C<-rnorm(n,100-20*H,10)      #Simulate C
pS<-c(0.01,0.9,0.05,0.95)    #Simulate S, with different probs for different values
S<-rbinom(n,1,pS[H+D+H*D+1]) #of the quantity H+D+H*D
pU<-c(0.9,0.3)               #Simulate U
U<-rbinom(n,1,pU[S+1])
coef(summary(lm(C~D)))
coef(summary(lm(C~D, subset=(U==1))))
coef(summary(lm(C~D, subset=(U==0))))
@

\section{M-bias}

\tikzset{
-Latex,auto,node distance =1 cm and 1 cm,semithick,
state/.style ={circle, draw, minimum width = 0.7 cm},
box/.style ={rectangle, draw, minimum width = 0.7 cm, fill=lightgray},
point/.style = {circle, draw, inner sep=0.08cm,fill,node contents={}},
bidirected/.style={Latex-Latex,dashed},
el/.style = {inner sep=3pt, align=left, sloped}
}

Consider the DAG with two unmeasured confounders:

\medskip

\begin{figure}[ht]
\centering
\begin{tikzpicture}[scale=1.5]
% x node set with absolute coordinates
\node[state, fill=red] (u1) at (0,0) {${U_1}$};
\node[state, fill=red] (u2) at (2,0) {${U_2}$};
\node[state] (z) at (-1,-2) {${Z}$};
\node[state] (x) at (1,-1) {${X}$};
\node[state] (y) at (3,-2) {${Y}$};
\path[red] (u1) edge (z);
\path[red] (u1) edge (x);
\path[red] (u2) edge (x);
\path[red] (u2) edge (y);
\end{tikzpicture}
\end{figure}
We have that $Z$ and $Y$ are \emph{independent}: the (true but hidden) path between them, $Z \leftarrow U_1 \rightarrow X \leftarrow U_2 \rightarrow Y$, is blocked at the collider $X$. Note, however, that $X$ is marginally dependent on each of $Z$ and $Y$ through the unmeasured confounders.
<<>>=
set.seed(43)
n<-1000
U1<-rnorm(n,1,1)
U2<-rnorm(n,2,1)
X<-rnorm(n,U1+U2,1)
Z<-rnorm(n,2+2*U1,1)
Y<-rnorm(n,3-U2,1)
cor(cbind(U1,U2,X,Z,Y))
summary(lm(Y~Z))$coef
@
It is evident that $Z$ and $Y$ are not associated; they are uncorrelated, and the estimated coefficient of $Z$ in the regression of $Y$ on $Z$ is close to zero.

\medskip

In an analysis, however, suppose we condition on $X$: the relevant DAG, with the analyst's assumed model in black and the hidden structure in red, is presented below.
\begin{figure}[ht]
\centering
\begin{tikzpicture}[scale=1.5]
% x node set with absolute coordinates
\node[state, fill=red] (u1) at (0,0) {${U_1}$};
\node[state, fill=red] (u2) at (2,0) {${U_2}$};
\node[state] (z) at (-1,-2) {${Z}$};
\node[state] (x) at (1,-1) {${X}$};
\node[state] (y) at (3,-2) {${Y}$};
\path[red] (u1) edge (z);
\path[red] (u1) edge (x);
\path[red] (u2) edge (x);
\path[red] (u2) edge (y);
\path (x) edge (z);
\path (x) edge (y);
\end{tikzpicture}
\end{figure}
In the modelled DAG, $Y \independent Z \given X$; however, conditioning on $X$ \emph{opens} the \emph{hidden} path through $U_1$ and $U_2$, so there is now an open biasing path.
<<>>=
summary(lm(Y~Z+X))$coef
@
Inclusion of $X$ in the regression induces bias into the estimation.

\end{document}