\documentclass[notitlepage]{article}
\usepackage{Math598}
\usepackage{listings}
\usepackage{numprint}
\usepackage{enumerate}
\usepackage{bbm}
\usepackage{amsfonts,amsmath}
\usepackage{tikz}
\usetikzlibrary{shapes,decorations,arrows,calc,arrows.meta,fit,positioning}
\def\E{\Expect}
\lstloadlanguages{R}
\definecolor{keywordcolor}{rgb}{0,0.6,0.6}
\definecolor{delimcolor}{rgb}{0.461,0.039,0.102}
\definecolor{Rcommentcolor}{rgb}{0.101,0.043,0.432}
\lstdefinestyle{Rsettings}{
  basicstyle=\ttfamily,
  breaklines=true,
  showstringspaces=false,
  keywords={if, else, function, theFunction, tmp}, % Write as many keywords
  otherkeywords={},
  commentstyle=\itshape\color{Rcommentcolor},
  keywordstyle=\color{keywordcolor},
  moredelim=[s][\color{delimcolor}]{"}{"},
}
\lstset{basicstyle=\ttfamily, numbers=none, literate={~} {$\sim$}{2}}
\parindent0in
\begin{document}

\begin{center}
{\textsclarge{Backdoor path with a collider and unmeasured confounding}}
\end{center}

<<>>=
library(knitr)
# global chunk options
opts_chunk$set(cache=TRUE, autodep=TRUE)
#options(scipen=999)
options(repos=c(CRAN="https://cloud.r-project.org/"))
inline_hook <- function(x) {
  if (is.numeric(x)) {
    # ifelse does a vectorized comparison but evaluates both branches on the
    # whole vector, so coerce to integer in the first branch to avoid a
    # sprintf format error when x contains non-integer values.
    # If integer, print without decimals; otherwise print six decimal places.
    res <- ifelse(x == round(x),
                  sprintf("%6i", as.integer(round(x))),
                  sprintf("%.6f", x))
    paste(res, collapse = ", ")
  }
}
knit_hooks$set(inline = inline_hook)
@

\tikzset{
  -Latex,auto,node distance =1 cm and 1 cm,semithick,
  state/.style ={circle, draw, minimum width = 0.7 cm},
  box/.style ={rectangle, draw, minimum width = 0.7 cm, fill=lightgray},
  point/.style = {circle, draw, inner sep=0.08cm,fill,node contents={}},
  bidirected/.style={Latex-Latex,dashed},
  el/.style = {inner sep=3pt, align=left, sloped}
}

\begin{figure}[ht]
\centering
\begin{tikzpicture}[scale=1.5]
% x node set with absolute coordinates
\node[state] (x) at (1,0) {${X}$};
\node[state] (y) at (0,0) {${Y}$};
\node[state] (z) at (-1,0) {${Z}$};
\node[state] (u) at (0,1) {${U}$};
\path (z) edge (y);
\path (y) edge (x);
\path (u) edge (z);
\path (u) edge (x);
\end{tikzpicture}
\end{figure}

The corresponding probability model factorizes as
\[
f_{U,X,Y,Z}(u,x,y,z) = f_{U}(u) f_{Z|U}(z|u) f_{Y|Z}(y|z) f_{X|U,Y}(x|u,y).
\]
In this graph, there are two paths from $Z$ to $Y$:
\begin{itemize}
\item Path $(Z,Y)$: this is a directed path;
\item Path $(Z,U,X,Y)$: this path is not directed, and since it begins with an arrow into $Z$ it is a backdoor path.
\end{itemize}
However, the second path is blocked at the collider $X$, so there is no open backdoor path, and the effect of $Z$ on $Y$ is transmitted only along the first path.

<<>>=
set.seed(2384)
n<-10000
U<-rnorm(n,10,1)
Z<-rnorm(n,2*U+1,1)
Y<-rnorm(n,-Z+3,1)
X<-rnorm(n,Y+U,1)
data1<-data.frame(U,X,Y,Z);pairs(data1,pch=19,cex=0.5)
@

If we regress $Y$ on $Z$, then the correct relationship (a coefficient of $-1$ for $Z$) is recovered.
<<>>=
round(coef(summary(lm(Y~Z))),6)
@
However, if we condition on $X$ in the regression, we see that bias is introduced in the estimation of the $Z$ coefficient.
<<>>=
round(coef(summary(lm(Y~Z+X))),6)
@
If we condition on $U$ only, then the direct effect of $Z$ on $Y$ is correctly captured, as the path is still blocked at $X$.
<<>>=
round(coef(summary(lm(Y~Z+U))),6)
@
If we condition on both $U$ and $X$, then the direct effect of $Z$ on $Y$ is again not captured.
<<>>=
round(coef(summary(lm(Y~Z+X+U))),6)
@
This is due to \textit{selection bias}: conditioning on a descendant of $Y$ will lead to bias in most circumstances.
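
The size of this bias can be checked directly against the data-generating model. As a short added check (a sketch, not part of the original analysis; the matrix \texttt{B} of structural coefficients and the objects below are introduced only for illustration), we can build the population covariance matrix implied by the structural equations, $\Sigma = (I-B)^{-1}\Psi(I-B)^{-\top}$ with $\Psi = I$ since $\mathrm{Var}(U)$ and all error variances equal one, and read off the population least-squares coefficients: regressing $Y$ on $Z$ alone gives $-1$, whereas regressing $Y$ on $(Z,X)$ gives $-8/11 \approx -0.727$ for the $Z$ coefficient, which should match the biased estimate above up to simulation error.

<<>>=
# Added check (not part of the original analysis): population covariances
# implied by the first structural model; intercepts do not affect covariances.
vars <- c("U","Z","Y","X")
B <- matrix(0, 4, 4, dimnames = list(vars, vars))
B["Z","U"] <- 2      # Z = 2U + 1 + error
B["Y","Z"] <- -1     # Y = -Z + 3 + error
B["X","Y"] <- 1      # X = Y + U + error
B["X","U"] <- 1
IB <- diag(4) - B
Sigma <- solve(IB) %*% diag(4) %*% t(solve(IB))   # (I-B)^{-1} Psi (I-B)^{-T}, Psi = I
dimnames(Sigma) <- list(vars, vars)
# population coefficient of Z when Y is regressed on Z alone
round(Sigma["Z","Y"]/Sigma["Z","Z"], 6)
# population coefficients when Y is regressed on (Z, X)
round(solve(Sigma[c("Z","X"),c("Z","X")], Sigma[c("Z","X"),"Y"]), 6)
@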
Consider the following simple chain graph.
\begin{figure}[ht]
\centering
\begin{tikzpicture}[scale=1.5]
% x node set with absolute coordinates
\node[state] (x) at (1,0) {${X}$};
\node[state] (y) at (0,0) {${Y}$};
\node[state] (z) at (-1,0) {${Z}$};
\path (z) edge (y);
\path (y) edge (x);
\end{tikzpicture}
\end{figure}
The corresponding probability model factorizes (by the chain rule) as
\[
f_{X,Y,Z}(x,y,z) = f_{Z}(z) f_{Y|Z}(y|z) f_{X|Y,Z}(x|y,z),
\]
where, for this chain graph, $f_{X|Y,Z}(x|y,z) = f_{X|Y}(x|y)$; we keep the general form in what follows. Integrating $x$ out of the joint density leaves
\[
f_{Y,Z}(y,z) = f_{Z}(z) f_{Y|Z}(y|z),
\]
so the marginal $(Z,Y)$ relationship is unchanged. However, we have that
\[
f_{Y|X,Z}(y|x,z) = \frac{f_{X,Y,Z}(x,y,z)}{f_{X,Z}(x,z)} = \frac{f_{Z}(z) f_{Y|Z}(y|z) f_{X|Y,Z}(x|y,z)}{f_Z(z)f_{X|Z}(x|z)} = \frac{f_{X|Y,Z}(x|y,z)}{f_{X|Z}(x|z)} f_{Y|Z}(y|z),
\]
and in general
\[
\frac{f_{X|Y,Z}(x|y,z)}{f_{X|Z}(x|z)} \neq 1,
\]
so
\[
f_{Y|X,Z}(y|x,z) \neq f_{Y|Z}(y|z).
\]

\pagebreak

Notice, however, that we can change the data-generating model (keeping the structure of the first graph) so that the analyses essentially agree. If the conditional model for $Y$ given $Z$ is instead
\[
Y = -0.5 Z + 3 + \epsilon
\]
and then
\[
X = 0.75 Y + U + \varepsilon,
\]
then the effect of conditioning changes.

<<>>=
set.seed(2384)
n<-10000
U<-rnorm(n,10,1)
Z<-rnorm(n,2*U+1,1)
Y<-rnorm(n,-0.5*Z+3,1)
X<-rnorm(n,0.75*Y+U,1)
data2<-data.frame(U,X,Y,Z);pairs(data2,pch=19,cex=0.5)
@

If we regress $Y$ on $Z$, then the \textbf{correct} relationship (a coefficient of $-0.5$ for $Z$) is recovered.
<<>>=
round(coef(summary(lm(Y~Z))),6)
@
Now, if we condition on $X$ in the regression, we see that the bias is \textbf{negligible}, even though there is an open, biasing path.
<<>>=
round(coef(summary(lm(Y~Z+X))),6)
@
If we condition on $U$ only, then the direct effect of $Z$ on $Y$ is \textbf{correctly captured}, as the path is still blocked at $X$.
<<>>=
round(coef(summary(lm(Y~Z+U))),6)
@
However, if we condition on both $U$ and $X$, then the direct effect of $Z$ on $Y$ is \textbf{not} captured.
<<>>=
round(coef(summary(lm(Y~Z+X+U))),6)
@
As a final summary, we can inspect the \textbf{inverse} of the sample correlation matrices:
<<>>=
round(solve(cor(data1)),6)
@
<<>>=
round(solve(cor(data2)),6)
@
Note that in both cases the entry relating $X$ and $Z$ is almost zero. This indicates that, conditional on the other variables, $X$ and $Z$ are uncorrelated (in fact independent here, since the variables are jointly Gaussian). Up to sign and rescaling, this entry gives the \textbf{partial correlation} between the two variables given the rest.
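
To make this explicit (a short added sketch, not part of the original analysis), the partial correlation between two variables given the rest can be computed from the inverse correlation matrix $\Omega = (\omega_{ij})$ as $-\omega_{ij}/\sqrt{\omega_{ii}\,\omega_{jj}}$; the helper function \texttt{partial.cor} below is introduced only for this illustration. For both simulated data sets, the $(X,Z)$ partial correlation is essentially zero, consistent with $X \perp Z \mid U, Y$ in both data-generating models.

<<>>=
# Added sketch (not part of the original analysis): partial correlations
# from the inverse correlation matrix Omega, using
#   pcor(i, j | rest) = -Omega[i,j] / sqrt(Omega[i,i] * Omega[j,j])
partial.cor <- function(dat){
  Om <- solve(cor(dat))
  P <- -Om / sqrt(diag(Om) %o% diag(Om))
  diag(P) <- 1
  P
}
round(partial.cor(data1)["X","Z"], 6)
round(partial.cor(data2)["X","Z"], 6)
@

\end{document}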