\documentclass[notitlepage]{article} \usepackage{Math} %\usepackage{../../Math598} \usepackage{listings} \usepackage{numprint} \usepackage{enumerate} \usepackage{bm} \usepackage{bbm} \usepackage{verbatim} \usepackage{mathtools} \usepackage{dsfont} \usepackage{scalerel} \usepackage{amsfonts,amsmath,amssymb} \usepackage{tikz} \usetikzlibrary{shapes,decorations,arrows,calc,arrows.meta,fit,positioning} \usepackage{pgfplots} \pgfplotsset{compat=1.17} \newcommand*\circled[1]{\tikz[baseline=(char.base)]{ \node[shape=circle,draw,inner sep=2pt] (char) {#1};}} \usepackage{xcolor} \newtheorem{alg}{Algorithm} \newtheorem{ex}{Example} \newtheorem{note}{Note} \newenvironment{proof}{\par\noindent{\normalfont{\bf{Proof }}}} {\hfill \small$\blacksquare$\\[2mm]} \setlength{\parindent}{0in} \def\E{\mathbbmss{E}} \def\Var{\text{Var}} \def\Cov{\text{Cov}} \def\Corr{\text{Corr}} \def\bW{\mathbf{W}} \def\bH{\mathbf{H}} \newcommand{\bs}[1]{\bm{#1}} \newcommand{\Rho}{\mathrm{P}} \newcommand{\cif}{\text{ if }} \newcommand{\Ra}{\Longrightarrow} \def\bphi{\boldsymbol \phi} \lstloadlanguages{R} \definecolor{keywordcolor}{rgb}{0,0.6,0.6} \definecolor{delimcolor}{rgb}{0.461,0.039,0.102} \definecolor{Rcommentcolor}{rgb}{0.101,0.043,0.432} \lstdefinestyle{Rsettings}{ basicstyle=\ttfamily, breaklines=true, showstringspaces=false, keywords={if, else, function, theFunction, tmp}, % Write as many keywords otherkeywords={}, commentstyle=\itshape\color{Rcommentcolor}, keywordstyle=\color{keywordcolor}, moredelim=[s][\color{delimcolor}]{"}{"}, } \lstset{basicstyle=\ttfamily, numbers=none, literate={~} {$\sim$}{2}} \def\Xb{\mathbf{X}_{\beta}} \def\Xp{\mathbf{X}_{\psi}} \def\beps{\boldsymbol{\epsilon}} \def\betahat{\widehat \beta} \def\psihat{\widehat \psi} \begin{document} <>= library(knitr) # global chunk options opts_chunk$set(cache=TRUE, autodep=TRUE) options(scipen=6) options(repos=c(CRAN="https://cloud.r-project.org/")) knit_hooks$set(document = function(x) {sub('\\usepackage[]{color}', '\\usepackage{xcolor}', x, fixed = TRUE)}) @ \begin{center} {\textsclarge{Project 2: Solutions}} \end{center} \tikzset{ -Latex,auto,node distance =1 cm and 1 cm,semithick, state/.style ={circle, draw, minimum width = 0.7 cm}, box/.style ={rectangle, draw, minimum width = 0.7 cm, fill=lightgray}, point/.style = {circle, draw, inner sep=0.08cm,fill,node contents={}}, bidirected/.style={Latex-Latex,dashed}, el/.style = {inner sep=3pt, align=left, sloped} } The basic simulation will be based on: \begin{itemize} \item $X_1 \sim Normal(0.5,0.4^2)$, $X_2 \sim Normal(-1,1^2)$ independent. \item Conditional mean model is \[ \E_{Y|X,Z}^\calO[Y|X_1=x_1,X_2=x_2,Z=z] = 1 + x_1 + x_2 + x_1 x_2 + z (1 + x_1) \] with Normal additive errors with variance $2.5^2$. \item Conditional model for $Z|X_1=x_1,X_2 = x_2$ is $Bernoulli(e(x))$ with \[ e(x) = \frac{\exp\{-1.5 + 2 x_1 \}}{1+\exp\{-1.5 + 2 x_1 \}}. \] \end{itemize} That is, $X_1$ is a confounder, but $X_2$ is not. Here the ATE is \[ \delta = \mu(1)-\mu(0) = 1 + \E[X_1] = 1.5 \] <>= #Calculation for large N library(mvnfast) set.seed(22087) N<-10000 mu1<-0.5;sig1<-0.4 X1<-rnorm(N,mu1,sig1) mu2<--1;sig2<-1 X2<-rnorm(N,mu2,sig2) al<-c(-1.5,2) expit<-function(x){return(1/(1+exp(-x)))} Xa<-cbind(1,X1) eX0<-expit(Xa %*% al) Z<-rbinom(N,1,eX0) Xb<-cbind(1,X1,X2,X1*X2,Z,Z*X1) be<-c(1,1,1,1,1,1) sigY<-2.5 Y<-Xb %*% be + rnorm(N)*sigY par(mar=c(4,4,2,0)) hist(eX0,breaks=seq(0,1,by=0.02),main='True propensity score',xlab=expression(e(x))) box() @ <>= #Test Correct model fit0<-lm(Y~X1+X2+X1:X2+Z+Z:X1) mu0.0<-mean(predict(fit0,newdata=data.frame(X1=X1,X2=X2,Z=0))) mu1.0<-mean(predict(fit0,newdata=data.frame(X1=X1,X2=X2,Z=1))) mu1.0-mu0.0 @ <>= #Base Monte Carlo nreps<-10000 n<-1000 ests0<-matrix(0,nrow=nreps,ncol=2) for(irep in 1:nreps){ X1<-rnorm(n,mu1,sig1) X2<-rnorm(n,mu2,sig2) Xa<-cbind(1,X1) eX0<-expit(Xa %*% al) Z<-rbinom(n,1,eX0) Xb<-cbind(1,X1,X2,X1*X2,Z,Z*X1) Y<-Xb %*% be + rnorm(n)*sigY w0<-(1-Z)/(1-eX0) w1<-Z/eX0 ests0[irep,1]<-mean(w1*Y)-mean(w0*Y) ests0[irep,2]<-sum(w1*Y)/sum(w1)-sum(w0*Y)/sum(w0) } apply(ests0,2,mean) apply(ests0,2,var) @ \begin{enumerate}[(a)] \item The distribution of the propensity score influences the \textit{variance} of the ATE estimator. <>= #Dependence on PS model nreps<-10000 n<-1000 ests1<-matrix(0,nrow=nreps,ncol=2) for(irep in 1:nreps){ X1<-rnorm(n,mu1,1) #Change sigma1 to 1 X2<-rnorm(n,mu2,sig2) Xa<-cbind(1,X1) eX0<-expit(Xa %*% al) Z<-rbinom(n,1,eX0) Xb<-cbind(1,X1,X2,X1*X2,Z,Z*X1) Y<-Xb %*% be + rnorm(n)*sigY w0<-(1-Z)/(1-eX0) w1<-Z/eX0 ests1[irep,1]<-mean(w1*Y)-mean(w0*Y) ests1[irep,2]<-sum(w1*Y)/sum(w1)-sum(w0*Y)/sum(w0) } par(mar=c(4,4,4,0)) hist(eX0,breaks=seq(0,1,by=0.02),main='New propensity score',xlab=expression(e(x))) box() apply(ests1,2,mean) apply(ests1,2,var) apply(ests1,2,var)/apply(ests0,2,var) #Ratio of variances compared to base case. @ \item Estimators based on $\widetilde \mu(\mathsf{z})$ have different \textit{variances} to estimators based on $\widehat \mu(\mathsf{z})$. <>= #From the previous results par(mar=c(4,4,2,0)) boxplot(cbind(ests0,ests1),pch=19,cex=0.7, names=c(expression(tilde(delta)),expression(hat(delta)), expression(tilde(delta)),expression(hat(delta)))) abline(h=1.5,lty=2,col='red') title('Original PS and New PS') v0<-apply(ests0,2,var) v1<-apply(ests1,2,var) v1[1]/v0[1] v1[2]/v0[2] @ \pagebreak \item Mis-specification of a propensity score model can lead to \emph{biased (and inconsistent)} estimation of the ATE. <>= #Mis-specification nreps<-10000 n<-1000 ests2<-matrix(0,nrow=nreps,ncol=2) for(irep in 1:nreps){ X1<-rnorm(n,mu1,sig1) X2<-rnorm(n,mu2,sig2) Xa<-cbind(1,X1) eX0<-expit(Xa %*% al) Z<-rbinom(n,1,eX0) Xb<-cbind(1,X1,X2,X1*X2,Z,Z*X1) Y<-Xb %*% be + rnorm(n)*sigY Xax<-cbind(1,X2) #Use X2 instead of X1 eXx<-fitted(glm(Z~X2,family=binomial)) w0<-(1-Z)/(1-eXx) w1<-Z/eXx ests2[irep,1]<-mean(w1*Y)-mean(w0*Y) ests2[irep,2]<-sum(w1*Y)/sum(w1)-sum(w0*Y)/sum(w0) } apply(ests2,2,mean) par(mar=c(4,4,2,0)) boxplot(ests2,names=c(expression(tilde(delta)),expression(hat(delta))),pch=19,cex=0.7) title('Mis-specified PS model') abline(h=1.5,lty=2,col='red') box() @ \item Estimation and use of a \emph{parametric} propensity score model can improve the variance of an ATE estimator compared with using the \textit{known} propensity score form. <>= #Estimating the PS nreps<-10000 n<-250 ests3<-matrix(0,nrow=nreps,ncol=4) for(irep in 1:nreps){ X1<-rnorm(n,mu1,sig1) X2<-rnorm(n,mu2,sig2) Xa<-cbind(1,X1) eX0<-expit(Xa %*% al) Z<-rbinom(n,1,eX0) Xb<-cbind(1,X1,X2,X1*X2,Z,Z*X1) Y<-Xb %*% be + rnorm(n)*sigY w0<-(1-Z)/(1-eX0) w1<-Z/eX0 ests3[irep,1]<-mean(w1*Y)-mean(w0*Y) ests3[irep,2]<-sum(w1*Y)/sum(w1)-sum(w0*Y)/sum(w0) eX<-fitted(glm(Z~X1,family=binomial)) w0<-(1-Z)/(1-eX) w1<-Z/eX ests3[irep,3]<-mean(w1*Y)-mean(w0*Y) ests3[irep,4]<-sum(w1*Y)/sum(w1)-sum(w0*Y)/sum(w0) } apply(ests3,2,var) par(mar=c(4,4,2,0)) boxplot(ests3,names=c(expression(tilde(delta)),expression(hat(delta)), expression(tilde(delta)),expression(hat(delta))),pch=19,cex=0.7) title('Estimating the PS model') abline(h=1.5,lty=2,col='red') box() @ \item Propensity score models need only be constructed from \emph{confounders} rather than all predictors. We now change the data generating model \begin{itemize} \item $X_1 \sim Normal(0.5,0.4^2)$, $X_2 \sim Normal(-1,1^2)$ independent. \item Conditional mean model is \[ \E_{Y|X,Z}^\calO[Y|X_1=x_1,X_2=x_2,Z=z] = 1 + x_1 + z (1 + x_1) \] with Normal additive errors with variance $2.5^2$. \item Conditional model for $Z|X_1=x_1,X_2 = x_2$ is $Bernoulli(e(x))$ with \[ e(x) = \frac{\exp\{-1.5 + x_1 + x_2 \}}{1+\exp\{-1.5 + x_1 + x_2 \}}. \] \end{itemize} <>= #Estimating the PS nreps<-10000 n<-1000 al<-c(-1.5,1,1) ests4<-matrix(0,nrow=nreps,ncol=4) for(irep in 1:nreps){ X1<-rnorm(n,mu1,sig1) X2<-rnorm(n,mu2,sig2) Xa<-cbind(1,X1,X2) eX0<-expit(Xa %*% al) Z<-rbinom(n,1,eX0) Xb<-cbind(1,X1,Z,Z*X1) Y<-Xb %*% be[1:4] + rnorm(n)*sigY eX1<-fitted(glm(Z~X1+X2,family=binomial)) w0<-(1-Z)/(1-eX1) w1<-Z/eX1 ests4[irep,1]<-mean(w1*Y)-mean(w0*Y) ests4[irep,2]<-sum(w1*Y)/sum(w1)-sum(w0*Y)/sum(w0) eX<-fitted(glm(Z~X1,family=binomial)) w0<-(1-Z)/(1-eX) w1<-Z/eX ests4[irep,3]<-mean(w1*Y)-mean(w0*Y) ests4[irep,4]<-sum(w1*Y)/sum(w1)-sum(w0*Y)/sum(w0) } apply(ests4,2,mean) apply(ests4,2,var) par(mar=c(4,4,2,0)) boxplot(ests4,names=c(expression(tilde(delta)[12]),expression(hat(delta)[12]), expression(tilde(delta)[1]),expression(hat(delta)[1])),pch=19,cex=0.7) title('Estimating the PS model only using confounder X1') abline(h=1.5,lty=2,col='red') box() @ \end{enumerate} \end{document}