\documentclass[notitlepage]{article} \usepackage{Math533} \usepackage{listings} \usepackage{numprint} \usepackage{amsfonts} \usepackage{amsmath} \usepackage{amssymb} \usepackage{bbm} \lstloadlanguages{R} \lstset{basicstyle=\ttfamily, numbers=none, literate={~} {$\sim$}{2}} \begin{document} %\title{The Bivariate Normal Distribution} %\maketitle %\begin{abstract} %This document shows you how to use the \texttt{knitr} package in \texttt{R} or \texttt{RStudio} %to create pdf documents based on \texttt{R} scripts. %\end{abstract} \begin{center} \textsclarge{Properties of The Bivariate Normal Distribution} \end{center} \noindent Suppose that two variables $X$ and $Y$ have a \textit{\textbf{bivariate Normal distribution}}: \[ \left[\begin{array}{c} X \\ Y \end{array} \right] \sim \textrm{Normal}_2 \left( \left[ \begin{array}{c} \mu_X \\ \mu_Y \end{array} \right], \left[ \begin{array}{cc} \sigma_X^2 & \sigma_{XY} \\ \sigma_{XY} & \sigma_Y^2 \end{array} \right] \right) \equiv \textrm{Normal} \left( \mu, \Sigma \right). \] In this form, the parameter $\sigma_{XY}$ is the \textit{\textbf{covariance}} between $X$ and $Y$, \[ \sigma_{XY} = \Expect_{X,Y}[(X-\mu_X)(Y-\mu_Y)] \] and $\Sigma$ is the \textbf{\textit{variance-covariance matrix}}. If we write \[ \rho_{XY} = \frac{\sigma_{XY}}{\sigma_X \sigma_Y} \] the joint density is \[ f_{X,Y}(x,y) = \frac{1}{2 \pi } \frac{1}{\sigma_X \sigma_Y\sqrt{1 - \rho_{XY}^2} } \exp \left\{ - \frac{1}{2 (1-\rho_{XY}^2) } \left[\left(\frac{x-\mu_X}{\sigma_X}\right)^2 - \frac{2 \rho_{XY} (x-\mu_X)(y-\mu_Y)}{\sigma_X \sigma_Y} + \left(\frac{y-\mu_Y}{\sigma_Y}\right)^2 \right] \right\} \] To understand this joint density better, we examine the term in the exponent \[ \left(\frac{x-\mu_X}{\sigma_X}\right)^2 - \frac{2 \rho_{XY} (x-\mu_X)(y-\mu_Y)}{\sigma_X \sigma_Y} + \left(\frac{y-\mu_Y}{\sigma_Y}\right)^2. 
\] Writing \[ z_1 = \frac{x-\mu_X}{\sigma_X} \qquad \qquad z_2 = \frac{y-\mu_Y}{\sigma_Y} \] the expression becomes simply \[ z_1^2 - 2 \rho_{XY} z_1 z_2 + z_2^2. \] We may re-write this by completing the square in $z_2$ as \[ (z_2 - \rho_{XY} z_1)^2 + (1-\rho_{XY}^2) z_1^2 \] Hence, for the transformed random variables $(Z_1,Z_2)$, we must have the joint density \[ f_{Z_1,Z_2}(z_1,z_2) = \frac{1}{2 \pi } \frac{1}{\sqrt{1 - \rho_{XY}^2} } \exp \left\{ - \frac{1}{2 (1-\rho_{XY}^2) } \left[(z_2 - \rho_{XY} z_1)^2 + (1-\rho_{XY}^2) z_1^2 \right] \right\}. \] From this we can deduce the factorization of this joint density as \[ f_{Z_1,Z_2}(z_1,z_2) = f_{Z_2|Z_1}(z_2|z_1) f_{Z_1}(z_1) \] where \begin{align*} f_{Z_1}(z_1) & \propto \exp \left\{ - \frac{1}{2} z_1^2 \right\} \\[6pt] f_{Z_2|Z_1}(z_2|z_1) & \propto \exp \left\{ - \frac{1}{2 (1-\rho_{XY}^2) } \left[(z_2 - \rho_{XY} z_1)^2 \right] \right\} \end{align*} that is, \[ Z_1 \sim \text{Normal}(0,1) \qquad \qquad Z_2|Z_1 = z_1 \sim \text{Normal}(\rho_{XY} z_1,(1-\rho_{XY}^2)). \] Finally, back-transforming to $(X,Y)$, we deduce by elementary properties of the Normal distribution \[ X \sim \text{Normal}(\mu_X,\sigma_X^2) \qquad \qquad Y|X = x \sim \text{Normal}\left(\mu_Y + \rho_{XY}\frac{\sigma_Y}{\sigma_X} (x-\mu_X),\sigma_Y^2 (1-\rho_{XY}^2)\right). \] Note that \[ \rho_{XY}\frac{\sigma_Y}{\sigma_X} = \frac{\sigma_{XY}}{\sigma_X^2}. 
\]
\pagebreak
Therefore, we can conclude the following important properties of the bivariate Normal distribution:
\begin{itemize}
\item \textbf{Marginal distributions:}
\[
X \sim \textrm{Normal}(\mu_X,\sigma_X^2) \qquad \textrm{and} \qquad Y \sim \textrm{Normal}(\mu_Y,\sigma_Y^2)
\]
\item \textbf{Conditional distributions:}
\begin{align*}
Y|X=x &\sim \textrm{Normal}\left(\mu_Y+\rho_{XY} \frac{\sigma_{Y} }{\sigma_X} (x-\mu_X),\sigma_Y^2 (1-\rho_{XY}^2)\right)\\
X|Y=y &\sim \textrm{Normal}\left(\mu_X+\rho_{XY} \frac{\sigma_{X} }{\sigma_Y} (y-\mu_Y),\sigma_X^2 (1-\rho_{XY}^2)\right)
\end{align*}
\item \textbf{Linear transformation:} If $\bA$ is a non-singular $(2 \times 2)$ matrix, and $(U,V)^\top = \bA (X,Y)^\top$, then
\[
\left[\begin{array}{c} U \\ V \end{array} \right] \sim \textrm{Normal} \left( \left[ \begin{array}{c} \mu_U \\ \mu_V \end{array} \right], \left[ \begin{array}{cc} \sigma_U^2 & \sigma_{UV} \\ \sigma_{UV} & \sigma_V^2 \end{array} \right] \right) \equiv \textrm{Normal} \left( \bA \mu, \bA \Sigma \bA^\top \right).
\]
\end{itemize}
The \texttt{mvtnorm} package has functions that allow us to study the multivariate normal density. After installing the package using \texttt{install.packages('mvtnorm')}, run
<<>>=
library(mvtnorm)
@
We examine the case where
\[
\left[ \begin{array}{c} \mu_X \\ \mu_Y \end{array} \right] = \left[ \begin{array}{r} 1 \\ -1 \end{array} \right] \qquad \Sigma = \left[ \begin{array}{cc} 2 & -1.8 \\ -1.8 & 4 \end{array} \right]
\]
For this example
\[
\rho_{XY} = \frac{\sigma_{XY}}{\sigma_X \sigma_Y} = \frac{-1.8}{\sqrt{2 \times 4}} = -0.6363.
\]
\pagebreak
Here is a heatmap and contour plot for the joint density:
<<>>=
mu<-c(1,-1)
Sig<-matrix(2*c(1,-0.9,-0.9,2),nrow=2,ncol=2)
xvec<-yvec<-seq(-6,6,length=201)
dens <- matrix(dmvnorm(expand.grid(xvec, yvec),mean=mu, sigma = Sig),ncol = length(xvec))
par(pty='s',mar=c(4,4,0,2))
image(xvec,yvec,dens,xlab='x',ylab='y',col=heat.colors(50))
contour(xvec,yvec,dens,add=T)
abline(v=mu[1],h=mu[2],lty=2)
@
\pagebreak
A plot with a colour scale can be obtained using the \textbf{image.plot} function from the \textbf{fields} package: after loading this package using \texttt{library(fields)} we have
<<>>=
library(fields,quietly=TRUE,verbose=FALSE)
par(pty='s',mar=c(4,3,0,6))
image(xvec,yvec,dens,xlab='x',ylab='y',col=heat.colors(50))
image.plot(zlim=c(0,0.07),legend.only=TRUE,col=heat.colors(50))
contour(xvec,yvec,dens,add=T)
abline(v=mu[1],h=mu[2],lty=2)
@
\pagebreak
A perspective plot of the joint density can be obtained using \texttt{persp}:
<<>>=
xvec<-yvec<-seq(-6,6,length=51)
dens <- matrix(dmvnorm(expand.grid(xvec, yvec),mean=mu, sigma = Sig),ncol = length(xvec))
par(pty='s',mar=c(4,2,0,2))
persp(xvec,yvec,dens,xlab='x',ylab='y',zlab='f(x,y)', phi=20,zlim=range(0,0.15),ticktype='detailed')
@
\pagebreak
A random sample can be obtained from the joint multivariate normal using \texttt{rmvnorm}
<<>>=
N<-5000
set.seed(1234)
XY.samp<-rmvnorm(N,mean=mu,sigma = Sig)
par(pty='s',mar=c(4,2,0,6))
plot(XY.samp,pch=19,cex=0.6,xlab='x',ylab='y',xlim=range(-6,6),ylim=range(-6,6))
X<-XY.samp[,1];Y<-XY.samp[,2]
@
\pagebreak
Superimposing the points on the heatmap plot, we see that the random sample is correctly drawn from the high density region:
<<>>=
xvec<-yvec<-seq(-6,6,length=201)
dens <- matrix(dmvnorm(expand.grid(xvec, yvec),mean=mu, sigma = Sig),ncol = length(xvec))
par(pty='s',mar=c(4,2,0,6))
image(xvec,yvec,dens,xlab='x',ylab='y',col=heat.colors(50))
image.plot(zlim=c(0,0.07),legend.only=TRUE,col=heat.colors(50))
points(X,Y,cex=0.3,pch=19)
abline(v=mu[1],h=mu[2],lty=2)
@
\pagebreak
We can verify that the marginal distributions are correct:
<<>>=
par(mfrow=c(1,2),mar=c(4,2,2,2))
muX<-1;sigmaX<-sqrt(2)
xv<-seq(-5,7,length=1001)
yv<-dnorm(xv,muX,sigmaX)
hist(X[X>-5 & X<7],breaks=seq(-5,7,by=0.2),xlab='X',main='Random sample of X');box()
lines(xv,yv*N*0.2,lwd=2,col='red')
muY<--1;sigmaY<-sqrt(4)
xv<-seq(-8,7,length=1001)
yv<-dnorm(xv,muY,sigmaY)
hist(Y[Y>-8 & Y<7],breaks=seq(-8,7,by=0.2),xlab='Y',main='Random sample of Y');box()
lines(xv,yv*N*0.2,lwd=2,col='red')
@
<<>>=
mean(X);var(X)
mean(Y);var(Y)
@
For the conditional distribution, we take a particular narrow range of $x$, and look at the samples of $y$ for which the sampled $x$ lie in that range. Here we will take a narrow range around $x=0$ and then around $x=3$. We do this with a much larger sample size for effective illustration.
<<>>=
sigmaXY<-Sig[1,2]
rhoXY<-sigmaXY/(sigmaX*sigmaY)
XY.big<-rmvnorm(100*N,mean=mu,sigma = Sig)
Xbig<-XY.big[,1];Ybig<-XY.big[,2]
par(mfrow=c(1,2),mar=c(4,2,2,0))
Y0<-Ybig[Xbig > -0.05 & Xbig < 0.05];N0<-length(Y0)
hist(Y0,breaks=seq(-10,10,by=0.5),ylim=range(0,2000),main='Conditional at x=0');box()
cond.mean0<-muY+(sigmaXY/sigmaX^2)*(0-muX)
cond.var0<-sigmaY^2*(1-rhoXY^2)
yv<-seq(-10,10,by=0.01)
dv<-dnorm(yv,cond.mean0,sqrt(cond.var0))
lines(yv,dv*0.5*N0,col='red',lwd=2)
Y3<-Ybig[Xbig > 2.95 & Xbig < 3.05];N3<-length(Y3)
hist(Y3,breaks=seq(-10,10,by=0.5),ylim=range(0,2000),main='Conditional at x=3');box()
cond.mean3<-muY+(sigmaXY/sigmaX^2)*(3-muX)
cond.var3<-sigmaY^2*(1-rhoXY^2)
dv<-dnorm(yv,cond.mean3,sqrt(cond.var3))
lines(yv,dv*0.5*N3,col='red',lwd=2)
@
At $x=0$, the conditional mean is
\[
\mu_Y + \rho_{XY}\frac{\sigma_Y}{\sigma_X} (0-\mu_X) = \Sexpr{muY + rhoXY*(sigmaY/sigmaX)*(0-muX)}
\]
whereas at $x=3$, the conditional mean is
\[
\mu_Y + \rho_{XY}\frac{\sigma_Y}{\sigma_X} (3-\mu_X) = \Sexpr{muY + rhoXY*(sigmaY/sigmaX)*(3-muX)}.
\]
However, in both cases, the conditional variance is
\[
\sigma_Y^2 (1-\rho_{XY}^2) = \Sexpr{sigmaY^2*(1-rhoXY^2)}
\]
\pagebreak
\begin{center}
\textsclarge{Relationship with Simple Linear Regression}
\end{center}
To see the relation to regression, we may factorize the joint density
\[
f_{X,Y}(x,y) = f_{Y|X}(y|x) f_X(x)
\]
where $X \sim \textrm{Normal}(\mu_X,\sigma_X^2)$ and
\[
Y|X=x \sim \textrm{Normal}\left(\mu_Y+\frac{\sigma_{XY}}{\sigma_X^2} (x-\mu_X),\sigma_Y^2 - \frac{\sigma_{XY}^2}{\sigma_X^2} \right) \equiv \textrm{Normal}\left(\mu_Y+\rho_{XY} \frac{\sigma_{Y} }{\sigma_X} (x-\mu_X),\sigma_Y^2 (1-\rho_{XY}^2)\right)
\]
Equating the conditional expectation of $Y$ given $X=x$
\[
\Expect_{Y|X}[Y|x] = \mu_Y+\rho_{XY} \frac{\sigma_{Y} }{\sigma_X} (x-\mu_X)
\]
with a simple linear regression
\[
\Expect_{Y|X}[Y|x] = \beta_0 + \beta_1 x
\]
we identify
\[
\beta_0 = \mu_Y - \mu_X \rho_{XY} \frac{\sigma_{Y} }{\sigma_X} \qquad \qquad \beta_1 = \rho_{XY} \frac{\sigma_{Y} }{\sigma_X}.
\]
In the example above, we have for the conditional mean of $Y$ at $X=x$
\[
\mu_Y+\frac{\sigma_{XY}}{\sigma_X^2} (x-\mu_X) = -1 + \frac{-1.8}{2}(x-1) = -0.1-0.9 x
\]
and the conditional variance is
\[
\sigma_Y^2 (1-\rho_{XY}^2) = 4 (1-(1.8^2/(2 \times 4))) = 2.38.
\]
<<>>=
be1<-Sig[1,2]/Sig[1,1]
be0<-mu[2]-be1*mu[1]
c(be0,be1)
rho<-Sig[1,2]/sqrt(Sig[1,1]*Sig[2,2])
sigY.X<-Sig[2,2]*(1-rho^2)
rho
sigY.X
par(pty='s',mar=c(4,2,2,2))
plot(X,Y,pch=19,cex=0.5,xlab='x',ylab='y')
abline(be0,be1,col='red',lwd=2)
@
Fitting a simple linear regression model recovers the correct parameters:
<<>>=
fitY.X<-lm(Y~X)
summary(fitY.X)$coef
summary(fitY.X)$sigma^2 #Estimate of sigma^2
@
\pagebreak
The residual plot still seems adequate: the values of the $x$-variable are normally distributed around the value 1, so we no longer see a `band' around zero, however the residuals do appear to be centered at zero for all $x$, with constant variability.
<<>>=
plot(X,residuals(fitY.X),ylab='Residuals',pch=19,cex=0.5)
abline(h=0,lty=2)
@
\pagebreak
To confirm correct recovery of the slope and intercept, we may test using a $t$-test. The test statistics to test $\text{H}_0 ~:~ \beta_0 = -0.1$ and $\text{H}_0 ~:~ \beta_1 = -0.9$ are computed as follows:
<<>>=
t0<-(coef(fitY.X)[1]+0.1)/summary(fitY.X)$coef[1,2]
t1<-(coef(fitY.X)[2]+0.9)/summary(fitY.X)$coef[2,2]
@
Here we have
\[
t_0 = \frac{\widehat \beta_0 - (-0.1)}{\text{e.s.e.}(\widehat \beta_0)} = \frac{\Sexpr{coef(fitY.X)[1]}+0.1}{\Sexpr{summary(fitY.X)$coef[1,2]}} = \Sexpr{t0} \bumpeq \Sexpr{round(t0,4)}
\]
and
\[
t_1 = \frac{\widehat \beta_1 - (-0.9)}{\text{e.s.e.}(\widehat \beta_1)} = \frac{\Sexpr{coef(fitY.X)[2]}+0.9}{\Sexpr{summary(fitY.X)$coef[2,2]}} = \Sexpr{t1} \bumpeq \Sexpr{round(t1,4)}
\]
neither of which is significant at $\alpha=0.05$ when compared against the appropriate quantile of the null distribution; here the null distribution is the $\text{Student}(n-2) \equiv \text{Student}(4998)$ distribution (with 0.975 quantile \Sexpr{qt(1-0.05/2,N-2)}). The sample correlation is
\[
r = \frac{\sum\limits_{i=1}^n (x_i - \overline{x})(y_i - \overline{y})}{\sqrt{\sum\limits_{i=1}^n (x_i - \overline{x})^2 \sum\limits_{i=1}^n (y_i - \overline{y})^2}} = \frac{S_{xy}}{\sqrt{S_{xx} \SST}}
\]
which is computed in \texttt{R} as follows:
<<>>=
cor(X,Y)
@
\end{document}