\documentclass[notitlepage]{article} \usepackage{../../Math556} \usepackage{listings} \usepackage{numprint} \usepackage{enumerate} \usepackage{multirow} \usepackage{bbm} \usepackage{amsfonts,amsmath,amssymb} \setlength{\parindent}{0pt} \def\E{\Expect} \lstloadlanguages{R} \definecolor{keywordcolor}{rgb}{0,0.6,0.6} \definecolor{delimcolor}{rgb}{0.461,0.039,0.102} \definecolor{Rcommentcolor}{rgb}{0.101,0.043,0.432} \lstdefinestyle{Rsettings}{ basicstyle=\ttfamily, breaklines=true, showstringspaces=false, keywords={if, else, function, theFunction, tmp}, % Write as many keywords otherkeywords={}, commentstyle=\itshape\color{Rcommentcolor}, keywordstyle=\color{keywordcolor}, moredelim=[s][\color{delimcolor}]{"}{"}, } \lstset{basicstyle=\ttfamily, numbers=none, literate={~} {$\sim$}{2}} \begin{document} <>= library(knitr) # global chunk options opts_chunk$set(cache=TRUE, autodep=TRUE) options(scipen=999) options(repos=c(CRAN="https://cloud.r-project.org/")) inline_hook <- function (x) { if (is.numeric(x)) { # ifelse does a vectorized comparison # If integer, print without decimal; otherwise print two places res <- ifelse(x == round(x), sprintf("%.6f", x), sprintf("%.6f", x) ) paste(res, collapse = ", ") } } knit_hooks$set(inline = inline_hook) @ \begin{center} \textsclarge{MATH 556: Mathematical Statistics I} \vspace{0.1 in} \textsc{Probability functions in \texttt{R}} \end{center} Several functions are available in \texttt{R} to perform calculations for probability distributions. The main function we use to specify probability distributions for a random variable $X$ defined on the probability space $\mathcal{P} = (\Omega,\mathcal{F},P)$ is the \textit{cumulative distribution function} (cdf), $F_X(.)$, defined for any $x \in \R$ by \[ F_X(x) = P_X[X \leq x] \equiv P_X((-\infty,x]) = P(\{\omega \in \Omega : X(\omega) \leq x\}) \] provided the set $\{\omega \in \Omega : X(\omega) \leq x\} \in \mathcal{F}$; recall that the probability space for $X$ is now $(\R,\mathcal{B},P_X)$, where $\mathcal{B}$ is the (Borel) sigma algebra generated on $\R$ by -- for example -- the half-open sets $(-\infty,x]$ for $x \in \R$. \bigskip For convenience, we may also use representations of $F_X$ via \textit{mass} or \textit{density} functions. Consider the (minimal) \textit{support} $\X$ defined as the smallest (measurable) set in $\R$ such that \[ P_X(\X) = 1. \] \begin{itemize} \item If $\X$ is a \textit{countable} set, say \[ \X = \{t_1,t_2,\ldots\} \qquad \text{for } t_1 < t_2 < \cdots \] then $X$ is a discrete random variable, and we may specify the \textit{probability mass function} (pmf), $f_X(.)$ as the function such that \[ P_X(B) = \sum_{t_j \in B} f_X(t_j) \] and where \[ f_X(x) = P_X[X=x] \equiv P(\{\omega \in \Omega : X(\omega) = x\}) \qquad x \in \R. \] Specifically, \[ F_X(x) = \sum_{t \in \X \ : \ t \leq x} f_X(t) \qquad x \in \R \] In this case, $F_X(x)$ is \textit{non-decreasing} in $x$. \bigskip \item If $F_X(x)$ can be represented (using the standard notion of integration) \[ F_X(x) = \int_{-\infty}^x f_X(t) \: dt \qquad x \in \R \] then $X$ is a continuous random variable (that is, $F_X(x)$ is absolutely continuous with respect to $x$), and $f_X(x)$ is the \textit{probability density function} (pdf). By standard calculus results, we have that \[ f_X(x) = \frac{dF_X(t)}{dt} \bigg|_{t=x} \] wherever $F_X(x)$ is differentiable. In this case, $F_X(x)$ is \textit{monotonically increasing} in $x$ on support $\X$, and we have that \[ f_X(x) > 0 \quad x \in \X \] and we may take $f_X(x) = 0$ for $x \notin \X$. We have \[ \int_{-\infty}^\infty f_X(x) \: dx = \int_{\X} f_X(x) \: dx = 1. \] \end{itemize} We also have the notion of an inverse function for $F_X$. The \textit{quantile function}, $Q_X(.)$, is defined for $0 < p < 1$ by \[ Q_X(p) = \inf \{x \in \R : p \leq F_X(x) \}. \] That is, for any fixed $p$, $0 < p < 1$, $Q_X(p)$ is the $p$th \textit{quantile}, the smallest $x$ value for which the inequality $p \leq F_X(x)$ holds: if we track the value of $F_X(x)$ as $x$ increases, there must exist a point where the $F_X(x)$ first passes $p$ -- this point coincides with $Q_X(p)$. \pagebreak \textbf{Example:} \textbf{Continuous case} Suppose \[ F_X(x) = \left\{ \begin{array}{ll} 0 & x < 0 \\[6pt] 1 - e^{-x^2} & x \geq 0 \end{array} \right. . \] Then if follows that \[ f_X(x) = \left\{ \begin{array}{ll} 0 & x < 0 \\[6pt] 2 x e^{-x^2} & x > 0 \end{array} \right. . \] and we may define $f_X(0) = 0$ for completeness. <>= x<-seq(-3,3,by=0.01) Fx<-0*(x<0) + (1-exp(-x^2))*(x >= 0) fx<-0*(x<0) + 2*x*exp(-x^2)*(x >= 0) par(mfrow=c(2,1),mar=c(4,4,1,0)) plot(x,Fx,type='l',main='CDF',ylab=expression(F[X](x)));abline(v=0) plot(x,fx,type='l',main='PDF',ylab=expression(f[X](x)));abline(v=0) @ We have for $0 < p < 1$ that \[ Q_X(p) = \sqrt{-\log(1-p)} \] For example, if $p = 0.43$, we have that \[ Q_X(0.43) = \sqrt{-\log(1-0.43)} = \Sexpr{round(sqrt(-log(1-0.43)),6)}. \] <>= p<-seq(0.001,0.999,by=0.001);Qp<-sqrt(-log(1-p)) par(mfrow=c(2,1),mar=c(4,4,1,0)) plot(x,Fx,type='l',main='CDF',ylab=expression(F[X](x)));abline(v=0) p0<-0.43;(Q0<-sqrt(-log(1-p0))) abline(h=0.43,lty=2,col='red');arrows(Q0,p0,Q0,0,col='red',angle=10,length=0.2) text(-3,0.475,"p=0.43",adj=0,col='red');text(0.80,0,"Q=0.7497",adj=0,col='red') plot(p,Qp,type='l',main='Quantile function',ylab=expression(Q[X](p))) abline(v=0.43,lty=2,col='red');arrows(p0,Q0,0,Q0,col='red',angle=10,length=0.2) text(0,1,"Q=0.7497",adj=0,col='red');text(0.45,0.1,"p=0.43",adj=0,col='red') @ \textbf{Example:} \textbf{Discrete case} Suppose \[ F_X(x) = \left\{ \begin{array}{ll} 0 & x < 1 \\[6pt] 1 - (1-\theta)^{\lfloor x \rfloor} & x \geq 1 \end{array} \right. . \] where $0 < \theta < 1$ is a fixed constant (parameter) and where $\lfloor x \rfloor$ denotes the integer part of $x$ (that is, the largest integer no greater than $x$). This function is a step-function, with steps at the positive integers $\{1,2,\ldots\}$. For $\theta = 0.8$ we have the following plot: <>= x<-seq(-1,5.5,by=0.01) th<-0.8 Fx<-0*(x<1) + ((1-(1-th)^floor(x)))*(x >= 1) xv<-1:5 Fxv<-1-(1-th)^xv par(mar=c(4,4,1,0)) plot(x,Fx,pch=19,cex=0.25,main='CDF',ylab=expression(F[X](x))) points(xv,Fxv,pch=19) @ We may deduce that $X$ is discrete, and that for the pmf, $f_X(1) = 1 - (1-\theta) = \theta$, and for $x = 2,3, \ldots$ \[ f_X(x) = F_X(x) - F_X(x-1) = (1-\theta)^{x-1} - (1-\theta)^{x} = (1-\theta)^{x-1} \theta \] with $f_X(x) = 0$ for all other $x$. <>= x<-c(1:5) th<-0.8 fx<-(1-th)^(xv-1)*th par(mar=c(4,4,1,0)) plot(x,fx,pch=19,main='PMF',ylab=expression(f[X](x)),xlim=range(-1,5.5),ylim=range(0,1)) for(i in 1:length(x)){lines(c(x[i],x[i]),c(0,fx[i]))} @ For the quantile function, for $0 < p < 1$ \[ Q_X(p) = \inf \{ x : p \leq 1 - (1-\theta)^{\lfloor x \rfloor} \} = \inf \{ x : \log (1-p) \geq {\lfloor x \rfloor} \log (1-\theta) \} = \inf \left\{ x : \dfrac{\log (1-p)}{\log (1-\theta)} \leq {\lfloor x \rfloor} \right\} \] as $\log(1-\theta) < 0$. It is evident, therefore, that \[ Q_X(p) = \bigg \lceil \dfrac{\log (1-p)}{\log (1-\theta)} \bigg \rceil \] where $\lceil x \rceil$ denotes the smallest integer greater than or equal to $x$. Note that this function is \textit{left-continuous} as a function of $p$, as $F_X(x)$ is \textit{right-continuous} as a function of $x$. <>= p<-seq(0.001,0.999,by=0.001); Qp<-ceiling(log(1-p)/log(1-th)) par(mar=c(4,4,1,0)) plot(p,Qp,pch=19,cex=0.25,main='Quantile function',ylab=expression(Q[X](p)),ylim=range(0,6)) points(Fxv,xv,pch=19) @ \textbf{Example:} \textbf{Discrete case} Suppose now that \[ f_X(x) = e^{-\lambda} \frac{\lambda^x}{x!} \qquad x = 0,1,2,\ldots \] and $f_X(x) = 0$ for all other values of $x$, where $\lambda > 0$ is a fixed constant (parameter). This is the $Poisson(\lambda)$ distribution, and for $\lambda = 2.5$ we have the following plot: <>= x<-seq(-1,5,by=1);fx<-x lambda<-2.5 fx[x<0]<-0 fx[x>=0]<-exp(-lambda)*lambda^x[x>=0]/factorial(x[x>=0]) par(mar=c(4,4,1,0)) plot(x,fx,pch=19,main='PMF',ylab=expression(f[X](x)),ylim=range(0,1)) for(i in 1:length(x)){lines(c(x[i],x[i]),c(0,fx[i]))} @ In \texttt{R}, the Poisson pmf is computed by the \texttt{dpois} function: <>= rbind(x,fx,dpois(x,lambda)) @ For the cdf, there is no simple closed form, we may merely write that \[ F_X(x) = \left\{ \begin{array}{ll} 0 & x < 0 \\[6pt] \sum\limits_{t=0}^{\lfloor x \rfloor} e^{-\lambda} \dfrac{\lambda^t}{t!} & x \geq 0 \end{array} \right. . \] <>= x<-seq(-1,5.5,by=0.01);fx<-Fx<-x*0 #Form a continuum of x values xsub<-x >= 0 & x == floor(x) #Identify the integers fx[xsub]<-exp(-lambda)*lambda^x[xsub]/factorial(x[xsub]) #Compute the pmf at the integers Fx<-cumsum(fx) #Compute the cdf par(mar=c(4,4,1,0)) plot(x,Fx,pch=19,cex=0.25,ylim=range(0,1),main='CDF',ylab=expression(F[X](x))) points(x[xsub],Fx[xsub],pch=19) @ In \texttt{R}, the Poisson cdf is computed by the \texttt{ppois} function: <>= rbind(x=x[xsub],Fx=Fx[xsub],ppois_cdf=ppois(0:5,lambda)) @ For the quantile function, we must again compute numerically: that is, for $0 < p < 1$, we find the smallest (integer) $x$ such that \[ p \leq \sum\limits_{t=0}^{\lfloor x \rfloor} e^{-\lambda} \dfrac{\lambda^t}{t!} \] <>= p<-seq(0.001,0.999,by=0.001); Qp<-rep(0,length(p)) x<-seq(-1,15.5,by=0.01);fx<-Fx<-x*0 #Form a continuum of x values xsub<-x >= 0 & x == floor(x) #Identify the integers fx[xsub]<-exp(-lambda)*lambda^x[xsub]/factorial(x[xsub]) #Compute the pmf at the integers Fx<-cumsum(fx) #Compute the cdf for(i in 1:length(p)){ if(length(x[p[i]<=Fx]) == 0){ Qp[i]<-NA }else{ Qp[i]<-min(x[p[i]<=Fx]) } } par(mar=c(4,4,1,0)) plot(p,Qp,pch=19,cex=0.25,main='Quantile function',ylab=expression(Q[X](p)),ylim=range(0,10)) points(Fx[xsub],x[xsub],pch=19) @ In \texttt{R}, the Poisson quantile function is computed by the \texttt{qpois} function: <>= par(mar=c(4,4,1,0)) plot(p,qpois(p,lambda),pch=19,cex=0.25,ylab=expression(Q[X](p)),ylim=range(0,10)) title('Quantile function computed via qpois') points(Fx[xsub],x[xsub],pch=19) @ \end{document}