\documentclass[nohyper,justified]{tufte-book}
\usepackage[T1]{fontenc}
\usepackage{url}
\usepackage{amsmath}
\usepackage{bm}
\usepackage[unicode=true,pdfusetitle,
bookmarks=true,bookmarksnumbered=true,bookmarksopen=true,bookmarksopenlevel=2,
breaklinks=true,pdfborder={0 0 0},backref=false,colorlinks=false]
{hyperref}
\hypersetup{
pdfstartview=FitH}
\usepackage[noanswer]{exercise}
%\newcounter{Exercise}
%\newenvironment{Exercise}{\begin{Exercise}[name={Exercise},
%counter={Exercise}]}
%{\end{Exercise}}
\usepackage{esint}
\setcounter{secnumdepth}{3}% turn on numbering
\newcommand{\BlackBox}{\rule{1.5ex}{1.5ex}}
\newtheorem{definition}{Definition}
\newtheorem{theorem}{Theorem}
\newtheorem{fact}{Fact}
\newtheorem{proposition}{Proposition}
\usepackage{mathtools}
\makeatletter
\newcommand{\explain}[2]{\underset{\mathclap{\overset{\uparrow}{#2}}}{#1}}
\newcommand{\explainup}[2]{\overset{\mathclap{\underset{\downarrow}{#2}}}{#1}}
\makeatother
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% LyX specific LaTeX commands.
\title{\large Linear modeling (MSc Linguistics and IECL program)}
\author[Shravan Vasishth]{\small Compiled by Shravan Vasishth}
\publisher{Vasishth Lab lecture notes}
\date{Version of \today}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% User specified LaTeX commands.
\renewcommand{\textfraction}{0.05}
\renewcommand{\topfraction}{0.8}
\renewcommand{\bottomfraction}{0.8}
\renewcommand{\floatpagefraction}{0.75}
\usepackage[buttonsize=1em]{animate}
\makeatother
\begin{document}
<<include=FALSE>>=
library(knitr)
# set global chunk options, put figures into folder
options(replace.assign=TRUE,show.signif.stars=FALSE)
opts_chunk$set(fig.path='figures/figure-', fig.align='center', fig.show='hold')
options(replace.assign=TRUE,width=75)
opts_chunk$set(dev='postscript')
options(show.signif.stars=FALSE)
library(lme4)
@
\maketitle
\setcounter{tocdepth}{2}
\tableofcontents
\newpage
\chapter*{Acknowledgements}
Much of the material here is derived from the University of Sheffield lecture notes in the MSc in Statistics.
I'm grateful to Lena J\"ager and Paul M\"atzig for catching numerous errors and unclear paragraphs.
Any mistakes are of course mine.
\chapter{Preliminaries}
\section{What this course is about}
These lecture notes cover the basic theory of linear models. My notes are heavily dependent on the MSc lecture notes in Statistics taught at the University of Sheffield, UK, and on the textbooks mentioned in these notes.
\cite{kerns,RossProb,gelmanhill07,dobson2011introduction}
The lecture notes are intended for graduate students in the MSc Linguistics and
IECL programs at the University of Potsdam.
I assume some basic knowledge of probability theory, but no knowledge of
calculus or linear algebra. These latter topics will come up in class and will be explained
as needed.
No significant active knowledge of calculus or linear algebra is needed for this course.
A prerequisite for taking this course is the Introduction to Statistical Data Analysis
class taught in the winter semester.
My general philosophy is to try to convey the intuitive idea (graphically if possible), augmented with some proofs. I avoid complex proofs, referring the interested reader to more advanced textbooks.
We begin by considering some facts about random variables. Then, we look at how expectation and variance etc.\ are computed. In subsequent chapters, several typical probability distributions and their properties are discussed. A major topic of interest is maximum likelihood estimation.
Then we cover the basic theory of linear models, generalized linear models, and linear mixed models. We close with a tutorial on Bayesian linear modeling.
\section{Software and source code accompanying these notes}
Please install RStudio and R on your computer.
You can download the source code and data associated with these lecture notes from the course web page:
\href{http://www.ling.uni-potsdam.de/$\sim$vasishth/statistics/LinearModeling.html}{http://www.ling.uni-potsdam.de/~vasishth/statistics/LinearModeling.html}
\section{A comment on notation}
Throughout, I will define the Normal distribution in terms of $\mu$ and $\sigma$; this is not standard practice in statistics textbooks. In books, you will find that the normal distribution is defined in terms of $\mu$ and $\sigma^2$. The reason I deviate from this convention is that in R the normal distribution is defined in terms of $\sigma$.
I will not mark a vector of values (e.g., $\beta$) any differently from a scalar $\beta$; it will usually be clear from context which is meant.
\chapter{Random variables, Expectation and Variance}
\section{Discrete random variables}
A random variable $X$ is a function $X : S \rightarrow \mathbb{R}$ that associates to each outcome
$\omega \in S$ exactly one number $X(\omega) = x$.
$S_X$ is all the $x$'s (all the possible values of X, the support of X). I.e., $x \in S_X$.
An example of a \textbf{discrete} random variable:
the number of tails observed before the first heads appears when a coin is
tossed repeatedly. This number
could be 0, 1, 2,\dots. These values are \textbf{discrete} because
the count can only be
an integer between zero and infinity; they are
not \textbf{continuous} because the count can't be any real
number between 0 and infinity: 3.5 is not a possible value.
\begin{itemize}
\item $X: \omega \rightarrow x$
\item $\omega$: H, TH, TTH,\dots (infinite)
\item $x=0,1,2,\dots; x \in S_X$. (Note that the function $X : S \rightarrow
\mathbb{R}$ now maps to a subset of $\mathbb{R}$, the integers.)
\end{itemize}
Every discrete random variable X has associated with it a \textbf{probability mass/distribution function (PDF)}, also called \textbf{distribution function}.
\begin{equation}
p_X : S_X \rightarrow [0, 1]
\end{equation}
defined by
\begin{equation}
p_X(x) = P(X(\omega) = x), x \in S_X
\end{equation}
[\textbf{Note}: Books sometimes abuse notation by overloading the meaning of $X$. They usually have: $p_X(x) = P(X = x), x \in S_X$]
\medskip
The \textbf{cumulative distribution function} is
\begin{equation}
F(a)=\sum_{\hbox{all } x \leq a} p(x)
\end{equation}
\subsection{Example: The Binomial random variable} \label{binomialrv}
Suppose that $n$ independent trials are performed, each with two possible outcomes, success and failure, with probabilities $\theta$ and $(1-\theta)$ respectively.
Then, from the binomial theorem, the probability of $x$ successes out of $n$ trials is:
\begin{equation}\label{binomialprob}
P(X=x) = {n \choose x} \theta^x (1-\theta)^{n-x}
\end{equation}
For example, if we toss a coin twice, the probability of one or
less successes out of 2 tosses is the sum of
\begin{itemize}
\item The probability of 0 successes
\begin{equation}
P(X=0) = {2 \choose 0} \theta^0 (1-\theta)^{2-0}
= 1 \times (1-\theta)^{2}
\end{equation}
\item The probability of 1 success
\begin{equation}
P(X=1) = {2 \choose 1} \theta^1 (1-\theta)^{2-1}
= 1 \times 2\times\theta (1-\theta)^{1}
\end{equation}
\end{itemize}
If $\theta=0.5$, we have $0.5^2 + 2\times 0.5 \times 0.5
= 0.75$.
This will quickly become cumbersome to do by hand.
Consider the case where we have
n=10 coin tosses. What's the prob.\ of 1 or fewer successes? 2 or fewer? We can quickly compute the probability of getting x or fewer successes where x=0 to 10. For this, we use the built-in cumulative distribution function (CDF) function \texttt{pbinom}.
<<cdfbinomial>>=
## sample size
n<-10
## prob of success
p<-0.5
probs<-rep(NA,11)
for(x in 0:10){
## Cumulative Distribution Function:
probs[x+1]<-round(pbinom(x,size=n,prob=p),digits=2)
}
@
\begin{marginfigure}
<<echo=TRUE>>=
## Plot the CDF:
plot(1:11,probs,xaxt="n",
xlab="x",ylab="Prob(X<=x)",
main="CDF")
axis(1,at=1:11,labels=0:10)
@
\caption{The CDF of the binomial.}
\end{marginfigure}
The probability of getting exactly 1 success,
P(X=1) can be computed by subtracting the probability
of 0 heads using \texttt{pbinom} from the probability of getting
1 or 0 heads:
<<>>=
pbinom(1,size=10,prob=0.5)-pbinom(0,size=10,prob=0.5)
choose(10,1) * 0.5 * (1-0.5)^9
@
What about the probability density function (PDF)? The built-in function in R for the PDF is \texttt{dbinom}:
<<pdfbinomial>>=
## P(X=0)
dbinom(0,size=10,prob=0.5)
@
\begin{marginfigure}
<<>>=
## Plot the pdf:
plot(1:11,
dbinom(0:10,size=10,prob=0.5),
main="PDF",
xaxt="n")
axis(1,at=1:11,labels=0:10)
@
\caption{The PDF (actually, probability mass function) of the binomial.}
\end{marginfigure}
To summarize, a discrete random variable X will be defined by
\begin{enumerate}
\item the function $X: S\rightarrow \mathbb{R}$, where S is the discrete set of outcomes (i.e., outcomes are $\omega \in S$).
\item $X(\omega) = x$, and $S_X$ is the \textbf{support} of X (i.e., $x\in S_X$).
\item A PDF is defined for X:
\begin{equation*}
p_X : S_X \rightarrow [0, 1]
\end{equation*}
\item A CDF is defined for X:
\begin{equation*}
F(a)=\sum_{\hbox{all } x \leq a} p(x)
\end{equation*}
\end{enumerate}
\section{Continuous random variables}
As mentioned above in the discrete case,
a random variable $X$ is a function $X : S \rightarrow \mathbb{R}$ that associates to each outcome
$\omega \in S$ exactly one number $X(\omega) = x$.
$S_X$ is all the $x$'s (all the possible values of X, the support of X). I.e., $x \in S_X$.
$X$ is a continuous random variable if there is a non-negative function $f$ defined for all real $x \in (-\infty,\infty)$ having the property that for any set B of real numbers,
%(note that B is the support $S_X$ in Kerns' notation; the use of B is Ross' notation),
\begin{equation}
P\{X \in B\} = \int_B f(x) \, dx
\end{equation}
$f(x)$ is the probability density function of the random variable $X$.
Since $X$ must assume some value, $f$ must satisfy
\begin{equation}
1= P\{X \in (-\infty,\infty)\} = \int_{-\infty}^{\infty} f(x) \, dx
\end{equation}
If $B=[a,b]$, then
\begin{equation}
P\{a \leq X \leq b\} = \int_{a}^{b} f(x) \, dx
\end{equation}
If $a=b$, we get
\begin{equation}
P\{X=a\} = \int_{a}^{a} f(x) \, dx = 0
\end{equation}
Hence, for any continuous random variable,
\begin{equation}
P\{X < a\} = P \{X \leq a \} = F(a) = \int_{-\infty}^{a} f(x) \, dx
\end{equation}
$F$ is the \textbf{cumulative distribution function}. Differentiating both sides in the above equation:
\begin{equation}
\frac{d F(a)}{da} = f(a)
\end{equation}
The density (PDF) is the derivative of the CDF.
Ross\cite{RossProb} suggests that it is intuitive to think about it as follows:
\begin{equation}
P\{a - \frac{\epsilon}{2} \leq X \leq a + \frac{\epsilon}{2} \} = \int_{a - \epsilon/2}^{a + \epsilon/2} f(x)\, dx \approx \epsilon f(a)
\end{equation}
when $\epsilon$ is small and when $f(\cdot)$ is continuous. I.e., $\epsilon f(a)$ is the approximate probability that $X$ will be contained in an interval of length $\epsilon$ around the point $a$.
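To make these relations concrete, here is a small numerical illustration with a toy density of my own choosing (not taken from the sources above), $f(x)=2x$ on $(0,1)$: the CDF is obtained by integrating the density, the density is recovered as the (numerical) derivative of the CDF, and $\epsilon f(a)$ approximates the probability of a small interval around $a$.

<<toycdfcheck>>=
## a toy density: f(x) = 2x on (0,1); its CDF is F(a) = a^2
f <- function(x) 2 * x
Fcdf <- function(a) integrate(f, lower = 0, upper = a)$value
Fcdf(0.6)   ## analytically: 0.36
## the density is (numerically) the derivative of the CDF:
eps <- 1e-5
(Fcdf(0.6 + eps) - Fcdf(0.6 - eps))/(2 * eps) ## compare with f(0.6) = 1.2
## epsilon * f(a) approximates P(a - eps/2 <= X <= a + eps/2):
eps <- 0.01
Fcdf(0.6 + eps/2) - Fcdf(0.6 - eps/2)
eps * f(0.6)
@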
\subsection{Example: Normal random variable}
\begin{equation}
f_{X}(x)=\frac{1}{\sigma\sqrt{2\pi}}e^{ \frac{-(x-\mu)^{2}}{2\sigma^{2}}},\quad -\infty < x < \infty.
\end{equation}
We write $X\sim\mathsf{norm}(\mathtt{mean}=\mu,\,\mathtt{sd}=\sigma)$, and the associated $\mathsf{R}$ function for the PDF is \texttt{dnorm(x, mean = 0, sd = 1)}, and the one for CDF is \texttt{pnorm}.
Note the default values for $\mu$ and $\sigma$ as 0 and 1 respectively. Note also that R defines the PDF in terms of $\mu$ and $\sigma$,
not $\mu$ and $\sigma^2$.
\begin{figure}[!htbp]
\centering
<<normaldistr,echo=FALSE,fig.width=6>>=
plot(function(x) dnorm(x), -3, 3,
main = "Normal density",ylim=c(0,.4),
ylab="density",xlab="X")
@
\caption{Normal distribution.}
\label{fig:normaldistr}
\end{figure}
%If $X$ is normally distributed with parameters $\mu$ and $\sigma^2$, then $Y=aX+b$ is normally distributed with parameters $a\mu + b$ and $a^2\sigma^2$.
Computing probabilities using the CDF:
<<>>=
pnorm(Inf)-pnorm(-Inf)
pnorm(2)-pnorm(-2)
pnorm(1)-pnorm(-1)
@
\paragraph{Standard or unit normal random variable}
If $X$ is normally distributed with parameters $\mu$ and $\sigma$, then $Z=(X-\mu)/\sigma$ is normally distributed with parameters $\mu=0,\sigma = 1$.
We conventionally write $\Phi (a)$ for the CDF:
\begin{equation}
\Phi (a)=\frac{1}{\sqrt{2\pi}} \int_{-\infty}^{a} e^{\frac{-z^2}{2}} \, dz
\quad \textrm{where } a=(x-\mu)/\sigma \textrm{ for } X \sim N(\mu,\sigma)
\end{equation}
For example: $\Phi(2)$:\footnote{How would you compute $\Phi(-2)$?
}
<<>>=
pnorm(2)
@
If $Z$ is a standard normal random variable (SNRV) then
\begin{equation}
P\{ Z\leq -x\} = P\{Z>x\}, \quad -\infty < x < \infty
\end{equation}
Since $Z=((X-\mu)/\sigma)$ is an SNRV whenever $X$ is normally distributed with parameters $\mu$ and $\sigma^2$, then the CDF of $X$ can be expressed as:
\begin{equation}
F_X(a) = P\{ X\leq a \} = P\left( \frac{X - \mu}{\sigma} \leq \frac{a - \mu}{\sigma}\right) = \Phi\left( \frac{a - \mu}{\sigma} \right)
\end{equation}
The standardized version of a normal
random variable $X$ is used to compute specific probabilities relating to $X$; it also puts probabilities computed from different normal distributions on the same scale, making them directly comparable.
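As a sanity check of the standardization formula (the values $\mu=500$, $\sigma=100$, $a=600$ below are arbitrary), the probability computed directly from the $N(\mu,\sigma)$ CDF agrees with the one computed from the standard normal CDF:

<<standardizecheck>>=
mu <- 500; sigma <- 100; a <- 600
## P(X <= a) computed directly:
pnorm(a, mean = mu, sd = sigma)
## ... and via the standardized value (a - mu)/sigma:
pnorm((a - mu)/sigma)
@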
\section{Expectations and Variances}
\subsection{Expectations and variances of discrete RVs}
The expectation can be seen as the long-run average value.
Let X be a discrete random variable. Then, random samples will be designated as the values $x_1, x_2, \dots, x_n$. For example:
<<>>=
x<-0:10
## expectation in our binomial example:
sum(x*dbinom(x,size=10,prob=0.5))
@
\begin{equation}
E[X]= \underset{i=1}{\overset{n}{\sum}} x_i p(x_i)
\end{equation}
In the binomial case, $E[X] = np$, where $n$ is the number of trials and $p$ the probability of success.\footnote{Proof: see https://proofwiki.org/wiki/Expectation\_of\_Binomial\_Distribution.}
We will refer to $E[X]$ as $\mu$.
The variance of the discrete random variable X is
\begin{equation}
Var(X)= E[(X-\mu)^2]
\end{equation}
In the binomial case, $Var(X) = np(1-p)$.\footnote{Proof: see https://proofwiki.org/wiki/Variance\_of\_Binomial\_Distribution.}
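We can confirm both formulas numerically for the binomial example above ($n=10$, $p=0.5$) by computing the expectation and variance directly from the definitions:

<<binommoments>>=
n <- 10; p <- 0.5
x <- 0:n
## E[X] from the definition, compared with n*p:
(mu <- sum(x * dbinom(x, size = n, prob = p)))
n * p
## Var(X) from the definition, compared with n*p*(1-p):
sum((x - mu)^2 * dbinom(x, size = n, prob = p))
n * p * (1 - p)
@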
\subsection{Expectations and variances of continuous RVs}
Let X be a continuous random variable with PDF f(x). Then, the expectation is:
\begin{equation}
E[X]= \int_{-\infty}^{\infty} x f(x) \, dx = \mu
\end{equation}
The expectation of a function of $X$, $g(X)$:
\begin{equation} \label{eq:expfun}
E[g(X)]= \int_{-\infty}^{\infty} g(x) f(x) \, dx
\end{equation}
The variance is defined as:
\begin{equation}
Var[X]= E[(X-E[X])^2]
\end{equation}
An easier way to find the variance is through this equality:
\begin{equation} \label{varianceequation}
Var[X]=E[X^2]-(E[X])^2
\end{equation}
That is, to compute variance
we just need to find $E[X]$ and then $E[X^2]$.
The proof of the above equality goes as follows:
Let $E[X]=\mu$. By the definition of variance:
\begin{equation}
Var[X]= E[(X-E[X])^2]=E[(X-\mu)^2]
\end{equation}
Expanding out the RHS:
\begin{equation}
Var[X]= E[(X-\mu)^2]= E[X^2-2\mu X + \mu^2]
\end{equation}
By the linearity of expectation, we can rewrite this as:
\begin{equation}
\begin{split}
Var[X]=& E[X^2] -2\mu E[X] + \mu^2\\
=& E[X^2] - 2 \mu^2 + \mu^2 \\
=& E[X^2] - \mu^2\\
\end{split}
\end{equation}
\hfill \BlackBox
% to-do derive the above
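The identity is easy to check numerically; here it is for the binomial random variable used earlier ($n=10$, $p=0.5$), computing the variance both ways:

<<varianceidentity>>=
n <- 10; p <- 0.5
x <- 0:n
EX  <- sum(x   * dbinom(x, size = n, prob = p))
EX2 <- sum(x^2 * dbinom(x, size = n, prob = p))
## E[X^2] - (E[X])^2 ...
EX2 - EX^2
## ... equals E[(X - mu)^2]:
sum((x - EX)^2 * dbinom(x, size = n, prob = p))
@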
\subsection{Example: The expectation and variance of the standard normal RV}
\paragraph{Expectation}
Let X be a standard normal random variable.
\begin{equation*}
E[X] = \frac{1}{\sqrt{2\pi}} \int_{-\infty}^\infty x e^{-x^2/2} \, dx
\end{equation*}
Let $u = -x^2/2$.
Then, $du/dx = -2x/2=-x$. I.e., $du= -x \, dx$ or $-du=x \, dx$.
We can rewrite the integral as:
\begin{equation*}
E[X] = \frac{1}{\sqrt{2\pi}} \int_{-\infty}^\infty e^{u} x \, dx\\
\end{equation*}
Replacing $x\, dx$ with $-du$ we get:
\begin{equation*}
-\frac{1}{\sqrt{2\pi}} \int_{-\infty}^\infty e^{u} \, du
\end{equation*}
which yields:
\begin{equation*}
-\frac{1}{\sqrt{2\pi}} [ e^{u} ]_{-\infty}^{\infty}
\end{equation*}
Replacing $u$ with $-x^2/2$ we get:
\begin{equation*}
-\frac{1}{\sqrt{2\pi}} [ e^{-x^2/2} ]_{-\infty}^{\infty} = 0
\end{equation*}
\paragraph{Variance}
We know that
\begin{equation*}
\hbox{Var}(X)=E[X^2]-(E[X])^2
\end{equation*}
Since $(E[X])^2=0$ (see immediately above), we just have to compute $E[g(X)]=E[X^2]$. Here, we use the earlier definition, see Equation~\ref{eq:expfun}, of the expectation of a function of a random variable.
\begin{equation*}
\hbox{Var}(X)=E[X^2] =
\frac{1}{\sqrt{2\pi}} \int_{-\infty}^\infty \explain{x^2}{\textrm{This is g(X).}} e^{-x^2/2} \, dx
\end{equation*}
Write $x^2$ as $x\times x$ and use integration by parts:\footnote{Recall how integration by parts works:
\begin{equation}
\frac{d(uv)}{dx} = u\frac{dv}{dx} + \int v\frac{du}{dx}
\end{equation}
\begin{equation}
uv = \int u\frac{dv}{dx}\, dx + \int v\frac{du}{dx}\, dx
\end{equation}
\begin{equation}\label{eq:intbyparts}
\int u\frac{dv}{dx}\, dx = uv - \int v\frac{du}{dx}\, dx
\end{equation}
}
\begin{equation*}
\frac{1}{\sqrt{2\pi}} \int_{-\infty}^\infty
\explain{x}{u} \explain{x e^{-x^2/2}}{dv/dx} \, dx =
\frac{1}{\sqrt{2\pi}}\Bigl[\explain{x}{u} \explain{(-e^{-x^2/2})}{v}\Bigr]_{-\infty}^{\infty} -
\frac{1}{\sqrt{2\pi}}\int_{-\infty}^\infty \explain{-e^{-x^2/2}}{v}
\explain{1}{du/dx} \, dx = 1
\end{equation*}
[Explained on p.\ 274 of Grinstead and Snell\cite{GrinsteadSnell}:
``The first summand above can be shown to equal 0, since as
$x \rightarrow \pm \infty$,
$e^{-x^2/2}$
gets
small more quickly than $x$ gets large. The second summand is just the standard
normal density integrated over its domain, so the value of this summand is 1.
Therefore, the variance of the standard normal density equals 1.'']
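Both results can be verified numerically with R's \texttt{integrate} function and the built-in standard normal density \texttt{dnorm}:

<<normalmoments>>=
## E[X] for the standard normal (should be 0):
integrate(function(x) x * dnorm(x), lower = -Inf, upper = Inf)$value
## E[X^2], which equals the variance since E[X] = 0 (should be 1):
integrate(function(x) x^2 * dnorm(x), lower = -Inf, upper = Inf)$value
@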
%\textbf{Example}:
%Given N(10,16), write distribution of $\bar{X}$, where $n=4$. Since $SE=sd/sqrt(n)$, the distribution of $\bar{X}$ is $N(10,4/\sqrt{4}$).
\chapter{Some useful probability distributions}
%\section{Some useful continuous distributions}
%%to-do: give examples from real life of each distrn.
\subsection{Exponential random variables}
For some $\lambda > 0$,
\begin{equation*}
f(x)= \left\{
\begin{array}{l l}
\lambda e^{-\lambda x} & \quad \textrm{if } x \geq 0\\
0 & \quad \textrm{if } x < 0.\\
\end{array} \right.
\end{equation*}
A continuous random variable with the above PDF is an exponential random variable (or is said to be exponentially distributed).
The CDF:
\begin{equation*}
\begin{split}
F(a) =& P(X\leq a)\\
=& \int_0^a \lambda e^{-\lambda x}\, dx\\
=& \left[ -e^{-\lambda x} \right]_0^a\\
=& 1-e^{-\lambda a} \quad a \geq 0\\
\end{split}
\end{equation*}
[Note: the integration requires the u-substitution: $u=-\lambda x$, and then $du/dx=-\lambda$, and then use $-du=\lambda dx$ to solve.]
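The closed-form CDF $1-e^{-\lambda a}$ matches R's built-in \texttt{pexp}; the values $\lambda=2$ and $a=1.5$ below are arbitrary:

<<expcdfcheck>>=
lambda <- 2; a <- 1.5
## closed-form CDF:
1 - exp(-lambda * a)
## R's built-in exponential CDF:
pexp(a, rate = lambda)
@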
\paragraph{Expectation and variance of an exponential random variable}
For some $\lambda > 0$ (called the rate), if we are given the PDF of a random variable $X$:
\begin{equation*}
f(x)= \left\{
\begin{array}{l l}
\lambda e^{-\lambda x} & \quad \textrm{if } x \geq 0\\
0 & \quad \textrm{if } x < 0.\\
\end{array} \right.
\end{equation*}
Find E[X].
[This proof may seem roundabout---one starts very generally, with $E[X^n]$, and then specializes. The standard method can equally well be used, but this approach is more general: it allows easy calculation of the second moment, for example. It is also an example of how reduction formulae are used in integration.]
\begin{equation*}
E[X^n] = \int_0^\infty x^n \lambda e^{-\lambda x} \, dx
\end{equation*}
Use integration by parts (see equation~\ref{eq:intbyparts} on page~\pageref{eq:intbyparts}):
Let $u=x^n$, which gives $du/dx=n x^{n-1}$. Let $dv/dx= \lambda e^{-\lambda x}$, which gives
$v = -e^{-\lambda x}$. Therefore:
\begin{equation*}
\begin{split}
E[X^n] =& \int_0^\infty x^n \lambda e^{-\lambda x} \, dx \\
=& \left[ -x^n e^{-\lambda x}\right]_0^\infty + \int_0^\infty e^{-\lambda x} n x^{n-1}\, dx\\
=& 0 + \frac{n}{\lambda} \int_0^\infty \lambda e^{-\lambda x} x^{n-1}\, dx
\end{split}
\end{equation*}
Thus,
\begin{equation*}
E[X^n] = \frac{n}{\lambda}E[X^{n-1}]
\end{equation*}
If we let $n=1$, we get $E[X]$:
\begin{equation*}
E[X] = \frac{1}{\lambda}
\end{equation*}
Note that when $n=2$, we have
\begin{equation*}
E[X^2] = \frac{2}{\lambda}E[X]= \frac{2}{\lambda^2}
\end{equation*}
Variance is, as usual,
\begin{equation*}
var(X) = E[X^2] - (E[X])^2 = \frac{2}{\lambda^2} - (\frac{1}{\lambda})^2 = \frac{1}{\lambda^2}
\end{equation*}
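A quick numerical confirmation of $E[X]=1/\lambda$ and $Var(X)=1/\lambda^2$, using \texttt{integrate} with the built-in density \texttt{dexp} and an arbitrary rate:

<<expmoments>>=
lambda <- 2
EX  <- integrate(function(x) x   * dexp(x, rate = lambda), 0, Inf)$value
EX2 <- integrate(function(x) x^2 * dexp(x, rate = lambda), 0, Inf)$value
EX         ## 1/lambda = 0.5
EX2 - EX^2 ## 1/lambda^2 = 0.25
@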
\subsection{Weibull distribution}
\begin{equation}
f(x\mid \alpha, \beta) = \alpha \beta (\beta x)^{\alpha-1} \exp (- (\beta x)^{\alpha})
\end{equation}
When $\alpha=1$, we have the exponential distribution.
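Note that R's \texttt{dweibull} is parameterized by shape and \emph{scale}, whereas the density above uses the rate-like parameter $\beta$; the two are related by $\mathtt{scale}=1/\beta$. With that mapping, setting $\alpha=1$ does recover the exponential density (the values of $\beta$ and $x$ below are arbitrary):

<<weibullexp>>=
beta <- 2
x <- c(0.1, 0.5, 1, 2)
## Weibull density with shape alpha = 1 and scale = 1/beta ...
dweibull(x, shape = 1, scale = 1/beta)
## ... equals the exponential density with rate beta:
dexp(x, rate = beta)
@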
\subsection{Gamma distribution}
[The text is an amalgam of
Kerns\cite{kerns} and Ross\cite{RossProb}. I don't put it in double-quotes as a citation because it would look ugly.]
This is a generalization of the exponential distribution. We say that $X$ has a gamma distribution and write $X\sim\mathsf{gamma}(\mathtt{shape}=\alpha,\,\mathtt{rate}=\lambda)$, where $\alpha>0$ (called shape) and $\lambda>0$ (called rate). It has PDF
%% Kerns:
%\begin{equation*}
%f_{X}(x)=\frac{\lambda^{\alpha}}{\Gamma(\alpha)}\: x^{\alpha-1}\mathrm{e}^{-\lambda x},\quad x>0.
%\end{equation*}
\begin{equation*}
f(x)= \left\{
\begin{array}{l l}
\frac{\lambda e^{-\lambda x} (\lambda x)^{\alpha - 1}}{\Gamma(\alpha)} & \quad \textrm{if } x \geq 0\\
0 & \quad \textrm{if } x < 0.\\
\end{array} \right.
\end{equation*}
$\Gamma(\alpha)$ is called the gamma function:
\begin{equation*}
\Gamma(\alpha) = \int_0^\infty e^{-y}y^{\alpha-1}\, dy \explain{=}{\textrm{integration by parts}} (\alpha -1 )\Gamma(\alpha - 1)
\end{equation*}
Note that for integral values of $n$, $\Gamma(n)=(n-1)!$ (follows from above equation).
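This is easy to check with R's built-in \texttt{gamma} and \texttt{factorial} functions, e.g., for $n=5$:

<<gammafactorial>>=
gamma(5)     ## Gamma(5)
factorial(4) ## (5-1)!
@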
The associated $\mathsf{R}$ functions are \texttt{dgamma(x, shape, rate = 1)}, \texttt{pgamma}, \texttt{qgamma}, and \texttt{rgamma}, which give the PDF, CDF, quantile function, and random variates, respectively. If $\alpha=1$ then $X\sim\mathsf{exp}(\mathtt{rate}=\lambda)$. The mean is $\mu=\alpha/\lambda$ and the variance is $\sigma^{2}=\alpha/\lambda^{2}$.
To motivate the gamma distribution recall that if $X$ measures the length of time until the first event occurs in a Poisson process with rate $\lambda$ then $X\sim\mathsf{exp}(\mathtt{rate}=\lambda)$. If we let $Y$ measure the length of time until the $\alpha^{\mathrm{th}}$ event occurs then $Y\sim\mathsf{gamma}(\mathtt{shape}=\alpha,\,\mathtt{rate}=\lambda)$. When $\alpha$ is an integer this distribution is also known as the \textbf{Erlang} distribution.
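We can illustrate this connection by simulation (the values $\alpha=3$ and $\lambda=2$ are arbitrary): sums of $\alpha$ independent exponential waiting times have approximately the same quantiles as direct draws from the corresponding gamma distribution.

<<erlangsim>>=
set.seed(123)
alpha <- 3; lambda <- 2; nsim <- 10000
## each row: alpha independent exponential waiting times; sum them:
waits <- matrix(rexp(nsim * alpha, rate = lambda), ncol = alpha)
sums <- rowSums(waits)
## compare quantiles with direct gamma draws:
round(quantile(sums, probs = c(.25, .5, .75)), 2)
round(quantile(rgamma(nsim, shape = alpha, rate = lambda),
               probs = c(.25, .5, .75)), 2)
@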
\begin{figure}[!htbp]
\centering
<<gamma,echo=FALSE,fig.width=6>>=
## fn refers to the fact that it
## is a function in R, it does not mean that
## this is the gamma function:
gamma.fn<-function(x){
lambda<-1
alpha<-1
(lambda * exp(-lambda*x) *
(lambda*x)^(alpha-1))/gamma(alpha)
}
x<-seq(0,4,by=.01)
plot(x,gamma.fn(x),type="l")
@
\caption{The gamma distribution.}
\label{fig:gamma}
\end{figure}
The Chi-squared distribution is the gamma distribution with $\lambda=1/2$ and $\alpha=n/2$, where $n$ is an integer:
\begin{figure}[!htbp]
\centering
<<chisq,echo=FALSE,fig.width=6>>=
gamma.fn<-function(x){
lambda<-1/2
alpha<-8/2 ## n=8
(lambda * exp(-lambda*x) *
(lambda*x)^(alpha-1))/gamma(alpha)
}
x<-seq(0,100,by=.01)
plot(x,gamma.fn(x),type="l")
@
\caption{The chi-squared distribution.}
\label{fig:chisq}
\end{figure}
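For example, with $n=8$ degrees of freedom ($\alpha=8/2$, $\lambda=1/2$, matching the figure above), R's built-in \texttt{dchisq} agrees with \texttt{dgamma} at a few arbitrary points:

<<chisqgamma>>=
x <- c(1, 5, 10, 20)
dchisq(x, df = 8)
dgamma(x, shape = 8/2, rate = 1/2)
@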
\paragraph{Mean and variance of gamma distribution}
Let $X$ be a gamma random variable with parameters $\alpha$ and $\lambda$.
\begin{equation*}
\begin{split}
E[X] =& \frac{1}{\Gamma(\alpha)} \int_0^\infty x \lambda e^{-\lambda x} (\lambda x)^{\alpha - 1}\, dx\\
=& \frac{1}{\lambda \Gamma(\alpha)} \int_0^\infty \lambda e^{-\lambda x} (\lambda x)^{\alpha}\, dx\\
=& \frac{\Gamma(\alpha+1)}{\lambda \Gamma(\alpha)}\\
=& \frac{\alpha}{\lambda} \\
\end{split}
\end{equation*}
(See derivation of $\Gamma(\alpha)$, p.\ 215 of Ross\cite{RossProb}.)
It is easy to show (exercise) that
\begin{equation*}
Var(X)=\frac{\alpha}{\lambda^2}
\end{equation*}
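A numerical check of $E[X]=\alpha/\lambda$ and $Var(X)=\alpha/\lambda^{2}$, using \texttt{integrate} with the built-in \texttt{dgamma} density and arbitrary $\alpha$, $\lambda$:

<<gammamoments>>=
alpha <- 3; lambda <- 2
EX  <- integrate(function(x) x   * dgamma(x, shape = alpha, rate = lambda),
                 0, Inf)$value
EX2 <- integrate(function(x) x^2 * dgamma(x, shape = alpha, rate = lambda),
                 0, Inf)$value
EX         ## alpha/lambda = 1.5
EX2 - EX^2 ## alpha/lambda^2 = 0.75
@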
\subsection{Uniform random variable}
A random variable $X$ with the continuous uniform distribution on the interval $(\alpha,\beta)$ has PDF
\begin{equation}
f_{X}(x)=
\begin{cases}
\frac{1}{\beta-\alpha}, & \alpha < x < \beta,\\
0 , & \hbox{otherwise}
\end{cases}
\end{equation}
The associated $\mathsf{R}$ function is $\mathsf{dunif}(\mathtt{min}=a,\,\mathtt{max}=b)$. We write $X\sim\mathsf{unif}(\mathtt{min}=a,\,\mathtt{max}=b)$. Due to the particularly simple form of this PDF we can also write down explicitly a formula for the CDF $F_{X}$:
\begin{equation}
F_{X}(a)=
\begin{cases}
0, & a < \alpha,\\
\frac{a-\alpha}{\beta-\alpha}, & \alpha \leq a < \beta,\\
1, & a \geq \beta.
\end{cases}
\label{eq-unif-cdf}
\end{equation}
\begin{equation}
E[X]= \frac{\beta+\alpha}{2}
\end{equation}
\begin{equation}
Var(X)= \frac{(\beta-\alpha)^2}{12}
\end{equation}
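A simulation-based check of these two formulas; the interval $(\alpha,\beta)=(2,6)$ is arbitrary:

<<unifmoments>>=
set.seed(321)
a <- 2; b <- 6
x <- runif(100000, min = a, max = b)
mean(x) ## (a + b)/2 = 4
var(x)  ## (b - a)^2/12 = 1.33
@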
\subsection{Beta distribution}
This is a generalization of the continuous uniform distribution.
\begin{equation*}
f(x)= \left\{
\begin{array}{l l}
\frac{1}{B(a,b)} x^{a - 1} (1-x)^{b-1} & \quad \textrm{if } 0< x < 1\\
0 & \quad \textrm{otherwise}\\
\end{array} \right.
\end{equation*}
\noindent
where
\begin{equation*}
B(a,b) = \int_0^1 x^{a-1}(1-x)^{b-1}\, dx
\end{equation*}
There is a connection between the beta and the gamma:
\begin{equation*}
B(a,b) = \int_0^1 x^{a-1}(1-x)^{b-1}\, dx = \frac{\Gamma(a)\Gamma(b)}{\Gamma(a+b)}
\end{equation*}
\noindent
which allows us to rewrite the beta PDF as
\begin{equation}
f(x)=\frac{\Gamma(a+b)}{\Gamma(a)\Gamma(b)}\, x^{a-1}(1-x)^{b-1},\quad 0 < x < 1.
\end{equation}
%We write $X\sim\mathsf{beta}(\mathtt{shape1}=\alpha,\,\mathtt{shape2}=\beta)$. The associated $\mathsf{R}$ function is =dbeta(x, shape1, shape2)=.
The mean and variance are
\begin{equation}
E[X]=\frac{a}{a+b}\mbox{ and }Var(X)=\frac{ab}{\left(a+b\right)^{2}\left(a+b+1\right)}.
\end{equation}
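These expressions can also be checked numerically; here we use \texttt{integrate} with the built-in \texttt{dbeta} density for an arbitrary choice of $a$ and $b$:

<<betamoments>>=
a <- 2; b <- 5
EX  <- integrate(function(x) x   * dbeta(x, shape1 = a, shape2 = b), 0, 1)$value
EX2 <- integrate(function(x) x^2 * dbeta(x, shape1 = a, shape2 = b), 0, 1)$value
EX         ## a/(a+b) = 0.286
EX2 - EX^2 ## ab/((a+b)^2 (a+b+1)) = 0.026
@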
%See Example [[exa-cont-pdf3x2][Cont-pdf3x2]]. This distribution comes up a lot in Bayesian statistics because it is a good model for one's prior beliefs about a population proportion $p$, $0\leq p\leq1$.
%to-do: plot beta with different a,b.
\section{Jointly distributed random variables}
\subsection{Discrete case}
[This section is an extract from Kerns. I omit quotes as that would make the text harder to read.]
Consider two discrete random variables $X$ and $Y$ with PMFs $f_{X}$ and $f_{Y}$ that are supported on the sample spaces $S_{X}$ and $S_{Y}$, respectively. Let $S_{X,Y}$ denote the set of all possible observed \textbf{pairs} $(x,y)$, called the \textbf{joint support set} of $X$ and $Y$. Then the \textbf{joint probability mass function} of $X$ and $Y$ is the function $f_{X,Y}$ defined by
\begin{equation}
f_{X,Y}(x,y)=\mathbb{P}(X=x,\, Y=y),\quad \mbox{for }(x,y)\in S_{X,Y}.\label{eq-joint-pmf}
\end{equation}
Every joint PMF satisfies
\begin{equation}
f_{X,Y}(x,y)>0\mbox{ for all }(x,y)\in S_{X,Y},
\end{equation}
and
\begin{equation}
\sum_{(x,y)\in S_{X,Y}}f_{X,Y}(x,y)=1.
\end{equation}
It is customary to extend the function $f_{X,Y}$ to be defined on all of $\mathbb{R}^{2}$ by setting $f_{X,Y}(x,y)=0$ for $(x,y)\not\in S_{X,Y}$.
In the context of this chapter, the PMFs $f_{X}$ and $f_{Y}$ are called the \textbf{marginal PMFs} of $X$ and $Y$, respectively. If we are given only the joint PMF then we may recover each of the marginal PMFs by using the Theorem of Total Probability:
\begin{eqnarray}
f_{X}(x) & = & \mathbb{P}(X=x),\\
& = & \sum_{y\in S_{Y}}\mathbb{P}(X=x,\, Y=y),\\
& = & \sum_{y\in S_{Y}}f_{X,Y}(x,y).
\end{eqnarray}
By interchanging the roles of $X$ and $Y$ it is clear that
\begin{equation}
f_{Y}(y)=\sum_{x\in S_{X}}f_{X,Y}(x,y).\label{eq-marginal-pmf}
\end{equation}
Given the joint PMF we may recover the marginal PMFs, but the converse is not true. Even if we have \textbf{both} marginal distributions they are not sufficient to determine the joint PMF; more information is needed.
Associated with the joint PMF is the \textbf{joint cumulative distribution function} $F_{X,Y}$ defined by
\[
F_{X,Y}(x,y)=\mathbb{P}(X\leq x,\, Y\leq y),\quad \mbox{for }(x,y)\in\mathbb{R}^{2}.
\]
The bivariate joint CDF is not quite as tractable as the univariate CDFs, but in principle we could calculate it by adding up quantities of the form in Equation~\ref{eq-joint-pmf}. The joint CDF is typically not used in practice due to its inconvenient form; one can usually get by with the joint PMF alone.
\paragraph{Example: Discrete bivariate case}
Roll a fair die twice. Let $X$ be the face shown on the first roll, and let $Y$ be the face shown on the second roll. For this example, it suffices to define
\[
f_{X,Y}(x,y)=\frac{1}{36},\quad x=1,\ldots,6,\ y=1,\ldots,6.
\]
The marginal PMFs are given by $f_{X}(x)=1/6$, $x=1,2,\ldots,6$, and $f_{Y}(y)=1/6$, $y=1,2,\ldots,6$, since
\[
f_{X}(x)=\sum_{y=1}^{6}\frac{1}{36}=\frac{1}{6},\quad x=1,\ldots,6,
\]
and the same computation with the letters switched works for $Y$.
Here, and in many other cases, the joint support can be written as a product set of the support of $X$ ``times'' the support of $Y$, that is, it may be represented as a Cartesian product set, or rectangle, $S_{X,Y}=S_{X}\times S_{Y}$, where $S_{X} \times S_{Y}= \{ (x,y):\ x\in S_{X},\, y\in S_{Y} \} $. This form is a necessary condition for $X$ and $Y$ to be \textbf{independent} (or alternatively \textbf{exchangeable} when $S_{X}=S_{Y}$). But please note that in general it is not required for $S_{X,Y}$ to be of rectangle form.
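The joint and marginal PMFs in the dice example above can be tabulated directly in R; the marginals are obtained by summing the joint PMF over the rows and columns, exactly as in the Theorem of Total Probability:

<<jointdice>>=
## joint PMF of the two dice rolls: a 6 x 6 table with entries 1/36
joint <- matrix(1/36, nrow = 6, ncol = 6)
## marginal PMFs: sum the joint PMF over the other variable
rowSums(joint) ## f_X(x) = 1/6 for each x = 1,...,6
colSums(joint) ## f_Y(y) = 1/6 for each y = 1,...,6
sum(joint)     ## the joint PMF sums to 1
@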
\subsection{Continuous case}
For random variables $X$ and $Y$, the \textbf{joint cumulative distribution function (CDF)} is
\begin{equation}
F(a,b) = P(X\leq a, Y\leq b) \quad -\infty < a,b<\infty
\end{equation}
The \textbf{marginal distributions} $F_X$ and $F_Y$ are the CDFs of the individual random variables:
\begin{enumerate}
\item The CDF of $X$:
\begin{equation}
F_X(a) = P(X\leq a) = F(a,\infty)
\end{equation}
\item The CDF of $Y$:
\begin{equation}
F_Y(b) = P(Y\leq b) = F(\infty,b)
\end{equation}
\end{enumerate}
\begin{definition}\label{def:jointcont}
\textbf{Jointly continuous}: Two RVs $X$ and $Y$ are jointly continuous if there exists a function $f(x,y)$ defined for all real $x$ and $y$, such that for every set $C$:
\begin{equation} \label{jointpdf}
P((X,Y)\in C) =
\iintop_{(x,y)\in C} f(x,y)\, dx\,dy
\end{equation}
$f(x,y)$ is the \textbf{joint PDF} of $X$ and $Y$.
Every joint PDF satisfies
\begin{equation}
f(x,y)\geq 0\mbox{ for all }(x,y)\in S_{X,Y},
\end{equation}
and
\begin{equation}
\iintop_{S_{X,Y}}f(x,y)\,\mathrm{d} x\,\mathrm{d} y=1.
\end{equation}
\end{definition}
For any sets of real numbers $A$ and $B$, if we take $C=\{(x,y): x\in A, y\in B \}$, it follows from equation~\ref{jointpdf} that
\begin{equation}
P(X\in A,\, Y\in B) = \int_B \int_{A} f(x,y)\, dx\,dy
\end{equation}
Note that
\begin{equation}
F(a,b) = P(X\in (-\infty,a],\, Y\in (-\infty,b]) = \int_{-\infty}^b \int_{-\infty}^a f(x,y)\, dx\,dy
\end{equation}
Differentiating, we get the joint pdf:
\begin{equation}
f(a,b) = \frac{\partial^2}{\partial a\partial b} F(a,b)
\end{equation}
One way to understand the joint PDF:
\begin{equation}
P(a<X<a+da,b<Y<b+db)=\int_b^{b+db}\int_a^{a+da} f(x,y)\, dx\, dy \approx f(a,b)\, da\, db
\end{equation}
Hence, $f(x,y)$ is a measure of how probable it is that the random vector $(X,Y)$ will be near $(a,b)$.
\paragraph{Example: Bivariate normal distribution}
If we have two independent random variables $U_0$ and $U_1$, and we examine their joint distribution, we can plot a 3-d plot which shows $u_0$, $u_1$, and $f(u_0,u_1)$. E.g.,
\begin{equation}
f(u_0,u_1) \sim \left(N\left(
\begin{pmatrix}
0\\
0\\
\end{pmatrix}
,
\begin{pmatrix}1 & 0\\
0 & 1\\