\documentclass[main]{subfiles}
\begin{document}
%% Big thanks go to Helen Oleynikova who attended Machine Learning 2013
%% Tex source of the cheat sheet found at https://www.amiv.ethz.ch/studium/unterlagen/65
%change spacings according to your preferences (readability vs. content)
% Syntax : \titlespacing*{<command>}{<left>}{<before-sep>}{<after-sep>}
\titlespacing*{\section}
{0pt}{0.1pt}{0.1pt}
\titlespacing*{\subsection}
{0pt}{0.1pt}{0.1pt}
\titlespacing*{\subsubsection}
{0pt}{0.1pt}{0.1pt}
%############
% Cheatsheet setup depending on whether it is compiled on its own or within the whole summary
%############
\ifx\cheatsheet\undefined
%change the page layout to full page. Compiling this document on its own produces a full-page layout.
\setlength{\topmargin}{-1.2in} %{20mm} %Topmargin
\setlength{\headsep}{2mm} %Body starts 55mm from top sheet edge
\setlength{\footskip}{0mm} %{15mm}
\setlength{\itemsep}{0pt}
\setlength{\parskip}{0pt}
\setlength{\parsep}{0pt}
\setlength{\textwidth}{208mm-4mm} % A4 page width 210
\setlength{\textheight}{295mm-4mm} %A4 page height 297
\addtolength{\voffset}{15pt}
\setlength{\oddsidemargin}{-0.8in}
\setlength{\evensidemargin}{-1in}
\else
%use these settings if the cheat sheet is compiled as part of the whole summary. This may make the cheat sheet longer than two pages.
\setlength{\topmargin}{15mm} %{20mm} %Topmargin
\setlength{\headsep}{2mm} %Body starts 55mm from top sheet edge
\setlength{\footskip}{0mm} %{15mm}
\setlength{\itemsep}{0pt}
\setlength{\parskip}{0pt}
\setlength{\parsep}{0pt}
\setlength{\textwidth}{190mm} % A4 page width 210
\setlength{\textheight}{277mm} %Best {140mm}
\addtolength{\voffset}{-30mm}
\setlength{\oddsidemargin}{10mm-1in} %{35mm} %Left margin
\setlength{\evensidemargin}{10mm}
\fi
\begin{landscape}
%this removes the section title if the cheat sheet is compiled separately
\ifx\cheatsheet\undefined
\addcontentsline{toc}{section}{Cheat sheet 2014}
\else
{\color{sectionColor}\section{Cheat sheet 2014 Benjamin Ellenberger}}
\fi
%VISUALIZE LAYOUT AND LAYOUTVALUES:
%\pagevalues
%\newpage asd\newpage
%\layout \newpage adsf \newpage
%\newpage
%
%
\begin{multicols}{4}
%%%%%%%%%%%%%%%%%%%%%%%%%
\scriptsize
%\footnotesize
%\small
{\color{subsectionColor}\subsection{Probability}}
{\color{subsubsectionColor}\subsubsection{Probability Rules}}
%%%
\begin{tabular}{p{5em}l}
Sum Rule& \(P(X=x_i) = \sum_{j=1}^{J} P(X=x_i,Y=y_j)\)\\
Product rule& \(P(X, Y) = P(Y|X) P(X)\) \\
Independence& \(P(X, Y) = P(X)P(Y)\) \\
Bayes' Rule& \(P(Y|X) = \frac{P(X|Y)P(Y)}{P(X)} = \frac{P(X|Y)P(Y)}{\sum\limits^k_{i=1}P(X|Y_i)P(Y_i)}\)\\
Conditional independence& \(X\bot Y|Z \)\\
& \(P(X,Y|Z) = P(X|Z)P(Y|Z)\) \\
& \(P(X|Z,Y) = P(X|Z)\)
\end{tabular}
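\textbf{Example} (illustrative numbers, not from the lecture): a rare event $D$ with $P(D)=0.01$ and a noisy test with $P(+|D)=0.9$, $P(+|\neg D)=0.1$ gives
\[P(D|+) = \frac{0.9 \cdot 0.01}{0.9 \cdot 0.01 + 0.1 \cdot 0.99} = \frac{0.009}{0.108} \approx 0.083\]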
{\color{subsubsectionColor}\subsubsection{Expectation}}
\begin{tabular}{l}
\(E(X) = \int_{-\infty}^{\infty} x\, p(x)\, dx\) \\
\(\sigma^2(X) = E(X^2)-{E(X)}^2\) \\
\(\sigma^2(X) = \int_x (x-\mu_x)^2 p(x) dx\) \\
\(Cov(X, Y) = \int_x \int_y p(x,y) (x-\mu_x)(y-\mu_y) dx dy\)
\end{tabular}
{\color{subsubsectionColor}\subsubsection{Gaussian}}
\begin{align}
p(X|\mu,\sigma)&=\frac{1}{\sigma \sqrt{2\pi}} \exp(-\frac{(x-\mu)^2}{2\sigma^2})\\
&=\frac{1}{\sqrt{(2\pi)^k |\Sigma|}} \exp(-\frac{1}{2}(x-\mu)^\top\Sigma^{-1}(x-\mu)) \quad \text{(multivariate)}
\end{align}
{\color{subsubsectionColor}\subsubsection{Kernels}}
Requirements: symmetric ($k(x,y)=k(y,x)$) and \\ positive semi-definite Gram matrix $K$.
\begin{tabular}{l}
\(k(x,y) = a k_1(x,y) + b k_2(x,y)\)\\
\(k(x,y) = k_1(x,y)k_2(x,y)\)\\
\(k(x,y) = f(x) f(y)\)\\
\(k(x,y) = k_3(\phi(x), \phi(y))\) \\
with $a>0,b>0$
\end{tabular}
\begin{tabular}{ll}
Linear & \(k(x,y) = x^\top y\)\\
Polynomial & \(k(x,y) = (x^\top y + 1)^d\)\\
Gaussian RBF & \(k(x,y) = \exp(\frac{-\|x-y\|^2_2}{h^2})\)\\
Sigmoid & \(k(x,y) = \tanh(k x^\top y - b)\)
\end{tabular}
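\textbf{Example} (illustrative): for scalar inputs, the polynomial kernel with $d=2$ corresponds to an explicit feature map:
\[(xy+1)^2 = x^2y^2 + 2xy + 1 = \phi(x)^\top\phi(y), \quad \phi(x) = (x^2, \sqrt{2}\,x, 1)^\top\]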
{\color{subsectionColor}\subsection{Regression}}
\textbf{Linear Regression:}
\begin{equation}
\min_{w} \sum_{i=1}^n (y_i - w^\top x_i)^2
\end{equation}
Closed form solution: $\beta^* = (X^\top X)^{-1} X^\top Y$ \\
\textbf{Ridge Regression:}
\begin{equation}
\min_{w} \sum_{i=1}^n (y_i - w^\top x_i)^2 + \lambda \|w\|_2^2
\end{equation}
Closed form solution: $\beta^* = (X^\top X + \lambda I)^{-1} X^\top Y$ \\
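\textbf{Derivation sketch} (standard, included for reference): in matrix form the ridge objective is $\|y - Xw\|_2^2 + \lambda\|w\|_2^2$; setting the gradient to zero,
\[-2X^\top(y - Xw) + 2\lambda w = 0 \;\Rightarrow\; (X^\top X + \lambda I)w = X^\top y,\]
which gives the closed form above ($\lambda = 0$ recovers plain linear regression).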
\textbf{Lasso Regression} (sparse), requiring $\sum\limits_{j=1}^d |\beta_j| < s$:
\begin{equation}
\min_{w} \sum_{i=1}^n (y_i - w^\top x_i)^2 + \lambda \|w\|_1
\end{equation}
\textbf{Kernelized Linear Regression:}
\begin{equation}
\min_{\alpha} \|K\alpha - y \|_2^2 + \lambda \alpha^\top K \alpha
\end{equation}
Closed form solution: $\alpha^* = (K+\lambda I)^{-1} y $ \\
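\textbf{Derivation sketch}: setting the gradient of the kernelized objective to zero,
\[2K(K\alpha - y) + 2\lambda K\alpha = 2K\left((K + \lambda I)\alpha - y\right) = 0,\]
is satisfied by $\alpha^* = (K + \lambda I)^{-1} y$.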
\textbf{Nonlinear Regression}:
\begin{equation}
RSS(f,\lambda) = \sum\limits_{i=1}^n (y_i - f(x_i))^2 + \lambda \int (f''(x))^2dx
\end{equation}
\begin{equation}
f(x) = \sum\limits_{m=1}^M \beta_m h_m(x) \text{ with } h_m \in \{1,\, x,\, (x-\xi_1)^2\}
\end{equation}
$\xi_1$ is a knot (the centre of a basis function).
Smoothness is enforced by the $\int (f''(x))^2 dx$ penalty; natural splines additionally require $f''(x)=0$ at the boundaries.\\
\textbf{Bias vs. Variance}
\begin{align}
&\E_D\E_{X,Y}\left(\hat{f}(X)-Y\right)^2 = \E_D\E_X\left(\hat{f}(X) - \E(Y|X)\right)^2 \\
&+ \E_{X,Y}\left(Y - \E(Y|X)\right)^2\\
&= \underbrace{\E_X \E_D\left(\hat{f}(X) - \E_D(\hat{f}(X))\right)^2}_{\text{variance}} \\
&+ \underbrace{\E_X\left(\E_D(\hat{f}(X)) - \E(Y|X)\right)^2}_{\text{bias}^2}+ \underbrace{\E_{X,Y}\left(Y - \E(Y|X)\right)^2}_{\text{noise}}
\end{align}
\textbf{Combining regressions}
\begin{align}
\text{bias}[\hat{f}(x)] &= \E_D \hat{f} (x) - \E(Y|x) = \frac{1}{B} \sum\limits_{i=1}^B \E_D\hat{f}_i(x) - \E(Y|x) \\
&= \frac{1}{B} \sum\limits_{i=1}^B (\E_D\hat{f}_i(x) - \E(Y|x)) = \frac{1}{B} \sum\limits_{i=1}^B \text{bias}[\hat{f}_i(x)]
\end{align}
\[\V\{ \hat{f} (x)\} \approx \frac{\sigma^2}{B}\]
{\color{subsectionColor}\subsection{Classification}}
\begin{eqnarray}
\text{0/1 Loss} & w^* =& \argmin_w \sum_{i=1}^n [y_i \neq sign(w^\top x_i)] \\
\text{Perceptron} & w^* =& \argmin_w \sum_{i=1}^n \max(0, -y_i w^\top x_i)
\end{eqnarray}
{\color{subsubsectionColor}\subsubsection{SVM}}
Primal, constrained:
\begin{equation}
\min_{w} w^\top w + C \sum_{i=1}^{n} \xi_i, \text{ s.t. } y_i w^\top x_i \geq 1 - \xi_i, \xi_i \geq 0
\end{equation}
Primal, unconstrained:
\begin{equation}
\min_{w} w^\top w + C \sum_{i=1}^{n} \max(0, 1-y_i w^\top x_i) \text{ (hinge loss)}
\end{equation}
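A minimal NumPy sketch of batch subgradient descent on this unconstrained primal (function and variable names are my own, not from the lecture):
\begin{verbatim}
import numpy as np

def svm_subgradient(X, y, C=1.0, iters=500, eta=0.01):
    """Subgradient descent on
    w^T w + C * sum_i max(0, 1 - y_i w^T x_i)."""
    n, d = X.shape
    w = np.zeros(d)
    for _ in range(iters):
        margins = y * (X @ w)
        active = margins < 1   # hinge loss nonzero
        grad = 2 * w - C * (y[active, None]
                            * X[active]).sum(axis=0)
        w -= eta * grad
    return w
\end{verbatim}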
Dual:
\begin{equation}
\max_{\alpha} \sum_{i=1}^{n} \alpha_i - \frac{1}{2} \sum_{i,j} \alpha_i \alpha_j y_i y_j x_i^\top x_j, \text{ s.t. } 0 \leq \alpha_i \leq C
\end{equation}
Dual to primal: $w^* = \sum_{i=1}^{n} \alpha^*_i y_i x_i$, $\alpha_i > 0$: support vector.
{\color{subsubsectionColor}\subsubsection{Kernelized SVM}}
\begin{equation}
\max_{\alpha} \sum_{i=1}^{n} \alpha_i - \frac{1}{2} \sum_{i,j} \alpha_i \alpha_j y_i y_j k(x_i, x_j), \text{ s.t. } 0 \leq \alpha_i \leq C
\end{equation}
Classify: $y = sign(\sum_{i=1}^{n} \alpha_i y_i k(x_i, x))$
{\color{subsubsectionColor}\subsubsection{How to find $a^T$?}}
Augmented vector $a = (w_0, w)$ used with $\widetilde{x} = (1, x)$, so $a^\top \widetilde{x} = w^\top x + w_0$.\\
Gradient descent: $a(k+1) = a(k) - \eta(k) \nabla J(a(k))$\\
Newton's method: 2nd-order Taylor expansion gives $\eta_{opt} = H^{-1}$ with $H=\frac{\partial^2 J}{\partial a_i \partial a_j}$\\
$J$ is the cost function; a popular choice is the\\
perceptron cost $J_p (a) = \sum_{\widetilde{x} \text{ misclassified}} (-a^\top \widetilde{x})$
{\color{subsubsectionColor}\subsubsection{Bootstrapping}}
Draw $B$ bootstrap sets $Z^{*b}$ by sampling with replacement; compute the statistic $S$ on each set and average.
\begin{eqnarray}
\bar{S} = \frac{1}{B}\sum_{b=1}^{B} S(Z^{*b})\\
\sigma^2(S) = \frac{1}{B-1} \sum_{b=1}^{B} (S(Z^{*b}) - \bar{S})^2
\end{eqnarray}
Bootstrapping works if, for $n \rightarrow \infty$, the error between the empirical and bootstrap distributions behaves like the error between the true and empirical distributions.
Probability for a sample not to appear in a bootstrap set: $(1-\frac{1}{n})^n$, which goes to $\frac{1}{e}$ for $n \rightarrow \infty$.
Number of ways to choose $k$ items from $N$ with replacement (multiplicity): $\binom{N-1+k}{k}$. In bootstrapping $k=N$.
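Consequence (standard limit): since $(1-\frac{1}{n})^n \rightarrow e^{-1} \approx 0.368$, a bootstrap sample contains on average about $1 - e^{-1} \approx 63.2\%$ of the distinct original points.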
{\color{subsubsectionColor}\subsubsection{Jackknife}}
Method for reducing bias at the price of increased variance: \(\printlatex{\bar{\theta}_\mathrm{Jack} =\frac{1}{n} \sum_{i=1}^n (\bar{\theta}_i)}\) (leave out the $i$-th sample, estimate $\bar{\theta}_i$, and average).
{\color{subsectionColor}\subsection{Misc}}
\textbf{Lagrangian:} optimize $f(x,y)$ s.t.\ $g(x,y) = c$
\begin{equation}
\mathcal{L}(x, y, \gamma) = f(x,y) - \gamma ( g(x,y)-c)
\end{equation}
\textbf{Parametric learning}: model is parametrized with a finite set of parameters, like linear regression, linear SVM, etc. \\
\textbf{Nonparametric learning}: models grow in complexity with quantity of data: kernel SVM, k-NN, etc.\\
\textbf{Empirical variance}: look for dense and sparse regions; regularize so that sparse regions are not covered (decreases variance). Measured e.g.\ by the variance of classifiers across cross-validation folds.
{\color{subsectionColor}\subsection{Probabilistic Methods}}
{\color{subsubsectionColor}\subsubsection{MLE}}
Least Squares, Gaussian Noise
\begin{align}
L(w) &= -\log(P(y_1 ... y_n | x_1 ... x_n, w)) \\
&= \frac{n}{2} \log(2\pi\sigma^2) + \sum_{i=1}^{n} \frac{(y_i-w^\top x_i)^2}{2\sigma^2}
\end{align}
\begin{align}
\argmax_w P(y|x, w) &= \argmin_w L(w) \\
&= \argmin_w \sum_{i=1}^{n} (y_i-w^\top x_i)^2
\end{align}
\begin{eqnarray}
\widehat\sigma^2 = \frac{1}{n} \sum_{i=1}^{n} (x_{i} - \bar{x})^2,
E \left[ \widehat\sigma^2 \right]= \frac{n-1}{n}\sigma^2
\end{eqnarray}
{\color{subsubsectionColor}\subsubsection{MAP}}
Ridge regression, Gaussian prior on weights
\begin{align}
&\hat{w}_{MAP} = \argmax_w P(w) \prod_{i=1}^{n} P(y_i|x_i,w)\\
&=\argmin_w \frac{1}{2\sigma^2} \sum_{i=1}^{n} (y_i - w^\top x_i)^2 + \frac{1}{2 \beta^2}\sum_{j=1}^{d}w_j^2
\end{align}
$P(w)$ or $P(\theta)$ - conjugate prior (beta, Gaussian) (posterior same class as prior) \\
$P(y_i|\theta)$ - likelihood function (binomial, multinomial, Gaussian) \\
\textbf{Beta distribution}: $P(\theta) = Beta(\theta; \alpha_1, \alpha_2) \propto \theta^{\alpha_1 - 1}(1-\theta)^{\alpha_2-1}$
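\textbf{Example} (standard conjugate update): with prior $Beta(\theta;\alpha_1,\alpha_2)$ and a binomial likelihood with $h$ successes in $n$ trials, the posterior is again a Beta:
\[P(\theta|\text{data}) \propto \theta^{\alpha_1 + h - 1}(1-\theta)^{\alpha_2 + n - h - 1}, \text{ i.e. } Beta(\theta;\alpha_1 + h, \alpha_2 + n - h)\]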
{\color{subsubsectionColor}\subsubsection{Bayesian Decision Theory}}
\begin{equation}
a^* = \argmin_{a \in A} E_y[C(y,a)|x]
\end{equation}
{\color{subsectionColor}\subsection{Ensemble Methods}}
Use a combination of simple hypotheses (weak learners) to create one strong learner.\\
Weak learner: error is (at least slightly) below $0.5$, i.e.\ better than random guessing.\\
Strong learner: error can be driven below any given $\delta$ ($0 < \delta < 0.5$).
\begin{equation}
f(x) = \sum_{i=1}^{n} \beta_i h_i(x)
\end{equation}
\textbf{Bagging}: train weak learners on bootstrapped sets with equal weights. \\
\textbf{Boosting}: train on all data, but reweigh misclassified samples higher.
{\color{subsubsectionColor}\subsubsection{Decision Trees}}
\textbf{Stumps}: partition linearly along 1 axis
\begin{equation}
h(x) = sign(a x_i - t)
\end{equation}
\textbf{Decision Tree}: recursive tree of stumps, leaves have labels. To train, either label if leaf's data is pure enough, or split data based on score.
{\color{subsubsectionColor}\subsubsection{Ada Boost}}
Effectively minimize exponential loss.
\begin{equation}
f^*(x) = \argmin_{f\in F} \sum_{i=1}^{n} \exp(-y_i f(x_i))
\end{equation}
Train $m$ weak learners, greedily selecting each one
\begin{equation}
(\beta_b, h_b) = \argmin_{\beta,h} \sum_{i=1}^{n} \exp(-y_i (f_{b-1} (x_i) + \beta h(x_i)))
\end{equation}
\begin{eqnarray}
c_b(x) \text{ trained with weights } w_i^b \\
\epsilon_b = \sum\limits_{i=1}^n \frac{w_i^b}{\sum\limits_{i=1}^n w_i^b} I_{c_b(x_i) \neq y_i} \\
\alpha_b = \log \frac{1-\epsilon_b}{\epsilon_b} \\
w^{b+1}_i = w^b_i \cdot \exp(\alpha_b I_{y_i \neq c_b(x_i)})
\end{eqnarray}
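\textbf{Example} (illustrative numbers): if $\epsilon_b = 0.25$, then $\alpha_b = \log\frac{0.75}{0.25} = \log 3 \approx 1.1$; misclassified samples have their weights multiplied by $e^{\alpha_b} = 3$, so after renormalization they carry half of the total weight in the next round.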
Exponential loss function.\\
Additive logistic regression.\\
Bayesian approach (assumes posteriors).\\
Newton-like updates (gradient descent).\\
Samples misclassified by the previous classifier receive a higher weight in the next round.
{\color{subsectionColor}\subsection{Generative Methods}}
\textbf{Discriminative} - estimate $P(y|x)$ - conditional. \\
\textbf{Generative} - estimate $P(y, x)$ - joint, model data generation.
{\color{subsubsectionColor}\subsubsection{Naive Bayes}}
All features independent.
\begin{eqnarray}
P(y|x) = \frac{1}{Z} P(y) P(x|y), Z = \sum_{y} P(y) P(x|y) \\
y = \argmax_{y'} P(y'|x) = \argmax_{y'} \hat{P}(y') \prod_{i=1}^{d} \hat{P}(x_i|y')
\end{eqnarray}
\textbf{Discriminant Function}
\begin{equation}
f(x) = \log\left(\frac{P(y=1|x)}{P(y=-1|x)}\right), \quad y=sign(f(x))
\end{equation}
{\color{subsubsectionColor}\subsubsection{Fisher's Linear Discriminant Analysis (LDA)}}
Idea: project high dimensional data on one axis.
Complexity: $\mathcal{O}(d^2 n)$ with $d$ the number of dimensions.
\begin{eqnarray}
&& c=2, p=0.5, \hat{\Sigma}_- = \hat{\Sigma}_+ = \hat{\Sigma} \\
y &=& sign(w^\top x + w_0) \\
w &=& \hat{\Sigma}^{-1}(\hat{\mu}_+ - \hat{\mu}_-) \\
w_0 &=& \frac{1}{2}(\hat{\mu}_-^\top \Sigma^{-1} \hat{\mu}_- - \hat{\mu}_+^\top \Sigma^{-1} \hat{\mu}_+)
\end{eqnarray}
{\color{subsectionColor}\subsection{Neural Networks}}
\textbf{Backpropagation algorithm}
\begin{enumerate}[topsep=0pt,itemsep=0ex,partopsep=1ex,parsep=1ex,leftmargin=*]
\item Initialize the weights $W^m_{ij}$ with small random values.
\item Choose a pattern $x^\mu$ and feed it to the input layer ($m = 0$),
i.e., $V_k^0 = x_k^{\mu}$, $\forall k$.
\item Propagate the activity through the network where
\[V^m_i = S \left(\sum\limits_j W^m_{ij}V^{m-1}_j\right)\]
holds for each i and each layer m.
\item Calculate the weight corrections for the output layer
\[\delta^M_i = S\sp{\prime}(h^M_i)[y^\mu_i-V^M_i]\]
by comparing the actual output $V_i^M$ with the desired output \(y^\mu_i\) for pattern $\mu$.
\item Calculate the weight corrections for the previous layers by sending back the error signal
\[\delta^{m-1}_i = S\sp{\prime}(h_i^{m-1})\sum\limits_jW^m_{ji}\delta_j^m\]
for \(m = M, M - 1, \ldots , 2\), until a weight correction has been
calculated for each unit.
\item Use \(\Delta W_{ij} = \eta\delta_i^m V_j^{m-1}\) to adapt all weights according to
\(W^{new}_{ij} = W^{old}_{ij} + \Delta W_{ij}\)
\item Go to step 2 and repeat the loop for the next pattern.
\end{enumerate}
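A minimal NumPy sketch of steps 2--6 for a single hidden layer with sigmoid activation (array shapes and names are my own, not from the lecture):
\begin{verbatim}
import numpy as np

def sigmoid(h):
    return 1.0 / (1.0 + np.exp(-h))

def backprop_step(x, y, W1, W2, eta=0.1):
    # forward pass: V^1 = S(W^1 x), V^2 = S(W^2 V^1)
    v1 = sigmoid(W1 @ x)
    v2 = sigmoid(W2 @ v1)
    # output delta: S'(h) = v*(1-v) for the sigmoid
    d2 = v2 * (1 - v2) * (y - v2)
    # backpropagated hidden delta
    d1 = v1 * (1 - v1) * (W2.T @ d2)
    # weight updates Delta W_ij = eta * delta_i * V_j
    W2 += eta * np.outer(d2, v1)
    W1 += eta * np.outer(d1, x)
    return W1, W2
\end{verbatim}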
{\color{subsectionColor}\subsection{Unsupervised Learning}}
{\color{subsubsectionColor}\subsubsection{Parzen}}
\begin{equation}
\hat{p}_n = \frac{1}{n} \sum\limits_{i=1}^n \frac{1}{V_n} \phi(\frac{x-x_i}{h_n})
\end{equation}
where $\int \phi(x)dx = 1$
{\color{subsubsectionColor}\subsubsection{K-NN}}
\begin{equation}
\hat{p}_n(x) = \frac{k}{n\,V_k}, \quad V_k \text{ the volume containing the } k \text{ nearest neighbours of } x
\end{equation}
{\color{subsubsectionColor}\subsubsection{K-means}}
(clustering = classification)
\begin{equation}
L(\mu) = \sum_{i=1}^{n} \min_{j\in\{1...k\}} \|x_i - \mu_j \|_2^2
\end{equation}
\textbf{Lloyd's Heuristic}: (1) assign each $x_i$ to closest cluster \\
(2) recalculate means of clusters.
Iterate the two steps until the assignments are stable:
\begin{eqnarray}
\text{Step 1: }& c_i = \text{argmin}_c ||x_i-\mu_c||^2 \\
\text{Step 2: }& \mu_c = \frac{1}{n_c} \sum\limits_{i:\, c_i = c} x_i
\end{eqnarray}
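A minimal NumPy sketch of Lloyd's heuristic (initialization and names are my own choice):
\begin{verbatim}
import numpy as np

def kmeans(X, k, iters=100, seed=0):
    rng = np.random.default_rng(seed)
    # initialize centres with k random data points
    mu = X[rng.choice(len(X), k, replace=False)]
    for _ in range(iters):
        # step 1: assign each point to closest centre
        d = ((X[:, None, :] - mu[None, :, :]) ** 2).sum(-1)
        c = d.argmin(axis=1)
        # step 2: recompute centres as cluster means
        new_mu = np.array([X[c == j].mean(axis=0)
                           if np.any(c == j) else mu[j]
                           for j in range(k)])
        if np.allclose(new_mu, mu):
            break
        mu = new_mu
    return mu, c
\end{verbatim}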
{\color{subsubsectionColor}\subsubsection{Gaussian Mixture Modeling}}
Same as Bayes, but class label $z$ unobserved.
\begin{equation}
(\mu^*, \Sigma^*, w^*) = \argmin_{\mu,\Sigma,w} -\sum_i \log \sum_{j=1}^{k} w_j \mathcal{N}(x_i|\mu_j,\Sigma_j)
\end{equation}
{\color{subsubsectionColor}\subsubsection{EM Algorithm}}
Problem: sum within log-term of likelihood.
\textbf{E-step}: expectation: pick clusters for points.
Calculate $\gamma_j^{(t)}(x_i) = \frac{P(c_j|\theta^{(t)})\, P(x_i|c_j,\theta^{(t)})}{\sum_k P(c_k|\theta^{(t)})\, P(x_i|c_k,\theta^{(t)})}$ for each $i$ and $j$\\
\textbf{M-Step}: maximum likelihood: adjust clusters to best fit points.\\
\begin{eqnarray}
\text{prior}_j^{(t)} = \pi^{(t)}_j &\leftarrow& \frac{1}{n}\sum_{i=1}^n \gamma_j^{(t)}(x_i) \\
\mu_j^{(t)} &\leftarrow& \frac{\sum_{i=1}^n \gamma_j^{(t)}(x_i)\, x_i}{\sum_{i=1}^n \gamma_j^{(t)}(x_i)} \\
\Sigma^{(t)}_j &\leftarrow& \frac{\sum_{i=1}^n \gamma_j^{(t)}(x_i)(x_i-\mu_j^{(t)})(x_i-\mu_j^{(t)})^\top}{\sum_{i=1}^n \gamma_j^{(t)}(x_i)}
\end{eqnarray}
{\color{subsubsectionColor}\subsubsection{EM for non-Gaussian mixtures}}
\textbf{Problem setting}
Derive the EM algorithm for a mixture model of the form
\(p(x) = \sum\limits^k_{j=1}\pi_j Pr_{\text{non-Gaussian}}(x\,|\,u_j)\)
\textbf{Solution}
\begin{enumerate}[topsep=0pt,itemsep=0ex,partopsep=1ex,parsep=1ex,leftmargin=*]
\item Write log likelihood function:
\[L(X,\{u_j\}) = \sum\limits^n_{i=1} \log \left( p(x_i)\right)\]
where \(u_j\) are the parameters of \(Pr_{\text{non-Gaussian}}\).
\item Write \(\gamma_j(x_i) = p(z_i = j | x_i)\) (prob. that \(x_i\) was generated from the \(j^{th}\) dist. of the mixture).
\begin{align*}
\gamma_j(x_i) &= p(z_i = j | x_i) = \frac{p(x_i|z_i = j) p(z_i = j)}{p(x_i)}\\
&= \frac{p(x_i|z_i = j)\, p(z_i = j)}{\sum\limits_{l}p(x_i|z_i = l)\, p(z_i = l)}
= \frac{\pi_j Pr_{\text{non-Gaussian}}(x_i|u_j)}{\sum\limits_{l}\pi_l Pr_{\text{non-Gaussian}}(x_i|u_l)}
\end{align*}
\item \textbf{To get the optimal} \(\mathbf{u_j}\): differentiate \(L(X,\{u_j\})\) with respect to each parameter \(u_j\).
You get:
\[\nabla_{u_j} L(X,\{u_j\}) = \sum\limits^n_{i=1}\frac{\pi_j}{p(x_i)} \cdot \nabla_{u_j} Pr_{\text{non-Gaussian}}(x_i|u_j)\]
(this can be rewritten in terms of \(\gamma_j(x_i)\), leaving a factor \(\nabla_{u_j}\log Pr_{\text{non-Gaussian}}(x_i|u_j)\)). Set the gradient to zero and solve for \(u_j\).
\item \textbf{Estimate the \(\pi\) parameters} by maximizing the log-likelihood under the constraint \(\sum\limits^k_{j=1}\pi_j = 1\) with a Lagrange multiplier \(\lambda\); solve for \(\lambda\), substitute back, and obtain the formula for \(\pi_j\) (worked derivation below).
\end{enumerate}
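\textbf{Worked step 4} (standard result, included for reference): setting
\[\frac{\partial}{\partial \pi_j}\left[\sum\limits_{i=1}^n \log p(x_i) - \lambda\left(\sum\limits_{l=1}^k \pi_l - 1\right)\right] = \sum\limits_{i=1}^n \frac{\gamma_j(x_i)}{\pi_j} - \lambda = 0\]
gives $\pi_j = \frac{1}{\lambda}\sum_i \gamma_j(x_i)$; summing over $j$ and using $\sum_j \pi_j = 1$ and $\sum_j \gamma_j(x_i) = 1$ yields $\lambda = n$, hence $\pi_j = \frac{1}{n}\sum_i \gamma_j(x_i)$.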
{\color{subsectionColor}\subsection{Hidden-Markov model}}
State only depends on previous state.
Always given: sequence of symbols $\vec{s} = \{s_1,s_2, \ldots s_n\}$
{\color{subsubsectionColor}\subsubsection{Evaluation (Forward \& Backward)}}
Known: $a_{ij}, e_k(s_t)$\\
Wanted: $P(\vec{s})$ and the posteriors $P(X_t = x_k \mid \vec{s})$
\begin{eqnarray}
f_l (s_{t+1}) = e_l(s_{t+1}) \sum_k f_k(s_t) a_{kl} \\
b_k(s_t) = \sum_l a_{kl}\, e_l(s_{t+1})\, b_l(s_{t+1}) \\
P(\vec{s}) = \sum_k f_k(s_n)\, a_{k,\text{end}} \\
P(X_t = x_l \mid \vec{s}) = \frac{f_l(s_t) b_l(s_t)}{P(\vec{s})}
\end{eqnarray}
Complexity in time: $\mathcal{O}(|S|^2 \cdot T)$
{\color{subsubsectionColor}\subsubsection{Decoding (Viterbi)}}
Known: $a_{ij}, e_k(s_t)$\\
Wanted: most likely path $\vec{x} = \{x_1,x_2,\ldots x_n\}$
\begin{eqnarray}
v_l(s_{t+1}) = e_l(s_{t+1})\, \text{max}_k \{ v_k(s_t) a_{kl} \} \\
\text{ptr}_t(l) = \text{argmax}_k \{ v_k(s_t) a_{kl} \}
\end{eqnarray}
Follow pointers back from end to beginning.
Dynamic programming: the problem splits into sub-problems (one per time step).
Time: $\mathcal{O}(|S|^2 \cdot T)$
Space $\mathcal{O}(|S| \cdot T)$
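A minimal NumPy sketch of Viterbi decoding in log space (array layout, argument names, and the initial distribution pi are assumptions, not from the lecture):
\begin{verbatim}
import numpy as np

def viterbi(obs, a, e, pi):
    # obs: symbol indices; a[k,l]: transition;
    # e[k,b]: emission; pi[k]: initial distribution
    T, S = len(obs), a.shape[0]
    la, le, lp = np.log(a), np.log(e), np.log(pi)
    v = np.full((T, S), -np.inf)
    ptr = np.zeros((T, S), dtype=int)
    v[0] = lp + le[:, obs[0]]
    for t in range(1, T):
        for l in range(S):
            s = v[t - 1] + la[:, l]
            ptr[t, l] = s.argmax()
            v[t, l] = le[l, obs[t]] + s.max()
    # follow pointers back from the best final state
    path = [int(v[-1].argmax())]
    for t in range(T - 1, 0, -1):
        path.append(int(ptr[t, path[-1]]))
    return path[::-1]
\end{verbatim}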
{\color{subsubsectionColor}\subsubsection{Learning (Baum-Welch)}}
Known: only the observed sequence(s) and the model space $\Theta$\\
Wanted: $a_{ij}, e_k(s_t)$ \& most likely path $\vec{x} = \{x_1,x_2,\ldots x_n\}$
\textbf{E-step I:} $f_k(s_t), b_k(s_t)$ by forward \& backward algorithm\\
\textbf{E-step II:}
\begin{eqnarray}
P(X_t = x_k, X_{t+1} = x_l | \vec{s}, \Theta) = \\
\frac{1}{P(\vec{s})} f_k(s_t) a_{kl} e_l(s_{t+1}) b_l(s_{t+1}) \\
A_{kl} = \sum\limits_{j=1}^m \sum\limits_{t=1}^n P(X_t = x_k, X_{t+1} = x_l | \vec{s}, \Theta)
\end{eqnarray}
\textbf{M-step:}
\begin{eqnarray}
a_{kl} = \frac{A_{kl}}{\sum\limits_{l'} A_{kl'}} \text{ and } e_k(b) = \frac{E_k(b)}{\sum_{b'} E_k(b')}
\end{eqnarray}
where $E_k(b) = \sum\limits_{j=1}^m \sum\limits_{t:\, s_t = b} P(X_t = x_k \mid \vec{s}, \Theta)$ is the expected number of emissions of $b$ from state $k$.
Complexity: $\mathcal{O}(|S|^2)$ in storage (space)
\end{multicols}
\end{landscape}
\end{document}