-
Notifications
You must be signed in to change notification settings - Fork 15
/
15-Cheatsheet2014-CVDP.tex
executable file
·392 lines (339 loc) · 15.2 KB
/
15-Cheatsheet2014-CVDP.tex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
\documentclass[main]{subfiles}
\begin{document}
%% Big thanks go to Helen Oleynikova who attended Machine Learning 2013
%% Tex source of the cheat sheet found at https://www.amiv.ethz.ch/studium/unterlagen/65
%change spacings according to your preferences (readability vs. content)
% Syntax : \titlespacing*{<command>}{<left>}{<before-sep>}{<after-sep>}
\titlespacing*{\section}
{0pt}{0.5ex plus 1ex minus .2ex}{0.3ex plus .2ex}
\titlespacing*{\subsection}
{0pt}{0.5ex plus 1ex minus .2ex}{0.3ex plus .2ex}
\titlespacing*{\subsubsection}
{0pt}{0.5ex plus 1ex minus .2ex}{0.3ex plus .2ex}
\setlength{\topmargin}{10mm-1in} %{20mm} %Topmargin
\setlength{\oddsidemargin}{10mm-1in} %{35mm} %Left margin
\setlength{\headsep}{2mm} %Body starts 55mm from top sheet edge
\setlength{\textheight}{277mm} %Best {140mm}
\setlength{\footskip}{0mm} %{15mm}
\setlength{\textwidth}{190mm} %{212mm}
\setlength{\itemsep}{0pt}
\setlength{\parskip}{0pt}
\setlength{\parsep}{0pt}
\begin{landscape}
%this removes the section title if the cheat sheet is compiled separately
\ifx\cheatsheet\undefined
\addcontentsline{toc}{section}{Cheat sheet 2014}
\else
{\color{sectionColor}\section{Cheat sheet 2014 Corneel Van der Pol}} %Killing this gives us more space
\fi
%VISUALIZE LAYOUT AND LAYOUTVALUES:
%\pagevalues
%\newpage asd\newpage
%\layout \newpage adsf \newpage
%\newpage
%
%
\begin{multicols}{4}
%%%%%%%%%%%%%%%%%%%%%%%%%
\small
%\scriptsize
%\footnotesize
%\small
{\color{subsectionColor}\subsection{Probability}}
{\color{subsubsectionColor}\subsubsection{Probability Rules}}
%%%
\begin{tabular}{p{5em}l}
Sum Rule& \(P(X=x_i) = \sum_{j=1}^{J} P(X=x_i,Y=y_j)\)\\
Product rule& \(P(X, Y) = P(Y|X) P(X)\) \\
Independence& \(P(X, Y) = P(X)P(Y)\) \\
Bayes' Rule& \(P(Y|X) = \frac{P(X|Y)P(Y)}{P(X)}\) \\
Conditional independence& \(X\bot Y|Z \)\\
& \(P(X,Y|Z) = P(X|Z)P(Y|Z)\) \\
& \(P(X|Z,Y) = P(X|Z)\)
\end{tabular}
{\color{subsubsectionColor}\subsubsection{Expectation}}
\begin{tabular}{l}
\(E(X) = \int_{-\infty}^{\infty} x\, p(x)\, dx\) \\
\(\sigma^2(X) = E(x^2)-{E(x)}^2\) \\
\(\sigma^2(X) = \int_x (x-\mu_x)^2 p(x) dx\) \\
\(Cov(X, Y) = \int_x \int_y p(x,y) (x-\mu_x)(y-\mu_y) dx dy\)
\end{tabular}
{\color{subsubsectionColor}\subsubsection{Gaussian}}
\begin{equation}
p(X|\mu,\sigma)=\frac{1}{\sigma \sqrt{2\pi}} \exp(-\frac{(x-\mu)^2}{2\sigma^2})
\end{equation}
{\color{subsubsectionColor}\subsubsection{Kernels}}
Requirements: Symmetric ($k(x,y)=k(y,x)$) \\ positive semi-definite $K$.
\begin{tabular}{l}
\(k(x,y) = a k_1(x,y) + b k_2(x,y)\)\\
\(k(x,y) = k_1(x,y)k_2(x,y)\)\\
\(k(x,y) = f(x) f(y)\)\\
\(k(x,y) = k_3(\phi(x), \phi(y))\) \\
with $a>0,b>0$
\end{tabular}
\begin{tabular}{ll}
Linear & \(k(x,y) = x^\top y\)\\
Polynomial & \(k(x,y) = (x^\top y + 1)^d\)\\
Gaussian RBF & \(k(x,y) = \exp(\frac{-\|x-y\|^2_2}{h^2})\)\\
Sigmoid & \(k(x,y) = \tanh(k x^\top y - b)\)
\end{tabular}
{\color{subsectionColor}\subsection{Regression}}
\textbf{Linear Regression:}
\begin{equation}
\min_{w} \sum_{i=1}^n (y_i - w^\top x_i)^2
\end{equation}
Closed form solution: $\beta^* = (X^\top X)^{-1} X^\top Y$ \\
\textbf{Ridge Regression:}
\begin{equation}
\min_{w} \sum_{i=1}^n (y_i - w^\top x_i)^2 + \lambda \|w\|_2^2
\end{equation}
Closed form solution: $\beta^* = (X^\top X + \lambda I)^{-1} X^\top Y$ \\
\textbf{Lasso Regression} (sparse), requiring $\sum\limits_j^d |\beta_j| < s$:
\begin{equation}
\min_{w} \sum_{i=1}^n (y_i - w^\top x_i)^2 + \lambda \|w\|_1
\end{equation}
\textbf{Kernelized Linear Regression:}
\begin{equation}
\min_{\alpha} \|K\alpha - y \|_2^2 + \lambda \alpha^\top K \alpha
\end{equation}
Closed form solution: $\alpha = (K+\lambda I)^{-1} y $ \\
\textbf{Nonlinear Regression}:
\begin{equation}
RSS(f,\lambda) = \sum\limits_{i=1}^n (y_i - f(x_i))^2 + \lambda \int (f''(x))^2dx
\end{equation}
\begin{equation}
f(x) = \sum\limits_{m=1}^M \beta_m h_m(x) \text{ with } h_m \in \{1,x,(x-\xi_1)^2\}
\end{equation}
$\xi_1$ are knots, the center of a function.
$f''(x)=0$ is required to have smoothness
{\color{subsectionColor}\subsection{Classification}}
\begin{eqnarray}
\text{0/1 Loss} & w^* =& \argmin_w \sum_{i=1}^n [y_i \neq sign(w^\top x_i)] \\
\text{Perceptron} & w^* =& \argmin_w \sum_{i=1}^n \max(0, -y_i w^\top x_i)
\end{eqnarray}
{\color{subsubsectionColor}\subsubsection{SVM}}
Primal, constrained:
\begin{equation}
\min_{w} w^\top w + C \sum_{i=1}^{n} \xi_i, \text{ s.t. } y_i w^\top x_i \geq 1 - \xi_i, \xi_i \geq 0
\end{equation}
Primal, unconstrained:
\begin{equation}
\min_{w} w^\top w + C \sum_{i=1}^{n} \max(0, 1-y_i w^\top x_i) \text{ (hinge loss)}
\end{equation}
Dual:
\begin{equation}
\max_{\alpha} \sum_{i=1}^{n} \alpha_i - \frac{1}{2} \sum_{i,j} \alpha_i \alpha_j y_i y_j x_i^\top x_j, \text{ s.t. } 0 \leq \alpha_i \leq C
\end{equation}
Dual to primal: $w^* = \sum_{i=1}^{n} \alpha^*_i y_i x_i$, $\alpha_i > 0$: support vector.
{\color{subsubsectionColor}\subsubsection{Kernelized SVM}}
\begin{equation}
\max_{\alpha} \sum_{i=1}^{n} \alpha_i - \frac{1}{2} \sum_{i,j} \alpha_i \alpha_j y_i y_j k(x_i, x_j), \text{ s.t. } 0 \leq \alpha_i \leq C
\end{equation}
Classify: $y = sign(\sum_{i=1}^{n} \alpha_i y_i k(x_i, x))$
{\color{subsubsectionColor}\subsubsection{How to find $a^T$?}}
$a = \{w_0,w\}$ used along $\widetilde{x} = \{1,x\}$
Gradient Descent: $a(k+1) = a(k) - \eta(k) \nabla J(a(k))$
Newton method: 2nd order Taylor to get $\eta_{opt} = H^{-1}$ with $H=\frac{\partial^2 J}{\partial a_i \partial a_j}$
$J$ is the cost matrix, popular choice is
Perceptron cost: $J_p (a) = \sum(-a^T \widetilde{x})$
{\color{subsubsectionColor}\subsubsection{Bootstrapping}}
Method for reducing variance
\begin{eqnarray}
\bar{S} = \frac{1}{B}\sum S(Z^*)\\
\sigma^2(S) = \frac{1}{B-1} \sum (S(Z^*) - \bar{S})^2\\
\end{eqnarray}
Bootstrapping works if for $n \rightarrow \infty$ the error of empirical\&bootstrap is the same as real\&empirical
Probability for a sample not to appear in set: $(1-\frac{1}{n})^n$. Goes to $\frac{1}{e}$ for $n \rightarrow \infty$
Multiplicity N sample to choose k times with replacement: $\binom{N-1+k}{k}$. In bootstrapping $N=k$
{\color{subsubsectionColor}\subsubsection{Jackknife}}
Method for debiasing at the price of variance
{\color{subsectionColor}\subsection{Misc}}
\textbf{Lagrangian:} $f(x,y) s.t. g(x,y) = c$
\begin{equation}
\mathcal{L}(x, y, \gamma) = f(x,y) - \gamma ( g(x,y)-c)
\end{equation}
\textbf{Parametric learning}: model is parameterized with a finite set of parameters, like linear regression, linear SVM, etc. \\ \\
\textbf{Nonparametric learning}: models grow in complexity with quantity of data: kernel SVM, k-NN, etc.
{\color{subsectionColor}\subsection{Probabilistic Methods:}}
{\color{subsubsectionColor}\subsubsection{MLE}}
Least Squares, Gaussian Noise
\begin{align}
L(w) &= -\log(P(y_1 ... y_n | x_1 ... x_n, w)) \\
&= \frac{n}{2} \log(2\pi\sigma^2) + \sum_{i=1}^{n} \frac{(y_i-w^\top x_i)^2}{2\sigma^2}
\end{align}
\begin{align}
\argmax_w P(y|x, w) &= \argmin_w L(w) \\
&= \argmin_w \sum_{i=1}^{n} (y_i-w^\top x_i)^2
\end{align}
\begin{eqnarray}
\widehat\sigma^2 = \frac{1}{n} \sum_{i=1}^{n} (x_{i} - \bar{x})^2,
E \left[ \widehat\sigma^2 \right]= \frac{n-1}{n}\sigma^2
\end{eqnarray}
{\color{subsubsectionColor}\subsubsection{MAP}}
Ridge regression, Gaussian prior on weights
\begin{align}
&\hat{y}_{MAP} = \argmax_w P(w) \prod_{i}^{n} P(y_i|x_i,w)\\
&=\argmin_w \frac{1}{2\sigma^2} \sum_{i=1}^{n} (y_i - w^\top x_i)^2 + \frac{1}{2 \beta^2}\sum_{j=1}^{d}w_j^2
\end{align}
$P(w)$ or $P(\theta)$ - conjugate prior (beta, Gaussian) (posterior same class as prior) \\
$P(y_i|\theta)$ - likelihood function (binomial, multinomial, Gaussian) \\
\textbf{Beta distribution}: $P(\theta) = Beta(\theta; \alpha_1, \alpha_2) \propto \theta^{\alpha_1 - 1}(1-\theta)^{\alpha_2-1}$
{\color{subsubsectionColor}\subsubsection{Bayesian Decision Theory}}
\begin{equation}
a^* = \argmin_{a \in A} E_y[C(y,a)|x]
\end{equation}
{\color{subsectionColor}\subsection{Ensemble Methods}}
Use combination of simple hypotheses (weak learners) to create one strong learner.
strong learner: error can be made smaller than any given $\delta$, $0 < \delta < 0.5$
weak learner: error is strictly below $0.5$ (better than random guessing)
\begin{equation}
f(x) = \sum_{i=1}^{n} \beta_i h_i(x)
\end{equation}
\textbf{Bagging}: train weak learners on bootstrapped sets with equal weights. \\
\textbf{Boosting}: train on all data, but reweigh misclassified samples higher.
{\color{subsubsectionColor}\subsubsection{Decision Trees}}
\textbf{Stumps}: partition linearly along 1 axis
\begin{equation}
h(x) = sign(a x_i - t)
\end{equation}
\textbf{Decision Tree}: recursive tree of stumps, leaves have labels. To train, either label if leaf's data is pure enough, or split data based on score.
{\color{subsubsectionColor}\subsubsection{AdaBoost}}
Effectively minimize exponential loss.
\begin{equation}
f^*(x) = \argmin_{f\in F} \sum_{i=1}^{n} \exp(-y_i f(x_i))
\end{equation}
Train $m$ weak learners, greedily selecting each one
\begin{equation}
(\beta_i, h_i) = \argmin_{\beta,h} \sum_{i=1}^{n} \exp(-y_i (f_{i-1} (x_j) + \beta h(x_j)))
\end{equation}
\begin{eqnarray}
c_b(x) \text { trained with } w_i \\
\epsilon_b = \sum\limits_i^n \frac{w_i^b}{\sum\limits_j^n w_j^b} I_{c(x_i) \neq y_i} \\
\alpha_b = \log \frac{1-\epsilon_b}{\epsilon_b} \\
w_i = e^{-\alpha_b I_{y_i \neq c_b(x_i)}}
\end{eqnarray}
Exponential loss function
Additive logistic regression
Bayesian approached (assumes posteriors)
Newtonlike updates (Gradient Descent)
If previous classifier bad, next has high weight
{\color{subsectionColor}\subsection{Generative Methods}}
\textbf{Discriminative} - estimate $P(y|x)$ - conditional. \\
\textbf{Generative} - estimate $P(y, x)$ - joint, model data generation.
{\color{subsubsectionColor}\subsubsection{Naive Bayes}}
All features independent.
\begin{eqnarray}
P(y|x) = \frac{1}{Z} P(y) P(x|y), Z = \sum_{y} P(y) P(x|y) \\
y = \argmax_{y'} P(y'|x) = \argmax_{y'} \hat{P}(y') \prod_{i=1}^{d} \hat{P}(x_i|y')
\end{eqnarray}
\textbf{Discriminant Function}
\begin{equation}
f(x) = \log(\frac{P(y=1|x)}{P(y=-1|x)}), y=sign(f(x))
\end{equation}
{\color{subsubsectionColor}\subsubsection{Fisher's Linear Discriminant Analysis (LDA)}}
Idea: project high dimensional data on one axis.
Complexity: $\mathcal{O}(d^2 n)$ with $d$ the feature dimension
\begin{eqnarray}
&& c=2, p=0.5, \hat{\Sigma}_- = \hat{\Sigma}_+ = \hat{\Sigma} \\
y &=& sign(w^\top x + w_0) \\
w &=& \hat{\Sigma}^{-1}(\hat{\mu}_+ - \hat{\mu}_-) \\
w_0 &=& \frac{1}{2}(\hat{\mu}_-^\top \Sigma^{-1} \hat{\mu}_- - \hat{\mu}_+^\top \Sigma^{-1} \hat{\mu}_+)
\end{eqnarray}
{\color{subsectionColor}\subsection{Unsupervised Learning}}
{\color{subsubsectionColor}\subsubsection{Parzen}}
\begin{equation}
\hat{p}_n = \frac{1}{n} \sum\limits_{i=1}^n \frac{1}{V_n} \phi(\frac{x-x_i}{h_n})
\end{equation}
where $\int \phi(x)dx = 1$
{\color{subsubsectionColor}\subsubsection{K-NN}}
\begin{equation}
\hat{p}_n = \frac{1}{V_k} \text{ volume with } k \text{ neighbours}
\end{equation}
{\color{subsubsectionColor}\subsubsection{K-means}}
(clustering = classification)
\begin{equation}
L(\mu) = \sum_{i=1}^{n} \min_{j\in\{1...k\}} \|x_i - \mu_j \|_2^2
\end{equation}
\textbf{Lloyd's Heuristic}: (1) assign each $x_i$ to closest cluster \\
(2) recalculate means of clusters.
Iteration over (repeated till stable):
\begin{eqnarray}
\text{Step 1: }& \text{argmin}_c ||x-\mu_c||^2 \\
\text{Step 2: }& \mu_\alpha = \frac{1}{n_\alpha} \sum \vec{x}
\end{eqnarray}
{\color{subsubsectionColor}\subsubsection{Gaussian Mixture Modeling}}
Same as Bayes, but class label $z$ unobserved.
\begin{equation}
(\mu^*, \Sigma^*, w^*) = \argmin -\sum_i \log \sum_{j=1}^{k} w_j \mathcal{N}(x_i|\mu_j,\Sigma_j)
\end{equation}
{\color{subsubsectionColor}\subsubsection{EM Algorithm}}
Problem: sum within log-term of likelihood.
\textbf{E-step}: expectation: pick clusters for points.
Calculate $\gamma_j^{(t)}(x_i) = \frac{P(c_j|\theta^{(t)})\, P(x_i|c_j,\theta^{(t)})}{\sum_k P(c_k|\theta^{(t)})\, P(x_i|c_k,\theta^{(t)})}$ for each $i$ and $j$\\
\textbf{M-Step}: maximum likelihood: adjust clusters to best fit points.\\
\begin{eqnarray}
\text{prior}_j^{(t)} = \pi^{(t)}_j &\leftarrow& \frac{1}{n}\sum_{i=1}^n \gamma_j^{(t)}(x_i) \\
\mu_j^{(t)} &\leftarrow& \frac{\sum_{i=1}^n \gamma_j^{(t)}(x_i)\, x_i}{\sum_{i=1}^n \gamma_j^{(t)}(x_i)} \\
\Sigma^{(t)}_j &\leftarrow& \frac{\sum_{i=1}^n \gamma_j^{(t)}(x_i)(x_i-\mu_j^{(t)})(x_i-\mu_j^{(t)})^\top}{\sum_{i=1}^n \gamma_j^{(t)}(x_i)}
\end{eqnarray}
\subsubsection{EM for non-gaussian mixture}
\textbf{Problem setting}
Derive the EM algorithm. The model behind the EM in the form:
\(p(x) = \sum\limits^n_{j=1}\pi_j Pr_{\text{non-gaussian}}\)
\textbf{Solution}
\begin{enumerate}[topsep=0pt,itemsep=-2ex,partopsep=1ex,parsep=1ex]
\item Write log likelihood function:
\[L(X,\{\text{params}Pr_{non-gaussian} u_js\}) = \sum\limits^n_{i=1} \log \left( p(x_i)\right)\]
\item Write \(\gamma_j(x_i) = p(z_j = j | x_i)\) (prob. that \(x_i\) was generated from \(j^{th}\) dist. of the mixture).
\begin{align*}
\gamma_j(x_i) &= p(z_i = j | x_i) = \frac{p(x_i|z_i = j) p(z_i = j)}{p(x_i)}\\
&= \frac{p(x_i|z_i = j) p(z_i = j)}{\sum\limits^n_{k=1}p(x_i|z_i = k) p(z_i = k)}
= \frac{\pi_j L(X,u_js)}{\sum^n_{k=1}\pi_k L(X,u_js)}
\end{align*}
\item \textbf{To get optimal }\(\mathbf{u_js}\), derive \(L(x,u_js)\) by each param \(u_j\).
You get:
\[\nabla x_j L(X,u_js) = \sum\limits^n_{i=1}\frac{1}{Pr_{non-gaussian}} \cdot (\nabla u_j Pr_{non-gaussian})\](possibly replace by \(\gamma_j(x_i)\) (above and below) leaving back some factor). Solve for the parameter \(u_j\).
\item \textbf{Estimate the \(\pi\) parameters} by Lagrange optimization on log likelihood function and constraint \(\lambda \left(\sum\limits^k_{j=1}\pi_j -1\right) \). Put the \(\lambda\) into the formula and find the formula for \(\pi_j\).
\end{enumerate}
{\color{subsectionColor}\subsection{Hidden-Markov model}}
State only depends on previous state.
Always given: sequence of symbols $\vec{s} = \{s_1,s_2, \ldots s_n\}$
{\color{subsubsectionColor}\subsubsection{Evaluation (Forward \& Backward)}}
Known: $a_{ij}, e_k(s_t)$
Wanted: $P(X = x_i | S = s_t)$
\begin{eqnarray}
f_l (s_{t+1}) = e_l(s_{t+1}) \sum f_k(s_t) a_{kl} \\
b_l(s_t) = \sum_k a_{lk} e_k(s_{t+1}) b_k(s_{t+1}) \\
P(\vec{s}) = \sum_k f_k(s_n) a_{k,\text{end}} \\
P(x_{l,t} | \vec{s}) = \frac{f_l(s_t) b_l(s_t)}{P(\vec{s})}
\end{eqnarray}
Complexity in time: $\mathcal{O}(|S|^2 \cdot T)$
{\color{subsubsectionColor}\subsubsection{Decoding (Viterbi)}}
Known: $a_{ij}, e_k(s_t)$
Wanted: most likely path $\vec{x} = \{x_1,x_2,\ldots x_n\}$
\begin{eqnarray}
v_l(s_{t+1}) = \text{max}_k \{ v_k(s_t) a_{kl} \} \\
\text{ptr}_t(l) = \text{argmax}_k \{ v_k(s_t) a_{kl} \}
\end{eqnarray}
Follow pointers back from end to beginning.
Dynamic, because splittable in sub parts (per time step)
Time: $\mathcal{O}(|S|^2 \cdot T)$
Space $\mathcal{O}(|S| \cdot T)$
{\color{subsubsectionColor}\subsubsection{Learning (Baum-Welch)}}
Known: only sequence and sequence space $\Theta$
Wanted: $a_{ij}, e_k(s_t)$ \& most likely path $\vec{x} = \{x_1,x_2,\ldots x_n\}$
\textbf{E-step I:} $f_k(s_t), b_k(s_t)$ by forward \& backward algorithm
\textbf{E-step II:}
\begin{eqnarray}
P(X_t = x_k, X_{t+1} = x_l | \vec{s}, \Theta) = \\
\frac{1}{P(\vec{s})} f_k(s_t) a_{kl} e_l(s_{t+1}) b_l(s_{t+1}) \\
A_{kl} = \sum\limits_{j=1}^m \sum\limits_{t=1}^n P(X_t = x_k, X_{t+1} = x_l | \vec{s}, \Theta)
\end{eqnarray}
\textbf{M-step :}
\begin{eqnarray}
a_{kl} = \frac{A_{kl}}{\sum\limits_i^n A_{ki}} \text{ and } e_k(b) = \frac{E_k(b)}{\sum_{b'} E_k(b')}
\end{eqnarray}
Complexity: $\mathcal{O}(|S|^2)$ in storage (space)
\end{multicols}
\end{landscape}
\end{document}