\documentclass[main]{subfiles}
\begin{document}
%% Big thanks go to Helen Oleynikova who attended Machine Learning 2013
%% Tex source of the cheat sheet found at https://www.amiv.ethz.ch/studium/unterlagen/65
%change spacings according to your preferences (readability vs. content)
% Syntax : \titlespacing*{<command>}{<left>}{<before-sep>}{<after-sep>}
\titlespacing*{\section}
{0pt}{0.1pt}{0.1pt}
\titlespacing*{\subsection}
{0pt}{0.1pt}{0.1pt}
\titlespacing*{\subsubsection}
{0pt}{0.1pt}{0.1pt}
%############
% Cheatsheet setup depending on whether it is compiled on its own or within the whole summary
%############
\ifx\cheatsheet\undefined
%change the page layout to full page. Compiling this document on its own produces a full-page layout.
\setlength{\topmargin}{-1.2in} %{20mm} %Topmargin
\setlength{\headsep}{2mm} %Body starts 55mm from top sheet edge
\setlength{\footskip}{0mm} %{15mm}
\setlength{\itemsep}{0pt}
\setlength{\parskip}{0pt}
\setlength{\parsep}{0pt}
\setlength{\textwidth}{208mm-4mm} % A4 page width 210
\setlength{\textheight}{295mm-4mm} %A4 page height 297
\addtolength{\voffset}{15pt}
\setlength{\oddsidemargin}{-0.8in}
\setlength{\evensidemargin}{-1in}
\else
%use these settings if the cheat sheet is compiled as part of the whole summary. This may make the cheat sheet longer than two pages.
\setlength{\topmargin}{15mm} %{20mm} %Topmargin
\setlength{\headsep}{2mm} %Body starts 55mm from top sheet edge
\setlength{\footskip}{0mm} %{15mm}
\setlength{\itemsep}{0pt}
\setlength{\parskip}{0pt}
\setlength{\parsep}{0pt}
\setlength{\textwidth}{190mm} % A4 page width 210
\setlength{\textheight}{277mm} %Best {140mm}
\addtolength{\voffset}{-30mm}
\setlength{\oddsidemargin}{10mm-1in} %{35mm} %Left margin
\setlength{\evensidemargin}{10mm}
\fi
\begin{landscape}
%this removes the section title if the cheat sheet is compiled separately
\ifx\cheatsheet\undefined
\addcontentsline{toc}{section}{Cheat sheet 2014}
\else
{\color{sectionColor}\section{Cheat sheet 2014 Benjamin Ellenberger}}
\fi
%VISUALIZE LAYOUT AND LAYOUTVALUES:
%\pagevalues
%\newpage asd\newpage
%\layout \newpage adsf \newpage
%\newpage
%
%
\begin{multicols}{4}
%%%%%%%%%%%%%%%%%%%%%%%%%
\scriptsize
%\footnotesize
%\small
{\color{subsectionColor}\subsection{Probability}}
{\color{subsubsectionColor}\subsubsection{Probability Rules}}
%%%
\begin{tabular}{p{5em}l}
Sum Rule& \(P(X=x_i) = \sum_{j=1}^{J} P(X=x_i,Y=y_j)\)\\
Product rule& \(P(X, Y) = P(Y|X) P(X)\) \\
Independence& \(P(X, Y) = P(X)P(Y)\) \\
Bayes' Rule& \(P(Y|X) = \frac{P(X|Y)P(Y)}{P(X)} = \frac{P(X|Y)P(Y)}{\sum\limits^k_{i=1}P(X|Y_i)P(Y_i)}\)\\
Conditional independence& \(X\bot Y|Z \)\\
& \(P(X,Y|Z) = P(X|Z)P(Y|Z)\) \\
& \(P(X|Z,Y) = P(X|Z)\)
\end{tabular}
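\textbf{Example} (illustrative numbers, not from the lecture): a rare event $D$ with $P(D)=0.01$ and a noisy test with $P(+|D)=0.9$, $P(+|\neg D)=0.1$ gives
\[P(D|+) = \frac{0.9 \cdot 0.01}{0.9 \cdot 0.01 + 0.1 \cdot 0.99} = \frac{0.009}{0.108} \approx 0.083\]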
{\color{subsubsectionColor}\subsubsection{Expectation}}
\begin{tabular}{l}
\(E(X) = \int_{-\infty}^{\infty} x\, p(x)\, dx\) \\
\(\sigma^2(X) = E(X^2)-{E(X)}^2\) \\
\(\sigma^2(X) = \int_x (x-\mu_x)^2 p(x) dx\) \\
\(Cov(X, Y) = \int_x \int_y p(x,y) (x-\mu_x)(y-\mu_y) dx dy\)
\end{tabular}
{\color{subsubsectionColor}\subsubsection{Gaussian}}
\begin{align}
p(X|\mu,\sigma)&=\frac{1}{\sigma \sqrt{2\pi}} \exp(-\frac{(x-\mu)^2}{2\sigma^2})\\
&=\frac{1}{\sqrt{(2\pi)^k |\Sigma|}} \exp(-\frac{1}{2}(x-\mu)^\top\Sigma^{-1}(x-\mu)) \quad \text{(multivariate)}
\end{align}
{\color{subsubsectionColor}\subsubsection{Kernels}}
Requirements: symmetric ($k(x,y)=k(y,x)$) and \\ positive semi-definite Gram matrix $K$.
\begin{tabular}{l}
\(k(x,y) = a k_1(x,y) + b k_2(x,y)\)\\
\(k(x,y) = k_1(x,y)k_2(x,y)\)\\
\(k(x,y) = f(x) f(y)\)\\
\(k(x,y) = k_3(\phi(x), \phi(y))\) \\
with $a>0,b>0$
\end{tabular}
\begin{tabular}{ll}
Linear & \(k(x,y) = x^\top y\)\\
Polynomial & \(k(x,y) = (x^\top y + 1)^d\)\\
Gaussian RBF & \(k(x,y) = \exp(\frac{-\|x-y\|^2_2}{h^2})\)\\
Sigmoid & \(k(x,y) = \tanh(k x^\top y - b)\)
\end{tabular}
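\textbf{Example} (illustrative): for scalar inputs, the polynomial kernel with $d=2$ corresponds to an explicit feature map:
\[(xy+1)^2 = x^2y^2 + 2xy + 1 = \phi(x)^\top\phi(y), \quad \phi(x) = (x^2, \sqrt{2}\,x, 1)^\top\]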
{\color{subsectionColor}\subsection{Regression}}
\textbf{Linear Regression:}
\begin{equation}
\min_{w} \sum_{i=1}^n (y_i - w^\top x_i)^2
\end{equation}
Closed form solution: $\beta^* = (X^\top X)^{-1} X^\top Y$ \\
\textbf{Ridge Regression:}
\begin{equation}
\min_{w} \sum_{i=1}^n (y_i - w^\top x_i)^2 + \lambda \|w\|_2^2
\end{equation}
Closed form solution: $\beta^* = (X^\top X + \lambda I)^{-1} X^\top Y$ \\
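\textbf{Derivation sketch} (standard, included for reference): in matrix form the ridge objective is $\|y - Xw\|_2^2 + \lambda\|w\|_2^2$; setting the gradient to zero,
\[-2X^\top(y - Xw) + 2\lambda w = 0 \;\Rightarrow\; (X^\top X + \lambda I)w = X^\top y,\]
which gives the closed form above ($\lambda = 0$ recovers plain linear regression).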
\textbf{Lasso Regression} (sparse), requiring $\sum\limits_{j=1}^d |\beta_j| < s$:
\begin{equation}
\min_{w} \sum_{i=1}^n (y_i - w^\top x_i)^2 + \lambda \|w\|_1
\end{equation}
\textbf{Kernelized Linear Regression:}
\begin{equation}
\min_{\alpha} \|K\alpha - y \|_2^2 + \lambda \alpha^\top K \alpha
\end{equation}
Closed form solution: $\alpha^* = (K+\lambda I)^{-1} y $ \\
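\textbf{Derivation sketch}: setting the gradient of the kernelized objective to zero,
\[2K(K\alpha - y) + 2\lambda K\alpha = 2K\left((K + \lambda I)\alpha - y\right) = 0,\]
is satisfied by $\alpha^* = (K + \lambda I)^{-1} y$.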
\textbf{Nonlinear Regression}:
\begin{equation}
RSS(f,\lambda) = \sum\limits_{i=1}^n (y_i - f(x_i))^2 + \lambda \int (f''(x))^2dx
\end{equation}
\begin{equation}
f(x) = \sum\limits_{m=1}^M \beta_m h_m(x) \text{ with } h_m \in \{1,\, x,\, (x-\xi_1)^2\}
\end{equation}
$\xi_1$ is a knot (the centre of a basis function).
Smoothness is enforced by the $\int (f''(x))^2 dx$ penalty; natural splines additionally require $f''(x)=0$ at the boundaries.\\
\textbf{Bias vs. Variance}
\begin{align}
&\E_D\E_{X,Y}\left(\hat{f}(X)-Y\right)^2 = \E_D\E_X\left(\hat{f}(X) - \E(Y|X)\right)^2 \\
&+ \E_{X,Y}\left(Y - \E(Y|X)\right)^2\\
&= \underbrace{\E_X \E_D\left(\hat{f}(X) - \E_D(\hat{f}(X))\right)^2}_{\text{variance}} \\
&+ \underbrace{\E_X\left(\E_D(\hat{f}(X)) - \E(Y|X)\right)^2}_{\text{bias}^2}+ \underbrace{\E_{X,Y}\left(Y - \E(Y|X)\right)^2}_{\text{noise}}
\end{align}
\textbf{Combining regressions}
\begin{align}
\text{bias}[\hat{f}(x)] &= \E_D \hat{f} (x) - \E(Y|x) = \frac{1}{B} \sum\limits_{i=1}^B \E_D\hat{f}_i(x) - \E(Y|x) \\
&= \frac{1}{B} \sum\limits_{i=1}^B (\E_D\hat{f}_i(x) - \E(Y|x)) = \frac{1}{B} \sum\limits_{i=1}^B \text{bias}[\hat{f}_i(x)]
\end{align}
\[\V\{ \hat{f} (x)\} \approx \frac{\sigma^2}{B}\]
{\color{subsectionColor}\subsection{Classification}}
\begin{eqnarray}
\text{0/1 Loss} & w^* =& \argmin_w \sum_{i=1}^n [y_i \neq sign(w^\top x_i)] \\
\text{Perceptron} & w^* =& \argmin_w \sum_{i=1}^n \max(0, -y_i w^\top x_i)
\end{eqnarray}
{\color{subsubsectionColor}\subsubsection{SVM}}
Primal, constrained:
\begin{equation}
\min_{w} w^\top w + C \sum_{i=1}^{n} \xi_i, \text{ s.t. } y_i w^\top x_i \geq 1 - \xi_i, \xi_i \geq 0
\end{equation}
Primal, unconstrained:
\begin{equation}
\min_{w} w^\top w + C \sum_{i=1}^{n} \max(0, 1-y_i w^\top x_i) \text{ (hinge loss)}
\end{equation}
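A minimal NumPy sketch of batch subgradient descent on this unconstrained primal (function and variable names are my own, not from the lecture):
\begin{verbatim}
import numpy as np

def svm_subgradient(X, y, C=1.0, iters=500, eta=0.01):
    """Subgradient descent on
    w^T w + C * sum_i max(0, 1 - y_i w^T x_i)."""
    n, d = X.shape
    w = np.zeros(d)
    for _ in range(iters):
        margins = y * (X @ w)
        active = margins < 1   # hinge loss nonzero
        grad = 2 * w - C * (y[active, None]
                            * X[active]).sum(axis=0)
        w -= eta * grad
    return w
\end{verbatim}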
Dual:
\begin{equation}
\max_{\alpha} \sum_{i=1}^{n} \alpha_i - \frac{1}{2} \sum_{i,j} \alpha_i \alpha_j y_i y_j x_i^\top x_j, \text{ s.t. } 0 \leq \alpha_i \leq C
\end{equation}
Dual to primal: $w^* = \sum_{i=1}^{n} \alpha^*_i y_i x_i$, $\alpha_i > 0$: support vector.
{\color{subsubsectionColor}\subsubsection{Kernelized SVM}}
\begin{equation}
\max_{\alpha} \sum_{i=1}^{n} \alpha_i - \frac{1}{2} \sum_{i,j} \alpha_i \alpha_j y_i y_j k(x_i, x_j), \text{ s.t. } 0 \leq \alpha_i \leq C
\end{equation}
Classify: $y = sign(\sum_{i=1}^{n} \alpha_i y_i k(x_i, x))$
{\color{subsubsectionColor}\subsubsection{How to find $a^T$?}}
Augmented vector $a = (w_0, w)$ used with $\widetilde{x} = (1, x)$, so $a^\top \widetilde{x} = w^\top x + w_0$.\\
Gradient descent: $a(k+1) = a(k) - \eta(k) \nabla J(a(k))$\\
Newton's method: 2nd-order Taylor expansion gives $\eta_{opt} = H^{-1}$ with $H=\frac{\partial^2 J}{\partial a_i \partial a_j}$\\
$J$ is the cost function; a popular choice is the\\
perceptron cost $J_p (a) = \sum_{\widetilde{x} \text{ misclassified}} (-a^\top \widetilde{x})$
{\color{subsubsectionColor}\subsubsection{Bootstrapping}}
Draw $B$ bootstrap sets $Z^{*b}$ by sampling with replacement; compute the statistic $S$ on each set and average.
\begin{eqnarray}
\bar{S} = \frac{1}{B}\sum_{b=1}^{B} S(Z^{*b})\\
\sigma^2(S) = \frac{1}{B-1} \sum_{b=1}^{B} (S(Z^{*b}) - \bar{S})^2
\end{eqnarray}
Bootstrapping works if, for $n \rightarrow \infty$, the error between the empirical and bootstrap distributions behaves like the error between the true and empirical distributions.
Probability for a sample not to appear in a bootstrap set: $(1-\frac{1}{n})^n$, which goes to $\frac{1}{e}$ for $n \rightarrow \infty$.
Number of ways to choose $k$ items from $N$ with replacement (multiplicity): $\binom{N-1+k}{k}$. In bootstrapping $k=N$.
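Consequence (standard limit): since $(1-\frac{1}{n})^n \rightarrow e^{-1} \approx 0.368$, a bootstrap sample contains on average about $1 - e^{-1} \approx 63.2\%$ of the distinct original points.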
{\color{subsubsectionColor}\subsubsection{Jackknife}}
Method for reducing bias at the price of increased variance: \(\printlatex{\bar{\theta}_\mathrm{Jack} =\frac{1}{n} \sum_{i=1}^n (\bar{\theta}_i)}\) (leave out the $i$-th sample, estimate $\bar{\theta}_i$, and average).
{\color{subsectionColor}\subsection{Misc}}
\textbf{Lagrangian:} optimize $f(x,y)$ s.t.\ $g(x,y) = c$
\begin{equation}
\mathcal{L}(x, y, \gamma) = f(x,y) - \gamma ( g(x,y)-c)
\end{equation}
\textbf{Parametric learning}: model is parametrized with a finite set of parameters, like linear regression, linear SVM, etc. \\
\textbf{Nonparametric learning}: models grow in complexity with quantity of data: kernel SVM, k-NN, etc.\\
\textbf{Empirical variance}: look for dense and sparse regions; regularize so that sparse regions are not covered (decreases variance). Measured e.g.\ by the variance of classifiers across cross-validation folds.
{\color{subsectionColor}\subsection{Probabilistic Methods}}
{\color{subsubsectionColor}\subsubsection{MLE}}
Least Squares, Gaussian Noise
\begin{align}
L(w) &= -\log(P(y_1 ... y_n | x_1 ... x_n, w)) \\
&= \frac{n}{2} \log(2\pi\sigma^2) + \sum_{i=1}^{n} \frac{(y_i-w^\top x_i)^2}{2\sigma^2}
\end{align}
\begin{align}
\argmax_w P(y|x, w) &= \argmin_w L(w) \\
&= \argmin_w \sum_{i=1}^{n} (y_i-w^\top x_i)^2
\end{align}
\begin{eqnarray}
\widehat\sigma^2 = \frac{1}{n} \sum_{i=1}^{n} (x_{i} - \bar{x})^2,
E \left[ \widehat\sigma^2 \right]= \frac{n-1}{n}\sigma^2
\end{eqnarray}
{\color{subsubsectionColor}\subsubsection{MAP}}
Ridge regression, Gaussian prior on weights
\begin{align}
&\hat{w}_{MAP} = \argmax_w P(w) \prod_{i=1}^{n} P(y_i|x_i,w)\\
&=\argmin_w \frac{1}{2\sigma^2} \sum_{i=1}^{n} (y_i - w^\top x_i)^2 + \frac{1}{2 \beta^2}\sum_{j=1}^{d}w_j^2
\end{align}
$P(w)$ or $P(\theta)$ - conjugate prior (beta, Gaussian) (posterior same class as prior) \\
$P(y_i|\theta)$ - likelihood function (binomial, multinomial, Gaussian) \\
\textbf{Beta distribution}: $P(\theta) = Beta(\theta; \alpha_1, \alpha_2) \propto \theta^{\alpha_1 - 1}(1-\theta)^{\alpha_2-1}$
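\textbf{Example} (standard conjugate update): with prior $Beta(\theta;\alpha_1,\alpha_2)$ and a binomial likelihood with $h$ successes in $n$ trials, the posterior is again a Beta:
\[P(\theta|\text{data}) \propto \theta^{\alpha_1 + h - 1}(1-\theta)^{\alpha_2 + n - h - 1}, \text{ i.e. } Beta(\theta;\alpha_1 + h, \alpha_2 + n - h)\]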
{\color{subsubsectionColor}\subsubsection{Bayesian Decision Theory}}
\begin{equation}
a^* = \argmin_{a \in A} E_y[C(y,a)|x]
\end{equation}
{\color{subsectionColor}\subsection{Ensemble Methods}}
Use a combination of simple hypotheses (weak learners) to create one strong learner.\\
Weak learner: error is (at least slightly) below $0.5$, i.e.\ better than random guessing.\\
Strong learner: error can be driven below any given $\delta$ ($0 < \delta < 0.5$).
\begin{equation}
f(x) = \sum_{i=1}^{n} \beta_i h_i(x)
\end{equation}
\textbf{Bagging}: train weak learners on bootstrapped sets with equal weights. \\
\textbf{Boosting}: train on all data, but reweigh misclassified samples higher.
{\color{subsubsectionColor}\subsubsection{Decision Trees}}
\textbf{Stumps}: partition linearly along 1 axis
\begin{equation}
h(x) = sign(a x_i - t)
\end{equation}
\textbf{Decision Tree}: recursive tree of stumps, leaves have labels. To train, either label if leaf's data is pure enough, or split data based on score.
{\color{subsubsectionColor}\subsubsection{Ada Boost}}
Effectively minimize exponential loss.
\begin{equation}
f^*(x) = \argmin_{f\in F} \sum_{i=1}^{n} \exp(-y_i f(x_i))
\end{equation}
Train $m$ weak learners, greedily selecting each one
\begin{equation}
(\beta_b, h_b) = \argmin_{\beta,h} \sum_{i=1}^{n} \exp(-y_i (f_{b-1} (x_i) + \beta h(x_i)))
\end{equation}
\begin{eqnarray}
c_b(x) \text{ trained with weights } w_i^b \\
\epsilon_b = \sum\limits_{i=1}^n \frac{w_i^b}{\sum\limits_{i=1}^n w_i^b} I_{c_b(x_i) \neq y_i} \\
\alpha_b = \log \frac{1-\epsilon_b}{\epsilon_b} \\
w^{b+1}_i = w^b_i \cdot \exp(\alpha_b I_{y_i \neq c_b(x_i)})
\end{eqnarray}
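\textbf{Example} (illustrative numbers): if $\epsilon_b = 0.25$, then $\alpha_b = \log\frac{0.75}{0.25} = \log 3 \approx 1.1$; misclassified samples have their weights multiplied by $e^{\alpha_b} = 3$, so after renormalization they carry half of the total weight in the next round.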
Exponential loss function.\\
Additive logistic regression.\\
Bayesian approach (assumes posteriors).\\
Newton-like updates (gradient descent).\\
Samples misclassified by the previous classifier receive a higher weight in the next round.
{\color{subsectionColor}\subsection{Generative Methods}}
\textbf{Discriminative} - estimate $P(y|x)$ - conditional. \\
\textbf{Generative} - estimate $P(y, x)$ - joint, model data generation.
{\color{subsubsectionColor}\subsubsection{Naive Bayes}}
All features independent.
\begin{eqnarray}
P(y|x) = \frac{1}{Z} P(y) P(x|y), Z = \sum_{y} P(y) P(x|y) \\
y = \argmax_{y'} P(y'|x) = \argmax_{y'} \hat{P}(y') \prod_{i=1}^{d} \hat{P}(x_i|y')
\end{eqnarray}
\textbf{Discriminant Function}
\begin{equation}
f(x) = \log\left(\frac{P(y=1|x)}{P(y=-1|x)}\right), \quad y=sign(f(x))
\end{equation}
{\color{subsubsectionColor}\subsubsection{Fisher's Linear Discriminant Analysis (LDA)}}
Idea: project high dimensional data on one axis.
Complexity: $\mathcal{O}(d^2 n)$ with $d$ the number of dimensions.
\begin{eqnarray}
&& c=2, p=0.5, \hat{\Sigma}_- = \hat{\Sigma}_+ = \hat{\Sigma} \\
y &=& sign(w^\top x + w_0) \\
w &=& \hat{\Sigma}^{-1}(\hat{\mu}_+ - \hat{\mu}_-) \\
w_0 &=& \frac{1}{2}(\hat{\mu}_-^\top \Sigma^{-1} \hat{\mu}_- - \hat{\mu}_+^\top \Sigma^{-1} \hat{\mu}_+)
\end{eqnarray}
{\color{subsectionColor}\subsection{Neural Networks}}
\textbf{Backpropagation algorithm}
\begin{enumerate}[topsep=0pt,itemsep=0ex,partopsep=1ex,parsep=1ex,leftmargin=*]
\item Initialize the weights $W^m_{ij}$ with small random values.
\item Choose a pattern $x^\mu$ and feed it to the input layer ($m = 0$),
i.e., $V_k^0 = x_k^{\mu}$, $\forall k$.
\item Propagate the activity through the network where
\[V^m_i = S \left(\sum\limits_j W^m_{ij}V^{m-1}_j\right)\]
holds for each i and each layer m.
\item Calculate the weight corrections for the output layer
\[\delta^M_i = S\sp{\prime}(h^M_i)[y^\mu_i-V^M_i]\]
by comparing the actual output $V_i^M$ with the desired output \(y^\mu_i\) for pattern $\mu$.
\item Calculate the weight corrections for the previous layers by sending back the error signal
\[\delta^{m-1}_i = S\sp{\prime}(h_i^{m-1})\sum\limits_jW^m_{ji}\delta_j^m\]
for \(m = M, M - 1, \ldots , 2\), until a weight correction has been
calculated for each unit.
\item Use \(\Delta W_{ij} = \eta\delta_i^m V_j^{m-1}\) to adapt all weights according to
\(W^{new}_{ij} = W^{old}_{ij} + \Delta W_{ij}\)
\item Go to step 2 and repeat the loop for the next pattern.
\end{enumerate}
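A minimal NumPy sketch of steps 2--6 for a single hidden layer with sigmoid activation (array shapes and names are my own, not from the lecture):
\begin{verbatim}
import numpy as np

def sigmoid(h):
    return 1.0 / (1.0 + np.exp(-h))

def backprop_step(x, y, W1, W2, eta=0.1):
    # forward pass: V^1 = S(W^1 x), V^2 = S(W^2 V^1)
    v1 = sigmoid(W1 @ x)
    v2 = sigmoid(W2 @ v1)
    # output delta: S'(h) = v*(1-v) for the sigmoid
    d2 = v2 * (1 - v2) * (y - v2)
    # backpropagated hidden delta
    d1 = v1 * (1 - v1) * (W2.T @ d2)
    # weight updates Delta W_ij = eta * delta_i * V_j
    W2 += eta * np.outer(d2, v1)
    W1 += eta * np.outer(d1, x)
    return W1, W2
\end{verbatim}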
{\color{subsectionColor}\subsection{Unsupervised Learning}}
{\color{subsubsectionColor}\subsubsection{Parzen}}
\begin{equation}
\hat{p}_n = \frac{1}{n} \sum\limits_{i=1}^n \frac{1}{V_n} \phi(\frac{x-x_i}{h_n})
\end{equation}
where $\int \phi(x)dx = 1$
{\color{subsubsectionColor}\subsubsection{K-NN}}
\begin{equation}
\hat{p}_n(x) = \frac{k}{n\,V_k}, \quad V_k \text{ the volume containing the } k \text{ nearest neighbours of } x
\end{equation}
{\color{subsubsectionColor}\subsubsection{K-means}}
(clustering = classification)
\begin{equation}
L(\mu) = \sum_{i=1}^{n} \min_{j\in\{1...k\}} \|x_i - \mu_j \|_2^2
\end{equation}
\textbf{Lloyd's Heuristic}: (1) assign each $x_i$ to closest cluster \\
(2) recalculate means of clusters.
Iterate the two steps until the assignments are stable:
\begin{eqnarray}
\text{Step 1: }& c_i = \text{argmin}_c ||x_i-\mu_c||^2 \\
\text{Step 2: }& \mu_c = \frac{1}{n_c} \sum\limits_{i:\, c_i = c} x_i
\end{eqnarray}
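A minimal NumPy sketch of Lloyd's heuristic (initialization and names are my own choice):
\begin{verbatim}
import numpy as np

def kmeans(X, k, iters=100, seed=0):
    rng = np.random.default_rng(seed)
    # initialize centres with k random data points
    mu = X[rng.choice(len(X), k, replace=False)]
    for _ in range(iters):
        # step 1: assign each point to closest centre
        d = ((X[:, None, :] - mu[None, :, :]) ** 2).sum(-1)
        c = d.argmin(axis=1)
        # step 2: recompute centres as cluster means
        new_mu = np.array([X[c == j].mean(axis=0)
                           if np.any(c == j) else mu[j]
                           for j in range(k)])
        if np.allclose(new_mu, mu):
            break
        mu = new_mu
    return mu, c
\end{verbatim}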
{\color{subsubsectionColor}\subsubsection{Gaussian Mixture Modeling}}
Same as Bayes, but class label $z$ unobserved.
\begin{equation}
(\mu^*, \Sigma^*, w^*) = \argmin_{\mu,\Sigma,w} -\sum_i \log \sum_{j=1}^{k} w_j \mathcal{N}(x_i|\mu_j,\Sigma_j)
\end{equation}
{\color{subsubsectionColor}\subsubsection{EM Algorithm}}
Problem: sum within log-term of likelihood.
\textbf{E-step}: expectation: pick clusters for points.
Calculate $\gamma_j^{(t)}(x_i) = \frac{P(c_j|\theta^{(t)})\, P(x_i|c_j,\theta^{(t)})}{\sum_k P(c_k|\theta^{(t)})\, P(x_i|c_k,\theta^{(t)})}$ for each $i$ and $j$\\
\textbf{M-Step}: maximum likelihood: adjust clusters to best fit points.\\
\begin{eqnarray}
\text{prior}_j^{(t)} = \pi^{(t)}_j &\leftarrow& \frac{1}{n}\sum_{i=1}^n \gamma_j^{(t)}(x_i) \\
\mu_j^{(t)} &\leftarrow& \frac{\sum_{i=1}^n \gamma_j^{(t)}(x_i)\, x_i}{\sum_{i=1}^n \gamma_j^{(t)}(x_i)} \\
\Sigma^{(t)}_j &\leftarrow& \frac{\sum_{i=1}^n \gamma_j^{(t)}(x_i)(x_i-\mu_j^{(t)})(x_i-\mu_j^{(t)})^\top}{\sum_{i=1}^n \gamma_j^{(t)}(x_i)}
\end{eqnarray}
{\color{subsubsectionColor}\subsubsection{EM for non-Gaussian mixtures}}
\textbf{Problem setting}
Derive the EM algorithm for a mixture model of the form
\(p(x) = \sum\limits^k_{j=1}\pi_j Pr_{\text{non-Gaussian}}(x\,|\,u_j)\)
\textbf{Solution}
\begin{enumerate}[topsep=0pt,itemsep=0ex,partopsep=1ex,parsep=1ex,leftmargin=*]
\item Write log likelihood function:
\[L(X,\{u_j\}) = \sum\limits^n_{i=1} \log \left( p(x_i)\right)\]
where \(u_j\) are the parameters of \(Pr_{\text{non-Gaussian}}\).
\item Write \(\gamma_j(x_i) = p(z_i = j | x_i)\) (prob. that \(x_i\) was generated from the \(j^{th}\) dist. of the mixture).
\begin{align*}
\gamma_j(x_i) &= p(z_i = j | x_i) = \frac{p(x_i|z_i = j) p(z_i = j)}{p(x_i)}\\
&= \frac{p(x_i|z_i = j)\, p(z_i = j)}{\sum\limits_{l}p(x_i|z_i = l)\, p(z_i = l)}
= \frac{\pi_j Pr_{\text{non-Gaussian}}(x_i|u_j)}{\sum\limits_{l}\pi_l Pr_{\text{non-Gaussian}}(x_i|u_l)}
\end{align*}
\item \textbf{To get the optimal} \(\mathbf{u_j}\): differentiate \(L(X,\{u_j\})\) with respect to each parameter \(u_j\).
You get:
\[\nabla_{u_j} L(X,\{u_j\}) = \sum\limits^n_{i=1}\frac{\pi_j}{p(x_i)} \cdot \nabla_{u_j} Pr_{\text{non-Gaussian}}(x_i|u_j)\]
(this can be rewritten in terms of \(\gamma_j(x_i)\), leaving a factor \(\nabla_{u_j}\log Pr_{\text{non-Gaussian}}(x_i|u_j)\)). Set the gradient to zero and solve for \(u_j\).
\item \textbf{Estimate the \(\pi\) parameters} by maximizing the log-likelihood under the constraint \(\sum\limits^k_{j=1}\pi_j = 1\) with a Lagrange multiplier \(\lambda\); solve for \(\lambda\), substitute back, and obtain the formula for \(\pi_j\) (worked derivation below).
\end{enumerate}
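\textbf{Worked step 4} (standard result, included for reference): setting
\[\frac{\partial}{\partial \pi_j}\left[\sum\limits_{i=1}^n \log p(x_i) - \lambda\left(\sum\limits_{l=1}^k \pi_l - 1\right)\right] = \sum\limits_{i=1}^n \frac{\gamma_j(x_i)}{\pi_j} - \lambda = 0\]
gives $\pi_j = \frac{1}{\lambda}\sum_i \gamma_j(x_i)$; summing over $j$ and using $\sum_j \pi_j = 1$ and $\sum_j \gamma_j(x_i) = 1$ yields $\lambda = n$, hence $\pi_j = \frac{1}{n}\sum_i \gamma_j(x_i)$.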
{\color{subsectionColor}\subsection{Hidden-Markov model}}
State only depends on previous state.
Always given: sequence of symbols $\vec{s} = \{s_1,s_2, \ldots s_n\}$
{\color{subsubsectionColor}\subsubsection{Evaluation (Forward \& Backward)}}
Known: $a_{ij}, e_k(s_t)$\\
Wanted: $P(\vec{s})$ and the posteriors $P(X_t = x_k \mid \vec{s})$
\begin{eqnarray}
f_l (s_{t+1}) = e_l(s_{t+1}) \sum_k f_k(s_t) a_{kl} \\
b_k(s_t) = \sum_l a_{kl}\, e_l(s_{t+1})\, b_l(s_{t+1}) \\
P(\vec{s}) = \sum_k f_k(s_n)\, a_{k,\text{end}} \\
P(X_t = x_l \mid \vec{s}) = \frac{f_l(s_t) b_l(s_t)}{P(\vec{s})}
\end{eqnarray}
Complexity in time: $\mathcal{O}(|S|^2 \cdot T)$
{\color{subsubsectionColor}\subsubsection{Decoding (Viterbi)}}
Known: $a_{ij}, e_k(s_t)$\\
Wanted: most likely path $\vec{x} = \{x_1,x_2,\ldots x_n\}$
\begin{eqnarray}
v_l(s_{t+1}) = e_l(s_{t+1})\, \text{max}_k \{ v_k(s_t) a_{kl} \} \\
\text{ptr}_t(l) = \text{argmax}_k \{ v_k(s_t) a_{kl} \}
\end{eqnarray}
Follow pointers back from end to beginning.
Dynamic programming: the problem splits into sub-problems (one per time step).
Time: $\mathcal{O}(|S|^2 \cdot T)$
Space $\mathcal{O}(|S| \cdot T)$
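A minimal NumPy sketch of Viterbi decoding in log space (array layout, argument names, and the initial distribution pi are assumptions, not from the lecture):
\begin{verbatim}
import numpy as np

def viterbi(obs, a, e, pi):
    # obs: symbol indices; a[k,l]: transition;
    # e[k,b]: emission; pi[k]: initial distribution
    T, S = len(obs), a.shape[0]
    la, le, lp = np.log(a), np.log(e), np.log(pi)
    v = np.full((T, S), -np.inf)
    ptr = np.zeros((T, S), dtype=int)
    v[0] = lp + le[:, obs[0]]
    for t in range(1, T):
        for l in range(S):
            s = v[t - 1] + la[:, l]
            ptr[t, l] = s.argmax()
            v[t, l] = le[l, obs[t]] + s.max()
    # follow pointers back from the best final state
    path = [int(v[-1].argmax())]
    for t in range(T - 1, 0, -1):
        path.append(int(ptr[t, path[-1]]))
    return path[::-1]
\end{verbatim}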
{\color{subsubsectionColor}\subsubsection{Learning (Baum-Welch)}}
Known: only the observed sequence(s) and the model space $\Theta$\\
Wanted: $a_{ij}, e_k(s_t)$ \& most likely path $\vec{x} = \{x_1,x_2,\ldots x_n\}$
\textbf{E-step I:} $f_k(s_t), b_k(s_t)$ by forward \& backward algorithm\\
\textbf{E-step II:}
\begin{eqnarray}
P(X_t = x_k, X_{t+1} = x_l | \vec{s}, \Theta) = \\
\frac{1}{P(\vec{s})} f_k(s_t) a_{kl} e_l(s_{t+1}) b_l(s_{t+1}) \\
A_{kl} = \sum\limits_{j=1}^m \sum\limits_{t=1}^n P(X_t = x_k, X_{t+1} = x_l | \vec{s}, \Theta)
\end{eqnarray}
\textbf{M-step:}
\begin{eqnarray}
a_{kl} = \frac{A_{kl}}{\sum\limits_{l'} A_{kl'}} \text{ and } e_k(b) = \frac{E_k(b)}{\sum_{b'} E_k(b')}
\end{eqnarray}
where $E_k(b) = \sum\limits_{j=1}^m \sum\limits_{t:\, s_t = b} P(X_t = x_k \mid \vec{s}, \Theta)$ is the expected number of emissions of $b$ from state $k$.
Complexity: $\mathcal{O}(|S|^2)$ in storage (space)
\end{multicols}
\end{landscape}
\end{document}