% MultivariateAnalysisPart2Summary.Rnw

\documentclass[12pt]{article}
%\usepackage[landscape]{geometry}  
\usepackage[landscape,hmargin=2cm,vmargin=1.5cm,headsep=0cm]{geometry} 
% See geometry.pdf to learn the layout options. There are lots.
\geometry{a4paper}                   % ... or a4paper or a5paper or ... 
%\geometry{landscape}                % Activate for for rotated page geometry
%\usepackage[parfill]{parskip}    % Activate to begin paragraphs with an empty line rather than an indent
\usepackage{hyperref}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{epstopdf}

\usepackage{framed}

\usepackage{multicol}

\usepackage{graphicx}

 \usepackage{float}
 \setkeys{Gin}{width=0.25\textwidth}

\usepackage[table]{xcolor}

% Short-hands: \x is the multiplication sign, \y shades a table cell light green.
\newcommand*{\x}{\times}
\newcommand*{\y}{\cellcolor{green!10}}

% Partial derivative: \pder[f]{x}; the optional numerator argument defaults to empty.
\newcommand*{\pder}[2][]{\frac{\partial#1}{\partial#2}}

% arg min / arg max, with a negative thin space pulling the two words together.
\newcommand*{\argmin}{\arg\!\min}
\newcommand*{\argmax}{\arg\!\max}


% Theorem-like environments; each one is declared with its own counter.
\newtheorem{definition}{Definition}

\newtheorem{theorem}{Theorem}

\newtheorem{fact}{Fact}

\newtheorem{proposition}{Proposition}


% Turn off header and footer
\pagestyle{plain}
 

% Redefine section commands to use less space
\makeatletter
% Each heading is rebuilt with
%   \@startsection{name}{level}{indent}{beforeskip}{afterskip}{style}
% using small rubber skips so that headings take little vertical space.
\renewcommand{\section}{\@startsection{section}{1}{0mm}%
                                {-1ex plus -.5ex minus -.2ex}%
                                {0.5ex plus .2ex}%
                                {\normalfont\large\bfseries}}
\renewcommand{\subsection}{\@startsection{subsection}{2}{0mm}%
                                {-1ex plus -.5ex minus -.2ex}%
                                {0.5ex plus .2ex}%
                                {\normalfont\normalsize\bfseries}}
\renewcommand{\subsubsection}{\@startsection{subsubsection}{3}{0mm}%
                                {-1ex plus -.5ex minus -.2ex}%
                                {1ex plus .2ex}%
                                {\normalfont\small\bfseries}}
\makeatother

% Define the \BibTeX logo macro: a roman "B", small-caps "ib", then "TeX" with
% the "E" lowered by .7ex; the negative \kern amounts tighten the letter spacing.
\def\BibTeX{{\rm B\kern-.05em{\sc i\kern-.025em b}\kern-.08em
    T\kern-.1667em\lower.7ex\hbox{E}\kern-.125emX}}

% Don't print section numbers
%\setcounter{secnumdepth}{0}

% Abbreviations for two frequently used Greek letters.
\newcommand*{\eps}{\epsilon}
\newcommand*{\al}{\alpha}

\setlength{\parindent}{0pt}
\setlength{\parskip}{0pt plus 0.5ex}


\usepackage{Sweave}
\DeclareGraphicsRule{.tif}{png}{.png}{`convert #1 `dirname #1`/`basename #1 .tif`.png}

\usepackage{cancel}

%% taken from http://brunoj.wordpress.com/2009/10/08/latex-the-framed-minipage/
%% fmpage: a framed minipage; the single argument #1 is the minipage width.
%% The body is first collected into the save box \fmbox (via lrbox) and is
%% only typeset, inside \fbox, when the environment ends.
\newsavebox{\fmbox}
\newenvironment{fmpage}[1]
{\begin{lrbox}{\fmbox}\begin{minipage}{#1}}
{\end{minipage}\end{lrbox}\fbox{\usebox{\fmbox}}}

\usepackage{mathtools}
\makeatletter
 
% \explain{expr}{note}: typesets expr with an up-arrow underneath it and note below the arrow.
% \mathclap (mathtools) wraps the annotation so that its width is ignored in the layout.
\newcommand{\explain}[2]{\underset{\mathclap{\overset{\uparrow}{#2}}}{#1}}
% \explainup{expr}{note}: the mirror image -- note above expr, with a down-arrow pointing at expr.
\newcommand{\explainup}[2]{\overset{\mathclap{\underset{\downarrow}{#2}}}{#1}}
 
\makeatother

\SweaveOpts{prefix.string=MatAlgfigs/MatAlgfig}

\SweaveOpts{cache=TRUE}

\title{Multivariate Analysis Summary Sheet (Part 2)}
\author{Shravan Vasishth (vasishth@uni-potsdam.de)}
\date{\today}                                           % Activate to display a given date or no date

% \myfigure{content}: places content (graphic, caption, label) centred inside a
% minipage that is one column wide, with a \medskip before and after it.
% The trailing % signs keep line ends from adding spaces to the macro body.
\newcommand\myfigure[1]{%
\medskip\noindent\begin{minipage}{\columnwidth}
\centering%
#1%
%figure,caption, and label go here
\end{minipage}\medskip}


\begin{document}

\SweaveOpts{concordance=TRUE}
\footnotesize
\maketitle
\tableofcontents

\newpage

\begin{multicols}{2}

\section{Preliminaries}

\subsection{Arithmetic series}

General form: 

\begin{equation}
a+(a+d)+(a+2d)+\dots
\end{equation}

$k$-th partial sum for \textbf{arithmetic series}:

\begin{equation}
S_k = \underset{n=1}{\overset{k}{\sum}} (a+(n-1)d)
\end{equation}

The sum can be found by:
\begin{equation}
S_k = \frac{k}{2} (2a+(k-1)d)
\end{equation}

\subsection{Geometric series}

General form:

\begin{equation}
a+ar+ar^2\dots
\end{equation}

In summation notation:

\begin{equation}
\underset{n=1}{\overset{\infty}{\sum}} ar^{n-1}
\end{equation}

$k$-th partial sum:

\begin{equation}
S_k=\frac{a(1-r^k)}{1-r}
\end{equation}

$S_\infty$ exists just in case $\mid r \mid < 1$.

\begin{equation}
S_\infty = \frac{a}{1-r}
\end{equation}

\subsection{Some basic results for computing covariance}

\begin{enumerate}
\item
Let $\bar{x}_i$ be vectors:

\begin{equation}
Var(\bar{x}_1 - \bar{x}_2) = var(\bar{x}_1) + var(\bar{x}_2) - 2 Cov(\bar{x}_1,\bar{x}_2)
\end{equation}

Note that $Cov(\bar{x}_1,\bar{x}_2)$ would be the matrix

$$
\begin{pmatrix}
Cov(x_1) & 0 \\
0 & Cov(x_2)\\
\end{pmatrix}
$$

where $Cov(x_1)$ is the covariance between the \textit{components} of $x_1$.

\item The sum of two RVs:

\begin{equation}
Var(X_1 + X_2) = var(X_1) + var(X_2) + 2 Cov(X_1,X_2)
\end{equation}

More generally:

\begin{equation}
Var(aX_1 + bX_2) = a^2 var(X_1) + b^2 var(X_2) + 2ab Cov(X_1,X_2)
\end{equation}

This generalizes to arbitrary numbers of terms:

\begin{equation}
Var(aX_1 + bX_2+cX_3) = a^2 var(X_1) + b^2 var(X_2) + c^2 var(X_3) + 2ab Cov(X_1,X_2) +
2ac Cov(X_1,X_3) + 2bc Cov(X_2,X_3)
\end{equation}

\item Covariance of X and Y:

\begin{equation}
Cov(X,Y)=E[XY]-E[X]E[Y]
\end{equation}

\begin{equation}
Cov(aX+b,Y)= aCov(X,Y)
\end{equation}

\begin{equation}
Cov(aX+bY+cZ,W)= Cov(aX,W)+Cov(bY,W)+Cov(cZ,W)
\end{equation}

\end{enumerate}


\subsection{Completing the square: complex numbers}

Completing the square on $ax^2+bx+c=0$ gives the roots: $x=\frac{-b \pm \sqrt{b^2 - 4ac}}{2a}$

Often, we have imaginary numbers involved. Recall that $i = \sqrt{-1}$, so $i^2=-1$. So we will often have roots with a real and an imaginary part:
$4 + \sqrt{-25}= 4 + 5\sqrt{-1}=4 + 5i$. In the TS context, we need to know if the roots lie outside the unit circle; with imaginary components, we can determine this by simply computing the length of the vector:

$\sqrt{4^2 + 5^2}=6.4$.


\section{Time Series}         

\subsection{Smoothing: Moving averages (MAs)} \label{movavg}

At each time point t, take the average of the observations around t, and plot 

\begin{equation}
\frac{1}{2q+1} \sum_{i=-q}^q y_{t+i}\hbox{ against } t
\end{equation}

where $y_t$ is the observation at time $t$.

\textbf{The span} of the MA is the number of values averaged over.

\subsubsection{Linear filter: More general instance of MA (Weighting)}

Taking a moving average amounts to weighting each $y_t$ by 1/span:

\begin{equation}
x_t = \sum_i a_i y_{t+i} \hbox{ against } t, \quad \sum_i a_i = 1
\end{equation}

The moving average is the special case where 

\begin{equation}
a_i= 
\begin{cases}
\frac{1}{2q+1} & \hbox{ if } \mid i \mid \leq q \\
0 & \hbox{otherwise}\\
\end{cases}
\end{equation}

A plausible weight would be $a=1/12$ for monthly measurements, $1/4$ for quarters.

Suppose we take a mean at $x=3$: $X_3'=\frac{y_1+y_2+y_3+y_4}{4}$, and then another one $X_3''=\frac{y_2+y_3+y_4+y_5}{4}$.
We could take a mean of these two means:

\begin{equation}
\begin{split}
\frac{1}{2}(X_3'+X_3'') =& \frac{1}{2}(\frac{y_1+y_2+y_3+y_4}{4}+
\frac{y_2+y_3+y_4+y_5}{4})\\
=& \frac{y_1}{8} + \frac{y_2+y_3+y_4}{4}+\frac{y_5}{8}
\end{split}
\end{equation}

The general formula is:

\begin{equation}
x_t = \frac{y_{t-2}}{8} 
+ 
\frac{y_{t-1} + y_t + y_{t+1}}{4}
+
\frac{y_{t+2}}{8}
\end{equation}

to-do: Add examples from problems

\paragraph{Detrending using moving averages}

<<>>=
library(astsa)
library(MASS)
data(jj)
## symmetric five-term weights with half weight on the two end points,
## rescaled to sum to one (the outer parentheses print the result):
k <- c(0.5, 1, 1, 1, 0.5)
(k <- k / sum(k))
## the same weights displayed as fractions:
fractions(k)
## two-sided (centred) moving average of the series:
fjj <- filter(jj, filter = k, sides = 2)
## the series, with its moving average overlaid in red:
plot(jj)
lines(fjj, col = "red")
@
  
<<>>=  
## subtract the moving-average trend from the series and plot what is left:
plot(jj - fjj)
@

%\subsubsection{Linear and polynomial estimates of trend}

\subsection{The classical decomposition}

\begin{equation}
\begin{split}
y_t =& m_t + s_t + r_t\\
=& \hbox{trend} + \hbox{seasonal/cyclic} + \hbox{residual/random}
\end{split}
\end{equation}

The trend is captured by $m_t$. We can \textbf{detrend} the 
time series. $m_t$ is estimated by $\hat m_t$.

\begin{equation}
d_t = y_t-\hat m_t \approx s_t + r_t 
\end{equation}

This is the detrending example shown above with R.

Take logs for multiplicative relationships to get back into linear space:

\begin{equation}
y_t = m_t \times s_t \times r_t
\end{equation}

The decomposition can be done in R as follows:

<<>>=
#str(decompose(jj))
@

Decomposition is useful because:
\begin{enumerate}
\item You can isolate specific components
\item Allows seasonal adjustment (done for monthly economic series and unemployment figures)

\begin{equation}
 y_t-\hat s_t \approx m_t + r_t 
\end{equation}
\item 
Poor model performance in one component (trend, 
seasonality) can be rectified in isolation.
\end{enumerate}

\subsection{Differencing}

\subsubsection{First-order differencing}

This provides a simple method of removing trend, without explicitly estimating it.

\begin{equation}
\nabla y_t = y_t - y_{t-1}
\end{equation}

Define an operator $B$, called a 
\textbf{backward shift operator}: 

\begin{framed}
\begin{equation}
B y_t = y_{t-1}
\end{equation}
\end{framed}

This allows us to rewrite 

\begin{equation}
\begin{split}
\nabla y_t =& y_t - y_{t-1}\\
=& y_t  - B y_t\\
=& (1- B) y_t\\
\end{split}
\end{equation}

So what we have is \textbf{first-order differencing}, which removes linear trend:

\begin{equation}
\nabla y_t = (1- B) y_t
\end{equation}

<<>>=
## removes linear trend:
#plot(diff(jj))
@

\subsubsection{Second-order differencing}

Note that $By_t = y_{t-1}$, and $B^2 y_t = y_{t-2}$.
The power on B is telling you the lag.

\begin{equation}
\begin{split}
\nabla^2 y_t =& (y_t - y_{t-1}) - (y_{t-1} - y_{t-2})\\
=& y_t - 2y_{t-1} + y_{t-2}\\
=&  (1-B)^2 y_t\\
\end{split}
\end{equation}

The last line holds because 

\begin{equation}
\begin{split}
(1-B)^2 y_t =& (1 - 2B + B^2) y_t\\
=& y_t - 2B y_t + B^2 y_t\\
=& y_t - y_{t-1} - y_{t-1} + y_{t-2}
\end{split}
\end{equation}


Second-order differencing removes quadratic (second-degree polynomial) trend.

\subsubsection{k-th order differencing}

%\begin{framed}
\begin{equation}
\nabla^k y_t = (1-B)^k y_t 
\end{equation}
%\end{framed}

\noindent
removes k-th degree polynomial trend. Usually, k=1 or 
k=2 is enough. Not really practical with k=2 or 3.

``First-, second-, \dots, k-th order differencing are 
sometimes called \textbf{ordinary differencing}''.

\subsubsection{Seasonal differencing}

If there is \textbf{seasonal variation} with known 
cyclicity k, the 
seasonal component can be removed by differencing with lag k (not to be confused with k-th order differencing; note the subscript on $\nabla$):

\begin{equation}
y_t - y_{t-12}=\nabla_{12} y_t = (1-B^{12}) y_t
\end{equation}

For k=12, we say \textbf{this is differencing with lag 12}, or seasonal differencing if monthly series are involved. 

\paragraph{Exercise (Task 3)}: Show $\nabla \nabla_{12} =  \nabla_{12}\nabla$, and interpret.

This first removes linear trend, then seasonal trend.

\begin{equation}
\begin{split}
\nabla \nabla_{12} y_t =&  \nabla (y_t - y_{t-12})\\
=& \nabla y_t -\nabla y_{t-12}\\
=& (y_t- y_{t-1}) - (y_{t-12}- y_{t-12-1}) \\
=& (y_t- y_{t-12}) - (y_{t-1}- y_{t-1-12})\\
=& \nabla_{12} y_t - \nabla_{12} y_{t-1} \\
=& \nabla_{12} (y_t - y_{t-1})\\
=& \nabla_{12} \nabla y_t \\
 =& \nabla_{12}(1-B)y_t=(1-B)\nabla_{12}y_t\\ 
 =& (1-B)(1-B^{12})y_t\\
\end{split}
\end{equation}

\subsubsection{Summary}

So, there is k-th order differencing and differencing with lag:

\begin{enumerate}
\item
k-th order diff: $\nabla^{k} y_t=(1-B)^k y_t$ 
\item
diff with lag k: $\nabla_{k} y_t=(1-B^k) y_t = y_t - y_{t-k}$
\end{enumerate}

to-do: Need to improve code examples below:

<<echo=FALSE>>=
## Directory that holds the data files read by this document.
## The default is the original hard-coded location; set the environment
## variable MAS6011_DATA_DIR to use a different directory without editing
## this file.
dir <- Sys.getenv("MAS6011_DATA_DIR",
                  unset = "/Users/shravanvasishth/Dropbox/MScStatistics/2014-2015/MAS6011/Semester2/Data/")
## File names are appended to dir with paste(..., sep=""), so make sure the
## directory ends in exactly one "/" (the default already does).
dir <- sub("/*$", "/", dir)

## read the whisky data and turn it into a monthly (frequency 12) time series
whisk <- read.table(paste(dir, "Whisk.txt", sep = ""))
whisk_ts <- ts(whisk, start = c(1980), end = 1987, frequency = 12)
@

<<fig=FALSE>>=
## plot the whisky series; the y-axis label is written on a single line so
## that no line break and indentation end up inside the label text
ts.plot(whisk_ts,
        ylab = "hectoliters of whiskey per month",
        xlab = "month")
@

Another example:
<<>>=
## read the values as one numeric vector from the data directory
sheftempq <- scan(paste0(dir, "Sheftemq.txt"))
## quarterly (frequency 4) time series
shef_ts <- ts(sheftempq, start = c(1963), end = 1978, frequency = 4)
ts.plot(shef_ts, xlab = "quarter", ylab = "temp.")
@

The decompose function:

<<>>=
## classical decomposition; the seasonal, trend and random parts are plotted below
whisk_ts_decomp <- decompose(x = whisk_ts)
## uncomment to inspect the structure of the result:
# str(whisk_ts_decomp)
@

<<fig=FALSE>>=
## three square panels side by side; par() returns the settings it replaces
op <- par(mfrow = c(1, 3), pty = "s")
plot(whisk_ts_decomp$seasonal)
plot(whisk_ts_decomp$trend)
plot(whisk_ts_decomp$random)
## put the previous settings back so that later plots on the same device
## are not drawn into this 1x3 grid
par(op)
@

<<>>=
## differencing with lag 12: y_t - y_{t-12}
diffts <- diff(x = whisk_ts, lag = 12)
plot(diffts)
@

\subsection{Auto-correlation function (ACF)}

ACF is not really a function, it's a sequence. It's a tool for 
detecting patterns in data. For each lag h, we compute the standard correlation
coefficient. 


\paragraph{Classical correlation} Suppose we have random variables
X, Y. Then, correlation is:

\begin{equation}
r = \frac{\sum (X_i - \bar{X})(Y_i - \bar{Y})}{
\sqrt{\sum (X_i - \bar{X})^2 \sum (Y_i - \bar{Y})^2} }
\end{equation}

Assuming that $y_t$ is stationary (h is the lag of the ACF), the correlation for each lag h is computed as follows:

\begin{equation}
r_h = \frac{\sum_{t=1}^{n-h} (Y_{t} - \bar{Y})(Y_{t+h} - \bar{Y})}{
\sqrt{\sum_{t=1}^{n} (Y_t - \bar{Y})^2 \sum_{t=1}^{n-h} (Y_{t+h} - 
\bar{Y})^2}}
\end{equation}

<<fig=FALSE>>=
## first difference of the logged series
dljj <- diff(x = log(jj))
plot(dljj)
## correlation structure: scatterplots of
## dljj(t-lag) vs dljj(t), for lags 1 to 9
lag.plot(dljj, lags = 9, do.lines = FALSE)
## the corresponding display from astsa:
lag1.plot(dljj, 9)
## sample ACF (correlogram):
acf(x = dljj)
@

The \textbf{sample ACF} is: $h \mapsto r_h$, and the plot is called a
\textbf{correlogram}. The plot is only made for $h\geq 1$. The correlation at h=0 is by definition 1.  

<<fig=FALSE>>=
## sample ACF (correlogram) of the whisky series
acf(whisk_ts)
@

\section{Probability models for stationary and non-stationary time series}

General strategy:

\begin{enumerate}
\item 
Plot series, note features
\item
Remove trend and seasonal components
\item
Choose a model to fit the residuals
\item
To forecast, forecast the residuals, then invert transformations (put back trend and seasonal components) to get forecast of the original series.
\end{enumerate}

The convention usually is $Y_1,\dots, Y_n$ are random variables; and $y_1,\dots, y_n$ are realizations of their values.  In these notes, $y_i$ can refer to both the RV and the realization, depending on context. E.g., when we say $E[y_t]$, this refers to the RV, and $y_t=6$ refers to a realization.

\subsection{Strong stationarity}

This occurs when the \textbf{joint} distribution of $y_{t_1},\dots,y_{t_k}$ is the same 
as that of $y_{t_1+s},\dots,y_{t_k+s}$, for any $s, k, t_1, \dots, t_k$.

The probability properties of the sequence don't change over time. 
$y_t$ has the same distribution as $y_{t+s}$.

Consequences:

\begin{enumerate}
\item
$\mu_t = E[y_t] = E[y_{t+s}] = \mu_{t+s}=\mu$
\item
$\sigma_t^2 = Var(y_t) = Var(y_{t+s}) = \sigma_{t+s}^2=\sigma^2$
\item
Covariance function (function only of $h$): $\gamma_h = Cov(y_t, y_{t+h}) = Cov(y_{t+s}, y_{t+h+s})$. $\gamma_h$ is the covariance function at lag $h$.

Note that knowing $\gamma_h$ for all $h$ tells us what the variance $\sigma^2$ is too, since $\gamma_0=\sigma^2$.
\end{enumerate}

\subsection{Weak or second-order stationarity}

The means, variances, and covariances are finite, and the following hold:

\begin{enumerate}
\item 
$E[y_t] = \mu$
\item 
$Var(y_t) = \sigma^2$
\item
$Cov(y_t,y_{t+h}) = \gamma_h$
\end{enumerate}

whatever the value of t.

Whenever we say stationarity, we mean weak stationarity.


\subsubsection{Practical implication of weak stationarity}

Stationarity helps us compute $E[y_t]$ and $Var(y_t)$. This also illustrates how to compute expectation and variance in a fairly complicated setting.

If you have a model like $y_t = 19 - \frac{1}{3}B y_t -  \frac{1}{4}B^2 y_t + \epsilon_t-\frac{1}{2}\epsilon_{t-1}$, if you can prove causality, you have stationarity, and therefore you can compute $E[y_t]$ by solving for $\mu$ in 

$E[y_t]=\mu = 19 - \frac{1}{3}\mu -  \frac{1}{4}\mu$

Similarly, if $y_t$ is proved to be stationary, its variance is 
$Var(y_t) = V = Var(19 - \frac{1}{3}y_{t-1} - \frac{1}{4}y_{t-2} + \epsilon_t - \frac{1}{2}\epsilon_{t-1})$.

This can be solved using the formula: 

$Var(aX + bY + cZ) = a^2 Var(X) + b^2 Var(Y) + c^2 Var(Z) + 2abCov(X,Y)+2acCov(X,Z) + 2bc Cov(Y,Z)$.

Notice that almost all covariances will be zero, except $Cov(y_{t-1},\epsilon_{t-1})=Cov(y_{t},\epsilon_{t})$ (the two are equal due to stationarity). Since $\epsilon_{t}$ enters $y_{t}$ with coefficient 1 and is uncorrelated with every other term of $y_{t}$, 
$Cov(y_{t},\epsilon_{t})=Var(\epsilon_{t})$.

Another covariance that is non-zero is $Cov(y_t,y_{t-1})$. Just expand $y_t$, and multiply each term in the expansion with $y_{t-1}$:

Since
$y_t = 19 - \frac{1}{3}B y_t -  \frac{1}{4}B^2 y_t + \epsilon_t-\frac{1}{2}\epsilon_{t-1}$

we get

$Cov(19, y_{t-1}) + Cov(-\frac{1}{3}B y_t, y_{t-1}) + Cov(-  \frac{1}{4}B^2 y_t, y_{t-1}) + Cov(\epsilon_t, y_{t-1})+
Cov(-\frac{1}{2}\epsilon_{t-1}, y_{t-1})$. 

The second term,  $Cov(-\frac{1}{3}B y_t, y_{t-1})$ requires another expansion of $y_t$.

\underline{Important insight with stationary processes}

Note that

\begin{equation}
Cov(y_{t-1},y_{t-2})=Cov(y_{t},y_{t-1}) 
\end{equation}

A further trick: to compute such covariances, assume that $Var(y_t)=Var(y_{t-1})=V$, and then expand only the first term. For example:

\begin{equation}
y_t = \frac{1}{3}y_{t-1} - \frac{1}{4}y_{t-2} + \epsilon_t
\end{equation}

Then:

\begin{equation}
Cov(y_{t-1},y_{t-2})=Cov(y_{t},y_{t-1})
\end{equation}

See review questions 2.

\subsection{(Theoretical) ACF $\rho_h$}

This is the theoretical analog of the sample ACF.
For a weakly stationary process:

\begin{equation}
\rho_h = Cor(y_t,y_{t+h}) =  
\frac{Cov(y_t,y_{t+h})}{\sqrt{Var(y_t) Var(y_{t+h})}} = \frac{\gamma_h}{\gamma_0}
\end{equation}

Note that $h=0, \pm 1,\dots$.


$\rho_h$ has the properties:

\begin{enumerate}
\item
$-1\leq  \rho_h \leq 1$
\item
$\rho_h = \rho_{-h}$ 
\item
$\rho_0=1$
\item
$\rho_h=0$ if $y_t$ and $y_{t+h}$ independent. 
Note: if $\rho_h=0$, this does not entail $y_t$ and $y_{t+h}$ independent unless both are normally distributed variables.
\end{enumerate}

\subsection{Purely random series (white noise, WN)}

This is a very important example of a stationary process. It's called white noise because, like with white light, all frequencies enter equally.

Such a sequence is called a white noise sequence:

\begin{equation}
y_t = \eps_t \quad E[\eps_t] = 0, Var( \eps_t) = \sigma^2
\end{equation}

$\rho_h=1$ for $h=0$, and 0 otherwise.

\subsubsection{Example of stationary process: A moving average}

See p.\ 14 of Cryer et al book.

\subsubsection{Example of non-stationary process: Random walk}

Examples: stock prices and Brownian motion.
Not stationary because the variance is not constant, and the covariance function $\gamma_{t,s}=t\sigma^2$ depends on $t$ rather than only on the time lag.

Let $e_1,e_2,\dots$ be an iid sequence of RVs, mean zero and variance $\sigma^2$.
We can write:

\begin{equation}
y_t = y_{t-1} + e_t = e_1 + \dots+e_t
\end{equation}

\begin{enumerate}
\item $E[y_t] = 0$
\item $Var(y_t) = t\sigma^2$
\item Covariance function. Assume without loss of generality that $1\leq t \leq s$.
Then:

\begin{equation}
\begin{split}
\gamma_{t,s}=&Cov(y_t,y_s) \\
=& Cov(e_1+\dots+e_t,e_1+\dots+e_s)\\
=& Cov(\sum_{i=1}^{t}e_i,\sum_{j=1}^{s}e_j)\\
\end{split}
\end{equation}

If $i\neq j$, then 0, else if $i=j$, it is $Var(e_i)=\sigma^2$.
Since $1\leq t \leq s$, there are t such cases, so 

\begin{equation}
\gamma_{t,s}=t \sigma^2
\end{equation}

Note also that 

\begin{equation}
\gamma_{s,s}=s \sigma^2 \quad \gamma_{t,t}=t \sigma^2 
\end{equation}


\item
The ACF: 

\begin{equation}
%% was incorrectly written as \gamma_{t,s}
\rho_{t,s}
=\frac{\gamma_{t,s}}{\sqrt{\gamma_{t,t}\gamma_{s,s}}}=\frac{t \sigma^2}{\sqrt{t}\sqrt{s}\sigma^2}= \sqrt{\frac{t}{s}}
\end{equation}
\end{enumerate}

\textbf{Interesting observation in Cryer and Chan p 14}:
Note that in the plot below that even though the theoretical mean is 0 for all time points, the fact that variance increases with time and that neighboring values have correlation nearly 1 means that we should expect long excursions of the process away from the mean of zero. 

<<>>=
## Gaussian random walk with 1000 steps: ys[1] is the first N(0,1) draw and
## every later value is the previous value plus a fresh N(0,1) draw.
ys <- Reduce(`+`, rnorm(1000), accumulate = TRUE)
plot(1:1000, ys, type = "l")
@

Note that differencing: $\nabla y_t = y_t - y_{t-1}$ will make it stationary. See below.


\subsection{Autoregressive (AR) models}

This is the definition of an AR(p) sequence:

\begin{equation}
y_t = a_1y_{t-1} +a_2y_{t-2} +\dots+a_py_{t-p}  + \eps_t 
\end{equation}

$E[\eps_t] = 0,  Var( \eps_t) = \sigma^2, \eps_t \sim WN(0,\sigma^2)$.


\paragraph{Example: Unemployment Figures}

\begin{itemize}
\item
$U_t$: the unemployment total at time t.
\item 
$1-\alpha$: proportion who find a job before the next month. So $\alpha$ is proportion unemployed in next month.
\item 
Then, $U_t$: people unemployed from last month plus people unemployed this month:

$U_t=\alpha U_{t-1} + \eta_t $
\item If $E[U_t]=\mu$ for all t, then, if we subtract $\mu$ from both sides:

\begin{equation}
U_t - \mu = \alpha U_{t-1} -\mu + \eta_t 
\end{equation}

and then add $\alpha \mu - \alpha \mu$ to the RHS:

\begin{equation}
U_t - \mu = \alpha U_{t-1} -\mu + \eta_t + (\alpha \mu - \alpha \mu)
\end{equation}

Rearranging terms and collecting common terms together:

\begin{equation}
U_t - \mu= \alpha( U_{t-1} - \mu)+ (\alpha -1)\mu + \eta_t
\end{equation}

Setting $y_t = U_{t} - \mu$,
and $y_{t-1} = U_{t-1} - \mu$, and 
$\eps_t = (\alpha-1)\mu + \eta_t$,
we can write:

\begin{equation}
y_t= \alpha y_{t-1}+ \eps_t
\end{equation}

\item
Two conditions that must be satisfied:

\begin{itemize}
\item
$E[\eps_t]=(\alpha-1)\mu+E[\eta_t] = 0$
\item
$E[\eps_t]=0, Var(\eps_t)=\sigma^2$
\end{itemize}
\end{itemize}

\subsubsection{AR(1)}

This is the special case where we regress only on the previous value:

\begin{equation}
y_t= \alpha y_{t-1}+ \eps_t \quad \eps_t \sim WN(0,\sigma^2)
\end{equation}

We can use the backward shift operator (a functional: a function that takes a function as an argument) B, to express AR(1).

By convention, we let $B^0 y_t = y_t$ (i.e., $B^0$ is the identity). This allows us to write, more generally:

\begin{equation}
%By_t = y_{t-1} \quad \hbox{ or } 
B^{i}y_t = y_{t-i}, \quad i=0,1,\dots,t
\end{equation}

Then, we can write 

\begin{equation}
y_t= \alpha y_{t-1}+ \eps_t 
\end{equation}

as

\begin{equation}
y_t= \alpha By_t+ \eps_t 
\end{equation}

Rearranging to get $\eps_t$ on RHS:

\begin{equation}
y_t - \alpha By_t= \eps_t \Rightarrow (1 - \alpha B)y_t= \eps_t 
\end{equation}

\begin{equation}
\phi(B)y_t = \eps_t \quad \phi(B)=(1 - \alpha B)
\end{equation}

Here, $\phi(B)$ is a polynomial of degree one (a linear function of B).

We can express $y_t$ as follows:

\begin{equation}
y_t = \phi(B)^{-1}\eps_t \quad \phi(B)=(1 - \alpha B)
\end{equation}

\begin{framed}
More generally, for an AR(p) process,

$\phi(B)=1-a_1 B^1 -\dots-a_p B^p$
\end{framed}

\paragraph{Basic properties of AR(1) processes}\label{basicpropertiesAR1}

\begin{enumerate}
\item \textbf{MA($\infty$) representation}: Consider AR(1):

\begin{equation}
\begin{split}
y_t=& \alpha y_{t-1}+ \eps_t \\
=& \alpha (\alpha y_{t-2}+ \eps_{t-1}) + \eps_t \\
=& \alpha (\alpha (\alpha y_{t-3}+ \eps_{t-2})+ 
\eps_{t-1}) + \eps_t \\
=& \dots\\
=& \alpha^t y_0 + \sum_{i=0}^{t-1} \alpha^i \eps_{t-i}  
\end{split}
\end{equation}

As $t\rightarrow \infty$, if $\mid \alpha \mid < 1$ and $y_0$ finite, we get:

\begin{equation}
y_t = \sum_{i=0}^{\infty} \alpha^i \eps_{t-i}
\end{equation}

Thus, as long as $\mid \alpha \mid <1$, we have an \textbf{infinite moving average} (MA($\infty$)) representation of AR(1):

\begin{equation}
y_t = \sum_{i=0}^{\infty} \alpha^i \eps_{t-i}
\end{equation}

[Recall definition of MA's, section~\ref{movavg}.]

This will make it easier to compute mean, variance, and acf.

\item \textbf{Expectation, variance and acf}:

\underline{Expectation}

\begin{equation}
E[y_t] = \sum_{i=0}^{\infty} \alpha^i E[\eps_{t-i}] = 0
\end{equation}

\underline{Variance}

\begin{equation}
Var(y_t) = \sum_{i=0}^{\infty} \alpha^{2i} 
\explain{Var(\eps_{t-i})}{\hbox{indep. because white noise}} = 
\frac{\sigma^2}{1-\alpha^2} \quad \mid \alpha \mid < 1
\end{equation}

The above follows due to the geometric series (infinite sum).

\underline{Covariance}

The covariance: (Recall: Cov(X,Y)=E[XY] - E[X]E[Y])

\begin{equation}
\begin{split}
~& Cov(y_t, y_{t+h})= \\
\gamma_h = 
~& E[y_t y_{t+h}]-\explain{E[y_t]}{=0}\explain{E[y_{t+h}]}{=0} =
E[y_ty_{t+h}] \\
\end{split}
\end{equation}

Since $y_t = \sum_{i=0}^{\infty} \alpha^i \eps_{t-i}$ and 
$y_{t+h} = \sum_{j=0}^{\infty} \alpha^j \eps_{t+h-j}$,

\begin{equation}
\gamma_h = E[y_ty_{t+h}]
\end{equation}

\begin{equation}
\begin{split} \label{expectationytytplush}
E[y_ty_{t+h}] =& 
E[\sum_{i=0}^{\infty} \alpha^i \eps_{t-i}
\sum_{j=0}^{\infty} \alpha^j \eps_{t+h-j}]\\
=& E[\sum_{i,j=0}^{\infty} \alpha^i \eps_{t-i}
\alpha^j \eps_{t+h-j}]\\
\end{split}
\end{equation}

\label{zeroexpectation}
If $t-i \neq t+h -j$, we have $E[\eps_{t-i}\eps_{t+h-j}]=0$ because the errors are independent.
And if $t-i = t+h -j$, this implies that $j=i+h$. Consequently, for this case, we can write

\begin{equation}
\begin{split}
E[\eps_{t-i}\eps_{t+h-j}] =& 
E[\eps_{t-i}\eps_{t+h-(i+h)}]\\
=& E[\eps_{t-i}\eps_{t-i}]\\
=& E[\eps_{t-i}^2]=Var(\eps_{t-i})=\sigma^2
\end{split}
\end{equation}

The last line above follows because 
$Var(\eps_{t-i})=E[\eps_{t-i}^2]-E[\eps_{t-i}]^2=E[\eps_{t-i}^2]$.

Now, going back to equation~\ref{expectationytytplush},
we can write $E[\sum_{i,j=0}^{\infty} \alpha^i \eps_{t-i}
\alpha^j \eps_{t+h-j}]$ as:

\begin{equation}
\begin{split}
E[y_ty_{t+h}] =& \sigma^2 \sum_{i=0}^{\infty} \alpha^{i}\alpha^{i+h} \quad j=i+h \\
\end{split}
\end{equation}

This amounts to saying that

\begin{equation}
\begin{split}
\sigma^2 \sum_{i=0}^{\infty} \alpha^{i}\alpha^{i+h} =& 
\sigma^2 \sum_{i=0}^{\infty} \alpha^{i}\alpha^{i}\alpha^h\\
=& \sigma^2 \alpha^h \sum_{i=0}^{\infty} \alpha^{2i}
\end{split}
\end{equation}

So, 
\begin{equation}
\begin{split}
E[y_ty_{t+h}] =& \sigma^2 \alpha^h\sum_{i=0}^{\infty}  \alpha^{2i}\\
=& \frac{\sigma^2\alpha^h}{1-\alpha^2} \quad \mid \alpha \mid < 1
\end{split}
\end{equation}

So, we have:

\begin{equation}
\gamma_h = \frac{\sigma^2\alpha^h}{1-\alpha^2}
\end{equation}

\underline{ACF}

\begin{equation}
\begin{split}
\rho_h =& \frac{\gamma_h}{\gamma_0}\\
=& \frac{\cancel{\sigma^2} \alpha^h/\cancel{(1-\alpha^2)}}{\cancel{\sigma^2}
\alpha^0/\cancel{(1-\alpha^2)}}\\
=& \alpha^{\mid h \mid} \quad h=0, \pm 1, \dots \mid \alpha \mid < 1\\
\end{split}
\end{equation}

%[All that hard work, for this?]

\textbf{So the mean, variance, covariance, and ACF do not depend on time t.} So, the $y_t$
satisfies the conditions for stationarity.  


\end{enumerate}

<<>>=
## grid of positive AR(1) coefficients and of lags
a <- seq(0.01, .99, by = .1)

h <- seq(0, 100, by = 1)

## 5x2 grid of square panels; op holds the settings that were in force before,
## and is assigned only once so that they can be restored at the end
op <- par(mfrow = c(5, 2), pty = "s")
## one panel per coefficient: alpha^h/(1-alpha^2) against the lag h
for (alpha in a) {
  plot(h, alpha^h/(1-alpha^2), type = "l", main = alpha)
}

## the same display for the negated coefficients
nega <- -1*a
## setting mfrow again starts a fresh 5x2 page (op is deliberately not reassigned)
par(mfrow = c(5, 2), pty = "s")
for (alpha in nega) {
  plot(h, alpha^h/(1-alpha^2), type = "l", main = alpha)
}
## restore the settings saved in op so later plots are not drawn into the 5x2 grid
par(op)
@

\textbf{Note that $\mid \alpha \mid<1$ (in order for the series to converge).} This fact appears below as well, section~\ref{arpolynomial}.

\paragraph{Example of non-convergence when $ \alpha = 1$: Random Walk}


If $\alpha=1$, then the AR(1) doesn't 
have stationarity anymore:

The definition of AR(1) is:

\begin{equation}
y_t = \alpha y_{t-1} + \eps_t
\end{equation}

<<>>=
nsim <- 10000
## the alpha = 1 case: the first value is one N(0,1) draw and each later value
## is the previous value plus a new N(0,1) draw, built as a running sum
ys <- Reduce(`+`, rnorm(nsim), accumulate = TRUE)
plot(ts(ys))
@

If $\alpha=1$, then 

\begin{equation}
y_t = y_{t-1} + \eps_t
\end{equation}

The expectation of $y_t$:

\begin{equation}
E[y_t] = E[y_{t-1}] + E[\eps_t] = E[y_{t-1}] \forall t
\end{equation}

Therefore, $E[y_t]=E[y_1]=constant$. This satisfies the first condition for
stationarity.

But the variance is:

\begin{equation}
Var(y_t) = Var(y_{t-1}) + Var(\eps_t)
\end{equation}

We have to establish here that $y_{t-1}$ and $\eps_t$ are independent (so that the variance of their sum is the sum of their variances). This is so because we can write:

\begin{equation}
y_{t-1} = y_{t-2}+\eps_{t-1}
\end{equation}

So, $y_{t-1}$ is a function of $\eps_{t-1},\eps_{t-2},\dots$. And
$\eps_{t-i}$ is independent of $\eps_{t}$ as $t-i \neq t$, for $i=1,2,\dots$.

Due to the independence of $y_{t-1}$ and $\eps_t$, it follows that:

\begin{equation}
Var(y_t) = Var(y_{t-1}) + Var(\eps_t)
=  Var(y_{t-1}) + \sigma^2
\end{equation}

This proves that the variance will increase with t; it will go to infinity in the limit.
This violates stationarity unless $\sigma^2=0$. Note that if 
$\sigma>0$ and $Var(y_{t-1})$ is infinite, then we have stationarity.

Writing the above recursively:

\begin{equation}
\begin{split}
Var(y_t) =& Var(y_{t-1}) + Var(\eps_t)\\
=&  Var(y_{t-1}) + \sigma^2\\
=&  Var(y_{t-2}) + \sigma^2+\sigma^2\\
=& Var(y_{1})+\underbrace{\sigma^2+\dots+\sigma^2}_\text{t-1 times}\\
=& Var(y_{0})+\underbrace{\sigma^2+\dots+\sigma^2}_\text{t times}\\
=&Var(y_{0})+t\sigma^2
\end{split}
\end{equation}

So, as $t\rightarrow \infty$, $Var(y_t)\rightarrow \infty$, since $\sigma^2>0$. Actually, we don't even need to go so far as to find the limit, we just need to note that $Var(y_t)$ depends on $t$, so the sequence 
$\{y_t\}$ is not stationary. 
\textbf{This kind of model is relevant in finance (stock prices).}

\textbf{Note that for AR(1) in general, $Var(y_t)<\infty \Leftrightarrow$ stationarity}.

\underline{Note that differencing will reduce $y_t$ to stationarity}:

\begin{equation}
\nabla y_t = y_t - y_{t-1} = \cancel{y_{t-1}} + \eps_t - \cancel{y_{t-1}} =
 \eps_t
\end{equation}

$Var(\nabla y_t) = Var(y_{t-1} - y_{t-1} + \eps_t) = Var(\eps_t) = \sigma^2$.

\paragraph{Example} Note that a negative alpha will lead to the acf 
``oscillating'' between positive (even) and negative (odd) values.

\begin{equation}
y_t = -0.8\times y_{t-1}+\eps_t
\end{equation}

<<>>=
## simulate 100 observations from an AR(1) model with coefficient -0.8
y <- arima.sim(model = list(ar = -0.8), n = 100)
ts.plot(y)
acf(x = y)
@

\subsection{AR(p), $p>1$}\label{arpprocess}

Given an AR(p) process:

\begin{equation} 
y_t = a_{1}y_{t-1} + \dots + a_p y_{t-p} + \eps_t 
\end{equation}

The general approach will be to get the white noise to the right, and the y's to the LHS:

\begin{equation}
\begin{split}
~& y_t - a_{1}y_{t-1} - \dots - a_p y_{t-p} =  \eps_t \\
\Leftrightarrow & y_t - a_1 B y_t - \dots - a_p B^p y_t = \eps_t \quad \hbox{ recall } B^i y_t =  y_{t-i}\\
\Leftrightarrow& \underbrace{(1-a_1 B - \dots - a_p B^p)}_\text{$\phi(B)$} 
y_t = \eps_t\\
\Leftrightarrow& \phi(B) y_t = \eps_t
\end{split}
\end{equation}

This gives us the moving average representation.

The $MA(\infty)$ representation is

\begin{equation}
y_t = \sum_{i=0}^{\infty} \psi_i \eps_{t-i} \quad \hbox{ where } \psi_i \hbox{ are appropriate constants }
\end{equation}

The polynomial $\phi(B)$ is extremely important, because it determines whether we have a stationary process or not.

\subsubsection{The Autoregressive Polynomial}\label{arpolynomial}

\begin{equation}
\phi(x) = 1- a_1 x - \dots - a_p x^p = 1- \sum_{i=1}^{p} a_i x^i 
\end{equation}

The condition we will require is: if $\phi(x)=0$, then $\mid x \mid >1$ (i.e., all roots lie outside the unit circle).

\begin{definition}
An AR(p) process is causal iff all roots of $\phi(x)= 0$ lie outside the unit circle.
\end{definition}

In sum:

\begin{enumerate}
\item The process

\begin{equation} 
y_t = a_{1}y_{t-1} + \dots + a_p y_{t-p} + \eps_t 
\end{equation}

is stationary iff the roots of the polynomial $\phi(x)$ lie outside the unit circle, i.e., $\phi(x) = 0 \Rightarrow \mid x \mid > 1$. (x can be real or complex)
\item
An AR(p) process is causal if 
\begin{enumerate}
\item it is stationary
\item and can be represented in terms of the white noise variable $\eps_i$ in $MA(\infty)$ form:
\begin{equation}
y_t = \sum_{i=0}^{\infty} \psi_i \eps_{t-i} 
\end{equation}

Proof of above is in Brockwell and Davis 2002.

The sequence $\{\psi_i \}$ depends on the AR parameters $a_j$ for $j = 1,\dots,p$. (the change in index is not a typo; think about it.)

\end{enumerate}

\end{enumerate}


\paragraph{Example} AR(1)

In AR(1)

\begin{equation}
y_t = ay_{t-1} + \eps_t
\end{equation}

was stationary if $\mid a \mid < 1$. If p=1, then the AR polynomial is 

$\phi(x)= 1- ax$

The root is $\phi(x)=0\Rightarrow x=\frac{1}{a}$, so $\mid x \mid = \frac{1}{\mid a\mid}$. The root lies outside the unit circle, $\mid x\mid >1$, exactly when $\mid a\mid<1$. See discussion above on $\mid a\mid <1$ (section~\ref{basicpropertiesAR1}).

For examples: See Problem 9.

\subsubsection{Deriving ACF for AR(p): The Yule-Walker (difference) equations}

We are going to derive $\gamma_h$ for AR(p).

Recall (we proved this earlier) that 
$\gamma_h = E[y_{t-h} y_t]=\gamma_{-h}$, because the ACF depends only on the lag, not on time t.

\begin{equation}
\begin{split}
\gamma_h =& Cov(y_{t-h} y_t)\\
=& E[y_{t-h} y_t] \quad \hbox{ recall } E[y_t]=0\\
\end{split}
\end{equation}
 
Replace $y_t$ with its expansion in AR(p). I will replace $\alpha$ with $a$ for convenience:

\begin{equation}
E[y_{t-h} y_t] = E[y_{t-h} (a_1y_{t-1}+\dots+a_p 
y_{t-p}+\eps_t)]
\end{equation}

Multiplying out the terms:

\begin{equation}
\begin{split}
E[y_{t-h} (a_1y_{t-1}+\dots+a_p 
y_{t-p}+\eps_t)] =& \\
a_1 E[y_{t-h}y_{t-1}]+ \dots ~&  \\
+ a_p E[y_{t-h}y_{t-p}]+
E[y_{t-h} \eps_t] ~&\\
\end{split}
\end{equation}

We will now work out $E[y_{t-h}y_{t-1}]$. 
We can rewrite this as $E[y_{(t-1)-(h-1)}y_{t-1}]$. Rewrite $t-1$ as $t*$.

\begin{equation}
\begin{split}
E[y_{t-h}y_{t-1}] =& E[y_{(t-1)-(h-1)}y_{t-1}]\\
=& E[y_{t*-(h-1)}y_{t*}]\\
\end{split}
\end{equation}

The model is causal; the ACF doesn't depend on t, so 

\begin{equation}
E[y_{t*-(h-1)}y_{t*}]=\gamma_{h-1}
\end{equation}

So, we are going to get:

\begin{itemize}
\item $E[y_{t*-(h-1)}y_{t*}]=\gamma_{h-1}
$
\item \dots
\item $E[y_{(t-p)-(h-p)}y_{t-p}]=
E[y_{t*-(h-p)}y_{t*}]=\gamma_{h-p}$
\end{itemize}

Next, we show that $E[y_{t-h}\eps_t]=E[y_{t-h}]E[\eps_t]=0$ (for $h\geq 1$).
Since $y_t$ is causal, 
$y_{t-h}$ is written as a linear combination 
of $\eps_{t-h}, \eps_{t-h-1},\dots$, and each of these $\eps$ are independent of $\eps_t$ because the indices are different; 
i.e., $t-h\neq t, t-h-1\neq t,\dots$,
since $\{\eps_t\}$ is white noise.

Hence $y_{t-h}$ is independent of $\eps_t$.

%Since $y_{t-h}$ can be written as a linear combination of all $\eps$, the last term 
%$E[y_{t-h} \eps_t]=0$: 
%$E[y_{t-h}\eps_t]= E[y_{t-h}]E[\eps_t]=0$.

From the above results, we can write:

\begin{equation}
\gamma_h= E[y_{t-h}y_t] = a_1 \gamma_{h-1}+
\dots+a_p \gamma_{h-p}
\end{equation}

So, ACF of AR(p) is given by

\begin{equation}
\rho_h=\frac{\gamma_h}{\gamma_0}=
\frac{a_1 \gamma_{h-1}+ \dots+a_p \gamma_{h-p}}{\gamma_0}
\end{equation}

In other words:

\begin{framed}
\begin{equation}
\rho_h=a_1 \rho_{h-1}+ \dots+a_p \rho_{h-p}
\end{equation}
\end{framed}

The above is the \textbf{Yule-Walker (difference) equation}.

For examples: See Problems 11, 14.

For example, an AR(2) process:
$y_t = 0.5 y_{t-1} + 0.25 y_{t-2} + \eps_t$.

<<>>=
## not sure if this is right: 
#y<-arima.sim(100,model=list(ar=c(0.5,0.25)))
#ts.plot(y)
#acf(y)
@

\subsection{MA processes of order q: MA(q)}

\begin{equation}
y_t = \sum_{i=0}^{q} \beta_i \eps_{t-i} = \beta_0 \eps_t +
\beta_1 \eps_{t-1} + \dots +
\beta_q \eps_{t-q}
\end{equation}

Assume that $\beta_0=1$. 

Why do we call it a Moving Average (MA) process? If you consider $\beta_i$ as weights, then this equation is reminiscent of the MAs. But the MA process, confusingly, has nothing to do with Moving Averages discussed earlier in section~\ref{movavg}.
This equation is about estimation. 

\subsubsection{The expectation}

As before, the mean of $y_t=0$.

\begin{equation}
E[y_t] = \sum_{i=0}^{q} \beta_i E[\eps_{t-i}] = 0 \hbox{ since } E[\eps_{t-i}]=0
\end{equation}

\subsubsection{Variance}

\begin{equation}
\begin{split}
Var(y_t) =& Var(\sum_{i=0}^{q} \beta_i \eps_{t-i})\\
=& \sum_{i=0}^{q} \beta_i^2 Var(\eps_{t-i}) \quad \hbox{ because } \eps_{t-i},\eps_{t-j} \hbox{ are indep.\ RVs for } i\neq j\\
=& \sigma^2 \sum_{i=0}^{q} \beta_i^2 \\
=& \sigma^2 (1+\beta_1^2+\dots+\beta_q^2)
\end{split}
\end{equation}

Note that this implies that 

\begin{equation}
\gamma_0 = Cov(y_t y_{t+0}) =  Var(y_t) = \sigma^2 \sum_{i=0}^{q} \beta_i^2
\end{equation}


\subsubsection{ACF of MA(q)}

We just established (above) that:

\begin{equation}
\gamma_0 = Cov(y_t y_{t+0}) =  Var(y_t) = \sigma^2 \sum_{i=0}^{q} \beta_i^2
\end{equation}

Next, we derive $\gamma_h$, in order to compute the ACF $\rho_h = \gamma_h/\gamma_0$.

\begin{equation} \label{gammahmaq}
\begin{split}
\gamma_h = & Cov(y_t y_{t+h}) \\
= & E[y_t y_{t+h}]\\
\end{split}
\end{equation}

Since $y_t = \sum_{i=0}^{q} \beta_i \eps_{t-i}$
and $y_{t+h} = \sum_{j=0}^{q} \beta_j \eps_{t+h-j}$,
we can write the last line in equation~\ref{gammahmaq} as:

\begin{equation}
\begin{split}
 E[y_t y_{t+h}] = & E[\sum_{i=0}^{q} \sum_{j=0}^{q}
 \beta_i \beta_j \eps_{t-i}\eps_{t+h-j} ]\\
 = & \sum_{i=0}^{q} \sum_{j=0}^{q}
 \beta_i \beta_j E[\eps_{t-i}\eps_{t+h-j}]\\
\end{split}
\end{equation}

Only when $t-i = t+h-j$, and consequently $j=i+h$, do we have a non-zero expectation. When $t-i \neq t+h-j$, expectation of the $\eps$ is 0. (see page~\pageref{zeroexpectation}).

When $E[\cdot]\neq 0$, $E[\eps_{t-i}^2] = Var(\eps_{t-i})=\sigma^2$.
So, 

\begin{equation}
\begin{split}
\gamma_h = & Cov(y_t y_{t+h}) \\
= & E[y_t y_{t+h}]\\
= & \sum_{i=0}^{q-h} \beta_i \beta_{i+h} \sigma^2 \\
=& \sigma^2 \sum_{i=0}^{q-h} \beta_i \beta_{i+h}\\
=& \sigma^2 \sum_{i=0}^{q-h} \beta_i \beta_{i+h}\\
\end{split}
\end{equation}

The upper bound in the summation is $q-h$ because we have $i+h$ as a subscript, and $(q-h) + h = q$, which is the maximum value we can have. Hence, $i$ can only have maximal value $q-h$ when $j=i+h$.

So:

\begin{equation}
\gamma_h= 
\begin{cases}
 \sigma^2 \sum_{i=0}^{q-h} \beta_i \beta_{i+h} & \hbox{ if } 
0\leq h \leq q  \\
0 & \hbox{otherwise}\\
\end{cases}
\end{equation}

It follows that 

\begin{equation}
\rho_h = \frac{\gamma_h}{\gamma_0} = \frac{\cancel{\sigma^2} \sum_{i=0}^{q-h} \beta_i \beta_{i+h}}{\cancel{\sigma^2} \sum_{i=0}^{q}\beta_i^2}
= \frac{\sum_{i=0}^{q-h} \beta_i \beta_{i+h}}{\sum_{i=0}^{q}\beta_i^2}
\end{equation}

Thus,

\begin{equation}
\rho_h= 
\begin{cases}
0 & \hbox{ for } h>q\\
\frac{\sum_{i=0}^{q-h} \beta_i \beta_{i+h}}{\sum_{i=0}^{q}\beta_i^2} & \hbox{ for }  h\leq q\\
\end{cases}
\end{equation}

So, we could in principle use ACF to identify which $q$ is appropriate, because as soon as $h>q$, the ACF function will have a value of $\rho_h=0$.

<<>>=
y<-arima.sim(1000,model=list(ar=.9))
plot.ts(y)
acf(y)
@

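The above kind of plot is a very useful tool for identifying MA(q) processes. Note, however, that the simulation above is of an AR(1) process (\texttt{ar=.9}), whose ACF decays gradually; to see the ACF cut off after lag $q$, simulate an MA process instead, e.g., with \texttt{model=list(ma=.9)}.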

\paragraph{Notes}

\begin{enumerate}
\item $q\rightarrow \infty \Rightarrow MA(\infty)$, which is used to represent causal AR(p) processes (e.g., AR(1)) and ARMA processes.
\item 
Stationarity holds iff $Var(y_t) < \infty$, that is, $\sum_{i=0}^{q} \beta_i^2 < \infty$.
\end{enumerate}

So, we can identify an appropriate model by looking at the ACF. We want a 1-to-1 relationship between ACF and model. But we have a problem: the ACF is non-unique.

\subsubsection{Non-uniqueness of ACF}

Consider an MA(1) process:

\begin{equation}
y_t = \beta_0\eps_t + \beta_1 \eps_{t-1}
=\eps_t + \beta_1 \eps_{t-1}
\end{equation}

The ACF is 
$\rho_h = \frac{\beta_1}{1+\beta_1^2}$ when $h=1$ and 0 for $h>1$.

Suppose now that we have a second MA(1) model:

\begin{equation}
x_t =\eps_t + \frac{1}{\beta_1} \eps_{t-1}
\end{equation}

The ACF now is 

\begin{equation}
\rho_h' = 
\frac{\frac{1}{\beta_1}}{1+\frac{1}{\beta_1^2}}
= \frac{\beta_1}{1+\beta_1^2}= \rho_h !!!
\end{equation}

So, two clearly different models have the same ACF: looking at the ACF will not necessarily lead to a unique model.

\subsubsection{Invertible process}

If we want to use ACF to identify a unique model, we can do the following. Take an MA(1) model:

\begin{equation}
y_t = \eps_t + \beta_1 \eps_{t-1}
\end{equation}

Let 

\begin{equation}
\eps_t = y_t  - \beta_1 \eps_{t-1}  
\end{equation}

and 

\begin{equation}
\eps_{t-1} = y_{t-1}  - \beta_1 \eps_{t-2}  
\end{equation}

Expanding out $\eps_{t-1}$ in the following equation

\begin{equation}
y_t = \eps_t + \beta_1 \eps_{t-1}
= \eps_t + \beta_1(y_{t-1}  - \beta_1 \eps_{t-2})
\end{equation}

This is going to be an infinite series:

\begin{equation}
y_t = - \sum_{i=1}^{\infty} (-\beta_1)^{i} y_{t-i} + \eps_t 
\end{equation}

This is an $AR(\infty)$ representation, and the series converges iff $\mid \beta_1\mid < 1$. This rules out the second model above, as there $\mid \frac{1}{\beta_1} \mid = \mid \beta' \mid > 1$.

to-do: Add problem 11 on invertibility

\section{ARMA(p,q) processes}

\begin{definition}
$y_t$ is an ARMA(p,q) process if it is stationary and satisfies:

\begin{equation}
y_t = 
\underbrace{a_1 y_{t-1} + \dots + a_p y_{t-p} + \eps_{t}}_\text{AR(p)} + \underbrace{b_1 \eps_{t-1} + \dots + b_q \eps_{t-q}}_\text{MA(q)}
\end{equation}
\end{definition}

We can simplify this by using the backward shift operator. 

Let $\phi(x)=1-a_1 x - \dots - a_p x^p$ and $h(x)=1+b_1 x + \dots + b_q x^q$.

Since, in an AR(p) process,

\begin{equation}
y_t = 
a_1 y_{t-1} + \dots + a_p y_{t-p} + \eps_t
\end{equation}

and assuming that $By_t = y_{t-1}$, we can write:

\begin{equation}
y_t = 
a_1 By_t + a_2 B^2 y_t \dots + a_p B^py_t + \eps_t
\end{equation}

Moving all terms but $\eps_t$ to LHS:

\begin{equation}
y_t -a_1 By_t - a_2 B^2 y_t \dots - a_p B^py_t = 
  \eps_t
\end{equation}

we get

\begin{equation}
\underbrace{(1 -a_1 B - a_2 B^2 - \dots - a_p B^p)}_\text{$\phi(B)$} y_t = 
  \eps_t
\end{equation}


Similarly, (and recalling that we define $B^0 = 1$, i.e., $B^0 y_t = y_t$), 
we can define $B\eps_{t} = \eps_{t-1}$ and write:

\begin{equation}
b_0 \eps_t + b_1 \eps_{t-1} + \dots + b_q \eps_{t-q}
\end{equation}

as

\begin{equation}
b_0 B^0 \eps_t + b_1 B\eps_{t} + \dots + b_qB^q\eps_{t}
\end{equation}

or as

\begin{equation}
\underbrace{(1 + b_1 B + \dots + b_qB^q)}_\text{h(B)}\eps_{t}
\end{equation}

So, we can write an ARMA(p,q) process as:

\begin{equation}
\phi(B)y_t = h(B)\eps_{t}
\end{equation}

Notice that this yields a stationary $MA(\infty)$ representation:

\begin{equation}
y_t = (\phi(B))^{-1}h(B)\eps_{t} = \sum_{i=0}^{\infty} \psi_i \eps_{t-i}
\end{equation}

$\phi(B)$ (p parameters) is the \textbf{characteristic polynomial} of the AR model, and 
$h(B)$ (q+1 parameters) is the \textbf{characteristic polynomial} of the MA model.

\textbf{We get a stationary $MA(\infty)$ representation}

\begin{equation}
y_t = (\phi(B))^{-1} h(B) \eps_t = \sum_{i=0}^{\infty} \psi_i \eps_{t-i}
\end{equation}

iff all roots of $\phi(x)=0$ lie outside the unit circle. This process is called \textbf{causal}. Recall that causality implies stationarity; so if you need to check for stationarity, check for causality.

\textbf{We get a stationary $AR$ representation}

\begin{equation}
\eps_t = \phi(B) (h(B))^{-1} y_t = \sum_{j=0}^{\infty} \delta_j y_{t-j}
\end{equation}

iff all roots of h(x)=0 lie outside unit circle. The process is said to be \textbf{invertible}. \textbf{If this part is invertible, then the ARMA(p,q) process is invertible.}

to-do add problem 12

\subsection{ACF of an ARMA(p,q) process}

The ACF of the AR part will be the ACF of the ARMA process.

This is because, when we compute the acf, we have to compute the expectation of
$y_{t-h}y_t$. Now,

\begin{equation}
\begin{split}
~& y_{t-h}y_t = 
y_{t-h}(a_1 y_{t-1} + \dots + a_p y_{t-p} +\eps_{t}) \\
~&+ \underbrace{y_{t-h}(b_1 \eps_{t-1} + \dots + b_q \eps_{t-q})}_\text{Expectation 0}\\
\end{split}
\end{equation}


If $h>q$, then $y_{t-h}$ is uncorrelated with $\eps_t, \eps_{t-1},\dots,\eps_{t-q}$.
This is because $y_{t-h}$ can be expressed in terms of $\eps$, and each of these will be independent from the $\eps$ in the MA part of the equation above. So, the part in underbrace will have expectation 0.

The AR part will determine $\gamma_h$, and this can be computed as earlier.

\textbf{We will not discuss the case where $h\leq q$ in the course.}

Example: ARMA(1,1):

\begin{equation}
y_t = 0.8 y_{t-1} + \eps_t + 0.9 \eps_{t-1}
\end{equation}

<<>>=
y <- arima.sim(100, model=list(ar=0.8,ma=0.9))
ts.plot(y)
acf(y)
@

\section{Non-stationary ARIMA}

Stationarity is only studied for convenience, because it has nice properties. But real data is non-stationary.

We will reduce a NS model to a stationary model by differencing, model the $\nabla$ series by a causal or invertible process, and then transform back to the NS process. This is the Box-Jenkins approach.

Recall that 

\begin{equation}
\nabla^k y_t = (1-B)^k y_t 
\end{equation}

Let $y_t$ be the time series. If we apply d-th order differencing to it, we get

\begin{equation}
x_t =  (1-B)^d y_t 
\end{equation}

$x_t$ is now a stationary process, and we can define an ARMA model for it.

\begin{definition}
For integer $d\geq 0$, the series $y_t$ is an ARIMA(p,d,q) process (integrated autoregressive moving average process of order p,d,q) if the d-times differenced $y_t$, call it $x_t=(1-B)^d y_t$, is ARMA(p,q).

That is:

\begin{equation}
\begin{split}
~& \phi(B)x_t =  h(B) \eps_t\\
~& \phi(B)(1-B)^d y_t = h(B)\eps_t
\end{split}
\end{equation}

Now, $\phi_1 (B)  = \phi(B)(1-B)^d $  is of order p+d. 
One root of $\phi_1 (B)$ is 1, and so $y_t$ is not stationary.
\end{definition}

\subsection{Seasonal ARIMA: SARIMA}

Note: we will use D for seasonal differencing, instead of d. D is only rarely greater than 2.

Recall: 
\begin{enumerate}
\item
$\nabla_s = 1-B^s$, so $\nabla_s y_t = y_t - y_{t-s}$. For monthly data, s=12. Usually D=1 suffices.
\item
$\nabla^d=(1-B)^d$.
\end{enumerate}

Let $x_t = \nabla_s^{D} y_t$ be a D-times seasonally differenced series, where s is the length of the seasonal variation.

In this case,

\begin{equation}
\begin{split}
x_t =&  \nabla_s^{D} y_t\\
=& (\nabla_s)^{D} y_t\\
=& (1-B^s)^D y_t \quad \hbox{ as } 
\nabla_s = 1-B^s 
\end{split}
\end{equation}

If $x_t$ is ARMA(P,Q) then $y_t$ is ARIMA(P,D,Q)$_s$.

If the data $y_t$ is non-stationary not due to seasonality but due to trends (for example),  then we can do differencing to get stationarity: 
 
 \begin{equation}
x_t = \nabla^d \nabla_s^D y_t 
 \end{equation}

Replacing each of the terms with their expansions:

\begin{equation}
x_t = \underbrace{(1-B)^d}_\text{$\nabla^d$} \underbrace{(1-B^s)^D }_\text{$\nabla_s^D$}y_t
\end{equation}
 
 Now we assume that $x_t$ follows an ARMA(P,Q) model for its seasonal terms (those lagged by s) and an ARMA(p,q) for its nonseasonal terms. 
This means that  

\begin{equation} 
\Phi_s (B^s) \phi(B) x_t = H_s(B^s)h(B)\eps_t
\end{equation}

Expanding out $x_t$:

\begin{equation} \label{sarimaformula}
\underbrace{\Phi_s(B^s)}_\text{seasonal}\underbrace{\phi(B)(1-B)^d}_\text{AR poly.}\underbrace{(1-B^s)^D y_t}_\text{stationary} = \underbrace{H_s (B^s)}_\text{seasonal} \underbrace{h(B)}_\text{MA poly.}\eps_t 
\end{equation}
 
$\Phi,\phi, H, h$  are polynomials:

\begin{framed}
Recall that

$\Phi_s(B^s)= 1-a_1 (B^s)^1 -\dots-a_P(B^s)^P$

$\phi(B)= 1-a_1 B^1 -\dots-a_p B^p$

$H_s(B^s)= 1+b_1 (B^s)^1 +\dots+b_Q (B^s)^Q$

$h(B)= 1+b_1 B^1 +\dots+b_q B^q$
\end{framed}


Such a model is called seasonal ARIMA(p,d,q)$\times$
(P,D,Q)$_s$ with period s. 
p,d,q refers to the trend, P,D,Q to the seasonal part.

BTW, note how the Backward shift operator simplifies the notation.

The application of this formula will be: Given a model SARIMA(p,d,q)$\times$
(P,D,Q)$_s$ with some instantiated values, write down the form of the model in the form $y_t=\dots$. Or vice versa. 

See Exercise 2.

\textbf{Example}:
Write down the model SARIMA(1,1,0)$\times$ (0,1,1)$_4$ as $y_t=\dots$.

\begin{enumerate}
\item Step 1: list all variables:

\begin{table}[htdp]
\begin{center}
\begin{tabular}{cc}
Trend & Seasonal\\
p=1 & P=0\\
d=1 & D=1\\
q=0 & Q=1\\
    & s=4
\end{tabular}
\end{center}
\end{table}%

These variables will determine what the polynomials should be. 

\item Step 2: Replace values of s and D in formula in equation~\ref{sarimaformula}:

\begin{equation} \label{sarimaformula2}
\begin{split}
~& \underbrace{\Phi_s(B^4)}_\text{P=0 so 1}\underbrace{\phi(B)}_\text{p=1 so $1-aB$}(1-B)\underbrace{(1-B^4)^1 y_t}_\text{stationary} \\
~& = \underbrace{H_s (B^4)}_\text{Q=1 so $1+bB^4$} \underbrace{h(B)}_\text{q=0 so 1}\eps_t 
\end{split}
\end{equation}

This gives us:

\begin{equation}
1(1-aB)(1-B^4)(1-B)y_t = (1+bB^4)\eps_t
\end{equation}

After some algebra, we get:

\begin{equation}
\begin{split}
~& (1-(a+1)B + aB^2-B^4 + (a+1)B^5 -aB^6) y_t \\
~& = (1+bB^4)\eps_t
\end{split}
\end{equation}

[to-do need to check above]

After some more algebraic manipulation,

\begin{equation}
\begin{split}
~& y_t - (a+1) \underbrace{y_{t-1}}_\text{$By_t$} + ay_{t-2} - y_{t-4} + (a+1) y_{t-5} - a y_{t-6}\\
~&= \eps_t + b\underbrace{\eps_{t-4}}_\text{$B^4$}
\end{split}
\end{equation}

Move everything but $y_t$ to RHS:

\begin{equation}
\begin{split}
y_t =& (a+1) y_{t-1} - ay_{t-2} + y_{t-4} - \\
~& (a+1)y_{t-5} + ay_{t-6} + \eps_t + b\eps_{t-4}
\end{split}
\end{equation}

\end{enumerate}

More examples are in the problem sets.

\section{Inference}

The Box-Jenkins iterative approach:

\begin{enumerate}
\item Identify initial values of p,d,q and P,D,Q
\item Estimate ARIMA parameters
\item  Diagnosis and model criticism
\item Modify the model based on 3 above (i.e., change p,d,q and P,D,Q) and go back to 2.
\end{enumerate}

\subsection{Identifying model}

\begin{figure}[H]
\includegraphics[width=8cm,height=8cm,angle=-90]{diggleplot}
\caption{Plot after Diggle.}\label{fig:diggleplot}
\end{figure}

Six ways to identify model:

\begin{enumerate}
\item Time plot: can indicate non-stationarity, seasonality, need to difference

<<>>=
## shows seasonality:
ts.plot(shef_ts,ylab="temp.",
         xlab="quarter")
@

\item ACF/Correlogram: can indicate non-stationarity, seasonality

<<>>=
## shows seasonality:
acf(shef_ts)
@

\begin{enumerate}
\item An ACF not decaying to 0 suggests non-stationarity
\item An ``oscillating'' ACF suggests seasonality (see above \texttt{shef} example).
\end{enumerate}
\item Test for white noise: Plot 95\% CIs in Correlogram, if sample autocorrelations $r_h$ lie outside intervals, then not white noise.


\underline{Details}: Under null hyp.\ $H_0: 
\rho_h = 0$ for $h\geq 1$. So, 

$r_h \sim N(0,1/n)$, for h=1,2,\dots $\Rightarrow$ CI: $\pm 2\sqrt{1/n}$

More generally, for ARMA processes, under $H_0$:

$(r_1, \dots, r_h)' \sim N((\rho_1, \dots,\rho_h)', \frac{1}{n} W)$ 

where

\begin{equation}
\begin{split}
W_{i,j}=&  \sum_{k=1}^{\infty} (\rho_{k+i}+\rho_{k-i} - 2\rho_i \rho_k)\\
~& (\rho_{k+j}+\rho_{k-j}-2\rho_j\rho_k) \quad i,j=1,\dots,h
\end{split}
\end{equation}

Note that this does not test the hypothesis $\rho_h = 0$, where, $h\geq p > 1$.
(what? this doesn't make sense)

\item Test for MA(q): If WN is ruled out, test for MA(q). Recall that for MA(q), $\rho_h = 0$ for $h>q$. 
Under $H_0: \rho_h = 0$, the sample autocorrelations $r_h$ for $h=q+1,q+2,\dots$ are independent.  So, for an MA(q) process we expect 
for large $n$:

\begin{equation}
r_h \sim N(0,\frac{1+2\sum_{j=1}^q\rho_j^2}{n})
\end{equation}

independently for $h>q$.

So, a CI based test (could be done visually):

\begin{equation}
r_h \in \pm 2\sqrt{\frac{1+2\sum_{j=1}^q\rho_j^2}{n}} \quad \hbox{ for } h>q (h=q+1,q+2,\dots)
\end{equation}

\begin{tabular}{cc}
MA(1) & $r_2, r_3,\dots < \pm 2\sqrt{\frac{1+2 (r_1^2)}{n}}$\\
MA(2) & $r_3, r_4,\dots < \pm 2\sqrt{\frac{1+2 (r_1^2 + r_2^2)}{n}}$ \\
MA(3) & $r_4, r_5,\dots < \pm 2\sqrt{\frac{1+2 (r_1^2 + r_2^2 + r_3^2)}{n}}$ \\
\end{tabular}

\item Test for AR(p): Partial ACF. The ACF of an AR(p) model does not cut off sharply but decays geometrically to 0. However, the PACF does cut off to zero sharply beyond p, just like the MA(q)'s ACF is 0 beyond q.

\begin{tabular}{cc}
AR(1) & $a_2, a_3,\dots <\pm 2\sqrt{1/n}$ \\
AR(2) & $a_3, a_4,\dots <\pm 2\sqrt{1/n}$ \\
AR(3) & $a_4, a_5,\dots <\pm 2\sqrt{1/n}$ \\
\end{tabular}


To construct PACF, we successively build AR(1), AR(2), AR(3), \dots models.

\begin{equation}
\begin{split}
p=1 =& y_t = a_1^{(1)}y_{t-1} + \eps_t\\
p=2 =& y_t = a_1^{(2)}y_{t-1} + a_2^{(2)}y_{t-2} + \eps_t\\
p=3 =& y_t = a_1^{(3)}y_{t-1} + a_2^{(3)}y_{t-2} +
a_3^{(3)}y_{t-3} + \eps_t\\
 & \vdots \\
\end{split}
\end{equation}

We define the PACF at lag h to be $\hat a_h^{(h)}$. It can be thought of as a measure of the linear dependence of $y_t$ on $y_{t-h}$ after dependence on the intervening values $y_{t-1}, \dots, y_{t-(h-1)}$ has been taken into account. To simplify notation we can drop the hat on a and also drop the superscript ($a_h$ instead of 
$\hat a_h^{(h)}$).

So the statistical test is that 
$a_h \approx 0$ for $h>p$, i.e., that the PACF cuts off sharply after p.

\underline{If $y_t$ is a WN process} (i.e., if it is not AR(p)), then 

\begin{equation}
a_h \sim N(0,1/n) \hbox{ independently for } h\geq 1
\end{equation}

Visual check using PACF (just like ACF based test): the lines $\pm 2\sqrt{\frac{1}{n}}$ are the rejection regions. If for a given p+1, $a_{p+1}$ lies inside the CI, then it is an AR(p) process.

\underline{Calculating PACF}: 

The PACF at lag h is the coefficient $a_h$, in solution of the equations

\begin{equation}
\rho_j = a_1 \rho_{j-1} + \dots + a_h \rho_{j-h} \quad j=1,\dots,h
\end{equation}

Method: solve this equation for $r_1, r_2$ or $a_1$, $a_2$.

\begin{equation}
\begin{pmatrix}
r_1 \\
r_2\\
\end{pmatrix}
=
\begin{pmatrix}
1 & r_1 \\
r_1 & 1 \\
\end{pmatrix}
\begin{pmatrix}
a_1 \\
a_2\\
\end{pmatrix}
\end{equation}

Steps for finding PACF: 
\begin{enumerate}
\item Get the two equations
\item Then solve for $a_1$ in first
\item Plug $a_1$ into second equation and solve for $a_2$.
\end{enumerate}


\item Principle of parsimony: If AR, MA are not plausible, seek ARMA(p,q), but starting with simplest, i.e., p=q=1.
\end{enumerate}

\subsection{Fitting}

\subsubsection{MLE}

\subsection{Error analysis}

\subsection{Model selection}


\section{Forecasting}

Given data $y_{1:t}$, we want to forecast/predict $\hat y_{t+i}$. $t$ is called the \textbf{forecast origin} and $i$ is called \textbf{lead time}.

\subsection{Criterion for choice of $\hat y_{t+i}$: Minimum mean square error predictor}

Minimize: 

\begin{equation}
E[(\hat y_{t+i} - y_{t+i})^2 \mid y_{1:t}]
\end{equation}

The conditioning takes into account that we have observed $y_{1:t}$ and want to build this information in. 

$\hat y_{t+i}$, and $i\geq 1$ is called ``out of 
sample'' forecast in economics. 
If Z is a random variable, $Var(Z)<\infty$, 

\begin{equation}
f(a) = E[(z-a)^2]
\end{equation}

is minimized at $a = E[z]$. Proof:

\begin{equation}
\begin{split}
f(a) =& E[z^2 - 2az + a^2]\\
=& E[z^2] - 2a E[z] + a^2\\
\end{split}
\end{equation}

Taking derivative:

\begin{equation}
\frac{d(f(a))}{da} = -2 E[z] + 2a 
\end{equation}

Equating this to $0$, we get $a = E[z]$.

The second derivative is positive: 
$\frac{d^2(f(a))}{da^2}=2$. Hence this is a minimum.

So, 

\begin{equation}
\hat y_{t+i} = E[y_{t+i} \mid  y_{1:t}]
\end{equation}

\subsection{Forecasting ARMA(p,q) process}

Model:

\begin{equation}\label{armaequation}
y_t = 
\underbrace{a_1 y_{t-1} + \dots + a_p y_{t-p} + \eps_{t}}_\text{AR(p)} + \underbrace{b_1 \eps_{t-1} + \dots + b_q \eps_{t-q}}_\text{MA(q)}
\end{equation}

i.e., 

\begin{equation}
\phi(B) y_t = h(B) \epsilon_t
\end{equation}

Assume that we know $a,b,\sigma^2$ and that $E[y_t]= 0$.

Invertibility guarantees that (via the $AR(\infty)$ representation): 

\begin{equation}
\epsilon_t = h(B)^{-1} \phi(B) y_t = 
\sum_{j=0}^{\infty} \delta_j y_{t-j}
\end{equation}

($\delta_j$ are determined by $a$'s and $b$'s.). So if we know $y_t$, we know $\epsilon_t$.

Also, from the $MA(\infty)$ representation:

\begin{equation}
y_t = 
\sum_{i=0}^{\infty} \zeta_i \epsilon_{t-i}
\end{equation}

So, we can go from $y_t$ to $\epsilon_t$. This becomes relevant below.

\begin{equation}
\hat y_{t+1} = E[y_{t+1}\mid y_{1:t}]
\end{equation}

Replacing $t+1$ for $t$ in equation \ref{armaequation}:

\begin{equation}
\begin{split}
~& y_{t+1} = 
a_1 y_{(t+1)-1} + \dots + a_p y_{(t+1)-p} \\
+~ \eps_{(t+1)} + b_1 \eps_{(t+1)-1} + \dots + b_q \eps_{(t+1)-q}\\
\end{split}
\end{equation}


\begin{equation}
\begin{split}
\hat y_{t+1} =& E[y_{t+1}\mid y_{1:t}] \\
=& 
E[a_1 y_{(t+1)-1} + \dots + a_p y_{(t+1)-p} + \eps_{(t+1)} + \\
~& b_1 \eps_{(t+1)-1} + \dots + b_q \eps_{(t+1)-q} \mid y_{1:t}]\\
=&
E[a_1 y_{t} + \dots + a_p y_{t+1-p} + \eps_{(t+1)} + \\
~& b_1 \eps_{t} + \dots + b_q \eps_{t+1-q} \mid y_{1:t}]\\
\end{split}
\end{equation}

Now, if we want $E[y_t \mid y_{1:t}]$, this will simply be the observed value $y_t$ in the vector $y_{1:t}$, since that is all the information we have about it. 

\begin{framed}
Remember the notion of conditional expectation of a RV. This is different than the expectation of a RV. If say you have Y a RV and X another RV, then you can define $E(Y \mid X=x)$ the expectation of Y given that the RV X took the value x. Now it is clear that if Y is a linear combination of X (still a RV) and since X is observed to be equal to x, \textbf{then Y is no more random, as it is a function of the observed x}. This is different than taking E(Y) in which Y is random (even if it is a function of X) as now we are not conditioning upon X=x.
\end{framed}

%\textbf{Note that $y_t$ is not a random variable, it is an observed value}.

So, we can write:

\begin{equation}
\begin{split}
\hat y_{t+1} =& E[y_{t+1}\mid y_{1:t}] \\
=&
E[a_1 y_{t} + \dots + a_p y_{t+1-p} + \eps_{(t+1)} + \\
~& b_1 \eps_{t} + \dots + b_q \eps_{t+1-q} \mid y_{1:t}]\\
=& E[a_1 y_{t}\mid y_{1:t}] + \dots + E[a_p y_{t+1-p}\mid y_{1:t}] \\
~& + E[\eps_{(t+1)}\mid y_{1:t}] \\
~& + E[b_1 \eps_{t}\mid y_{1:t}] + \dots + E[b_q \eps_{t+1-q}\mid y_{1:t}]\\
=& a_1 y_{t} + \dots + a_p y_{t+1-p} +
b_1 \eps_{t} + \dots + b_q \eps_{t+1-q}
\end{split}
\end{equation}

The term $E[\eps_{(t+1)}\mid y_{1:t}]=0$, because 

\begin{equation}
\begin{split}
E[\eps_{(t+1)}\mid y_{1:t}] =& E[\eps_{(t+1)} 
\mid \eps_1,\dots, \eps_t]\\
=& E[\eps_{(t+1)}] \quad \hbox{ independence }\\
=& 0 \hbox{ by WN }
\end{split}
\end{equation}

This gives us the:

\subsubsection{One-step predictor} \label{onestepARMA}

\begin{equation}
\begin{split}
\hat y_{t+1} =& E[y_{t+1}\mid y_{1:t}]\\
=& E[a_1 y_t + \dots + a_p y_{t+1-p} \\
~& +\epsilon_{t+1}\\
~& +b_1 \epsilon_t + \dots + b_q \epsilon_{t+1-q}\mid y_{1:t}]\\
\end{split}
\end{equation}

[Notice that $E[a_1 y_t + \dots + a_p y_{t+1-p}\mid y_{1:t}]$ is $a_1 y_t + \dots + a_p y_{t+1-p}$ because we have already observed $y_{1:t}$.]

$E[\epsilon_{t+1}\mid y_{1:t}]=0$ because $\epsilon_{t+1}$ comes after $y_{1:t}$.

\begin{equation}
\hat y_{t+1} = 
a_1 y_{t} + \dots + a_p y_{t+1-p} +
b_1 \eps_{t} + \dots + b_q \eps_{t+1-q}
\end{equation}

The MA part of the terms, $b_1 \eps_{t} + \dots + b_q \eps_{t+1-q}$, arise because $E[\epsilon_t\mid y_{1:t}]=\epsilon_t$, $E[\epsilon_{t-1}\mid y_{1:t}]=\epsilon_{t-1}$, etc.

\subsubsection{Two- and i-step predictor}

We are looking for the conditional distribution of $y_{t+i}$ given $y_{1:t}$.

For the $t+2$-th step predictor:

\begin{equation}
\begin{split}
\hat y_{t+2} 
=& 
a_1 \hat y_{t+1} + a_2 y_{t} \dots + a_p y_{t+2-p} \\
~& +
b_2 \eps_{t} + \dots + b_q \eps_{t+2-q}
\end{split}
\end{equation}

For $t+i$-th step predictor:

\begin{equation}
\begin{split}
\hat y_{t+i} =& E[y_{t+i}\mid y_{1:t}]\\
=&
E[a_1  y_{t+i-1} +  a_2 y_{t+i-2} + \dots + a_p y_{t+i-p} + \epsilon_{t+i} \\
+& b_1 \eps_{t+i-1} + \dots + b_q \eps_{t+i-q}\mid y_{1:t}]\\
=& 
E[a_1 y_{t+i-1} +  a_2 y_{t+i-2} + \dots + a_p y_{t+i-p} \mid y_{1:t} ]\\
+& E[\epsilon_{t+i}\mid y_{1:t}] + E[b_1 \eps_{t+i-1} + \dots + b_q \eps_{t+i-q}\mid y_{1:t}]\\
=&  a_1 \hat y_{t+i-1} + a_2 \hat y_{t+i -2} + \dots + a_p \hat y_{t+i-p} \\
~& + b_i \eps_{t} + \dots + b_q \eps_{t+i-q}
\end{split}
\end{equation}

Here $\hat y_{s} = y_s$ for $s\leq t$ (these values have already been observed), and the MA part vanishes when $i>q$.

\textbf{Note that we begin the MA part of the equation with $b_i$, not $b_1$}. That's because all ``preceding'' expectations are 0: $E[\eps_{t+k}\mid y_{1:t}]=0$ for $k\geq 1$. (to-do: spell this out, see p.\ 66)

\subsection{Prediction variance and prediction intervals}

We have to compute $V=Var(y_{t+i}\mid y_{1:t})$, and then

\begin{equation}
\hat y_{t+i} \pm 2 \sqrt{V} 
\end{equation}

\underline{Example AR(1)}

Suppose we have a zero mean AR(1) process, 

$y_t =a_1 y_{t-1} + \epsilon_t$

where $\epsilon_t$ is white noise and normally distributed. Assume that $a_1$ and $\sigma^2$ have been estimated using MLE. Given data $y_{1:n}$, we want to estimate a prediction interval for $\hat y_{n+i}$.

\underline{Calculate $\hat y_{n+i}$}

Express $\hat y_{n+i}$ as a function of the observed data $y_{1:n}$.

\begin{equation}
\begin{split}
y_{n+i} = & ay_{n+i -1} + \epsilon_{n+i}\\
= & a(ay_{(n+i-1)-1} + \epsilon_{(n+i)-1}) + \epsilon_{n+i}\\
= & a^2 y_{n+i-2} + a\epsilon_{n+i-1} + \epsilon_{n+i}\\
= & \dots \\
= & a^i y_n + \sum_{j=0}^{i-1} a^j \epsilon_{n+i-j}\\
\end{split}
\end{equation}

\begin{framed}
\begin{equation}
\hat y_{n+i}= E[y_{n+i}\mid y_{1:n}] = a^i y_n
\end{equation}

because $E[\epsilon_t]=0$ for all t.
\end{framed}

\underline{Calculate V}

\begin{equation}
\begin{split}
V =& Var(y_{n+i}\mid y_{1:n})\\
=& \sum_{j=0}^{i-1} a^{2j} Var(\epsilon_{n+i-j})\\
=& \sigma^2 \sum_{j=0}^{i-1} a^{2j} \\
=& \sigma^2 \frac{1-a^{2i}}{(1-a^2)}
\end{split}
\end{equation}

The last line is the sum of a finite geometric series: $\sum_{j=0}^{i-1} a^{2j}$. (to-do: unpack this).

\begin{framed}
\begin{equation}
\hat y_{n+i} \pm 2\sqrt{V} = a^i y_n \pm 2\sigma \sqrt{\frac{1-a^{2i}}{1-a^2}} 
\end{equation}
\end{framed}

\subsection{Prediction for ARIMA and SARIMA processes}

Let

\begin{equation}
y_t \sim ARIMA(p,d,q)
\end{equation}

Transforming gives us

\begin{equation}
x_t = (1-B)^d y_t \sim ARMA(p,q)
\end{equation}

Recall that $B^i y_t = y_{t-i}$.

Also, recall that $\hat x_{t+i}$ can be computed as shown in 
section~\ref{onestepARMA}.


\subsubsection{Consider the case where d=1}

Note that:
$\nabla y_t = x_t = y_t - y_{t-1}$

and

$y_t = x_t + y_{t-1}$.

Write $x_{t+i} = y_{t+i} - y_{t+i-1}$ as

$y_{t+i}= x_{t+i} + y_{t+i-1}$

\begin{equation}
E[y_{t+i}\mid y_{1:t}] = E[x_{t+i} \mid y_{1:t}]+ E[y_{t+i-1}\mid y_{1:t}] 
\end{equation}

This gives us:

\begin{framed}
\begin{equation}
\begin{split}
\hat y_{t+i}=& \hat x_{t+i} + \hat y_{t+i-1}\\
 =& \hat x_{t+i} + y_{t+i-1} \quad \hbox{ (for } i=1 \hbox{, where } y_t \hbox{ is observed)}\\
 \end{split}
\end{equation}
\end{framed}

For i=1, we have 

\begin{equation}
\begin{split}
\hat y_{t+1}=& \hat x_{t+1} + \hat y_{t+1-1}  \\
=& \hat x_{t+1} + y_{t} \quad \hbox{ since } y_t \hbox{ is observed}\\
\end{split}
\end{equation}

For i=2, we have

\begin{equation}
\begin{split}
\hat y_{t+2}=& \hat x_{t+2} + \hat y_{t+2-1}  \\
=& \hat x_{t+2} + \hat y_{t+1}  \\
\end{split}
\end{equation}

Since $\hat y_{t+1} = \hat x_{t+1}+y_t$, we can write the last line as

\begin{equation}
\hat x_{t+2} + \hat y_{t+1}= \hat x_{t+2} + \hat x_{t+1}+y_t
\end{equation}

and so on.

\subsubsection{Consider the case where d>1}

If we have a trend TS $y_t$ and difference it d times to remove the trend, we get $x_t$:

\begin{equation}
x_t = (1-B)^d y_t 
\end{equation}

expanding the RHS using the binomial theorem:

\begin{equation}
x_t  =  \sum_{j=0}^{d} {d \choose j} (-1)^j y_{t-j} 
\end{equation}

Now, 

$\hat x_{t+1}=\hat y_{t+1}+ \sum_{j=1}^{d} {d \choose j} (-1)^j y_{t+1-j}$.

Therefore,

$\hat y_{t+1} = \hat x_{t+1} - \sum_{j=1}^{d} {d \choose j} (-1)^j y_{t+1-j}$

More generally, for i:

$\hat y_{t+i} = \hat x_{t+i} - \sum_{j=1}^{d} {d \choose j} (-1)^j \hat y_{t+i-j}$, where $\hat y_{s} = y_s$ for $s\leq t$.

\subsubsection{Prediction in SARIMA}

If $y_t$ is seasonal, we would transform as follows:

\begin{equation}
\begin{split}
x_t =& (1-B^s)^D y_t\\
=& \sum_{j=0}^D {D \choose j}(-1)^j  y_{t-sj} \\
\end{split}
\end{equation}

For example, for $x_t = (1-B^4)^3 y_t$, the predicted values would be:

\begin{equation}
\begin{split}
x_t =& (1-B^4)^3 y_t\\
=& \sum_{j=0}^3 {3 \choose j}(-1)^j B^{4j} y_t \\
=& {3 \choose 0} y_t + {3 \choose 1} (-1) y_{t-4} + {3 \choose 2} (-1)^2 y_{t-8} + {3 \choose 3} (-1)^3 y_{t-12}\\
=& y_t - 3 y_{t-4} + 3 y_{t-8} - y_{t-12}\\ 
\end{split}
\end{equation}

If we put s=1 and D=d   then we would get the first formula above.

\section{State Space Models}

\subsection{Recall linear regression}

\begin{enumerate}
\item
an $n\times 1$ response vector $y_{1:t}$
\item 
$p$ predictor variables $x_{it}, i=1,\dots,p$. So, we have

$x_{11}, x_{12},\dots,x_{1t}$

$x_{21}, x_{22},\dots,x_{2t}$

etc., so that
$x_t = (x_{1t},\dots,x_{pt})^{T}$. 

For example, $[1~1]^T$ for t=1. For p=2, t=5:

<<>>=
## x^T:
matrix(rep(1,2*5),ncol=2)
@
\end{enumerate}


\begin{equation}
y_t = x_{1t}\beta_1 + \dots + x_{pt} \beta_p + \eps_t = x_t^T \beta+ \eps_t
\end{equation}

where $\beta=(\beta_1,\dots,\beta_p)^T$, a p-variate vector of regression coefficients.

$\eps_t$ is the error or innovation (white noise).

Estimation is done by minimizing sum of squares:

\begin{equation}
S(\beta) = \sum_{t=1}^{n} (y_t - x_t^T \beta)^2
\end{equation}

The vector $\beta$ is time-invariant:  $\beta_n=\beta_{n-1}=\dots=\beta_1=\beta_0$.

\subsection{An almost time-invariant model}

When we have slowly varying coefficients:

\begin{equation}
\beta_t \approx \beta_{t-1}
\end{equation}

this can be described by the model

\begin{equation}
\beta_t = \beta_{t-1} + \zeta_t 
\end{equation}

$\zeta_t$ is a random variable, and so $\beta_t$ is also a random variable and is known as a \textbf{state vector}.

This gives us:

\begin{equation}
\begin{split}
y_t =& x_t^T \beta_t + \eps_t \quad \hbox{observation model}\\
\beta_t =& \beta_{t-1} + \zeta_t \quad \hbox{transition model}\\
\end{split}
\end{equation}

$\beta_t$ is modeled as a random walk.

If we set x=1 for all t:
$x_1 = x_2 = \dots = x_t=1$. Then, we get the 
\textbf{local level plus noise model}:

\begin{equation}
y_t = \beta_t + \eps_t \quad 
\beta_t = \beta_{t-1} + \zeta_t
\end{equation}

\textbf{The level of the time series}: $E[y_t\mid \beta_t]= \beta_t$. 

\subsection{The general state space model}


Replaces the random walk of $\beta_t$ with a \textbf{more general Markov chain}. That is:

\begin{equation}
\begin{split}
y_t =& x_t^T \beta_t + \eps_t \quad \hbox{observation model}\\
\beta_t =& F_t\beta_{t-1}+ \zeta_t \quad \hbox{transition model}\\ 
\end{split}
\end{equation}

$F_t$ is the transition matrix.

$\epsilon_t\sim WN$ and $\zeta_s\sim WN$, and are independent for all s,t. 

$\epsilon_t, \zeta_s, \beta_0$ are all normally distributed. 

$\beta_t$ is a random variable, and $\eps_t \sim N(0,\sigma^2)$
and
$\zeta_t \sim N(0,Z)$.
E.g., 

\begin{equation}
Z=
\begin{pmatrix}
\sigma_1^2 & 0 \\
0 & \sigma_2^2
\end{pmatrix}
\end{equation}

Z is the transition covariance matrix.

$\sigma^2$ and $Z$ can be time-varying; in such a case, we would write  $\sigma_t^2$ and $Z_t$.
In finance, $\sigma^2$ represents volatility. 

For now, assume $\sigma^2$ is time-invariant, and $Z_t$ is time-varying. Since $\beta$ are random variables, we have to specify a prior for $\beta_0$:

\begin{equation}
\beta_{0} \sim N(\hat \beta_{0\mid 0}, P_{0\mid 0})
\end{equation}

Estimation happens sequentially: each $y_t$ corresponds to a state $\beta_t$. In filtering we want to estimate $\beta_t$ given the data up to time $t$: $y_{1:t}$.


\subsection{Filtering: Kalman Filter (Theorem 5.1)}

The objective will be to get posterior distribution of $\beta_t$ given $y_{1:t}$ for any t. Given new datum $y_t$ (one data-point) compute posterior using Bayes:

\begin{equation}
P(\beta_t \mid y_{1:t}) \propto P(\beta_t\mid y_{1:t-1})P(y_t\mid \beta_t)
\end{equation}

May be easier to see if we single out the datum $y_t$ by writing it separately:

\begin{equation}
P(\beta_t \mid y_{1:t-1},y_t) \propto P(\beta_t\mid y_{1:t-1})P(y_t\mid \beta_t)
\end{equation}

The Kalman filter is just an application of Bayes' Theorem for computing the posterior sequentially.

\underline{\textbf{The Kalman Filter Theorem}}

Given:

\begin{equation}
\begin{split}
y_t =& x_t^T \beta_t + \eps_t \quad \hbox{(observational model)}\\
\beta_t =& F\beta_{t-1} + \zeta_t \quad \hbox{(transition model)}\\
\beta_{0} \sim & N(\hat \beta_{0\mid 0}, P_{0\mid 0})\\
\end{split}
\end{equation}

For each $t=1,\dots,n$, the following applies:

\begin{enumerate}
\item 
The \textbf{forecast distribution} of $\beta_t$ at $t-1$ is 

\begin{equation}
\beta_t \mid y_{1:t-1}\sim N(\hat \beta_{t\mid t-1}, P_{t\mid t-1})
\end{equation}

where 
\begin{enumerate}
\item
$\hat \beta_{t\mid t-1} = F_t\hat \beta_{t-1\mid t-1}$
\item
$P_{t\mid t-1} = F_tP_{t-1\mid t-1} F_t^T + Z_t$
\end{enumerate}

(The transition matrix F gives the next value.)

\item The \textbf{posterior distribution} of $\beta_t$ at time $t$ is 

\begin{equation}
\beta_t \mid y_{1:t}  \sim N(\hat \beta_{t\mid t}, P_{t\mid t})
\end{equation}

\noindent
where 

\begin{enumerate}
\item
$\hat \beta_{t\mid t} = \hat \beta_{t\mid t-1} + k_t e_t = \hat \beta_{t\mid t-1} + k_t (y_t - \hat y_{t\mid t-1})$

[If the prediction error $e_t = y_t - \hat y_{t\mid t-1}$ is 0, then the posterior mean is identical to the prior (forecast) mean; otherwise $k_t e_t$ is the amount by which we adjust the prior mean to obtain the posterior mean.]

\item 
$\hat y_{t\mid t-1} = x_t^T \hat \beta_{t\mid t-1}$
\item 
$e_t = y_t - \hat y_{t\mid t-1}$
\item Forecast variance: This will stabilize quickly to fixed values:

\begin{equation}
\begin{split}
q_{t\mid t-1} =& V(y_t\mid y_{1:t-1})\\
=&  x_t^T P_{t\mid t-1} x_t + \sigma^2\\
\end{split}
\end{equation}

\textbf{Example}: Compute $q_{1\mid 0}$, given $x=1, F=1, P_{0\mid 0}=1000, Z=9, \sigma^2 = 1$.

Recall (above) that 

\begin{equation}
P_{t\mid t-1} = FP_{t-1\mid t-1} F^T + Z_t
\end{equation}

It follows that

$P_{1\mid 0}=1\times 1000 \times 1 + 9$.

Plugging in the values into the formula for q:

\begin{equation}
\begin{split}
q_{1\mid 0} =& V(y_1\mid y_{1:0})\\
=&  x_1^T P_{1\mid 0} x_1 + \sigma^2\\
=&  1 \times (1000+9) \times 1 + 1\\
=& 1010
\end{split}
\end{equation}

\item Kalman gain:
$k_t = \frac{P_{t\mid t-1} x_t}{q_{t\mid t-1}}$
\item 
$P_{t\mid t} = P_{t\mid t-1} - q_{t\mid t-1} k_t k_t^T$

Note that 
$P_{t\mid t-1} = U_p D_p^2 U_p^T$
\end{enumerate}
\end{enumerate}

Some notes:

\begin{enumerate}
\item 
$\hat \beta_{t\mid t-1}$ is the \textbf{mean} at time $t$ given data up to time $t-1$: $E[\beta_t \mid y_{1:t-1}]$.
\item
When $y_t$ is observed, the data set goes from

$y_{1:t-1}$ to $y_{1:t}$ and then $\hat \beta_{t\mid t}$ is the mean $E[\beta_t \mid y_{1:t}]$.
\item
\begin{enumerate}
\item
So, $t\mid t-1$ means: \textbf{forecast} at time t given information $y_{1:t-1}$.
\item
$t\mid t$ means \textbf{filtered estimate} at time $t$ given information $y_{1:t}$.
\end{enumerate}
\item 
$\hat y_{t\mid t-1}$ is the one-step ahead forecast of $y_t$ given information $y_{1:t-1}$.
\item
$e_t$ is the one-step ahead prediction error.
\item 
The conditional distribution of $\beta_t\mid y_{1:t}$ is the posterior distribution of $\beta_t$ at time $t$. 

\begin{equation}
p(\beta_t \mid y_{1:t}) \propto \explain{p(y_t \mid \beta_t)}{\hbox{Lik.}} 
\explain{p(\beta_t\mid y_{1:t-1})}{\hbox{Prior}}
\end{equation}

\begin{enumerate}
\item
The likelihood is from a single observation
\item
The prior is at time $t$.
\end{enumerate}

\end{enumerate}

So, the Kalman filter gives:

%\caption{default}
\begin{center}
\begin{tabular}{cccccc}
$t$ & 0 & 1 & 2 & 3 & 4\\
$y_t$ & - & $y_1$ & $y_2$ & $y_3$ & $y_4$\\
$\hat\beta_{t\mid t}$ & $\hat \beta_{0\mid 0}$ & $\hat \beta_{1\mid 1}$ & $\hat \beta_{2\mid 2}$ & \dots &   \\
$P_{t\mid t}$ & $P_{0\mid 0}$ & $P_{1\mid 1}$ & $P_{2\mid 2}$ & \dots & \\
\end{tabular}
\end{center}
%\label{default}

The great thing about the Kalman filter is that to compute $\hat \beta_{2\mid 2}$ we only need $\hat \beta_{1\mid 1}$, and to compute $P_{2\mid 2}$ only need $P_{1\mid 1}$. 

%\begin{equation}
%F=
%\begin{pmatrix}
%1 & 1\\
%0 & 1\\
%\end{pmatrix}
%\quad 
%\beta_{t-1}= 
%\begin{pmatrix}
%\beta_0_{t-1}\\
%\beta_1_{t-1}\\
%\end{pmatrix}
%\end{equation}

\subsection{Step-by-step example using Kalman filter}

We take a specific example. To be specific, assume that 
$x_t=1, F_t=1, Z_t=9, \sigma^2 = 1$.  Let prior be $\hat \beta_{0\mid 0} = 0$, $P_{0\mid 0}=1000$.

<<>>=
## design value x_t and transition value F_t (both equal to 1):
xt <- 1
Ft <- 1
## transition variance Z_t and observation variance sigma^2:
Zt <- 9
sigma2 <- 1
## prior for beta_0: mean and variance
hatbeta00 <- 0
P00 <- 1000
@

\begin{equation}
\begin{split}
y_t =& x_t^T \beta_t + \eps_t \quad \hbox{(observational model)}\\
\beta_t =& F\beta_{t-1} + \zeta_t \quad \hbox{(transition model)}\\
\beta_{0} \sim & N(\hat \beta_{0\mid 0}, P_{0\mid 0})\\
\end{split}
\end{equation}

At the outset, we have the prior distribution of $\beta_0$:

\begin{equation}
\beta_0 \sim N(\hat\beta_{0\mid 0},P_{0\mid 0})
\end{equation}

\begin{enumerate}
\item Step 0: We have no data: $y_0$. 

\item Step 1: Set t=1, and compute forecast distribution at t-1=0.

Compute \textbf{forecast distribution}:

\begin{equation}
\begin{split}
~& \beta_t \mid y_{1:t-1}\sim N(\hat \beta_{t\mid t-1}, P_{t\mid t-1})\\
~& \beta_1 \mid y_{1:0}\sim N(\hat \beta_{1\mid 0}, P_{1\mid 0}) \\
\end{split}
\end{equation}

where 
\begin{enumerate}
\item
\begin{equation}
\begin{split}
\hat \beta_{t\mid t-1} =& F_t\hat \beta_{t-1\mid t-1}\\
\hat \beta_{1\mid 0} =& 1 \hat \beta_{0\mid 0}\\
=& \hat \beta_{0\mid 0} = 0\\
\end{split}
\end{equation}

\item

\begin{equation}
\begin{split}
P_{t\mid t-1} =& F_tP_{t-1\mid t-1} F_t^T + Z_t\\
P_{1\mid 0} =& 1 P_{0\mid 0} 1 + 9\\
=& P_{0\mid 0}  + 9 = 1009\\
\end{split}
\end{equation}
\end{enumerate}

So, we have computed $\beta_1 \mid y_{1:0} \sim N(\hat \beta_{1\mid 0}, P_{1\mid 0}) = N(\hat \beta_{0\mid 0}, P_{0\mid 0}+9) = N(0, 1009)$.

<<>>=
## forecast mean of the state: F * beta_{0|0}
hatbeta10 <- Ft * hatbeta00
hatbeta10
## forecast variance of the state: F * P_{0|0} * F + Z
P10 <- Ft * P00 * Ft + Zt
P10
@

\item Step 2: A single data point comes in: $y_1$. Suppose it has value 10.

<<>>=
## the first observation
y1 <- 10
@

The next steps are:

\begin{enumerate}
\item Compute $q_{t\mid t-1}=q_{1\mid 0}$. 

[This is needed to compute $k_1$, which is needed for computing $\beta_{1\mid 1}$.]

\begin{equation}
\begin{split}
q_{t\mid t-1} =& V(y_t\mid y_{1:t-1})\\
q_{1\mid 0} =& V(y_1\mid y_{1:0})\\
=&  x_t^T P_{t\mid t-1} x_t + \sigma^2\\
=&  x_1^T P_{1\mid 0} x_1 + 1\\
=&  1 (P_{0\mid 0} + 9)1 + 1\\
=&  P_{0\mid 0} + 9 + 1\\
=&  P_{0\mid 0} + 10\\
=&  1000 + 10 = 1010\\
\end{split}
\end{equation}

<<>>=
## forecast variance of y_1: x * P_{1|0} * x + sigma^2
q10 <- xt * P10 * xt + sigma2
q10
@

\item Compute $k_t = k_1$

\begin{equation}
\begin{split}
k_t =& \frac{P_{t\mid t-1} x_t}{q_{t\mid t-1}}\\
k_1 =& \frac{P_{1\mid 0} x_t}{q_{1\mid 0}} =  \frac{(P_{0\mid 0} + 9) 1}{(P_{0\mid 0} + 10)}\\
k_1 =& \frac{(1000 + 9) \times 1}{(1000 + 10)}= \frac{1009 \times 1}{1010}\\ 
=& 0.9990099\\
\end{split}
\end{equation}

<<>>=
## Kalman gain: P_{1|0} * x / q_{1|0}
k1 <- (P10 * xt) / q10
k1
@

\item Compute (this is the goal) the \textbf{posterior distribution}:
$\beta_t \mid y_{1:t} \sim N(\hat \beta_{t\mid t}, P_{t\mid t})$, 

that is 

$\beta_1 \mid y_{1:1} \sim N(\hat \beta_{1\mid 1}, P_{1\mid 1})$.

Sub-steps:
\begin{enumerate}
\item Compute 

\begin{equation}
\begin{split}
\hat y_{t\mid t-1} =& x_t^T \hat \beta_{t\mid t-1}\\
\hat y_{1\mid 0} =& x_t^T \hat \beta_{1\mid 0}\\
=& 1 \times  \hat \beta_{1\mid 0} \quad \hbox{computed above}\\
=& 1\times 0 = 0
\end{split}
\end{equation}

<<>>=
## one-step forecast of y_1: x * beta_{1|0}
haty10 <- xt * hatbeta10
haty10
@

\item 
Compute $\epsilon_1=y_1 - \hat y_{1\mid 0}$ (needed in next step)

\begin{equation}
\epsilon_t=y_t - \hat y_{t\mid t-1}=
\epsilon_1=y_1 - \hat y_{1\mid 0}
\end{equation}

\begin{equation}
\epsilon_1= y_1 - \hat y_{1\mid 0} = y_1 - 0 = y_1
\end{equation}

<<>>=
## difference between the observation and its forecast
eps1 <- y1 - haty10
eps1
@

\item Compute $\hat \beta_{1\mid 1}$

\begin{equation}
\begin{split}
\hat \beta_{t\mid t} =& \hat\beta_{t\mid t-1} + k_t \epsilon_t \\
\hat \beta_{1\mid 1} =& \hat\beta_{1\mid 0} + k_1 \epsilon_1 \\
=& 0 + 0.999 \times y_1 \\
=& 0.999 y_1 \\
\end{split}
\end{equation}

<<>>=
## posterior mean: forecast mean plus gain times the difference
hatbeta11 <- hatbeta10 + k1 * eps1
hatbeta11
@

[Note that the Kalman gain is causing the shrinkage of the forecast]

\item Compute $P_{1\mid 1}$

\begin{equation}
\begin{split}
P_{t\mid t} =& P_{t\mid t-1} - q_{t\mid t-1} k_t k_t^T\\
P_{1\mid 1} =& P_{1\mid 0} - q_{1\mid 0} k_1 k_1^T\\
P_{1\mid 1} =& 1009 - 1010 \times k_1 \times k_1\\
=& 1009 - 1008.001\\ 
=&  0.999\\ 
\end{split}
\end{equation}

<<>>=
## posterior variance: P_{1|0} - q_{1|0} * k_1 * k_1
P11 <- P10 - (q10 * k1 * k1)
P11
@

\end{enumerate}


\end{enumerate}
So now we know that 
$\beta_t \mid y_{1:t} \sim N(\hat \beta_{t\mid t}, P_{t\mid t})$, or rather 
$\beta_1 \mid y_{1:1} \sim N(\hat \beta_{1\mid 1}, P_{1\mid 1})$ is

\begin{equation}
\begin{split}
\beta_1 \mid y_{1:1} \sim & N(\hat \beta_{1\mid 1}, P_{1\mid 1})\\
\beta_1 \mid y_{1:1} \sim & N(0.999 y_1 , 0.999)\\
\beta_1 \mid y_{1:1} \sim & N(9.99 , 0.999)\\
\end{split}
\end{equation}

The above is now our posterior distribution, and the prior distribution for the next step.

\item Step 3: Data $y_2$ comes in. Repeat Step 2, setting t=2, to compute $\hat \beta_{t\mid t}, P_{t\mid t}$. And so on.
\end{enumerate}

\subsection{R example: Sheffield historical temperatures}

This example also shows how to estimate $\sigma^2$ and Z using MLE, and the effect that has on the prediction intervals when forecasting.

Sheffield historical temperatures example:

<<echo=FALSE>>=
## directory holding the data files (absolute path, with trailing slash)
dir <- "/Users/shravanvasishth/Dropbox/MScStatistics/2014-2015/MAS6011/Semester2/Data/"
@

<<>>=
library(dlm)
## read the data; the series is in
## column 2 of temp.txt and is turned
## into a ts starting at 1659:
temp <- read.table(paste0(dir, "temp.txt"))
temp <- ts(temp[, 2],
           start = 1659,
           frequency = 1)
## local level model:
x      <- matrix(1)
F      <- matrix(1)
sigma2 <- 1
Z      <- 10
beta0  <- 9
P0     <- 1000

## NOTE(review): the leading unnamed
## argument temp is passed as in the
## original; dlm() presumably uses only
## the named components -- confirm.
mod <- dlm(temp,
           FF = x, GG = F,
           V = sigma2, W = Z,
           m0 = beta0, C0 = P0)

fit <- dlmFilter(temp, mod)
## this is the *negative* of log lik:
modLL <- dlmLL(y = temp, mod = mod)
modLL
## so loglik
-modLL
@

Next, we estimate $\sigma^2$ and $Z$ using MLE:

<<>>=
## Map the parameter vector (sigma2, Z)
## to the components of the model; x, F,
## beta0 and P0 come from the workspace.
build <- function(parm) {
  list(FF = x, GG = F,
       V = parm[1],  ## sigma2
       W = parm[2],  ## Z
       m0 = beta0, C0 = P0)
}

## start values c(3,2), with lower
## bounds on both parameters:
maxlikest <- dlmMLE(y = temp,
                    parm = c(3, 2),
                    build = build,
                    lower = c(1e-6, 0))
sigma2 <- maxlikest$par[1]
sigma2
Z <- maxlikest$par[2]
Z
@

Refit model with MLEs for sigma2 and Z:

<<>>=
## same model as before, but with V and W
## set to the current sigma2 and Z:
mod2 <- dlm(temp,
            FF = x, GG = F,
            V = sigma2, W = Z,
            m0 = beta0, C0 = P0)
fit2 <- dlmFilter(temp, mod2)
mod2LL <- dlmLL(y = temp, mod = mod2)
mod2LL
## higher than -modLL
-mod2LL
@

Compare the original forecasts and MLE-based forecasts:
<<>>=
## forecasts from the first fit (line in the default colour):
plot(1:length(fit$f),fit$f,type="l",
     main="Comparing MLE with non-MLE ests.",
     ylim=c(6.5,11))
## forecasts from the MLE-based fit (red line):
lines(1:length(fit2$f),fit2$f,lty=1,col="red")
## the observed series:
points(1:length(temp),temp)
@

<<>>=
## Q component of the 3-step forecast
## from the first model:
modforecast <- dlmForecast(mod = mod,
                           nAhead = 3)
unlist(modforecast$Q)

## the same for the MLE-based model;
## a bit smaller variance:
mod2forecast <- dlmForecast(mod = mod2,
                            nAhead = 3)
unlist(mod2forecast$Q)
@


\subsection{Forecasting}

Given data: $y_{1:t} = \{ y_1,\dots, y_t\}$, need to derive the distribution of

\begin{enumerate}
\item $\beta_{t+i}$, the \textbf{i-step ahead forecast state}
\item $y_{t+i}$, the \textbf{i-step ahead observation distribution}
\end{enumerate}

The variable $i$ is called the \textbf{lead time}, and the maximum value $i$ takes is called the \textbf{forecast horizon}.

\subsubsection{Theorem 5.2}

Note that this assumes that $\beta_{t\mid t}$ and $P_{t\mid t}$ have been computed using Kalman filter.

Given $y_{1:t} = \{ y_1,\dots, y_t\}$, 

\begin{enumerate}
\item The i-step ahead forecast state distribution is 

\begin{equation}
\beta_{t+i}\mid y_{1:t} \sim N(\hat \beta_{t+i\mid t}, P_{t+i\mid t})
\end{equation}

where

\begin{equation}
\hat \beta_{t+i\mid t} = F^i \hat \beta_{t\mid t}
\end{equation}

and 
for the time-varying version of $Z_t$:

\begin{equation}
\begin{split}
P_{t+i\mid t} =& F^i P_{t\mid t} (F^i)^T \\
+& \sum_{k=0}^{i-1} F^k Z_{t+i-k} (F^k)^T
\end{split}
\end{equation}


If $Z_t =Z$, i.e., if Z is time-invariant, the covariance matrix 
$P_{t+i\mid t}$ takes the simplified form:

\begin{equation}
\begin{split}
P_{t+i\mid t} =& F^i P_{t\mid t} (F^i)^T \\
+& \sum_{k=0}^{i-1} F^k Z (F^k)^T
\end{split}
\end{equation}

\item The i-step ahead forecast observation distribution is

\begin{equation}
y_{t+i}\mid y_{1:t} \sim N(\hat y_{t+i\mid t},q_{t+i\mid t})
\end{equation}

where

\begin{enumerate}
\item $\hat y_{t+i\mid t} = x_{t+i}^T \hat \beta_{t+i\mid t}$
\item $q_{t+i\mid t} = x_{t+i}^T P_{t+i\mid t} x_{t+i}+\sigma^2$
\end{enumerate}
\end{enumerate}

Some comments:

\begin{enumerate}
\item $y_{t+i}\mid y_{1:t}$ is the \textbf{i-step ahead forecast distribution}.
\item $\beta_{t+i}\mid y_{1:t}$ is the \textbf{state forecast distribution} or the \textbf{forecast state distribution}.
\item The forecast mean is a function of i=1,2,\dots. It is therefore called the \textbf{forecast function}, and can be used for classifying different models aimed at forecasting.

The forecast function:

\begin{equation}
\hat y_{t+i\mid t} = x_{t+i}^T F^i \hat \beta_{t\mid t}
\end{equation}

\item $\hat \beta_{t\mid t-1}$, \textbf{the mean vector of the one-step forecast state distrn.}, and $P_{t\mid t-1}$, \textbf{the covariance matrix of the one-step forecast state distribution of $\beta_t$ given $y_{1:t-1}$}, are needed to calculate the predictive distribution of $\beta_t$ at time $t-1$: $\beta_t\mid y_{1:t-1}$.

Similarly, we have
$y_{t\mid t-1}$, the \textbf{mean of the one-step forecast observation distribution of $y_t$ given $y_{1:t-1}$}, and $q_{t\mid t-1}$, the variance of the one-step forecast distribution of $y_t$ given $y_{1:t-1}$.

\item $y_{t+i}$'s forecast interval is

\begin{equation}
\hat y_{t+i\mid t} \pm z \sqrt{q_{t+i\mid t}}
\end{equation}
where
$z=z_{1-\alpha/2}$ is the $100(1-\alpha/2)$th percentile of the standard Gaussian distribution $N(0,1)$.

\item At each time t we compute $\hat \beta_{t\mid t}$ and $P_{t\mid t}$ using the Kalman filter, then we calculate 

\begin{equation}
\begin{split}
~& \hat \beta_{t+i\mid t}\\
~& P_{t+i\mid t}\\
~& \hat y_{t+i\mid t}\\
~& q_{t+i\mid t}\\
\end{split}
\end{equation}
\end{enumerate}

So, the procedure is: compute

1. initial state distribution

2. i-step forecast state distribution

3. i-step forecast observation distribution

\section{SSMs: Model specification}

A local level model will have x=1, F=1.

In dynamic regression, x will be predictors, and F=I, which defines a random walk evolution.

\subsection{Trend SSMs}

\paragraph{Linear growth/local linear SSM}

\begin{equation}
x^T = (1,0), 
\quad 
F=
\begin{pmatrix}
1 & 1 \\
0 & 1 \\
\end{pmatrix},
\quad 
\beta_t = 
\begin{pmatrix}
\beta_{1t}\\
\beta_{2t}\\
\end{pmatrix},
\quad 
\zeta_t =
\begin{pmatrix}
\zeta_{1t}\\
\zeta_{2t}\\
\end{pmatrix}
\end{equation}

\begin{equation}
y_t = 
\begin{pmatrix}
1 & 0 \\
\end{pmatrix}
\begin{pmatrix}
\beta_{1t} \\
\beta_{2t} \\
\end{pmatrix}
+\epsilon_t = \beta_{1t}+\epsilon_t
\end{equation}

So $y_t$ fluctuates randomly about $\beta_{1t}$.

The transition model:

\begin{equation}
\begin{pmatrix}
\beta_{1t} \\
\beta_{2t} \\
\end{pmatrix}
=
\begin{pmatrix}
1 & 1 \\
0 & 1 \\
\end{pmatrix}
\begin{pmatrix}
\beta_{1t-1} \\
\beta_{2t-1} \\
\end{pmatrix}
+
\begin{pmatrix}
\zeta_{1t} \\
\zeta_{2t} \\
\end{pmatrix}
\end{equation}

This expands to:

\begin{equation}
\beta_{1t} = \beta_{1t-1} + \beta_{2t-1} + \zeta_{1t}
\end{equation}

\begin{equation}
\beta_{2t} =  \beta_{2t-1} + \zeta_{2t}
\end{equation}

\textbf{Example}: aluminium data:

<<>>=
## read the data; the series is in
## column 2 of alum.txt:
alum <- read.table(paste0(dir, "alum.txt"))

alum_ts <- ts(alum[, 2],
              start = 1, end = 210,
              frequency = 1)
ts.plot(alum_ts,type="l")

## linear growth model:
## x^T = (1,0), F = (1 1; 0 1)
x <- matrix(c(1, 0), nrow = 1, ncol = 2)
F <- matrix(c(1, 1,
              0, 1),
            nrow = 2, byrow = TRUE)
## diagonal transition covariance:
Z <- diag(c(10, 2))
## prior mean and covariance:
beta0 <- c(1800, 1)
P0 <- diag(1000, 2)
## NOTE(review): the leading unnamed
## argument alum is passed as in the
## original; dlm() presumably uses only
## the named components -- confirm.
mod <- dlm(alum,
           FF = x, GG = F, V = 1,
           W = Z, m0 = beta0, C0 = P0)

fit_alum <- dlmFilter(alum_ts, mod)

## forecasts (dashed line, filled dots)
## against the data (triangles):
ts.plot(fit_alum$f,lty=2)
points(alum_ts,pch=2)
points(fit_alum$f,pch=20)
@

Forecast function:
<<>>=
## posterior mean vector (beta_1, beta_2):
## NOTE(review): row 211 is used for a
## series of length 210, presumably
## because the first row of fit_alum$m
## holds the prior mean -- confirm
## against the dlmFilter documentation.
(beta_last <- fit_alum$m[211,])
## linear growth: the forecast function
## beta_1 + i * beta_2 for i = 1,...,10,
## computed from the fitted values
## rather than from numbers copied by
## hand from the printed output (this
## also leaves the matrix x untouched):
lead <- 1:10
fcast <- beta_last[1] + lead*beta_last[2]
plot(lead, fcast,
     xlab="lead time i",
     ylab="forecast mean")
@

\textbf{Forecast function of linear trend model}

Recall that from Kalman filter $\beta_t\mid y_{1:t} \sim N(\hat \beta_{t\mid t}, P_{t\mid t})$.

\begin{equation}
\begin{split}
\hat y_{t+i\mid t} =& x_{t+i}^T F^i \hat \beta_{t\mid t}\\
=& 
\begin{pmatrix}
1 & 0 \\
\end{pmatrix}
\begin{pmatrix}
1 & i \\
0 & 1 \\
\end{pmatrix}
\begin{pmatrix}
\hat\beta_{1t\mid t} \\
\hat\beta_{2t\mid t} \\
\end{pmatrix}\\
=& 
\begin{pmatrix}
1 & i \\
\end{pmatrix}
\begin{pmatrix}
\hat\beta_{1t\mid t} \\
\hat\beta_{2t\mid t} \\
\end{pmatrix}\\
=& \hat\beta_{1t\mid t} +
\hat\beta_{2t\mid t}i \\
\end{split}
\end{equation}

which is a straight line.

\subsection{Superposition of SSMs}

We define N independent SSMs.

\begin{equation}
y_{it} = x_{it}^T \beta_{it} + \epsilon_{it} \quad \epsilon_{it} \sim N(0,\sigma_i^2)
\end{equation}

\begin{equation}
\beta_{it} = F_i \beta_{it-1}+\zeta_{it} \quad \zeta_{it} \sim N(0,Z_{it})
\end{equation}

We have $i=1,\dots,N$ independent models, $\epsilon_{it}$ and $\epsilon_{js}$ are independent for any i,j=1,\dots,N and any t,s=1,\dots. The same holds for $\zeta_{it}$ and $\zeta_{js}$.

The superposed model is the sum of all the models:

\begin{equation}
y_t = \sum_{i=1}^N y_{it} = \sum_{i=1}^N (x_{it}^T \beta_{it} + \epsilon_{it})
\end{equation}

\begin{equation}
\begin{pmatrix}
\beta_{1t}\\
\vdots\\
\beta_{Nt}\\
\end{pmatrix}
=
\begin{pmatrix}
F_1    & 0     & 0       & \dots & 0\\
0      & F_2   & 0       & \dots & 0\\
0      &  0    & \ddots  & 0     & 0\\
\vdots & \dots & \dots   & F_{N-1} & 0 \\
0      & 0     & \dots   &       & F_N \\
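\end{pmatrix}
\begin{pmatrix}
\beta_{1,t-1}\\
\vdots\\
\beta_{N,t-1}\\
\end{pmatrix}
+
\begin{pmatrix}
\zeta_{1t}\\
\vdots\\
\zeta_{Nt}\\
\end{pmatrix}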
\end{equation}

\paragraph{Forecast function}

\begin{equation}
\hat y_{t+k\mid t} = \sum_{i=1}^N \hat y_{i,t+k\mid t}
\end{equation}

\begin{equation}
q_{t+k\mid t} = \sum_{i=1}^N q_{i,t+k\mid t}
\end{equation}

\begin{equation}
y_{t+k}\mid y_{1:t} 
\sim N(\hat y_{t+k\mid t}, q_{t+k\mid t}) 
\quad k=1,2,\dots 
\end{equation}

\textbf{Example of superposing: Local level + Linear trend}

$x^T = [1,\mid 1, 0]$ and 
$F=
\begin{pmatrix}
\mathbf{1} & 0 & 0 \\
0 & 1 & 1 \\
0 & 0 & 1 \\
\end{pmatrix}$

The bold-face 1 is the local level part, and the 2x2  matrix at the bottom right is the linear growth part.


\subsection{Fourier form seasonal models}

These are models exhibiting periodicity/seasonality. Definition of periodic function: $f(\cdot)$ is periodic if $f(t+T)=f(t)$.

A deterministic periodic function can be represented as a Fourier series:

\begin{equation}
a_0 + \sum_{i=1}^{\infty} [a_i \cos(i\omega t) + b_i \sin(i\omega t)]
\end{equation}

where $a_0, a_i, b_i$ are known as the Fourier coefficients.

This series can be approximated by a finite sum (a trig.\ polynomial):

\begin{equation}
S_N(t) = a_0 + \sum_{i=1}^{N} [a_i \cos(i\omega t) + b_i \sin(i\omega t)]
\end{equation}

$a_1,\dots,a_N$ and $b_1,\dots,b_N$ complex numbers and $\omega=2\pi/T$ the frequency.

To approximate a stochastic process $\{y_t\}$ with a Fourier form, we construct $S_N(t)$ by adding noise to it. We do this by superposing N component state space models; these are called \textbf{harmonic state space models}. 

\subsubsection{Harmonic state space models}

The components are:

\begin{equation}
x_{it}=x = \begin{pmatrix}
1\\
0
\end{pmatrix}
\quad F_i = 
\begin{pmatrix}
\cos(i\omega) &  \sin(i\omega)\\
-\sin(i\omega) & \cos(i\omega)\\
\end{pmatrix}
\end{equation}

$y_{it}, i=1,\dots,N$ represents the term
$a_i \cos(i\omega t) + b_i \sin(i\omega t)$ plus noise (the constant $a_0$ is handled separately by a local level model, see below). Proof:
The observation equation is:

\begin{equation}
\begin{split}
y_{it} =& x_{it}^T \beta_{it} + \eps_{it}\\
=& 
\begin{pmatrix}
1 & 0\\
\end{pmatrix}
\begin{pmatrix}
\beta_{i,1,t}\\
\beta_{i,2,t}\\
\end{pmatrix}
+
\eps_{it}\\
=& \beta_{i,1,t} + \eps_{it}
\end{split}
\end{equation}

So, $y_{it} = \beta_{i,1,t} + \eps_{it}$

The transition equation is:

\begin{equation}
\begin{pmatrix}
\beta_{i,1,t}\\
\beta_{i,2,t}
\end{pmatrix}
=
\begin{pmatrix}
\cos(i\omega) &  \sin(i\omega)\\
-\sin(i\omega) & \cos(i\omega)\\
\end{pmatrix}
\begin{pmatrix}
\beta_{i,1,t-1}\\
\beta_{i,2,t-1}
\end{pmatrix}
+
\begin{pmatrix}
\zeta_{i,1,t}\\
\zeta_{i,2,t}
\end{pmatrix}
\end{equation}

This amounts to two equations:

\begin{equation}
\beta_{i,1,t} = \cos(i\omega)\beta_{i,1,t-1}+
\sin(i\omega)\beta_{i,2,t-1} + \zeta_{i,1,t}
\end{equation}

\begin{equation}
\beta_{i,2,t} = -\sin(i\omega)\beta_{i,1,t-1}+
\cos(i\omega)\beta_{i,2,t-1} + \zeta_{i,2,t}
\end{equation}

Recursively substituting the expansion of 
$\beta_{i,2,t}$ in the first equation by using the second equation, we get all the way to the equation for t=0:

\begin{equation}
\beta_{i,1,t} = \cos(i\omega t)\beta_{i,1,0}+
\sin(i\omega t)\beta_{i,2,0} + \hbox{error}
\end{equation}

where error is some function of $\zeta_t$. Substituting $\beta_{i,1,t}$ into: 

\begin{equation}
\begin{split}
y_{it} =& \beta_{i,1,t} + \eps_{it}\\
=& \cos(i\omega t)\beta_{i,1,0}+
\sin(i\omega t)\beta_{i,2,0} + \hbox{error} + \eps_{it}\\
\end{split}
\end{equation}

So, $a_i = \beta_{i,1,0}$ and $b_i = \beta_{i,2,0}$.

To build the state space representation of the seasonal time series, we construct a superposition of N harmonic state space models, along with a local level model for $a_0$. So,

$y_t = \sum_{i=0}^{N} y_{it}$, where $y_{0t}$ has a local level model defined for it.

First define N=T/2 if T is even, or N=(T-1)/2 if T is odd. 

For illustration, suppose T=2.

Then $\omega=2\pi/2=\pi$.  This means that 

$
F_N = 
\begin{pmatrix}
\cos(\pi) &  \sin(\pi)\\
-\sin(\pi) & \cos(\pi)\\
\end{pmatrix}=
\begin{pmatrix}
-1 &  0\\
0 & -1\\
\end{pmatrix}
$

If T=4, then 

$
F_1 = 
\begin{pmatrix}
\cos(\pi/2) &  \sin(\pi/2)\\
-\sin(\pi/2) & \cos(\pi/2)\\
\end{pmatrix}=
\begin{pmatrix}
0 &  1\\
-1 & 0\\
\end{pmatrix}
$

If T=12, then $\omega=2\pi/12=\pi/6$ and

$
F_1 = 
\begin{pmatrix}
\cos(\pi/6) &  \sin(\pi/6)\\
-\sin(\pi/6) & \cos(\pi/6)\\
\end{pmatrix}=
\begin{pmatrix}
\sqrt{3}/2 &  1/2\\
-1/2 & \sqrt{3}/2\\
\end{pmatrix}
$

If T=365, then
$
F_1 = 
\begin{pmatrix}
\cos(2\pi/365) &  \sin(2\pi/365)\\
-\sin(2\pi/365) & \cos(2\pi/365)\\
\end{pmatrix}\approx
\begin{pmatrix}
1 &  0\\
0 & 1\\
\end{pmatrix}
$

Next we define N. Once defined, we can specify:

\begin{equation}
x_t^T = x^T = 
\begin{pmatrix}
1 & x_1 & \dots & x_N\\
\end{pmatrix}
F=
\begin{pmatrix}
1 & 0 & \dots & 0\\
0 & F_1 & \dots & 0\\
\vdots & \vdots & \vdots & \vdots \\ 
0 & 0 & \dots & F_N\\
\end{pmatrix}
\end{equation}

where $F_i = 
\begin{pmatrix}
\cos(i\omega) &  \sin(i\omega)\\
-\sin(i\omega) & \cos(i\omega)\\
\end{pmatrix}$.

The above is the \textbf{full-effects Fourier form model}. When T is large (e.g., 365), we just take a smaller N than required by N=T/2, as that would entail a lot of parameters. The reduced-N model is called the \textbf{reduced-effects Fourier model}.

\textbf{Example (Sheffield temperature example)}: Let T=4. Then N=2, so we will have two transition sub-matrices $F_1$ and $F_2$.

We let (the entries correspond to: 1 for the local level, $(1,0)$ for the first harmonic, and 1 for the second harmonic, which is one-dimensional, as explained below)

$x_t^T=\begin{pmatrix}
1 &
1 &
0 &
1\\
\end{pmatrix}
$

and 

$F_1 = \begin{pmatrix}
 0 & 1 \\
 -1 & 0 \\
\end{pmatrix}
\quad 
F_2 = \begin{pmatrix}
-1\\
\end{pmatrix}
$

$F_2=-1$ because, as i=2, it has the form:

$
\begin{pmatrix}
\cos(i\omega) &  \sin(i\omega)\\
-\sin(i\omega) & \cos(i\omega)\\
\end{pmatrix}=
\begin{pmatrix}
\cos(2\omega) &  \sin(2\omega)\\
-\sin(2\omega) & \cos(2\omega)\\
\end{pmatrix}=
\begin{pmatrix}
\cos(\pi) &  \sin(\pi)\\
-\sin(\pi) & \cos(\pi)\\
\end{pmatrix}=
\begin{pmatrix}
-1 &  0\\
0 & -1\\
\end{pmatrix}
$.

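Since $\sin(\pi)=0$, the two components of this last harmonic ($i=T/2$) do not interact, and only the first one enters the observation equation; we therefore keep only the top-left entry, so that $F_2$ has dimension $1\times 1$ and we have four $\beta$ parameters in total.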

So here is what we get:

\begin{equation}
\begin{split}
y_{t} =& x_t^T \beta + \eps_{t}\\
=& 
\begin{pmatrix}
1 &
1 &
0 &
1\\
\end{pmatrix}
\begin{pmatrix}
\beta_{1t} \\
\beta_{2t} \\
\beta_{3t} \\
\beta_{4t}\\
\end{pmatrix}
+ \eps_{t}
\end{split}
\end{equation}

So, the \textbf{local level observation model} is:

\begin{equation}
y_t = \beta_{1t} + \beta_{2t}+ \beta_{4t} + \eps_t
\end{equation}

Next, we define the \textbf{transition model}:

\begin{equation}
\begin{split}
\beta_{t} =&
\begin{pmatrix}
\beta_{1t} \\
\beta_{2t} \\
\beta_{3t} \\
\beta_{4t}\\
\end{pmatrix}
=
\begin{pmatrix}
1 & 0 & 0 & 0\\
0 & 0 & 1 & 0\\
0 & -1 & 0 & 0\\
0 & 0 & 0 & -1\\
\end{pmatrix}
\begin{pmatrix}
\beta_{1,t-1} \\
\beta_{2,t-1} \\
\beta_{3,t-1} \\
\beta_{4,t-1}\\
\end{pmatrix}
+ \zeta_{t}
\end{split}
\end{equation}

[The top left corner of the F matrix is for the local level model.]

The above transition model gives four equations:

[This transition model is for the local level:]

\begin{equation}
\beta_{1,t} = \beta_{1,t-1} + \zeta_{1,t}
\end{equation}

[These transition models are for the seasonal components:]

\begin{equation}
\beta_{2,t} = \beta_{3,t-1} +  \zeta_{2,t}
\end{equation}

\begin{equation}
\beta_{3,t} = - \beta_{2,t-1} + \zeta_{3,t}
\end{equation}

\begin{equation}
\beta_{4,t} = -\beta_{4,t-1}+ \zeta_{4,t}
\end{equation}


\subsection{Fully worked examples using library \texttt{dlm}}

The data-set is Central England temperatures.

<<echo=FALSE>>=
## directory holding the data files (absolute path, with trailing slash)
dir <- "/Users/shravanvasishth/Dropbox/MScStatistics/2014-2015/MAS6011/Semester2/Data/"
@

<<>>=
library(dlm)

## Central England temperatures
## (series is in column 2 of temp.txt):
temp<-read.table(paste(dir,"temp.txt",
                        sep=""))
temp<-ts(temp[,2],start=1659,
         frequency=1)
x<-matrix(1)
## transition matrix:
F<-matrix(1)
## epsilon:
sigma2<-1
## zeta variance matrix:
Z<-10
## Prior beta ~ N(beta0,P0):
beta0<-9
P0<-1000
## model:
mod<-dlm(temp,
         FF=x, ## predictor 
         GG=F, ## transition matrix
         V=sigma2, ## epsilon var
         W=Z,      ## zeta var
         m0=beta0, ## prior mean beta
         C0=P0)    ## prior var  beta
fit<-dlmFilter(temp,mod)   

## Forecasting:
modForecast<-dlmForecast(mod=mod,nAhead=3)

## invariant forecast mean is 
## a characteristic of the local 
## level model:
modForecast$a[[1]]
modForecast$a[[2]]
modForecast$a[[3]]

modForecast$Q[[1]]
modForecast$Q[[2]]
modForecast$Q[[3]]

lower<-rep(NA,3)
upper<-rep(NA,3)
## intervals: mean -/+ 2*sqrt(Q).
## The operator must END the line: a
## line that is already a complete
## expression is finished for R, so a
## leading "-" or "+" on the next line
## would be evaluated and discarded as
## a separate statement, leaving
## lower == upper == the forecast mean.
for(i in 1:3){
lower[i]<-modForecast$a[[i]] -
           2*sqrt(modForecast$Q[[i]])
upper[i]<-modForecast$a[[i]] +
           2*sqrt(modForecast$Q[[i]])
}
## uncertainty increases with i:
(forecst<-data.frame(i=1:3,lower=lower,
                     upper=upper))
@

\end{multicols}

\newpage

\section{Frequently Used Formulas}

\subsection{ACF/PACF calculations}

Solve this equation for $r_1$, $r_2$ or $a_1$, $a_2$.

\begin{equation}
\begin{pmatrix}
r_1 \\
r_2\\
\end{pmatrix}
=
\begin{pmatrix}
1 & r_1 \\
r_1 & 1 \\
\end{pmatrix}
\begin{pmatrix}
a_1 \\
a_2\\
\end{pmatrix}
\end{equation}

Steps for finding PACF: 
\begin{enumerate}
\item Get the two equations
\item Then solve for $a_1$ in the first equation
\item Plug $a_1$ into second equation and solve for $a_2$.
\end{enumerate}

\subsection{Forecast equations (AR, MA, ARMA, ARIMA)}

\begin{center}
\begin{tabular}{| l | l | l |}
\hline
Model         & Forecast Mean & Forecast Var\\ 
\hline
AR(1)         & $\hat y_{n+i\mid n}= a^i y_n$ & $\sigma^2 \frac{1-a^{2i}}{(1-a^2)}$\\
AR(p)         & $\hat y_{t+i\mid t}$ = 
$\sum_{j=1}^p a_j \hat y_{t+i-j\mid t}$, with $\hat y_{s\mid t}=y_s$ for $s\leq t$ & *\\
ARMA(p,q)     & $\hat y_{t+1\mid t}$ = 
$a_1 y_{t} + \dots + a_p y_{t+1-p} +
b_1 \epsilon_{t} + \dots + b_q \epsilon_{t+1-q}$ & \\%For k: $\sum_{i=0}^{k-1} (b_i) \epsilon_{t+k-1}$  \\
& 
$\hat y_{t+2\mid t} 
= a_1 \hat y_{t+1}+ \dots + a_p y_{t+2-p}  +
b_2 \epsilon_{t} + \dots + b_q \epsilon_{t+2-q}$ & \\%as above\\ 
MA(1)         & $\hat y_{t+1\mid t} = b\epsilon_t$ & $V_{t+1\mid t}=\sigma^2$\\
& $\hat y_{t+k\mid t} = 0$ for $k\geq 2$ & $V_{t+k\mid t}=(1+b^2)\sigma^2$\\
MA(q)         & $\hat y_{t+1\mid t} = \sum_{j=1}^q b_j \epsilon_{t+1-j}$ & $\sigma^2$\\
& $\hat y_{t+k\mid t} = \sum_{j=k}^q b_j \epsilon_{t+k-j}$ if $k\leq q$ else 0 & $\sigma^2\sum_{j=0}^{\min(k-1,q)} b_j^2$, with $b_0=1$\\
ARIMA(p,d,q)  & If $d=1$: $\hat y_{t+i}= \hat x_{t+i} +  y_{t+i-1}$, $\hat x_{t+i}$ is ARMA(p,q)  & ?\\
& If $d>1$: $x_t  =  \sum_{j=0}^{d} \binom{d}{j} (-1)^j y_{t-j}$   & ? \\
SARIMA(P,D,Q) & $x_t = \sum_{j=0}^D \binom{D}{j}(-1)^j  y_{t-sj}$ & ? \\
\hline
\end{tabular}
\end{center}

* Translate to SSM (see Exercise 3) and then compute forecasts.

\subsection{State Space Models: Local level model forecast}

\underline{Step 1}:  Compute i-step ahead forecast state distribution:

\begin{equation}
\beta_{t+i}\mid y_{1:t} \sim N(\hat \beta_{t+i\mid t}, P_{t+i\mid t})
\end{equation}

where

\begin{equation}
\hat \beta_{t+i\mid t} = F^i \hat \beta_{t\mid t}
\end{equation}

and   

\begin{equation}
\begin{split}
P_{t+i\mid t} =& F^i P_{t\mid t} (F^i)^T \\
+& \sum_{k=0}^{i-1} F^k Z (F^k)^T
\end{split}
\end{equation}

Aside: if $Z$ is time-varying ($Z_t$), then:
\begin{equation}
\begin{split}
P_{t+i\mid t} =& F^i P_{t\mid t} (F^i)^T \\
+& \sum_{k=0}^{i-1} F^k Z_{t+i-k} (F^k)^T
\end{split}
\end{equation}

\underline{Step 2}: Compute the i-step ahead forecast observation distribution:

\begin{equation}
y_{t+i}\mid y_{1:t} \sim N(\hat y_{t+i\mid t},q_{t+i\mid t})
\end{equation}

where

\begin{enumerate}
\item $\hat y_{t+i\mid t} = x_{t+i}^T \hat \beta_{t+i\mid t}$
\item $q_{t+i\mid t} = x_{t+i}^T P_{t+i\mid t} x_{t+i}+\sigma^2$
\end{enumerate}

\subsection{State Space Models: Linear trend model forecast}

Recall that from the Kalman filter $\beta_t\mid y_{1:t} \sim N(\hat \beta_{t\mid t}, P_{t\mid t})$.

\begin{equation}
\begin{split}
\hat y_{t+i\mid t} =& x_{t+i}^T F^i \hat \beta_{t\mid t}\\
=& 
\begin{pmatrix}
1 & 0 \\
\end{pmatrix}
\begin{pmatrix}
1 & i \\
0 & 1 \\
\end{pmatrix}
\begin{pmatrix}
\hat\beta_{1t\mid t} \\
\hat\beta_{2t\mid t} \\
\end{pmatrix}\\
=& 
\begin{pmatrix}
1 & i \\
\end{pmatrix}
\begin{pmatrix}
\hat\beta_{1t\mid t} \\
\hat\beta_{2t\mid t} \\
\end{pmatrix}\\
=& \hat\beta_{1t\mid t} +
\hat\beta_{2t\mid t}i \\
\end{split}
\end{equation}

which is a straight line.


\subsection{State Space Models: Superposed model forecast}

\begin{equation}
\hat y_{t+k\mid t} = \sum_{i=1}^N \hat y_{i,t+k\mid t}
\end{equation}

\begin{equation}
q_{t+k\mid t} = \sum_{i=1}^N q_{i,t+k\mid t}
\end{equation}

\begin{equation}
y_{t+k}\mid y_{1:t} 
\sim N(\hat y_{t+k\mid t}, q_{t+k\mid t}) 
\quad k=1,2,\dots 
\end{equation}

\section{Interesting problems}

\subsection{Problem 4}

If $y_t$ is a k-th order polynomial in $t$,  then it has the form:

\begin{equation}
y_t = a_0 + a_1 t + \dots + a_k t^k 
\end{equation}

Similarly, $y_{t-1}$ has the form: 

\begin{equation}
y_{t-1} = a_0 + a_1 (t-1) + \dots + a_k (t-1)^k
\end{equation}

So, taking the difference, we get:

\begin{equation}
y_t - y_{t-1} = [a_0 + a_1 t + \dots + a_k t^k]-
[a_0 + a_1 (t-1) + \dots + a_k (t-1)^k]
\end{equation}

Here, the two terms of highest power cancel: $a_k t^k$ from $y_t$ and the leading term $a_k t^k$ in the expansion of $a_k (t-1)^k$ from $y_{t-1}$. The highest remaining power is $t^{k-1}$. Hence the difference 
$\nabla y_t= y_t - y_{t-1}$ is a polynomial of degree $k-1$.

Now, consider second-order differencing:

\begin{equation}
\nabla^2 y_t = (y_t - y_{t-1}) - (y_{t-1} - y_{t-2})
\end{equation}

The two bracketed terms are the first differences $\nabla y_t$ and $\nabla y_{t-1}$, each of which is a polynomial of degree $k-1$ in $t$. Taking the difference of these two reduces 
the degree by 1 again. So, the degree of $\nabla^2 y_t$ is $k-2$. 

Continuing in this way all the way to k, 

\begin{equation}
\nabla^k y_t = \nabla^{k-1} y_t - \nabla^{k-1} y_{t-1}
= \sum_{j=0}^{k} \binom{k}{j} (-1)^j y_{t-j}
\end{equation}

we remove all positive powers of $t$, leaving only the $t^0$ term, which is a constant.

\end{document}