crf_math_1.py

from __future__ import division

from collections import namedtuple

import numpy as np
from numpy import random as rng
from scipy.linalg import cho_factor, cho_solve, LinAlgError
from sklearn.base import BaseEstimator
from progressbar import ProgressBar

from copy import deepcopy


def soft_thresh(r, w):
    return np.sign(w) * np.max(np.abs(w)-r, 0)


def check_pd(A, lower=True):
    """
    Checks if A is PD.
    If so returns True and Cholesky decomposition,
    otherwise returns False and None
    """
    try:
        return True, np.tril(cho_factor(A, lower=lower)[0])
    except LinAlgError as err:
        if 'not positive definite' in str(err):
            return False, None


def chol_inv(B, lower=True):
    """
    Returns the inverse of matrix A, where A = B*B.T,
    ie B is the Cholesky decomposition of A.
    Solves Ax = I
    given B is the cholesky factorization of A.
    """
    return cho_solve((B, lower), np.eye(B.shape[0]))


def inv(A):
    """
    Inversion of a SPD matrix using Cholesky decomposition.
    """
    return chol_inv(check_pd(A)[1])


def log(x):
    return np.log(x) if x > 0 else -np.inf


class SparseGaussianCRF(BaseEstimator):
    """
    GCRF models conditional probability density of y in R^q given x in R^p as
    p(y|x, Λ, Θ) = exp(-y' * Λ * y - 2 * x' * Θ * y) / Z(x)
    where Z(x) = c * |Λ|^-1 * exp(x' * Θ * Λ^-1 * Θ' * x)
    This is equivalent to:
    p(y|x) = N(-Θ * Λ^-1 * x, Λ^-1)
    Parameters
    ----------
    learning_rate : float, default 1.0
        step size
    lamL : float, default 0.01
        l1 regularization for the Λ matrix
    lamT : float, default 0.01
        l1 regularization for the Θ matrix
    References
    ----------
    Wytock and Kolter 2013
    Probabilistic Forecasting using Sparse Gaussian CRFs
    http://www.zicokolter.com/wp-content/uploads/2015/10/wytock-cdc13.pdf
    Wytock and Kolter 2013
    Sparse Gaussian CRFs Algorithms Theory and Application
    https://www.cs.cmu.edu/~mwytock/papers/gcrf_full.pdf
    McCarter and Kim 2015
    Large-Scale Optimization Algorithms for Sparse CGGMs
    http://arxiv.org/pdf/1509.04681.pdf
    McCarter and Kim 2016
    On Sparse Gaussian Chain Graph Models
    (info on Multi-Layer Sparse Gaussian CRFs)
    http://papers.nips.cc/paper/5320-on-sparse-gaussian-chain-graph-models.pdf
    Klinger and Tomanek 2007
    Classical Probabilistic Models and Conditional Random Fields
    http://www.scai.fraunhofer.de/fileadmin/images/bio/data_mining/paper/crf_klinger_tomanek.pdf
    Tong Tong Wu and Kenneth Lange 2008
    Coordinate Descent Algorithms for Lasso Penalized Regression
    http://arxiv.org/pdf/0803.3876.pdf
    """
    def __init__(self, learning_rate=1.0, lamL=1, lamT=1, n_iter=1000):
        self.lamL = lamL
        self.lamT = lamT
        self.learning_rate = learning_rate
        self.n_iter = n_iter

        self.Lam = None
        self.Theta = None

        # stuff for line search
        self.beta = 0.5
        self.slack = 0.05


    def fit(self, X, Y):
        """TODO: Docstring for fit.
        Parameters
        ----------
        X : np.array, shape (n_samples, input_dimension)
        Y : np.array, shape (n_samples, output_dimension)
        Returns
        -------
        TODO
        """
        assert X.shape[0] == Y.shape[0], 'Inputs and Outputs must have the same number of observations'

        self.alt_newton_coord_descent(X=X, Y=Y)
        return self


    def loss(self, X, Y, Lam=None, Theta=None):
        if Lam is None:
            Lam = self.Lam
        if Theta is None:
            Theta = self.Theta

        n, p, q = self._problem_size(X, Y)
        FixedParams = namedtuple('FixedParams', ['Sxx', 'Sxy', 'Syy'])
        VariableParams = namedtuple('VariableParams', ['Sigma', 'Psi'])

        fixed = FixedParams(Sxx=np.dot(X.T, X) / n,
                            Syy=np.dot(Y.T, Y) / n,
                            Sxy=np.dot(X.T, Y) / n)
        Sigma = inv(Lam)
        R = np.dot(np.dot(X, self.Theta), Sigma) / np.sqrt(n)
        vary = VariableParams(Sigma=Sigma,
                              Psi=np.dot(R.T, R))
        return self.l1_neg_log_likelihood(Lam, Theta, fixed, vary)


    # def check_gradient(self, fixed, vary):
    #     grad_lam = np.zeros_like(self.Lam)
    #     grad_theta = np.zeros_like(self.Theta)
    #     for i in range(self.Lam.shape[0]):
    #         for j in range(self.Lam.shape[1]):
    #             L = self.Lam.copy()
    #             run = 1e-10
    #             L[i,j] += run
    #             rise = self.neg_log_likelihood(L, self.Theta, fixed, vary) - self.neg_log_likelihood(self.Lam, self.Theta, fixed, vary)
    #             grad_lam[i,j] = rise/run
    #
    #     for i in range(self.Theta.shape[0]):
    #         for j in range(self.Theta.shape[1]):
    #             T = self.Theta.copy()
    #             run = 1e-10
    #             T[i,j] += run
    #             rise = self.neg_log_likelihood(self.Lam, T, fixed, vary) - self.neg_log_likelihood(self.Lam, self.Theta, fixed, vary)
    #             grad_theta[i,j] = rise/run
    #     return grad_lam, grad_theta


    def neg_log_likelihood(self, Lam, Theta, fixed, vary):
        "compute the negative log-likelihood of the GCRF"
        return -log(np.linalg.det(Lam)) + \
                np.trace(np.dot(fixed.Syy, Lam) + \
                2*np.dot(fixed.Sxy.T, Theta) + \
                np.dot(vary.Psi, Lam))


    @staticmethod
    def l1_norm_off_diag(A):
        "convenience method for l1 norm, excluding diagonal"
        # let's speed this up later
        # assume A symmetric, sparse too
        return np.linalg.norm(A - np.diag(A.diagonal()), ord=1)


    def l1_neg_log_likelihood(self, Lam, Theta, fixed, vary):
        "regluarized negative log likelihood"
        return self.neg_log_likelihood( Lam, Theta, fixed, vary) + \
               self.lamL * self.l1_norm_off_diag(Lam) + \
               self.lamT * np.linalg.norm(Theta, ord=1)


    def neg_log_likelihood_wrt_Lam(self, Lam, fixed, vary):
        # compute the negative log-likelihood of the GCRF when Theta is fixed
        return -log(np.linalg.det(Lam)) + \
                np.trace(np.dot(fixed.Syy, Lam) + \
                np.dot(vary.Psi, Lam))


    def l1_neg_log_likelihood_wrt_Lam(self, Lam, fixed, vary):
        # regularized neg log loss
        return self.neg_log_likelihood_wrt_Lam(Lam, fixed, vary) + \
               self.lamL * np.linalg.norm(Lam - np.diag(Lam.diagonal()), ord=1)

    # def neg_log_likelihood_wrt_Theta(self, Theta, fixed, vary):
    #     # compute the negative log-likelihood of the GCRF when Lamba is fixed
    #     return 2*np.dot(fixed.Sxy.T, Theta) + np.dot(vary.Sigma, vary.Psi)

    def grad_wrt_Lam(self, fixed, vary):
        return fixed.Syy - vary.Sigma - vary.Psi


    def grad_wrt_Theta(self, fixed, vary):
        # TODO this is not avoiding the Gamma computation!!!
        # gamma = Sxx Theta Sigma
        return 2 * fixed.Sxy + 2 * np.dot(fixed.Sxx, np.dot(self.Theta, vary.Sigma))


    def active_set(self, fixed, vary):
        return (self.active_set_Lam(fixed, vary),
                self.active_set_Theta(fixed, vary))


    def active_set_Lam(self, fixed, vary):
        grad = self.grad_wrt_Lam(fixed, vary)
        assert np.allclose(grad, grad.T, 1e-3)
        return np.where((np.abs(np.triu(grad)) > self.lamL) | (self.Lam != 0))
        # return np.where((np.abs(grad) > self.lamL) | (~np.isclose(self.Lam, 0)))


    def active_set_Theta(self, fixed, vary):
        grad = self.grad_wrt_Theta(fixed, vary)
        return np.where((np.abs(grad) > self.lamT) | (self.Theta != 0))
        # return np.where((np.abs(grad) > self.lamT) | (~np.isclose(self.Theta, 0)))


    def _problem_size(self, X, Y):
        (n, p), q = X.shape, Y.shape[1]
        return n, p, q


    def check_descent(self, newton_lambda, alpha, fixed, vary):
        # check if we have made suffcient descent
        DLam = np.trace(np.dot(self.grad_wrt_Lam(fixed, vary), newton_lambda)) + \
               self.lamL * np.linalg.norm(self.Lam + newton_lambda, ord=1) - \
               self.lamL * np.linalg.norm(self.Lam, ord=1)

        nll_a = self.l1_neg_log_likelihood_wrt_Lam(self.Lam + alpha * newton_lambda, fixed, vary)
        nll_b = self.l1_neg_log_likelihood_wrt_Lam(self.Lam, fixed, vary) + alpha * self.slack * DLam
        return nll_a <= nll_b


    def check_descent2(self, newton_lambda, alpha, fixed, vary):

        lhs = self.l1_neg_log_likelihood(self.Lam + alpha*newton_lambda, self.Theta, fixed, vary)

        mu = np.trace(np.dot(self.grad_wrt_Lam(fixed, vary), newton_lambda)) + \
             self.lamL*self.l1_norm_off_diag(self.Lam + newton_lambda) +\
             self.lamT*np.linalg.norm(self.Theta, ord=1)

        rhs = self.neg_log_likelihood(self.Lam, self.Theta, fixed, vary) +\
              alpha * self.slack * mu
        return lhs <= rhs

    def line_search(self, newton_lambda, fixed, vary):
        # returns cholesky decomposition of Lambda and the learning rate
        alpha = self.learning_rate
        while True:
            pd, L = check_pd(self.Lam + alpha * newton_lambda)
            if pd and self.check_descent(newton_lambda, alpha, fixed, vary):
                # step is positive definite and we have sufficient descent
                break
                #TODO maybe want to return newt+alpha, to reuse computation
            alpha = alpha * self.beta
            # if alpha < 0.1:
            #   return L, alpha
        return L, alpha


    def lambda_newton_direction(self, active, fixed, vary, max_iter=1):
        # TODO we should be able to do a warm start...
        delta = np.zeros_like(vary.Sigma)
        U = np.zeros_like(vary.Sigma)

        for _ in range(max_iter):
            for i, j in rng.permutation(np.array(active).T):
                if i > j:
                    # seems ok since we look for upper triangular indices in active set
                    continue

                if i==j:
                    a = vary.Sigma[i,i] ** 2 + 2 * vary.Sigma[i,i] * vary.Psi[i,i]
                else:
                    a = (vary.Sigma[i, j] ** 2 + vary.Sigma[i, i] * vary.Sigma[j, j] +
                         vary.Sigma[i, i] * vary.Psi[j, j] + 2 * vary.Sigma[i, j] * vary.Psi[i, j] +
                         vary.Sigma[j, j] * vary.Psi[i, i])

                b = (fixed.Syy[i, j] - vary.Sigma[i, j] - vary.Psi[i, j] +
                     np.dot(vary.Sigma[i,:], U[:,j]) +
                     np.dot(vary.Psi[i,:], U[:,j]) +
                     np.dot(vary.Psi[j,:], U[:,i]))

                if i==j:
                    u = -b/a
                    delta[i, i] += u
                    U[i, :] +=  u * vary.Sigma[i, :]
                else:
                    c = self.Lam[i, j] + delta[i, j]
                    u = soft_thresh(self.lamL / a, c - b/a) - c
                    delta[j, i] += u
                    delta[i, j] += u
                    U[j, :] +=  u * vary.Sigma[i, :]
                    U[i, :] +=  u * vary.Sigma[j, :]

        return delta


    def theta_coordinate_descent(self, active, fixed, vary, max_iter=1):
        V = np.dot(self.Theta, vary.Sigma)
        for _ in range(max_iter):
            for i, j in np.array(active).T:
                a = 2 * vary.Sigma[j, j] * fixed.Sxx[i, i]
                b = 2 * fixed.Sxy[i, j] + 2 * np.dot(fixed.Sxx[i,:], V[:,j])
                c = self.Theta[i, j]

                u = soft_thresh(self.lamT / a, c - b/a) - c

                self.Theta[i, j] += u
                V[i,:] += u * vary.Sigma[j,:]

        return self.Theta


    def alt_newton_coord_descent(self, X, Y):
        """
        JK Trying to follow Calvin's algorithm, then will merge back into orig
        """
        n, p, q = self._problem_size(X, Y)

        FixedParams = namedtuple('FixedParams', ['Sxx', 'Sxy', 'Syy'])
        VariableParams = namedtuple('VariableParams', ['Sigma', 'Psi'])

        fixed = FixedParams(Sxx=np.dot(X.T, X) / n,
                            Syy=np.dot(Y.T, Y) / n,
                            Sxy=np.dot(X.T, Y) / n)

        # allow for continued fitting
        if self.Lam is None:
            self.Lam = np.eye(q)
            Sigma = np.eye(q)
        else:
            Sigma = inv(self.Lam) # use cholesky decomp then solve system of eqs
        if self.Theta is None:
            self.Theta = np.zeros((p, q))

        self.nll = []
        self.lnll = []
        self.lrs = []
        from progressbar import ProgressBar
        pbar = ProgressBar()
        for it in pbar(range(self.n_iter)):

            # update variable params
            R = np.dot(np.dot(X, self.Theta), Sigma) / np.sqrt(n)
            vary = VariableParams(Sigma=Sigma,
                                  Psi=np.dot(R.T, R))

            self.nll.append(self.neg_log_likelihood(self.Lam, self.Theta, fixed, vary))
            self.lnll.append(self.l1_neg_log_likelihood(self.Lam, self.Theta, fixed, vary))

            # determine active set
            active_Lam = self.active_set_Lam(fixed, vary)

            # solve D_lambda via coordinate descent
            newton_lambda = self.lambda_newton_direction(active_Lam, fixed, vary, max_iter=1)

            # line search for best step size
            learning_rate = self.learning_rate
            LL, learning_rate = self.line_search(newton_lambda, fixed, vary)
            self.lrs.append(learning_rate)
            self.Lam = self.Lam.copy() + learning_rate * newton_lambda

            # update variable params
            Sigma = chol_inv(LL) # use chol decomp from the backtracking
            vary = VariableParams(Sigma=Sigma,
                                  Psi=None) # dont need psi here

            # determine active set
            active_Theta = self.active_set_Theta(fixed, vary)

            # solve theta
            self.Theta = self.theta_coordinate_descent(active_Theta, fixed, vary, max_iter=1)


    def sample(self, X, n=1, verbose=True):
        """
        Draw samples from the conditional probability for y given x.
        Inference in  GCRF given by:
        y|x ~ N(-Θ * Λ^-1 * x, Λ^-1)
        This algorithm uses some clever accelerations to generate samples.
        Inspired by Calvin McCarter[1].
        The idea is to make the desired number of draws from a white multivariate
        normal distribution, and then multiply these draws by the cholesky
        decomposition of the desired covariance matrix. This adds the necessary
        color to the original draws.
        An equivalent thing to do is to multiply the white draws by the inverse of the
        cholesky decompostition of the desired precision matrix:
        If S is the desired covariance matrix and L, the precision matrix then:
        S = SL * SL.T # cholesky decomposition
        L = LL * LL.T # cholesky decomposition
        S = L^-1 # by asssumption, then
        SL * SL.T = (LL * LL.T)^-1 = LL.T ^-1 * LL ^-1
        Although SL != LL.T^-1, the effective coloring is the same.
        A final acceleration is to solve linear systems of equations instead of
        explicitly computing matrix inversions.
        [1] https://calvinmccarter.wordpress.com/2015/01/06/multivariate-normal-random-number-generation-in-matlab/
        """
        LL = np.linalg.cholesky(self.Lam)
        Sigma = chol_inv(LL)

        means = -np.dot(np.dot(Sigma, self.Theta.T), X.T)
        means = np.tile(np.atleast_2d(means), n)
        N = means.shape[1]

        z = rng.randn(self.Lam.shape[0], N)
        samples = np.linalg.solve(LL.T, z) + means

        return samples.squeeze().T


    def predict(self, X, Y=None):
        """
        Return the mean of y given x.
        Inference in  GCRF given by:
        y|x ~ N(-Θ * Λ^-1 * x, Λ^-1)
        so this method returns:
        -Θ * Λ^-1 * x
        """
        return -np.dot(np.dot(inv(self.Lam), self.Theta.T), X.T).T


    def get_params(self, deep=True):
        return {'lamL': self.lamL,
                'lamT': self.lamT,
                'learning_rate': self.learning_rate,
                'n_iter': self.n_iter}


    def set_params(self, **parameters):
        for parameter, value in parameters.items():
            self.setattr(parameter, value)