add ridge example about regularization method
* compares ridge with sklearn LOO CV in terms of numerical accuracy and
  timings
agoscinski committed Aug 7, 2023
1 parent f83b1f6 commit d56ccbd
Showing 2 changed files with 359 additions and 2 deletions.
4 changes: 2 additions & 2 deletions examples/regression/README.rst
@@ -1,2 +1,2 @@
Orthogonal Regression
=====================
Regression
==========
357 changes: 357 additions & 0 deletions examples/regression/Ridge2FoldCVRegularization.py
@@ -0,0 +1,357 @@
# %%

r"""
RidgeRegression2FoldCV for data with low effective rank
=======================================================
In this notebook we explain in more detail how
:class:`skmatter.linear_model.RidgeRegression2FoldCV` speeds up the
cross-validation that optimizes the regularization parameter ``alpha``, and
compare it with the existing solution in scikit-learn,
:class:`sklearn.linear_model.RidgeCV`.
:class:`skmatter.linear_model.RidgeRegression2FoldCV` was designed to
efficiently predict feature matrices, but it can also be useful for predicting
single targets.
"""
# %%
#

import time

import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, train_test_split

from skmatter.linear_model import RidgeRegression2FoldCV


# %%

SEED = 12616
N_REPEAT_MICRO_BENCH = 5

# %%
# Numerical instabilities of sklearn leave-one-out CV
# ---------------------------------------------------
#
# In linear regression, the computational cost of obtaining the weight matrix
# is dominated by the inversion of the covariance matrix. This becomes more
# costly in regularized regression, where we need to optimize the
# regularization parameter in a cross-validation (CV) scheme and therefore
# recompute the inverse for each parameter. scikit-learn offers an efficient
# leave-one-out CV (LOO CV) for its ridge regression which avoids these
# repeated computations [loocv]_. Because we needed an efficient ridge
# regression for the reconstruction measures in :py:mod:`skmatter.metrics`, we
# implemented :class:`skmatter.linear_model.RidgeRegression2FoldCV`, an
# efficient 2-fold CV ridge regression that computes a single singular value
# decomposition (SVD) and reuses it for all regularization parameters
# :math:`\lambda`. Assume we have the standard regression problem of
# optimizing the weight matrix in
#
# .. math::
#
#     \begin{align}
#         \|\mathbf{X}\mathbf{W} - \mathbf{Y}\|
#     \end{align}
#
# Here :math:`\mathbf{Y}` can also be a matrix, as in the case of multi-target
# learning. In 2-fold cross-validation we then first predict the targets of
# fold 2 using fold 1 to estimate the weight matrix, and vice versa. Using the
# SVD, the estimation scheme on fold 1 looks like this:
#
# .. math::
#
#     \begin{align}
#         &\mathbf{X}_1 = \mathbf{U}_1\mathbf{S}_1\mathbf{V}_1^T,
#             \qquad\qquad\qquad\quad
#             \textrm{feature matrix }\mathbf{X}\textrm{ for fold 1} \\
#         &\mathbf{W}_1(\lambda) = \mathbf{V}_1
#             \tilde{\mathbf{S}}_1(\lambda)^{-1} \mathbf{U}_1^T \mathbf{Y}_1,
#             \qquad
#             \textrm{weight matrix fitted on fold 1}\\
#         &\tilde{\mathbf{Y}}_2 = \mathbf{X}_2 \mathbf{W}_1(\lambda),
#             \qquad\qquad\qquad\qquad
#             \textrm{prediction of }\mathbf{Y}\textrm{ for fold 2}
#     \end{align}
#
# The efficient 2-fold scheme in ``RidgeRegression2FoldCV`` reuses the matrices
#
# .. math::
#
#     \begin{align}
#         &\mathbf{A}_1 = \mathbf{X}_2 \mathbf{V}_1, \quad
#         \mathbf{B}_1 = \mathbf{U}_1^T \mathbf{Y}_1,
#     \end{align}
#
# for each fold, so the SVD does not need to be recomputed for every
# regularization parameter. The computational complexity after the initial SVD
# is thereby reduced to that of matrix multiplications.

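# %%
# To make the reuse of the SVD concrete, below is a minimal NumPy sketch of
# the fold-1 half of the scheme with Tikhonov regularization. It is only for
# illustration: the function name and signature are hypothetical and not part
# of the skmatter API, and the actual implementation of
# :class:`skmatter.linear_model.RidgeRegression2FoldCV` differs in its details.


def twofold_svd_validation_errors(X1, Y1, X2, Y2, alphas):
    """Validation MSE on fold 2 for ridge weights fitted on fold 1.

    Illustrative sketch only, not the skmatter implementation. The SVD of X1
    is computed once and reused for all regularization parameters; only
    matrix multiplications remain inside the loop over alphas.
    """
    Y1 = Y1.reshape(len(Y1), -1)
    Y2 = Y2.reshape(len(Y2), -1)
    U1, S1, Vt1 = np.linalg.svd(X1, full_matrices=False)
    A1 = X2 @ Vt1.T  # reused for every alpha
    B1 = U1.T @ Y1  # reused for every alpha
    errors = []
    for alpha in alphas:
        # Tikhonov-regularized inverse of the singular values
        S_inv = S1 / (S1**2 + alpha)
        Y2_pred = A1 @ (S_inv[:, None] * B1)
        errors.append(np.mean((Y2 - Y2_pred) ** 2))
    return errors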

# %%
# We first create an artificial dataset


X, y = make_regression(
    n_samples=1000,
    n_features=400,
    random_state=SEED,
)


# %%

# regularization parameters
alphas = np.geomspace(1e-12, 1e-1, 12)

# 2 folds for train and validation split
cv = KFold(n_splits=2, shuffle=True, random_state=SEED)

skmatter_ridge_2foldcv_cutoff = RidgeRegression2FoldCV(
    alphas=alphas, regularization_method="cutoff", cv=cv
)

skmatter_ridge_2foldcv_tikhonov = RidgeRegression2FoldCV(
    alphas=alphas, regularization_method="tikhonov", cv=cv
)

sklearn_ridge_2foldcv_tikhonov = RidgeCV(
    alphas=alphas, cv=cv, fit_intercept=False  # remove the influence of learning a bias
)

sklearn_ridge_loocv_tikhonov = RidgeCV(
    alphas=alphas, cv=None, fit_intercept=False  # remove the influence of learning a bias
)

# %%
# Now we do simple benchmarks


def micro_bench(ridge):
    global N_REPEAT_MICRO_BENCH, X, y
    timings = []
    train_mse = []
    test_mse = []
    for _ in range(N_REPEAT_MICRO_BENCH):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, train_size=0.5, random_state=SEED
        )
        start = time.time()
        ridge.fit(X_train, y_train)
        end = time.time()
        timings.append(end - start)
        train_mse.append(mean_squared_error(y_train, ridge.predict(X_train)))
        test_mse.append(mean_squared_error(y_test, ridge.predict(X_test)))

    print(f" Time: {np.mean(timings)}s")
    print(f" Train MSE: {np.mean(train_mse)}")
    print(f" Test MSE: {np.mean(test_mse)}")


print("skmatter 2-fold CV cutoff")
micro_bench(skmatter_ridge_2foldcv_cutoff)
print()
print("skmatter 2-fold CV tikhonov")
micro_bench(skmatter_ridge_2foldcv_tikhonov)
print()
print("sklearn 2-fold CV tikhonov")
micro_bench(sklearn_ridge_2foldcv_tikhonov)
print()
print("sklearn leave-one-out CV")
micro_bench(sklearn_ridge_loocv_tikhonov)


# %%
# We can see that the leave-one-out CV result is completely off. Let us check
# each regularization parameter manually and compare the result with the
# stored mean squared errors (MSE).


results = {}
results["sklearn 2-fold CV Tikhonov"] = {"MSE train": [], "MSE test": []}
results["sklearn LOO CV Tikhonov"] = {"MSE train": [], "MSE test": []}

X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.5, random_state=SEED
)


def get_train_test_error(estimator):
    global X_train, y_train, X_test, y_test
    estimator = estimator.fit(X_train, y_train)
    return (
        mean_squared_error(y_train, estimator.predict(X_train)),
        mean_squared_error(y_test, estimator.predict(X_test)),
    )


for i in range(len(alphas)):
    print(f"Computing step={i} using alpha={alphas[i]}")

    train_error, test_error = get_train_test_error(RidgeCV(alphas=[alphas[i]], cv=2))
    results["sklearn 2-fold CV Tikhonov"]["MSE train"].append(train_error)
    results["sklearn 2-fold CV Tikhonov"]["MSE test"].append(test_error)

    train_error, test_error = get_train_test_error(
        RidgeCV(alphas=[alphas[i]], cv=None)
    )
    results["sklearn LOO CV Tikhonov"]["MSE train"].append(train_error)
    results["sklearn LOO CV Tikhonov"]["MSE test"].append(test_error)


# returns an array of errors, one error per fold/sample
# ndarray of shape (n_samples, n_alphas)
loocv_cv_train_error = (
    RidgeCV(
        alphas=alphas,
        cv=None,
        store_cv_values=True,
        scoring=None,  # uses mean squared error by default
        fit_intercept=False,
    )
    .fit(X_train, y_train)
    .cv_values_
)

results["sklearn LOO CV Tikhonov"]["MSE validation"] = np.mean(
    loocv_cv_train_error, axis=0
).tolist()


# %%

# We plot all the results.
plt.figure(figsize=(12, 8))
for i, items in enumerate(results.items()):
    method_name, errors = items

    plt.loglog(
        alphas,
        errors["MSE test"],
        label=f"{method_name} MSE test",
        color=f"C{i}",
        lw=3,
        alpha=0.9,
    )
    plt.loglog(
        alphas,
        errors["MSE train"],
        label=f"{method_name} MSE train",
        color=f"C{i}",
        lw=4,
        alpha=0.9,
        linestyle="--",
    )
    if "MSE validation" in errors.keys():
        plt.loglog(
            alphas,
            errors["MSE validation"],
            label=f"{method_name} MSE validation",
            color=f"C{i}",
            linestyle="dotted",
            lw=5,
        )
plt.ylim(1e-16, 1)
plt.xlabel("alphas (regularization parameter)")
plt.ylabel("MSE")

plt.legend()
plt.show()

# %%
# We can see that leave-one-out CV estimates the error incorrectly for low
# alpha values. This seems to be a numerical instability of the method. If we
# had limited our alphas to 1e-5, LOO CV would have reached accuracies similar
# to those of the 2-fold method.

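# %%
# As a rough intuition for where this instability can come from (a sketch, not
# scikit-learn's actual code path): efficient LOO CV divides each residual by
# :math:`1 - h_{ii}`, the diagonal of the hat matrix. For tiny alphas and a
# training matrix with many nearly redundant directions, :math:`h_{ii}` can
# approach 1, and the division amplifies round-off errors. The helper below is
# hypothetical and only illustrates this formula (assuming no intercept).


def loo_errors_from_hat_matrix(X, y, alpha):
    # Illustrative sketch only, not the scikit-learn implementation.
    # Hat matrix H = X (X^T X + alpha I)^{-1} X^T
    G = X.T @ X + alpha * np.eye(X.shape[1])
    H = X @ np.linalg.solve(G, X.T)
    residuals = y - H @ y
    # exact leave-one-out residuals for ridge: e_i / (1 - h_ii)
    return residuals / (1.0 - np.diag(H))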
# %%
# **Important:** this is not a fully encompassing comparison that covers the
# parameter space sufficiently. We only want to note that in cases with a
# large number of features and low effective rank, the ridge solvers in
# skmatter can be numerically more stable while running at a comparable speed.

# %%
# Cutoff and Tikhonov regularization
# ----------------------------------
# When using a hard threshold as regularization (regularization method
# ``cutoff``), the singular values below :math:`\lambda` are cut off. The size
# of the matrices :math:`\mathbf{A}_1` and :math:`\mathbf{B}_1` can then be
# reduced, resulting in further computation time savings. This performance
# advantage of ``cutoff`` over ``tikhonov`` becomes visible when we predict
# multiple targets and use a regularization range that cuts off a lot of
# singular values. For that, we decrease the effective rank of the feature
# matrix and use as regression task the prediction of a shuffled version of
# :math:`\mathbf{X}`.

X, y = make_regression(
    n_samples=1000,
    n_features=400,
    n_informative=400,
    effective_rank=5,  # decreasing the effective rank
    tail_strength=1e-9,
    random_state=SEED,
)

idx = np.arange(X.shape[1])
np.random.seed(SEED)
np.random.shuffle(idx)
y = X.copy()[:, idx]

singular_values = np.linalg.svd(X, full_matrices=False)[1]

# %%

plt.loglog(singular_values)
plt.title("Singular values of our feature matrix X")
plt.axhline(1e-8, color="gray")
plt.xlabel("singular value index")
plt.ylabel("singular value")
plt.show()

# %%
# We can see that a regularization value of 1e-8 cuts off a lot of singular
# values. This is crucial for the computational speed-up of the ``cutoff``
# regularization method.

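# %%
# A minimal sketch (not the skmatter implementation) of how the ``cutoff``
# variant shrinks the reused matrices: singular values below the
# regularization parameter are simply dropped, so :math:`\mathbf{A}_1` and
# :math:`\mathbf{B}_1` keep only the retained components. The helper name and
# signature are hypothetical.


def cutoff_truncation(X1, Y1, X2, alpha):
    # Illustrative sketch only, not the skmatter implementation.
    U1, S1, Vt1 = np.linalg.svd(X1, full_matrices=False)
    keep = S1 > alpha  # keep only singular values above the cutoff
    A1 = X2 @ Vt1[keep].T  # shape (n_2, n_kept) instead of (n_2, k)
    B1 = U1[:, keep].T @ Y1.reshape(len(Y1), -1)  # shape (n_kept, n_targets)
    return A1, B1, S1[keep]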
# %%

# we use a denser range of regularization parameters to make
# the speed up more visible
alphas = np.geomspace(1e-8, 1e-1, 20)

cv = KFold(n_splits=2, shuffle=True, random_state=SEED)

skmatter_ridge_2foldcv_cutoff = RidgeRegression2FoldCV(
    alphas=alphas,
    regularization_method="cutoff",
    cv=cv,
)

skmatter_ridge_2foldcv_tikhonov = RidgeRegression2FoldCV(
    alphas=alphas,
    regularization_method="tikhonov",
    cv=cv,
)

sklearn_ridge_loocv_tikhonov = RidgeCV(
    alphas=alphas, cv=None, fit_intercept=False  # remove the influence of learning a bias
)

print("skmatter 2-fold CV cutoff")
micro_bench(skmatter_ridge_2foldcv_cutoff)
print()
print("skmatter 2-fold CV tikhonov")
micro_bench(skmatter_ridge_2foldcv_tikhonov)
print()
print("sklearn LOO CV tikhonov")
micro_bench(sklearn_ridge_loocv_tikhonov)


# %%
# We also want to note that these benchmarks show large deviations from run to
# run, and that more robust benchmarking methods would be more appropriate for
# this situation. However, we do not do that here, as we try to keep the
# computation of these examples as minimal as possible.

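# %%
# As a sketch of what a slightly more robust timing could look like (if one
# were willing to spend the extra compute), ``timeit.repeat`` could replace
# the single ``time.time()`` pair in ``micro_bench``. It is shown as a
# commented rST code block so the example stays cheap to run; the estimator
# and data names simply reuse the ones defined above.
#
# .. code-block:: python
#
#     import timeit
#
#     fit_times = timeit.repeat(
#         lambda: skmatter_ridge_2foldcv_cutoff.fit(X, y),
#         number=1,
#         repeat=10,
#     )
#     print(f"best of 10 fits: {min(fit_times):.2f} s")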
# %%
# References
# ----------
# .. [loocv] Rifkin "Regularized Least Squares."
# https://www.mit.edu/~9.520/spring07/Classes/rlsslides.pdf
