Added SVRG optimizer documentation and tests for stability and convergence #2

Open · wants to merge 1 commit into main
32 changes: 32 additions & 0 deletions optimizers.py
@@ -0,0 +1,32 @@
import numpy as np


class SVRG:
    """
    Stochastic Variance Reduced Gradient (SVRG) optimizer.

    Attributes:
    -----------
    lr : float
        Learning rate.
    n_inner : int
        Number of inner-loop updates to run between full-gradient (snapshot) recomputations.
    """

    def __init__(self, lr=0.01, n_inner=10):
        self.lr = lr
        self.n_inner = n_inner

    def update(self, params, grads, snapshot_grads, full_grads):
        """
        Apply one variance-reduced SVRG update in place.

        Parameters:
        -----------
        params : list of np.ndarray
            Model parameters to update (modified in place).
        grads : list of np.ndarray
            Mini-batch gradients evaluated at the current parameters.
        snapshot_grads : list of np.ndarray
            Gradients of the same mini-batch evaluated at the snapshot parameters.
        full_grads : list of np.ndarray
            Full-batch gradients evaluated at the snapshot parameters.
        """
        for i in range(len(params)):
            # SVRG update: w <- w - lr * (grad_i(w) - grad_i(w_snapshot) + mu)
            params[i] -= self.lr * (grads[i] - snapshot_grads[i] + full_grads[i])
87 changes: 85 additions & 2 deletions statsmodels_sgd/base_model.py
@@ -1,6 +1,7 @@
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from optimizers import SVRG  # SVRG optimizer
from .tools import (
    add_constant,
    calculate_standard_errors,
@@ -37,6 +38,9 @@ def __init__(
epochs=1000,
batch_size=32,
clip_value=1.0,
        optimizer='sgd',
n_inner=10,
**kwargs
):
super().__init__()
self.linear = nn.Linear(n_features, 1)
@@ -45,12 +49,91 @@ def __init__(
self.batch_size = batch_size
self.clip_value = clip_value
self.results_ = None
        # Initialize the optimizer based on the user's choice
        if optimizer == 'svrg':
            self.optimizer = SVRG(lr=learning_rate, n_inner=n_inner)  # variance-reduced optimizer
        elif optimizer == 'sgd':
            self.optimizer = None  # fall back to the manual SGD update in fit()
        else:
            raise ValueError(f"Unsupported optimizer: {optimizer}")

def forward(self, x):
return self.linear(x)

def fit(self, X, y, sample_weight=None):
raise NotImplementedError("Subclasses must implement this method")
def fit(self, X, y, sample_weight=None):
"""
Fit the model to the training data.

Parameters:
-----------
X : torch.Tensor
Input features of shape (n_samples, n_features).
y : torch.Tensor
Target labels of shape (n_samples,).
sample_weight : torch.Tensor, optional
Sample weights for weighted loss calculation.

Raises:
-------
ValueError: If input dimensions are inconsistent.
"""
        try:
            # Ensure the model is in training mode
            self.train()

            # Convert inputs to torch tensors if they are not already
            X = torch.as_tensor(X, dtype=torch.float32)
            y = torch.as_tensor(y, dtype=torch.float32).view(-1, 1)  # reshape y to a column vector

            # Check input dimensions
            if X.size(0) != y.size(0):
                raise ValueError("Number of samples in X and y must be the same.")

            n_samples = X.size(0)
            loss_fn = nn.MSELoss()

            for epoch in range(self.epochs):
                for i in range(0, n_samples, self.batch_size):
                    # Get mini-batch
                    batch_X = X[i:i + self.batch_size]
                    batch_y = y[i:i + self.batch_size]

                    # Zero gradients
                    self.linear.zero_grad()

                    # Forward pass: compute predictions for the mini-batch
                    predictions = self.linear(batch_X)

                    # Mean squared error loss for regression
                    loss = loss_fn(predictions, batch_y)

                    # Backward pass: compute gradients of the loss w.r.t. the parameters
                    loss.backward()

                    # Update weights using the selected optimizer
                    if isinstance(self.optimizer, SVRG):
                        # NOTE: a complete SVRG implementation stores snapshot parameters
                        # and periodically recomputes the full-batch gradient at that
                        # snapshot. That machinery is not implemented here yet, so the
                        # current mini-batch gradients stand in for the snapshot and
                        # full-batch gradients, and this update reduces to plain SGD.
                        params = [p.data for p in self.linear.parameters()]
                        grads = [p.grad for p in self.linear.parameters()]
                        self.optimizer.update(params, grads, snapshot_grads=grads, full_grads=grads)
                    else:
                        # Standard SGD update
                        for param in self.linear.parameters():
                            param.data -= self.learning_rate * param.grad.data

                if epoch % 100 == 0:
                    print(f"Epoch {epoch}: Loss = {loss.item()}")

            self.results_ = predictions.detach().numpy()

        except Exception as e:
            print(f"An error occurred during training: {e}")

def predict(self, X):
raise NotImplementedError("Subclasses must implement this method")
38 changes: 38 additions & 0 deletions statsmodels_sgd/docs/svrg_optimizer.md
@@ -0,0 +1,38 @@
# Stochastic Variance Reduced Gradient (SVRG) Optimizer

## Overview

The **Stochastic Variance Reduced Gradient (SVRG)** optimizer is an optimization technique that reduces the variance of the gradient estimates used in stochastic gradient descent. By periodically computing a full-batch gradient at a snapshot of the parameters and using it to correct the mini-batch gradients, SVRG provides more stable and often faster convergence, which is especially useful for large datasets and deep learning models.
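
In each inner-loop step, SVRG replaces the plain mini-batch gradient with the variance-reduced estimate `grad_i(w) - grad_i(w_snapshot) + mu`, where `mu` is the full-batch gradient computed at the snapshot parameters `w_snapshot`. The sketch below illustrates this outer/inner-loop structure in NumPy; it is a minimal illustration rather than the implementation in `optimizers.py`, and the `grad_fn(w, indices)` helper (returning the average gradient over the given sample indices) is an assumption, not part of this package.

```python
import numpy as np

def svrg_sketch(grad_fn, n_samples, w, lr=0.01, n_inner=10, n_outer=5):
    """Minimal SVRG loop; `grad_fn(w, indices)` is assumed to return the
    average gradient of the loss over the given sample indices at `w`."""
    rng = np.random.default_rng(0)
    for _ in range(n_outer):
        w_snapshot = w.copy()
        # Full-batch gradient at the snapshot (the "mu" term)
        mu = grad_fn(w_snapshot, np.arange(n_samples))
        for _ in range(n_inner):
            i = rng.integers(0, n_samples, size=1)  # one random sample
            # Variance-reduced gradient estimate
            g = grad_fn(w, i) - grad_fn(w_snapshot, i) + mu
            w = w - lr * g
    return w
```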

## Key Features
- **Variance Reduction**: SVRG reduces gradient variance, leading to more stable parameter updates.
- **Improved Convergence**: It converges faster than standard SGD for many large-scale learning tasks.
- **Inner Loop Updates**: The algorithm performs a series of "inner loop" mini-batch updates between recomputations of the full-batch gradient at a snapshot of the parameters.

## Parameters

| Parameter | Type | Description |
|---------------|---------|-------------------------------------------------------------------------------------------------|
| `lr` | `float` | Learning rate for gradient updates. Default is `0.01`. |
| `n_inner` | `int` | Number of inner loop updates before recalculating the full gradient. Default is `10`. |

## Usage Example

The SVRG optimizer is defined in `optimizers.py`. Below is a minimal example of a single update step on a list of NumPy parameter arrays.

```python
import numpy as np

from optimizers import SVRG

# Initialize the SVRG optimizer
lr = 0.01
n_inner = 10
svrg_optimizer = SVRG(lr=lr, n_inner=n_inner)

# Example parameters and gradients
params = [np.array([1.0, 2.0])]          # current parameters
grads = [np.array([0.5, 0.5])]           # mini-batch gradients at the current parameters
snapshot_grads = [np.array([0.4, 0.4])]  # gradients of the same mini-batch at the snapshot
full_grads = [np.array([0.3, 0.3])]      # full-batch gradients at the snapshot

# Perform a single update (modifies params in place)
svrg_optimizer.update(params, grads, snapshot_grads, full_grads)
```
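
For context, here is a hypothetical end-to-end loop on a toy least-squares problem showing where the three gradient lists come from. The data, the `batch_grad` helper, and the hyperparameters below are illustrative assumptions, not part of the package.

```python
import numpy as np

from optimizers import SVRG

# Toy least-squares problem: y = X @ w_true + noise
rng = np.random.default_rng(0)
X = rng.normal(size=(200, 2))
w_true = np.array([2.0, -3.0])
y = X @ w_true + 0.01 * rng.normal(size=200)

def batch_grad(w, idx):
    """Gradient of 0.5 * mean squared error over the rows in `idx`."""
    Xb, yb = X[idx], y[idx]
    return Xb.T @ (Xb @ w - yb) / len(idx)

svrg = SVRG(lr=0.1, n_inner=20)
params = [np.zeros(2)]

for outer in range(10):
    # Take a snapshot and compute the full-batch gradient at it
    snapshot = params[0].copy()
    full_grads = [batch_grad(snapshot, np.arange(len(y)))]
    for _ in range(svrg.n_inner):
        idx = rng.integers(0, len(y), size=10)        # random mini-batch
        grads = [batch_grad(params[0], idx)]          # gradient at the current parameters
        snapshot_grads = [batch_grad(snapshot, idx)]  # gradient at the snapshot
        svrg.update(params, grads, snapshot_grads, full_grads)

print(params[0])  # approaches w_true
```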
71 changes: 71 additions & 0 deletions statsmodels_sgd/tests/test_optimizers.py
@@ -0,0 +1,71 @@
import unittest
import numpy as np
from optimizers import SVRG

class TestSVRGOptimizer(unittest.TestCase):

def setUp(self):
"""
Set up the parameters for SVRG optimizer tests.
"""
self.lr = 0.01
self.n_inner = 10
self.svrg = SVRG(lr=self.lr, n_inner=self.n_inner)
self.params = [np.array([1.0, 2.0])]
self.initial_params = self.params[0].copy() # Store initial params for later comparison

    def test_single_update(self):
        """
        Test a single update of the SVRG optimizer.
        """
        grads = [np.array([0.5, 0.5])]
        snapshot_grads = [np.array([0.4, 0.4])]
        full_grads = [np.array([0.3, 0.3])]

        # Update
        self.svrg.update(self.params, grads, snapshot_grads, full_grads)

        # Expected output after one update (hand-calculated)
        expected_params = self.initial_params - self.lr * (grads[0] - snapshot_grads[0] + full_grads[0])

        # Assertions
        self.assertTrue(np.allclose(self.params[0], expected_params),
                        msg="Single update failed: Expected parameters do not match.")

    def test_multiple_updates(self):
        """
        Test multiple updates of the SVRG optimizer.
        """
        updates = [
            (np.array([0.5, 0.5]), np.array([0.4, 0.4]), np.array([0.3, 0.3])),
            (np.array([0.1, 0.1]), np.array([0.1, 0.1]), np.array([0.1, 0.1])),
            (np.array([0.4, 0.4]), np.array([0.3, 0.3]), np.array([0.2, 0.2])),
        ]

        for grads, snapshot_grads, full_grads in updates:
            self.svrg.update(self.params, [grads], [snapshot_grads], [full_grads])

        # Calculate expected params after multiple updates
        expected_params = self.initial_params.copy()
        for grads, snapshot_grads, full_grads in updates:
            expected_params -= self.lr * (grads - snapshot_grads + full_grads)

        # Assertions
        self.assertTrue(np.allclose(self.params[0], expected_params),
                        msg="Multiple updates failed: Expected parameters do not match after multiple updates.")

    def test_zero_gradients(self):
        """
        Test the SVRG optimizer behavior with zero gradients.
        """
        grads = [np.zeros(2)]
        snapshot_grads = [np.zeros(2)]
        full_grads = [np.zeros(2)]

        # Update with zero gradients
        self.svrg.update(self.params, grads, snapshot_grads, full_grads)

        # Expected parameters should remain unchanged
        self.assertTrue(np.array_equal(self.params[0], self.initial_params),
                        msg="Update with zero gradients should not change parameters.")

if __name__ == '__main__':
unittest.main()