
Commit

Merge pull request #380 from jdb78/feature/simple_models
Add recurrent and mlp models
jdb78 authored Mar 7, 2021
2 parents 48179d2 + 66cf2e9 commit 884106c
Showing 13 changed files with 823 additions and 32 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -5,12 +5,15 @@
### Added

- Adding a filter functionality to the timeseries dataset (#329)
- Add simple models such as LSTM, GRU and an MLP on the decoder (#380)
- Allow usage of any torch optimizer such as SGD (#380)

### Fixed

- Moving predictions to CPU to avoid running out of memory (#329)
- Correct determination of `output_size` for multi-target forecasting with the TemporalFusionTransformer (#328)
- Tqdm autonotebook fix to work outside of Jupyter (#338)
- Fix issue with yaml serialization for TensorboardLogger (#379)

### Contributors

2 changes: 2 additions & 0 deletions README.md
@@ -49,6 +49,8 @@ documentation with detailed tutorials.
methods in the M4 competition. The M4 competition is arguably the most important benchmark for univariate time series forecasting.
- [DeepAR: Probabilistic forecasting with autoregressive recurrent networks](https://www.sciencedirect.com/science/article/pii/S0169207019301888)
which is one of the most popular forecasting algorithms and is often used as a baseline
- A baseline model that always predicts the latest known value
- Simple standard networks for baselining: LSTM and GRU networks as well as an MLP on the decoder

To implement new models, see the [How to implement new models tutorial](https://pytorch-forecasting.readthedocs.io/en/latest/tutorials/building.html).
It covers basic as well as advanced architectures.
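
A minimal usage sketch of the newly listed baselining networks. This is illustrative only: ``training`` is a placeholder name for an existing TimeSeriesDataSet, and the shown hyperparameters (including ``cell_type``) are assumptions rather than values taken from this diff.

from pytorch_forecasting import RecurrentNetwork

# "training" is a placeholder TimeSeriesDataSet defined elsewhere
rnn = RecurrentNetwork.from_dataset(
    training,
    cell_type="LSTM",  # assumed parameter; "GRU" would select the GRU variant
    hidden_size=32,
)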
2 changes: 2 additions & 0 deletions docs/source/models.rst
@@ -24,6 +24,8 @@ and you should take into account. Here is an overview over the pros and cons of
.. csv-table:: Model comparison
:header: "Name", "Covariates", "Multiple targets", "Regression", "Classification", "Probabilistic", "Uncertainty", "Interactions between series", "Flexible history length", "Cold-start", "Required computational resources (1-5, 5=most)"

:py:class:`~pytorch_forecasting.models.rnn.RecurrentNetwork`, "x", "x", "x", "", "", "", "", "x", "", 2
:py:class:`~pytorch_forecasting.models.mlp.DecoderMLP`, "x", "x", "x", "x", "", "x", "", "x", "x", 1
:py:class:`~pytorch_forecasting.models.nbeats.NBeats`, "", "", "x", "", "", "", "", "", "", 1
:py:class:`~pytorch_forecasting.models.deepar.DeepAR`, "x", "x", "x", "", "x", "x", "", "x", "", 3
:py:class:`~pytorch_forecasting.models.temporal_fusion_transformer.TemporalFusionTransformer`, "x", "x", "x", "x", "", "x", "", "x", "x", 4
4 changes: 4 additions & 0 deletions pytorch_forecasting/__init__.py
@@ -36,9 +36,11 @@
Baseline,
BaseModel,
BaseModelWithCovariates,
DecoderMLP,
DeepAR,
MultiEmbedding,
NBeats,
RecurrentNetwork,
TemporalFusionTransformer,
get_rnn,
)
@@ -85,6 +87,8 @@
"get_embedding_size",
"create_mask",
"to_list",
"RecurrentNetwork",
"DecoderMLP",
]

__version__ = "0.0.0"
4 changes: 4 additions & 0 deletions pytorch_forecasting/models/__init__.py
@@ -9,13 +9,16 @@
)
from pytorch_forecasting.models.baseline import Baseline
from pytorch_forecasting.models.deepar import DeepAR
from pytorch_forecasting.models.mlp import DecoderMLP
from pytorch_forecasting.models.nbeats import NBeats
from pytorch_forecasting.models.nn import GRU, LSTM, MultiEmbedding, get_rnn
from pytorch_forecasting.models.rnn import RecurrentNetwork
from pytorch_forecasting.models.temporal_fusion_transformer import TemporalFusionTransformer

__all__ = [
"NBeats",
"TemporalFusionTransformer",
"RecurrentNetwork",
"DeepAR",
"BaseModel",
"Baseline",
@@ -26,4 +29,5 @@
"LSTM",
"GRU",
"MultiEmbedding",
"DecoderMLP",
]
92 changes: 89 additions & 3 deletions pytorch_forecasting/models/base_model.py
@@ -20,7 +20,16 @@

from pytorch_forecasting.data import TimeSeriesDataSet
from pytorch_forecasting.data.encoders import EncoderNormalizer, GroupNormalizer, MultiNormalizer, NaNLabelEncoder
from pytorch_forecasting.metrics import MASE, SMAPE, DistributionLoss, Metric, MultiLoss
from pytorch_forecasting.metrics import (
MAE,
MASE,
SMAPE,
DistributionLoss,
Metric,
MultiHorizonMetric,
MultiLoss,
QuantileLoss,
)
from pytorch_forecasting.optim import Ranger
from pytorch_forecasting.utils import apply_to_list, create_mask, get_embedding_size, groupby_apply, to_list

@@ -154,6 +163,7 @@ def __init__(
reduce_on_plateau_patience: int = 1000,
reduce_on_plateau_min_lr: float = 1e-5,
weight_decay: float = 0.0,
optimizer_params: Dict[str, Any] = None,
monotone_constaints: Dict[str, int] = {},
output_transformer: Callable = None,
optimizer="ranger",
@@ -177,14 +187,16 @@
reduce_on_plateau_min_lr (float): minimum learning rate for reduce on plateau learning rate scheduler.
Defaults to 1e-5
weight_decay (float): weight decay. Defaults to 0.0.
optimizer_params (Dict[str, Any]): additional parameters for the optimizer. Defaults to {}.
monotone_constaints (Dict[str, int]): dictionary of monotonicity constraints for continuous decoder
variables mapping
position (e.g. ``"0"`` for first position) to constraint (``-1`` for negative and ``+1`` for positive,
larger numbers add more weight to the constraint vs. the loss but are usually not necessary).
This constraint significantly slows down training. Defaults to {}.
output_transformer (Callable): transformer that takes network output and transforms it to prediction space.
Defaults to None which is equivalent to ``lambda out: out["prediction"]``.
optimizer (str): Optimizer, "ranger", "adam" or "adamw". Defaults to "ranger".
optimizer (str): Optimizer, "ranger", "sgd", "adam", "adamw" or class name of optimizer in ``torch.optim``.
Defaults to "ranger".
"""
super().__init__()
# update hparams
@@ -203,6 +215,21 @@ def __init__(
if not hasattr(self, "output_transformer"):
self.output_transformer = output_transformer

@property
def n_targets(self) -> int:
"""
Number of targets to forecast.
Based on loss function.
Returns:
int: number of targets
"""
if isinstance(self.loss, MultiLoss):
return len(self.loss.metrics)
else:
return 1

def transform_output(self, out: Dict[str, torch.Tensor]) -> torch.Tensor:
"""
Extract prediction from network output and rescale it to real space / de-normalize it.
@@ -251,6 +278,52 @@ def transform_output(self, out: Dict[str, torch.Tensor]) -> torch.Tensor:
out = self.output_transformer(out)
return out

@staticmethod
def deduce_default_output_parameters(
dataset: TimeSeriesDataSet, kwargs: Dict[str, Any], default_loss: MultiHorizonMetric = None
) -> Dict[str, Any]:
"""
Deduce default parameters for output for `from_dataset()` method.
Determines ``output_size`` and ``loss`` parameters.
Args:
dataset (TimeSeriesDataSet): timeseries dataset
kwargs (Dict[str, Any]): current hyperparameters
default_loss (MultiHorizonMetric, optional): default loss function.
Defaults to :py:class:`~pytorch_forecasting.metrics.MAE`.
Returns:
Dict[str, Any]: dictionary with ``output_size`` and ``loss``.
"""
# infer output size
def get_output_size(normalizer, loss):
if isinstance(loss, QuantileLoss):
return len(loss.quantiles)
elif isinstance(normalizer, NaNLabelEncoder):
return len(normalizer.classes_)
else:
return 1

# handle multiple targets
new_kwargs = {}
n_targets = len(dataset.target_names)
if default_loss is None:
default_loss = MAE()
loss = kwargs.get("loss", default_loss)
if n_targets > 1: # try to infer number of output sizes
if not isinstance(loss, MultiLoss):
loss = MultiLoss([deepcopy(loss)] * n_targets)
new_kwargs["loss"] = loss
if isinstance(loss, MultiLoss) and "output_size" not in kwargs:
new_kwargs["output_size"] = [
get_output_size(normalizer, l)
for normalizer, l in zip(dataset.target_normalizer.normalizers, loss.metrics)
]
elif "output_size" not in kwargs:
new_kwargs["output_size"] = get_output_size(dataset.target_normalizer, loss)
return new_kwargs
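# Illustrative example (not part of this commit): called with
# default_loss=QuantileLoss() on a single-target regression dataset without an
# explicit "output_size", this returns {"output_size": 7} for the library's 7
# default quantiles; for two regression targets it additionally wraps the loss,
# returning {"loss": MultiLoss([QuantileLoss(), QuantileLoss()]), "output_size": [7, 7]}.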

def size(self) -> int:
"""
get number of parameters in model
@@ -673,6 +746,10 @@ def configure_optimizers(self):
Tuple[List]: first entry is list of optimizers and second is list of schedulers
"""
# either set a schedule of lrs or find it dynamically
if self.hparams.optimizer_params is None:
optimizer_params = {}
else:
optimizer_params = self.hparams.optimizer_params
if isinstance(self.hparams.learning_rate, (list, tuple)): # set schedule
lrs = self.hparams.learning_rate
if self.hparams.optimizer == "adam":
@@ -681,8 +758,17 @@
optimizer = torch.optim.AdamW(self.parameters(), lr=lrs[0])
elif self.hparams.optimizer == "ranger":
optimizer = Ranger(self.parameters(), lr=lrs[0], weight_decay=self.hparams.weight_decay)
elif self.hparams.optimizer == "sgd":
optimizer = torch.optim.SGD(
self.parameters(), lr=lrs[0], weight_decay=self.hparams.weight_decay, **optimizer_params
)
else:
raise ValueError(f"Optimizer of self.hparams.optimizer={self.hparams.optimizer} unknown")
try:
optimizer = getattr(torch.optim, self.hparams.optimizer)(
self.parameters(), lr=lrs[0], weight_decay=self.hparams.weight_decay, **optimizer_params
)
except AttributeError:
raise ValueError(f"Optimizer of self.hparams.optimizer={self.hparams.optimizer} unknown")
# normalize lrs
lrs = np.array(lrs) / lrs[0]
schedulers = [
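
To illustrate the optimizer handling added above, a hedged sketch: the model class, dataset name, and hyperparameter values are placeholders; only the ``optimizer``/``optimizer_params`` hyperparameters and the SGD/torch.optim fallback behaviour come from this diff.

from pytorch_forecasting import TemporalFusionTransformer

tft = TemporalFusionTransformer.from_dataset(
    training,                            # placeholder TimeSeriesDataSet
    learning_rate=0.03,
    weight_decay=1e-2,
    optimizer="sgd",                     # handled by the new SGD branch
    optimizer_params={"momentum": 0.9},  # forwarded to torch.optim.SGD
)

# Any class name from torch.optim is also accepted and resolved via getattr,
# e.g. optimizer="Adagrad"; unknown names still raise a ValueError.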
1 change: 0 additions & 1 deletion pytorch_forecasting/models/deepar/__init__.py
@@ -176,7 +176,6 @@ def from_dataset(
Returns:
DeepAR network
"""
# assert fixed encoder and decoder length for the moment
new_kwargs = {}
if dataset.multi_target:
new_kwargs.setdefault("loss", MultiLoss([NormalDistributionLoss()] * len(dataset.target_names)))
155 changes: 155 additions & 0 deletions pytorch_forecasting/models/mlp/__init__.py
@@ -0,0 +1,155 @@
"""
Simple models based on fully connected networks
"""


from typing import Dict, List, Tuple, Union

import numpy as np
import torch
from torch import nn

from pytorch_forecasting.data import TimeSeriesDataSet
from pytorch_forecasting.metrics import MAE, MAPE, MASE, RMSE, SMAPE, MultiHorizonMetric, QuantileLoss
from pytorch_forecasting.models.base_model import BaseModelWithCovariates
from pytorch_forecasting.models.mlp.submodules import FullyConnectedModule
from pytorch_forecasting.models.nn.embeddings import MultiEmbedding


class DecoderMLP(BaseModelWithCovariates):
"""
MLP on the decoder.
MLP that predicts output only based on information available in the decoder.
"""

def __init__(
self,
activation_class: str = "ReLU",
hidden_size: int = 300,
n_hidden_layers: int = 3,
dropout: float = 0.1,
norm: bool = True,
static_categoricals: List[str] = [],
static_reals: List[str] = [],
time_varying_categoricals_encoder: List[str] = [],
time_varying_categoricals_decoder: List[str] = [],
categorical_groups: Dict[str, List[str]] = {},
time_varying_reals_encoder: List[str] = [],
time_varying_reals_decoder: List[str] = [],
embedding_sizes: Dict[str, Tuple[int, int]] = {},
embedding_paddings: List[str] = [],
embedding_labels: Dict[str, np.ndarray] = {},
x_reals: List[str] = [],
x_categoricals: List[str] = [],
output_size: Union[int, List[int]] = 1,
target: Union[str, List[str]] = None,
loss: MultiHorizonMetric = None,
logging_metrics: nn.ModuleList = None,
**kwargs,
):
"""
Args:
activation_class (str, optional): PyTorch activation class. Defaults to "ReLU".
hidden_size (int, optional): hidden size of the MLP - the most important hyperparameter along with
``n_hidden_layers``. Defaults to 300.
n_hidden_layers (int, optional): Number of hidden layers - important hyperparameter. Defaults to 3.
dropout (float, optional): Dropout. Defaults to 0.1.
norm (bool, optional): if to use normalization in the MLP. Defaults to True.
static_categoricals: integer of positions of static categorical variables
static_reals: integer of positions of static continuous variables
time_varying_categoricals_encoder: integer of positions of categorical variables for encoder
time_varying_categoricals_decoder: integer of positions of categorical variables for decoder
time_varying_reals_encoder: integer of positions of continuous variables for encoder
time_varying_reals_decoder: integer of positions of continuous variables for decoder
categorical_groups: dictionary where values
are list of categorical variables that are forming together a new categorical
variable which is the key in the dictionary
x_reals: order of continuous variables in tensor passed to forward function
x_categoricals: order of categorical variables in tensor passed to forward function
embedding_sizes: dictionary mapping (string) indices to tuple of number of categorical classes and
embedding size
embedding_paddings: list of indices for embeddings which transform the zero's embedding to a zero vector
embedding_labels: dictionary mapping (string) indices to list of categorical labels
output_size (Union[int, List[int]], optional): number of outputs (e.g. number of quantiles for
QuantileLoss and one target or list of output sizes).
target (str, optional): Target variable or list of target variables. Defaults to None.
loss (MultiHorizonMetric, optional): loss function taking prediction and targets.
Defaults to QuantileLoss.
logging_metrics (nn.ModuleList, optional): Metrics to log during training.
Defaults to nn.ModuleList([SMAPE(), MAE(), RMSE(), MAPE(), MASE()]).
"""
if loss is None:
loss = QuantileLoss()
if logging_metrics is None:
logging_metrics = nn.ModuleList([SMAPE(), MAE(), RMSE(), MAPE(), MASE()])
self.save_hyperparameters()
# store loss function separately as it is a module
super().__init__(loss=loss, logging_metrics=logging_metrics, **kwargs)

self.input_embeddings = MultiEmbedding(
embedding_sizes={
name: val
for name, val in embedding_sizes.items()
if name in self.decoder_variables + self.static_variables
},
embedding_paddings=embedding_paddings,
categorical_groups=categorical_groups,
x_categoricals=x_categoricals,
)
# define network
if isinstance(self.hparams.output_size, int):
mlp_output_size = self.hparams.output_size
else:
mlp_output_size = sum(self.hparams.output_size)

cont_size = len(self.decoder_reals_positions)
cat_size = sum([emb.embedding_dim for emb in self.input_embeddings.values()])
input_size = cont_size + cat_size

self.mlp = FullyConnectedModule(
dropout=dropout,
norm=self.hparams.norm,
activation_class=getattr(nn, self.hparams.activation_class),
input_size=input_size,
output_size=mlp_output_size,
hidden_size=self.hparams.hidden_size,
n_hidden_layers=self.hparams.n_hidden_layers,
)

@property
def decoder_reals_positions(self) -> List[int]:
return [
self.hparams.x_reals.index(name)
for name in self.reals
if name in self.decoder_variables + self.static_variables
]

def forward(self, x: Dict[str, torch.Tensor], n_samples: int = None) -> Dict[str, torch.Tensor]:
"""
Forward network
"""
# x is a batch generated based on the TimeSeriesDataset
batch_size = x["decoder_lengths"].size(0)
embeddings = self.input_embeddings(x["decoder_cat"]) # returns dictionary with embedding tensors
network_input = torch.cat(
[x["decoder_cont"][..., self.decoder_reals_positions]] + list(embeddings.values()),
dim=-1,
)
prediction = self.mlp(network_input.view(-1, self.mlp.input_size)).view(
batch_size, network_input.size(1), self.mlp.output_size
)

# cut prediction into pieces for multiple targets
if self.n_targets > 1:
prediction = torch.split(prediction, self.hparams.output_size, dim=-1)

# We need to return a dictionary that at least contains the prediction and the target_scale.
# The parameter can be directly forwarded from the input.
return dict(prediction=prediction, target_scale=x["target_scale"])

@classmethod
def from_dataset(cls, dataset: TimeSeriesDataSet, **kwargs):
new_kwargs = cls.deduce_default_output_parameters(dataset, kwargs, QuantileLoss())
kwargs.update(new_kwargs)
return super().from_dataset(dataset, **kwargs)
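
A hedged usage sketch for the new DecoderMLP: ``training`` is a placeholder TimeSeriesDataSet and the hyperparameter values are illustrative; the output_size behaviour noted in the comments follows ``deduce_default_output_parameters`` in base_model.py above.

from pytorch_forecasting import DecoderMLP

model = DecoderMLP.from_dataset(
    training,          # placeholder TimeSeriesDataSet
    hidden_size=64,
    n_hidden_layers=2,
    dropout=0.1,
)
# With the default QuantileLoss, output_size is deduced as len(loss.quantiles)
# (7 for the default quantiles); for multi-target datasets it becomes a list
# with one entry per target, and forward() splits the prediction accordingly.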