diff --git a/README.md b/README.md index 86e52e6f..9b294fa2 100644 --- a/README.md +++ b/README.md @@ -192,39 +192,42 @@ The paper references are all listed at the bottom of this readme file. Please re 🌟 Since **v0.2**, all neural-network models in PyPOTS has got hyperparameter-optimization support. This functionality is implemented with the [Microsoft NNI](https://github.com/microsoft/nni) framework. -| ***`Imputation`*** | 🚥 | 🚥 | 🚥 | -|:----------------------:|:-----------:|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:|:--------:| -| **Type** | **Abbr.** | **Full name of the algorithm/model** | **Year** | -| Neural Net | SAITS | Self-Attention-based Imputation for Time Series [^1] | 2023 | -| Neural Net | Transformer | Attention is All you Need [^2];
Self-Attention-based Imputation for Time Series [^1];
Note: proposed in [^2], and re-implemented as an imputation model in [^1]. | 2017 | -| Neural Net | Crossformer | Transformer Utilizing Cross-Dimension Dependency for Multivariate Time Series Forecasting [^16] | 2023 | -| Neural Net | TimesNet | Temporal 2D-Variation Modeling for General Time Series Analysis [^14] | 2023 | -| Neural Net | PatchTST | A Time Series is Worth 64 Words: Long-Term Forecasting with Transformers [^18] | 2023 | -| Neural Net | DLinear | Are Transformers Effective for Time Series Forecasting? [^17] | 2023 | -| Neural Net | ETSformer | Exponential Smoothing Transformers for Time-series Forecasting [^19] | 2023 | -| Neural Net | FEDformer | Frequency Enhanced Decomposed Transformer for Long-term Series Forecasting [^20] | 2022 | -| Neural Net | Informer | Beyond Efficient Transformer for Long Sequence Time-Series Forecasting [^21] | 2021 | -| Neural Net | Autoformer | Decomposition Transformers with Auto-Correlation for Long-Term Series Forecasting [^15] | 2021 | -| Neural Net | CSDI | Conditional Score-based Diffusion Models for Probabilistic Time Series Imputation [^12] | 2021 | -| Neural Net | US-GAN | Unsupervised GAN for Multivariate Time Series Imputation [^10] | 2021 | -| Neural Net | GP-VAE | Gaussian Process Variational Autoencoder [^11] | 2020 | -| Neural Net | BRITS | Bidirectional Recurrent Imputation for Time Series [^3] | 2018 | -| Neural Net | M-RNN | Multi-directional Recurrent Neural Network [^9] | 2019 | -| Naive | LOCF/NOCB | Last Observation Carried Forward / Next Observation Carried Backward | - | -| Naive | Median | Median Value Imputation | - | -| Naive | Mean | Mean Value Imputation | - | -| ***`Classification`*** | 🚥 | 🚥 | 🚥 | -| **Type** | **Abbr.** | **Full name of the algorithm/model/paper** | **Year** | -| Neural Net | BRITS | Bidirectional Recurrent Imputation for Time Series [^3] | 2018 | -| Neural Net | GRU-D | Recurrent Neural Networks for Multivariate Time Series with Missing Values [^4] | 2018 | -| Neural Net 
| Raindrop | Graph-Guided Network for Irregularly Sampled Multivariate Time Series [^5] | 2022 | -| ***`Clustering`*** | 🚥 | 🚥 | 🚥 | -| **Type** | **Abbr.** | **Full name of the algorithm/model/paper** | **Year** | -| Neural Net | CRLI | Clustering Representation Learning on Incomplete time-series data [^6] | 2021 | -| Neural Net | VaDER | Variational Deep Embedding with Recurrence [^7] | 2019 | -| ***`Forecasting`*** | 🚥 | 🚥 | 🚥 | -| **Type** | **Abbr.** | **Full name of the algorithm/model/paper** | **Year** | -| Probabilistic | BTTF | Bayesian Temporal Tensor Factorization [^8] | 2021 | +🔥 Note that Transformer, Crossformer, PatchTST, DLinear, ETSformer, FEDformer, Informer, and Autoformer were not proposed as imputation methods in their original papers, +and they cannot accept POTS as input. **To make them applicable to POTS data, we apply the same embedding strategy as in the [SAITS paper](https://arxiv.org/pdf/2202.08516).** + +| ***`Imputation`*** | 🚥 | 🚥 | 🚥 | +|:----------------------:|:-----------:|:-----------------------------------------------------------------------------------------------:|:--------:| +| **Type** | **Abbr.** | **Full name of the algorithm/model** | **Year** | +| Neural Net | SAITS | Self-Attention-based Imputation for Time Series [^1] | 2023 | +| Neural Net | Transformer | Attention is All you Need [^2] | 2017 | +| Neural Net | Crossformer | Transformer Utilizing Cross-Dimension Dependency for Multivariate Time Series Forecasting [^16] | 2023 | +| Neural Net | TimesNet | Temporal 2D-Variation Modeling for General Time Series Analysis [^14] | 2023 | +| Neural Net | PatchTST | A Time Series is Worth 64 Words: Long-Term Forecasting with Transformers [^18] | 2023 | +| Neural Net | DLinear | Are Transformers Effective for Time Series Forecasting? 
[^17] | 2023 | +| Neural Net | ETSformer | Exponential Smoothing Transformers for Time-series Forecasting [^19] | 2023 | +| Neural Net | FEDformer | Frequency Enhanced Decomposed Transformer for Long-term Series Forecasting [^20] | 2022 | +| Neural Net | Informer | Beyond Efficient Transformer for Long Sequence Time-Series Forecasting [^21] | 2021 | +| Neural Net | Autoformer | Decomposition Transformers with Auto-Correlation for Long-Term Series Forecasting [^15] | 2021 | +| Neural Net | CSDI | Conditional Score-based Diffusion Models for Probabilistic Time Series Imputation [^12] | 2021 | +| Neural Net | US-GAN | Unsupervised GAN for Multivariate Time Series Imputation [^10] | 2021 | +| Neural Net | GP-VAE | Gaussian Process Variational Autoencoder [^11] | 2020 | +| Neural Net | BRITS | Bidirectional Recurrent Imputation for Time Series [^3] | 2018 | +| Neural Net | M-RNN | Multi-directional Recurrent Neural Network [^9] | 2019 | +| Naive | LOCF/NOCB | Last Observation Carried Forward / Next Observation Carried Backward | - | +| Naive | Median | Median Value Imputation | - | +| Naive | Mean | Mean Value Imputation | - | +| ***`Classification`*** | 🚥 | 🚥 | 🚥 | +| **Type** | **Abbr.** | **Full name of the algorithm/model/paper** | **Year** | +| Neural Net | BRITS | Bidirectional Recurrent Imputation for Time Series [^3] | 2018 | +| Neural Net | GRU-D | Recurrent Neural Networks for Multivariate Time Series with Missing Values [^4] | 2018 | +| Neural Net | Raindrop | Graph-Guided Network for Irregularly Sampled Multivariate Time Series [^5] | 2022 | +| ***`Clustering`*** | 🚥 | 🚥 | 🚥 | +| **Type** | **Abbr.** | **Full name of the algorithm/model/paper** | **Year** | +| Neural Net | CRLI | Clustering Representation Learning on Incomplete time-series data [^6] | 2021 | +| Neural Net | VaDER | Variational Deep Embedding with Recurrence [^7] | 2019 | +| ***`Forecasting`*** | 🚥 | 🚥 | 🚥 | +| **Type** | **Abbr.** | **Full name of the algorithm/model/paper** | **Year** | +| 
Probabilistic | BTTF | Bayesian Temporal Tensor Factorization [^8] | 2021 | ## ❖ Citing PyPOTS diff --git a/pypots/__init__.py b/pypots/__init__.py index 8075ec06..566339bd 100644 --- a/pypots/__init__.py +++ b/pypots/__init__.py @@ -22,7 +22,7 @@ # # Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer. # 'X.Y.dev0' is the canonical version of 'X.Y.dev' -__version__ = "0.3.2" +__version__ = "0.4" from . import imputation, classification, clustering, forecasting, optim, data, utils diff --git a/pypots/imputation/autoformer/modules/core.py b/pypots/imputation/autoformer/modules/core.py index c3747fde..14cdb53c 100644 --- a/pypots/imputation/autoformer/modules/core.py +++ b/pypots/imputation/autoformer/modules/core.py @@ -5,6 +5,7 @@ # Created by Wenjie Du # License: BSD-3-Clause +import torch import torch.nn as nn from .submodules import ( @@ -38,7 +39,7 @@ def __init__( self.seq_len = n_steps self.n_layers = n_layers self.enc_embedding = DataEmbedding( - n_features, + n_features * 2, d_model, dropout=dropout, with_pos=False, @@ -63,28 +64,35 @@ def __init__( ) # for the imputation task, the output dim is the same as input dim - self.projection = nn.Linear(d_model, n_features) + self.output_projection = nn.Linear(d_model, n_features) def forward(self, inputs: dict, training: bool = True) -> dict: X, masks = inputs["X"], inputs["missing_mask"] - # embedding - enc_out = self.enc_embedding(X) # [B,T,C] + # WDU: the original Autoformer model wasn't proposed for the imputation task. Hence the model doesn't take + # the missing mask into account, which means, in the process, the model doesn't know which part of + # the input data is missing, and this may hurt the model's imputation performance. Therefore, I add the + # embedding layers to project the concatenation of features and masks into a hidden space, as well as + # the output layers to project back from the hidden space to the original space. 
+ # the same as SAITS, concatenate the time series data and the missing mask for embedding + input_X = torch.cat([X, masks], dim=2) + enc_out = self.enc_embedding(input_X) # Autoformer encoder processing enc_out, attns = self.encoder(enc_out) # project back the original data space - dec_out = self.projection(enc_out) + output = self.output_projection(enc_out) - imputed_data = masks * X + (1 - masks) * dec_out + imputed_data = masks * X + (1 - masks) * output results = { "imputed_data": imputed_data, } if training: # `loss` is always the item for backward propagating to update the model - loss = calc_mse(dec_out, inputs["X_ori"], inputs["indicating_mask"]) + loss = calc_mse(output, inputs["X_ori"], inputs["indicating_mask"]) results["loss"] = loss return results diff --git a/pypots/imputation/crossformer/modules/core.py b/pypots/imputation/crossformer/modules/core.py index 0cc9b07a..8eb04df6 100644 --- a/pypots/imputation/crossformer/modules/core.py +++ b/pypots/imputation/crossformer/modules/core.py @@ -33,6 +33,7 @@ def __init__( super().__init__() self.n_features = n_features + self.d_model = d_model # The padding operation to handle invisible sgemnet length pad_in_len = ceil(1.0 * n_steps / seg_len) * seg_len @@ -49,7 +50,7 @@ def __init__( 0, ) self.enc_pos_embedding = nn.Parameter( - torch.randn(1, n_features, in_seg_num, d_model) + torch.randn(1, d_model, in_seg_num, d_model) ) self.pre_norm = nn.LayerNorm(d_model) @@ -71,31 +72,40 @@ def __init__( ) self.head = FlattenHead(head_nf, n_steps, dropout) + self.embedding = nn.Linear(n_features * 2, d_model) + self.output_projection = nn.Linear(d_model, n_features) def forward(self, inputs: dict, training: bool = True) -> dict: X, masks = inputs["X"], inputs["missing_mask"] + # WDU: the original Crossformer model wasn't proposed for the imputation task. 
Hence the model doesn't take + # the missing mask into account, which means, in the process, the model doesn't know which part of + # the input data is missing, and this may hurt the model's imputation performance. Therefore, I add the + # embedding layers to project the concatenation of features and masks into a hidden space, as well as + # the output layers to project back from the hidden space to the original space. # embedding - x_enc = self.enc_value_embedding(X.permute(0, 2, 1)) + input_X = self.embedding(torch.cat([X, masks], dim=2)) + x_enc = self.enc_value_embedding(input_X.permute(0, 2, 1)) # Crossformer processing x_enc = rearrange( - x_enc, "(b d) seg_num d_model -> b d seg_num d_model", d=self.n_features + x_enc, "(b d) seg_num d_model -> b d seg_num d_model", d=self.d_model ) x_enc += self.enc_pos_embedding x_enc = self.pre_norm(x_enc) enc_out, attns = self.encoder(x_enc) # project back the original data space dec_out = self.head(enc_out[-1].permute(0, 1, 3, 2)).permute(0, 2, 1) + output = self.output_projection(dec_out) - imputed_data = masks * X + (1 - masks) * dec_out + imputed_data = masks * X + (1 - masks) * output results = { "imputed_data": imputed_data, } if training: # `loss` is always the item for backward propagating to update the model - loss = calc_mse(dec_out, inputs["X_ori"], inputs["indicating_mask"]) + loss = calc_mse(output, inputs["X_ori"], inputs["indicating_mask"]) results["loss"] = loss return results diff --git a/pypots/imputation/crossformer/modules/submodules.py b/pypots/imputation/crossformer/modules/submodules.py index 2a67a227..6a6f1c7b 100644 --- a/pypots/imputation/crossformer/modules/submodules.py +++ b/pypots/imputation/crossformer/modules/submodules.py @@ -144,11 +144,12 @@ def __init__( d_ff, depth, dropout, - seg_num=10, - factor=10, + seg_num, + factor, ): super().__init__() + d_k = d_model // n_heads if win_size > 1: self.merge_layer = SegMerging(d_model, win_size, nn.LayerNorm) else: @@ -158,7 +159,9 @@ def 
__init__( for i in range(depth): self.encode_layers.append( - TwoStageAttentionLayer(seg_num, factor, d_model, n_heads, d_ff, dropout) + TwoStageAttentionLayer( + seg_num, factor, d_model, n_heads, d_k, d_k, d_ff, dropout + ) ) def forward(self, x, attn_mask=None, tau=None, delta=None): diff --git a/pypots/imputation/dlinear/model.py b/pypots/imputation/dlinear/model.py index d5c5e84a..f6c89976 100644 --- a/pypots/imputation/dlinear/model.py +++ b/pypots/imputation/dlinear/model.py @@ -47,7 +47,11 @@ class DLinear(BaseNNImputer): The window size of moving average. individual : - Whether to share model across different features. + Whether to make a linear layer for each variate/channel/feature individually. + + d_model : + The dimension of the space in which the time-series data will be embedded and modeled. + It is necessary only for DLinear in the non-individual mode. batch_size : The batch size for training and evaluating the model. @@ -96,6 +100,7 @@ def __init__( n_features: int, moving_avg_window_size: int, individual: bool = False, + d_model: Optional[int] = None, batch_size: int = 32, epochs: int = 100, patience: int = None, @@ -120,6 +125,7 @@ def __init__( # model hype-parameters self.moving_avg_window_size = moving_avg_window_size self.individual = individual + self.d_model = d_model # set up the model self.model = _DLinear( @@ -127,6 +133,7 @@ def __init__( n_features, moving_avg_window_size, individual, + d_model, ) self._send_model_to_given_device() self._print_model_size() diff --git a/pypots/imputation/dlinear/modules/core.py b/pypots/imputation/dlinear/modules/core.py index e8e5ec35..18f33cec 100644 --- a/pypots/imputation/dlinear/modules/core.py +++ b/pypots/imputation/dlinear/modules/core.py @@ -5,6 +5,8 @@ # Created by Wenjie Du # License: BSD-3-Clause +from typing import Optional + import torch import torch.nn as nn @@ -19,6 +21,7 @@ def __init__( n_features: int, moving_avg_window_size: int, individual: bool = False, + d_model: Optional[int] = 
None, ): super().__init__() @@ -28,39 +31,48 @@ def __init__( self.individual = individual if individual: - self.Linear_Seasonal = nn.ModuleList() - self.Linear_Trend = nn.ModuleList() - - for i in range(self.n_features): - self.Linear_Seasonal.append(nn.Linear(self.n_steps, self.n_steps)) - self.Linear_Trend.append(nn.Linear(self.n_steps, self.n_steps)) - - self.Linear_Seasonal[i].weight = nn.Parameter( - (1 / self.n_steps) * torch.ones([self.n_steps, self.n_steps]) + # create linear layers for each feature individually + self.linear_seasonal = nn.ModuleList() + self.linear_trend = nn.ModuleList() + for i in range(n_features): + self.linear_seasonal.append(nn.Linear(n_steps, n_steps)) + self.linear_trend.append(nn.Linear(n_steps, n_steps)) + self.linear_seasonal[i].weight = nn.Parameter( + (1 / n_steps) * torch.ones([n_steps, n_steps]) ) - self.Linear_Trend[i].weight = nn.Parameter( - (1 / self.n_steps) * torch.ones([self.n_steps, self.n_steps]) + self.linear_trend[i].weight = nn.Parameter( + (1 / n_steps) * torch.ones([n_steps, n_steps]) ) else: - self.Linear_Seasonal = nn.Linear(self.n_steps, self.n_steps) - self.Linear_Trend = nn.Linear(self.n_steps, self.n_steps) - - self.Linear_Seasonal.weight = nn.Parameter( - (1 / self.n_steps) * torch.ones([self.n_steps, self.n_steps]) + if d_model is None: + raise ValueError( + "The argument d_model is necessary for DLinear in the non-individual mode." 
+ ) + self.linear_seasonal = nn.Linear(n_steps, n_steps) + self.linear_trend = nn.Linear(n_steps, n_steps) + self.linear_seasonal.weight = nn.Parameter( + (1 / n_steps) * torch.ones([n_steps, n_steps]) ) - self.Linear_Trend.weight = nn.Parameter( - (1 / self.n_steps) * torch.ones([self.n_steps, self.n_steps]) + self.linear_trend.weight = nn.Parameter( + (1 / n_steps) * torch.ones([n_steps, n_steps]) ) + self.linear_seasonal_embedding = nn.Linear(n_features * 2, d_model) + self.linear_trend_embedding = nn.Linear(n_features * 2, d_model) + self.linear_seasonal_output = nn.Linear(d_model, n_features) + self.linear_trend_output = nn.Linear(d_model, n_features) + def forward(self, inputs: dict, training: bool = True) -> dict: X, masks = inputs["X"], inputs["missing_mask"] - # DLinear encoder processing + # input preprocessing and embedding for DLinear seasonal_init, trend_init = self.series_decomp(X) - seasonal_init, trend_init = seasonal_init.permute(0, 2, 1), trend_init.permute( - 0, 2, 1 - ) + + # DLinear processing if self.individual: + seasonal_init, trend_init = seasonal_init.permute( + 0, 2, 1 + ), trend_init.permute(0, 2, 1) seasonal_output = torch.zeros( [seasonal_init.size(0), seasonal_init.size(1), self.n_steps], dtype=seasonal_init.dtype, @@ -70,15 +82,36 @@ def forward(self, inputs: dict, training: bool = True) -> dict: dtype=trend_init.dtype, ).to(trend_init.device) for i in range(self.n_features): - seasonal_output[:, i, :] = self.Linear_Seasonal[i]( + seasonal_output[:, i, :] = self.linear_seasonal[i]( seasonal_init[:, i, :] ) - trend_output[:, i, :] = self.Linear_Trend[i](trend_init[:, i, :]) + trend_output[:, i, :] = self.linear_trend[i](trend_init[:, i, :]) + + seasonal_output = seasonal_output.permute(0, 2, 1) + trend_output = trend_output.permute(0, 2, 1) else: - seasonal_output = self.Linear_Seasonal(seasonal_init) - trend_output = self.Linear_Trend(trend_init) + # WDU: the original DLinear model wasn't proposed for the imputation task. 
Hence the model doesn't take + # the missing mask into account, which means, in the process, the model doesn't know which part of + # the input data is missing, and this may hurt the model's imputation performance. Therefore, I add the + # embedding layers to project the concatenation of features and masks into a hidden space, as well as + # the output layers to project the seasonal and trend from the hidden space to the original space. + # But this is only for the non-individual mode. + seasonal_init = torch.cat([seasonal_init, masks], dim=2) + trend_init = torch.cat([trend_init, masks], dim=2) + seasonal_init = self.linear_seasonal_embedding(seasonal_init) + trend_init = self.linear_trend_embedding(trend_init) + seasonal_init, trend_init = seasonal_init.permute( + 0, 2, 1 + ), trend_init.permute(0, 2, 1) + + seasonal_output = self.linear_seasonal(seasonal_init) + trend_output = self.linear_trend(trend_init) + seasonal_output = seasonal_output.permute(0, 2, 1) + trend_output = trend_output.permute(0, 2, 1) + seasonal_output = self.linear_seasonal_output(seasonal_output) + trend_output = self.linear_trend_output(trend_output) + output = seasonal_output + trend_output - output = output.permute(0, 2, 1) imputed_data = masks * X + (1 - masks) * output results = { diff --git a/pypots/imputation/etsformer/modules/core.py b/pypots/imputation/etsformer/modules/core.py index 1906174c..57faa6de 100644 --- a/pypots/imputation/etsformer/modules/core.py +++ b/pypots/imputation/etsformer/modules/core.py @@ -5,6 +5,7 @@ # Created by Wenjie Du # License: BSD-3-Clause +import torch import torch.nn as nn from .submodules import ( @@ -36,7 +37,7 @@ def __init__( self.n_steps = n_steps self.enc_embedding = DataEmbedding( - n_features, + n_features * 2, d_model, dropout=dropout, ) @@ -76,8 +77,15 @@ def __init__( def forward(self, inputs: dict, training: bool = True) -> dict: X, masks = inputs["X"], inputs["missing_mask"] - # embedding - res = self.enc_embedding(X) + # WDU: the 
original ETSformer model wasn't proposed for the imputation task. Hence the model doesn't take + # the missing mask into account, which means, in the process, the model doesn't know which part of + # the input data is missing, and this may hurt the model's imputation performance. Therefore, I add the + # embedding layers to project the concatenation of features and masks into a hidden space, as well as + # the output layers to project back from the hidden space to the original space. + + # the same as SAITS, concatenate the time series data and the missing mask for embedding + input_X = torch.cat([X, masks], dim=2) + res = self.enc_embedding(input_X) # ETSformer encoder processing level, growths, seasons = self.encoder(res, X, attn_mask=None) diff --git a/pypots/imputation/fedformer/modules/core.py b/pypots/imputation/fedformer/modules/core.py index 895cf8d4..00f5241a 100644 --- a/pypots/imputation/fedformer/modules/core.py +++ b/pypots/imputation/fedformer/modules/core.py @@ -5,6 +5,7 @@ # Created by Wenjie Du # License: BSD-3-Clause +import torch import torch.nn as nn from .submodules import MultiWaveletTransform, FourierBlock @@ -37,7 +38,7 @@ def __init__( super().__init__() self.enc_embedding = DataEmbedding( - n_features, + n_features * 2, d_model, dropout=dropout, ) @@ -75,17 +76,24 @@ def __init__( ], norm_layer=SeasonalLayerNorm(d_model), ) - self.projection = nn.Linear(d_model, n_features) + self.output_projection = nn.Linear(d_model, n_features) def forward(self, inputs: dict, training: bool = True) -> dict: X, masks = inputs["X"], inputs["missing_mask"] - # embedding - enc_out = self.enc_embedding(X) + # WDU: the original FEDformer model wasn't proposed for the imputation task. Hence the model doesn't take + # the missing mask into account, which means, in the process, the model doesn't know which part of + # the input data is missing, and this may hurt the model's imputation performance. 
Therefore, I add the + # embedding layers to project the concatenation of features and masks into a hidden space, as well as + # the output layers to project back from the hidden space to the original space. + + # the same as SAITS, concatenate the time series data and the missing mask for embedding + input_X = torch.cat([X, masks], dim=2) + enc_out = self.enc_embedding(input_X) # FEDformer encoder processing enc_out, attns = self.encoder(enc_out) - output = self.projection(enc_out) + output = self.output_projection(enc_out) imputed_data = masks * X + (1 - masks) * output results = { diff --git a/pypots/imputation/informer/__init__.py b/pypots/imputation/informer/__init__.py index 557abbaf..298d2345 100644 --- a/pypots/imputation/informer/__init__.py +++ b/pypots/imputation/informer/__init__.py @@ -7,7 +7,6 @@ In Proceedings of the AAAI conference on artificial intelligence, volume 35, pages 11106–11115, 2021. `_ - """ # Created by Wenjie Du diff --git a/pypots/imputation/informer/modules/core.py b/pypots/imputation/informer/modules/core.py index 455a7b1a..e6240c63 100644 --- a/pypots/imputation/informer/modules/core.py +++ b/pypots/imputation/informer/modules/core.py @@ -5,11 +5,12 @@ # Created by Wenjie Du # License: BSD-3-Clause +import torch import torch.nn as nn from .submodules import ProbAttention, ConvLayer, InformerEncoderLayer, InformerEncoder -from ....nn.modules.transformer.embedding import DataEmbedding from ....nn.modules.transformer import MultiHeadAttention +from ....nn.modules.transformer.embedding import DataEmbedding from ....utils.metrics import calc_mse @@ -33,7 +34,7 @@ def __init__( self.seq_len = n_steps self.n_layers = n_layers self.enc_embedding = DataEmbedding( - n_features, + n_features * 2, d_model, dropout=dropout, ) @@ -59,28 +60,35 @@ def __init__( ) # for the imputation task, the output dim is the same as input dim - self.projection = nn.Linear(d_model, n_features) + self.output_projection = nn.Linear(d_model, n_features) def 
forward(self, inputs: dict, training: bool = True) -> dict: X, masks = inputs["X"], inputs["missing_mask"] - # embedding - enc_out = self.enc_embedding(X) + # WDU: the original Informer model wasn't proposed for the imputation task. Hence the model doesn't take + # the missing mask into account, which means, in the process, the model doesn't know which part of + # the input data is missing, and this may hurt the model's imputation performance. Therefore, I add the + # embedding layers to project the concatenation of features and masks into a hidden space, as well as + # the output layers to project back from the hidden space to the original space. + + # the same as SAITS, concatenate the time series data and the missing mask for embedding + input_X = torch.cat([X, masks], dim=2) + enc_out = self.enc_embedding(input_X) # Informer encoder processing enc_out, attns = self.encoder(enc_out) # project back the original data space - dec_out = self.projection(enc_out) + output = self.output_projection(enc_out) - imputed_data = masks * X + (1 - masks) * dec_out + imputed_data = masks * X + (1 - masks) * output results = { "imputed_data": imputed_data, } if training: # `loss` is always the item for backward propagating to update the model - loss = calc_mse(dec_out, inputs["X_ori"], inputs["indicating_mask"]) + loss = calc_mse(output, inputs["X_ori"], inputs["indicating_mask"]) results["loss"] = loss return results diff --git a/pypots/imputation/patchtst/modules/core.py b/pypots/imputation/patchtst/modules/core.py index 9013a802..c1fc97c7 100644 --- a/pypots/imputation/patchtst/modules/core.py +++ b/pypots/imputation/patchtst/modules/core.py @@ -5,6 +5,7 @@ # Created by Wenjie Du # License: BSD-3-Clause +import torch import torch.nn as nn from .submodules import PatchEmbedding, FlattenHead @@ -38,7 +39,9 @@ def __init__( self.n_steps = n_steps self.n_features = n_features self.n_layers = n_layers + self.d_model = d_model + self.embedding = nn.Linear(n_features * 2, d_model) 
self.patch_embedding = PatchEmbedding( d_model, patch_len, stride, padding, dropout ) @@ -57,38 +60,45 @@ def __init__( ] ) self.head = FlattenHead(head_nf, n_steps, dropout) + self.output_projection = nn.Linear(d_model, n_features) def forward(self, inputs: dict, training: bool = True) -> dict: X, masks = inputs["X"], inputs["missing_mask"] + # WDU: the original PatchTST model wasn't proposed for the imputation task. Hence the model doesn't take + # the missing mask into account, which means, in the process, the model doesn't know which part of + # the input data is missing, and this may hurt the model's imputation performance. Therefore, I add the + # embedding layers to project the concatenation of features and masks into a hidden space, as well as + # the output layers to project back from the hidden space to the original space. + # do patching and embedding - x_enc = X.permute(0, 2, 1) - # u: [bs * n_features x patch_num x d_model] - enc_out = self.patch_embedding(x_enc) + input_X = self.embedding(torch.cat([X, masks], dim=2)) + enc_out = self.patch_embedding(input_X.permute(0, 2, 1)) # PatchTST encoder processing - # z: [bs * n_features x patch_num x d_model] + # z: [bs * d_model x patch_num x d_model] for i in range(self.n_layers): enc_out, _ = self.encoder[i](enc_out) - # z: [bs x n_features x patch_num x d_model] + # z: [bs x d_model x patch_num x d_model] enc_out = enc_out.reshape( - -1, self.n_features, enc_out.shape[-2], enc_out.shape[-1] + -1, self.d_model, enc_out.shape[-2], enc_out.shape[-1] ) - # z: [bs x n_features x d_model x patch_num] + # z: [bs x d_model x d_model x patch_num] enc_out = enc_out.permute(0, 1, 3, 2) # project back the original data space - dec_out = self.head(enc_out) # z: [bs x n_features x target_window] + dec_out = self.head(enc_out) # z: [bs x d_model x target_window] dec_out = dec_out.permute(0, 2, 1) + output = self.output_projection(dec_out) - imputed_data = masks * X + (1 - masks) * dec_out + imputed_data = masks * X + (1 - 
masks) * output results = { "imputed_data": imputed_data, } if training: # `loss` is always the item for backward propagating to update the model - loss = calc_mse(dec_out, inputs["X_ori"], inputs["indicating_mask"]) + loss = calc_mse(output, inputs["X_ori"], inputs["indicating_mask"]) results["loss"] = loss return results diff --git a/pypots/nn/modules/transformer/attention.py b/pypots/nn/modules/transformer/attention.py index 1c23efd8..448abf7c 100644 --- a/pypots/nn/modules/transformer/attention.py +++ b/pypots/nn/modules/transformer/attention.py @@ -195,11 +195,12 @@ def forward( # keep useful variables batch_size, n_steps = q.size(0), q.size(1) + k_n_steps = k.size(1) # now separate the last dimension of q, k, v into different heads -> [batch_size, n_steps, n_heads, d_k or d_v] q = self.w_qs(q).view(batch_size, n_steps, self.n_heads, self.d_k) - k = self.w_ks(k).view(batch_size, n_steps, self.n_heads, self.d_k) - v = self.w_vs(v).view(batch_size, n_steps, self.n_heads, self.d_v) + k = self.w_ks(k).view(batch_size, k_n_steps, self.n_heads, self.d_k) + v = self.w_vs(v).view(batch_size, k_n_steps, self.n_heads, self.d_v) # transpose for self-attention calculation -> [batch_size, n_steps, d_k or d_v, n_heads] q, k, v = q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2) diff --git a/tests/imputation/dlinear.py b/tests/imputation/dlinear.py index e2680b23..c1351305 100644 --- a/tests/imputation/dlinear.py +++ b/tests/imputation/dlinear.py @@ -47,15 +47,30 @@ class TestDLinear(unittest.TestCase): DATA["n_features"], moving_avg_window_size=3, individual=False, + d_model=128, epochs=EPOCHS, saving_path=saving_path, optimizer=optimizer, device=DEVICE, ) + individual_optimizer = Adam(lr=0.001, weight_decay=1e-5) + individual_dlinear = DLinear( + DATA["n_steps"], + DATA["n_features"], + moving_avg_window_size=3, + individual=True, + d_model=None, # d_model is useless for DLinear in the individual mode + epochs=EPOCHS, + saving_path=saving_path, + 
optimizer=individual_optimizer, + device=DEVICE, + ) + @pytest.mark.xdist_group(name="imputation-dlinear") def test_0_fit(self): self.dlinear.fit(TRAIN_SET, VAL_SET) + self.individual_dlinear.fit(TRAIN_SET, VAL_SET) @pytest.mark.xdist_group(name="imputation-dlinear") def test_1_impute(self): @@ -71,6 +86,14 @@ def test_1_impute(self): ) logger.info(f"DLinear test_MSE: {test_MSE}") + imputation_results = self.individual_dlinear.predict(TEST_SET) + test_MSE = calc_mse( + imputation_results["imputation"], + DATA["test_X_ori"], + DATA["test_X_indicating_mask"], + ) + logger.info(f"Individual DLinear test_MSE: {test_MSE}") + @pytest.mark.xdist_group(name="imputation-dlinear") def test_2_parameters(self): assert hasattr(self.dlinear, "model") and self.dlinear.model is not None