Merge pull request #441 from WenjieDu/(feat)restore_from_sliding_window

Enable to restore from `sliding_window()`
WenjieDu · Jun 20, 2024 · c7ecd0b · c7ecd0b
2 parents 47446d9 + 1f8d102
commit c7ecd0b
Show file tree

Hide file tree

Showing 5 changed files with 72 additions and 245 deletions.
diff --git a/pypots/data/__init__.py b/pypots/data/__init__.py
@@ -11,14 +11,18 @@
     gene_complete_random_walk_for_anomaly_detection,
     gene_complete_random_walk_for_classification,
     gene_random_walk,
-    gene_physionet2012,
 )
-from .load_specific_datasets import (
-    list_supported_datasets,
-    load_specific_dataset,
+from .saving import (
+    save_dict_into_h5,
+    load_dict_from_h5,
+    pickle_dump,
+    pickle_load,
+)
+from .utils import (
+    parse_delta,
+    sliding_window,
+    inverse_sliding_window,
 )
-from .saving import save_dict_into_h5
-from .utils import parse_delta, sliding_window
 
 __all__ = [
     # base dataset classes
@@ -29,13 +33,13 @@
     "gene_complete_random_walk_for_anomaly_detection",
     "gene_complete_random_walk_for_classification",
     "gene_random_walk",
-    "gene_physionet2012",
-    # list and load datasets
-    "list_supported_datasets",
-    "load_specific_dataset",
     # utils
     "parse_delta",
     "sliding_window",
+    "inverse_sliding_window",
     # saving
     "save_dict_into_h5",
+    "load_dict_from_h5",
+    "pickle_dump",
+    "pickle_load",
 ]
diff --git a/pypots/data/generating.py b/pypots/data/generating.py
@@ -14,8 +14,6 @@
 from sklearn.preprocessing import StandardScaler
 from sklearn.utils import check_random_state
 
-from .load_specific_datasets import load_specific_dataset
-
 
 def gene_complete_random_walk(
     n_samples: int = 1000,
@@ -320,109 +318,3 @@ def gene_random_walk(
         data["test_X_indicating_mask"] = np.isnan(test_X_ori) ^ np.isnan(test_X)
 
     return data
-
-
-def gene_physionet2012(artificially_missing_rate: float = 0.1):
-    """Generate a fully-prepared PhysioNet-2012 dataset for model testing.
-
-    Parameters
-    ----------
-    artificially_missing_rate : float, default=0.1
-        The rate of artificially missing values to generate for model evaluation.
-        This ratio is calculated based on the number of observed values, i.e. if artificially_missing_rate = 0.1,
-        then 10% of the observed values will be randomly masked as missing data and hold out for model evaluation.
-
-    Returns
-    -------
-    data: dict,
-        A dictionary containing the generated PhysioNet-2012 dataset.
-
-    """
-    assert (
-        0 <= artificially_missing_rate < 1
-    ), "artificially_missing_rate must be in [0,1)"
-
-    # generate samples
-    dataset = load_specific_dataset("physionet_2012")
-    X = dataset["X"]
-    y = dataset["y"]
-    ICUType = dataset["ICUType"]
-
-    all_recordID = X["RecordID"].unique()
-    train_set_ids, test_set_ids = train_test_split(all_recordID, test_size=0.2)
-    train_set_ids, val_set_ids = train_test_split(train_set_ids, test_size=0.2)
-    train_set_ids.sort()
-    val_set_ids.sort()
-    test_set_ids.sort()
-    train_set = X[X["RecordID"].isin(train_set_ids)].sort_values(["RecordID", "Time"])
-    val_set = X[X["RecordID"].isin(val_set_ids)].sort_values(["RecordID", "Time"])
-    test_set = X[X["RecordID"].isin(test_set_ids)].sort_values(["RecordID", "Time"])
-
-    train_set = train_set.drop(["RecordID", "Time"], axis=1)
-    val_set = val_set.drop(["RecordID", "Time"], axis=1)
-    test_set = test_set.drop(["RecordID", "Time"], axis=1)
-    train_X, val_X, test_X = (
-        train_set.to_numpy(),
-        val_set.to_numpy(),
-        test_set.to_numpy(),
-    )
-
-    # normalization
-    scaler = StandardScaler()
-    train_X = scaler.fit_transform(train_X)
-    val_X = scaler.transform(val_X)
-    test_X = scaler.transform(test_X)
-
-    # reshape into time series samples
-    train_X = train_X.reshape(len(train_set_ids), 48, -1)
-    val_X = val_X.reshape(len(val_set_ids), 48, -1)
-    test_X = test_X.reshape(len(test_set_ids), 48, -1)
-
-    train_y = y[y.index.isin(train_set_ids)].sort_index()
-    val_y = y[y.index.isin(val_set_ids)].sort_index()
-    test_y = y[y.index.isin(test_set_ids)].sort_index()
-    train_y, val_y, test_y = train_y.to_numpy(), val_y.to_numpy(), test_y.to_numpy()
-
-    train_ICUType = ICUType[ICUType.index.isin(train_set_ids)].sort_index()
-    val_ICUType = ICUType[ICUType.index.isin(val_set_ids)].sort_index()
-    test_ICUType = ICUType[ICUType.index.isin(test_set_ids)].sort_index()
-    train_ICUType, val_ICUType, test_ICUType = (
-        train_ICUType.to_numpy(),
-        val_ICUType.to_numpy(),
-        test_ICUType.to_numpy(),
-    )
-
-    data = {
-        "n_classes": 2,
-        "n_steps": 48,
-        "n_features": train_X.shape[-1],
-        "train_X": train_X,
-        "train_y": train_y.flatten(),
-        "train_ICUType": train_ICUType.flatten(),
-        "val_X": val_X,
-        "val_y": val_y.flatten(),
-        "val_ICUType": val_ICUType.flatten(),
-        "test_X": test_X,
-        "test_y": test_y.flatten(),
-        "test_ICUType": test_ICUType.flatten(),
-        "scaler": scaler,
-    }
-
-    if artificially_missing_rate > 0:
-        # mask values in the validation set as ground truth
-        val_X_ori = val_X
-        val_X = mcar(val_X, artificially_missing_rate)
-        # mask values in the test set as ground truth
-        test_X_ori = test_X
-        test_X = mcar(test_X, artificially_missing_rate)
-
-        data["val_X"] = val_X
-        data["val_X_ori"] = val_X_ori
-
-        # test_X is for model input
-        data["test_X"] = test_X
-        # test_X_ori is for error calc, not for model input, hence mustn't have NaNs
-        data["test_X_ori"] = np.nan_to_num(test_X_ori)  # fill NaNs for later error calc
-        data["test_X_indicating_mask"] = np.isnan(test_X_ori) ^ np.isnan(test_X)
-
-    return data
diff --git a/pypots/data/load_preprocessing.py b/pypots/data/load_preprocessing.py
diff --git a/pypots/data/load_specific_datasets.py b/pypots/data/load_specific_datasets.py
diff --git a/pypots/data/utils.py b/pypots/data/utils.py
@@ -5,11 +5,14 @@
 # Created by Wenjie Du <[email protected]>
 # License: BSD-3-Clause
 
+import math
 from typing import Union
 
 import numpy as np
 import torch
 
+from ..utils.logging import logger
+
 
 def turn_data_into_specified_dtype(
     data: Union[np.ndarray, torch.Tensor, list],
@@ -194,8 +197,13 @@ def sliding_window(time_series, window_len, sliding_len=None):
     start_indices = np.asarray(range(total_len // sliding_len)) * sliding_len
 
     # remove the last one if left length is not enough
-    if total_len - start_indices[-1] * sliding_len < window_len:
-        start_indices = start_indices[:-1]
+    if total_len - start_indices[-1] < window_len:
+        to_drop = math.ceil(window_len / sliding_len)
+        left_len = total_len - start_indices[-1]
+        start_indices = start_indices[:-to_drop]
+        logger.warning(
+            f"The last {to_drop} samples are dropped due to the left length {left_len} is not enough."
+        )
 
     sample_collector = []
     for idx in start_indices:
@@ -204,3 +212,51 @@ def sliding_window(time_series, window_len, sliding_len=None):
     samples = np.asarray(sample_collector).astype("float32")
 
     return samples
+
+
+def inverse_sliding_window(X, sliding_len):
+    """Restore the original time-series data from the generated sliding window samples.
+    Note that this is the inverse operation of the `sliding_window` function, but there is no guarantee that
+    the restored data is the same as the original data considering that
+    1. the sliding length may be larger than the window size and there will be gaps between restored data;
+    2. if values in the samples get changed, the overlap part may not be the same as the original data after averaging;
+    3. some incomplete samples at the tail may be dropped during the sliding window operation, hence the restored data
+       may be shorter than the original data.
+
+    Parameters
+    ----------
+    X :
+        The generated time-series samples with sliding window method, shape of [n_samples, n_steps, n_features],
+        where n_steps is the window size of the used sliding window method.
+
+    sliding_len :
+        The sliding length of the window for each moving step in the sliding window method used to generate X.
+
+    Returns
+    -------
+    restored_data :
+        The restored time-series data with shape of [total_length, n_features].
+
+    """
+    assert len(X.shape) == 3, f"X should be a 3D array, but got {X.shape}"
+    n_samples, window_size, n_features = X.shape
+
+    if sliding_len >= window_size:
+        if sliding_len > window_size:
+            logger.warning(
+                f"sliding_len {sliding_len} is larger than the window size {window_size}, "
+                f"hence there will be gaps between restored data."
+            )
+        restored_data = X.reshape(n_samples * window_size, n_features)
+    else:
+        collector = [X[0][:sliding_len]]
+        overlap = X[0][sliding_len:]
+        for x in X[1:]:
+            overlap_avg = (overlap + x[:-sliding_len]) / 2
+            collector.append(overlap_avg[:sliding_len])
+            overlap = np.concatenate(
+                [overlap_avg[sliding_len:], x[-sliding_len:]], axis=0
+            )
+        collector.append(overlap)
+        restored_data = np.concatenate(collector, axis=0)
+    return restored_data