refactor: remove gene_physionet2012();
WenjieDu committed Jun 16, 2024
1 parent 14a9c00 commit d041eb2
Showing 4 changed files with 8 additions and 242 deletions.
17 changes: 8 additions & 9 deletions pypots/data/__init__.py
@@ -11,13 +11,13 @@
     gene_complete_random_walk_for_anomaly_detection,
     gene_complete_random_walk_for_classification,
     gene_random_walk,
-    gene_physionet2012,
 )
-from .load_specific_datasets import (
-    list_supported_datasets,
-    load_specific_dataset,
+from .saving import (
+    save_dict_into_h5,
+    load_dict_from_h5,
+    pickle_dump,
+    pickle_load,
 )
-from .saving import save_dict_into_h5
 from .utils import (
     parse_delta,
     sliding_window,
@@ -33,14 +33,13 @@
     "gene_complete_random_walk_for_anomaly_detection",
     "gene_complete_random_walk_for_classification",
     "gene_random_walk",
-    "gene_physionet2012",
-    # list and load datasets
-    "list_supported_datasets",
-    "load_specific_dataset",
     # utils
     "parse_delta",
     "sliding_window",
     "inverse_sliding_window",
     # saving
     "save_dict_into_h5",
+    "load_dict_from_h5",
+    "pickle_dump",
+    "pickle_load",
 ]
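
With this change, pypots.data re-exports all four saving helpers from .saving rather than only save_dict_into_h5. A minimal round-trip sketch of the newly exported names; the (data, path)-style signatures are assumptions based on the function names, not confirmed by this diff:

import numpy as np

from pypots.data import (
    save_dict_into_h5,
    load_dict_from_h5,
    pickle_dump,
    pickle_load,
)

data = {"train_X": np.random.randn(10, 48, 37)}

# HDF5 round trip (signatures assumed)
save_dict_into_h5(data, "dataset.h5")
restored_h5 = load_dict_from_h5("dataset.h5")

# pickle round trip (signatures assumed)
pickle_dump(data, "dataset.pkl")
restored_pkl = pickle_load("dataset.pkl")
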
108 changes: 0 additions & 108 deletions pypots/data/generating.py
@@ -14,8 +14,6 @@
 from sklearn.preprocessing import StandardScaler
 from sklearn.utils import check_random_state
 
-from .load_specific_datasets import load_specific_dataset
-
 
 def gene_complete_random_walk(
     n_samples: int = 1000,
@@ -320,109 +318,3 @@ def gene_random_walk(
     data["test_X_indicating_mask"] = np.isnan(test_X_ori) ^ np.isnan(test_X)
 
     return data
-
-
-def gene_physionet2012(artificially_missing_rate: float = 0.1):
-    """Generate a fully-prepared PhysioNet-2012 dataset for model testing.
-
-    Parameters
-    ----------
-    artificially_missing_rate : float, default=0.1
-        The rate of artificially missing values to generate for model evaluation.
-        This ratio is calculated based on the number of observed values, i.e. if artificially_missing_rate = 0.1,
-        then 10% of the observed values will be randomly masked as missing data and held out for model evaluation.
-
-    Returns
-    -------
-    data : dict
-        A dictionary containing the generated PhysioNet-2012 dataset.
-    """
-    assert (
-        0 <= artificially_missing_rate < 1
-    ), "artificially_missing_rate must be in [0,1)"
-
-    # generate samples
-    dataset = load_specific_dataset("physionet_2012")
-    X = dataset["X"]
-    y = dataset["y"]
-    ICUType = dataset["ICUType"]
-
-    all_recordID = X["RecordID"].unique()
-    train_set_ids, test_set_ids = train_test_split(all_recordID, test_size=0.2)
-    train_set_ids, val_set_ids = train_test_split(train_set_ids, test_size=0.2)
-    train_set_ids.sort()
-    val_set_ids.sort()
-    test_set_ids.sort()
-    train_set = X[X["RecordID"].isin(train_set_ids)].sort_values(["RecordID", "Time"])
-    val_set = X[X["RecordID"].isin(val_set_ids)].sort_values(["RecordID", "Time"])
-    test_set = X[X["RecordID"].isin(test_set_ids)].sort_values(["RecordID", "Time"])
-
-    train_set = train_set.drop(["RecordID", "Time"], axis=1)
-    val_set = val_set.drop(["RecordID", "Time"], axis=1)
-    test_set = test_set.drop(["RecordID", "Time"], axis=1)
-    train_X, val_X, test_X = (
-        train_set.to_numpy(),
-        val_set.to_numpy(),
-        test_set.to_numpy(),
-    )
-
-    # normalization
-    scaler = StandardScaler()
-    train_X = scaler.fit_transform(train_X)
-    val_X = scaler.transform(val_X)
-    test_X = scaler.transform(test_X)
-
-    # reshape into time series samples
-    train_X = train_X.reshape(len(train_set_ids), 48, -1)
-    val_X = val_X.reshape(len(val_set_ids), 48, -1)
-    test_X = test_X.reshape(len(test_set_ids), 48, -1)
-
-    train_y = y[y.index.isin(train_set_ids)].sort_index()
-    val_y = y[y.index.isin(val_set_ids)].sort_index()
-    test_y = y[y.index.isin(test_set_ids)].sort_index()
-    train_y, val_y, test_y = train_y.to_numpy(), val_y.to_numpy(), test_y.to_numpy()
-
-    train_ICUType = ICUType[ICUType.index.isin(train_set_ids)].sort_index()
-    val_ICUType = ICUType[ICUType.index.isin(val_set_ids)].sort_index()
-    test_ICUType = ICUType[ICUType.index.isin(test_set_ids)].sort_index()
-    train_ICUType, val_ICUType, test_ICUType = (
-        train_ICUType.to_numpy(),
-        val_ICUType.to_numpy(),
-        test_ICUType.to_numpy(),
-    )
-
-    data = {
-        "n_classes": 2,
-        "n_steps": 48,
-        "n_features": train_X.shape[-1],
-        "train_X": train_X,
-        "train_y": train_y.flatten(),
-        "train_ICUType": train_ICUType.flatten(),
-        "val_X": val_X,
-        "val_y": val_y.flatten(),
-        "val_ICUType": val_ICUType.flatten(),
-        "test_X": test_X,
-        "test_y": test_y.flatten(),
-        "test_ICUType": test_ICUType.flatten(),
-        "scaler": scaler,
-    }
-
-    if artificially_missing_rate > 0:
-        # mask values in the validation set as ground truth
-        val_X_ori = val_X
-        val_X = mcar(val_X, artificially_missing_rate)
-        # mask values in the test set as ground truth
-        test_X_ori = test_X
-        test_X = mcar(test_X, artificially_missing_rate)
-
-        data["val_X"] = val_X
-        data["val_X_ori"] = val_X_ori
-
-        # test_X is for model input
-        data["test_X"] = test_X
-        # test_X_ori is for error calc, not for model input, hence mustn't have NaNs
-        data["test_X_ori"] = np.nan_to_num(test_X_ori)  # fill NaNs for later error calc
-        data["test_X_indicating_mask"] = np.isnan(test_X_ori) ^ np.isnan(test_X)
-
-    return data
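
The tail of the removed function encodes the evaluation protocol: val_X and test_X carry the artificial missingness as model input, the held-out originals serve as ground truth, and the XOR of the two NaN masks flags exactly the artificially masked positions (originally missing entries stay False on both sides). A standalone illustration, reusing pygrinder's mcar exactly as the removed code calls it:

import numpy as np
from pygrinder import mcar

X_ori = np.array([[1.0, np.nan, 3.0, 4.0]])  # one value genuinely missing
X = mcar(X_ori, 0.5)  # artificially mask ~half of the observed values

# True only where X_ori is observed but X is NaN,
# i.e. exactly the artificially masked positions
indicating_mask = np.isnan(X_ori) ^ np.isnan(X)
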
54 changes: 0 additions & 54 deletions pypots/data/load_preprocessing.py

This file was deleted.

71 changes: 0 additions & 71 deletions pypots/data/load_specific_datasets.py

This file was deleted.
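
This commit removes load_specific_dataset and gene_physionet2012 without naming a replacement in the diff itself. One plausible migration path is sketched below, assuming the tsdb package for raw loading and the benchpots package for the prepared splits (both from the same maintainer's ecosystem; neither API is confirmed by this commit):

# hypothetical replacement, APIs assumed rather than confirmed
import tsdb
import benchpots

# raw dataset, in place of load_specific_dataset("physionet_2012")
raw = tsdb.load("physionet_2012")

# fully-prepared splits, in place of gene_physionet2012()
data = benchpots.datasets.preprocess_physionet2012(subset="all", rate=0.1)
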
