From d041eb2f719f375fdcb4a051a4cb12b27d30ce91 Mon Sep 17 00:00:00 2001
From: Wenjie Du
Date: Sun, 16 Jun 2024 14:37:34 +0800
Subject: [PATCH] refactor: remove gene_physionet2012();

---
 pypots/data/__init__.py               |  17 ++--
 pypots/data/generating.py             | 108 --------------------------
 pypots/data/load_preprocessing.py     |  54 -------------
 pypots/data/load_specific_datasets.py |  71 -----------------
 4 files changed, 8 insertions(+), 242 deletions(-)
 delete mode 100644 pypots/data/load_preprocessing.py
 delete mode 100644 pypots/data/load_specific_datasets.py

diff --git a/pypots/data/__init__.py b/pypots/data/__init__.py
index 05402a66..976a1bf8 100644
--- a/pypots/data/__init__.py
+++ b/pypots/data/__init__.py
@@ -11,13 +11,13 @@
     gene_complete_random_walk_for_anomaly_detection,
     gene_complete_random_walk_for_classification,
     gene_random_walk,
-    gene_physionet2012,
 )
-from .load_specific_datasets import (
-    list_supported_datasets,
-    load_specific_dataset,
+from .saving import (
+    save_dict_into_h5,
+    load_dict_from_h5,
+    pickle_dump,
+    pickle_load,
 )
-from .saving import save_dict_into_h5
 from .utils import (
     parse_delta,
     sliding_window,
@@ -33,14 +33,13 @@
     "gene_complete_random_walk_for_anomaly_detection",
     "gene_complete_random_walk_for_classification",
     "gene_random_walk",
-    "gene_physionet2012",
-    # list and load datasets
-    "list_supported_datasets",
-    "load_specific_dataset",
     # utils
     "parse_delta",
     "sliding_window",
     "inverse_sliding_window",
     # saving
     "save_dict_into_h5",
+    "load_dict_from_h5",
+    "pickle_dump",
+    "pickle_load",
 ]
diff --git a/pypots/data/generating.py b/pypots/data/generating.py
index 5d452374..c979ac27 100644
--- a/pypots/data/generating.py
+++ b/pypots/data/generating.py
@@ -14,8 +14,6 @@
 from sklearn.preprocessing import StandardScaler
 from sklearn.utils import check_random_state
 
-from .load_specific_datasets import load_specific_dataset
-
 
 def gene_complete_random_walk(
     n_samples: int = 1000,
@@ -320,109 +318,3 @@ def gene_random_walk(
         data["test_X_indicating_mask"] = np.isnan(test_X_ori) ^ np.isnan(test_X)
 
     return data
-
-
-def gene_physionet2012(artificially_missing_rate: float = 0.1):
-    """Generate a fully-prepared PhysioNet-2012 dataset for model testing.
-
-    Parameters
-    ----------
-    artificially_missing_rate : float, default=0.1
-        The rate of artificially missing values to generate for model evaluation.
-        This ratio is calculated based on the number of observed values, i.e. if artificially_missing_rate = 0.1,
-        then 10% of the observed values will be randomly masked as missing data and hold out for model evaluation.
-
-    Returns
-    -------
-    data: dict,
-        A dictionary containing the generated PhysioNet-2012 dataset.
-
-    """
-    assert (
-        0 <= artificially_missing_rate < 1
-    ), "artificially_missing_rate must be in [0,1)"
-
-    # generate samples
-    dataset = load_specific_dataset("physionet_2012")
-    X = dataset["X"]
-    y = dataset["y"]
-    ICUType = dataset["ICUType"]
-
-    all_recordID = X["RecordID"].unique()
-    train_set_ids, test_set_ids = train_test_split(all_recordID, test_size=0.2)
-    train_set_ids, val_set_ids = train_test_split(train_set_ids, test_size=0.2)
-    train_set_ids.sort()
-    val_set_ids.sort()
-    test_set_ids.sort()
-    train_set = X[X["RecordID"].isin(train_set_ids)].sort_values(["RecordID", "Time"])
-    val_set = X[X["RecordID"].isin(val_set_ids)].sort_values(["RecordID", "Time"])
-    test_set = X[X["RecordID"].isin(test_set_ids)].sort_values(["RecordID", "Time"])
-
-    train_set = train_set.drop(["RecordID", "Time"], axis=1)
-    val_set = val_set.drop(["RecordID", "Time"], axis=1)
-    test_set = test_set.drop(["RecordID", "Time"], axis=1)
-    train_X, val_X, test_X = (
-        train_set.to_numpy(),
-        val_set.to_numpy(),
-        test_set.to_numpy(),
-    )
-
-    # normalization
-    scaler = StandardScaler()
-    train_X = scaler.fit_transform(train_X)
-    val_X = scaler.transform(val_X)
-    test_X = scaler.transform(test_X)
-
-    # reshape into time series samples
-    train_X = train_X.reshape(len(train_set_ids), 48, -1)
-    val_X = val_X.reshape(len(val_set_ids), 48, -1)
-    test_X = test_X.reshape(len(test_set_ids), 48, -1)
-
-    train_y = y[y.index.isin(train_set_ids)].sort_index()
-    val_y = y[y.index.isin(val_set_ids)].sort_index()
-    test_y = y[y.index.isin(test_set_ids)].sort_index()
-    train_y, val_y, test_y = train_y.to_numpy(), val_y.to_numpy(), test_y.to_numpy()
-
-    train_ICUType = ICUType[ICUType.index.isin(train_set_ids)].sort_index()
-    val_ICUType = ICUType[ICUType.index.isin(val_set_ids)].sort_index()
-    test_ICUType = ICUType[ICUType.index.isin(test_set_ids)].sort_index()
-    train_ICUType, val_ICUType, test_ICUType = (
-        train_ICUType.to_numpy(),
-        val_ICUType.to_numpy(),
-        test_ICUType.to_numpy(),
-    )
-
-    data = {
-        "n_classes": 2,
-        "n_steps": 48,
-        "n_features": train_X.shape[-1],
-        "train_X": train_X,
-        "train_y": train_y.flatten(),
-        "train_ICUType": train_ICUType.flatten(),
-        "val_X": val_X,
-        "val_y": val_y.flatten(),
-        "val_ICUType": val_ICUType.flatten(),
-        "test_X": test_X,
-        "test_y": test_y.flatten(),
-        "test_ICUType": test_ICUType.flatten(),
-        "scaler": scaler,
-    }
-
-    if artificially_missing_rate > 0:
-        # mask values in the validation set as ground truth
-        val_X_ori = val_X
-        val_X = mcar(val_X, artificially_missing_rate)
-        # mask values in the test set as ground truth
-        test_X_ori = test_X
-        test_X = mcar(test_X, artificially_missing_rate)
-
-        data["val_X"] = val_X
-        data["val_X_ori"] = val_X_ori
-
-        # test_X is for model input
-        data["test_X"] = test_X
-        # test_X_ori is for error calc, not for model input, hence mustn't have NaNs
-        data["test_X_ori"] = np.nan_to_num(test_X_ori)  # fill NaNs for later error calc
-        data["test_X_indicating_mask"] = np.isnan(test_X_ori) ^ np.isnan(test_X)
-
-    return data
diff --git a/pypots/data/load_preprocessing.py b/pypots/data/load_preprocessing.py
deleted file mode 100644
index 2e7026ae..00000000
--- a/pypots/data/load_preprocessing.py
+++ /dev/null
@@ -1,54 +0,0 @@
-"""
-Preprocessing functions to load supported open-source time-series datasets.
-"""
-
-# Created by Wenjie Du
-# License: BSD-3-Clause
-
-import pandas as pd
-
-
-def preprocess_physionet2012(data: dict) -> dict:
-    """The preprocessing function for dataset PhysioNet-2012.
-
-    Parameters
-    ----------
-    data :
-        A data dict from tsdb.load_dataset().
-
-    Returns
-    -------
-    dataset :
-        A dict containing processed data, including:
-            X : pandas.DataFrame,
-                A dataframe contains all time series vectors from 11988 patients, distinguished by column `RecordID`.
-            y : pandas.Series
-                The 11988 classification labels of all patients, indicating whether they were deceased.
-    """
-    data["static_features"].remove("ICUType")  # keep ICUType for now
-    # remove the other static features, e.g. age, gender
-    X = data["X"].drop(data["static_features"], axis=1)
-
-    def apply_func(df_temp):  # pad and truncate to set the max length of samples as 48
-        missing = list(set(range(0, 48)).difference(set(df_temp["Time"])))
-        missing_part = pd.DataFrame({"Time": missing})
-        df_temp = pd.concat(
-            [df_temp, missing_part], ignore_index=False, sort=False
-        )  # pad the sample's length to 48 if it doesn't have enough time steps
-        df_temp = df_temp.set_index("Time").sort_index().reset_index()
-        df_temp = df_temp.iloc[:48]  # truncate
-        return df_temp
-
-    X = X.groupby("RecordID").apply(apply_func)
-    X = X.drop("RecordID", axis=1)
-    X = X.reset_index()
-    ICUType = X[["RecordID", "ICUType"]].set_index("RecordID").dropna()
-    X = X.drop(["level_1", "ICUType"], axis=1)
-
-    dataset = {
-        "X": X,
-        "y": data["y"],
-        "ICUType": ICUType,
-    }
-
-    return dataset
diff --git a/pypots/data/load_specific_datasets.py b/pypots/data/load_specific_datasets.py
deleted file mode 100644
index 9db37175..00000000
--- a/pypots/data/load_specific_datasets.py
+++ /dev/null
@@ -1,71 +0,0 @@
-"""
-Functions to load supported open-source time-series datasets.
-"""
-
-# Created by Wenjie Du
-# License: BSD-3-Clause
-
-
-import tsdb
-
-from .load_preprocessing import preprocess_physionet2012
-from ..utils.logging import logger
-
-# currently supported datasets
-SUPPORTED_DATASETS = [
-    "physionet_2012",
-]
-
-# preprocessing functions of the supported datasets
-PREPROCESSING_FUNC = {
-    "physionet_2012": preprocess_physionet2012,
-}
-
-
-def list_supported_datasets() -> list:
-    """Return the datasets natively supported by PyPOTS so far.
-
-    Returns
-    -------
-    SUPPORTED_DATASETS :
-        A list including all supported datasets.
-
-    """
-    return SUPPORTED_DATASETS
-
-
-def load_specific_dataset(dataset_name: str, use_cache: bool = True) -> dict:
-    """Load specific datasets supported by PyPOTS.
-    Different from tsdb.load_dataset(), which only produces merely raw data,
-    load_specific_dataset here does some preprocessing operations,
-    like truncating time series to generate samples with the same length.
-
-    Parameters
-    ----------
-    dataset_name :
-        The name of the dataset to be loaded, which should be supported, i.e. in SUPPORTED_DATASETS.
-
-    use_cache :
-        Whether to use cache. This is an argument of tsdb.load_dataset().
-
-    Returns
-    -------
-    data :
-        A dict contains the preprocessed dataset.
-        Users only need to continue the preprocessing steps to generate the data they want,
-        e.g. standardizing and splitting.
-
-    """
-    logger.info(
-        f"Loading the dataset {dataset_name} with TSDB (https://github.com/WenjieDu/Time_Series_Data_Beans)..."
-    )
-    assert dataset_name in SUPPORTED_DATASETS, (
-        f"Dataset {dataset_name} is not supported. "
-        f"If you believe this dataset is valuable to be supported by PyPOTS,"
-        f"please create an issue on GitHub "
-        f"https://github.com/WenjieDu/PyPOTS/issues"
-    )
-    logger.info(f"Starting preprocessing {dataset_name}...")
-    data = tsdb.load(dataset_name, use_cache)
-    data = PREPROCESSING_FUNC[dataset_name](data)
-    return data