refactor: remove gene_physionet2012();
WenjieDu committed Jun 16, 2024
1 parent 14a9c00 commit d041eb2
Showing 4 changed files with 8 additions and 242 deletions.
17 changes: 8 additions & 9 deletions pypots/data/__init__.py
@@ -11,13 +11,13 @@
     gene_complete_random_walk_for_anomaly_detection,
     gene_complete_random_walk_for_classification,
     gene_random_walk,
-    gene_physionet2012,
 )
-from .load_specific_datasets import (
-    list_supported_datasets,
-    load_specific_dataset,
+from .saving import (
+    save_dict_into_h5,
+    load_dict_from_h5,
+    pickle_dump,
+    pickle_load,
 )
-from .saving import save_dict_into_h5
 from .utils import (
     parse_delta,
     sliding_window,
@@ -33,14 +33,13 @@
     "gene_complete_random_walk_for_anomaly_detection",
     "gene_complete_random_walk_for_classification",
     "gene_random_walk",
-    "gene_physionet2012",
-    # list and load datasets
-    "list_supported_datasets",
-    "load_specific_dataset",
     # utils
     "parse_delta",
     "sliding_window",
     "inverse_sliding_window",
     # saving
     "save_dict_into_h5",
+    "load_dict_from_h5",
+    "pickle_dump",
+    "pickle_load",
 ]
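
With this change, pypots.data re-exports all four saving helpers from .saving rather than only save_dict_into_h5. A minimal round-trip sketch of the newly exported names; the (data, path)-style signatures are assumptions based on the function names, not confirmed by this diff:

import numpy as np

from pypots.data import (
    save_dict_into_h5,
    load_dict_from_h5,
    pickle_dump,
    pickle_load,
)

data = {"train_X": np.random.randn(10, 48, 37)}

# HDF5 round trip (signatures assumed)
save_dict_into_h5(data, "dataset.h5")
restored_h5 = load_dict_from_h5("dataset.h5")

# pickle round trip (signatures assumed)
pickle_dump(data, "dataset.pkl")
restored_pkl = pickle_load("dataset.pkl")
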
108 changes: 0 additions & 108 deletions pypots/data/generating.py
@@ -14,8 +14,6 @@
 from sklearn.preprocessing import StandardScaler
 from sklearn.utils import check_random_state
 
-from .load_specific_datasets import load_specific_dataset
-
 
 def gene_complete_random_walk(
     n_samples: int = 1000,
@@ -320,109 +318,3 @@ def gene_random_walk(
     data["test_X_indicating_mask"] = np.isnan(test_X_ori) ^ np.isnan(test_X)
 
     return data
-
-
-def gene_physionet2012(artificially_missing_rate: float = 0.1):
-    """Generate a fully-prepared PhysioNet-2012 dataset for model testing.
-
-    Parameters
-    ----------
-    artificially_missing_rate : float, default=0.1
-        The rate of artificially missing values to generate for model evaluation.
-        This ratio is calculated based on the number of observed values, i.e. if artificially_missing_rate = 0.1,
-        then 10% of the observed values will be randomly masked as missing data and held out for model evaluation.
-
-    Returns
-    -------
-    data : dict
-        A dictionary containing the generated PhysioNet-2012 dataset.
-    """
-    assert (
-        0 <= artificially_missing_rate < 1
-    ), "artificially_missing_rate must be in [0,1)"
-
-    # generate samples
-    dataset = load_specific_dataset("physionet_2012")
-    X = dataset["X"]
-    y = dataset["y"]
-    ICUType = dataset["ICUType"]
-
-    all_recordID = X["RecordID"].unique()
-    train_set_ids, test_set_ids = train_test_split(all_recordID, test_size=0.2)
-    train_set_ids, val_set_ids = train_test_split(train_set_ids, test_size=0.2)
-    train_set_ids.sort()
-    val_set_ids.sort()
-    test_set_ids.sort()
-    train_set = X[X["RecordID"].isin(train_set_ids)].sort_values(["RecordID", "Time"])
-    val_set = X[X["RecordID"].isin(val_set_ids)].sort_values(["RecordID", "Time"])
-    test_set = X[X["RecordID"].isin(test_set_ids)].sort_values(["RecordID", "Time"])
-
-    train_set = train_set.drop(["RecordID", "Time"], axis=1)
-    val_set = val_set.drop(["RecordID", "Time"], axis=1)
-    test_set = test_set.drop(["RecordID", "Time"], axis=1)
-    train_X, val_X, test_X = (
-        train_set.to_numpy(),
-        val_set.to_numpy(),
-        test_set.to_numpy(),
-    )
-
-    # normalization
-    scaler = StandardScaler()
-    train_X = scaler.fit_transform(train_X)
-    val_X = scaler.transform(val_X)
-    test_X = scaler.transform(test_X)
-
-    # reshape into time series samples
-    train_X = train_X.reshape(len(train_set_ids), 48, -1)
-    val_X = val_X.reshape(len(val_set_ids), 48, -1)
-    test_X = test_X.reshape(len(test_set_ids), 48, -1)
-
-    train_y = y[y.index.isin(train_set_ids)].sort_index()
-    val_y = y[y.index.isin(val_set_ids)].sort_index()
-    test_y = y[y.index.isin(test_set_ids)].sort_index()
-    train_y, val_y, test_y = train_y.to_numpy(), val_y.to_numpy(), test_y.to_numpy()
-
-    train_ICUType = ICUType[ICUType.index.isin(train_set_ids)].sort_index()
-    val_ICUType = ICUType[ICUType.index.isin(val_set_ids)].sort_index()
-    test_ICUType = ICUType[ICUType.index.isin(test_set_ids)].sort_index()
-    train_ICUType, val_ICUType, test_ICUType = (
-        train_ICUType.to_numpy(),
-        val_ICUType.to_numpy(),
-        test_ICUType.to_numpy(),
-    )
-
-    data = {
-        "n_classes": 2,
-        "n_steps": 48,
-        "n_features": train_X.shape[-1],
-        "train_X": train_X,
-        "train_y": train_y.flatten(),
-        "train_ICUType": train_ICUType.flatten(),
-        "val_X": val_X,
-        "val_y": val_y.flatten(),
-        "val_ICUType": val_ICUType.flatten(),
-        "test_X": test_X,
-        "test_y": test_y.flatten(),
-        "test_ICUType": test_ICUType.flatten(),
-        "scaler": scaler,
-    }
-
-    if artificially_missing_rate > 0:
-        # mask values in the validation set as ground truth
-        val_X_ori = val_X
-        val_X = mcar(val_X, artificially_missing_rate)
-        # mask values in the test set as ground truth
-        test_X_ori = test_X
-        test_X = mcar(test_X, artificially_missing_rate)
-
-        data["val_X"] = val_X
-        data["val_X_ori"] = val_X_ori
-
-        # test_X is for model input
-        data["test_X"] = test_X
-        # test_X_ori is for error calc, not for model input, hence mustn't have NaNs
-        data["test_X_ori"] = np.nan_to_num(test_X_ori)  # fill NaNs for later error calc
-        data["test_X_indicating_mask"] = np.isnan(test_X_ori) ^ np.isnan(test_X)
-
-    return data
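
The tail of the removed function encodes the evaluation protocol: val_X and test_X carry the artificial missingness as model input, the held-out originals serve as ground truth, and the XOR of the two NaN masks flags exactly the artificially masked positions (originally missing entries stay False on both sides). A standalone illustration, reusing pygrinder's mcar exactly as the removed code calls it:

import numpy as np
from pygrinder import mcar

X_ori = np.array([[1.0, np.nan, 3.0, 4.0]])  # one value genuinely missing
X = mcar(X_ori, 0.5)  # artificially mask ~half of the observed values

# True only where X_ori is observed but X is NaN,
# i.e. exactly the artificially masked positions
indicating_mask = np.isnan(X_ori) ^ np.isnan(X)
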
54 changes: 0 additions & 54 deletions pypots/data/load_preprocessing.py

This file was deleted.

71 changes: 0 additions & 71 deletions pypots/data/load_specific_datasets.py

This file was deleted.
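
This commit removes load_specific_dataset and gene_physionet2012 without naming a replacement in the diff itself. One plausible migration path is sketched below, assuming the tsdb package for raw loading and the benchpots package for the prepared splits (both from the same maintainer's ecosystem; neither API is confirmed by this commit):

# hypothetical replacement, APIs assumed rather than confirmed
import tsdb
import benchpots

# raw dataset, in place of load_specific_dataset("physionet_2012")
raw = tsdb.load("physionet_2012")

# fully-prepared splits, in place of gene_physionet2012()
data = benchpots.datasets.preprocess_physionet2012(subset="all", rate=0.1)
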
