From f8da4f6365e6fc2cea2e68338d3625088755ba3f Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Mon, 16 Jan 2023 21:37:10 +0800 Subject: [PATCH 01/22] feat: add the lazy-loading strategy for BaseDataset; --- pypots/data/base.py | 158 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 149 insertions(+), 9 deletions(-) diff --git a/pypots/data/base.py b/pypots/data/base.py index 827b5d93..462852e1 100644 --- a/pypots/data/base.py +++ b/pypots/data/base.py @@ -5,8 +5,12 @@ # Created by Wenjie Du # License: GPL-v3 -from torch.utils.data import Dataset +import h5py import torch +from torch.utils.data import Dataset + +# Currently we only support h5 files +SUPPORTED_DATASET_FILE_TYPE = ["h5py"] class BaseDataset(Dataset): @@ -19,32 +23,81 @@ class BaseDataset(Dataset): y : tensor, shape of [n_samples], optional, default=None, Classification labels of according time-series samples. + + file_path : str, + The path to the dataset file. + + file_type : str, + The type of the given file, should be one of `numpy`, `h5py`, `pickle`. """ - def __init__(self, X, y=None): + def __init__(self, X=None, y=None, file_path=None, file_type="h5py"): super().__init__() # types and shapes had been checked after X and y input into the model # So they are safe to use here. No need to check again. + + assert X is None and file_path is None, f"X and file_path cannot both be None." + assert ( + X is not None and file_path is not None + ), f"X and file_path cannot both be given. Either of them should be given." + assert ( + file_type in SUPPORTED_DATASET_FILE_TYPE + ), f"file_type should be one of {SUPPORTED_DATASET_FILE_TYPE}, but got {file_type}" + self.X = X self.y = y - self.n_steps = self.X.shape[1] - self.n_features = self.X.shape[2] + self.file_path = file_path + self.file_type = file_type + if self.file_path is not None: + self.file_handler = self._open_file_handle() + assert ( + "X" in self.file_handler.keys() + ), "The given dataset file doesn't contains X. Please double check." + self.sample_num = self._get_sample_num() + + if self.X is not None: + self.fetch_data = self._fetch_data_from_array + else: + self.fetch_data = self._fetch_data_from_file + + def _get_sample_num(self): + """Determine the number of samples in the dataset and return the number. + + Returns + ------- + sample_num : int + The number of the samples in the given dataset. + """ + if self.X is not None: + sample_num = len(self.X) + elif self.file_type == "h5py": + with h5py.File(self.file_path, "r") as hf: + sample_num = len(hf["X"]) + else: + raise TypeError(f"So far only h5py is supported.") + + return sample_num def __len__(self): - return len(self.X) + return self.sample_num - def __getitem__(self, idx): - """Fetch data according to index. + def _fetch_data_from_array(self, idx): + """Fetch data from self.X if it is given. Parameters ---------- idx : int, - The index to fetch the specified sample. + The index of the sample to be return. + + Returns + ------- + sample : list, + The collated data sample, a list including all necessary sample info. """ + X = self.X[idx] missing_mask = ~torch.isnan(X) X = torch.nan_to_num(X) - sample = [ torch.tensor(idx), X.to(torch.float32), @@ -55,3 +108,90 @@ def __getitem__(self, idx): sample.append(self.y[idx].to(torch.long)) return sample + + def _open_file_handle(self): + """Open the file handle for reading data from the file. + + Notes + ----- + This function can also help confirm if the given file and file type match. + + Returns + ------- + file_handle : file. 
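The lazy-loading strategy above expects the dataset file to be an HDF5 file exposing the key "X" (and optionally "y" for labels), as the assertion on file_handler.keys() shows. An illustrative sketch of how such a file could be prepared, assuming only h5py and numpy; the helper name and toy shapes are invented for the example and are not part of PyPOTS:

# Illustrative only: write a dataset file that the lazy-loading BaseDataset can read,
# i.e. an HDF5 file carrying the key "X" (and optionally "y").
import h5py
import numpy as np

def save_h5_dataset(path, X, y=None):
    # X: array of shape [n_samples, n_steps, n_features]; NaNs mark missing values
    with h5py.File(path, "w") as hf:
        hf.create_dataset("X", data=np.asarray(X, dtype=np.float32))
        if y is not None:
            hf.create_dataset("y", data=np.asarray(y))

toy_X = np.random.randn(100, 48, 37).astype(np.float32)
toy_X[toy_X < -1.5] = np.nan  # artificially introduce missing values
save_h5_dataset("toy_dataset.h5", toy_X, y=np.random.randint(0, 2, size=100))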
+ + """ + try: + file_handler = h5py.File( + self.file_path, "r" + ) # set swmr=True if the h5 file need to be written into new content during reading + except OSError as e: + raise TypeError( + f"{e} This probably is caused by file type error. " + f"Please confirm that the given file {self.file_path} is an h5 file." + ) + except Exception as e: + raise RuntimeError(e) + return file_handler + + def _fetch_data_from_file(self, idx): + """Fetch data with the lazy-loading strategy, i.e. only loading data from the file while requesting for samples. + Here the opened file handle does not load the entire dataset into RAM but only load the currently accessed slice + + Notes + ----- + Multi workers reading from h5 file is tricky, and I was confronted with a problem similar to + https://discuss.pytorch.org/t/dataloader-when-num-worker-0-there-is-bug/25643/7 in 2020, please + refer to it for more details about the problem. + The implementation here is referred to + https://discuss.pytorch.org/t/dataloader-when-num-worker-0-there-is-bug/25643/10 + And according to https://discuss.pytorch.org/t/dataloader-when-num-worker-0-there-is-bug/25643/37, + pytorch v1.7.1 and h5py v3.2.0 work well, so probably updating to the latest version can avoid the + issue I met. After all, this implementation may need to be updated in the near future. + + Parameters + ---------- + idx : int, + The index of the sample to be return. + + Returns + ------- + sample : list, + The collated data sample, a list including all necessary sample info. + """ + + if self.file_handler is None: + self.file_handler = self._open_file_handle() + + X = self.file_handler["X"][idx] + missing_mask = ~torch.isnan(X) + X = torch.nan_to_num(X) + sample = [ + torch.tensor(idx), + X.to(torch.float32), + missing_mask.to(torch.float32), + ] + + if ( + "y" in self.file_handler.keys() + ): # if the dataset has labels, then fetch it from the file + sample.append(self.file_handler["y"][idx].to(torch.long)) + + return sample + + def __getitem__(self, idx): + """Fetch data according to index. + + Parameters + ---------- + idx : int, + The index to fetch the specified sample. + + Returns + ------- + sample : list, + The collated data sample, a list including all necessary sample info. + """ + + sample = self.fetch_data(idx) + return sample From df2414b4a767c8f1b775b90301dacd936011a785 Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Sun, 19 Feb 2023 16:06:23 +0800 Subject: [PATCH 02/22] feat: add the file lazy-loading strategy for classes derived from BaseDataset; --- pypots/data/base.py | 5 +- pypots/data/dataset_for_brits.py | 127 ++++++++++++++++++++++--------- pypots/data/dataset_for_grud.py | 71 +++++++++++++---- pypots/data/dataset_for_mit.py | 46 +++++++++-- 4 files changed, 194 insertions(+), 55 deletions(-) diff --git a/pypots/data/base.py b/pypots/data/base.py index 462852e1..a1aff6b7 100644 --- a/pypots/data/base.py +++ b/pypots/data/base.py @@ -5,6 +5,7 @@ # Created by Wenjie Du # License: GPL-v3 +from abc import abstractmethod import h5py import torch from torch.utils.data import Dataset @@ -81,6 +82,7 @@ def _get_sample_num(self): def __len__(self): return self.sample_num + @abstractmethod def _fetch_data_from_array(self, idx): """Fetch data from self.X if it is given. @@ -134,9 +136,10 @@ def _open_file_handle(self): raise RuntimeError(e) return file_handler + @abstractmethod def _fetch_data_from_file(self, idx): """Fetch data with the lazy-loading strategy, i.e. only loading data from the file while requesting for samples. 
- Here the opened file handle does not load the entire dataset into RAM but only load the currently accessed slice + Here the opened file handle doesn't load the entire dataset into RAM but only load the currently accessed slice. Notes ----- diff --git a/pypots/data/dataset_for_brits.py b/pypots/data/dataset_for_brits.py index 0f3ee6a7..415d1530 100644 --- a/pypots/data/dataset_for_brits.py +++ b/pypots/data/dataset_for_brits.py @@ -5,6 +5,7 @@ # Created by Wenjie Du # License: GLP-v3 +import numpy as np import torch from pypots.data.base import BaseDataset @@ -56,43 +57,44 @@ class DatasetForBRITS(BaseDataset): Classification labels of according time-series samples. """ - def __init__(self, X, y=None): - super().__init__(X, y) - - # calculate all delta here. - # Training will take too much time if we put delta calculation in __getitem__(). - forward_missing_mask = (~torch.isnan(X)).type(torch.float32) - forward_X = torch.nan_to_num(X) - forward_delta = parse_delta(forward_missing_mask) - backward_X = torch.flip(forward_X, dims=[1]) - backward_missing_mask = torch.flip(forward_missing_mask, dims=[1]) - backward_delta = parse_delta(backward_missing_mask) - - self.data = { - "forward": { - "X": forward_X, - "missing_mask": forward_missing_mask, - "delta": forward_delta, - }, - "backward": { - "X": backward_X, - "missing_mask": backward_missing_mask, - "delta": backward_delta, - }, - } - - def __getitem__(self, idx): - """Fetch data according to index. + def __init__(self, X=None, y=None, file_path=None, file_type="h5py"): + super().__init__(X, y, file_path, file_type) + + if self.X is not None: + # calculate all delta here. + # Training will take too much time if we put delta calculation in __getitem__(). + forward_missing_mask = (~torch.isnan(X)).type(torch.float32) + forward_X = torch.nan_to_num(X) + forward_delta = parse_delta(forward_missing_mask) + backward_X = torch.flip(forward_X, dims=[1]) + backward_missing_mask = torch.flip(forward_missing_mask, dims=[1]) + backward_delta = parse_delta(backward_missing_mask) + + self.processed_data = { + "forward": { + "X": forward_X, + "missing_mask": forward_missing_mask, + "delta": forward_delta, + }, + "backward": { + "X": backward_X, + "missing_mask": backward_missing_mask, + "delta": backward_delta, + }, + } + + def _fetch_data_from_array(self, idx): + """Fetch data from self.X if it is given. Parameters ---------- idx : int, - The index to fetch the specified sample. + The index of the sample to be return. Returns ------- - dict, - A dict contains + sample : list, + A list contains index : int tensor, The index of the sample. 
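The lazy-loading pattern added to BaseDataset above, i.e. deferring h5py.File() until a sample is actually requested so that each DataLoader worker process ends up with its own file handle and only the accessed slice is read into memory (the workaround for the multi-worker issue cited in the Notes), can be distilled into the following standalone sketch. The class name and HDF5 keys are assumptions for illustration, not PyPOTS API:

import h5py
import torch
from torch.utils.data import Dataset, DataLoader

class LazyH5Dataset(Dataset):
    """Minimal sketch of per-sample lazy loading from an HDF5 file."""

    def __init__(self, file_path):
        self.file_path = file_path
        self.file_handle = None  # opened lazily, once per worker process
        with h5py.File(file_path, "r") as hf:  # open briefly only to count samples
            self.n_samples = len(hf["X"])

    def __len__(self):
        return self.n_samples

    def __getitem__(self, idx):
        if self.file_handle is None:  # first access inside this worker
            self.file_handle = h5py.File(self.file_path, "r")
        X = torch.from_numpy(self.file_handle["X"][idx])  # only this slice is read from disk
        missing_mask = (~torch.isnan(X)).to(torch.float32)
        X = torch.nan_to_num(X).to(torch.float32)
        return torch.tensor(idx), X, missing_mask

loader = DataLoader(LazyH5Dataset("toy_dataset.h5"), batch_size=32, num_workers=2)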
@@ -112,16 +114,69 @@ def __getitem__(self, idx): sample = [ torch.tensor(idx), # for forward - self.data["forward"]["X"][idx].to(torch.float32), - self.data["forward"]["missing_mask"][idx].to(torch.float32), - self.data["forward"]["delta"][idx].to(torch.float32), + self.processed_data["forward"]["X"][idx].to(torch.float32), + self.processed_data["forward"]["missing_mask"][idx].to(torch.float32), + self.processed_data["forward"]["delta"][idx].to(torch.float32), # for backward - self.data["backward"]["X"][idx].to(torch.float32), - self.data["backward"]["missing_mask"][idx].to(torch.float32), - self.data["backward"]["delta"][idx].to(torch.float32), + self.processed_data["backward"]["X"][idx].to(torch.float32), + self.processed_data["backward"]["missing_mask"][idx].to(torch.float32), + self.processed_data["backward"]["delta"][idx].to(torch.float32), ] if self.y is not None: sample.append(self.y[idx].to(torch.long)) return sample + + def _fetch_data_from_file(self, idx): + """Fetch data with the lazy-loading strategy, i.e. only loading data from the file while requesting for samples. + Here the opened file handle doesn't load the entire dataset into RAM but only load the currently accessed slice. + + Parameters + ---------- + idx : int, + The index of the sample to be return. + + Returns + ------- + sample : list, + The collated data sample, a list including all necessary sample info. + """ + + if self.file_handler is None: + self.file_handler = self._open_file_handle() + + X = self.file_handler["X"][idx] + missing_mask = (~np.isnan(X)).astype("float32") + X = np.nan_to_num(X) + + forward = { + "X": X, + "missing_mask": missing_mask, + "deltas": parse_delta(missing_mask), + } + + backward = { + "X": np.flip(forward["X"], axis=0).copy(), + "missing_mask": np.flip(forward["missing_mask"], axis=0).copy(), + } + backward["deltas"] = parse_delta(backward["missing_mask"]) + + sample = [ + torch.tensor(idx), + # for forward + torch.from_numpy(forward["X"].astype("float32")), + torch.from_numpy(forward["missing_mask"].astype("float32")), + torch.from_numpy(forward["deltas"].astype("float32")), + # for backward + torch.from_numpy(backward["X"].astype("float32")), + torch.from_numpy(backward["missing_mask"].astype("float32")), + torch.from_numpy(backward["deltas"].astype("float32")), + ] + + if ( + "y" in self.file_handler.keys() + ): # if the dataset has labels, then fetch it from the file + sample.append(self.file_handler["y"][idx].to(torch.long)) + + return sample diff --git a/pypots/data/dataset_for_grud.py b/pypots/data/dataset_for_grud.py index f3dd1d80..dccaa102 100644 --- a/pypots/data/dataset_for_grud.py +++ b/pypots/data/dataset_for_grud.py @@ -25,19 +25,21 @@ class DatasetForGRUD(BaseDataset): Classification labels of according time-series samples. 
""" - def __init__(self, X, y=None): - super().__init__(X, y) + def __init__(self, X=None, y=None, file_path=None, file_type="h5py"): + super().__init__(X, y, file_path, file_type) self.locf = LOCF() - self.missing_mask = (~torch.isnan(X)).to(torch.float32) - self.X = torch.nan_to_num(X) - self.deltas = parse_delta(self.missing_mask) - self.X_filledLOCF = self.locf.locf_torch(X) - self.empirical_mean = torch.sum( - self.missing_mask * self.X, dim=[0, 1] - ) / torch.sum(self.missing_mask, dim=[0, 1]) - - def __getitem__(self, idx): + + if self.X is not None: + self.missing_mask = (~torch.isnan(X)).to(torch.float32) + self.X_filledLOCF = self.locf.locf_torch(X) + self.X = torch.nan_to_num(X) + self.deltas = parse_delta(self.missing_mask) + self.empirical_mean = torch.sum( + self.missing_mask * self.X, dim=[0, 1] + ) / torch.sum(self.missing_mask, dim=[0, 1]) + + def _fetch_data_from_array(self, idx): """Fetch data according to index. Parameters @@ -47,8 +49,8 @@ def __getitem__(self, idx): Returns ------- - dict, - A dict contains + sample : list, + A list contains index : int tensor, The index of the sample. @@ -81,3 +83,46 @@ def __getitem__(self, idx): sample.append(self.y[idx].to(torch.long)) return sample + + def _fetch_data_from_file(self, idx): + """Fetch data with the lazy-loading strategy, i.e. only loading data from the file while requesting for samples. + Here the opened file handle doesn't load the entire dataset into RAM but only load the currently accessed slice. + + Parameters + ---------- + idx : int, + The index of the sample to be return. + + Returns + ------- + sample : list, + The collated data sample, a list including all necessary sample info. + """ + + if self.file_handler is None: + self.file_handler = self._open_file_handle() + + X = torch.from_numpy(self.file_handler["X"][idx]) + missing_mask = (~torch.isnan(X)).to(torch.float32) + X_filledLOCF = self.locf.locf_torch(X) + X = torch.nan_to_num(X) + deltas = parse_delta(missing_mask) + empirical_mean = torch.sum(missing_mask * X, dim=[0, 1]) / torch.sum( + missing_mask, dim=[0, 1] + ) + + sample = [ + torch.tensor(idx), + X, + X_filledLOCF, + missing_mask, + deltas, + empirical_mean, + ] + + if ( + "y" in self.file_handler.keys() + ): # if the dataset has labels, then fetch it from the file + sample.append(self.file_handler["y"][idx].to(torch.long)) + + return sample diff --git a/pypots/data/dataset_for_mit.py b/pypots/data/dataset_for_mit.py index b24e3f75..5da35590 100644 --- a/pypots/data/dataset_for_mit.py +++ b/pypots/data/dataset_for_mit.py @@ -36,11 +36,11 @@ class DatasetForMIT(BaseDataset): """ - def __init__(self, X, y=None, rate=0.2): - super().__init__(X, y) + def __init__(self, X=None, y=None, file_path=None, file_type="h5py", rate=0.2): + super().__init__(X, y, file_path, file_type) self.rate = rate - def __getitem__(self, idx): + def _fetch_data_from_array(self, idx): """Fetch data according to index. Parameters @@ -50,8 +50,8 @@ def __getitem__(self, idx): Returns ------- - dict, - A dict contains + sample : list, + A list contains index : int tensor, The index of the sample. @@ -83,3 +83,39 @@ def __getitem__(self, idx): sample.append(self.y[idx].to(torch.long)) return sample + + def _fetch_data_from_file(self, idx): + """Fetch data with the lazy-loading strategy, i.e. only loading data from the file while requesting for samples. + Here the opened file handle doesn't load the entire dataset into RAM but only load the currently accessed slice. 
+ + Parameters + ---------- + idx : int, + The index of the sample to be return. + + Returns + ------- + sample : list, + The collated data sample, a list including all necessary sample info. + """ + + if self.file_handler is None: + self.file_handler = self._open_file_handle() + + X = torch.from_numpy(self.file_handler["X"][idx]) + X_intact, X, missing_mask, indicating_mask = mcar(X, rate=self.rate) + + sample = [ + torch.tensor(idx), + X_intact.to(torch.float32), + X.to(torch.float32), + missing_mask.to(torch.float32), + indicating_mask.to(torch.float32), + ] + + if ( + "y" in self.file_handler.keys() + ): # if the dataset has labels, then fetch it from the file + sample.append(self.file_handler["y"][idx].to(torch.long)) + + return sample From 19c5bb3ce644b548c8c5b4cd24947ba3f26c15f0 Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Wed, 29 Mar 2023 21:20:46 +0800 Subject: [PATCH 03/22] doc: update the reference info; --- CITATION.cff | 4 ++-- README.md | 50 ++++++++++++++++++++++++++------------------------ 2 files changed, 28 insertions(+), 26 deletions(-) diff --git a/CITATION.cff b/CITATION.cff index 49eed6c0..64753889 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -5,7 +5,7 @@ authors: given-names: "Wenjie" orcid: "https://orcid.org/0000-0003-3046-7835" title: "PyPOTS: A Python Toolbox for Data Mining on Partially-Observed Time Series" -version: 0.0.7 -doi: 10.5281/zenodo.6823222 +version: 0.0.9 +doi: 10.5281/zenodo.6823221 date-released: 2022-07-12 url: "https://github.com/WenjieDu/PyPOTS" \ No newline at end of file diff --git a/README.md b/README.md index 686b3042..5f95c52e 100644 --- a/README.md +++ b/README.md @@ -5,44 +5,46 @@

- + - + - PyPI + PyPI - on anaconda + on anaconda - + - - - + + + + + - - - + + + + + + + - + - - - - - - - + + + - - - + + +

@@ -112,13 +114,13 @@ author = {Wenjie Du}, title = {{PyPOTS: A Python Toolbox for Data Mining on Partially-Observed Time Series}}, howpublished = {\url{https://github.com/wenjiedu/pypots}}, year = {2022}, -doi = {10.5281/zenodo.6823222}, +doi = {10.5281/zenodo.6823221}, } ``` or -`Wenjie Du. (2022). PyPOTS: A Python Toolbox for Data Mining on Partially-Observed Time Series. Zenodo. https://doi.org/10.5281/zenodo.6823222` +`Wenjie Du. (2022). PyPOTS: A Python Toolbox for Data Mining on Partially-Observed Time Series. Zenodo. https://doi.org/10.5281/zenodo.6823221` ## ❖ Attention 👀 The documentation and tutorials are under construction. And a short paper introducing PyPOTS is on the way! 🚀 Stay tuned please! From 3c56ce272ef7dc935b85662886f128e8847c4dc5 Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Thu, 30 Mar 2023 15:08:12 +0800 Subject: [PATCH 04/22] fix: imputation models applying MIT do not need use DatasetForMIT on val_set; --- pypots/imputation/base.py | 4 ++-- pypots/imputation/brits.py | 4 ++-- pypots/imputation/saits.py | 12 +++++++++--- pypots/imputation/transformer.py | 13 ++++++++++--- 4 files changed, 23 insertions(+), 10 deletions(-) diff --git a/pypots/imputation/base.py b/pypots/imputation/base.py index e62ae50c..a16769fe 100644 --- a/pypots/imputation/base.py +++ b/pypots/imputation/base.py @@ -164,8 +164,8 @@ def _train_model( with torch.no_grad(): for idx, data in enumerate(val_loader): inputs = self.assemble_input_for_validating(data) - results = self.model.forward(inputs) - imputation_collector.append(results["imputed_data"]) + imputed_data, _ = self.model.impute(inputs) + imputation_collector.append(imputed_data) imputation_collector = torch.cat(imputation_collector) imputation_collector = imputation_collector diff --git a/pypots/imputation/brits.py b/pypots/imputation/brits.py index d15c8e33..dd61b1e6 100644 --- a/pypots/imputation/brits.py +++ b/pypots/imputation/brits.py @@ -336,7 +336,7 @@ def impute(self, inputs): imputed_data_b = {"imputed_data_b": imputed_data_b} imputed_data_b = self.reverse(imputed_data_b)["imputed_data_b"] imputed_data = (imputed_data_f + imputed_data_b) / 2 - return imputed_data + return imputed_data, None @staticmethod def get_consistency_loss(pred_f, pred_b): @@ -620,7 +620,7 @@ def impute(self, X): with torch.no_grad(): for idx, data in enumerate(test_loader): inputs = self.assemble_input_for_testing(data) - imputed_data = self.model.impute(inputs) + imputed_data, _ = self.model.impute(inputs) imputation_collector.append(imputed_data) imputation_collector = torch.cat(imputation_collector) diff --git a/pypots/imputation/saits.py b/pypots/imputation/saits.py index d32bd0ab..4b3a33cc 100644 --- a/pypots/imputation/saits.py +++ b/pypots/imputation/saits.py @@ -230,7 +230,7 @@ def fit(self, train_X, val_X=None): val_X, 0.2 ) val_X = masked_fill(val_X, 1 - val_X_missing_mask, torch.nan) - val_set = DatasetForMIT(val_X) + val_set = BaseDataset(val_X) val_loader = DataLoader(val_set, batch_size=self.batch_size, shuffle=False) self._train_model( training_loader, val_loader, val_X_intact, val_X_indicating_mask @@ -282,7 +282,13 @@ def assemble_input_for_validating(self, data) -> dict: inputs : dict, A python dictionary contains the input data for model validating. """ - return self.assemble_input_for_training(data) + indices, X, missing_mask = data + + inputs = { + "X": X, + "missing_mask": missing_mask, + } + return inputs def assemble_input_for_testing(self, data) -> dict: """Assemble the given data into a dictionary for testing input. 
@@ -301,7 +307,7 @@ def assemble_input_for_testing(self, data) -> dict: inputs : dict, A python dictionary contains the input data for model testing. """ - return self.assemble_input_for_training(data) + return self.assemble_input_for_validating(data) def impute(self, X): X = self.check_input(self.n_steps, self.n_features, X) diff --git a/pypots/imputation/transformer.py b/pypots/imputation/transformer.py index c84c30b1..9ea288ed 100644 --- a/pypots/imputation/transformer.py +++ b/pypots/imputation/transformer.py @@ -320,7 +320,7 @@ def fit(self, train_X, val_X=None): val_X, 0.2 ) val_X = masked_fill(val_X, 1 - val_X_missing_mask, np.nan) - val_set = DatasetForMIT(val_X) + val_set = BaseDataset(val_X) val_loader = DataLoader(val_set, batch_size=self.batch_size, shuffle=False) self._train_model( training_loader, val_loader, val_X_intact, val_X_indicating_mask @@ -373,7 +373,14 @@ def assemble_input_for_validating(self, data) -> dict: inputs : dict, A python dictionary contains the input data for model validating. """ - return self.assemble_input_for_training(data) + indices, X, missing_mask = data + + inputs = { + "X": X, + "missing_mask": missing_mask, + } + + return inputs def assemble_input_for_testing(self, data) -> dict: """Assemble the given data into a dictionary for testing input. @@ -392,7 +399,7 @@ def assemble_input_for_testing(self, data) -> dict: inputs : dict, A python dictionary contains the input data for model testing. """ - return self.assemble_input_for_training(data) + return self.assemble_input_for_validating(data) def impute(self, X): X = self.check_input(self.n_steps, self.n_features, X) From 5927909f8d23a90eee8374805fe0291cc1ccc1aa Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Thu, 30 Mar 2023 15:35:38 +0800 Subject: [PATCH 05/22] fix: only import h5py when needed; --- pypots/data/base.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/pypots/data/base.py b/pypots/data/base.py index a1aff6b7..c4da8c79 100644 --- a/pypots/data/base.py +++ b/pypots/data/base.py @@ -6,7 +6,6 @@ # License: GPL-v3 from abc import abstractmethod -import h5py import torch from torch.utils.data import Dataset @@ -71,9 +70,10 @@ def _get_sample_num(self): """ if self.X is not None: sample_num = len(self.X) - elif self.file_type == "h5py": - with h5py.File(self.file_path, "r") as hf: - sample_num = len(hf["X"]) + elif self.file_path is not None and self.file_type == "h5py": + if self.file_handler is None: + self.file_handler = self._open_file_handle() + sample_num = len(self.file_handler["X"]) else: raise TypeError(f"So far only h5py is supported.") @@ -124,9 +124,15 @@ def _open_file_handle(self): """ try: + import h5py + file_handler = h5py.File( self.file_path, "r" ) # set swmr=True if the h5 file need to be written into new content during reading + except ImportError: + raise ImportError( + "h5py is missing and cannot be imported. Please install it first." + ) except OSError as e: raise TypeError( f"{e} This probably is caused by file type error. 
" From 4a9c5be0d1417872f197771f83336bbd85c3ae83 Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Thu, 30 Mar 2023 15:51:53 +0800 Subject: [PATCH 06/22] feat: move check_input() to BaseDataset; --- pypots/base.py | 96 ------------------------------- pypots/classification/brits.py | 5 -- pypots/classification/grud.py | 5 -- pypots/classification/raindrop.py | 5 -- pypots/clustering/crli.py | 2 - pypots/clustering/vader.py | 2 - pypots/data/base.py | 83 ++++++++++++++++++++++++++ pypots/forecasting/bttf.py | 1 - pypots/imputation/brits.py | 5 -- pypots/imputation/saits.py | 4 -- pypots/imputation/transformer.py | 4 -- 11 files changed, 83 insertions(+), 129 deletions(-) diff --git a/pypots/base.py b/pypots/base.py index 49b1b0c2..0f2e69e4 100644 --- a/pypots/base.py +++ b/pypots/base.py @@ -8,7 +8,6 @@ import os from abc import ABC -import numpy as np import torch from pypots.utils.files import create_dir_if_not_exist @@ -32,101 +31,6 @@ def __init__(self, device): else: self.device = device - def check_input( - self, expected_n_steps, expected_n_features, X, y=None, out_dtype="tensor" - ): - """Check value type and shape of input X and y - - Parameters - ---------- - expected_n_steps : int - Number of time steps of input time series (X) that the model expects. - This value is the same with the argument `n_steps` used to initialize the model. - - expected_n_features : int - Number of feature dimensions of input time series (X) that the model expects. - This value is the same with the argument `n_features` used to initialize the model. - - X : array-like, - Time-series data that must have a shape like [n_samples, expected_n_steps, expected_n_features]. - - y : array-like, default=None - Labels of time-series samples (X) that must have a shape like [n_samples] or [n_samples, n_classes]. 
- - out_dtype : str, in ['tensor', 'ndarray'], default='tensor' - Data type of the output, should be np.ndarray or torch.Tensor - - Returns - ------- - X : tensor - - y : tensor - """ - assert out_dtype in [ - "tensor", - "ndarray", - ], f'out_dtype should be "tensor" or "ndarray", but got {out_dtype}' - is_list = isinstance(X, list) - is_array = isinstance(X, np.ndarray) - is_tensor = isinstance(X, torch.Tensor) - assert is_tensor or is_array or is_list, TypeError( - "X should be an instance of list/np.ndarray/torch.Tensor, " - f"but got {type(X)}" - ) - - # convert the data type if in need - if out_dtype == "tensor": - if is_list: - X = torch.tensor(X).to(self.device) - elif is_array: - X = torch.from_numpy(X).to(self.device) - else: # is tensor - X = X.to(self.device) - else: # out_dtype is ndarray - # convert to np.ndarray first for shape check - if is_list: - X = np.asarray(X) - elif is_tensor: - X = X.numpy() - else: # is ndarray - pass - - # check the shape of X here - X_shape = X.shape - assert len(X_shape) == 3, ( - f"input should have 3 dimensions [n_samples, seq_len, n_features]," - f"but got shape={X.shape}" - ) - assert ( - X_shape[1] == expected_n_steps - ), f"expect X.shape[1] to be {expected_n_steps}, but got {X_shape[1]}" - assert ( - X_shape[2] == expected_n_features - ), f"expect X.shape[2] to be {expected_n_features}, but got {X_shape[2]}" - - if y is not None: - assert len(X) == len(y), ( - f"lengths of X and y must match, " f"but got f{len(X)} and {len(y)}" - ) - if isinstance(y, torch.Tensor): - y = y.to(self.device) if out_dtype == "tensor" else y.numpy() - elif isinstance(y, list): - y = ( - torch.tensor(y).to(self.device) - if out_dtype == "tensor" - else np.asarray(y) - ) - elif isinstance(y, np.ndarray): - y = torch.from_numpy(y).to(self.device) if out_dtype == "tensor" else y - else: - raise TypeError( - "y should be an instance of list/np.ndarray/torch.Tensor, " - f"but got {type(y)}" - ) - return X, y - else: - return X - def save_logs_to_tensorboard(self, saving_path): """Save logs (self.logger) into a tensorboard file. diff --git a/pypots/classification/brits.py b/pypots/classification/brits.py index 5ef03860..961979d5 100644 --- a/pypots/classification/brits.py +++ b/pypots/classification/brits.py @@ -196,10 +196,6 @@ def fit(self, train_X, train_y, val_X=None, val_y=None): self : object, Trained model. """ - train_X, train_y = self.check_input( - self.n_steps, self.n_features, train_X, train_y - ) - val_X, val_y = self.check_input(self.n_steps, self.n_features, val_X, val_y) training_set = DatasetForBRITS( train_X, train_y @@ -326,7 +322,6 @@ def assemble_input_for_testing(self, data) -> dict: return inputs def classify(self, X): - X = self.check_input(self.n_steps, self.n_features, X) self.model.eval() # set the model as eval status to freeze it. test_set = DatasetForBRITS(X) test_loader = DataLoader(test_set, batch_size=self.batch_size, shuffle=False) diff --git a/pypots/classification/grud.py b/pypots/classification/grud.py index 69929dcc..4435afeb 100644 --- a/pypots/classification/grud.py +++ b/pypots/classification/grud.py @@ -160,10 +160,6 @@ def fit(self, train_X, train_y, val_X=None, val_y=None): self : object, Trained model. 
""" - train_X, train_y = self.check_input( - self.n_steps, self.n_features, train_X, train_y - ) - val_X, val_y = self.check_input(self.n_steps, self.n_features, val_X, val_y) training_set = DatasetForGRUD(train_X, train_y) training_loader = DataLoader( @@ -260,7 +256,6 @@ def assemble_input_for_testing(self, data) -> dict: return inputs def classify(self, X): - X = self.check_input(self.n_steps, self.n_features, X) self.model.eval() # set the model as eval status to freeze it. test_set = DatasetForGRUD(X) test_loader = DataLoader(test_set, batch_size=self.batch_size, shuffle=False) diff --git a/pypots/classification/raindrop.py b/pypots/classification/raindrop.py index c6204bc5..fd5f8885 100644 --- a/pypots/classification/raindrop.py +++ b/pypots/classification/raindrop.py @@ -681,10 +681,6 @@ def fit(self, train_X, train_y, val_X=None, val_y=None): self : object, Trained model. """ - train_X, train_y = self.check_input( - self.n_steps, self.n_features, train_X, train_y - ) - val_X, val_y = self.check_input(self.n_steps, self.n_features, val_X, val_y) training_set = DatasetForGRUD(train_X, train_y) training_loader = DataLoader( @@ -789,7 +785,6 @@ def assemble_input_for_testing(self, data) -> dict: return inputs def classify(self, X): - X = self.check_input(self.n_steps, self.n_features, X) self.model.eval() # set the model as eval status to freeze it. test_set = DatasetForGRUD(X) test_loader = DataLoader(test_set, batch_size=self.batch_size, shuffle=False) diff --git a/pypots/clustering/crli.py b/pypots/clustering/crli.py index b0bd9723..2cbd1c33 100644 --- a/pypots/clustering/crli.py +++ b/pypots/clustering/crli.py @@ -353,7 +353,6 @@ def __init__( self.logger = {"training_loss_generator": [], "training_loss_discriminator": []} def fit(self, train_X): - train_X = self.check_input(self.n_steps, self.n_features, train_X) training_set = DatasetForGRUD(train_X) training_loader = DataLoader( training_set, batch_size=self.batch_size, shuffle=True @@ -516,7 +515,6 @@ def _train_model(self, training_loader, val_loader=None): logger.info("Finished training.") def cluster(self, X): - X = self.check_input(self.n_steps, self.n_features, X) self.model.eval() # set the model as eval status to freeze it. test_set = DatasetForGRUD(X) test_loader = DataLoader(test_set, batch_size=self.batch_size, shuffle=False) diff --git a/pypots/clustering/vader.py b/pypots/clustering/vader.py index 14f682fe..141e772e 100644 --- a/pypots/clustering/vader.py +++ b/pypots/clustering/vader.py @@ -379,7 +379,6 @@ def __init__( self._print_model_size() def fit(self, train_X): - train_X = self.check_input(self.n_steps, self.n_features, train_X) training_set = DatasetForGRUD(train_X) training_loader = DataLoader( training_set, batch_size=self.batch_size, shuffle=True @@ -558,7 +557,6 @@ def _train_model(self, training_loader, val_loader=None): logger.info("Finished training.") def cluster(self, X): - X = self.check_input(self.n_steps, self.n_features, X) self.model.eval() # set the model as eval status to freeze it. 
test_set = DatasetForGRUD(X) test_loader = DataLoader(test_set, batch_size=self.batch_size, shuffle=False) diff --git a/pypots/data/base.py b/pypots/data/base.py index c4da8c79..4339b761 100644 --- a/pypots/data/base.py +++ b/pypots/data/base.py @@ -6,6 +6,8 @@ # License: GPL-v3 from abc import abstractmethod + +import numpy as np import torch from torch.utils.data import Dataset @@ -44,17 +46,23 @@ def __init__(self, X=None, y=None, file_path=None, file_type="h5py"): file_type in SUPPORTED_DATASET_FILE_TYPE ), f"file_type should be one of {SUPPORTED_DATASET_FILE_TYPE}, but got {file_type}" + if X is not None: + X, y = self.check_input(X, y) + self.X = X self.y = y self.file_path = file_path self.file_type = file_type + if self.file_path is not None: self.file_handler = self._open_file_handle() assert ( "X" in self.file_handler.keys() ), "The given dataset file doesn't contains X. Please double check." + self.sample_num = self._get_sample_num() + # set up function fetch_data() if self.X is not None: self.fetch_data = self._fetch_data_from_array else: @@ -82,6 +90,81 @@ def _get_sample_num(self): def __len__(self): return self.sample_num + def check_input(self, X, y=None, out_dtype="tensor"): + """Check value type and shape of input X and y + + Parameters + ---------- + X : array-like, + Time-series data that must have a shape like [n_samples, expected_n_steps, expected_n_features]. + + y : array-like, default=None + Labels of time-series samples (X) that must have a shape like [n_samples] or [n_samples, n_classes]. + + out_dtype : str, in ['tensor', 'ndarray'], default='tensor' + Data type of the output, should be np.ndarray or torch.Tensor + + Returns + ------- + X : array-like + + y : array-like + """ + assert out_dtype in [ + "tensor", + "ndarray", + ], f'out_dtype should be "tensor" or "ndarray", but got {out_dtype}' + + is_list = isinstance(X, list) + is_array = isinstance(X, np.ndarray) + is_tensor = isinstance(X, torch.Tensor) + assert is_tensor or is_array or is_list, TypeError( + "X should be an instance of list/np.ndarray/torch.Tensor, " + f"but got {type(X)}" + ) + + # convert the data type if in need + if out_dtype == "tensor": + if is_list: + X = torch.tensor(X).to() + elif is_array: + X = torch.from_numpy(X).to() + else: # is tensor + pass + else: # out_dtype is ndarray + # convert to np.ndarray first for shape check + if is_list: + X = np.asarray(X) + elif is_tensor: + X = X.numpy() + else: # is ndarray + pass + + # check the shape of X here + X_shape = X.shape + assert len(X_shape) == 3, ( + f"input should have 3 dimensions [n_samples, seq_len, n_features]," + f"but got shape={X_shape}" + ) + + if y is not None: + assert len(X) == len(y), ( + f"lengths of X and y must match, " f"but got f{len(X)} and {len(y)}" + ) + if isinstance(y, torch.Tensor): + y = y if out_dtype == "tensor" else y.numpy() + elif isinstance(y, list): + y = torch.tensor(y) if out_dtype == "tensor" else np.asarray(y) + elif isinstance(y, np.ndarray): + y = torch.from_numpy(y) if out_dtype == "tensor" else y + else: + raise TypeError( + "y should be an instance of list/np.ndarray/torch.Tensor, " + f"but got {type(y)}" + ) + + return X, y + @abstractmethod def _fetch_data_from_array(self, idx): """Fetch data from self.X if it is given. 
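With check_input() relocated into BaseDataset, input validation now happens when a dataset object is constructed rather than inside every model. A hedged usage sketch of the two construction paths available at this point in the series (in-memory arrays versus a file path; patch 09 later merges these into a single data argument), using DatasetForGRUD as a concrete subclass and the toy h5 file sketched earlier:

import numpy as np
from torch.utils.data import DataLoader
from pypots.data.dataset_for_grud import DatasetForGRUD

X = np.random.randn(64, 24, 10).astype("float32")  # [n_samples, n_steps, n_features], NaNs allowed
y = np.random.randint(0, 2, size=64)

in_memory_set = DatasetForGRUD(X=X, y=y)            # check_input() validates shapes and converts to tensors
lazy_set = DatasetForGRUD(file_path="toy_dataset.h5")  # or read samples lazily from an h5 file

loader = DataLoader(in_memory_set, batch_size=16, shuffle=True)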
diff --git a/pypots/forecasting/bttf.py b/pypots/forecasting/bttf.py index 03711d5f..4f81cb4c 100644 --- a/pypots/forecasting/bttf.py +++ b/pypots/forecasting/bttf.py @@ -462,7 +462,6 @@ def fit(self, train_X): warnings.warn("Please run func forecast(X) directly.") def forecast(self, X): - self.check_input(self.n_steps, self.n_features, X, out_dtype="ndarray") X = X.transpose((0, 2, 1)) pred = BTTF_forecast( diff --git a/pypots/imputation/brits.py b/pypots/imputation/brits.py index dd61b1e6..b93311bd 100644 --- a/pypots/imputation/brits.py +++ b/pypots/imputation/brits.py @@ -511,10 +511,6 @@ def fit(self, train_X, val_X=None): self : object, Trained model. """ - train_X = self.check_input(self.n_steps, self.n_features, train_X) - if val_X is not None: - val_X = self.check_input(self.n_steps, self.n_features, val_X) - training_set = DatasetForBRITS(train_X) # time_gaps is necessary for BRITS training_loader = DataLoader( training_set, batch_size=self.batch_size, shuffle=True @@ -611,7 +607,6 @@ def assemble_input_for_testing(self, data) -> dict: return self.assemble_input_for_training(data) def impute(self, X): - X = self.check_input(self.n_steps, self.n_features, X) self.model.eval() # set the model as eval status to freeze it. test_set = DatasetForBRITS(X) test_loader = DataLoader(test_set, batch_size=self.batch_size, shuffle=False) diff --git a/pypots/imputation/saits.py b/pypots/imputation/saits.py index 4b3a33cc..981aa544 100644 --- a/pypots/imputation/saits.py +++ b/pypots/imputation/saits.py @@ -215,10 +215,6 @@ def __init__( self._print_model_size() def fit(self, train_X, val_X=None): - train_X = self.check_input(self.n_steps, self.n_features, train_X) - if val_X is not None: - val_X = self.check_input(self.n_steps, self.n_features, val_X) - training_set = DatasetForMIT(train_X) training_loader = DataLoader( training_set, batch_size=self.batch_size, shuffle=True diff --git a/pypots/imputation/transformer.py b/pypots/imputation/transformer.py index 9ea288ed..a9c8e221 100644 --- a/pypots/imputation/transformer.py +++ b/pypots/imputation/transformer.py @@ -305,9 +305,6 @@ def __init__( self._print_model_size() def fit(self, train_X, val_X=None): - train_X = self.check_input(self.n_steps, self.n_features, train_X) - if val_X is not None: - val_X = self.check_input(self.n_steps, self.n_features, val_X) training_set = DatasetForMIT(train_X) training_loader = DataLoader( @@ -402,7 +399,6 @@ def assemble_input_for_testing(self, data) -> dict: return self.assemble_input_for_validating(data) def impute(self, X): - X = self.check_input(self.n_steps, self.n_features, X) self.model.eval() # set the model as eval status to freeze it. test_set = BaseDataset(X) test_loader = DataLoader(test_set, batch_size=self.batch_size, shuffle=False) From c71c8faa928e30926a1912d27f9ab9a346d4aa73 Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Thu, 30 Mar 2023 16:18:55 +0800 Subject: [PATCH 07/22] fix: correct mistaken operator from & to ^; --- pypots/data/base.py | 17 +++++++++++------ pypots/data/dataset_for_brits.py | 4 ++-- pypots/data/dataset_for_grud.py | 6 +++--- 3 files changed, 16 insertions(+), 11 deletions(-) diff --git a/pypots/data/base.py b/pypots/data/base.py index 4339b761..382915a4 100644 --- a/pypots/data/base.py +++ b/pypots/data/base.py @@ -38,10 +38,14 @@ def __init__(self, X=None, y=None, file_path=None, file_type="h5py"): # types and shapes had been checked after X and y input into the model # So they are safe to use here. No need to check again. 
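The hunk that follows replaces the original pair of contradictory assertions with a single exclusive-or check. A quick illustration of why ^ expresses "exactly one of X and file_path must be provided" (purely explanatory, not part of the patch):

# (X is None) ^ (file_path is None) is True only when exactly one of the two is None:
#   X given,  file_path None   ->  False ^ True  == True   (valid: use the in-memory array)
#   X None,   file_path given  ->  True  ^ False == True   (valid: lazy-load from the file)
#   both None                  ->  True  ^ True  == False  (rejected by the assertion)
#   both given                 ->  False ^ False == False  (rejected by the assertion)
assert (X is None) ^ (file_path is None), "Exactly one of X and file_path must be given."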
- assert X is None and file_path is None, f"X and file_path cannot both be None." - assert ( - X is not None and file_path is not None + assert (X is None) ^ ( + file_path is None + ), f"X and file_path cannot both be None." + + assert (X is not None) ^ ( + file_path is not None ), f"X and file_path cannot both be given. Either of them should be given." + assert ( file_type in SUPPORTED_DATASET_FILE_TYPE ), f"file_type should be one of {SUPPORTED_DATASET_FILE_TYPE}, but got {file_type}" @@ -90,7 +94,8 @@ def _get_sample_num(self): def __len__(self): return self.sample_num - def check_input(self, X, y=None, out_dtype="tensor"): + @staticmethod + def check_input(X, y=None, out_dtype="tensor"): """Check value type and shape of input X and y Parameters @@ -126,9 +131,9 @@ def check_input(self, X, y=None, out_dtype="tensor"): # convert the data type if in need if out_dtype == "tensor": if is_list: - X = torch.tensor(X).to() + X = torch.tensor(X) elif is_array: - X = torch.from_numpy(X).to() + X = torch.from_numpy(X) else: # is tensor pass else: # out_dtype is ndarray diff --git a/pypots/data/dataset_for_brits.py b/pypots/data/dataset_for_brits.py index 38e1feeb..139e7f4c 100644 --- a/pypots/data/dataset_for_brits.py +++ b/pypots/data/dataset_for_brits.py @@ -63,8 +63,8 @@ def __init__(self, X=None, y=None, file_path=None, file_type="h5py"): if self.X is not None: # calculate all delta here. # Training will take too much time if we put delta calculation in __getitem__(). - forward_missing_mask = (~torch.isnan(X)).type(torch.float32) - forward_X = torch.nan_to_num(X) + forward_missing_mask = (~torch.isnan(self.X)).type(torch.float32) + forward_X = torch.nan_to_num(self.X) forward_delta = parse_delta(forward_missing_mask) backward_X = torch.flip(forward_X, dims=[1]) backward_missing_mask = torch.flip(forward_missing_mask, dims=[1]) diff --git a/pypots/data/dataset_for_grud.py b/pypots/data/dataset_for_grud.py index dccaa102..f7dd9df5 100644 --- a/pypots/data/dataset_for_grud.py +++ b/pypots/data/dataset_for_grud.py @@ -31,9 +31,9 @@ def __init__(self, X=None, y=None, file_path=None, file_type="h5py"): self.locf = LOCF() if self.X is not None: - self.missing_mask = (~torch.isnan(X)).to(torch.float32) - self.X_filledLOCF = self.locf.locf_torch(X) - self.X = torch.nan_to_num(X) + self.missing_mask = (~torch.isnan(self.X)).to(torch.float32) + self.X_filledLOCF = self.locf.locf_torch(self.X) + self.X = torch.nan_to_num(self.X) self.deltas = parse_delta(self.missing_mask) self.empirical_mean = torch.sum( self.missing_mask * self.X, dim=[0, 1] From af4586a60d27f913c69527dada8bb1c0133b5f44 Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Thu, 30 Mar 2023 16:51:24 +0800 Subject: [PATCH 08/22] fix: turn imputation to numpy.ndarray in the validation stage; --- pypots/imputation/base.py | 2 +- pypots/imputation/saits.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/pypots/imputation/base.py b/pypots/imputation/base.py index a16769fe..743ce958 100644 --- a/pypots/imputation/base.py +++ b/pypots/imputation/base.py @@ -168,7 +168,7 @@ def _train_model( imputation_collector.append(imputed_data) imputation_collector = torch.cat(imputation_collector) - imputation_collector = imputation_collector + imputation_collector = imputation_collector.numpy() mean_val_loss = cal_mae( imputation_collector, val_X_intact, val_indicating_mask diff --git a/pypots/imputation/saits.py b/pypots/imputation/saits.py index 981aa544..9832865c 100644 --- a/pypots/imputation/saits.py +++ b/pypots/imputation/saits.py @@ 
-306,7 +306,6 @@ def assemble_input_for_testing(self, data) -> dict: return self.assemble_input_for_validating(data) def impute(self, X): - X = self.check_input(self.n_steps, self.n_features, X) self.model.eval() # set the model as eval status to freeze it. test_set = BaseDataset(X) test_loader = DataLoader(test_set, batch_size=self.batch_size, shuffle=False) From fababb1f3586285907c836a1fdd123a832030c56 Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Fri, 31 Mar 2023 01:23:36 +0800 Subject: [PATCH 09/22] feat: update the data given and input logic to support loading dataset from files; --- pypots/classification/base.py | 48 ++++++++++----- pypots/classification/brits.py | 60 ++++++++++++++----- pypots/classification/grud.py | 56 ++++++++++++++---- pypots/classification/raindrop.py | 52 ++++++++++++---- pypots/clustering/base.py | 26 +++++--- pypots/clustering/crli.py | 44 ++++++++++++-- pypots/clustering/vader.py | 44 ++++++++++++-- pypots/data/base.py | 92 +++++++++++++---------------- pypots/data/dataset_for_brits.py | 33 ++++++----- pypots/data/dataset_for_grud.py | 30 ++++++---- pypots/data/dataset_for_mit.py | 35 ++++++----- pypots/forecasting/base.py | 29 +++++++-- pypots/forecasting/bttf.py | 25 +++++++- pypots/imputation/base.py | 40 +++++++++---- pypots/imputation/brits.py | 67 ++++++++++++++++----- pypots/imputation/locf.py | 50 +++++++++++++--- pypots/imputation/saits.py | 69 +++++++++++++++++++--- pypots/imputation/transformer.py | 66 ++++++++++++++++++--- pypots/tests/test_classification.py | 40 ++++--------- pypots/tests/test_clustering.py | 24 ++++---- pypots/tests/test_forecasting.py | 10 ++-- pypots/tests/test_imputation.py | 52 +++++++--------- 22 files changed, 701 insertions(+), 291 deletions(-) diff --git a/pypots/classification/base.py b/pypots/classification/base.py index 598902aa..27dcac5a 100644 --- a/pypots/classification/base.py +++ b/pypots/classification/base.py @@ -22,19 +22,31 @@ def __init__(self, device): super().__init__(device) @abstractmethod - def fit(self, train_X, train_y, val_X=None, val_y=None): - """Train the classifier. + def fit(self, train_set, val_set=None, file_type="h5py"): + """Train the classifier on the given data. Parameters ---------- - train_X : array-like of shape [n_samples, sequence length (time steps), n_features], - Time-series data for training, can contain missing values. - train_y : array, - Classification labels for training. - val_X : array-like of shape [n_samples, sequence length (time steps), n_features], - Time-series data for validation, can contain missing values. - val_y : array, - Classification labels for validation. + train_set : dict or str, + The dataset for model training, should be a dictionary including keys as 'X' and 'y', + or a path string locating a data file. + If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features], + which is time-series data for training, can contain missing values, and y should be array-like of shape + [n_samples], which is classification labels of X. + If it is a path string, the path should point to a data file, e.g. a h5 file, which contains + key-value pairs like a dict, and it has to include keys as 'X' and 'y'. + + val_set : dict or str, + The dataset for model validating, should be a dictionary including keys as 'X' and 'y', + or a path string locating a data file. 
+ If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features], + which is time-series data for validating, can contain missing values, and y should be array-like of shape + [n_samples], which is classification labels of X. + If it is a path string, the path should point to a data file, e.g. a h5 file, which contains + key-value pairs like a dict, and it has to include keys as 'X' and 'y'. + + file_type : str, default = "h5py", + The type of the given file if train_set and val_set are path strings. Returns ------- @@ -44,18 +56,22 @@ def fit(self, train_X, train_y, val_X=None, val_y=None): return self @abstractmethod - def classify(self, X): - """Classify the input with the trained model. + def classify(self, X, file_type="h5py"): + """Classify the input data with the trained model. Parameters ---------- - X : array-like of shape [n_samples, sequence length (time steps), n_features], - Time-series data contains missing values. + X : array-like or str, + The data samples for testing, should be array-like of shape [n_samples, sequence length (time steps), + n_features], or a path string locating a data file, e.g. h5 file. + + file_type : str, default = "h5py", + The type of the given file if X is a path string. Returns ------- - array-like, shape [n_samples, sequence length (time steps), n_features], - Classification results. + array-like, shape [n_samples], + Classification results of the given samples. """ pass diff --git a/pypots/classification/brits.py b/pypots/classification/brits.py index 961979d5..6cd9a959 100644 --- a/pypots/classification/brits.py +++ b/pypots/classification/brits.py @@ -123,8 +123,6 @@ class BRITS(BaseNNClassifier): The underlying BRITS model. optimizer : object, The optimizer for model training. - data_loader : object, - The data loader for dataset loading. Parameters ---------- @@ -181,33 +179,47 @@ def __init__( self.model = self.model.to(self.device) self._print_model_size() - def fit(self, train_X, train_y, val_X=None, val_y=None): - """Fit the model on the given training data. + def fit(self, train_set, val_set=None, file_type="h5py"): + """Train the classifier on the given data. Parameters ---------- - train_X : array, shape [n_samples, sequence length (time steps), n_features], - Time-series vectors. - train_y : array, - Classification labels. + train_set : dict or str, + The dataset for model training, should be a dictionary including keys as 'X' and 'y', + or a path string locating a data file. + If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features], + which is time-series data for training, can contain missing values, and y should be array-like of shape + [n_samples], which is classification labels of X. + If it is a path string, the path should point to a data file, e.g. a h5 file, which contains + key-value pairs like a dict, and it has to include keys as 'X' and 'y'. + + val_set : dict or str, + The dataset for model validating, should be a dictionary including keys as 'X' and 'y', + or a path string locating a data file. + If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features], + which is time-series data for validating, can contain missing values, and y should be array-like of shape + [n_samples], which is classification labels of X. + If it is a path string, the path should point to a data file, e.g. a h5 file, which contains + key-value pairs like a dict, and it has to include keys as 'X' and 'y'. 
+ + file_type : str, default = "h5py" + The type of the given file if train_set and val_set are path strings. Returns ------- self : object, - Trained model. + Trained classifier. """ - training_set = DatasetForBRITS( - train_X, train_y - ) # time_gaps is necessary for BRITS + training_set = DatasetForBRITS(train_set) training_loader = DataLoader( training_set, batch_size=self.batch_size, shuffle=True ) - if val_X is None: + if val_set is None: self._train_model(training_loader) else: - val_set = DatasetForBRITS(val_X, val_y) + val_set = DatasetForBRITS(val_set) val_loader = DataLoader(val_set, batch_size=self.batch_size, shuffle=False) self._train_model(training_loader, val_loader) @@ -321,9 +333,25 @@ def assemble_input_for_testing(self, data) -> dict: } return inputs - def classify(self, X): + def classify(self, X, file_type="h5py"): + """Classify the input data with the trained model. + + Parameters + ---------- + X : array-like or str, + The data samples for testing, should be array-like of shape [n_samples, sequence length (time steps), + n_features], or a path string locating a data file, e.g. h5 file. + + file_type : str, default = "h5py", + The type of the given file if X is a path string. + + Returns + ------- + array-like, shape [n_samples], + Classification results of the given samples. + """ self.model.eval() # set the model as eval status to freeze it. - test_set = DatasetForBRITS(X) + test_set = DatasetForBRITS(X, file_type) test_loader = DataLoader(test_set, batch_size=self.batch_size, shuffle=False) prediction_collector = [] diff --git a/pypots/classification/grud.py b/pypots/classification/grud.py index 4435afeb..fb13df4f 100644 --- a/pypots/classification/grud.py +++ b/pypots/classification/grud.py @@ -145,31 +145,47 @@ def __init__( self.model = self.model.to(self.device) self._print_model_size() - def fit(self, train_X, train_y, val_X=None, val_y=None): - """Fit the model on the given training data. + def fit(self, train_set, val_set=None, file_type="h5py"): + """Train the classifier on the given data. Parameters ---------- - train_X : array, shape [n_samples, sequence length (time steps), n_features], - Time-series vectors. - train_y : array, - Classification labels. + train_set : dict or str, + The dataset for model training, should be a dictionary including keys as 'X' and 'y', + or a path string locating a data file. + If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features], + which is time-series data for training, can contain missing values, and y should be array-like of shape + [n_samples], which is classification labels of X. + If it is a path string, the path should point to a data file, e.g. a h5 file, which contains + key-value pairs like a dict, and it has to include keys as 'X' and 'y'. + + val_set : dict or str, + The dataset for model validating, should be a dictionary including keys as 'X' and 'y', + or a path string locating a data file. + If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features], + which is time-series data for validating, can contain missing values, and y should be array-like of shape + [n_samples], which is classification labels of X. + If it is a path string, the path should point to a data file, e.g. a h5 file, which contains + key-value pairs like a dict, and it has to include keys as 'X' and 'y'. + + file_type : str, default = "h5py" + The type of the given file if train_set and val_set are path strings. 
Returns ------- self : object, - Trained model. + Trained classifier. """ - training_set = DatasetForGRUD(train_X, train_y) + training_set = DatasetForGRUD(train_set, file_type) training_loader = DataLoader( training_set, batch_size=self.batch_size, shuffle=True ) - if val_X is None: + if val_set is None: self._train_model(training_loader) else: - val_set = DatasetForGRUD(val_X, val_y) + val_set = DatasetForGRUD(val_set) val_loader = DataLoader(val_set, batch_size=self.batch_size, shuffle=False) self._train_model(training_loader, val_loader) @@ -255,9 +271,25 @@ def assemble_input_for_testing(self, data) -> dict: return inputs - def classify(self, X): + def classify(self, X, file_type="h5py"): + """Classify the input data with the trained model. + + Parameters + ---------- + X : array-like or str, + The data samples for testing, should be array-like of shape [n_samples, sequence length (time steps), + n_features], or a path string locating a data file, e.g. h5 file. + + file_type : str, default = "h5py", + The type of the given file if X is a path string. + + Returns + ------- + array-like, shape [n_samples], + Classification results of the given samples. + """ self.model.eval() # set the model as eval status to freeze it. - test_set = DatasetForGRUD(X) + test_set = DatasetForGRUD(X, file_type) test_loader = DataLoader(test_set, batch_size=self.batch_size, shuffle=False) prediction_collector = [] diff --git a/pypots/classification/raindrop.py b/pypots/classification/raindrop.py index fd5f8885..31220608 100644 --- a/pypots/classification/raindrop.py +++ b/pypots/classification/raindrop.py @@ -666,15 +666,31 @@ def __init__( self.model = self.model.to(self.device) self._print_model_size() - def fit(self, train_X, train_y, val_X=None, val_y=None): + def fit(self, train_set, val_set=None, file_type="h5py"): """Fit the model on the given training data. Parameters ---------- - train_X : array, shape [n_samples, sequence length (time steps), n_features], - Time-series vectors. - train_y : array, - Classification labels. + train_set : dict or str, + The dataset for model training, should be a dictionary including keys as 'X' and 'y', + or a path string locating a data file. + If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features], + which is time-series data for training, can contain missing values, and y should be array-like of shape + [n_samples], which is classification labels of X. + If it is a path string, the path should point to a data file, e.g. a h5 file, which contains + key-value pairs like a dict, and it has to include keys as 'X' and 'y'. + + val_set : dict or str, + The dataset for model validating, should be a dictionary including keys as 'X' and 'y', + or a path string locating a data file. + If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features], + which is time-series data for validating, can contain missing values, and y should be array-like of shape + [n_samples], which is classification labels of X. + If it is a path string, the path should point to a data file, e.g. a h5 file, which contains + key-value pairs like a dict, and it has to include keys as 'X' and 'y'. + + file_type : str, default = "h5py" + The type of the given file if train_set and val_set are path strings. Returns ------- @@ -682,15 +698,15 @@ def fit(self, train_X, train_y, val_X=None, val_y=None): Trained model. 
""" - training_set = DatasetForGRUD(train_X, train_y) + training_set = DatasetForGRUD(train_set) training_loader = DataLoader( training_set, batch_size=self.batch_size, shuffle=True ) - if val_X is None: + if val_set is None: self._train_model(training_loader) else: - val_set = DatasetForGRUD(val_X, val_y) + val_set = DatasetForGRUD(val_set) val_loader = DataLoader(val_set, batch_size=self.batch_size, shuffle=False) self._train_model(training_loader, val_loader) @@ -784,9 +800,25 @@ def assemble_input_for_testing(self, data) -> dict: return inputs - def classify(self, X): + def classify(self, X, file_type="h5py"): + """Classify the input data with the trained model. + + Parameters + ---------- + X : array-like or str, + The data samples for testing, should be array-like of shape [n_samples, sequence length (time steps), + n_features], or a path string locating a data file, e.g. h5 file. + + file_type : str, default = "h5py", + The type of the given file if X is a path string. + + Returns + ------- + array-like, shape [n_samples], + Classification results of the given samples. + """ self.model.eval() # set the model as eval status to freeze it. - test_set = DatasetForGRUD(X) + test_set = DatasetForGRUD(X, file_type) test_loader = DataLoader(test_set, batch_size=self.batch_size, shuffle=False) prediction_collector = [] diff --git a/pypots/clustering/base.py b/pypots/clustering/base.py index f3cc8c2e..8b66eb35 100644 --- a/pypots/clustering/base.py +++ b/pypots/clustering/base.py @@ -22,13 +22,21 @@ def __init__(self, device): super().__init__(device) @abstractmethod - def fit(self, train_X): + def fit(self, train_set, file_type="h5py"): """Train the cluster. Parameters ---------- - train_X : array-like of shape [n_samples, sequence length (time steps), n_features], - Time-series data for training, can contain missing values. + train_set : dict or str, + The dataset for model training, should be a dictionary including the key 'X', + or a path string locating a data file. + If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features], + which is time-series data for training, can contain missing values. + If it is a path string, the path should point to a data file, e.g. a h5 file, which contains + key-value pairs like a dict, and it has to include the key 'X'. + + file_type : str, default = "h5py" + The type of the given file if train_set is a path string. Returns ------- @@ -38,17 +46,21 @@ def fit(self, train_X): return self @abstractmethod - def cluster(self, X): + def cluster(self, X, file_type="h5py"): """Cluster the input with the trained model. Parameters ---------- - X : array-like of shape [n_samples, sequence length (time steps), n_features], - Time-series data contains missing values. + X : array-like or str, + The data samples for testing, should be array-like of shape [n_samples, sequence length (time steps), + n_features], or a path string locating a data file, e.g. h5 file. + + file_type : str, default = "h5py" + The type of the given file if X is a path string. Returns ------- - array-like, shape [n_samples, sequence length (time steps), n_features], + array-like, shape [n_samples], Clustering results. 
""" pass diff --git a/pypots/clustering/crli.py b/pypots/clustering/crli.py index 2cbd1c33..b062fc33 100644 --- a/pypots/clustering/crli.py +++ b/pypots/clustering/crli.py @@ -352,8 +352,28 @@ def __init__( self._print_model_size() self.logger = {"training_loss_generator": [], "training_loss_discriminator": []} - def fit(self, train_X): - training_set = DatasetForGRUD(train_X) + def fit(self, train_set, file_type="h5py"): + """Train the cluster. + + Parameters + ---------- + train_set : dict or str, + The dataset for model training, should be a dictionary including the key 'X', + or a path string locating a data file. + If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features], + which is time-series data for training, can contain missing values. + If it is a path string, the path should point to a data file, e.g. a h5 file, which contains + key-value pairs like a dict, and it has to include the key 'X'. + + file_type : str, default = "h5py" + The type of the given file if train_set is a path string. + + Returns + ------- + self : object, + Trained classifier. + """ + training_set = DatasetForGRUD(train_set, file_type) training_loader = DataLoader( training_set, batch_size=self.batch_size, shuffle=True ) @@ -514,9 +534,25 @@ def _train_model(self, training_loader, val_loader=None): logger.info("Finished training.") - def cluster(self, X): + def cluster(self, X, file_type="h5py"): + """Cluster the input with the trained model. + + Parameters + ---------- + X : array-like or str, + The data samples for testing, should be array-like of shape [n_samples, sequence length (time steps), + n_features], or a path string locating a data file, e.g. h5 file. + + file_type : str, default = "h5py" + The type of the given file if X is a path string. + + Returns + ------- + array-like, shape [n_samples], + Clustering results. + """ self.model.eval() # set the model as eval status to freeze it. - test_set = DatasetForGRUD(X) + test_set = DatasetForGRUD(X, file_type) test_loader = DataLoader(test_set, batch_size=self.batch_size, shuffle=False) latent_collector = [] diff --git a/pypots/clustering/vader.py b/pypots/clustering/vader.py index 141e772e..128743a4 100644 --- a/pypots/clustering/vader.py +++ b/pypots/clustering/vader.py @@ -378,8 +378,28 @@ def __init__( self.model = self.model.to(self.device) self._print_model_size() - def fit(self, train_X): - training_set = DatasetForGRUD(train_X) + def fit(self, train_set, file_type="h5py"): + """Train the cluster. + + Parameters + ---------- + train_set : dict or str, + The dataset for model training, should be a dictionary including the key 'X', + or a path string locating a data file. + If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features], + which is time-series data for training, can contain missing values. + If it is a path string, the path should point to a data file, e.g. a h5 file, which contains + key-value pairs like a dict, and it has to include the key 'X'. + + file_type : str, default = "h5py" + The type of the given file if train_set is a path string. + + Returns + ------- + self : object, + Trained classifier. 
+ """ + training_set = DatasetForGRUD(train_set, file_type) training_loader = DataLoader( training_set, batch_size=self.batch_size, shuffle=True ) @@ -556,9 +576,25 @@ def _train_model(self, training_loader, val_loader=None): logger.info("Finished training.") - def cluster(self, X): + def cluster(self, X, file_type="h5py"): + """Cluster the input with the trained model. + + Parameters + ---------- + X : array-like or str, + The data samples for testing, should be array-like of shape [n_samples, sequence length (time steps), + n_features], or a path string locating a data file, e.g. h5 file. + + file_type : str, default = "h5py" + The type of the given file if X is a path string. + + Returns + ------- + array-like, shape [n_samples], + Clustering results. + """ self.model.eval() # set the model as eval status to freeze it. - test_set = DatasetForGRUD(X) + test_set = DatasetForGRUD(X, file_type) test_loader = DataLoader(test_set, batch_size=self.batch_size, shuffle=False) clustering_results_collector = [] diff --git a/pypots/data/base.py b/pypots/data/base.py index 382915a4..84707c2d 100644 --- a/pypots/data/base.py +++ b/pypots/data/base.py @@ -18,52 +18,45 @@ class BaseDataset(Dataset): """Base dataset class in PyPOTS. - Parameters - ---------- - X : tensor, shape of [n_samples, n_steps, n_features] - Time-series feature vector. - - y : tensor, shape of [n_samples], optional, default=None, - Classification labels of according time-series samples. - - file_path : str, - The path to the dataset file. - - file_type : str, - The type of the given file, should be one of `numpy`, `h5py`, `pickle`. + data : dict or str, + The dataset for model input, should be a dictionary including keys as 'X' and 'y', + or a path string locating a data file. + If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features], + which is time-series data for input, can contain missing values, and y should be array-like of shape + [n_samples], which is classification labels of X. + If it is a path string, the path should point to a data file, e.g. a h5 file, which contains + key-value pairs like a dict, and it has to include keys as 'X' and 'y'. + + file_type : str, default = "h5py" + The type of the given file if train_set and val_set are path strings. """ - def __init__(self, X=None, y=None, file_path=None, file_type="h5py"): + def __init__(self, data, file_type="h5py"): super().__init__() # types and shapes had been checked after X and y input into the model # So they are safe to use here. No need to check again. - assert (X is None) ^ ( - file_path is None - ), f"X and file_path cannot both be None." - - assert (X is not None) ^ ( - file_path is not None - ), f"X and file_path cannot both be given. Either of them should be given." 
- - assert ( - file_type in SUPPORTED_DATASET_FILE_TYPE - ), f"file_type should be one of {SUPPORTED_DATASET_FILE_TYPE}, but got {file_type}" - - if X is not None: - X, y = self.check_input(X, y) + self.data = data + if isinstance(data, str): + self.file_type = file_type - self.X = X - self.y = y - self.file_path = file_path - self.file_type = file_type + # check if the given file type is supported + assert ( + file_type in SUPPORTED_DATASET_FILE_TYPE + ), f"file_type should be one of {SUPPORTED_DATASET_FILE_TYPE}, but got {file_type}" - if self.file_path is not None: - self.file_handler = self._open_file_handle() + # open the file handle + self.file_handle = self._open_file_handle() + # check if X exists in the file assert ( - "X" in self.file_handler.keys() + "X" in self.file_handle.keys() ), "The given dataset file doesn't contains X. Please double check." + else: + X = data["X"] + y = None if "y" not in data.keys() else data["y"] + self.X, self.y = self.check_input(X, y) + self.sample_num = self._get_sample_num() # set up function fetch_data() @@ -80,14 +73,12 @@ def _get_sample_num(self): sample_num : int The number of the samples in the given dataset. """ - if self.X is not None: - sample_num = len(self.X) - elif self.file_path is not None and self.file_type == "h5py": - if self.file_handler is None: - self.file_handler = self._open_file_handle() - sample_num = len(self.file_handler["X"]) + if isinstance(self.data, str): + if self.file_handle is None: + self.file_handle = self._open_file_handle() + sample_num = len(self.file_handle["X"]) else: - raise TypeError(f"So far only h5py is supported.") + sample_num = len(self.X) return sample_num @@ -208,14 +199,15 @@ def _open_file_handle(self): Returns ------- - file_handle : file. + file_handle : file """ + data_file_path = self.data try: import h5py file_handler = h5py.File( - self.file_path, "r" + data_file_path, "r" ) # set swmr=True if the h5 file need to be written into new content during reading except ImportError: raise ImportError( @@ -224,7 +216,7 @@ def _open_file_handle(self): except OSError as e: raise TypeError( f"{e} This probably is caused by file type error. " - f"Please confirm that the given file {self.file_path} is an h5 file." + f"Please confirm that the given file {data_file_path} is an h5 file." ) except Exception as e: raise RuntimeError(e) @@ -257,10 +249,10 @@ def _fetch_data_from_file(self, idx): The collated data sample, a list including all necessary sample info. """ - if self.file_handler is None: - self.file_handler = self._open_file_handle() + if self.file_handle is None: + self.file_handle = self._open_file_handle() - X = self.file_handler["X"][idx] + X = self.file_handle["X"][idx] missing_mask = ~torch.isnan(X) X = torch.nan_to_num(X) sample = [ @@ -270,9 +262,9 @@ def _fetch_data_from_file(self, idx): ] if ( - "y" in self.file_handler.keys() + "y" in self.file_handle.keys() ): # if the dataset has labels, then fetch it from the file - sample.append(self.file_handler["y"][idx].to(torch.long)) + sample.append(self.file_handle["y"][idx].to(torch.long)) return sample diff --git a/pypots/data/dataset_for_brits.py b/pypots/data/dataset_for_brits.py index 139e7f4c..e7a74b13 100644 --- a/pypots/data/dataset_for_brits.py +++ b/pypots/data/dataset_for_brits.py @@ -50,19 +50,24 @@ class DatasetForBRITS(BaseDataset): Parameters ---------- - X : tensor, shape of [n_samples, n_steps, n_features] - Time-series data. 
- - y : tensor, shape of [n_samples], optional, default=None, - Classification labels of according time-series samples. + data : dict or str, + The dataset for model input, should be a dictionary including keys as 'X' and 'y', + or a path string locating a data file. + If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features], + which is time-series data for input, can contain missing values, and y should be array-like of shape + [n_samples], which is classification labels of X. + If it is a path string, the path should point to a data file, e.g. a h5 file, which contains + key-value pairs like a dict, and it has to include keys as 'X' and 'y'. + + file_type : str, default = "h5py" + The type of the given file if train_set and val_set are path strings. """ - def __init__(self, X=None, y=None, file_path=None, file_type="h5py"): - super().__init__(X, y, file_path, file_type) + def __init__(self, data, file_type="h5py"): + super().__init__(data, file_type) - if self.X is not None: + if not isinstance(self.data, str): # calculate all delta here. - # Training will take too much time if we put delta calculation in __getitem__(). forward_missing_mask = (~torch.isnan(self.X)).type(torch.float32) forward_X = torch.nan_to_num(self.X) forward_delta = parse_delta(forward_missing_mask) @@ -143,10 +148,10 @@ def _fetch_data_from_file(self, idx): The collated data sample, a list including all necessary sample info. """ - if self.file_handler is None: - self.file_handler = self._open_file_handle() + if self.file_handle is None: + self.file_handle = self._open_file_handle() - X = self.file_handler["X"][idx] + X = self.file_handle["X"][idx] missing_mask = (~np.isnan(X)).astype("float32") X = np.nan_to_num(X) @@ -175,8 +180,8 @@ def _fetch_data_from_file(self, idx): ] if ( - "y" in self.file_handler.keys() + "y" in self.file_handle.keys() ): # if the dataset has labels, then fetch it from the file - sample.append(self.file_handler["y"][idx].to(torch.long)) + sample.append(self.file_handle["y"][idx].to(torch.long)) return sample diff --git a/pypots/data/dataset_for_grud.py b/pypots/data/dataset_for_grud.py index f7dd9df5..8db41843 100644 --- a/pypots/data/dataset_for_grud.py +++ b/pypots/data/dataset_for_grud.py @@ -18,15 +18,21 @@ class DatasetForGRUD(BaseDataset): Parameters ---------- - X : tensor, shape of [n_samples, seq_len, n_features] - Time-series feature vector. - - y : tensor, shape of [n_samples], optional, default=None, - Classification labels of according time-series samples. + data : dict or str, + The dataset for model input, should be a dictionary including keys as 'X' and 'y', + or a path string locating a data file. + If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features], + which is time-series data for input, can contain missing values, and y should be array-like of shape + [n_samples], which is classification labels of X. + If it is a path string, the path should point to a data file, e.g. a h5 file, which contains + key-value pairs like a dict, and it has to include keys as 'X' and 'y'. + + file_type : str, default = "h5py" + The type of the given file if train_set and val_set are path strings. 
""" - def __init__(self, X=None, y=None, file_path=None, file_type="h5py"): - super().__init__(X, y, file_path, file_type) + def __init__(self, data, file_type="h5py"): + super().__init__(data, file_type) self.locf = LOCF() @@ -99,10 +105,10 @@ def _fetch_data_from_file(self, idx): The collated data sample, a list including all necessary sample info. """ - if self.file_handler is None: - self.file_handler = self._open_file_handle() + if self.file_handle is None: + self.file_handle = self._open_file_handle() - X = torch.from_numpy(self.file_handler["X"][idx]) + X = torch.from_numpy(self.file_handle["X"][idx]) missing_mask = (~torch.isnan(X)).to(torch.float32) X_filledLOCF = self.locf.locf_torch(X) X = torch.nan_to_num(X) @@ -121,8 +127,8 @@ def _fetch_data_from_file(self, idx): ] if ( - "y" in self.file_handler.keys() + "y" in self.file_handle.keys() ): # if the dataset has labels, then fetch it from the file - sample.append(self.file_handler["y"][idx].to(torch.long)) + sample.append(self.file_handle["y"][idx].to(torch.long)) return sample diff --git a/pypots/data/dataset_for_mit.py b/pypots/data/dataset_for_mit.py index 89fa78e9..787e7c9e 100644 --- a/pypots/data/dataset_for_mit.py +++ b/pypots/data/dataset_for_mit.py @@ -18,26 +18,29 @@ class DatasetForMIT(BaseDataset): Parameters ---------- - X : tensor, shape of [n_samples, n_steps, n_features] - Time-series feature vector. - - y : tensor, shape of [n_samples], optional, default=None, - Classification labels of according time-series samples. + data : dict or str, + The dataset for model input, should be a dictionary including keys as 'X' and 'y', + or a path string locating a data file. + If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features], + which is time-series data for input, can contain missing values, and y should be array-like of shape + [n_samples], which is classification labels of X. + If it is a path string, the path should point to a data file, e.g. a h5 file, which contains + key-value pairs like a dict, and it has to include keys as 'X' and 'y'. + + file_type : str, default = "h5py" + The type of the given file if train_set and val_set are path strings. rate : float, in (0,1), Artificially missing rate, rate of the observed values which will be artificially masked as missing. - - Note that, - `rate` = (number of artificially missing values) / np.sum(~np.isnan(self.data)), + Note that, `rate` = (number of artificially missing values) / np.sum(~np.isnan(self.data)), not (number of artificially missing values) / np.product(self.data.shape), considering that the given data may already contain missing values, the latter way may be confusing because if the original missing rate >= `rate`, the function will do nothing, i.e. it won't play the role it has to be. - """ - def __init__(self, X=None, y=None, file_path=None, file_type="h5py", rate=0.2): - super().__init__(X, y, file_path, file_type) + def __init__(self, data, file_type="h5py", rate=0.2): + super().__init__(data, file_type) self.rate = rate def _fetch_data_from_array(self, idx): @@ -99,10 +102,10 @@ def _fetch_data_from_file(self, idx): The collated data sample, a list including all necessary sample info. 
""" - if self.file_handler is None: - self.file_handler = self._open_file_handle() + if self.file_handle is None: + self.file_handle = self._open_file_handle() - X = torch.from_numpy(self.file_handler["X"][idx]) + X = torch.from_numpy(self.file_handle["X"][idx]) X_intact, X, missing_mask, indicating_mask = mcar(X, rate=self.rate) sample = [ @@ -114,8 +117,8 @@ def _fetch_data_from_file(self, idx): ] if ( - "y" in self.file_handler.keys() + "y" in self.file_handle.keys() ): # if the dataset has labels, then fetch it from the file - sample.append(self.file_handler["y"][idx].to(torch.long)) + sample.append(self.file_handle["y"][idx].to(torch.long)) return sample diff --git a/pypots/forecasting/base.py b/pypots/forecasting/base.py index 282b0336..5423657c 100644 --- a/pypots/forecasting/base.py +++ b/pypots/forecasting/base.py @@ -22,13 +22,29 @@ def __init__(self, device): super().__init__(device) @abstractmethod - def fit(self, train_X): - """Train the cluster. + def fit(self, train_set, val_set=None, file_type="h5py"): + """Train the classifier on the given data. Parameters ---------- - train_X : array-like of shape [n_samples, sequence length (time steps), n_features], - Time-series data for training, can contain missing values. + train_set : dict or str, + The dataset for model training, should be a dictionary including the key 'X', + or a path string locating a data file. + If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features], + which is time-series data for training, can contain missing values. + If it is a path string, the path should point to a data file, e.g. a h5 file, which contains + key-value pairs like a dict, and it has to include the key 'X'. + + val_set : dict or str, + The dataset for model validating, should be a dictionary including the key 'X', + or a path string locating a data file. + If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features], + which is time-series data for validation, can contain missing values. + If it is a path string, the path should point to a data file, e.g. a h5 file, which contains + key-value pairs like a dict, and it has to include the key 'X'. + + file_type : str, default = "h5py", + The type of the given file if train_set and val_set are path strings. Returns ------- @@ -38,7 +54,7 @@ def fit(self, train_X): return self @abstractmethod - def forecast(self, X): + def forecast(self, X, file_type="h5py"): """Forecast the future the input with the trained model. Parameters @@ -46,6 +62,9 @@ def forecast(self, X): X : array-like of shape [n_samples, sequence length (time steps), n_features], Time-series data containing missing values. + file_type : str, default = "h5py" + The type of the given file if X is a path string. + Returns ------- array-like, shape [n_samples, prediction_horizon, n_features], diff --git a/pypots/forecasting/bttf.py b/pypots/forecasting/bttf.py index 4f81cb4c..4bcd1cf2 100644 --- a/pypots/forecasting/bttf.py +++ b/pypots/forecasting/bttf.py @@ -458,10 +458,31 @@ def __init__( self.burn_iter = burn_iter self.gibbs_iter = gibbs_iter - def fit(self, train_X): + def fit(self, train_set, val_set=None, file_type="h5py"): warnings.warn("Please run func forecast(X) directly.") - def forecast(self, X): + def forecast(self, X, file_type="h5py"): + """Forecast the future the input with the trained model. 
+ + Parameters + ---------- + X : array-like or str, + The data samples for testing, should be array-like of shape [n_samples, sequence length (time steps), + n_features], or a path string locating a data file, e.g. h5 file. + + file_type : str, default = "h5py" + The type of the given file if X is a path string. + + Returns + ------- + array-like, shape [n_samples, prediction_horizon, n_features], + Forecasting results. + """ + assert not isinstance( + X, str + ), "BTTF so far does not accept file input. It needs a specified Dataset class." + + X = X["X"] X = X.transpose((0, 2, 1)) pred = BTTF_forecast( diff --git a/pypots/imputation/base.py b/pypots/imputation/base.py index 743ce958..dbb70d9c 100644 --- a/pypots/imputation/base.py +++ b/pypots/imputation/base.py @@ -28,31 +28,49 @@ def __init__(self, device): super().__init__(device) @abstractmethod - def fit(self, train_X, val_X=None): - """Train the imputer. + def fit(self, train_set, val_set=None, file_type="h5py"): + """Train the imputer on the given data. Parameters ---------- - train_X : array-like, shape: [n_samples, sequence length (time steps), n_features], - Time-series data for training, can contain missing values. - val_X : array-like, optional, shape [n_samples, sequence length (time steps), n_features], - Time-series data for validating, can contain missing values. + train_set : dict or str, + The dataset for model training, should be a dictionary including the key 'X', + or a path string locating a data file. + If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features], + which is time-series data for training, can contain missing values. + If it is a path string, the path should point to a data file, e.g. a h5 file, which contains + key-value pairs like a dict, and it has to include the key 'X'. + + val_set : dict or str, + The dataset for model validating, should be a dictionary including the key 'X', + or a path string locating a data file. + If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features], + which is time-series data for validating, can contain missing values. + If it is a path string, the path should point to a data file, e.g. a h5 file, which contains + key-value pairs like a dict, and it has to include the key 'X'. + + file_type : str, default = "h5py", + The type of the given file if train_set and val_set are path strings. Returns ------- self : object, - Trained imputer. + The trained imputer. """ return self @abstractmethod - def impute(self, X): - """Impute missing data with the trained model. + def impute(self, X, file_type="h5py"): + """Impute missing values in the given data with the trained model. Parameters ---------- - X : array-like of shape [n_samples, sequence length (time steps), n_features], - Time-series data for imputing contains missing values. + X : array-like or str, + The data samples for testing, should be array-like of shape [n_samples, sequence length (time steps), + n_features], or a path string locating a data file, e.g. h5 file. + + file_type : str, default = "h5py", + The type of the given file if X is a path string. 
Returns ------- diff --git a/pypots/imputation/brits.py b/pypots/imputation/brits.py index b93311bd..19d8450d 100644 --- a/pypots/imputation/brits.py +++ b/pypots/imputation/brits.py @@ -6,6 +6,7 @@ # License: GPL-v3 import math +import numpy as np import torch import torch.nn as nn @@ -495,36 +496,58 @@ def __init__( self.model = self.model.to(self.device) self._print_model_size() - def fit(self, train_X, val_X=None): - """Fit the model on the given training data. + def fit(self, train_set, val_set=None, file_type="h5py"): + """Train the imputer on the given data. Parameters ---------- - train_X : array-like, shape of [n_samples, n_steps, n_features], - Data for training. - - val_X : array-like, optional, shape of [n_samples, n_steps, n_features], - Data for validating. + train_set : dict or str, + The dataset for model training, should be a dictionary including the key 'X', + or a path string locating a data file. + If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features], + which is time-series data for training, can contain missing values. + If it is a path string, the path should point to a data file, e.g. a h5 file, which contains + key-value pairs like a dict, and it has to include the key 'X'. + + val_set : dict or str, + The dataset for model validating, should be a dictionary including the key 'X', + or a path string locating a data file. + If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features], + which is time-series data for validating, can contain missing values. + If it is a path string, the path should point to a data file, e.g. a h5 file, which contains + key-value pairs like a dict, and it has to include the key 'X'. + + file_type : str, default = "h5py", + The type of the given file if train_set and val_set are path strings. Returns ------- self : object, - Trained model. + The trained imputer. """ - training_set = DatasetForBRITS(train_X) # time_gaps is necessary for BRITS + training_set = DatasetForBRITS(train_set, file_type) training_loader = DataLoader( training_set, batch_size=self.batch_size, shuffle=True ) - if val_X is None: + if val_set is None: self._train_model(training_loader) else: + if isinstance(val_set, str): + import h5py + + with h5py.File(val_set, "r") as hf: + val_X = hf["X"] + val_set = {"X": val_X} + val_X_intact, val_X, val_X_missing_mask, val_X_indicating_mask = mcar( - val_X, 0.2 + val_set["X"], 0.2 ) - val_X = masked_fill(val_X, 1 - val_X_missing_mask, torch.nan) - val_set = DatasetForBRITS(val_X) + val_X = masked_fill(val_X, 1 - val_X_missing_mask, np.nan) + val_set["X"] = val_X + val_set = DatasetForBRITS(val_set) val_loader = DataLoader(val_set, batch_size=self.batch_size, shuffle=False) + self._train_model( training_loader, val_loader, val_X_intact, val_X_indicating_mask ) @@ -606,7 +629,23 @@ def assemble_input_for_testing(self, data) -> dict: """ return self.assemble_input_for_training(data) - def impute(self, X): + def impute(self, X, file_type="h5py"): + """Impute missing values in the given data with the trained model. + + Parameters + ---------- + X : array-like or str, + The data samples for testing, should be array-like of shape [n_samples, sequence length (time steps), + n_features], or a path string locating a data file, e.g. h5 file. + + file_type : str, default = "h5py", + The type of the given file if X is a path string. + + Returns + ------- + array-like, shape [n_samples, sequence length (time steps), n_features], + Imputed data. 
+ """ self.model.eval() # set the model as eval status to freeze it. test_set = DatasetForBRITS(X) test_loader = DataLoader(test_set, batch_size=self.batch_size, shuffle=False) diff --git a/pypots/imputation/locf.py b/pypots/imputation/locf.py index 2d391bb9..9bdde882 100644 --- a/pypots/imputation/locf.py +++ b/pypots/imputation/locf.py @@ -26,7 +26,35 @@ def __init__(self, nan=0): super().__init__("cpu") self.nan = nan - def fit(self, train_X, val_X=None): + def fit(self, train_set, val_set=None, file_type="h5py"): + """Train the imputer on the given data. + + Parameters + ---------- + train_set : dict or str, + The dataset for model training, should be a dictionary including the key 'X', + or a path string locating a data file. + If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features], + which is time-series data for training, can contain missing values. + If it is a path string, the path should point to a data file, e.g. a h5 file, which contains + key-value pairs like a dict, and it has to include the key 'X'. + + val_set : dict or str, + The dataset for model validating, should be a dictionary including the key 'X', + or a path string locating a data file. + If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features], + which is time-series data for validating, can contain missing values. + If it is a path string, the path should point to a data file, e.g. a h5 file, which contains + key-value pairs like a dict, and it has to include the key 'X'. + + file_type : str, default = "h5py", + The type of the given file if train_set and val_set are path strings. + + Returns + ------- + self : object, + The trained imputer. + """ warnings.warn( "LOCF (Last Observed Carried Forward) imputation class has no parameter to train. " "Please run func impute(X) directly." @@ -103,19 +131,27 @@ def locf_torch(self, X): return X_imputed - def impute(self, X): - """Impute missing values + def impute(self, X, file_type="h5py"): + """Impute missing values in the given data with the trained model. Parameters ---------- - X : array-like, - Time-series vectors containing missing values (NaN). + X : array-like or str, + The data samples for testing, should be array-like of shape [n_samples, sequence length (time steps), + n_features], or a path string locating a data file, e.g. h5 file. + + file_type : str, default = "h5py", + The type of the given file if X is a path string. Returns ------- - array-like, - Imputed time series. + array-like, shape [n_samples, sequence length (time steps), n_features], + Imputed data. """ + + assert not isinstance(X, str) + X = X["X"] + assert len(X.shape) == 3, ( f"Input X should have 3 dimensions [n_samples, n_steps, n_features], " f"but the actual shape of X: {X.shape}" diff --git a/pypots/imputation/saits.py b/pypots/imputation/saits.py index 9832865c..627e2b6a 100644 --- a/pypots/imputation/saits.py +++ b/pypots/imputation/saits.py @@ -6,6 +6,7 @@ # Created by Wenjie Du # License: GPL-v3 +import numpy as np import torch import torch.nn as nn import torch.nn.functional as F @@ -214,19 +215,55 @@ def __init__( self.model = self.model.to(self.device) self._print_model_size() - def fit(self, train_X, val_X=None): - training_set = DatasetForMIT(train_X) + def fit(self, train_set, val_set=None, file_type="h5py"): + """Train the imputer on the given data. 
+ + Parameters + ---------- + train_set : dict or str, + The dataset for model training, should be a dictionary including the key 'X', + or a path string locating a data file. + If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features], + which is time-series data for training, can contain missing values. + If it is a path string, the path should point to a data file, e.g. a h5 file, which contains + key-value pairs like a dict, and it has to include the key 'X'. + + val_set : dict or str, + The dataset for model validating, should be a dictionary including the key 'X', + or a path string locating a data file. + If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features], + which is time-series data for validating, can contain missing values. + If it is a path string, the path should point to a data file, e.g. a h5 file, which contains + key-value pairs like a dict, and it has to include the key 'X'. + + file_type : str, default = "h5py", + The type of the given file if train_set and val_set are path strings. + + Returns + ------- + self : object, + The trained imputer. + """ + training_set = DatasetForMIT(train_set, file_type) training_loader = DataLoader( training_set, batch_size=self.batch_size, shuffle=True ) - if val_X is None: + if val_set is None: self._train_model(training_loader) else: + if isinstance(val_set, str): + import h5py + + with h5py.File(val_set, "r") as hf: + val_X = hf["X"] + val_set = {"X": val_X} + val_X_intact, val_X, val_X_missing_mask, val_X_indicating_mask = mcar( - val_X, 0.2 + val_set["X"], 0.2 ) - val_X = masked_fill(val_X, 1 - val_X_missing_mask, torch.nan) - val_set = BaseDataset(val_X) + val_X = masked_fill(val_X, 1 - val_X_missing_mask, np.nan) + val_set["X"] = val_X + val_set = BaseDataset(val_set) val_loader = DataLoader(val_set, batch_size=self.batch_size, shuffle=False) self._train_model( training_loader, val_loader, val_X_intact, val_X_indicating_mask @@ -305,9 +342,25 @@ def assemble_input_for_testing(self, data) -> dict: """ return self.assemble_input_for_validating(data) - def impute(self, X): + def impute(self, X, file_type="h5py"): + """Impute missing values in the given data with the trained model. + + Parameters + ---------- + X : array-like or str, + The data samples for testing, should be array-like of shape [n_samples, sequence length (time steps), + n_features], or a path string locating a data file, e.g. h5 file. + + file_type : str, default = "h5py", + The type of the given file if X is a path string. + + Returns + ------- + array-like, shape [n_samples, sequence length (time steps), n_features], + Imputed data. + """ self.model.eval() # set the model as eval status to freeze it. - test_set = BaseDataset(X) + test_set = BaseDataset(X, file_type) test_loader = DataLoader(test_set, batch_size=self.batch_size, shuffle=False) imputation_collector = [] diff --git a/pypots/imputation/transformer.py b/pypots/imputation/transformer.py index a9c8e221..4b89a94c 100644 --- a/pypots/imputation/transformer.py +++ b/pypots/imputation/transformer.py @@ -304,20 +304,56 @@ def __init__( self.model = self.model.to(self.device) self._print_model_size() - def fit(self, train_X, val_X=None): + def fit(self, train_set, val_set=None, file_type="h5py"): + """Train the imputer on the given data. 
- training_set = DatasetForMIT(train_X) + Parameters + ---------- + train_set : dict or str, + The dataset for model training, should be a dictionary including the key 'X', + or a path string locating a data file. + If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features], + which is time-series data for training, can contain missing values. + If it is a path string, the path should point to a data file, e.g. a h5 file, which contains + key-value pairs like a dict, and it has to include the key 'X'. + + val_set : dict or str, + The dataset for model validating, should be a dictionary including the key 'X', + or a path string locating a data file. + If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features], + which is time-series data for validating, can contain missing values. + If it is a path string, the path should point to a data file, e.g. a h5 file, which contains + key-value pairs like a dict, and it has to include the key 'X'. + + file_type : str, default = "h5py", + The type of the given file if train_set and val_set are path strings. + + Returns + ------- + self : object, + The trained imputer. + """ + + training_set = DatasetForMIT(train_set, file_type) training_loader = DataLoader( training_set, batch_size=self.batch_size, shuffle=True ) - if val_X is None: + if val_set is None: self._train_model(training_loader) else: + if isinstance(val_set, str): + import h5py + + with h5py.File(val_set, "r") as hf: + val_X = hf["X"] + val_set = {"X": val_X} + val_X_intact, val_X, val_X_missing_mask, val_X_indicating_mask = mcar( - val_X, 0.2 + val_set["X"], 0.2 ) val_X = masked_fill(val_X, 1 - val_X_missing_mask, np.nan) - val_set = BaseDataset(val_X) + val_set["X"] = val_X + val_set = BaseDataset(val_set) val_loader = DataLoader(val_set, batch_size=self.batch_size, shuffle=False) self._train_model( training_loader, val_loader, val_X_intact, val_X_indicating_mask @@ -398,9 +434,25 @@ def assemble_input_for_testing(self, data) -> dict: """ return self.assemble_input_for_validating(data) - def impute(self, X): + def impute(self, X, file_type="h5py"): + """Impute missing values in the given data with the trained model. + + Parameters + ---------- + X : array-like or str, + The data samples for testing, should be array-like of shape [n_samples, sequence length (time steps), + n_features], or a path string locating a data file, e.g. h5 file. + + file_type : str, default = "h5py", + The type of the given file if X is a path string. + + Returns + ------- + array-like, shape [n_samples, sequence length (time steps), n_features], + Imputed data. + """ self.model.eval() # set the model as eval status to freeze it. 
- test_set = BaseDataset(X) + test_set = BaseDataset(X, file_type) test_loader = DataLoader(test_set, batch_size=self.batch_size, shuffle=False) imputation_collector = [] diff --git a/pypots/tests/test_classification.py b/pypots/tests/test_classification.py index 8148980b..034d65ab 100644 --- a/pypots/tests/test_classification.py +++ b/pypots/tests/test_classification.py @@ -14,15 +14,13 @@ EPOCHS = 5 +TRAIN_SET = {"X": DATA["train_X"], "y": DATA["train_y"]} +VAL_SET = {"X": DATA["val_X"], "y": DATA["val_y"]} +TEST_SET = {"X": DATA["test_X"]} + class TestBRITS(unittest.TestCase): def setUp(self) -> None: - self.train_X = DATA["train_X"] - self.train_y = DATA["train_y"] - self.val_X = DATA["val_X"] - self.val_y = DATA["val_y"] - self.test_X = DATA["test_X"] - self.test_y = DATA["test_y"] logger.info("Running test cases for BRITS...") self.brits = BRITS( DATA["n_steps"], @@ -31,7 +29,7 @@ def setUp(self) -> None: n_classes=DATA["n_classes"], epochs=EPOCHS, ) - self.brits.fit(self.train_X, self.train_y, self.val_X, self.val_y) + self.brits.fit(TRAIN_SET, VAL_SET) def test_parameters(self): assert hasattr(self.brits, "model") and self.brits.model is not None @@ -47,8 +45,8 @@ def test_parameters(self): ) def test_classify(self): - predictions = self.brits.classify(self.test_X) - metrics = cal_binary_classification_metrics(predictions, self.test_y) + predictions = self.brits.classify(TEST_SET) + metrics = cal_binary_classification_metrics(predictions, DATA["test_y"]) logger.info( f'ROC_AUC: {metrics["roc_auc"]}, \n' f'PR_AUC: {metrics["pr_auc"]},\n' @@ -61,12 +59,6 @@ def test_classify(self): class TestGRUD(unittest.TestCase): def setUp(self) -> None: - self.train_X = DATA["train_X"] - self.train_y = DATA["train_y"] - self.val_X = DATA["val_X"] - self.val_y = DATA["val_y"] - self.test_X = DATA["test_X"] - self.test_y = DATA["test_y"] logger.info("Running test cases for GRUD...") self.grud = GRUD( DATA["n_steps"], @@ -75,7 +67,7 @@ def setUp(self) -> None: n_classes=DATA["n_classes"], epochs=EPOCHS, ) - self.grud.fit(self.train_X, self.train_y, self.val_X, self.val_y) + self.grud.fit(TRAIN_SET, VAL_SET) def test_parameters(self): assert hasattr(self.grud, "model") and self.grud.model is not None @@ -91,8 +83,8 @@ def test_parameters(self): ) def test_classify(self): - predictions = self.grud.classify(self.test_X) - metrics = cal_binary_classification_metrics(predictions, self.test_y) + predictions = self.grud.classify(TEST_SET) + metrics = cal_binary_classification_metrics(predictions, DATA["test_y"]) logger.info( f'ROC_AUC: {metrics["roc_auc"]}, \n' f'PR_AUC: {metrics["pr_auc"]},\n' @@ -105,12 +97,6 @@ def test_classify(self): class TestRaindrop(unittest.TestCase): def setUp(self) -> None: - self.train_X = DATA["train_X"] - self.train_y = DATA["train_y"] - self.val_X = DATA["val_X"] - self.val_y = DATA["val_y"] - self.test_X = DATA["test_X"] - self.test_y = DATA["test_y"] logger.info("Running test cases for Raindrop...") self.raindrop = Raindrop( DATA["n_features"], @@ -127,7 +113,7 @@ def setUp(self) -> None: False, epochs=EPOCHS, ) - self.raindrop.fit(self.train_X, self.train_y, self.val_X, self.val_y) + self.raindrop.fit(TRAIN_SET, VAL_SET) def test_parameters(self): assert hasattr(self.raindrop, "model") and self.raindrop.model is not None @@ -145,8 +131,8 @@ def test_parameters(self): ) def test_classify(self): - predictions = self.raindrop.classify(self.test_X) - metrics = cal_binary_classification_metrics(predictions, self.test_y) + predictions = self.raindrop.classify(TEST_SET) + 
metrics = cal_binary_classification_metrics(predictions, DATA["test_y"]) logger.info( f'ROC_AUC: {metrics["roc_auc"]}, \n' f'PR_AUC: {metrics["pr_auc"]},\n' diff --git a/pypots/tests/test_clustering.py b/pypots/tests/test_clustering.py index ce22c64a..ddc36887 100644 --- a/pypots/tests/test_clustering.py +++ b/pypots/tests/test_clustering.py @@ -17,11 +17,13 @@ EPOCHS = 5 +TRAIN_SET = {"X": DATA["train_X"]} +VAL_SET = {"X": DATA["val_X"]} +TEST_SET = {"X": DATA["test_X"]} + class TestCRLI(unittest.TestCase): def setUp(self) -> None: - self.train_X = DATA["train_X"] - self.train_y = DATA["train_y"] logger.info("Running test cases for CRLI...") self.crli = CRLI( n_steps=DATA["n_steps"], @@ -31,7 +33,7 @@ def setUp(self) -> None: rnn_hidden_size=128, epochs=EPOCHS, ) - self.crli.fit(self.train_X) + self.crli.fit(TRAIN_SET) def test_parameters(self): assert hasattr(self.crli, "model") and self.crli.model is not None @@ -48,16 +50,14 @@ def test_parameters(self): ) def test_cluster(self): - clustering = self.crli.cluster(self.train_X) - RI = cal_rand_index(clustering, self.train_y) - CP = cal_cluster_purity(clustering, self.train_y) + clustering = self.crli.cluster(TEST_SET) + RI = cal_rand_index(clustering, DATA["test_y"]) + CP = cal_cluster_purity(clustering, DATA["test_y"]) logger.info(f"RI: {RI}\nCP: {CP}") class TestVaDER(unittest.TestCase): def setUp(self) -> None: - self.train_X = DATA["train_X"] - self.train_y = DATA["train_y"] logger.info("Running test cases for VaDER...") self.vader = VaDER( n_steps=DATA["n_steps"], @@ -68,7 +68,7 @@ def setUp(self) -> None: pretrain_epochs=20, epochs=EPOCHS, ) - self.vader.fit(self.train_X) + self.vader.fit(TRAIN_SET) def test_parameters(self): assert hasattr(self.vader, "model") and self.vader.model is not None @@ -85,9 +85,9 @@ def test_parameters(self): def test_cluster(self): try: - clustering = self.vader.cluster(self.train_X) - RI = cal_rand_index(clustering, self.train_y) - CP = cal_cluster_purity(clustering, self.train_y) + clustering = self.vader.cluster(TEST_SET) + RI = cal_rand_index(clustering, DATA["test_y"]) + CP = cal_cluster_purity(clustering, DATA["test_y"]) logger.info(f"RI: {RI}\nCP: {CP}") except np.linalg.LinAlgError as e: logger.info( diff --git a/pypots/tests/test_forecasting.py b/pypots/tests/test_forecasting.py index 27734a68..409c7f81 100644 --- a/pypots/tests/test_forecasting.py +++ b/pypots/tests/test_forecasting.py @@ -15,14 +15,12 @@ from pypots.utils.metrics import cal_mae EPOCHS = 5 +DATA = gene_random_walk_data(n_steps=120, n_features=10) +TEST_SET = {"X": DATA["test_X"][:, :100]} class TestBTTF(unittest.TestCase): def setUp(self) -> None: - DATA = gene_random_walk_data(n_steps=120, n_features=10) - self.test_X = DATA["test_X"] - self.test_X_intact = DATA["test_X_intact"] - self.test_X_for_input = self.test_X[:, :100] logger.info("Running test cases for BTTF...") self.bttf = BTTF( 100, @@ -36,8 +34,8 @@ def setUp(self) -> None: ) def test_forecasting(self): - predictions = self.bttf.forecast(self.test_X_for_input) - mae = cal_mae(predictions, self.test_X_intact[:, 100:]) + predictions = self.bttf.forecast(TEST_SET) + mae = cal_mae(predictions, DATA["test_X_intact"][:, 100:]) logger.info(f"prediction MAE: {mae}") diff --git a/pypots/tests/test_imputation.py b/pypots/tests/test_imputation.py index 957a4d34..8367eb21 100644 --- a/pypots/tests/test_imputation.py +++ b/pypots/tests/test_imputation.py @@ -22,14 +22,13 @@ EPOCH = 5 +TRAIN_SET = {"X": DATA["train_X"]} +VAL_SET = {"X": DATA["val_X"]} +TEST_SET = {"X": 
DATA["test_X"]} + class TestSAITS(unittest.TestCase): def setUp(self) -> None: - self.train_X = DATA["train_X"] - self.val_X = DATA["val_X"] - self.test_X = DATA["test_X"] - self.test_X_intact = DATA["test_X_intact"] - self.test_X_indicating_mask = DATA["test_X_indicating_mask"] logger.info("Running test cases for SAITS...") self.saits = SAITS( DATA["n_steps"], @@ -43,7 +42,7 @@ def setUp(self) -> None: dropout=0.1, epochs=EPOCH, ) - self.saits.fit(self.train_X, self.val_X) + self.saits.fit(TRAIN_SET, VAL_SET) def test_parameters(self): assert hasattr(self.saits, "model") and self.saits.model is not None @@ -59,21 +58,18 @@ def test_parameters(self): ) def test_impute(self): - imputed_X = self.saits.impute(self.test_X) + imputed_X = self.saits.impute(TEST_SET) assert not np.isnan( imputed_X ).any(), "Output still has missing values after running impute()." - test_MAE = cal_mae(imputed_X, self.test_X_intact, self.test_X_indicating_mask) + test_MAE = cal_mae( + imputed_X, DATA["test_X_intact"], DATA["test_X_indicating_mask"] + ) logger.info(f"SAITS test_MAE: {test_MAE}") class TestTransformer(unittest.TestCase): def setUp(self) -> None: - self.train_X = DATA["train_X"] - self.val_X = DATA["val_X"] - self.test_X = DATA["test_X"] - self.test_X_intact = DATA["test_X_intact"] - self.test_X_indicating_mask = DATA["test_X_indicating_mask"] logger.info("Running test cases for Transformer...") self.transformer = Transformer( DATA["n_steps"], @@ -87,7 +83,7 @@ def setUp(self) -> None: dropout=0.1, epochs=EPOCH, ) - self.transformer.fit(self.train_X, self.val_X) + self.transformer.fit(TRAIN_SET, VAL_SET) def test_parameters(self): assert hasattr(self.transformer, "model") and self.transformer.model is not None @@ -106,24 +102,21 @@ def test_parameters(self): ) def test_impute(self): - imputed_X = self.transformer.impute(self.test_X) + imputed_X = self.transformer.impute(TEST_SET) assert not np.isnan( imputed_X ).any(), "Output still has missing values after running impute()." - test_MAE = cal_mae(imputed_X, self.test_X_intact, self.test_X_indicating_mask) + test_MAE = cal_mae( + imputed_X, DATA["test_X_intact"], DATA["test_X_indicating_mask"] + ) logger.info(f"Transformer test_MAE: {test_MAE}") class TestBRITS(unittest.TestCase): def setUp(self) -> None: - self.train_X = DATA["train_X"] - self.val_X = DATA["val_X"] - self.test_X = DATA["test_X"] - self.test_X_intact = DATA["test_X_intact"] - self.test_X_indicating_mask = DATA["test_X_indicating_mask"] logger.info("Running test cases for BRITS...") self.brits = BRITS(DATA["n_steps"], DATA["n_features"], 256, epochs=EPOCH) - self.brits.fit(self.train_X, self.val_X) + self.brits.fit(TRAIN_SET, VAL_SET) def test_parameters(self): assert hasattr(self.brits, "model") and self.brits.model is not None @@ -139,21 +132,18 @@ def test_parameters(self): ) def test_impute(self): - imputed_X = self.brits.impute(self.test_X) + imputed_X = self.brits.impute(TEST_SET) assert not np.isnan( imputed_X ).any(), "Output still has missing values after running impute()." 
- test_MAE = cal_mae(imputed_X, self.test_X_intact, self.test_X_indicating_mask) + test_MAE = cal_mae( + imputed_X, DATA["test_X_intact"], DATA["test_X_indicating_mask"] + ) logger.info(f"BRITS test_MAE: {test_MAE}") class TestLOCF(unittest.TestCase): def setUp(self) -> None: - self.train_X = DATA["train_X"] - self.val_X = DATA["val_X"] - self.test_X = DATA["test_X"] - self.test_X_intact = DATA["test_X_intact"] - self.test_X_indicating_mask = DATA["test_X_indicating_mask"] logger.info("Running test cases for LOCF...") self.locf = LOCF(nan=0) @@ -161,12 +151,12 @@ def test_parameters(self): assert hasattr(self.locf, "nan") and self.locf.nan is not None def test_impute(self): - test_X_imputed = self.locf.impute(self.test_X) + test_X_imputed = self.locf.impute(TEST_SET) assert not np.isnan( test_X_imputed ).any(), "Output still has missing values after running impute()." test_MAE = cal_mae( - test_X_imputed, self.test_X_intact, self.test_X_indicating_mask + test_X_imputed, DATA["test_X_intact"], DATA["test_X_indicating_mask"] ) logger.info(f"LOCF test_MAE: {test_MAE}") From 7dfbf87c9110f0862b5ab3ab1c4b1fcecd4ad56a Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Fri, 31 Mar 2023 10:34:17 +0800 Subject: [PATCH 10/22] fix: bugs in Dataset classes' functions with lazy-loading strategy; --- pypots/data/base.py | 16 +++--- pypots/data/dataset_for_brits.py | 99 +++++++++++++++++++++++--------- pypots/data/dataset_for_grud.py | 10 ++-- pypots/data/dataset_for_mit.py | 2 +- pypots/imputation/saits.py | 2 +- 5 files changed, 88 insertions(+), 41 deletions(-) diff --git a/pypots/data/base.py b/pypots/data/base.py index 84707c2d..f0303839 100644 --- a/pypots/data/base.py +++ b/pypots/data/base.py @@ -37,7 +37,7 @@ def __init__(self, data, file_type="h5py"): # So they are safe to use here. No need to check again. self.data = data - if isinstance(data, str): + if isinstance(self.data, str): # data from file self.file_type = file_type # check if the given file type is supported @@ -52,7 +52,7 @@ def __init__(self, data, file_type="h5py"): "X" in self.file_handle.keys() ), "The given dataset file doesn't contains X. Please double check." - else: + else: # data from array X = data["X"] y = None if "y" not in data.keys() else data["y"] self.X, self.y = self.check_input(X, y) @@ -60,10 +60,10 @@ def __init__(self, data, file_type="h5py"): self.sample_num = self._get_sample_num() # set up function fetch_data() - if self.X is not None: - self.fetch_data = self._fetch_data_from_array - else: + if isinstance(self.data, str): self.fetch_data = self._fetch_data_from_file + else: + self.fetch_data = self._fetch_data_from_array def _get_sample_num(self): """Determine the number of samples in the dataset and return the number. @@ -207,8 +207,8 @@ def _open_file_handle(self): import h5py file_handler = h5py.File( - data_file_path, "r" - ) # set swmr=True if the h5 file need to be written into new content during reading + data_file_path, "r", swmr=True + ) # set if the h5 file need to be written into new content during reading except ImportError: raise ImportError( "h5py is missing and cannot be imported. Please install it first." 
@@ -252,7 +252,7 @@ def _fetch_data_from_file(self, idx): if self.file_handle is None: self.file_handle = self._open_file_handle() - X = self.file_handle["X"][idx] + X = torch.from_numpy(self.file_handle["X"][idx]) missing_mask = ~torch.isnan(X) X = torch.nan_to_num(X) sample = [ diff --git a/pypots/data/dataset_for_brits.py b/pypots/data/dataset_for_brits.py index e7a74b13..eb360583 100644 --- a/pypots/data/dataset_for_brits.py +++ b/pypots/data/dataset_for_brits.py @@ -16,7 +16,7 @@ def parse_delta(missing_mask): Parameters ---------- - missing_mask : tensor, shape of [n_samples, n_steps, n_features] + missing_mask : tensor, shape of [n_steps, n_features] or [n_samples, n_steps, n_features] Binary masks indicate missing values. Returns @@ -25,23 +25,70 @@ def parse_delta(missing_mask): Delta matrix indicates time gaps of missing values. Its math definition please refer to :cite:`che2018GRUD`. """ - # missing_mask is from X, and X's shape and type had been checked. So no need to double-check here. - n_samples, n_steps, n_features = missing_mask.shape - device = missing_mask.device - delta_collector = [] - for m_mask in missing_mask: - delta = [] + + def cal_delta_for_single_sample(mask): + d = [] # single sample's delta for step in range(n_steps): if step == 0: - delta.append(torch.zeros(1, n_features, device=device)) + d.append(torch.zeros(1, n_features, device=device)) else: - delta.append( - torch.ones(1, n_features, device=device) - + (1 - m_mask[step]) * delta[-1] + d.append( + torch.ones(1, n_features, device=device) + (1 - mask[step]) * d[-1] ) - delta = torch.concat(delta, dim=0) - delta_collector.append(delta.unsqueeze(0)) - delta = torch.concat(delta_collector, dim=0) + d = torch.concat(d, dim=0) + return d + + # missing_mask is from X, and X's shape and type had been checked. So no need to double-check here. + device = missing_mask.device + if len(missing_mask.shape) == 2: + n_steps, n_features = missing_mask.shape + delta = cal_delta_for_single_sample(missing_mask) + else: + n_samples, n_steps, n_features = missing_mask.shape + delta_collector = [] + for m_mask in missing_mask: + delta = cal_delta_for_single_sample(m_mask) + delta_collector.append(delta.unsqueeze(0)) + delta = torch.concat(delta_collector, dim=0) + + return delta + + +def parse_delta_np(missing_mask): + """Generate time-gap (delta) matrix from missing masks. + + Parameters + ---------- + missing_mask : array, shape of [seq_len, n_features] + Binary masks indicate missing values. + + Returns + ------- + delta, array, + Delta matrix indicates time gaps of missing values. + Its math definition please refer to :cite:`che2018MissingData`. 
+ """ + + def cal_delta_for_single_sample(mask): + d = [] + for step in range(seq_len): + if step == 0: + d.append(np.zeros(n_features)) + else: + d.append(np.ones(n_features) + (1 - mask[step]) * d[-1]) + d = np.asarray(d) + return d + + if len(missing_mask.shape) == 2: + seq_len, n_features = missing_mask.shape + delta = cal_delta_for_single_sample(missing_mask) + else: + n_samples, seq_len, n_features = missing_mask.shape + delta_collector = [] + for m_mask in missing_mask: + delta = cal_delta_for_single_sample(m_mask) + delta_collector.append(delta) + delta = np.asarray(delta_collector) return delta @@ -151,9 +198,9 @@ def _fetch_data_from_file(self, idx): if self.file_handle is None: self.file_handle = self._open_file_handle() - X = self.file_handle["X"][idx] - missing_mask = (~np.isnan(X)).astype("float32") - X = np.nan_to_num(X) + X = torch.from_numpy(self.file_handle["X"][idx]) + missing_mask = (~torch.isnan(X)).to(torch.float32) + X = torch.nan_to_num(X) forward = { "X": X, @@ -162,26 +209,26 @@ def _fetch_data_from_file(self, idx): } backward = { - "X": np.flip(forward["X"], axis=0).copy(), - "missing_mask": np.flip(forward["missing_mask"], axis=0).copy(), + "X": torch.flip(forward["X"], dims=[0]), + "missing_mask": torch.flip(forward["missing_mask"], dims=[0]), } backward["deltas"] = parse_delta(backward["missing_mask"]) sample = [ torch.tensor(idx), # for forward - torch.from_numpy(forward["X"].astype("float32")), - torch.from_numpy(forward["missing_mask"].astype("float32")), - torch.from_numpy(forward["deltas"].astype("float32")), + forward["X"], + forward["missing_mask"], + forward["deltas"], # for backward - torch.from_numpy(backward["X"].astype("float32")), - torch.from_numpy(backward["missing_mask"].astype("float32")), - torch.from_numpy(backward["deltas"].astype("float32")), + backward["X"], + backward["missing_mask"], + backward["deltas"], ] if ( "y" in self.file_handle.keys() ): # if the dataset has labels, then fetch it from the file - sample.append(self.file_handle["y"][idx].to(torch.long)) + sample.append(torch.tensor(self.file_handle["y"][idx], dtype=torch.long)) return sample diff --git a/pypots/data/dataset_for_grud.py b/pypots/data/dataset_for_grud.py index 8db41843..77f4f5f1 100644 --- a/pypots/data/dataset_for_grud.py +++ b/pypots/data/dataset_for_grud.py @@ -36,7 +36,7 @@ def __init__(self, data, file_type="h5py"): self.locf = LOCF() - if self.X is not None: + if not isinstance(self.data, str): # data from array self.missing_mask = (~torch.isnan(self.X)).to(torch.float32) self.X_filledLOCF = self.locf.locf_torch(self.X) self.X = torch.nan_to_num(self.X) @@ -110,11 +110,11 @@ def _fetch_data_from_file(self, idx): X = torch.from_numpy(self.file_handle["X"][idx]) missing_mask = (~torch.isnan(X)).to(torch.float32) - X_filledLOCF = self.locf.locf_torch(X) + X_filledLOCF = self.locf.locf_torch(X.unsqueeze(dim=0)).squeeze() X = torch.nan_to_num(X) deltas = parse_delta(missing_mask) - empirical_mean = torch.sum(missing_mask * X, dim=[0, 1]) / torch.sum( - missing_mask, dim=[0, 1] + empirical_mean = torch.sum(missing_mask * X, dim=[0]) / torch.sum( + missing_mask, dim=[0] ) sample = [ @@ -129,6 +129,6 @@ def _fetch_data_from_file(self, idx): if ( "y" in self.file_handle.keys() ): # if the dataset has labels, then fetch it from the file - sample.append(self.file_handle["y"][idx].to(torch.long)) + sample.append(torch.tensor(self.file_handle["y"][idx], dtype=torch.long)) return sample diff --git a/pypots/data/dataset_for_mit.py b/pypots/data/dataset_for_mit.py index 
787e7c9e..7dfc4e4c 100644 --- a/pypots/data/dataset_for_mit.py +++ b/pypots/data/dataset_for_mit.py @@ -119,6 +119,6 @@ def _fetch_data_from_file(self, idx): if ( "y" in self.file_handle.keys() ): # if the dataset has labels, then fetch it from the file - sample.append(self.file_handle["y"][idx].to(torch.long)) + sample.append(torch.tensor(self.file_handle["y"][idx], dtype=torch.long)) return sample diff --git a/pypots/imputation/saits.py b/pypots/imputation/saits.py index 627e2b6a..3870b218 100644 --- a/pypots/imputation/saits.py +++ b/pypots/imputation/saits.py @@ -255,7 +255,7 @@ def fit(self, train_set, val_set=None, file_type="h5py"): import h5py with h5py.File(val_set, "r") as hf: - val_X = hf["X"] + val_X = hf["X"][:] val_set = {"X": val_X} val_X_intact, val_X, val_X_missing_mask, val_X_indicating_mask = mcar( From fdc1459ade7753ce118530e690514d380ea155e9 Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Fri, 31 Mar 2023 10:49:20 +0800 Subject: [PATCH 11/22] fix: update the dependencies; --- environment.yml | 9 ++++----- pypots/tests/environment_test.yml | 7 +++---- requirements.txt | 3 ++- setup.py | 1 + 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/environment.yml b/environment.yml index 396b79b2..c1cb2024 100644 --- a/environment.yml +++ b/environment.yml @@ -9,10 +9,9 @@ dependencies: - conda-forge::scipy - conda-forge::pandas - conda-forge::scikit-learn - - conda-forge::matplotlib - conda-forge::tensorboard - conda-forge::pip - - pytorch::pytorch==1.11.0 - - pip: - - pycorruptor==0.0.4 - - tsdb==0.0.7 + - conda-forge::pycorruptor + - conda-forge::tsdb + - conda-forge::h5py + - pytorch::pytorch==1.11.0 \ No newline at end of file diff --git a/pypots/tests/environment_test.yml b/pypots/tests/environment_test.yml index 44c3a21c..dc4e3316 100644 --- a/pypots/tests/environment_test.yml +++ b/pypots/tests/environment_test.yml @@ -10,14 +10,13 @@ dependencies: - conda-forge::scipy - conda-forge::pandas - conda-forge::scikit-learn - - conda-forge::matplotlib + - conda-forge::h5py - conda-forge::tensorboard - conda-forge::pip - conda-forge::pytest-cov - conda-forge::pytest-xdist - conda-forge::coverage + - conda-forge::pycorruptor + - conda-forge::tsdb - pytorch::pytorch==1.11.0 - pyg::pyg==2.0.4 - - pip: - - pycorruptor==0.0.4 - - tsdb==0.0.7 diff --git a/requirements.txt b/requirements.txt index 59de6847..41a9e125 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,4 +5,5 @@ scipy tensorboard pandas pycorruptor -tsdb \ No newline at end of file +tsdb +h5py diff --git a/setup.py b/setup.py index ba9febff..9cafa889 100644 --- a/setup.py +++ b/setup.py @@ -42,6 +42,7 @@ "pandas", "pycorruptor", "tsdb", + "h5py", ], setup_requires=["setuptools>=38.6.0"], ) From ee5270a5e7f1709d66c20792dda44d6147a4c95f Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Fri, 31 Mar 2023 10:50:14 +0800 Subject: [PATCH 12/22] feat: add testing cases for lazy-loading datasets; --- .../tests/test_data_lazy_loading_from_file.py | 103 ++++++++++++++++++ 1 file changed, 103 insertions(+) create mode 100644 pypots/tests/test_data_lazy_loading_from_file.py diff --git a/pypots/tests/test_data_lazy_loading_from_file.py b/pypots/tests/test_data_lazy_loading_from_file.py new file mode 100644 index 00000000..c9b741f3 --- /dev/null +++ b/pypots/tests/test_data_lazy_loading_from_file.py @@ -0,0 +1,103 @@ +""" +Test cases for data classes with the lazy-loading strategy of reading from files. 
+""" +import os +import sys + +# Created by Wenjie Du +# License: GLP-v3 + +import unittest + +from pypots.tests.unified_data_for_test import DATA +import h5py +from pypots.imputation import SAITS +from pypots.classification import BRITS, GRUD +from pypots.data import DatasetForBRITS, DatasetForMIT, DatasetForGRUD +import numpy as np + +EPOCHS = 1 + +TRAIN_SET = "./train_set.h5" +VAL_SET = "./val_set.h5" +TEST_SET = "./test_set.h5" + +IMPUTATION_TRAIN_SET = "./imputation_train_set.h5" +IMPUTATION_VAL_SET = "./imputation_val_set.h5" + + +def save_data_set_into_h5(data, path): + with h5py.File(path, "w") as hf: + for i in data.keys(): + hf.create_dataset(i, data=data[i].astype(np.float32)) + + +save_data_set_into_h5({"X": DATA["train_X"], "y": DATA["train_y"]}, TRAIN_SET) +save_data_set_into_h5({"X": DATA["val_X"], "y": DATA["val_y"]}, VAL_SET) +save_data_set_into_h5( + { + "X": DATA["test_X"], + "X_intact": DATA["test_X_intact"], + "X_indicating_mask": DATA["test_X_indicating_mask"], + }, + TEST_SET, +) + +save_data_set_into_h5({"X": DATA["train_X"]}, IMPUTATION_TRAIN_SET) +save_data_set_into_h5({"X": DATA["val_X"]}, IMPUTATION_VAL_SET) + + +class TestLazyLoadingClasses(unittest.TestCase): + def setUp(self) -> None: + + assert os.path.exists(TRAIN_SET) + assert os.path.exists(VAL_SET) + assert os.path.exists(TEST_SET) + + assert os.path.exists(IMPUTATION_TRAIN_SET) + assert os.path.exists(IMPUTATION_VAL_SET) + + self.saits = SAITS( + DATA["n_steps"], + DATA["n_features"], + n_layers=2, + d_model=256, + d_inner=128, + n_head=4, + d_k=64, + d_v=64, + dropout=0.1, + epochs=EPOCHS, + ) + + self.brits = BRITS( + DATA["n_steps"], + DATA["n_features"], + 256, + n_classes=DATA["n_classes"], + epochs=EPOCHS, + ) + + self.grud = GRUD( + DATA["n_steps"], + DATA["n_features"], + 256, + n_classes=DATA["n_classes"], + epochs=EPOCHS, + ) + + def test_DatasetForMIT(self): + self.saits.fit(train_set=IMPUTATION_TRAIN_SET, val_set=IMPUTATION_VAL_SET) + _ = self.saits.impute(X=TEST_SET) + + def test_DatasetForBRITS(self): + self.brits.fit(train_set=TRAIN_SET, val_set=VAL_SET) + _ = self.brits.classify(X=TEST_SET) + + def test_DatasetForGRUD(self): + self.grud.fit(train_set=TRAIN_SET, val_set=VAL_SET) + _ = self.grud.classify(X=TEST_SET) + + +if __name__ == "__main__": + unittest.main() From 8a4f68280de00776b3151c615be561ff50c3b0c6 Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Fri, 31 Mar 2023 10:57:49 +0800 Subject: [PATCH 13/22] doc: update README; --- README.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 5f95c52e..615570c3 100644 --- a/README.md +++ b/README.md @@ -66,6 +66,9 @@ Install it with `conda install pypots`, you may need to specify the channel with Install the latest release from PyPI: > pip install pypots +or install from the source code with the latest features not officially released in a version: +> pip install `https://github.com/WenjieDu/PyPOTS/archive/main.zip` +
Below is an example applying SAITS in PyPOTS to impute missing values in the dataset PhysioNet2012: @@ -84,10 +87,11 @@ X = StandardScaler().fit_transform(X.to_numpy()) X = X.reshape(num_samples, 48, -1) X_intact, X, missing_mask, indicating_mask = mcar(X, 0.1) # hold out 10% observed values as ground truth X = masked_fill(X, 1 - missing_mask, np.nan) +dataset = {"X": X} # Model training. This is PyPOTS showtime. 💪 saits = SAITS(n_steps=48, n_features=37, n_layers=2, d_model=256, d_inner=128, n_head=4, d_k=64, d_v=64, dropout=0.1, epochs=10) -saits.fit(X) # train the model. Here I use the whole dataset as the training set, because ground truth is not visible to the model. -imputation = saits.impute(X) # impute the originally-missing values and artificially-missing values +saits.fit(dataset) # train the model. Here I use the whole dataset as the training set, because ground truth is not visible to the model. +imputation = saits.impute(dataset) # impute the originally-missing values and artificially-missing values mae = cal_mae(imputation, X_intact, indicating_mask) # calculate mean absolute error on the ground truth (artificially-missing values) ```
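The patches above also let models train and predict straight from a dataset file rather than an in-memory array, so large datasets do not have to fit in RAM. A minimal sketch of that file-based workflow is shown below, reusing the `X` prepared in the example above; the file name `physionet2012.h5` is only illustrative, and the h5 file must contain at least a dataset named `X` (a `y` dataset is optional and only needed by classification models):

```python
import h5py

# Persist the prepared array into an h5 file once (keys follow the PyPOTS convention:
# "X" is required, "y" is optional and only used by classification models).
with h5py.File("physionet2012.h5", "w") as hf:
    hf.create_dataset("X", data=X.astype("float32"))

# Passing the file path instead of an array or dict activates the lazy-loading strategy:
# samples are read from disk on demand rather than held in RAM.
saits = SAITS(n_steps=48, n_features=37, n_layers=2, d_model=256, d_inner=128,
              n_head=4, d_k=64, d_v=64, dropout=0.1, epochs=10)
saits.fit("physionet2012.h5")                  # train_set may be a dict/array or an h5 file path
imputation = saits.impute("physionet2012.h5")  # the same applies at inference time
```

Because the dataset classes open the h5 file lazily, each batch only reads the requested slice from disk instead of loading the whole dataset into memory.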
From 0fb57d4ba7b5531dccb96f6cadd96b20fa831cee Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Fri, 31 Mar 2023 17:38:50 +0800 Subject: [PATCH 14/22] feat: v0.0.10 is ready; --- pypots/__version__.py | 2 +- pypots/tests/test_data_lazy_loading_from_file.py | 11 +++++------ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/pypots/__version__.py b/pypots/__version__.py index b4069ba5..c6345fc4 100644 --- a/pypots/__version__.py +++ b/pypots/__version__.py @@ -21,4 +21,4 @@ # Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer. # 'X.Y.dev0' is the canonical version of 'X.Y.dev' -version = "0.0.9" +version = "0.0.10" diff --git a/pypots/tests/test_data_lazy_loading_from_file.py b/pypots/tests/test_data_lazy_loading_from_file.py index c9b741f3..6ef6edf8 100644 --- a/pypots/tests/test_data_lazy_loading_from_file.py +++ b/pypots/tests/test_data_lazy_loading_from_file.py @@ -1,21 +1,20 @@ """ Test cases for data classes with the lazy-loading strategy of reading from files. """ -import os -import sys # Created by Wenjie Du # License: GLP-v3 +import os import unittest -from pypots.tests.unified_data_for_test import DATA import h5py -from pypots.imputation import SAITS -from pypots.classification import BRITS, GRUD -from pypots.data import DatasetForBRITS, DatasetForMIT, DatasetForGRUD import numpy as np +from pypots.classification import BRITS, GRUD +from pypots.imputation import SAITS +from pypots.tests.unified_data_for_test import DATA + EPOCHS = 1 TRAIN_SET = "./train_set.h5" From 72eaf20649760a5a21ae96c9cd01d0479b3a75f8 Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Fri, 31 Mar 2023 18:10:13 +0800 Subject: [PATCH 15/22] fix: running testing cases for forecasting models and lazy-loading datasets; --- .github/workflows/testing.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml index 462a555a..fda30e59 100644 --- a/.github/workflows/testing.yml +++ b/.github/workflows/testing.yml @@ -47,6 +47,8 @@ jobs: python -m pytest -rA pypots/tests/test_classification.py -n auto --cov=pypots python -m pytest -rA pypots/tests/test_imputation.py -n auto --cov=pypots --cov-append python -m pytest -rA pypots/tests/test_clustering.py -n auto --cov=pypots --cov-append + python -m pytest -rA pypots/tests/test_forecasting.py -n auto --cov=pypots --cov-append + python -m pytest -rA pypots/tests/test_data_lazy_loading_from_file.py -n auto --cov=pypots --cov-append - name: Generate the LCOV report run: | From fa5f5b6af08aba4a2b38c68fae873ac2ddb53199 Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Fri, 31 Mar 2023 18:53:46 +0800 Subject: [PATCH 16/22] fix: running testing cases for logging; --- .github/workflows/testing.yml | 1 + .../tests/test_data_lazy_loading_from_file.py | 38 ++++--------------- pypots/tests/unified_data_for_test.py | 29 ++++++++++++++ 3 files changed, 37 insertions(+), 31 deletions(-) diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml index fda30e59..6c201f5f 100644 --- a/.github/workflows/testing.yml +++ b/.github/workflows/testing.yml @@ -49,6 +49,7 @@ jobs: python -m pytest -rA pypots/tests/test_clustering.py -n auto --cov=pypots --cov-append python -m pytest -rA pypots/tests/test_forecasting.py -n auto --cov=pypots --cov-append python -m pytest -rA pypots/tests/test_data_lazy_loading_from_file.py -n auto --cov=pypots --cov-append + python -m pytest -rA pypots/tests/test_logging.py -n auto --cov=pypots --cov-append - name: Generate the LCOV report run: | diff --git 
a/pypots/tests/test_data_lazy_loading_from_file.py b/pypots/tests/test_data_lazy_loading_from_file.py index 6ef6edf8..6ec23a86 100644 --- a/pypots/tests/test_data_lazy_loading_from_file.py +++ b/pypots/tests/test_data_lazy_loading_from_file.py @@ -8,42 +8,18 @@ import os import unittest -import h5py -import numpy as np - from pypots.classification import BRITS, GRUD from pypots.imputation import SAITS -from pypots.tests.unified_data_for_test import DATA - -EPOCHS = 1 - -TRAIN_SET = "./train_set.h5" -VAL_SET = "./val_set.h5" -TEST_SET = "./test_set.h5" - -IMPUTATION_TRAIN_SET = "./imputation_train_set.h5" -IMPUTATION_VAL_SET = "./imputation_val_set.h5" - - -def save_data_set_into_h5(data, path): - with h5py.File(path, "w") as hf: - for i in data.keys(): - hf.create_dataset(i, data=data[i].astype(np.float32)) - - -save_data_set_into_h5({"X": DATA["train_X"], "y": DATA["train_y"]}, TRAIN_SET) -save_data_set_into_h5({"X": DATA["val_X"], "y": DATA["val_y"]}, VAL_SET) -save_data_set_into_h5( - { - "X": DATA["test_X"], - "X_intact": DATA["test_X_intact"], - "X_indicating_mask": DATA["test_X_indicating_mask"], - }, +from pypots.tests.unified_data_for_test import ( + DATA, + TRAIN_SET, + VAL_SET, TEST_SET, + IMPUTATION_TRAIN_SET, + IMPUTATION_VAL_SET, ) -save_data_set_into_h5({"X": DATA["train_X"]}, IMPUTATION_TRAIN_SET) -save_data_set_into_h5({"X": DATA["val_X"]}, IMPUTATION_VAL_SET) +EPOCHS = 1 class TestLazyLoadingClasses(unittest.TestCase): diff --git a/pypots/tests/unified_data_for_test.py b/pypots/tests/unified_data_for_test.py index ffb0f395..93e59990 100644 --- a/pypots/tests/unified_data_for_test.py +++ b/pypots/tests/unified_data_for_test.py @@ -5,6 +5,8 @@ # Created by Wenjie Du # License: GLP-v3 +import h5py +import numpy as np import torch from sklearn.model_selection import train_test_split from sklearn.preprocessing import StandardScaler @@ -13,6 +15,12 @@ from pypots.data import load_specific_dataset +def save_data_set_into_h5(data, path): + with h5py.File(path, "w") as hf: + for i in data.keys(): + hf.create_dataset(i, data=data[i].astype(np.float32)) + + def gene_random_walk_data( n_steps=24, n_features=10, n_classes=2, n_samples_each_class=1000 ): @@ -128,3 +136,24 @@ def gene_physionet2012(): # generate and cache data first. # Otherwise, file lock will cause bug if running test parallely with pytest-xdist. 
DATA = gene_random_walk_data() + +TRAIN_SET = "./train_set.h5" +VAL_SET = "./val_set.h5" +TEST_SET = "./test_set.h5" + +IMPUTATION_TRAIN_SET = "./imputation_train_set.h5" +IMPUTATION_VAL_SET = "./imputation_val_set.h5" + +save_data_set_into_h5({"X": DATA["train_X"], "y": DATA["train_y"]}, TRAIN_SET) +save_data_set_into_h5({"X": DATA["val_X"], "y": DATA["val_y"]}, VAL_SET) +save_data_set_into_h5( + { + "X": DATA["test_X"], + "X_intact": DATA["test_X_intact"], + "X_indicating_mask": DATA["test_X_indicating_mask"], + }, + TEST_SET, +) + +save_data_set_into_h5({"X": DATA["train_X"]}, IMPUTATION_TRAIN_SET) +save_data_set_into_h5({"X": DATA["val_X"]}, IMPUTATION_VAL_SET) From e9aea749483a9d363f5ee93ea6e3d705ee1fe3d4 Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Fri, 31 Mar 2023 21:30:31 +0800 Subject: [PATCH 17/22] fix: try to fix the BlockingIOError, see below message for details; BlockingIOError: [Errno 35] Unable to create file (unable to lock file, errno = 35, error message = 'Resource temporarily unavailable') This may be caused by the program creates h5 files for multiple times; --- .../tests/test_data_lazy_loading_from_file.py | 39 +++++++++++++++---- pypots/tests/unified_data_for_test.py | 29 -------------- 2 files changed, 31 insertions(+), 37 deletions(-) diff --git a/pypots/tests/test_data_lazy_loading_from_file.py b/pypots/tests/test_data_lazy_loading_from_file.py index 6ec23a86..bfc1464a 100644 --- a/pypots/tests/test_data_lazy_loading_from_file.py +++ b/pypots/tests/test_data_lazy_loading_from_file.py @@ -8,22 +8,45 @@ import os import unittest +import h5py +import numpy as np + from pypots.classification import BRITS, GRUD from pypots.imputation import SAITS -from pypots.tests.unified_data_for_test import ( - DATA, - TRAIN_SET, - VAL_SET, - TEST_SET, - IMPUTATION_TRAIN_SET, - IMPUTATION_VAL_SET, -) +from pypots.tests.unified_data_for_test import DATA + +TRAIN_SET = "./train_set.h5" +VAL_SET = "./val_set.h5" +TEST_SET = "./test_set.h5" + +IMPUTATION_TRAIN_SET = "./imputation_train_set.h5" +IMPUTATION_VAL_SET = "./imputation_val_set.h5" + + +def save_data_set_into_h5(data, path): + with h5py.File(path, "w") as hf: + for i in data.keys(): + hf.create_dataset(i, data=data[i].astype(np.float32)) + EPOCHS = 1 class TestLazyLoadingClasses(unittest.TestCase): def setUp(self) -> None: + save_data_set_into_h5({"X": DATA["train_X"], "y": DATA["train_y"]}, TRAIN_SET) + save_data_set_into_h5({"X": DATA["val_X"], "y": DATA["val_y"]}, VAL_SET) + save_data_set_into_h5({"X": DATA["train_X"]}, IMPUTATION_TRAIN_SET) + save_data_set_into_h5({"X": DATA["val_X"]}, IMPUTATION_VAL_SET) + + save_data_set_into_h5( + { + "X": DATA["test_X"], + "X_intact": DATA["test_X_intact"], + "X_indicating_mask": DATA["test_X_indicating_mask"], + }, + TEST_SET, + ) assert os.path.exists(TRAIN_SET) assert os.path.exists(VAL_SET) diff --git a/pypots/tests/unified_data_for_test.py b/pypots/tests/unified_data_for_test.py index 93e59990..ffb0f395 100644 --- a/pypots/tests/unified_data_for_test.py +++ b/pypots/tests/unified_data_for_test.py @@ -5,8 +5,6 @@ # Created by Wenjie Du # License: GLP-v3 -import h5py -import numpy as np import torch from sklearn.model_selection import train_test_split from sklearn.preprocessing import StandardScaler @@ -15,12 +13,6 @@ from pypots.data import load_specific_dataset -def save_data_set_into_h5(data, path): - with h5py.File(path, "w") as hf: - for i in data.keys(): - hf.create_dataset(i, data=data[i].astype(np.float32)) - - def gene_random_walk_data( n_steps=24, n_features=10, 
n_classes=2, n_samples_each_class=1000 ): @@ -136,24 +128,3 @@ def gene_physionet2012(): # generate and cache data first. # Otherwise, file lock will cause bug if running test parallely with pytest-xdist. DATA = gene_random_walk_data() - -TRAIN_SET = "./train_set.h5" -VAL_SET = "./val_set.h5" -TEST_SET = "./test_set.h5" - -IMPUTATION_TRAIN_SET = "./imputation_train_set.h5" -IMPUTATION_VAL_SET = "./imputation_val_set.h5" - -save_data_set_into_h5({"X": DATA["train_X"], "y": DATA["train_y"]}, TRAIN_SET) -save_data_set_into_h5({"X": DATA["val_X"], "y": DATA["val_y"]}, VAL_SET) -save_data_set_into_h5( - { - "X": DATA["test_X"], - "X_intact": DATA["test_X_intact"], - "X_indicating_mask": DATA["test_X_indicating_mask"], - }, - TEST_SET, -) - -save_data_set_into_h5({"X": DATA["train_X"]}, IMPUTATION_TRAIN_SET) -save_data_set_into_h5({"X": DATA["val_X"]}, IMPUTATION_VAL_SET) From 46fca4148b39aa01e29c80e3f0150e6dad1c9066 Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Fri, 31 Mar 2023 22:55:29 +0800 Subject: [PATCH 18/22] refactor: test scripts; --- pypots/tests/test_classification.py | 133 +++++++++-------- pypots/tests/test_clustering.py | 78 +++++----- .../tests/test_data_lazy_loading_from_file.py | 98 +++++++------ pypots/tests/test_forecasting.py | 29 ++-- pypots/tests/test_imputation.py | 138 +++++++++--------- 5 files changed, 254 insertions(+), 222 deletions(-) diff --git a/pypots/tests/test_classification.py b/pypots/tests/test_classification.py index 034d65ab..b57b9d37 100644 --- a/pypots/tests/test_classification.py +++ b/pypots/tests/test_classification.py @@ -20,18 +20,33 @@ class TestBRITS(unittest.TestCase): - def setUp(self) -> None: - logger.info("Running test cases for BRITS...") - self.brits = BRITS( - DATA["n_steps"], - DATA["n_features"], - 256, - n_classes=DATA["n_classes"], - epochs=EPOCHS, - ) + logger.info("Running tests for a classification model BRITS...") + + # initialize a BRITS model + brits = BRITS( + DATA["n_steps"], + DATA["n_features"], + 256, + n_classes=DATA["n_classes"], + epochs=EPOCHS, + ) + + def test_0_fit(self): self.brits.fit(TRAIN_SET, VAL_SET) - def test_parameters(self): + def test_1_classify(self): + predictions = self.brits.classify(TEST_SET) + metrics = cal_binary_classification_metrics(predictions, DATA["test_y"]) + logger.info( + f'ROC_AUC: {metrics["roc_auc"]}, \n' + f'PR_AUC: {metrics["pr_auc"]},\n' + f'F1: {metrics["f1"]},\n' + f'Precision: {metrics["precision"]},\n' + f'Recall: {metrics["recall"]},\n' + ) + assert metrics["roc_auc"] >= 0.5, "ROC-AUC < 0.5" + + def test_2_parameters(self): assert hasattr(self.brits, "model") and self.brits.model is not None assert hasattr(self.brits, "optimizer") and self.brits.optimizer is not None @@ -44,8 +59,24 @@ def test_parameters(self): and self.brits.best_model_dict is not None ) - def test_classify(self): - predictions = self.brits.classify(TEST_SET) + +class TestGRUD(unittest.TestCase): + logger.info("Running tests for a classification model GRUD...") + + # initialize a GRUD model + grud = GRUD( + DATA["n_steps"], + DATA["n_features"], + 256, + n_classes=DATA["n_classes"], + epochs=EPOCHS, + ) + + def test_0_fit(self): + self.grud.fit(TRAIN_SET, VAL_SET) + + def test_1_classify(self): + predictions = self.grud.classify(TEST_SET) metrics = cal_binary_classification_metrics(predictions, DATA["test_y"]) logger.info( f'ROC_AUC: {metrics["roc_auc"]}, \n' @@ -56,20 +87,7 @@ def test_classify(self): ) assert metrics["roc_auc"] >= 0.5, "ROC-AUC < 0.5" - -class TestGRUD(unittest.TestCase): - def setUp(self) -> 
None: - logger.info("Running test cases for GRUD...") - self.grud = GRUD( - DATA["n_steps"], - DATA["n_features"], - 256, - n_classes=DATA["n_classes"], - epochs=EPOCHS, - ) - self.grud.fit(TRAIN_SET, VAL_SET) - - def test_parameters(self): + def test_2_parameters(self): assert hasattr(self.grud, "model") and self.grud.model is not None assert hasattr(self.grud, "optimizer") and self.grud.optimizer is not None @@ -82,8 +100,32 @@ def test_parameters(self): and self.grud.best_model_dict is not None ) - def test_classify(self): - predictions = self.grud.classify(TEST_SET) + +class TestRaindrop(unittest.TestCase): + logger.info("Running tests for a classification model Raindrop...") + + # initialize a Raindrop model + raindrop = Raindrop( + DATA["n_features"], + 2, + DATA["n_features"] * 4, + 256, + 2, + DATA["n_classes"], + 0.3, + DATA["n_steps"], + 0, + "mean", + False, + False, + epochs=EPOCHS, + ) + + def test_0_fit(self): + self.raindrop.fit(TRAIN_SET, VAL_SET) + + def test_1_classify(self): + predictions = self.raindrop.classify(TEST_SET) metrics = cal_binary_classification_metrics(predictions, DATA["test_y"]) logger.info( f'ROC_AUC: {metrics["roc_auc"]}, \n' @@ -94,28 +136,7 @@ def test_classify(self): ) assert metrics["roc_auc"] >= 0.5, "ROC-AUC < 0.5" - -class TestRaindrop(unittest.TestCase): - def setUp(self) -> None: - logger.info("Running test cases for Raindrop...") - self.raindrop = Raindrop( - DATA["n_features"], - 2, - DATA["n_features"] * 4, - 256, - 2, - DATA["n_classes"], - 0.3, - DATA["n_steps"], - 0, - "mean", - False, - False, - epochs=EPOCHS, - ) - self.raindrop.fit(TRAIN_SET, VAL_SET) - - def test_parameters(self): + def test_2_parameters(self): assert hasattr(self.raindrop, "model") and self.raindrop.model is not None assert ( @@ -130,18 +151,6 @@ def test_parameters(self): and self.raindrop.best_model_dict is not None ) - def test_classify(self): - predictions = self.raindrop.classify(TEST_SET) - metrics = cal_binary_classification_metrics(predictions, DATA["test_y"]) - logger.info( - f'ROC_AUC: {metrics["roc_auc"]}, \n' - f'PR_AUC: {metrics["pr_auc"]},\n' - f'F1: {metrics["f1"]},\n' - f'Precision: {metrics["precision"]},\n' - f'Recall: {metrics["recall"]},\n' - ) - assert metrics["roc_auc"] >= 0.5, "ROC-AUC < 0.5" - if __name__ == "__main__": unittest.main() diff --git a/pypots/tests/test_clustering.py b/pypots/tests/test_clustering.py index ddc36887..3696d2b5 100644 --- a/pypots/tests/test_clustering.py +++ b/pypots/tests/test_clustering.py @@ -23,19 +23,22 @@ class TestCRLI(unittest.TestCase): - def setUp(self) -> None: - logger.info("Running test cases for CRLI...") - self.crli = CRLI( - n_steps=DATA["n_steps"], - n_features=DATA["n_features"], - n_clusters=DATA["n_classes"], - n_generator_layers=2, - rnn_hidden_size=128, - epochs=EPOCHS, - ) + logger.info("Running tests for a clustering model CRLI...") + + # initialize a CRLI model + crli = CRLI( + n_steps=DATA["n_steps"], + n_features=DATA["n_features"], + n_clusters=DATA["n_classes"], + n_generator_layers=2, + rnn_hidden_size=128, + epochs=EPOCHS, + ) + + def test_0_fit(self): self.crli.fit(TRAIN_SET) - def test_parameters(self): + def test_1_parameters(self): assert hasattr(self.crli, "model") and self.crli.model is not None assert hasattr(self.crli, "G_optimizer") and self.crli.G_optimizer is not None @@ -49,7 +52,7 @@ def test_parameters(self): and self.crli.best_model_dict is not None ) - def test_cluster(self): + def test_2_cluster(self): clustering = self.crli.cluster(TEST_SET) RI = 
cal_rand_index(clustering, DATA["test_y"]) CP = cal_cluster_purity(clustering, DATA["test_y"]) @@ -57,20 +60,35 @@ def test_cluster(self): class TestVaDER(unittest.TestCase): - def setUp(self) -> None: - logger.info("Running test cases for VaDER...") - self.vader = VaDER( - n_steps=DATA["n_steps"], - n_features=DATA["n_features"], - n_clusters=DATA["n_classes"], - rnn_hidden_size=64, - d_mu_stddev=5, - pretrain_epochs=20, - epochs=EPOCHS, - ) + logger.info("Running tests for a clustering model Transformer...") + + # initialize a VaDER model + vader = VaDER( + n_steps=DATA["n_steps"], + n_features=DATA["n_features"], + n_clusters=DATA["n_classes"], + rnn_hidden_size=64, + d_mu_stddev=5, + pretrain_epochs=20, + epochs=EPOCHS, + ) + + def test_0_fit(self): self.vader.fit(TRAIN_SET) - def test_parameters(self): + def test_1_cluster(self): + try: + clustering = self.vader.cluster(TEST_SET) + RI = cal_rand_index(clustering, DATA["test_y"]) + CP = cal_cluster_purity(clustering, DATA["test_y"]) + logger.info(f"RI: {RI}\nCP: {CP}") + except np.linalg.LinAlgError as e: + logger.error( + f"{e}\n" + "Got singular matrix, please try to retrain the model to fix this" + ) + + def test_2_parameters(self): assert hasattr(self.vader, "model") and self.vader.model is not None assert hasattr(self.vader, "optimizer") and self.vader.optimizer is not None @@ -83,18 +101,6 @@ def test_parameters(self): and self.vader.best_model_dict is not None ) - def test_cluster(self): - try: - clustering = self.vader.cluster(TEST_SET) - RI = cal_rand_index(clustering, DATA["test_y"]) - CP = cal_cluster_purity(clustering, DATA["test_y"]) - logger.info(f"RI: {RI}\nCP: {CP}") - except np.linalg.LinAlgError as e: - logger.info( - f"{e}\n" - "Got singular matrix, please try to retrain the model to fix this" - ) - if __name__ == "__main__": unittest.main() diff --git a/pypots/tests/test_data_lazy_loading_from_file.py b/pypots/tests/test_data_lazy_loading_from_file.py index bfc1464a..77b7faca 100644 --- a/pypots/tests/test_data_lazy_loading_from_file.py +++ b/pypots/tests/test_data_lazy_loading_from_file.py @@ -9,11 +9,11 @@ import unittest import h5py -import numpy as np from pypots.classification import BRITS, GRUD from pypots.imputation import SAITS from pypots.tests.unified_data_for_test import DATA +from pypots.utils.logging import logger TRAIN_SET = "./train_set.h5" VAL_SET = "./val_set.h5" @@ -26,28 +26,65 @@ def save_data_set_into_h5(data, path): with h5py.File(path, "w") as hf: for i in data.keys(): - hf.create_dataset(i, data=data[i].astype(np.float32)) + tp = int if i == "y" else "float32" + hf.create_dataset(i, data=data[i].astype(tp)) EPOCHS = 1 +save_data_set_into_h5( + {"X": DATA["train_X"], "y": DATA["train_y"].astype(int)}, TRAIN_SET +) +save_data_set_into_h5({"X": DATA["val_X"], "y": DATA["val_y"].astype(int)}, VAL_SET) +save_data_set_into_h5({"X": DATA["train_X"]}, IMPUTATION_TRAIN_SET) +save_data_set_into_h5({"X": DATA["val_X"]}, IMPUTATION_VAL_SET) + +save_data_set_into_h5( + { + "X": DATA["test_X"], + "X_intact": DATA["test_X_intact"], + "X_indicating_mask": DATA["test_X_indicating_mask"], + }, + TEST_SET, +) + class TestLazyLoadingClasses(unittest.TestCase): - def setUp(self) -> None: - save_data_set_into_h5({"X": DATA["train_X"], "y": DATA["train_y"]}, TRAIN_SET) - save_data_set_into_h5({"X": DATA["val_X"], "y": DATA["val_y"]}, VAL_SET) - save_data_set_into_h5({"X": DATA["train_X"]}, IMPUTATION_TRAIN_SET) - save_data_set_into_h5({"X": DATA["val_X"]}, IMPUTATION_VAL_SET) - - save_data_set_into_h5( - { - "X": 
DATA["test_X"], - "X_intact": DATA["test_X_intact"], - "X_indicating_mask": DATA["test_X_indicating_mask"], - }, - TEST_SET, - ) + logger.info("Running tests for Dataset classes with lazy-loading strategy...") + + # initialize a SAITS model for testing DatasetForMIT and BaseDataset + saits = SAITS( + DATA["n_steps"], + DATA["n_features"], + n_layers=2, + d_model=256, + d_inner=128, + n_head=4, + d_k=64, + d_v=64, + dropout=0.1, + epochs=EPOCHS, + ) + + # initialize a BRITS model for testing DatasetForBRITS + brits = BRITS( + DATA["n_steps"], + DATA["n_features"], + 256, + n_classes=DATA["n_classes"], + epochs=EPOCHS, + ) + + # initialize a GRUD model for testing DatasetForGRUD + grud = GRUD( + DATA["n_steps"], + DATA["n_features"], + 256, + n_classes=DATA["n_classes"], + epochs=EPOCHS, + ) + def setUp(self) -> None: assert os.path.exists(TRAIN_SET) assert os.path.exists(VAL_SET) assert os.path.exists(TEST_SET) @@ -55,35 +92,6 @@ def setUp(self) -> None: assert os.path.exists(IMPUTATION_TRAIN_SET) assert os.path.exists(IMPUTATION_VAL_SET) - self.saits = SAITS( - DATA["n_steps"], - DATA["n_features"], - n_layers=2, - d_model=256, - d_inner=128, - n_head=4, - d_k=64, - d_v=64, - dropout=0.1, - epochs=EPOCHS, - ) - - self.brits = BRITS( - DATA["n_steps"], - DATA["n_features"], - 256, - n_classes=DATA["n_classes"], - epochs=EPOCHS, - ) - - self.grud = GRUD( - DATA["n_steps"], - DATA["n_features"], - 256, - n_classes=DATA["n_classes"], - epochs=EPOCHS, - ) - def test_DatasetForMIT(self): self.saits.fit(train_set=IMPUTATION_TRAIN_SET, val_set=IMPUTATION_VAL_SET) _ = self.saits.impute(X=TEST_SET) diff --git a/pypots/tests/test_forecasting.py b/pypots/tests/test_forecasting.py index 409c7f81..f44d7207 100644 --- a/pypots/tests/test_forecasting.py +++ b/pypots/tests/test_forecasting.py @@ -20,20 +20,21 @@ class TestBTTF(unittest.TestCase): - def setUp(self) -> None: - logger.info("Running test cases for BTTF...") - self.bttf = BTTF( - 100, - 10, - 20, - 2, - 10, - np.asarray([1, 2, 3, 10, 10 + 1, 10 + 2, 20, 20 + 1, 20 + 2]), - 5, - 5, - ) - - def test_forecasting(self): + logger.info("Running tests for a forecasting model BTTF...") + + # initialize a BTTF model + bttf = BTTF( + 100, + 10, + 20, + 2, + 10, + np.asarray([1, 2, 3, 10, 10 + 1, 10 + 2, 20, 20 + 1, 20 + 2]), + 5, + 5, + ) + + def test_0_forecasting(self): predictions = self.bttf.forecast(TEST_SET) mae = cal_mae(predictions, DATA["test_X_intact"][:, 100:]) logger.info(f"prediction MAE: {mae}") diff --git a/pypots/tests/test_imputation.py b/pypots/tests/test_imputation.py index 8367eb21..209b50f4 100644 --- a/pypots/tests/test_imputation.py +++ b/pypots/tests/test_imputation.py @@ -28,23 +28,36 @@ class TestSAITS(unittest.TestCase): - def setUp(self) -> None: - logger.info("Running test cases for SAITS...") - self.saits = SAITS( - DATA["n_steps"], - DATA["n_features"], - n_layers=2, - d_model=256, - d_inner=128, - n_head=4, - d_k=64, - d_v=64, - dropout=0.1, - epochs=EPOCH, - ) + logger.info("Running tests for an imputation model SAITS...") + + # initialize a SAITS model + saits = SAITS( + DATA["n_steps"], + DATA["n_features"], + n_layers=2, + d_model=256, + d_inner=128, + n_head=4, + d_k=64, + d_v=64, + dropout=0.1, + epochs=EPOCH, + ) + + def test_0_fit(self): self.saits.fit(TRAIN_SET, VAL_SET) - def test_parameters(self): + def test_1_impute(self): + imputed_X = self.saits.impute(TEST_SET) + assert not np.isnan( + imputed_X + ).any(), "Output still has missing values after running impute()." 
+ test_MAE = cal_mae( + imputed_X, DATA["test_X_intact"], DATA["test_X_indicating_mask"] + ) + logger.info(f"SAITS test_MAE: {test_MAE}") + + def test_2_parameters(self): assert hasattr(self.saits, "model") and self.saits.model is not None assert hasattr(self.saits, "optimizer") and self.saits.optimizer is not None @@ -57,35 +70,38 @@ def test_parameters(self): and self.saits.best_model_dict is not None ) - def test_impute(self): - imputed_X = self.saits.impute(TEST_SET) + +class TestTransformer(unittest.TestCase): + logger.info("Running tests for an imputation model Transformer...") + + # initialize a Transformer model + transformer = Transformer( + DATA["n_steps"], + DATA["n_features"], + n_layers=2, + d_model=256, + d_inner=128, + n_head=4, + d_k=64, + d_v=64, + dropout=0.1, + epochs=EPOCH, + ) + + def test_0_fit(self): + self.transformer.fit(TRAIN_SET, VAL_SET) + + def test_1_impute(self): + imputed_X = self.transformer.impute(TEST_SET) assert not np.isnan( imputed_X ).any(), "Output still has missing values after running impute()." test_MAE = cal_mae( imputed_X, DATA["test_X_intact"], DATA["test_X_indicating_mask"] ) - logger.info(f"SAITS test_MAE: {test_MAE}") - - -class TestTransformer(unittest.TestCase): - def setUp(self) -> None: - logger.info("Running test cases for Transformer...") - self.transformer = Transformer( - DATA["n_steps"], - DATA["n_features"], - n_layers=2, - d_model=256, - d_inner=128, - n_head=4, - d_k=64, - d_v=64, - dropout=0.1, - epochs=EPOCH, - ) - self.transformer.fit(TRAIN_SET, VAL_SET) + logger.info(f"Transformer test_MAE: {test_MAE}") - def test_parameters(self): + def test_2_parameters(self): assert hasattr(self.transformer, "model") and self.transformer.model is not None assert ( @@ -101,24 +117,27 @@ def test_parameters(self): and self.transformer.best_model_dict is not None ) - def test_impute(self): - imputed_X = self.transformer.impute(TEST_SET) + +class TestBRITS(unittest.TestCase): + logger.info("Running tests for an imputation model BRITS...") + + # initialize a BRITS model + brits = BRITS(DATA["n_steps"], DATA["n_features"], 256, epochs=EPOCH) + + def test_0_fit(self): + self.brits.fit(TRAIN_SET, VAL_SET) + + def test_1_impute(self): + imputed_X = self.brits.impute(TEST_SET) assert not np.isnan( imputed_X ).any(), "Output still has missing values after running impute()." test_MAE = cal_mae( imputed_X, DATA["test_X_intact"], DATA["test_X_indicating_mask"] ) - logger.info(f"Transformer test_MAE: {test_MAE}") - - -class TestBRITS(unittest.TestCase): - def setUp(self) -> None: - logger.info("Running test cases for BRITS...") - self.brits = BRITS(DATA["n_steps"], DATA["n_features"], 256, epochs=EPOCH) - self.brits.fit(TRAIN_SET, VAL_SET) + logger.info(f"BRITS test_MAE: {test_MAE}") - def test_parameters(self): + def test_2_parameters(self): assert hasattr(self.brits, "model") and self.brits.model is not None assert hasattr(self.brits, "optimizer") and self.brits.optimizer is not None @@ -131,26 +150,12 @@ def test_parameters(self): and self.brits.best_model_dict is not None ) - def test_impute(self): - imputed_X = self.brits.impute(TEST_SET) - assert not np.isnan( - imputed_X - ).any(), "Output still has missing values after running impute()." 
- test_MAE = cal_mae( - imputed_X, DATA["test_X_intact"], DATA["test_X_indicating_mask"] - ) - logger.info(f"BRITS test_MAE: {test_MAE}") - class TestLOCF(unittest.TestCase): - def setUp(self) -> None: - logger.info("Running test cases for LOCF...") - self.locf = LOCF(nan=0) - - def test_parameters(self): - assert hasattr(self.locf, "nan") and self.locf.nan is not None + logger.info("Running tests for an imputation model LOCF...") + locf = LOCF(nan=0) - def test_impute(self): + def test_0_impute(self): test_X_imputed = self.locf.impute(TEST_SET) assert not np.isnan( test_X_imputed @@ -160,6 +165,9 @@ def test_impute(self): ) logger.info(f"LOCF test_MAE: {test_MAE}") + def test_1_parameters(self): + assert hasattr(self.locf, "nan") and self.locf.nan is not None + if __name__ == "__main__": unittest.main() From 13a7cd15c123136a64fcda780805cade4327f87c Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Fri, 31 Mar 2023 23:56:40 +0800 Subject: [PATCH 19/22] fix: use annotation @pytest.mark.xdist_group to help pytest-dist execute tasks sequentially; Some test tasks need to be executed sequentially, but we're using pytest-dist to accelerate the testing precess. To solve this problem, refer to https://github.com/pytest-dev/pytest-xdist/issues/385#issuecomment-1304877301. And please note that it need pytest-dist >= v2.5.0. --- .github/workflows/testing.yml | 14 +++--- pypots/tests/environment_test.yml | 2 +- pypots/tests/test_classification.py | 13 +++++- pypots/tests/test_clustering.py | 9 +++- ...lazy_loading_from_file.py => test_data.py} | 13 ++++-- pypots/tests/test_forecasting.py | 2 + pypots/tests/test_imputation.py | 14 +++++- pypots/tests/test_logging.py | 43 +++++++++++-------- 8 files changed, 78 insertions(+), 32 deletions(-) rename pypots/tests/{test_data_lazy_loading_from_file.py => test_data.py} (87%) diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml index 6c201f5f..50cba726 100644 --- a/.github/workflows/testing.yml +++ b/.github/workflows/testing.yml @@ -43,13 +43,13 @@ jobs: - name: Test with pytest run: | # run tests separately here due to Segmentation Fault in test_clustering when run all in - # one command with `pytest` on MacOS. Bugs not catched, so this is a trade-off to avoid SF. - python -m pytest -rA pypots/tests/test_classification.py -n auto --cov=pypots - python -m pytest -rA pypots/tests/test_imputation.py -n auto --cov=pypots --cov-append - python -m pytest -rA pypots/tests/test_clustering.py -n auto --cov=pypots --cov-append - python -m pytest -rA pypots/tests/test_forecasting.py -n auto --cov=pypots --cov-append - python -m pytest -rA pypots/tests/test_data_lazy_loading_from_file.py -n auto --cov=pypots --cov-append - python -m pytest -rA pypots/tests/test_logging.py -n auto --cov=pypots --cov-append + # one command with `pytest` on MacOS. Bugs not caught, so this is a trade-off to avoid SF. 
+ python -m pytest -rA pypots/tests/test_classification.py -n auto --cov=pypots --dist=loadgroup + python -m pytest -rA pypots/tests/test_imputation.py -n auto --cov=pypots --cov-append --dist=loadgroup + python -m pytest -rA pypots/tests/test_clustering.py -n auto --cov=pypots --cov-append --dist=loadgroup + python -m pytest -rA pypots/tests/test_forecasting.py -n auto --cov=pypots --cov-append --dist=loadgroup + python -m pytest -rA pypots/tests/test_data_lazy_loading_from_file.py -n auto --cov=pypots --cov-append --dist=loadgroup + python -m pytest -rA pypots/tests/test_logging.py -n auto --cov=pypots --cov-append --dist=loadgroup - name: Generate the LCOV report run: | diff --git a/pypots/tests/environment_test.yml b/pypots/tests/environment_test.yml index dc4e3316..ceadbe60 100644 --- a/pypots/tests/environment_test.yml +++ b/pypots/tests/environment_test.yml @@ -14,7 +14,7 @@ dependencies: - conda-forge::tensorboard - conda-forge::pip - conda-forge::pytest-cov - - conda-forge::pytest-xdist + - conda-forge::pytest-xdist>=2.5.0 - conda-forge::coverage - conda-forge::pycorruptor - conda-forge::tsdb diff --git a/pypots/tests/test_classification.py b/pypots/tests/test_classification.py index b57b9d37..36f48484 100644 --- a/pypots/tests/test_classification.py +++ b/pypots/tests/test_classification.py @@ -7,10 +7,12 @@ import unittest +import pytest + from pypots.classification import BRITS, GRUD, Raindrop from pypots.tests.unified_data_for_test import DATA -from pypots.utils.metrics import cal_binary_classification_metrics from pypots.utils.logging import logger +from pypots.utils.metrics import cal_binary_classification_metrics EPOCHS = 5 @@ -31,9 +33,11 @@ class TestBRITS(unittest.TestCase): epochs=EPOCHS, ) + @pytest.mark.xdist_group(name="classification-brits") def test_0_fit(self): self.brits.fit(TRAIN_SET, VAL_SET) + @pytest.mark.xdist_group(name="classification-brits") def test_1_classify(self): predictions = self.brits.classify(TEST_SET) metrics = cal_binary_classification_metrics(predictions, DATA["test_y"]) @@ -46,6 +50,7 @@ def test_1_classify(self): ) assert metrics["roc_auc"] >= 0.5, "ROC-AUC < 0.5" + @pytest.mark.xdist_group(name="classification-brits") def test_2_parameters(self): assert hasattr(self.brits, "model") and self.brits.model is not None @@ -72,9 +77,11 @@ class TestGRUD(unittest.TestCase): epochs=EPOCHS, ) + @pytest.mark.xdist_group(name="classification-grud") def test_0_fit(self): self.grud.fit(TRAIN_SET, VAL_SET) + @pytest.mark.xdist_group(name="classification-grud") def test_1_classify(self): predictions = self.grud.classify(TEST_SET) metrics = cal_binary_classification_metrics(predictions, DATA["test_y"]) @@ -87,6 +94,7 @@ def test_1_classify(self): ) assert metrics["roc_auc"] >= 0.5, "ROC-AUC < 0.5" + @pytest.mark.xdist_group(name="classification-grud") def test_2_parameters(self): assert hasattr(self.grud, "model") and self.grud.model is not None @@ -121,9 +129,11 @@ class TestRaindrop(unittest.TestCase): epochs=EPOCHS, ) + @pytest.mark.xdist_group(name="classification-raindrop") def test_0_fit(self): self.raindrop.fit(TRAIN_SET, VAL_SET) + @pytest.mark.xdist_group(name="classification-raindrop") def test_1_classify(self): predictions = self.raindrop.classify(TEST_SET) metrics = cal_binary_classification_metrics(predictions, DATA["test_y"]) @@ -136,6 +146,7 @@ def test_1_classify(self): ) assert metrics["roc_auc"] >= 0.5, "ROC-AUC < 0.5" + @pytest.mark.xdist_group(name="classification-raindrop") def test_2_parameters(self): assert hasattr(self.raindrop, 
"model") and self.raindrop.model is not None diff --git a/pypots/tests/test_clustering.py b/pypots/tests/test_clustering.py index 3696d2b5..15b00736 100644 --- a/pypots/tests/test_clustering.py +++ b/pypots/tests/test_clustering.py @@ -9,10 +9,11 @@ import unittest import numpy as np +import pytest from pypots.clustering import VaDER, CRLI -from pypots.utils.logging import logger from pypots.tests.unified_data_for_test import DATA +from pypots.utils.logging import logger from pypots.utils.metrics import cal_rand_index, cal_cluster_purity EPOCHS = 5 @@ -35,9 +36,11 @@ class TestCRLI(unittest.TestCase): epochs=EPOCHS, ) + @pytest.mark.xdist_group(name="clustering-crli") def test_0_fit(self): self.crli.fit(TRAIN_SET) + @pytest.mark.xdist_group(name="clustering-crli") def test_1_parameters(self): assert hasattr(self.crli, "model") and self.crli.model is not None @@ -52,6 +55,7 @@ def test_1_parameters(self): and self.crli.best_model_dict is not None ) + @pytest.mark.xdist_group(name="clustering-crli") def test_2_cluster(self): clustering = self.crli.cluster(TEST_SET) RI = cal_rand_index(clustering, DATA["test_y"]) @@ -73,9 +77,11 @@ class TestVaDER(unittest.TestCase): epochs=EPOCHS, ) + @pytest.mark.xdist_group(name="clustering-vader") def test_0_fit(self): self.vader.fit(TRAIN_SET) + @pytest.mark.xdist_group(name="clustering-vader") def test_1_cluster(self): try: clustering = self.vader.cluster(TEST_SET) @@ -88,6 +94,7 @@ def test_1_cluster(self): "Got singular matrix, please try to retrain the model to fix this" ) + @pytest.mark.xdist_group(name="clustering-vader") def test_2_parameters(self): assert hasattr(self.vader, "model") and self.vader.model is not None diff --git a/pypots/tests/test_data_lazy_loading_from_file.py b/pypots/tests/test_data.py similarity index 87% rename from pypots/tests/test_data_lazy_loading_from_file.py rename to pypots/tests/test_data.py index 77b7faca..eff222a9 100644 --- a/pypots/tests/test_data_lazy_loading_from_file.py +++ b/pypots/tests/test_data.py @@ -9,6 +9,7 @@ import unittest import h5py +import pytest from pypots.classification import BRITS, GRUD from pypots.imputation import SAITS @@ -92,15 +93,21 @@ def setUp(self) -> None: assert os.path.exists(IMPUTATION_TRAIN_SET) assert os.path.exists(IMPUTATION_VAL_SET) - def test_DatasetForMIT(self): + @pytest.mark.xdist_group(name="data-lazy-loading") + def test_0_DatasetForMIT(self): self.saits.fit(train_set=IMPUTATION_TRAIN_SET, val_set=IMPUTATION_VAL_SET) + + @pytest.mark.xdist_group(name="data-lazy-loading") + def test_1_BaseDataset(self): _ = self.saits.impute(X=TEST_SET) - def test_DatasetForBRITS(self): + @pytest.mark.xdist_group(name="data-lazy-loading") + def test_2_DatasetForBRITS(self): self.brits.fit(train_set=TRAIN_SET, val_set=VAL_SET) _ = self.brits.classify(X=TEST_SET) - def test_DatasetForGRUD(self): + @pytest.mark.xdist_group(name="data-lazy-loading") + def test_3_DatasetForGRUD(self): self.grud.fit(train_set=TRAIN_SET, val_set=VAL_SET) _ = self.grud.classify(X=TEST_SET) diff --git a/pypots/tests/test_forecasting.py b/pypots/tests/test_forecasting.py index f44d7207..7a6bed4d 100644 --- a/pypots/tests/test_forecasting.py +++ b/pypots/tests/test_forecasting.py @@ -8,6 +8,7 @@ import unittest import numpy as np +import pytest from pypots.forecasting import BTTF from pypots.tests.unified_data_for_test import gene_random_walk_data @@ -34,6 +35,7 @@ class TestBTTF(unittest.TestCase): 5, ) + @pytest.mark.xdist_group(name="forecasting-bttf") def test_0_forecasting(self): predictions = 
self.bttf.forecast(TEST_SET) mae = cal_mae(predictions, DATA["test_X_intact"][:, 100:]) diff --git a/pypots/tests/test_imputation.py b/pypots/tests/test_imputation.py index 209b50f4..34d75153 100644 --- a/pypots/tests/test_imputation.py +++ b/pypots/tests/test_imputation.py @@ -9,6 +9,7 @@ import unittest import numpy as np +import pytest from pypots.imputation import ( SAITS, @@ -17,8 +18,8 @@ LOCF, ) from pypots.tests.unified_data_for_test import DATA -from pypots.utils.metrics import cal_mae from pypots.utils.logging import logger +from pypots.utils.metrics import cal_mae EPOCH = 5 @@ -44,9 +45,11 @@ class TestSAITS(unittest.TestCase): epochs=EPOCH, ) + @pytest.mark.xdist_group(name="imputation-saits") def test_0_fit(self): self.saits.fit(TRAIN_SET, VAL_SET) + @pytest.mark.xdist_group(name="imputation-saits") def test_1_impute(self): imputed_X = self.saits.impute(TEST_SET) assert not np.isnan( @@ -57,6 +60,7 @@ def test_1_impute(self): ) logger.info(f"SAITS test_MAE: {test_MAE}") + @pytest.mark.xdist_group(name="imputation-saits") def test_2_parameters(self): assert hasattr(self.saits, "model") and self.saits.model is not None @@ -88,9 +92,11 @@ class TestTransformer(unittest.TestCase): epochs=EPOCH, ) + @pytest.mark.xdist_group(name="imputation-transformer") def test_0_fit(self): self.transformer.fit(TRAIN_SET, VAL_SET) + @pytest.mark.xdist_group(name="imputation-transformer") def test_1_impute(self): imputed_X = self.transformer.impute(TEST_SET) assert not np.isnan( @@ -101,6 +107,7 @@ def test_1_impute(self): ) logger.info(f"Transformer test_MAE: {test_MAE}") + @pytest.mark.xdist_group(name="imputation-transformer") def test_2_parameters(self): assert hasattr(self.transformer, "model") and self.transformer.model is not None @@ -124,9 +131,11 @@ class TestBRITS(unittest.TestCase): # initialize a BRITS model brits = BRITS(DATA["n_steps"], DATA["n_features"], 256, epochs=EPOCH) + @pytest.mark.xdist_group(name="imputation-brits") def test_0_fit(self): self.brits.fit(TRAIN_SET, VAL_SET) + @pytest.mark.xdist_group(name="imputation-brits") def test_1_impute(self): imputed_X = self.brits.impute(TEST_SET) assert not np.isnan( @@ -137,6 +146,7 @@ def test_1_impute(self): ) logger.info(f"BRITS test_MAE: {test_MAE}") + @pytest.mark.xdist_group(name="imputation-brits") def test_2_parameters(self): assert hasattr(self.brits, "model") and self.brits.model is not None @@ -155,6 +165,7 @@ class TestLOCF(unittest.TestCase): logger.info("Running tests for an imputation model LOCF...") locf = LOCF(nan=0) + @pytest.mark.xdist_group(name="imputation-locf") def test_0_impute(self): test_X_imputed = self.locf.impute(TEST_SET) assert not np.isnan( @@ -165,6 +176,7 @@ def test_0_impute(self): ) logger.info(f"LOCF test_MAE: {test_MAE}") + @pytest.mark.xdist_group(name="imputation-locf") def test_1_parameters(self): assert hasattr(self.locf, "nan") and self.locf.nan is not None diff --git a/pypots/tests/test_logging.py b/pypots/tests/test_logging.py index 3ebc3fca..f3c888fe 100644 --- a/pypots/tests/test_logging.py +++ b/pypots/tests/test_logging.py @@ -13,30 +13,37 @@ class TestLogger(unittest.TestCase): - def setUp(self) -> None: - self.logger_creator = Logger(name="PyPOTS testing log", logging_level="debug") - self.logger = self.logger_creator.logger + logger_creator = Logger(name="PyPOTS testing log", logging_level="debug") + logger = logger_creator.logger def test_different_level_logging(self): - self.logger.debug('debug') - self.logger.info('info') - self.logger.warning('warning') - 
self.logger.error('error') + self.logger.debug("debug") + self.logger.info("info") + self.logger.warning("warning") + self.logger.error("error") def test_changing_level(self): - self.logger_creator.set_level('info') - assert self.logger.level == 20, f'the level of logger is {self.logger.level}, not INFO' - self.logger_creator.set_level('warning') - assert self.logger.level == 30, f'the level of logger is {self.logger.level}, not WARNING' - self.logger_creator.set_level('error') - assert self.logger.level == 40, f'the level of logger is {self.logger.level}, not ERROR' - self.logger_creator.set_level('debug') - assert self.logger.level == 10, f'the level of logger is {self.logger.level}, not DEBUG' + self.logger_creator.set_level("info") + assert ( + self.logger.level == 20 + ), f"the level of logger is {self.logger.level}, not INFO" + self.logger_creator.set_level("warning") + assert ( + self.logger.level == 30 + ), f"the level of logger is {self.logger.level}, not WARNING" + self.logger_creator.set_level("error") + assert ( + self.logger.level == 40 + ), f"the level of logger is {self.logger.level}, not ERROR" + self.logger_creator.set_level("debug") + assert ( + self.logger.level == 10 + ), f"the level of logger is {self.logger.level}, not DEBUG" def test_saving_log_into_file(self): - self.logger_creator.set_saving_path('test_log', 'testing.log') - assert os.path.exists('test_log/testing.log') - shutil.rmtree('test_log', ignore_errors=True) + self.logger_creator.set_saving_path("test_log", "testing.log") + assert os.path.exists("test_log/testing.log") + shutil.rmtree("test_log", ignore_errors=True) if __name__ == "__main__": From 9ad9c7ea775bfa40ae469f5bd63006f7790f5009 Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Sat, 1 Apr 2023 00:00:23 +0800 Subject: [PATCH 20/22] fix: fix some warnings while running VaDER; --- pypots/clustering/vader.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pypots/clustering/vader.py b/pypots/clustering/vader.py index 128743a4..9a7a0e1f 100644 --- a/pypots/clustering/vader.py +++ b/pypots/clustering/vader.py @@ -103,7 +103,7 @@ def set_values(self, mu, var, phi): assert phi.shape == self.phi_c_unscaled.shape self.mu_c_unscaled = torch.nn.Parameter(mu) self.var_c_unscaled = torch.nn.Parameter(var) - self.phi_c_unscaled = torch.tensor(phi) + self.phi_c_unscaled = phi def forward(self): mu_c = self.mu_c_unscaled @@ -293,6 +293,7 @@ def forward(self, inputs, pretrain=False): ii, jj = torch.meshgrid( torch.arange(self.n_clusters, dtype=torch.int64, device=device), torch.arange(batch_size, dtype=torch.int64, device=device), + indexing="ij", ) ii = ii.flatten() jj = jj.flatten() From e7bee57223abc2bb8aaa065092d309f75c5f86c0 Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Sat, 1 Apr 2023 01:02:24 +0800 Subject: [PATCH 21/22] fix: move dataset saving into test steps; --- pypots/tests/test_data.py | 47 +++++++++++++++++---------------------- 1 file changed, 20 insertions(+), 27 deletions(-) diff --git a/pypots/tests/test_data.py b/pypots/tests/test_data.py index eff222a9..bf2c238d 100644 --- a/pypots/tests/test_data.py +++ b/pypots/tests/test_data.py @@ -33,22 +33,6 @@ def save_data_set_into_h5(data, path): EPOCHS = 1 -save_data_set_into_h5( - {"X": DATA["train_X"], "y": DATA["train_y"].astype(int)}, TRAIN_SET -) -save_data_set_into_h5({"X": DATA["val_X"], "y": DATA["val_y"].astype(int)}, VAL_SET) -save_data_set_into_h5({"X": DATA["train_X"]}, IMPUTATION_TRAIN_SET) -save_data_set_into_h5({"X": DATA["val_X"]}, IMPUTATION_VAL_SET) - 
-save_data_set_into_h5( - { - "X": DATA["test_X"], - "X_intact": DATA["test_X_intact"], - "X_indicating_mask": DATA["test_X_indicating_mask"], - }, - TEST_SET, -) - class TestLazyLoadingClasses(unittest.TestCase): logger.info("Running tests for Dataset classes with lazy-loading strategy...") @@ -85,20 +69,29 @@ class TestLazyLoadingClasses(unittest.TestCase): epochs=EPOCHS, ) - def setUp(self) -> None: - assert os.path.exists(TRAIN_SET) - assert os.path.exists(VAL_SET) - assert os.path.exists(TEST_SET) - - assert os.path.exists(IMPUTATION_TRAIN_SET) - assert os.path.exists(IMPUTATION_VAL_SET) - @pytest.mark.xdist_group(name="data-lazy-loading") - def test_0_DatasetForMIT(self): - self.saits.fit(train_set=IMPUTATION_TRAIN_SET, val_set=IMPUTATION_VAL_SET) + def test_0_save_datasets_into_files(self): + save_data_set_into_h5( + {"X": DATA["train_X"], "y": DATA["train_y"].astype(int)}, TRAIN_SET + ) + save_data_set_into_h5( + {"X": DATA["val_X"], "y": DATA["val_y"].astype(int)}, VAL_SET + ) + save_data_set_into_h5({"X": DATA["train_X"]}, IMPUTATION_TRAIN_SET) + save_data_set_into_h5({"X": DATA["val_X"]}, IMPUTATION_VAL_SET) + + save_data_set_into_h5( + { + "X": DATA["test_X"], + "X_intact": DATA["test_X_intact"], + "X_indicating_mask": DATA["test_X_indicating_mask"], + }, + TEST_SET, + ) @pytest.mark.xdist_group(name="data-lazy-loading") - def test_1_BaseDataset(self): + def test_1_DatasetForMIT_BaseDataset(self): + self.saits.fit(train_set=IMPUTATION_TRAIN_SET, val_set=IMPUTATION_VAL_SET) _ = self.saits.impute(X=TEST_SET) @pytest.mark.xdist_group(name="data-lazy-loading") From 235c6070268b5041570e4997c90142d3f435075d Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Sat, 1 Apr 2023 01:17:31 +0800 Subject: [PATCH 22/22] fix: the error file name of test_data.py; --- .github/workflows/testing.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml index 50cba726..41b70c44 100644 --- a/.github/workflows/testing.yml +++ b/.github/workflows/testing.yml @@ -48,7 +48,7 @@ jobs: python -m pytest -rA pypots/tests/test_imputation.py -n auto --cov=pypots --cov-append --dist=loadgroup python -m pytest -rA pypots/tests/test_clustering.py -n auto --cov=pypots --cov-append --dist=loadgroup python -m pytest -rA pypots/tests/test_forecasting.py -n auto --cov=pypots --cov-append --dist=loadgroup - python -m pytest -rA pypots/tests/test_data_lazy_loading_from_file.py -n auto --cov=pypots --cov-append --dist=loadgroup + python -m pytest -rA pypots/tests/test_data.py -n auto --cov=pypots --cov-append --dist=loadgroup python -m pytest -rA pypots/tests/test_logging.py -n auto --cov=pypots --cov-append --dist=loadgroup - name: Generate the LCOV report