From f8da4f6365e6fc2cea2e68338d3625088755ba3f Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Mon, 16 Jan 2023 21:37:10 +0800 Subject: [PATCH 01/22] feat: add the lazy-loading strategy for BaseDataset; --- pypots/data/base.py | 158 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 149 insertions(+), 9 deletions(-) diff --git a/pypots/data/base.py b/pypots/data/base.py index 827b5d93..462852e1 100644 --- a/pypots/data/base.py +++ b/pypots/data/base.py @@ -5,8 +5,12 @@ # Created by Wenjie Du # License: GPL-v3 -from torch.utils.data import Dataset +import h5py import torch +from torch.utils.data import Dataset + +# Currently we only support h5 files +SUPPORTED_DATASET_FILE_TYPE = ["h5py"] class BaseDataset(Dataset): @@ -19,32 +23,81 @@ class BaseDataset(Dataset): y : tensor, shape of [n_samples], optional, default=None, Classification labels of according time-series samples. + + file_path : str, + The path to the dataset file. + + file_type : str, + The type of the given file, should be one of `numpy`, `h5py`, `pickle`. """ - def __init__(self, X, y=None): + def __init__(self, X=None, y=None, file_path=None, file_type="h5py"): super().__init__() # types and shapes had been checked after X and y input into the model # So they are safe to use here. No need to check again. + + assert X is None and file_path is None, f"X and file_path cannot both be None." + assert ( + X is not None and file_path is not None + ), f"X and file_path cannot both be given. Either of them should be given." + assert ( + file_type in SUPPORTED_DATASET_FILE_TYPE + ), f"file_type should be one of {SUPPORTED_DATASET_FILE_TYPE}, but got {file_type}" + self.X = X self.y = y - self.n_steps = self.X.shape[1] - self.n_features = self.X.shape[2] + self.file_path = file_path + self.file_type = file_type + if self.file_path is not None: + self.file_handler = self._open_file_handle() + assert ( + "X" in self.file_handler.keys() + ), "The given dataset file doesn't contains X. Please double check." + self.sample_num = self._get_sample_num() + + if self.X is not None: + self.fetch_data = self._fetch_data_from_array + else: + self.fetch_data = self._fetch_data_from_file + + def _get_sample_num(self): + """Determine the number of samples in the dataset and return the number. + + Returns + ------- + sample_num : int + The number of the samples in the given dataset. + """ + if self.X is not None: + sample_num = len(self.X) + elif self.file_type == "h5py": + with h5py.File(self.file_path, "r") as hf: + sample_num = len(hf["X"]) + else: + raise TypeError(f"So far only h5py is supported.") + + return sample_num def __len__(self): - return len(self.X) + return self.sample_num - def __getitem__(self, idx): - """Fetch data according to index. + def _fetch_data_from_array(self, idx): + """Fetch data from self.X if it is given. Parameters ---------- idx : int, - The index to fetch the specified sample. + The index of the sample to be return. + + Returns + ------- + sample : list, + The collated data sample, a list including all necessary sample info. """ + X = self.X[idx] missing_mask = ~torch.isnan(X) X = torch.nan_to_num(X) - sample = [ torch.tensor(idx), X.to(torch.float32), @@ -55,3 +108,90 @@ def __getitem__(self, idx): sample.append(self.y[idx].to(torch.long)) return sample + + def _open_file_handle(self): + """Open the file handle for reading data from the file. + + Notes + ----- + This function can also help confirm if the given file and file type match. + + Returns + ------- + file_handle : file. 
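The lazy-loading strategy above expects the dataset file to be an HDF5 file exposing the key "X" (and optionally "y" for labels), as the assertion on file_handler.keys() shows. An illustrative sketch of how such a file could be prepared, assuming only h5py and numpy; the helper name and toy shapes are invented for the example and are not part of PyPOTS:

# Illustrative only: write a dataset file that the lazy-loading BaseDataset can read,
# i.e. an HDF5 file carrying the key "X" (and optionally "y").
import h5py
import numpy as np

def save_h5_dataset(path, X, y=None):
    # X: array of shape [n_samples, n_steps, n_features]; NaNs mark missing values
    with h5py.File(path, "w") as hf:
        hf.create_dataset("X", data=np.asarray(X, dtype=np.float32))
        if y is not None:
            hf.create_dataset("y", data=np.asarray(y))

toy_X = np.random.randn(100, 48, 37).astype(np.float32)
toy_X[toy_X < -1.5] = np.nan  # artificially introduce missing values
save_h5_dataset("toy_dataset.h5", toy_X, y=np.random.randint(0, 2, size=100))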
+ + """ + try: + file_handler = h5py.File( + self.file_path, "r" + ) # set swmr=True if the h5 file need to be written into new content during reading + except OSError as e: + raise TypeError( + f"{e} This probably is caused by file type error. " + f"Please confirm that the given file {self.file_path} is an h5 file." + ) + except Exception as e: + raise RuntimeError(e) + return file_handler + + def _fetch_data_from_file(self, idx): + """Fetch data with the lazy-loading strategy, i.e. only loading data from the file while requesting for samples. + Here the opened file handle does not load the entire dataset into RAM but only load the currently accessed slice + + Notes + ----- + Multi workers reading from h5 file is tricky, and I was confronted with a problem similar to + https://discuss.pytorch.org/t/dataloader-when-num-worker-0-there-is-bug/25643/7 in 2020, please + refer to it for more details about the problem. + The implementation here is referred to + https://discuss.pytorch.org/t/dataloader-when-num-worker-0-there-is-bug/25643/10 + And according to https://discuss.pytorch.org/t/dataloader-when-num-worker-0-there-is-bug/25643/37, + pytorch v1.7.1 and h5py v3.2.0 work well, so probably updating to the latest version can avoid the + issue I met. After all, this implementation may need to be updated in the near future. + + Parameters + ---------- + idx : int, + The index of the sample to be return. + + Returns + ------- + sample : list, + The collated data sample, a list including all necessary sample info. + """ + + if self.file_handler is None: + self.file_handler = self._open_file_handle() + + X = self.file_handler["X"][idx] + missing_mask = ~torch.isnan(X) + X = torch.nan_to_num(X) + sample = [ + torch.tensor(idx), + X.to(torch.float32), + missing_mask.to(torch.float32), + ] + + if ( + "y" in self.file_handler.keys() + ): # if the dataset has labels, then fetch it from the file + sample.append(self.file_handler["y"][idx].to(torch.long)) + + return sample + + def __getitem__(self, idx): + """Fetch data according to index. + + Parameters + ---------- + idx : int, + The index to fetch the specified sample. + + Returns + ------- + sample : list, + The collated data sample, a list including all necessary sample info. + """ + + sample = self.fetch_data(idx) + return sample From df2414b4a767c8f1b775b90301dacd936011a785 Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Sun, 19 Feb 2023 16:06:23 +0800 Subject: [PATCH 02/22] feat: add the file lazy-loading strategy for classes derived from BaseDataset; --- pypots/data/base.py | 5 +- pypots/data/dataset_for_brits.py | 127 ++++++++++++++++++++++--------- pypots/data/dataset_for_grud.py | 71 +++++++++++++---- pypots/data/dataset_for_mit.py | 46 +++++++++-- 4 files changed, 194 insertions(+), 55 deletions(-) diff --git a/pypots/data/base.py b/pypots/data/base.py index 462852e1..a1aff6b7 100644 --- a/pypots/data/base.py +++ b/pypots/data/base.py @@ -5,6 +5,7 @@ # Created by Wenjie Du # License: GPL-v3 +from abc import abstractmethod import h5py import torch from torch.utils.data import Dataset @@ -81,6 +82,7 @@ def _get_sample_num(self): def __len__(self): return self.sample_num + @abstractmethod def _fetch_data_from_array(self, idx): """Fetch data from self.X if it is given. @@ -134,9 +136,10 @@ def _open_file_handle(self): raise RuntimeError(e) return file_handler + @abstractmethod def _fetch_data_from_file(self, idx): """Fetch data with the lazy-loading strategy, i.e. only loading data from the file while requesting for samples. 
- Here the opened file handle does not load the entire dataset into RAM but only load the currently accessed slice + Here the opened file handle doesn't load the entire dataset into RAM but only load the currently accessed slice. Notes ----- diff --git a/pypots/data/dataset_for_brits.py b/pypots/data/dataset_for_brits.py index 0f3ee6a7..415d1530 100644 --- a/pypots/data/dataset_for_brits.py +++ b/pypots/data/dataset_for_brits.py @@ -5,6 +5,7 @@ # Created by Wenjie Du # License: GLP-v3 +import numpy as np import torch from pypots.data.base import BaseDataset @@ -56,43 +57,44 @@ class DatasetForBRITS(BaseDataset): Classification labels of according time-series samples. """ - def __init__(self, X, y=None): - super().__init__(X, y) - - # calculate all delta here. - # Training will take too much time if we put delta calculation in __getitem__(). - forward_missing_mask = (~torch.isnan(X)).type(torch.float32) - forward_X = torch.nan_to_num(X) - forward_delta = parse_delta(forward_missing_mask) - backward_X = torch.flip(forward_X, dims=[1]) - backward_missing_mask = torch.flip(forward_missing_mask, dims=[1]) - backward_delta = parse_delta(backward_missing_mask) - - self.data = { - "forward": { - "X": forward_X, - "missing_mask": forward_missing_mask, - "delta": forward_delta, - }, - "backward": { - "X": backward_X, - "missing_mask": backward_missing_mask, - "delta": backward_delta, - }, - } - - def __getitem__(self, idx): - """Fetch data according to index. + def __init__(self, X=None, y=None, file_path=None, file_type="h5py"): + super().__init__(X, y, file_path, file_type) + + if self.X is not None: + # calculate all delta here. + # Training will take too much time if we put delta calculation in __getitem__(). + forward_missing_mask = (~torch.isnan(X)).type(torch.float32) + forward_X = torch.nan_to_num(X) + forward_delta = parse_delta(forward_missing_mask) + backward_X = torch.flip(forward_X, dims=[1]) + backward_missing_mask = torch.flip(forward_missing_mask, dims=[1]) + backward_delta = parse_delta(backward_missing_mask) + + self.processed_data = { + "forward": { + "X": forward_X, + "missing_mask": forward_missing_mask, + "delta": forward_delta, + }, + "backward": { + "X": backward_X, + "missing_mask": backward_missing_mask, + "delta": backward_delta, + }, + } + + def _fetch_data_from_array(self, idx): + """Fetch data from self.X if it is given. Parameters ---------- idx : int, - The index to fetch the specified sample. + The index of the sample to be return. Returns ------- - dict, - A dict contains + sample : list, + A list contains index : int tensor, The index of the sample. 
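The lazy-loading pattern added to BaseDataset above, i.e. deferring h5py.File() until a sample is actually requested so that each DataLoader worker process ends up with its own file handle and only the accessed slice is read into memory (the workaround for the multi-worker issue cited in the Notes), can be distilled into the following standalone sketch. The class name and HDF5 keys are assumptions for illustration, not PyPOTS API:

import h5py
import torch
from torch.utils.data import Dataset, DataLoader

class LazyH5Dataset(Dataset):
    """Minimal sketch of per-sample lazy loading from an HDF5 file."""

    def __init__(self, file_path):
        self.file_path = file_path
        self.file_handle = None  # opened lazily, once per worker process
        with h5py.File(file_path, "r") as hf:  # open briefly only to count samples
            self.n_samples = len(hf["X"])

    def __len__(self):
        return self.n_samples

    def __getitem__(self, idx):
        if self.file_handle is None:  # first access inside this worker
            self.file_handle = h5py.File(self.file_path, "r")
        X = torch.from_numpy(self.file_handle["X"][idx])  # only this slice is read from disk
        missing_mask = (~torch.isnan(X)).to(torch.float32)
        X = torch.nan_to_num(X).to(torch.float32)
        return torch.tensor(idx), X, missing_mask

loader = DataLoader(LazyH5Dataset("toy_dataset.h5"), batch_size=32, num_workers=2)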
@@ -112,16 +114,69 @@ def __getitem__(self, idx): sample = [ torch.tensor(idx), # for forward - self.data["forward"]["X"][idx].to(torch.float32), - self.data["forward"]["missing_mask"][idx].to(torch.float32), - self.data["forward"]["delta"][idx].to(torch.float32), + self.processed_data["forward"]["X"][idx].to(torch.float32), + self.processed_data["forward"]["missing_mask"][idx].to(torch.float32), + self.processed_data["forward"]["delta"][idx].to(torch.float32), # for backward - self.data["backward"]["X"][idx].to(torch.float32), - self.data["backward"]["missing_mask"][idx].to(torch.float32), - self.data["backward"]["delta"][idx].to(torch.float32), + self.processed_data["backward"]["X"][idx].to(torch.float32), + self.processed_data["backward"]["missing_mask"][idx].to(torch.float32), + self.processed_data["backward"]["delta"][idx].to(torch.float32), ] if self.y is not None: sample.append(self.y[idx].to(torch.long)) return sample + + def _fetch_data_from_file(self, idx): + """Fetch data with the lazy-loading strategy, i.e. only loading data from the file while requesting for samples. + Here the opened file handle doesn't load the entire dataset into RAM but only load the currently accessed slice. + + Parameters + ---------- + idx : int, + The index of the sample to be return. + + Returns + ------- + sample : list, + The collated data sample, a list including all necessary sample info. + """ + + if self.file_handler is None: + self.file_handler = self._open_file_handle() + + X = self.file_handler["X"][idx] + missing_mask = (~np.isnan(X)).astype("float32") + X = np.nan_to_num(X) + + forward = { + "X": X, + "missing_mask": missing_mask, + "deltas": parse_delta(missing_mask), + } + + backward = { + "X": np.flip(forward["X"], axis=0).copy(), + "missing_mask": np.flip(forward["missing_mask"], axis=0).copy(), + } + backward["deltas"] = parse_delta(backward["missing_mask"]) + + sample = [ + torch.tensor(idx), + # for forward + torch.from_numpy(forward["X"].astype("float32")), + torch.from_numpy(forward["missing_mask"].astype("float32")), + torch.from_numpy(forward["deltas"].astype("float32")), + # for backward + torch.from_numpy(backward["X"].astype("float32")), + torch.from_numpy(backward["missing_mask"].astype("float32")), + torch.from_numpy(backward["deltas"].astype("float32")), + ] + + if ( + "y" in self.file_handler.keys() + ): # if the dataset has labels, then fetch it from the file + sample.append(self.file_handler["y"][idx].to(torch.long)) + + return sample diff --git a/pypots/data/dataset_for_grud.py b/pypots/data/dataset_for_grud.py index f3dd1d80..dccaa102 100644 --- a/pypots/data/dataset_for_grud.py +++ b/pypots/data/dataset_for_grud.py @@ -25,19 +25,21 @@ class DatasetForGRUD(BaseDataset): Classification labels of according time-series samples. 
""" - def __init__(self, X, y=None): - super().__init__(X, y) + def __init__(self, X=None, y=None, file_path=None, file_type="h5py"): + super().__init__(X, y, file_path, file_type) self.locf = LOCF() - self.missing_mask = (~torch.isnan(X)).to(torch.float32) - self.X = torch.nan_to_num(X) - self.deltas = parse_delta(self.missing_mask) - self.X_filledLOCF = self.locf.locf_torch(X) - self.empirical_mean = torch.sum( - self.missing_mask * self.X, dim=[0, 1] - ) / torch.sum(self.missing_mask, dim=[0, 1]) - - def __getitem__(self, idx): + + if self.X is not None: + self.missing_mask = (~torch.isnan(X)).to(torch.float32) + self.X_filledLOCF = self.locf.locf_torch(X) + self.X = torch.nan_to_num(X) + self.deltas = parse_delta(self.missing_mask) + self.empirical_mean = torch.sum( + self.missing_mask * self.X, dim=[0, 1] + ) / torch.sum(self.missing_mask, dim=[0, 1]) + + def _fetch_data_from_array(self, idx): """Fetch data according to index. Parameters @@ -47,8 +49,8 @@ def __getitem__(self, idx): Returns ------- - dict, - A dict contains + sample : list, + A list contains index : int tensor, The index of the sample. @@ -81,3 +83,46 @@ def __getitem__(self, idx): sample.append(self.y[idx].to(torch.long)) return sample + + def _fetch_data_from_file(self, idx): + """Fetch data with the lazy-loading strategy, i.e. only loading data from the file while requesting for samples. + Here the opened file handle doesn't load the entire dataset into RAM but only load the currently accessed slice. + + Parameters + ---------- + idx : int, + The index of the sample to be return. + + Returns + ------- + sample : list, + The collated data sample, a list including all necessary sample info. + """ + + if self.file_handler is None: + self.file_handler = self._open_file_handle() + + X = torch.from_numpy(self.file_handler["X"][idx]) + missing_mask = (~torch.isnan(X)).to(torch.float32) + X_filledLOCF = self.locf.locf_torch(X) + X = torch.nan_to_num(X) + deltas = parse_delta(missing_mask) + empirical_mean = torch.sum(missing_mask * X, dim=[0, 1]) / torch.sum( + missing_mask, dim=[0, 1] + ) + + sample = [ + torch.tensor(idx), + X, + X_filledLOCF, + missing_mask, + deltas, + empirical_mean, + ] + + if ( + "y" in self.file_handler.keys() + ): # if the dataset has labels, then fetch it from the file + sample.append(self.file_handler["y"][idx].to(torch.long)) + + return sample diff --git a/pypots/data/dataset_for_mit.py b/pypots/data/dataset_for_mit.py index b24e3f75..5da35590 100644 --- a/pypots/data/dataset_for_mit.py +++ b/pypots/data/dataset_for_mit.py @@ -36,11 +36,11 @@ class DatasetForMIT(BaseDataset): """ - def __init__(self, X, y=None, rate=0.2): - super().__init__(X, y) + def __init__(self, X=None, y=None, file_path=None, file_type="h5py", rate=0.2): + super().__init__(X, y, file_path, file_type) self.rate = rate - def __getitem__(self, idx): + def _fetch_data_from_array(self, idx): """Fetch data according to index. Parameters @@ -50,8 +50,8 @@ def __getitem__(self, idx): Returns ------- - dict, - A dict contains + sample : list, + A list contains index : int tensor, The index of the sample. @@ -83,3 +83,39 @@ def __getitem__(self, idx): sample.append(self.y[idx].to(torch.long)) return sample + + def _fetch_data_from_file(self, idx): + """Fetch data with the lazy-loading strategy, i.e. only loading data from the file while requesting for samples. + Here the opened file handle doesn't load the entire dataset into RAM but only load the currently accessed slice. 
+ + Parameters + ---------- + idx : int, + The index of the sample to be return. + + Returns + ------- + sample : list, + The collated data sample, a list including all necessary sample info. + """ + + if self.file_handler is None: + self.file_handler = self._open_file_handle() + + X = torch.from_numpy(self.file_handler["X"][idx]) + X_intact, X, missing_mask, indicating_mask = mcar(X, rate=self.rate) + + sample = [ + torch.tensor(idx), + X_intact.to(torch.float32), + X.to(torch.float32), + missing_mask.to(torch.float32), + indicating_mask.to(torch.float32), + ] + + if ( + "y" in self.file_handler.keys() + ): # if the dataset has labels, then fetch it from the file + sample.append(self.file_handler["y"][idx].to(torch.long)) + + return sample From 19c5bb3ce644b548c8c5b4cd24947ba3f26c15f0 Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Wed, 29 Mar 2023 21:20:46 +0800 Subject: [PATCH 03/22] doc: update the reference info; --- CITATION.cff | 4 ++-- README.md | 50 ++++++++++++++++++++++++++------------------------ 2 files changed, 28 insertions(+), 26 deletions(-) diff --git a/CITATION.cff b/CITATION.cff index 49eed6c0..64753889 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -5,7 +5,7 @@ authors: given-names: "Wenjie" orcid: "https://orcid.org/0000-0003-3046-7835" title: "PyPOTS: A Python Toolbox for Data Mining on Partially-Observed Time Series" -version: 0.0.7 -doi: 10.5281/zenodo.6823222 +version: 0.0.9 +doi: 10.5281/zenodo.6823221 date-released: 2022-07-12 url: "https://github.com/WenjieDu/PyPOTS" \ No newline at end of file diff --git a/README.md b/README.md index 686b3042..5f95c52e 100644 --- a/README.md +++ b/README.md @@ -5,44 +5,46 @@

- + - + - PyPI + PyPI - on anaconda + on anaconda - + - - - + + + + + - - - + + + + + + + - + - - - - - - - + + + - - - + + +

@@ -112,13 +114,13 @@ author = {Wenjie Du}, title = {{PyPOTS: A Python Toolbox for Data Mining on Partially-Observed Time Series}}, howpublished = {\url{https://github.com/wenjiedu/pypots}}, year = {2022}, -doi = {10.5281/zenodo.6823222}, +doi = {10.5281/zenodo.6823221}, } ``` or -`Wenjie Du. (2022). PyPOTS: A Python Toolbox for Data Mining on Partially-Observed Time Series. Zenodo. https://doi.org/10.5281/zenodo.6823222` +`Wenjie Du. (2022). PyPOTS: A Python Toolbox for Data Mining on Partially-Observed Time Series. Zenodo. https://doi.org/10.5281/zenodo.6823221` ## ❖ Attention 👀 The documentation and tutorials are under construction. And a short paper introducing PyPOTS is on the way! 🚀 Stay tuned please! From 3c56ce272ef7dc935b85662886f128e8847c4dc5 Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Thu, 30 Mar 2023 15:08:12 +0800 Subject: [PATCH 04/22] fix: imputation models applying MIT do not need use DatasetForMIT on val_set; --- pypots/imputation/base.py | 4 ++-- pypots/imputation/brits.py | 4 ++-- pypots/imputation/saits.py | 12 +++++++++--- pypots/imputation/transformer.py | 13 ++++++++++--- 4 files changed, 23 insertions(+), 10 deletions(-) diff --git a/pypots/imputation/base.py b/pypots/imputation/base.py index e62ae50c..a16769fe 100644 --- a/pypots/imputation/base.py +++ b/pypots/imputation/base.py @@ -164,8 +164,8 @@ def _train_model( with torch.no_grad(): for idx, data in enumerate(val_loader): inputs = self.assemble_input_for_validating(data) - results = self.model.forward(inputs) - imputation_collector.append(results["imputed_data"]) + imputed_data, _ = self.model.impute(inputs) + imputation_collector.append(imputed_data) imputation_collector = torch.cat(imputation_collector) imputation_collector = imputation_collector diff --git a/pypots/imputation/brits.py b/pypots/imputation/brits.py index d15c8e33..dd61b1e6 100644 --- a/pypots/imputation/brits.py +++ b/pypots/imputation/brits.py @@ -336,7 +336,7 @@ def impute(self, inputs): imputed_data_b = {"imputed_data_b": imputed_data_b} imputed_data_b = self.reverse(imputed_data_b)["imputed_data_b"] imputed_data = (imputed_data_f + imputed_data_b) / 2 - return imputed_data + return imputed_data, None @staticmethod def get_consistency_loss(pred_f, pred_b): @@ -620,7 +620,7 @@ def impute(self, X): with torch.no_grad(): for idx, data in enumerate(test_loader): inputs = self.assemble_input_for_testing(data) - imputed_data = self.model.impute(inputs) + imputed_data, _ = self.model.impute(inputs) imputation_collector.append(imputed_data) imputation_collector = torch.cat(imputation_collector) diff --git a/pypots/imputation/saits.py b/pypots/imputation/saits.py index d32bd0ab..4b3a33cc 100644 --- a/pypots/imputation/saits.py +++ b/pypots/imputation/saits.py @@ -230,7 +230,7 @@ def fit(self, train_X, val_X=None): val_X, 0.2 ) val_X = masked_fill(val_X, 1 - val_X_missing_mask, torch.nan) - val_set = DatasetForMIT(val_X) + val_set = BaseDataset(val_X) val_loader = DataLoader(val_set, batch_size=self.batch_size, shuffle=False) self._train_model( training_loader, val_loader, val_X_intact, val_X_indicating_mask @@ -282,7 +282,13 @@ def assemble_input_for_validating(self, data) -> dict: inputs : dict, A python dictionary contains the input data for model validating. """ - return self.assemble_input_for_training(data) + indices, X, missing_mask = data + + inputs = { + "X": X, + "missing_mask": missing_mask, + } + return inputs def assemble_input_for_testing(self, data) -> dict: """Assemble the given data into a dictionary for testing input. 
@@ -301,7 +307,7 @@ def assemble_input_for_testing(self, data) -> dict: inputs : dict, A python dictionary contains the input data for model testing. """ - return self.assemble_input_for_training(data) + return self.assemble_input_for_validating(data) def impute(self, X): X = self.check_input(self.n_steps, self.n_features, X) diff --git a/pypots/imputation/transformer.py b/pypots/imputation/transformer.py index c84c30b1..9ea288ed 100644 --- a/pypots/imputation/transformer.py +++ b/pypots/imputation/transformer.py @@ -320,7 +320,7 @@ def fit(self, train_X, val_X=None): val_X, 0.2 ) val_X = masked_fill(val_X, 1 - val_X_missing_mask, np.nan) - val_set = DatasetForMIT(val_X) + val_set = BaseDataset(val_X) val_loader = DataLoader(val_set, batch_size=self.batch_size, shuffle=False) self._train_model( training_loader, val_loader, val_X_intact, val_X_indicating_mask @@ -373,7 +373,14 @@ def assemble_input_for_validating(self, data) -> dict: inputs : dict, A python dictionary contains the input data for model validating. """ - return self.assemble_input_for_training(data) + indices, X, missing_mask = data + + inputs = { + "X": X, + "missing_mask": missing_mask, + } + + return inputs def assemble_input_for_testing(self, data) -> dict: """Assemble the given data into a dictionary for testing input. @@ -392,7 +399,7 @@ def assemble_input_for_testing(self, data) -> dict: inputs : dict, A python dictionary contains the input data for model testing. """ - return self.assemble_input_for_training(data) + return self.assemble_input_for_validating(data) def impute(self, X): X = self.check_input(self.n_steps, self.n_features, X) From 5927909f8d23a90eee8374805fe0291cc1ccc1aa Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Thu, 30 Mar 2023 15:35:38 +0800 Subject: [PATCH 05/22] fix: only import h5py when needed; --- pypots/data/base.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/pypots/data/base.py b/pypots/data/base.py index a1aff6b7..c4da8c79 100644 --- a/pypots/data/base.py +++ b/pypots/data/base.py @@ -6,7 +6,6 @@ # License: GPL-v3 from abc import abstractmethod -import h5py import torch from torch.utils.data import Dataset @@ -71,9 +70,10 @@ def _get_sample_num(self): """ if self.X is not None: sample_num = len(self.X) - elif self.file_type == "h5py": - with h5py.File(self.file_path, "r") as hf: - sample_num = len(hf["X"]) + elif self.file_path is not None and self.file_type == "h5py": + if self.file_handler is None: + self.file_handler = self._open_file_handle() + sample_num = len(self.file_handler["X"]) else: raise TypeError(f"So far only h5py is supported.") @@ -124,9 +124,15 @@ def _open_file_handle(self): """ try: + import h5py + file_handler = h5py.File( self.file_path, "r" ) # set swmr=True if the h5 file need to be written into new content during reading + except ImportError: + raise ImportError( + "h5py is missing and cannot be imported. Please install it first." + ) except OSError as e: raise TypeError( f"{e} This probably is caused by file type error. 
" From 4a9c5be0d1417872f197771f83336bbd85c3ae83 Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Thu, 30 Mar 2023 15:51:53 +0800 Subject: [PATCH 06/22] feat: move check_input() to BaseDataset; --- pypots/base.py | 96 ------------------------------- pypots/classification/brits.py | 5 -- pypots/classification/grud.py | 5 -- pypots/classification/raindrop.py | 5 -- pypots/clustering/crli.py | 2 - pypots/clustering/vader.py | 2 - pypots/data/base.py | 83 ++++++++++++++++++++++++++ pypots/forecasting/bttf.py | 1 - pypots/imputation/brits.py | 5 -- pypots/imputation/saits.py | 4 -- pypots/imputation/transformer.py | 4 -- 11 files changed, 83 insertions(+), 129 deletions(-) diff --git a/pypots/base.py b/pypots/base.py index 49b1b0c2..0f2e69e4 100644 --- a/pypots/base.py +++ b/pypots/base.py @@ -8,7 +8,6 @@ import os from abc import ABC -import numpy as np import torch from pypots.utils.files import create_dir_if_not_exist @@ -32,101 +31,6 @@ def __init__(self, device): else: self.device = device - def check_input( - self, expected_n_steps, expected_n_features, X, y=None, out_dtype="tensor" - ): - """Check value type and shape of input X and y - - Parameters - ---------- - expected_n_steps : int - Number of time steps of input time series (X) that the model expects. - This value is the same with the argument `n_steps` used to initialize the model. - - expected_n_features : int - Number of feature dimensions of input time series (X) that the model expects. - This value is the same with the argument `n_features` used to initialize the model. - - X : array-like, - Time-series data that must have a shape like [n_samples, expected_n_steps, expected_n_features]. - - y : array-like, default=None - Labels of time-series samples (X) that must have a shape like [n_samples] or [n_samples, n_classes]. 
- - out_dtype : str, in ['tensor', 'ndarray'], default='tensor' - Data type of the output, should be np.ndarray or torch.Tensor - - Returns - ------- - X : tensor - - y : tensor - """ - assert out_dtype in [ - "tensor", - "ndarray", - ], f'out_dtype should be "tensor" or "ndarray", but got {out_dtype}' - is_list = isinstance(X, list) - is_array = isinstance(X, np.ndarray) - is_tensor = isinstance(X, torch.Tensor) - assert is_tensor or is_array or is_list, TypeError( - "X should be an instance of list/np.ndarray/torch.Tensor, " - f"but got {type(X)}" - ) - - # convert the data type if in need - if out_dtype == "tensor": - if is_list: - X = torch.tensor(X).to(self.device) - elif is_array: - X = torch.from_numpy(X).to(self.device) - else: # is tensor - X = X.to(self.device) - else: # out_dtype is ndarray - # convert to np.ndarray first for shape check - if is_list: - X = np.asarray(X) - elif is_tensor: - X = X.numpy() - else: # is ndarray - pass - - # check the shape of X here - X_shape = X.shape - assert len(X_shape) == 3, ( - f"input should have 3 dimensions [n_samples, seq_len, n_features]," - f"but got shape={X.shape}" - ) - assert ( - X_shape[1] == expected_n_steps - ), f"expect X.shape[1] to be {expected_n_steps}, but got {X_shape[1]}" - assert ( - X_shape[2] == expected_n_features - ), f"expect X.shape[2] to be {expected_n_features}, but got {X_shape[2]}" - - if y is not None: - assert len(X) == len(y), ( - f"lengths of X and y must match, " f"but got f{len(X)} and {len(y)}" - ) - if isinstance(y, torch.Tensor): - y = y.to(self.device) if out_dtype == "tensor" else y.numpy() - elif isinstance(y, list): - y = ( - torch.tensor(y).to(self.device) - if out_dtype == "tensor" - else np.asarray(y) - ) - elif isinstance(y, np.ndarray): - y = torch.from_numpy(y).to(self.device) if out_dtype == "tensor" else y - else: - raise TypeError( - "y should be an instance of list/np.ndarray/torch.Tensor, " - f"but got {type(y)}" - ) - return X, y - else: - return X - def save_logs_to_tensorboard(self, saving_path): """Save logs (self.logger) into a tensorboard file. diff --git a/pypots/classification/brits.py b/pypots/classification/brits.py index 5ef03860..961979d5 100644 --- a/pypots/classification/brits.py +++ b/pypots/classification/brits.py @@ -196,10 +196,6 @@ def fit(self, train_X, train_y, val_X=None, val_y=None): self : object, Trained model. """ - train_X, train_y = self.check_input( - self.n_steps, self.n_features, train_X, train_y - ) - val_X, val_y = self.check_input(self.n_steps, self.n_features, val_X, val_y) training_set = DatasetForBRITS( train_X, train_y @@ -326,7 +322,6 @@ def assemble_input_for_testing(self, data) -> dict: return inputs def classify(self, X): - X = self.check_input(self.n_steps, self.n_features, X) self.model.eval() # set the model as eval status to freeze it. test_set = DatasetForBRITS(X) test_loader = DataLoader(test_set, batch_size=self.batch_size, shuffle=False) diff --git a/pypots/classification/grud.py b/pypots/classification/grud.py index 69929dcc..4435afeb 100644 --- a/pypots/classification/grud.py +++ b/pypots/classification/grud.py @@ -160,10 +160,6 @@ def fit(self, train_X, train_y, val_X=None, val_y=None): self : object, Trained model. 
""" - train_X, train_y = self.check_input( - self.n_steps, self.n_features, train_X, train_y - ) - val_X, val_y = self.check_input(self.n_steps, self.n_features, val_X, val_y) training_set = DatasetForGRUD(train_X, train_y) training_loader = DataLoader( @@ -260,7 +256,6 @@ def assemble_input_for_testing(self, data) -> dict: return inputs def classify(self, X): - X = self.check_input(self.n_steps, self.n_features, X) self.model.eval() # set the model as eval status to freeze it. test_set = DatasetForGRUD(X) test_loader = DataLoader(test_set, batch_size=self.batch_size, shuffle=False) diff --git a/pypots/classification/raindrop.py b/pypots/classification/raindrop.py index c6204bc5..fd5f8885 100644 --- a/pypots/classification/raindrop.py +++ b/pypots/classification/raindrop.py @@ -681,10 +681,6 @@ def fit(self, train_X, train_y, val_X=None, val_y=None): self : object, Trained model. """ - train_X, train_y = self.check_input( - self.n_steps, self.n_features, train_X, train_y - ) - val_X, val_y = self.check_input(self.n_steps, self.n_features, val_X, val_y) training_set = DatasetForGRUD(train_X, train_y) training_loader = DataLoader( @@ -789,7 +785,6 @@ def assemble_input_for_testing(self, data) -> dict: return inputs def classify(self, X): - X = self.check_input(self.n_steps, self.n_features, X) self.model.eval() # set the model as eval status to freeze it. test_set = DatasetForGRUD(X) test_loader = DataLoader(test_set, batch_size=self.batch_size, shuffle=False) diff --git a/pypots/clustering/crli.py b/pypots/clustering/crli.py index b0bd9723..2cbd1c33 100644 --- a/pypots/clustering/crli.py +++ b/pypots/clustering/crli.py @@ -353,7 +353,6 @@ def __init__( self.logger = {"training_loss_generator": [], "training_loss_discriminator": []} def fit(self, train_X): - train_X = self.check_input(self.n_steps, self.n_features, train_X) training_set = DatasetForGRUD(train_X) training_loader = DataLoader( training_set, batch_size=self.batch_size, shuffle=True @@ -516,7 +515,6 @@ def _train_model(self, training_loader, val_loader=None): logger.info("Finished training.") def cluster(self, X): - X = self.check_input(self.n_steps, self.n_features, X) self.model.eval() # set the model as eval status to freeze it. test_set = DatasetForGRUD(X) test_loader = DataLoader(test_set, batch_size=self.batch_size, shuffle=False) diff --git a/pypots/clustering/vader.py b/pypots/clustering/vader.py index 14f682fe..141e772e 100644 --- a/pypots/clustering/vader.py +++ b/pypots/clustering/vader.py @@ -379,7 +379,6 @@ def __init__( self._print_model_size() def fit(self, train_X): - train_X = self.check_input(self.n_steps, self.n_features, train_X) training_set = DatasetForGRUD(train_X) training_loader = DataLoader( training_set, batch_size=self.batch_size, shuffle=True @@ -558,7 +557,6 @@ def _train_model(self, training_loader, val_loader=None): logger.info("Finished training.") def cluster(self, X): - X = self.check_input(self.n_steps, self.n_features, X) self.model.eval() # set the model as eval status to freeze it. 
test_set = DatasetForGRUD(X) test_loader = DataLoader(test_set, batch_size=self.batch_size, shuffle=False) diff --git a/pypots/data/base.py b/pypots/data/base.py index c4da8c79..4339b761 100644 --- a/pypots/data/base.py +++ b/pypots/data/base.py @@ -6,6 +6,8 @@ # License: GPL-v3 from abc import abstractmethod + +import numpy as np import torch from torch.utils.data import Dataset @@ -44,17 +46,23 @@ def __init__(self, X=None, y=None, file_path=None, file_type="h5py"): file_type in SUPPORTED_DATASET_FILE_TYPE ), f"file_type should be one of {SUPPORTED_DATASET_FILE_TYPE}, but got {file_type}" + if X is not None: + X, y = self.check_input(X, y) + self.X = X self.y = y self.file_path = file_path self.file_type = file_type + if self.file_path is not None: self.file_handler = self._open_file_handle() assert ( "X" in self.file_handler.keys() ), "The given dataset file doesn't contains X. Please double check." + self.sample_num = self._get_sample_num() + # set up function fetch_data() if self.X is not None: self.fetch_data = self._fetch_data_from_array else: @@ -82,6 +90,81 @@ def _get_sample_num(self): def __len__(self): return self.sample_num + def check_input(self, X, y=None, out_dtype="tensor"): + """Check value type and shape of input X and y + + Parameters + ---------- + X : array-like, + Time-series data that must have a shape like [n_samples, expected_n_steps, expected_n_features]. + + y : array-like, default=None + Labels of time-series samples (X) that must have a shape like [n_samples] or [n_samples, n_classes]. + + out_dtype : str, in ['tensor', 'ndarray'], default='tensor' + Data type of the output, should be np.ndarray or torch.Tensor + + Returns + ------- + X : array-like + + y : array-like + """ + assert out_dtype in [ + "tensor", + "ndarray", + ], f'out_dtype should be "tensor" or "ndarray", but got {out_dtype}' + + is_list = isinstance(X, list) + is_array = isinstance(X, np.ndarray) + is_tensor = isinstance(X, torch.Tensor) + assert is_tensor or is_array or is_list, TypeError( + "X should be an instance of list/np.ndarray/torch.Tensor, " + f"but got {type(X)}" + ) + + # convert the data type if in need + if out_dtype == "tensor": + if is_list: + X = torch.tensor(X).to() + elif is_array: + X = torch.from_numpy(X).to() + else: # is tensor + pass + else: # out_dtype is ndarray + # convert to np.ndarray first for shape check + if is_list: + X = np.asarray(X) + elif is_tensor: + X = X.numpy() + else: # is ndarray + pass + + # check the shape of X here + X_shape = X.shape + assert len(X_shape) == 3, ( + f"input should have 3 dimensions [n_samples, seq_len, n_features]," + f"but got shape={X_shape}" + ) + + if y is not None: + assert len(X) == len(y), ( + f"lengths of X and y must match, " f"but got f{len(X)} and {len(y)}" + ) + if isinstance(y, torch.Tensor): + y = y if out_dtype == "tensor" else y.numpy() + elif isinstance(y, list): + y = torch.tensor(y) if out_dtype == "tensor" else np.asarray(y) + elif isinstance(y, np.ndarray): + y = torch.from_numpy(y) if out_dtype == "tensor" else y + else: + raise TypeError( + "y should be an instance of list/np.ndarray/torch.Tensor, " + f"but got {type(y)}" + ) + + return X, y + @abstractmethod def _fetch_data_from_array(self, idx): """Fetch data from self.X if it is given. 
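With check_input() relocated into BaseDataset, input validation now happens when a dataset object is constructed rather than inside every model. A hedged usage sketch of the two construction paths available at this point in the series (in-memory arrays versus a file path; patch 09 later merges these into a single data argument), using DatasetForGRUD as a concrete subclass and the toy h5 file sketched earlier:

import numpy as np
from torch.utils.data import DataLoader
from pypots.data.dataset_for_grud import DatasetForGRUD

X = np.random.randn(64, 24, 10).astype("float32")  # [n_samples, n_steps, n_features], NaNs allowed
y = np.random.randint(0, 2, size=64)

in_memory_set = DatasetForGRUD(X=X, y=y)            # check_input() validates shapes and converts to tensors
lazy_set = DatasetForGRUD(file_path="toy_dataset.h5")  # or read samples lazily from an h5 file

loader = DataLoader(in_memory_set, batch_size=16, shuffle=True)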
diff --git a/pypots/forecasting/bttf.py b/pypots/forecasting/bttf.py index 03711d5f..4f81cb4c 100644 --- a/pypots/forecasting/bttf.py +++ b/pypots/forecasting/bttf.py @@ -462,7 +462,6 @@ def fit(self, train_X): warnings.warn("Please run func forecast(X) directly.") def forecast(self, X): - self.check_input(self.n_steps, self.n_features, X, out_dtype="ndarray") X = X.transpose((0, 2, 1)) pred = BTTF_forecast( diff --git a/pypots/imputation/brits.py b/pypots/imputation/brits.py index dd61b1e6..b93311bd 100644 --- a/pypots/imputation/brits.py +++ b/pypots/imputation/brits.py @@ -511,10 +511,6 @@ def fit(self, train_X, val_X=None): self : object, Trained model. """ - train_X = self.check_input(self.n_steps, self.n_features, train_X) - if val_X is not None: - val_X = self.check_input(self.n_steps, self.n_features, val_X) - training_set = DatasetForBRITS(train_X) # time_gaps is necessary for BRITS training_loader = DataLoader( training_set, batch_size=self.batch_size, shuffle=True @@ -611,7 +607,6 @@ def assemble_input_for_testing(self, data) -> dict: return self.assemble_input_for_training(data) def impute(self, X): - X = self.check_input(self.n_steps, self.n_features, X) self.model.eval() # set the model as eval status to freeze it. test_set = DatasetForBRITS(X) test_loader = DataLoader(test_set, batch_size=self.batch_size, shuffle=False) diff --git a/pypots/imputation/saits.py b/pypots/imputation/saits.py index 4b3a33cc..981aa544 100644 --- a/pypots/imputation/saits.py +++ b/pypots/imputation/saits.py @@ -215,10 +215,6 @@ def __init__( self._print_model_size() def fit(self, train_X, val_X=None): - train_X = self.check_input(self.n_steps, self.n_features, train_X) - if val_X is not None: - val_X = self.check_input(self.n_steps, self.n_features, val_X) - training_set = DatasetForMIT(train_X) training_loader = DataLoader( training_set, batch_size=self.batch_size, shuffle=True diff --git a/pypots/imputation/transformer.py b/pypots/imputation/transformer.py index 9ea288ed..a9c8e221 100644 --- a/pypots/imputation/transformer.py +++ b/pypots/imputation/transformer.py @@ -305,9 +305,6 @@ def __init__( self._print_model_size() def fit(self, train_X, val_X=None): - train_X = self.check_input(self.n_steps, self.n_features, train_X) - if val_X is not None: - val_X = self.check_input(self.n_steps, self.n_features, val_X) training_set = DatasetForMIT(train_X) training_loader = DataLoader( @@ -402,7 +399,6 @@ def assemble_input_for_testing(self, data) -> dict: return self.assemble_input_for_validating(data) def impute(self, X): - X = self.check_input(self.n_steps, self.n_features, X) self.model.eval() # set the model as eval status to freeze it. test_set = BaseDataset(X) test_loader = DataLoader(test_set, batch_size=self.batch_size, shuffle=False) From c71c8faa928e30926a1912d27f9ab9a346d4aa73 Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Thu, 30 Mar 2023 16:18:55 +0800 Subject: [PATCH 07/22] fix: correct mistaken operator from & to ^; --- pypots/data/base.py | 17 +++++++++++------ pypots/data/dataset_for_brits.py | 4 ++-- pypots/data/dataset_for_grud.py | 6 +++--- 3 files changed, 16 insertions(+), 11 deletions(-) diff --git a/pypots/data/base.py b/pypots/data/base.py index 4339b761..382915a4 100644 --- a/pypots/data/base.py +++ b/pypots/data/base.py @@ -38,10 +38,14 @@ def __init__(self, X=None, y=None, file_path=None, file_type="h5py"): # types and shapes had been checked after X and y input into the model # So they are safe to use here. No need to check again. 
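The hunk that follows replaces the original pair of contradictory assertions with a single exclusive-or check. A quick illustration of why ^ expresses "exactly one of X and file_path must be provided" (purely explanatory, not part of the patch):

# (X is None) ^ (file_path is None) is True only when exactly one of the two is None:
#   X given,  file_path None   ->  False ^ True  == True   (valid: use the in-memory array)
#   X None,   file_path given  ->  True  ^ False == True   (valid: lazy-load from the file)
#   both None                  ->  True  ^ True  == False  (rejected by the assertion)
#   both given                 ->  False ^ False == False  (rejected by the assertion)
assert (X is None) ^ (file_path is None), "Exactly one of X and file_path must be given."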
- assert X is None and file_path is None, f"X and file_path cannot both be None." - assert ( - X is not None and file_path is not None + assert (X is None) ^ ( + file_path is None + ), f"X and file_path cannot both be None." + + assert (X is not None) ^ ( + file_path is not None ), f"X and file_path cannot both be given. Either of them should be given." + assert ( file_type in SUPPORTED_DATASET_FILE_TYPE ), f"file_type should be one of {SUPPORTED_DATASET_FILE_TYPE}, but got {file_type}" @@ -90,7 +94,8 @@ def _get_sample_num(self): def __len__(self): return self.sample_num - def check_input(self, X, y=None, out_dtype="tensor"): + @staticmethod + def check_input(X, y=None, out_dtype="tensor"): """Check value type and shape of input X and y Parameters @@ -126,9 +131,9 @@ def check_input(self, X, y=None, out_dtype="tensor"): # convert the data type if in need if out_dtype == "tensor": if is_list: - X = torch.tensor(X).to() + X = torch.tensor(X) elif is_array: - X = torch.from_numpy(X).to() + X = torch.from_numpy(X) else: # is tensor pass else: # out_dtype is ndarray diff --git a/pypots/data/dataset_for_brits.py b/pypots/data/dataset_for_brits.py index 38e1feeb..139e7f4c 100644 --- a/pypots/data/dataset_for_brits.py +++ b/pypots/data/dataset_for_brits.py @@ -63,8 +63,8 @@ def __init__(self, X=None, y=None, file_path=None, file_type="h5py"): if self.X is not None: # calculate all delta here. # Training will take too much time if we put delta calculation in __getitem__(). - forward_missing_mask = (~torch.isnan(X)).type(torch.float32) - forward_X = torch.nan_to_num(X) + forward_missing_mask = (~torch.isnan(self.X)).type(torch.float32) + forward_X = torch.nan_to_num(self.X) forward_delta = parse_delta(forward_missing_mask) backward_X = torch.flip(forward_X, dims=[1]) backward_missing_mask = torch.flip(forward_missing_mask, dims=[1]) diff --git a/pypots/data/dataset_for_grud.py b/pypots/data/dataset_for_grud.py index dccaa102..f7dd9df5 100644 --- a/pypots/data/dataset_for_grud.py +++ b/pypots/data/dataset_for_grud.py @@ -31,9 +31,9 @@ def __init__(self, X=None, y=None, file_path=None, file_type="h5py"): self.locf = LOCF() if self.X is not None: - self.missing_mask = (~torch.isnan(X)).to(torch.float32) - self.X_filledLOCF = self.locf.locf_torch(X) - self.X = torch.nan_to_num(X) + self.missing_mask = (~torch.isnan(self.X)).to(torch.float32) + self.X_filledLOCF = self.locf.locf_torch(self.X) + self.X = torch.nan_to_num(self.X) self.deltas = parse_delta(self.missing_mask) self.empirical_mean = torch.sum( self.missing_mask * self.X, dim=[0, 1] From af4586a60d27f913c69527dada8bb1c0133b5f44 Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Thu, 30 Mar 2023 16:51:24 +0800 Subject: [PATCH 08/22] fix: turn imputation to numpy.ndarray in the validation stage; --- pypots/imputation/base.py | 2 +- pypots/imputation/saits.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/pypots/imputation/base.py b/pypots/imputation/base.py index a16769fe..743ce958 100644 --- a/pypots/imputation/base.py +++ b/pypots/imputation/base.py @@ -168,7 +168,7 @@ def _train_model( imputation_collector.append(imputed_data) imputation_collector = torch.cat(imputation_collector) - imputation_collector = imputation_collector + imputation_collector = imputation_collector.numpy() mean_val_loss = cal_mae( imputation_collector, val_X_intact, val_indicating_mask diff --git a/pypots/imputation/saits.py b/pypots/imputation/saits.py index 981aa544..9832865c 100644 --- a/pypots/imputation/saits.py +++ b/pypots/imputation/saits.py @@ 
-306,7 +306,6 @@ def assemble_input_for_testing(self, data) -> dict: return self.assemble_input_for_validating(data) def impute(self, X): - X = self.check_input(self.n_steps, self.n_features, X) self.model.eval() # set the model as eval status to freeze it. test_set = BaseDataset(X) test_loader = DataLoader(test_set, batch_size=self.batch_size, shuffle=False) From fababb1f3586285907c836a1fdd123a832030c56 Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Fri, 31 Mar 2023 01:23:36 +0800 Subject: [PATCH 09/22] feat: update the data given and input logic to support loading dataset from files; --- pypots/classification/base.py | 48 ++++++++++----- pypots/classification/brits.py | 60 ++++++++++++++----- pypots/classification/grud.py | 56 ++++++++++++++---- pypots/classification/raindrop.py | 52 ++++++++++++---- pypots/clustering/base.py | 26 +++++--- pypots/clustering/crli.py | 44 ++++++++++++-- pypots/clustering/vader.py | 44 ++++++++++++-- pypots/data/base.py | 92 +++++++++++++---------------- pypots/data/dataset_for_brits.py | 33 ++++++----- pypots/data/dataset_for_grud.py | 30 ++++++---- pypots/data/dataset_for_mit.py | 35 ++++++----- pypots/forecasting/base.py | 29 +++++++-- pypots/forecasting/bttf.py | 25 +++++++- pypots/imputation/base.py | 40 +++++++++---- pypots/imputation/brits.py | 67 ++++++++++++++++----- pypots/imputation/locf.py | 50 +++++++++++++--- pypots/imputation/saits.py | 69 +++++++++++++++++++--- pypots/imputation/transformer.py | 66 ++++++++++++++++++--- pypots/tests/test_classification.py | 40 ++++--------- pypots/tests/test_clustering.py | 24 ++++---- pypots/tests/test_forecasting.py | 10 ++-- pypots/tests/test_imputation.py | 52 +++++++--------- 22 files changed, 701 insertions(+), 291 deletions(-) diff --git a/pypots/classification/base.py b/pypots/classification/base.py index 598902aa..27dcac5a 100644 --- a/pypots/classification/base.py +++ b/pypots/classification/base.py @@ -22,19 +22,31 @@ def __init__(self, device): super().__init__(device) @abstractmethod - def fit(self, train_X, train_y, val_X=None, val_y=None): - """Train the classifier. + def fit(self, train_set, val_set=None, file_type="h5py"): + """Train the classifier on the given data. Parameters ---------- - train_X : array-like of shape [n_samples, sequence length (time steps), n_features], - Time-series data for training, can contain missing values. - train_y : array, - Classification labels for training. - val_X : array-like of shape [n_samples, sequence length (time steps), n_features], - Time-series data for validation, can contain missing values. - val_y : array, - Classification labels for validation. + train_set : dict or str, + The dataset for model training, should be a dictionary including keys as 'X' and 'y', + or a path string locating a data file. + If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features], + which is time-series data for training, can contain missing values, and y should be array-like of shape + [n_samples], which is classification labels of X. + If it is a path string, the path should point to a data file, e.g. a h5 file, which contains + key-value pairs like a dict, and it has to include keys as 'X' and 'y'. + + val_set : dict or str, + The dataset for model validating, should be a dictionary including keys as 'X' and 'y', + or a path string locating a data file. 
+ If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features], + which is time-series data for validating, can contain missing values, and y should be array-like of shape + [n_samples], which is classification labels of X. + If it is a path string, the path should point to a data file, e.g. a h5 file, which contains + key-value pairs like a dict, and it has to include keys as 'X' and 'y'. + + file_type : str, default = "h5py", + The type of the given file if train_set and val_set are path strings. Returns ------- @@ -44,18 +56,22 @@ def fit(self, train_X, train_y, val_X=None, val_y=None): return self @abstractmethod - def classify(self, X): - """Classify the input with the trained model. + def classify(self, X, file_type="h5py"): + """Classify the input data with the trained model. Parameters ---------- - X : array-like of shape [n_samples, sequence length (time steps), n_features], - Time-series data contains missing values. + X : array-like or str, + The data samples for testing, should be array-like of shape [n_samples, sequence length (time steps), + n_features], or a path string locating a data file, e.g. h5 file. + + file_type : str, default = "h5py", + The type of the given file if X is a path string. Returns ------- - array-like, shape [n_samples, sequence length (time steps), n_features], - Classification results. + array-like, shape [n_samples], + Classification results of the given samples. """ pass diff --git a/pypots/classification/brits.py b/pypots/classification/brits.py index 961979d5..6cd9a959 100644 --- a/pypots/classification/brits.py +++ b/pypots/classification/brits.py @@ -123,8 +123,6 @@ class BRITS(BaseNNClassifier): The underlying BRITS model. optimizer : object, The optimizer for model training. - data_loader : object, - The data loader for dataset loading. Parameters ---------- @@ -181,33 +179,47 @@ def __init__( self.model = self.model.to(self.device) self._print_model_size() - def fit(self, train_X, train_y, val_X=None, val_y=None): - """Fit the model on the given training data. + def fit(self, train_set, val_set=None, file_type="h5py"): + """Train the classifier on the given data. Parameters ---------- - train_X : array, shape [n_samples, sequence length (time steps), n_features], - Time-series vectors. - train_y : array, - Classification labels. + train_set : dict or str, + The dataset for model training, should be a dictionary including keys as 'X' and 'y', + or a path string locating a data file. + If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features], + which is time-series data for training, can contain missing values, and y should be array-like of shape + [n_samples], which is classification labels of X. + If it is a path string, the path should point to a data file, e.g. a h5 file, which contains + key-value pairs like a dict, and it has to include keys as 'X' and 'y'. + + val_set : dict or str, + The dataset for model validating, should be a dictionary including keys as 'X' and 'y', + or a path string locating a data file. + If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features], + which is time-series data for validating, can contain missing values, and y should be array-like of shape + [n_samples], which is classification labels of X. + If it is a path string, the path should point to a data file, e.g. a h5 file, which contains + key-value pairs like a dict, and it has to include keys as 'X' and 'y'. 
+ + file_type : str, default = "h5py" + The type of the given file if train_set and val_set are path strings. Returns ------- self : object, - Trained model. + Trained classifier. """ - training_set = DatasetForBRITS( - train_X, train_y - ) # time_gaps is necessary for BRITS + training_set = DatasetForBRITS(train_set) training_loader = DataLoader( training_set, batch_size=self.batch_size, shuffle=True ) - if val_X is None: + if val_set is None: self._train_model(training_loader) else: - val_set = DatasetForBRITS(val_X, val_y) + val_set = DatasetForBRITS(val_set) val_loader = DataLoader(val_set, batch_size=self.batch_size, shuffle=False) self._train_model(training_loader, val_loader) @@ -321,9 +333,25 @@ def assemble_input_for_testing(self, data) -> dict: } return inputs - def classify(self, X): + def classify(self, X, file_type="h5py"): + """Classify the input data with the trained model. + + Parameters + ---------- + X : array-like or str, + The data samples for testing, should be array-like of shape [n_samples, sequence length (time steps), + n_features], or a path string locating a data file, e.g. h5 file. + + file_type : str, default = "h5py", + The type of the given file if X is a path string. + + Returns + ------- + array-like, shape [n_samples], + Classification results of the given samples. + """ self.model.eval() # set the model as eval status to freeze it. - test_set = DatasetForBRITS(X) + test_set = DatasetForBRITS(X, file_type) test_loader = DataLoader(test_set, batch_size=self.batch_size, shuffle=False) prediction_collector = [] diff --git a/pypots/classification/grud.py b/pypots/classification/grud.py index 4435afeb..fb13df4f 100644 --- a/pypots/classification/grud.py +++ b/pypots/classification/grud.py @@ -145,31 +145,47 @@ def __init__( self.model = self.model.to(self.device) self._print_model_size() - def fit(self, train_X, train_y, val_X=None, val_y=None): - """Fit the model on the given training data. + def fit(self, train_set, val_set=None, file_type="h5py"): + """Train the classifier on the given data. Parameters ---------- - train_X : array, shape [n_samples, sequence length (time steps), n_features], - Time-series vectors. - train_y : array, - Classification labels. + train_set : dict or str, + The dataset for model training, should be a dictionary including keys as 'X' and 'y', + or a path string locating a data file. + If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features], + which is time-series data for training, can contain missing values, and y should be array-like of shape + [n_samples], which is classification labels of X. + If it is a path string, the path should point to a data file, e.g. a h5 file, which contains + key-value pairs like a dict, and it has to include keys as 'X' and 'y'. + + val_set : dict or str, + The dataset for model validating, should be a dictionary including keys as 'X' and 'y', + or a path string locating a data file. + If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features], + which is time-series data for validating, can contain missing values, and y should be array-like of shape + [n_samples], which is classification labels of X. + If it is a path string, the path should point to a data file, e.g. a h5 file, which contains + key-value pairs like a dict, and it has to include keys as 'X' and 'y'. + + file_type : str, default = "h5py" + The type of the given file if train_set and val_set are path strings. 
Returns ------- self : object, - Trained model. + Trained classifier. """ - training_set = DatasetForGRUD(train_X, train_y) + training_set = DatasetForGRUD(train_set, file_type) training_loader = DataLoader( training_set, batch_size=self.batch_size, shuffle=True ) - if val_X is None: + if val_set is None: self._train_model(training_loader) else: - val_set = DatasetForGRUD(val_X, val_y) + val_set = DatasetForGRUD(val_set) val_loader = DataLoader(val_set, batch_size=self.batch_size, shuffle=False) self._train_model(training_loader, val_loader) @@ -255,9 +271,25 @@ def assemble_input_for_testing(self, data) -> dict: return inputs - def classify(self, X): + def classify(self, X, file_type="h5py"): + """Classify the input data with the trained model. + + Parameters + ---------- + X : array-like or str, + The data samples for testing, should be array-like of shape [n_samples, sequence length (time steps), + n_features], or a path string locating a data file, e.g. h5 file. + + file_type : str, default = "h5py", + The type of the given file if X is a path string. + + Returns + ------- + array-like, shape [n_samples], + Classification results of the given samples. + """ self.model.eval() # set the model as eval status to freeze it. - test_set = DatasetForGRUD(X) + test_set = DatasetForGRUD(X, file_type) test_loader = DataLoader(test_set, batch_size=self.batch_size, shuffle=False) prediction_collector = [] diff --git a/pypots/classification/raindrop.py b/pypots/classification/raindrop.py index fd5f8885..31220608 100644 --- a/pypots/classification/raindrop.py +++ b/pypots/classification/raindrop.py @@ -666,15 +666,31 @@ def __init__( self.model = self.model.to(self.device) self._print_model_size() - def fit(self, train_X, train_y, val_X=None, val_y=None): + def fit(self, train_set, val_set=None, file_type="h5py"): """Fit the model on the given training data. Parameters ---------- - train_X : array, shape [n_samples, sequence length (time steps), n_features], - Time-series vectors. - train_y : array, - Classification labels. + train_set : dict or str, + The dataset for model training, should be a dictionary including keys as 'X' and 'y', + or a path string locating a data file. + If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features], + which is time-series data for training, can contain missing values, and y should be array-like of shape + [n_samples], which is classification labels of X. + If it is a path string, the path should point to a data file, e.g. a h5 file, which contains + key-value pairs like a dict, and it has to include keys as 'X' and 'y'. + + val_set : dict or str, + The dataset for model validating, should be a dictionary including keys as 'X' and 'y', + or a path string locating a data file. + If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features], + which is time-series data for validating, can contain missing values, and y should be array-like of shape + [n_samples], which is classification labels of X. + If it is a path string, the path should point to a data file, e.g. a h5 file, which contains + key-value pairs like a dict, and it has to include keys as 'X' and 'y'. + + file_type : str, default = "h5py" + The type of the given file if train_set and val_set are path strings. Returns ------- @@ -682,15 +698,15 @@ def fit(self, train_X, train_y, val_X=None, val_y=None): Trained model. 
""" - training_set = DatasetForGRUD(train_X, train_y) + training_set = DatasetForGRUD(train_set) training_loader = DataLoader( training_set, batch_size=self.batch_size, shuffle=True ) - if val_X is None: + if val_set is None: self._train_model(training_loader) else: - val_set = DatasetForGRUD(val_X, val_y) + val_set = DatasetForGRUD(val_set) val_loader = DataLoader(val_set, batch_size=self.batch_size, shuffle=False) self._train_model(training_loader, val_loader) @@ -784,9 +800,25 @@ def assemble_input_for_testing(self, data) -> dict: return inputs - def classify(self, X): + def classify(self, X, file_type="h5py"): + """Classify the input data with the trained model. + + Parameters + ---------- + X : array-like or str, + The data samples for testing, should be array-like of shape [n_samples, sequence length (time steps), + n_features], or a path string locating a data file, e.g. h5 file. + + file_type : str, default = "h5py", + The type of the given file if X is a path string. + + Returns + ------- + array-like, shape [n_samples], + Classification results of the given samples. + """ self.model.eval() # set the model as eval status to freeze it. - test_set = DatasetForGRUD(X) + test_set = DatasetForGRUD(X, file_type) test_loader = DataLoader(test_set, batch_size=self.batch_size, shuffle=False) prediction_collector = [] diff --git a/pypots/clustering/base.py b/pypots/clustering/base.py index f3cc8c2e..8b66eb35 100644 --- a/pypots/clustering/base.py +++ b/pypots/clustering/base.py @@ -22,13 +22,21 @@ def __init__(self, device): super().__init__(device) @abstractmethod - def fit(self, train_X): + def fit(self, train_set, file_type="h5py"): """Train the cluster. Parameters ---------- - train_X : array-like of shape [n_samples, sequence length (time steps), n_features], - Time-series data for training, can contain missing values. + train_set : dict or str, + The dataset for model training, should be a dictionary including the key 'X', + or a path string locating a data file. + If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features], + which is time-series data for training, can contain missing values. + If it is a path string, the path should point to a data file, e.g. a h5 file, which contains + key-value pairs like a dict, and it has to include the key 'X'. + + file_type : str, default = "h5py" + The type of the given file if train_set is a path string. Returns ------- @@ -38,17 +46,21 @@ def fit(self, train_X): return self @abstractmethod - def cluster(self, X): + def cluster(self, X, file_type="h5py"): """Cluster the input with the trained model. Parameters ---------- - X : array-like of shape [n_samples, sequence length (time steps), n_features], - Time-series data contains missing values. + X : array-like or str, + The data samples for testing, should be array-like of shape [n_samples, sequence length (time steps), + n_features], or a path string locating a data file, e.g. h5 file. + + file_type : str, default = "h5py" + The type of the given file if X is a path string. Returns ------- - array-like, shape [n_samples, sequence length (time steps), n_features], + array-like, shape [n_samples], Clustering results. 
""" pass diff --git a/pypots/clustering/crli.py b/pypots/clustering/crli.py index 2cbd1c33..b062fc33 100644 --- a/pypots/clustering/crli.py +++ b/pypots/clustering/crli.py @@ -352,8 +352,28 @@ def __init__( self._print_model_size() self.logger = {"training_loss_generator": [], "training_loss_discriminator": []} - def fit(self, train_X): - training_set = DatasetForGRUD(train_X) + def fit(self, train_set, file_type="h5py"): + """Train the cluster. + + Parameters + ---------- + train_set : dict or str, + The dataset for model training, should be a dictionary including the key 'X', + or a path string locating a data file. + If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features], + which is time-series data for training, can contain missing values. + If it is a path string, the path should point to a data file, e.g. a h5 file, which contains + key-value pairs like a dict, and it has to include the key 'X'. + + file_type : str, default = "h5py" + The type of the given file if train_set is a path string. + + Returns + ------- + self : object, + Trained classifier. + """ + training_set = DatasetForGRUD(train_set, file_type) training_loader = DataLoader( training_set, batch_size=self.batch_size, shuffle=True ) @@ -514,9 +534,25 @@ def _train_model(self, training_loader, val_loader=None): logger.info("Finished training.") - def cluster(self, X): + def cluster(self, X, file_type="h5py"): + """Cluster the input with the trained model. + + Parameters + ---------- + X : array-like or str, + The data samples for testing, should be array-like of shape [n_samples, sequence length (time steps), + n_features], or a path string locating a data file, e.g. h5 file. + + file_type : str, default = "h5py" + The type of the given file if X is a path string. + + Returns + ------- + array-like, shape [n_samples], + Clustering results. + """ self.model.eval() # set the model as eval status to freeze it. - test_set = DatasetForGRUD(X) + test_set = DatasetForGRUD(X, file_type) test_loader = DataLoader(test_set, batch_size=self.batch_size, shuffle=False) latent_collector = [] diff --git a/pypots/clustering/vader.py b/pypots/clustering/vader.py index 141e772e..128743a4 100644 --- a/pypots/clustering/vader.py +++ b/pypots/clustering/vader.py @@ -378,8 +378,28 @@ def __init__( self.model = self.model.to(self.device) self._print_model_size() - def fit(self, train_X): - training_set = DatasetForGRUD(train_X) + def fit(self, train_set, file_type="h5py"): + """Train the cluster. + + Parameters + ---------- + train_set : dict or str, + The dataset for model training, should be a dictionary including the key 'X', + or a path string locating a data file. + If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features], + which is time-series data for training, can contain missing values. + If it is a path string, the path should point to a data file, e.g. a h5 file, which contains + key-value pairs like a dict, and it has to include the key 'X'. + + file_type : str, default = "h5py" + The type of the given file if train_set is a path string. + + Returns + ------- + self : object, + Trained classifier. 
+ """ + training_set = DatasetForGRUD(train_set, file_type) training_loader = DataLoader( training_set, batch_size=self.batch_size, shuffle=True ) @@ -556,9 +576,25 @@ def _train_model(self, training_loader, val_loader=None): logger.info("Finished training.") - def cluster(self, X): + def cluster(self, X, file_type="h5py"): + """Cluster the input with the trained model. + + Parameters + ---------- + X : array-like or str, + The data samples for testing, should be array-like of shape [n_samples, sequence length (time steps), + n_features], or a path string locating a data file, e.g. h5 file. + + file_type : str, default = "h5py" + The type of the given file if X is a path string. + + Returns + ------- + array-like, shape [n_samples], + Clustering results. + """ self.model.eval() # set the model as eval status to freeze it. - test_set = DatasetForGRUD(X) + test_set = DatasetForGRUD(X, file_type) test_loader = DataLoader(test_set, batch_size=self.batch_size, shuffle=False) clustering_results_collector = [] diff --git a/pypots/data/base.py b/pypots/data/base.py index 382915a4..84707c2d 100644 --- a/pypots/data/base.py +++ b/pypots/data/base.py @@ -18,52 +18,45 @@ class BaseDataset(Dataset): """Base dataset class in PyPOTS. - Parameters - ---------- - X : tensor, shape of [n_samples, n_steps, n_features] - Time-series feature vector. - - y : tensor, shape of [n_samples], optional, default=None, - Classification labels of according time-series samples. - - file_path : str, - The path to the dataset file. - - file_type : str, - The type of the given file, should be one of `numpy`, `h5py`, `pickle`. + data : dict or str, + The dataset for model input, should be a dictionary including keys as 'X' and 'y', + or a path string locating a data file. + If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features], + which is time-series data for input, can contain missing values, and y should be array-like of shape + [n_samples], which is classification labels of X. + If it is a path string, the path should point to a data file, e.g. a h5 file, which contains + key-value pairs like a dict, and it has to include keys as 'X' and 'y'. + + file_type : str, default = "h5py" + The type of the given file if train_set and val_set are path strings. """ - def __init__(self, X=None, y=None, file_path=None, file_type="h5py"): + def __init__(self, data, file_type="h5py"): super().__init__() # types and shapes had been checked after X and y input into the model # So they are safe to use here. No need to check again. - assert (X is None) ^ ( - file_path is None - ), f"X and file_path cannot both be None." - - assert (X is not None) ^ ( - file_path is not None - ), f"X and file_path cannot both be given. Either of them should be given." 
- - assert ( - file_type in SUPPORTED_DATASET_FILE_TYPE - ), f"file_type should be one of {SUPPORTED_DATASET_FILE_TYPE}, but got {file_type}" - - if X is not None: - X, y = self.check_input(X, y) + self.data = data + if isinstance(data, str): + self.file_type = file_type - self.X = X - self.y = y - self.file_path = file_path - self.file_type = file_type + # check if the given file type is supported + assert ( + file_type in SUPPORTED_DATASET_FILE_TYPE + ), f"file_type should be one of {SUPPORTED_DATASET_FILE_TYPE}, but got {file_type}" - if self.file_path is not None: - self.file_handler = self._open_file_handle() + # open the file handle + self.file_handle = self._open_file_handle() + # check if X exists in the file assert ( - "X" in self.file_handler.keys() + "X" in self.file_handle.keys() ), "The given dataset file doesn't contains X. Please double check." + else: + X = data["X"] + y = None if "y" not in data.keys() else data["y"] + self.X, self.y = self.check_input(X, y) + self.sample_num = self._get_sample_num() # set up function fetch_data() @@ -80,14 +73,12 @@ def _get_sample_num(self): sample_num : int The number of the samples in the given dataset. """ - if self.X is not None: - sample_num = len(self.X) - elif self.file_path is not None and self.file_type == "h5py": - if self.file_handler is None: - self.file_handler = self._open_file_handle() - sample_num = len(self.file_handler["X"]) + if isinstance(self.data, str): + if self.file_handle is None: + self.file_handle = self._open_file_handle() + sample_num = len(self.file_handle["X"]) else: - raise TypeError(f"So far only h5py is supported.") + sample_num = len(self.X) return sample_num @@ -208,14 +199,15 @@ def _open_file_handle(self): Returns ------- - file_handle : file. + file_handle : file """ + data_file_path = self.data try: import h5py file_handler = h5py.File( - self.file_path, "r" + data_file_path, "r" ) # set swmr=True if the h5 file need to be written into new content during reading except ImportError: raise ImportError( @@ -224,7 +216,7 @@ def _open_file_handle(self): except OSError as e: raise TypeError( f"{e} This probably is caused by file type error. " - f"Please confirm that the given file {self.file_path} is an h5 file." + f"Please confirm that the given file {data_file_path} is an h5 file." ) except Exception as e: raise RuntimeError(e) @@ -257,10 +249,10 @@ def _fetch_data_from_file(self, idx): The collated data sample, a list including all necessary sample info. """ - if self.file_handler is None: - self.file_handler = self._open_file_handle() + if self.file_handle is None: + self.file_handle = self._open_file_handle() - X = self.file_handler["X"][idx] + X = self.file_handle["X"][idx] missing_mask = ~torch.isnan(X) X = torch.nan_to_num(X) sample = [ @@ -270,9 +262,9 @@ def _fetch_data_from_file(self, idx): ] if ( - "y" in self.file_handler.keys() + "y" in self.file_handle.keys() ): # if the dataset has labels, then fetch it from the file - sample.append(self.file_handler["y"][idx].to(torch.long)) + sample.append(self.file_handle["y"][idx].to(torch.long)) return sample diff --git a/pypots/data/dataset_for_brits.py b/pypots/data/dataset_for_brits.py index 139e7f4c..e7a74b13 100644 --- a/pypots/data/dataset_for_brits.py +++ b/pypots/data/dataset_for_brits.py @@ -50,19 +50,24 @@ class DatasetForBRITS(BaseDataset): Parameters ---------- - X : tensor, shape of [n_samples, n_steps, n_features] - Time-series data. 
- - y : tensor, shape of [n_samples], optional, default=None, - Classification labels of according time-series samples. + data : dict or str, + The dataset for model input, should be a dictionary including keys as 'X' and 'y', + or a path string locating a data file. + If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features], + which is time-series data for input, can contain missing values, and y should be array-like of shape + [n_samples], which is classification labels of X. + If it is a path string, the path should point to a data file, e.g. a h5 file, which contains + key-value pairs like a dict, and it has to include keys as 'X' and 'y'. + + file_type : str, default = "h5py" + The type of the given file if train_set and val_set are path strings. """ - def __init__(self, X=None, y=None, file_path=None, file_type="h5py"): - super().__init__(X, y, file_path, file_type) + def __init__(self, data, file_type="h5py"): + super().__init__(data, file_type) - if self.X is not None: + if not isinstance(self.data, str): # calculate all delta here. - # Training will take too much time if we put delta calculation in __getitem__(). forward_missing_mask = (~torch.isnan(self.X)).type(torch.float32) forward_X = torch.nan_to_num(self.X) forward_delta = parse_delta(forward_missing_mask) @@ -143,10 +148,10 @@ def _fetch_data_from_file(self, idx): The collated data sample, a list including all necessary sample info. """ - if self.file_handler is None: - self.file_handler = self._open_file_handle() + if self.file_handle is None: + self.file_handle = self._open_file_handle() - X = self.file_handler["X"][idx] + X = self.file_handle["X"][idx] missing_mask = (~np.isnan(X)).astype("float32") X = np.nan_to_num(X) @@ -175,8 +180,8 @@ def _fetch_data_from_file(self, idx): ] if ( - "y" in self.file_handler.keys() + "y" in self.file_handle.keys() ): # if the dataset has labels, then fetch it from the file - sample.append(self.file_handler["y"][idx].to(torch.long)) + sample.append(self.file_handle["y"][idx].to(torch.long)) return sample diff --git a/pypots/data/dataset_for_grud.py b/pypots/data/dataset_for_grud.py index f7dd9df5..8db41843 100644 --- a/pypots/data/dataset_for_grud.py +++ b/pypots/data/dataset_for_grud.py @@ -18,15 +18,21 @@ class DatasetForGRUD(BaseDataset): Parameters ---------- - X : tensor, shape of [n_samples, seq_len, n_features] - Time-series feature vector. - - y : tensor, shape of [n_samples], optional, default=None, - Classification labels of according time-series samples. + data : dict or str, + The dataset for model input, should be a dictionary including keys as 'X' and 'y', + or a path string locating a data file. + If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features], + which is time-series data for input, can contain missing values, and y should be array-like of shape + [n_samples], which is classification labels of X. + If it is a path string, the path should point to a data file, e.g. a h5 file, which contains + key-value pairs like a dict, and it has to include keys as 'X' and 'y'. + + file_type : str, default = "h5py" + The type of the given file if train_set and val_set are path strings. 
""" - def __init__(self, X=None, y=None, file_path=None, file_type="h5py"): - super().__init__(X, y, file_path, file_type) + def __init__(self, data, file_type="h5py"): + super().__init__(data, file_type) self.locf = LOCF() @@ -99,10 +105,10 @@ def _fetch_data_from_file(self, idx): The collated data sample, a list including all necessary sample info. """ - if self.file_handler is None: - self.file_handler = self._open_file_handle() + if self.file_handle is None: + self.file_handle = self._open_file_handle() - X = torch.from_numpy(self.file_handler["X"][idx]) + X = torch.from_numpy(self.file_handle["X"][idx]) missing_mask = (~torch.isnan(X)).to(torch.float32) X_filledLOCF = self.locf.locf_torch(X) X = torch.nan_to_num(X) @@ -121,8 +127,8 @@ def _fetch_data_from_file(self, idx): ] if ( - "y" in self.file_handler.keys() + "y" in self.file_handle.keys() ): # if the dataset has labels, then fetch it from the file - sample.append(self.file_handler["y"][idx].to(torch.long)) + sample.append(self.file_handle["y"][idx].to(torch.long)) return sample diff --git a/pypots/data/dataset_for_mit.py b/pypots/data/dataset_for_mit.py index 89fa78e9..787e7c9e 100644 --- a/pypots/data/dataset_for_mit.py +++ b/pypots/data/dataset_for_mit.py @@ -18,26 +18,29 @@ class DatasetForMIT(BaseDataset): Parameters ---------- - X : tensor, shape of [n_samples, n_steps, n_features] - Time-series feature vector. - - y : tensor, shape of [n_samples], optional, default=None, - Classification labels of according time-series samples. + data : dict or str, + The dataset for model input, should be a dictionary including keys as 'X' and 'y', + or a path string locating a data file. + If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features], + which is time-series data for input, can contain missing values, and y should be array-like of shape + [n_samples], which is classification labels of X. + If it is a path string, the path should point to a data file, e.g. a h5 file, which contains + key-value pairs like a dict, and it has to include keys as 'X' and 'y'. + + file_type : str, default = "h5py" + The type of the given file if train_set and val_set are path strings. rate : float, in (0,1), Artificially missing rate, rate of the observed values which will be artificially masked as missing. - - Note that, - `rate` = (number of artificially missing values) / np.sum(~np.isnan(self.data)), + Note that, `rate` = (number of artificially missing values) / np.sum(~np.isnan(self.data)), not (number of artificially missing values) / np.product(self.data.shape), considering that the given data may already contain missing values, the latter way may be confusing because if the original missing rate >= `rate`, the function will do nothing, i.e. it won't play the role it has to be. - """ - def __init__(self, X=None, y=None, file_path=None, file_type="h5py", rate=0.2): - super().__init__(X, y, file_path, file_type) + def __init__(self, data, file_type="h5py", rate=0.2): + super().__init__(data, file_type) self.rate = rate def _fetch_data_from_array(self, idx): @@ -99,10 +102,10 @@ def _fetch_data_from_file(self, idx): The collated data sample, a list including all necessary sample info. 
""" - if self.file_handler is None: - self.file_handler = self._open_file_handle() + if self.file_handle is None: + self.file_handle = self._open_file_handle() - X = torch.from_numpy(self.file_handler["X"][idx]) + X = torch.from_numpy(self.file_handle["X"][idx]) X_intact, X, missing_mask, indicating_mask = mcar(X, rate=self.rate) sample = [ @@ -114,8 +117,8 @@ def _fetch_data_from_file(self, idx): ] if ( - "y" in self.file_handler.keys() + "y" in self.file_handle.keys() ): # if the dataset has labels, then fetch it from the file - sample.append(self.file_handler["y"][idx].to(torch.long)) + sample.append(self.file_handle["y"][idx].to(torch.long)) return sample diff --git a/pypots/forecasting/base.py b/pypots/forecasting/base.py index 282b0336..5423657c 100644 --- a/pypots/forecasting/base.py +++ b/pypots/forecasting/base.py @@ -22,13 +22,29 @@ def __init__(self, device): super().__init__(device) @abstractmethod - def fit(self, train_X): - """Train the cluster. + def fit(self, train_set, val_set=None, file_type="h5py"): + """Train the classifier on the given data. Parameters ---------- - train_X : array-like of shape [n_samples, sequence length (time steps), n_features], - Time-series data for training, can contain missing values. + train_set : dict or str, + The dataset for model training, should be a dictionary including the key 'X', + or a path string locating a data file. + If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features], + which is time-series data for training, can contain missing values. + If it is a path string, the path should point to a data file, e.g. a h5 file, which contains + key-value pairs like a dict, and it has to include the key 'X'. + + val_set : dict or str, + The dataset for model validating, should be a dictionary including the key 'X', + or a path string locating a data file. + If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features], + which is time-series data for validation, can contain missing values. + If it is a path string, the path should point to a data file, e.g. a h5 file, which contains + key-value pairs like a dict, and it has to include the key 'X'. + + file_type : str, default = "h5py", + The type of the given file if train_set and val_set are path strings. Returns ------- @@ -38,7 +54,7 @@ def fit(self, train_X): return self @abstractmethod - def forecast(self, X): + def forecast(self, X, file_type="h5py"): """Forecast the future the input with the trained model. Parameters @@ -46,6 +62,9 @@ def forecast(self, X): X : array-like of shape [n_samples, sequence length (time steps), n_features], Time-series data containing missing values. + file_type : str, default = "h5py" + The type of the given file if X is a path string. + Returns ------- array-like, shape [n_samples, prediction_horizon, n_features], diff --git a/pypots/forecasting/bttf.py b/pypots/forecasting/bttf.py index 4f81cb4c..4bcd1cf2 100644 --- a/pypots/forecasting/bttf.py +++ b/pypots/forecasting/bttf.py @@ -458,10 +458,31 @@ def __init__( self.burn_iter = burn_iter self.gibbs_iter = gibbs_iter - def fit(self, train_X): + def fit(self, train_set, val_set=None, file_type="h5py"): warnings.warn("Please run func forecast(X) directly.") - def forecast(self, X): + def forecast(self, X, file_type="h5py"): + """Forecast the future the input with the trained model. 
+ + Parameters + ---------- + X : array-like or str, + The data samples for testing, should be array-like of shape [n_samples, sequence length (time steps), + n_features], or a path string locating a data file, e.g. h5 file. + + file_type : str, default = "h5py" + The type of the given file if X is a path string. + + Returns + ------- + array-like, shape [n_samples, prediction_horizon, n_features], + Forecasting results. + """ + assert not isinstance( + X, str + ), "BTTF so far does not accept file input. It needs a specified Dataset class." + + X = X["X"] X = X.transpose((0, 2, 1)) pred = BTTF_forecast( diff --git a/pypots/imputation/base.py b/pypots/imputation/base.py index 743ce958..dbb70d9c 100644 --- a/pypots/imputation/base.py +++ b/pypots/imputation/base.py @@ -28,31 +28,49 @@ def __init__(self, device): super().__init__(device) @abstractmethod - def fit(self, train_X, val_X=None): - """Train the imputer. + def fit(self, train_set, val_set=None, file_type="h5py"): + """Train the imputer on the given data. Parameters ---------- - train_X : array-like, shape: [n_samples, sequence length (time steps), n_features], - Time-series data for training, can contain missing values. - val_X : array-like, optional, shape [n_samples, sequence length (time steps), n_features], - Time-series data for validating, can contain missing values. + train_set : dict or str, + The dataset for model training, should be a dictionary including the key 'X', + or a path string locating a data file. + If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features], + which is time-series data for training, can contain missing values. + If it is a path string, the path should point to a data file, e.g. a h5 file, which contains + key-value pairs like a dict, and it has to include the key 'X'. + + val_set : dict or str, + The dataset for model validating, should be a dictionary including the key 'X', + or a path string locating a data file. + If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features], + which is time-series data for validating, can contain missing values. + If it is a path string, the path should point to a data file, e.g. a h5 file, which contains + key-value pairs like a dict, and it has to include the key 'X'. + + file_type : str, default = "h5py", + The type of the given file if train_set and val_set are path strings. Returns ------- self : object, - Trained imputer. + The trained imputer. """ return self @abstractmethod - def impute(self, X): - """Impute missing data with the trained model. + def impute(self, X, file_type="h5py"): + """Impute missing values in the given data with the trained model. Parameters ---------- - X : array-like of shape [n_samples, sequence length (time steps), n_features], - Time-series data for imputing contains missing values. + X : array-like or str, + The data samples for testing, should be array-like of shape [n_samples, sequence length (time steps), + n_features], or a path string locating a data file, e.g. h5 file. + + file_type : str, default = "h5py", + The type of the given file if X is a path string. 
Returns ------- diff --git a/pypots/imputation/brits.py b/pypots/imputation/brits.py index b93311bd..19d8450d 100644 --- a/pypots/imputation/brits.py +++ b/pypots/imputation/brits.py @@ -6,6 +6,7 @@ # License: GPL-v3 import math +import numpy as np import torch import torch.nn as nn @@ -495,36 +496,58 @@ def __init__( self.model = self.model.to(self.device) self._print_model_size() - def fit(self, train_X, val_X=None): - """Fit the model on the given training data. + def fit(self, train_set, val_set=None, file_type="h5py"): + """Train the imputer on the given data. Parameters ---------- - train_X : array-like, shape of [n_samples, n_steps, n_features], - Data for training. - - val_X : array-like, optional, shape of [n_samples, n_steps, n_features], - Data for validating. + train_set : dict or str, + The dataset for model training, should be a dictionary including the key 'X', + or a path string locating a data file. + If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features], + which is time-series data for training, can contain missing values. + If it is a path string, the path should point to a data file, e.g. a h5 file, which contains + key-value pairs like a dict, and it has to include the key 'X'. + + val_set : dict or str, + The dataset for model validating, should be a dictionary including the key 'X', + or a path string locating a data file. + If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features], + which is time-series data for validating, can contain missing values. + If it is a path string, the path should point to a data file, e.g. a h5 file, which contains + key-value pairs like a dict, and it has to include the key 'X'. + + file_type : str, default = "h5py", + The type of the given file if train_set and val_set are path strings. Returns ------- self : object, - Trained model. + The trained imputer. """ - training_set = DatasetForBRITS(train_X) # time_gaps is necessary for BRITS + training_set = DatasetForBRITS(train_set, file_type) training_loader = DataLoader( training_set, batch_size=self.batch_size, shuffle=True ) - if val_X is None: + if val_set is None: self._train_model(training_loader) else: + if isinstance(val_set, str): + import h5py + + with h5py.File(val_set, "r") as hf: + val_X = hf["X"] + val_set = {"X": val_X} + val_X_intact, val_X, val_X_missing_mask, val_X_indicating_mask = mcar( - val_X, 0.2 + val_set["X"], 0.2 ) - val_X = masked_fill(val_X, 1 - val_X_missing_mask, torch.nan) - val_set = DatasetForBRITS(val_X) + val_X = masked_fill(val_X, 1 - val_X_missing_mask, np.nan) + val_set["X"] = val_X + val_set = DatasetForBRITS(val_set) val_loader = DataLoader(val_set, batch_size=self.batch_size, shuffle=False) + self._train_model( training_loader, val_loader, val_X_intact, val_X_indicating_mask ) @@ -606,7 +629,23 @@ def assemble_input_for_testing(self, data) -> dict: """ return self.assemble_input_for_training(data) - def impute(self, X): + def impute(self, X, file_type="h5py"): + """Impute missing values in the given data with the trained model. + + Parameters + ---------- + X : array-like or str, + The data samples for testing, should be array-like of shape [n_samples, sequence length (time steps), + n_features], or a path string locating a data file, e.g. h5 file. + + file_type : str, default = "h5py", + The type of the given file if X is a path string. + + Returns + ------- + array-like, shape [n_samples, sequence length (time steps), n_features], + Imputed data. 
+ """ self.model.eval() # set the model as eval status to freeze it. test_set = DatasetForBRITS(X) test_loader = DataLoader(test_set, batch_size=self.batch_size, shuffle=False) diff --git a/pypots/imputation/locf.py b/pypots/imputation/locf.py index 2d391bb9..9bdde882 100644 --- a/pypots/imputation/locf.py +++ b/pypots/imputation/locf.py @@ -26,7 +26,35 @@ def __init__(self, nan=0): super().__init__("cpu") self.nan = nan - def fit(self, train_X, val_X=None): + def fit(self, train_set, val_set=None, file_type="h5py"): + """Train the imputer on the given data. + + Parameters + ---------- + train_set : dict or str, + The dataset for model training, should be a dictionary including the key 'X', + or a path string locating a data file. + If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features], + which is time-series data for training, can contain missing values. + If it is a path string, the path should point to a data file, e.g. a h5 file, which contains + key-value pairs like a dict, and it has to include the key 'X'. + + val_set : dict or str, + The dataset for model validating, should be a dictionary including the key 'X', + or a path string locating a data file. + If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features], + which is time-series data for validating, can contain missing values. + If it is a path string, the path should point to a data file, e.g. a h5 file, which contains + key-value pairs like a dict, and it has to include the key 'X'. + + file_type : str, default = "h5py", + The type of the given file if train_set and val_set are path strings. + + Returns + ------- + self : object, + The trained imputer. + """ warnings.warn( "LOCF (Last Observed Carried Forward) imputation class has no parameter to train. " "Please run func impute(X) directly." @@ -103,19 +131,27 @@ def locf_torch(self, X): return X_imputed - def impute(self, X): - """Impute missing values + def impute(self, X, file_type="h5py"): + """Impute missing values in the given data with the trained model. Parameters ---------- - X : array-like, - Time-series vectors containing missing values (NaN). + X : array-like or str, + The data samples for testing, should be array-like of shape [n_samples, sequence length (time steps), + n_features], or a path string locating a data file, e.g. h5 file. + + file_type : str, default = "h5py", + The type of the given file if X is a path string. Returns ------- - array-like, - Imputed time series. + array-like, shape [n_samples, sequence length (time steps), n_features], + Imputed data. """ + + assert not isinstance(X, str) + X = X["X"] + assert len(X.shape) == 3, ( f"Input X should have 3 dimensions [n_samples, n_steps, n_features], " f"but the actual shape of X: {X.shape}" diff --git a/pypots/imputation/saits.py b/pypots/imputation/saits.py index 9832865c..627e2b6a 100644 --- a/pypots/imputation/saits.py +++ b/pypots/imputation/saits.py @@ -6,6 +6,7 @@ # Created by Wenjie Du # License: GPL-v3 +import numpy as np import torch import torch.nn as nn import torch.nn.functional as F @@ -214,19 +215,55 @@ def __init__( self.model = self.model.to(self.device) self._print_model_size() - def fit(self, train_X, val_X=None): - training_set = DatasetForMIT(train_X) + def fit(self, train_set, val_set=None, file_type="h5py"): + """Train the imputer on the given data. 
+ + Parameters + ---------- + train_set : dict or str, + The dataset for model training, should be a dictionary including the key 'X', + or a path string locating a data file. + If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features], + which is time-series data for training, can contain missing values. + If it is a path string, the path should point to a data file, e.g. a h5 file, which contains + key-value pairs like a dict, and it has to include the key 'X'. + + val_set : dict or str, + The dataset for model validating, should be a dictionary including the key 'X', + or a path string locating a data file. + If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features], + which is time-series data for validating, can contain missing values. + If it is a path string, the path should point to a data file, e.g. a h5 file, which contains + key-value pairs like a dict, and it has to include the key 'X'. + + file_type : str, default = "h5py", + The type of the given file if train_set and val_set are path strings. + + Returns + ------- + self : object, + The trained imputer. + """ + training_set = DatasetForMIT(train_set, file_type) training_loader = DataLoader( training_set, batch_size=self.batch_size, shuffle=True ) - if val_X is None: + if val_set is None: self._train_model(training_loader) else: + if isinstance(val_set, str): + import h5py + + with h5py.File(val_set, "r") as hf: + val_X = hf["X"] + val_set = {"X": val_X} + val_X_intact, val_X, val_X_missing_mask, val_X_indicating_mask = mcar( - val_X, 0.2 + val_set["X"], 0.2 ) - val_X = masked_fill(val_X, 1 - val_X_missing_mask, torch.nan) - val_set = BaseDataset(val_X) + val_X = masked_fill(val_X, 1 - val_X_missing_mask, np.nan) + val_set["X"] = val_X + val_set = BaseDataset(val_set) val_loader = DataLoader(val_set, batch_size=self.batch_size, shuffle=False) self._train_model( training_loader, val_loader, val_X_intact, val_X_indicating_mask @@ -305,9 +342,25 @@ def assemble_input_for_testing(self, data) -> dict: """ return self.assemble_input_for_validating(data) - def impute(self, X): + def impute(self, X, file_type="h5py"): + """Impute missing values in the given data with the trained model. + + Parameters + ---------- + X : array-like or str, + The data samples for testing, should be array-like of shape [n_samples, sequence length (time steps), + n_features], or a path string locating a data file, e.g. h5 file. + + file_type : str, default = "h5py", + The type of the given file if X is a path string. + + Returns + ------- + array-like, shape [n_samples, sequence length (time steps), n_features], + Imputed data. + """ self.model.eval() # set the model as eval status to freeze it. - test_set = BaseDataset(X) + test_set = BaseDataset(X, file_type) test_loader = DataLoader(test_set, batch_size=self.batch_size, shuffle=False) imputation_collector = [] diff --git a/pypots/imputation/transformer.py b/pypots/imputation/transformer.py index a9c8e221..4b89a94c 100644 --- a/pypots/imputation/transformer.py +++ b/pypots/imputation/transformer.py @@ -304,20 +304,56 @@ def __init__( self.model = self.model.to(self.device) self._print_model_size() - def fit(self, train_X, val_X=None): + def fit(self, train_set, val_set=None, file_type="h5py"): + """Train the imputer on the given data. 
- training_set = DatasetForMIT(train_X) + Parameters + ---------- + train_set : dict or str, + The dataset for model training, should be a dictionary including the key 'X', + or a path string locating a data file. + If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features], + which is time-series data for training, can contain missing values. + If it is a path string, the path should point to a data file, e.g. a h5 file, which contains + key-value pairs like a dict, and it has to include the key 'X'. + + val_set : dict or str, + The dataset for model validating, should be a dictionary including the key 'X', + or a path string locating a data file. + If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features], + which is time-series data for validating, can contain missing values. + If it is a path string, the path should point to a data file, e.g. a h5 file, which contains + key-value pairs like a dict, and it has to include the key 'X'. + + file_type : str, default = "h5py", + The type of the given file if train_set and val_set are path strings. + + Returns + ------- + self : object, + The trained imputer. + """ + + training_set = DatasetForMIT(train_set, file_type) training_loader = DataLoader( training_set, batch_size=self.batch_size, shuffle=True ) - if val_X is None: + if val_set is None: self._train_model(training_loader) else: + if isinstance(val_set, str): + import h5py + + with h5py.File(val_set, "r") as hf: + val_X = hf["X"] + val_set = {"X": val_X} + val_X_intact, val_X, val_X_missing_mask, val_X_indicating_mask = mcar( - val_X, 0.2 + val_set["X"], 0.2 ) val_X = masked_fill(val_X, 1 - val_X_missing_mask, np.nan) - val_set = BaseDataset(val_X) + val_set["X"] = val_X + val_set = BaseDataset(val_set) val_loader = DataLoader(val_set, batch_size=self.batch_size, shuffle=False) self._train_model( training_loader, val_loader, val_X_intact, val_X_indicating_mask @@ -398,9 +434,25 @@ def assemble_input_for_testing(self, data) -> dict: """ return self.assemble_input_for_validating(data) - def impute(self, X): + def impute(self, X, file_type="h5py"): + """Impute missing values in the given data with the trained model. + + Parameters + ---------- + X : array-like or str, + The data samples for testing, should be array-like of shape [n_samples, sequence length (time steps), + n_features], or a path string locating a data file, e.g. h5 file. + + file_type : str, default = "h5py", + The type of the given file if X is a path string. + + Returns + ------- + array-like, shape [n_samples, sequence length (time steps), n_features], + Imputed data. + """ self.model.eval() # set the model as eval status to freeze it. 
- test_set = BaseDataset(X) + test_set = BaseDataset(X, file_type) test_loader = DataLoader(test_set, batch_size=self.batch_size, shuffle=False) imputation_collector = [] diff --git a/pypots/tests/test_classification.py b/pypots/tests/test_classification.py index 8148980b..034d65ab 100644 --- a/pypots/tests/test_classification.py +++ b/pypots/tests/test_classification.py @@ -14,15 +14,13 @@ EPOCHS = 5 +TRAIN_SET = {"X": DATA["train_X"], "y": DATA["train_y"]} +VAL_SET = {"X": DATA["val_X"], "y": DATA["val_y"]} +TEST_SET = {"X": DATA["test_X"]} + class TestBRITS(unittest.TestCase): def setUp(self) -> None: - self.train_X = DATA["train_X"] - self.train_y = DATA["train_y"] - self.val_X = DATA["val_X"] - self.val_y = DATA["val_y"] - self.test_X = DATA["test_X"] - self.test_y = DATA["test_y"] logger.info("Running test cases for BRITS...") self.brits = BRITS( DATA["n_steps"], @@ -31,7 +29,7 @@ def setUp(self) -> None: n_classes=DATA["n_classes"], epochs=EPOCHS, ) - self.brits.fit(self.train_X, self.train_y, self.val_X, self.val_y) + self.brits.fit(TRAIN_SET, VAL_SET) def test_parameters(self): assert hasattr(self.brits, "model") and self.brits.model is not None @@ -47,8 +45,8 @@ def test_parameters(self): ) def test_classify(self): - predictions = self.brits.classify(self.test_X) - metrics = cal_binary_classification_metrics(predictions, self.test_y) + predictions = self.brits.classify(TEST_SET) + metrics = cal_binary_classification_metrics(predictions, DATA["test_y"]) logger.info( f'ROC_AUC: {metrics["roc_auc"]}, \n' f'PR_AUC: {metrics["pr_auc"]},\n' @@ -61,12 +59,6 @@ def test_classify(self): class TestGRUD(unittest.TestCase): def setUp(self) -> None: - self.train_X = DATA["train_X"] - self.train_y = DATA["train_y"] - self.val_X = DATA["val_X"] - self.val_y = DATA["val_y"] - self.test_X = DATA["test_X"] - self.test_y = DATA["test_y"] logger.info("Running test cases for GRUD...") self.grud = GRUD( DATA["n_steps"], @@ -75,7 +67,7 @@ def setUp(self) -> None: n_classes=DATA["n_classes"], epochs=EPOCHS, ) - self.grud.fit(self.train_X, self.train_y, self.val_X, self.val_y) + self.grud.fit(TRAIN_SET, VAL_SET) def test_parameters(self): assert hasattr(self.grud, "model") and self.grud.model is not None @@ -91,8 +83,8 @@ def test_parameters(self): ) def test_classify(self): - predictions = self.grud.classify(self.test_X) - metrics = cal_binary_classification_metrics(predictions, self.test_y) + predictions = self.grud.classify(TEST_SET) + metrics = cal_binary_classification_metrics(predictions, DATA["test_y"]) logger.info( f'ROC_AUC: {metrics["roc_auc"]}, \n' f'PR_AUC: {metrics["pr_auc"]},\n' @@ -105,12 +97,6 @@ def test_classify(self): class TestRaindrop(unittest.TestCase): def setUp(self) -> None: - self.train_X = DATA["train_X"] - self.train_y = DATA["train_y"] - self.val_X = DATA["val_X"] - self.val_y = DATA["val_y"] - self.test_X = DATA["test_X"] - self.test_y = DATA["test_y"] logger.info("Running test cases for Raindrop...") self.raindrop = Raindrop( DATA["n_features"], @@ -127,7 +113,7 @@ def setUp(self) -> None: False, epochs=EPOCHS, ) - self.raindrop.fit(self.train_X, self.train_y, self.val_X, self.val_y) + self.raindrop.fit(TRAIN_SET, VAL_SET) def test_parameters(self): assert hasattr(self.raindrop, "model") and self.raindrop.model is not None @@ -145,8 +131,8 @@ def test_parameters(self): ) def test_classify(self): - predictions = self.raindrop.classify(self.test_X) - metrics = cal_binary_classification_metrics(predictions, self.test_y) + predictions = self.raindrop.classify(TEST_SET) + 
metrics = cal_binary_classification_metrics(predictions, DATA["test_y"]) logger.info( f'ROC_AUC: {metrics["roc_auc"]}, \n' f'PR_AUC: {metrics["pr_auc"]},\n' diff --git a/pypots/tests/test_clustering.py b/pypots/tests/test_clustering.py index ce22c64a..ddc36887 100644 --- a/pypots/tests/test_clustering.py +++ b/pypots/tests/test_clustering.py @@ -17,11 +17,13 @@ EPOCHS = 5 +TRAIN_SET = {"X": DATA["train_X"]} +VAL_SET = {"X": DATA["val_X"]} +TEST_SET = {"X": DATA["test_X"]} + class TestCRLI(unittest.TestCase): def setUp(self) -> None: - self.train_X = DATA["train_X"] - self.train_y = DATA["train_y"] logger.info("Running test cases for CRLI...") self.crli = CRLI( n_steps=DATA["n_steps"], @@ -31,7 +33,7 @@ def setUp(self) -> None: rnn_hidden_size=128, epochs=EPOCHS, ) - self.crli.fit(self.train_X) + self.crli.fit(TRAIN_SET) def test_parameters(self): assert hasattr(self.crli, "model") and self.crli.model is not None @@ -48,16 +50,14 @@ def test_parameters(self): ) def test_cluster(self): - clustering = self.crli.cluster(self.train_X) - RI = cal_rand_index(clustering, self.train_y) - CP = cal_cluster_purity(clustering, self.train_y) + clustering = self.crli.cluster(TEST_SET) + RI = cal_rand_index(clustering, DATA["test_y"]) + CP = cal_cluster_purity(clustering, DATA["test_y"]) logger.info(f"RI: {RI}\nCP: {CP}") class TestVaDER(unittest.TestCase): def setUp(self) -> None: - self.train_X = DATA["train_X"] - self.train_y = DATA["train_y"] logger.info("Running test cases for VaDER...") self.vader = VaDER( n_steps=DATA["n_steps"], @@ -68,7 +68,7 @@ def setUp(self) -> None: pretrain_epochs=20, epochs=EPOCHS, ) - self.vader.fit(self.train_X) + self.vader.fit(TRAIN_SET) def test_parameters(self): assert hasattr(self.vader, "model") and self.vader.model is not None @@ -85,9 +85,9 @@ def test_parameters(self): def test_cluster(self): try: - clustering = self.vader.cluster(self.train_X) - RI = cal_rand_index(clustering, self.train_y) - CP = cal_cluster_purity(clustering, self.train_y) + clustering = self.vader.cluster(TEST_SET) + RI = cal_rand_index(clustering, DATA["test_y"]) + CP = cal_cluster_purity(clustering, DATA["test_y"]) logger.info(f"RI: {RI}\nCP: {CP}") except np.linalg.LinAlgError as e: logger.info( diff --git a/pypots/tests/test_forecasting.py b/pypots/tests/test_forecasting.py index 27734a68..409c7f81 100644 --- a/pypots/tests/test_forecasting.py +++ b/pypots/tests/test_forecasting.py @@ -15,14 +15,12 @@ from pypots.utils.metrics import cal_mae EPOCHS = 5 +DATA = gene_random_walk_data(n_steps=120, n_features=10) +TEST_SET = {"X": DATA["test_X"][:, :100]} class TestBTTF(unittest.TestCase): def setUp(self) -> None: - DATA = gene_random_walk_data(n_steps=120, n_features=10) - self.test_X = DATA["test_X"] - self.test_X_intact = DATA["test_X_intact"] - self.test_X_for_input = self.test_X[:, :100] logger.info("Running test cases for BTTF...") self.bttf = BTTF( 100, @@ -36,8 +34,8 @@ def setUp(self) -> None: ) def test_forecasting(self): - predictions = self.bttf.forecast(self.test_X_for_input) - mae = cal_mae(predictions, self.test_X_intact[:, 100:]) + predictions = self.bttf.forecast(TEST_SET) + mae = cal_mae(predictions, DATA["test_X_intact"][:, 100:]) logger.info(f"prediction MAE: {mae}") diff --git a/pypots/tests/test_imputation.py b/pypots/tests/test_imputation.py index 957a4d34..8367eb21 100644 --- a/pypots/tests/test_imputation.py +++ b/pypots/tests/test_imputation.py @@ -22,14 +22,13 @@ EPOCH = 5 +TRAIN_SET = {"X": DATA["train_X"]} +VAL_SET = {"X": DATA["val_X"]} +TEST_SET = {"X": 
DATA["test_X"]} + class TestSAITS(unittest.TestCase): def setUp(self) -> None: - self.train_X = DATA["train_X"] - self.val_X = DATA["val_X"] - self.test_X = DATA["test_X"] - self.test_X_intact = DATA["test_X_intact"] - self.test_X_indicating_mask = DATA["test_X_indicating_mask"] logger.info("Running test cases for SAITS...") self.saits = SAITS( DATA["n_steps"], @@ -43,7 +42,7 @@ def setUp(self) -> None: dropout=0.1, epochs=EPOCH, ) - self.saits.fit(self.train_X, self.val_X) + self.saits.fit(TRAIN_SET, VAL_SET) def test_parameters(self): assert hasattr(self.saits, "model") and self.saits.model is not None @@ -59,21 +58,18 @@ def test_parameters(self): ) def test_impute(self): - imputed_X = self.saits.impute(self.test_X) + imputed_X = self.saits.impute(TEST_SET) assert not np.isnan( imputed_X ).any(), "Output still has missing values after running impute()." - test_MAE = cal_mae(imputed_X, self.test_X_intact, self.test_X_indicating_mask) + test_MAE = cal_mae( + imputed_X, DATA["test_X_intact"], DATA["test_X_indicating_mask"] + ) logger.info(f"SAITS test_MAE: {test_MAE}") class TestTransformer(unittest.TestCase): def setUp(self) -> None: - self.train_X = DATA["train_X"] - self.val_X = DATA["val_X"] - self.test_X = DATA["test_X"] - self.test_X_intact = DATA["test_X_intact"] - self.test_X_indicating_mask = DATA["test_X_indicating_mask"] logger.info("Running test cases for Transformer...") self.transformer = Transformer( DATA["n_steps"], @@ -87,7 +83,7 @@ def setUp(self) -> None: dropout=0.1, epochs=EPOCH, ) - self.transformer.fit(self.train_X, self.val_X) + self.transformer.fit(TRAIN_SET, VAL_SET) def test_parameters(self): assert hasattr(self.transformer, "model") and self.transformer.model is not None @@ -106,24 +102,21 @@ def test_parameters(self): ) def test_impute(self): - imputed_X = self.transformer.impute(self.test_X) + imputed_X = self.transformer.impute(TEST_SET) assert not np.isnan( imputed_X ).any(), "Output still has missing values after running impute()." - test_MAE = cal_mae(imputed_X, self.test_X_intact, self.test_X_indicating_mask) + test_MAE = cal_mae( + imputed_X, DATA["test_X_intact"], DATA["test_X_indicating_mask"] + ) logger.info(f"Transformer test_MAE: {test_MAE}") class TestBRITS(unittest.TestCase): def setUp(self) -> None: - self.train_X = DATA["train_X"] - self.val_X = DATA["val_X"] - self.test_X = DATA["test_X"] - self.test_X_intact = DATA["test_X_intact"] - self.test_X_indicating_mask = DATA["test_X_indicating_mask"] logger.info("Running test cases for BRITS...") self.brits = BRITS(DATA["n_steps"], DATA["n_features"], 256, epochs=EPOCH) - self.brits.fit(self.train_X, self.val_X) + self.brits.fit(TRAIN_SET, VAL_SET) def test_parameters(self): assert hasattr(self.brits, "model") and self.brits.model is not None @@ -139,21 +132,18 @@ def test_parameters(self): ) def test_impute(self): - imputed_X = self.brits.impute(self.test_X) + imputed_X = self.brits.impute(TEST_SET) assert not np.isnan( imputed_X ).any(), "Output still has missing values after running impute()." 
- test_MAE = cal_mae(imputed_X, self.test_X_intact, self.test_X_indicating_mask) + test_MAE = cal_mae( + imputed_X, DATA["test_X_intact"], DATA["test_X_indicating_mask"] + ) logger.info(f"BRITS test_MAE: {test_MAE}") class TestLOCF(unittest.TestCase): def setUp(self) -> None: - self.train_X = DATA["train_X"] - self.val_X = DATA["val_X"] - self.test_X = DATA["test_X"] - self.test_X_intact = DATA["test_X_intact"] - self.test_X_indicating_mask = DATA["test_X_indicating_mask"] logger.info("Running test cases for LOCF...") self.locf = LOCF(nan=0) @@ -161,12 +151,12 @@ def test_parameters(self): assert hasattr(self.locf, "nan") and self.locf.nan is not None def test_impute(self): - test_X_imputed = self.locf.impute(self.test_X) + test_X_imputed = self.locf.impute(TEST_SET) assert not np.isnan( test_X_imputed ).any(), "Output still has missing values after running impute()." test_MAE = cal_mae( - test_X_imputed, self.test_X_intact, self.test_X_indicating_mask + test_X_imputed, DATA["test_X_intact"], DATA["test_X_indicating_mask"] ) logger.info(f"LOCF test_MAE: {test_MAE}") From 7dfbf87c9110f0862b5ab3ab1c4b1fcecd4ad56a Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Fri, 31 Mar 2023 10:34:17 +0800 Subject: [PATCH 10/22] fix: bugs in Dataset classes' functions with lazy-loading strategy; --- pypots/data/base.py | 16 +++--- pypots/data/dataset_for_brits.py | 99 +++++++++++++++++++++++--------- pypots/data/dataset_for_grud.py | 10 ++-- pypots/data/dataset_for_mit.py | 2 +- pypots/imputation/saits.py | 2 +- 5 files changed, 88 insertions(+), 41 deletions(-) diff --git a/pypots/data/base.py b/pypots/data/base.py index 84707c2d..f0303839 100644 --- a/pypots/data/base.py +++ b/pypots/data/base.py @@ -37,7 +37,7 @@ def __init__(self, data, file_type="h5py"): # So they are safe to use here. No need to check again. self.data = data - if isinstance(data, str): + if isinstance(self.data, str): # data from file self.file_type = file_type # check if the given file type is supported @@ -52,7 +52,7 @@ def __init__(self, data, file_type="h5py"): "X" in self.file_handle.keys() ), "The given dataset file doesn't contains X. Please double check." - else: + else: # data from array X = data["X"] y = None if "y" not in data.keys() else data["y"] self.X, self.y = self.check_input(X, y) @@ -60,10 +60,10 @@ def __init__(self, data, file_type="h5py"): self.sample_num = self._get_sample_num() # set up function fetch_data() - if self.X is not None: - self.fetch_data = self._fetch_data_from_array - else: + if isinstance(self.data, str): self.fetch_data = self._fetch_data_from_file + else: + self.fetch_data = self._fetch_data_from_array def _get_sample_num(self): """Determine the number of samples in the dataset and return the number. @@ -207,8 +207,8 @@ def _open_file_handle(self): import h5py file_handler = h5py.File( - data_file_path, "r" - ) # set swmr=True if the h5 file need to be written into new content during reading + data_file_path, "r", swmr=True + ) # set if the h5 file need to be written into new content during reading except ImportError: raise ImportError( "h5py is missing and cannot be imported. Please install it first." 
@@ -252,7 +252,7 @@ def _fetch_data_from_file(self, idx): if self.file_handle is None: self.file_handle = self._open_file_handle() - X = self.file_handle["X"][idx] + X = torch.from_numpy(self.file_handle["X"][idx]) missing_mask = ~torch.isnan(X) X = torch.nan_to_num(X) sample = [ diff --git a/pypots/data/dataset_for_brits.py b/pypots/data/dataset_for_brits.py index e7a74b13..eb360583 100644 --- a/pypots/data/dataset_for_brits.py +++ b/pypots/data/dataset_for_brits.py @@ -16,7 +16,7 @@ def parse_delta(missing_mask): Parameters ---------- - missing_mask : tensor, shape of [n_samples, n_steps, n_features] + missing_mask : tensor, shape of [n_steps, n_features] or [n_samples, n_steps, n_features] Binary masks indicate missing values. Returns @@ -25,23 +25,70 @@ def parse_delta(missing_mask): Delta matrix indicates time gaps of missing values. Its math definition please refer to :cite:`che2018GRUD`. """ - # missing_mask is from X, and X's shape and type had been checked. So no need to double-check here. - n_samples, n_steps, n_features = missing_mask.shape - device = missing_mask.device - delta_collector = [] - for m_mask in missing_mask: - delta = [] + + def cal_delta_for_single_sample(mask): + d = [] # single sample's delta for step in range(n_steps): if step == 0: - delta.append(torch.zeros(1, n_features, device=device)) + d.append(torch.zeros(1, n_features, device=device)) else: - delta.append( - torch.ones(1, n_features, device=device) - + (1 - m_mask[step]) * delta[-1] + d.append( + torch.ones(1, n_features, device=device) + (1 - mask[step]) * d[-1] ) - delta = torch.concat(delta, dim=0) - delta_collector.append(delta.unsqueeze(0)) - delta = torch.concat(delta_collector, dim=0) + d = torch.concat(d, dim=0) + return d + + # missing_mask is from X, and X's shape and type had been checked. So no need to double-check here. + device = missing_mask.device + if len(missing_mask.shape) == 2: + n_steps, n_features = missing_mask.shape + delta = cal_delta_for_single_sample(missing_mask) + else: + n_samples, n_steps, n_features = missing_mask.shape + delta_collector = [] + for m_mask in missing_mask: + delta = cal_delta_for_single_sample(m_mask) + delta_collector.append(delta.unsqueeze(0)) + delta = torch.concat(delta_collector, dim=0) + + return delta + + +def parse_delta_np(missing_mask): + """Generate time-gap (delta) matrix from missing masks. + + Parameters + ---------- + missing_mask : array, shape of [seq_len, n_features] + Binary masks indicate missing values. + + Returns + ------- + delta, array, + Delta matrix indicates time gaps of missing values. + Its math definition please refer to :cite:`che2018MissingData`. 
+ """ + + def cal_delta_for_single_sample(mask): + d = [] + for step in range(seq_len): + if step == 0: + d.append(np.zeros(n_features)) + else: + d.append(np.ones(n_features) + (1 - mask[step]) * d[-1]) + d = np.asarray(d) + return d + + if len(missing_mask.shape) == 2: + seq_len, n_features = missing_mask.shape + delta = cal_delta_for_single_sample(missing_mask) + else: + n_samples, seq_len, n_features = missing_mask.shape + delta_collector = [] + for m_mask in missing_mask: + delta = cal_delta_for_single_sample(m_mask) + delta_collector.append(delta) + delta = np.asarray(delta_collector) return delta @@ -151,9 +198,9 @@ def _fetch_data_from_file(self, idx): if self.file_handle is None: self.file_handle = self._open_file_handle() - X = self.file_handle["X"][idx] - missing_mask = (~np.isnan(X)).astype("float32") - X = np.nan_to_num(X) + X = torch.from_numpy(self.file_handle["X"][idx]) + missing_mask = (~torch.isnan(X)).to(torch.float32) + X = torch.nan_to_num(X) forward = { "X": X, @@ -162,26 +209,26 @@ def _fetch_data_from_file(self, idx): } backward = { - "X": np.flip(forward["X"], axis=0).copy(), - "missing_mask": np.flip(forward["missing_mask"], axis=0).copy(), + "X": torch.flip(forward["X"], dims=[0]), + "missing_mask": torch.flip(forward["missing_mask"], dims=[0]), } backward["deltas"] = parse_delta(backward["missing_mask"]) sample = [ torch.tensor(idx), # for forward - torch.from_numpy(forward["X"].astype("float32")), - torch.from_numpy(forward["missing_mask"].astype("float32")), - torch.from_numpy(forward["deltas"].astype("float32")), + forward["X"], + forward["missing_mask"], + forward["deltas"], # for backward - torch.from_numpy(backward["X"].astype("float32")), - torch.from_numpy(backward["missing_mask"].astype("float32")), - torch.from_numpy(backward["deltas"].astype("float32")), + backward["X"], + backward["missing_mask"], + backward["deltas"], ] if ( "y" in self.file_handle.keys() ): # if the dataset has labels, then fetch it from the file - sample.append(self.file_handle["y"][idx].to(torch.long)) + sample.append(torch.tensor(self.file_handle["y"][idx], dtype=torch.long)) return sample diff --git a/pypots/data/dataset_for_grud.py b/pypots/data/dataset_for_grud.py index 8db41843..77f4f5f1 100644 --- a/pypots/data/dataset_for_grud.py +++ b/pypots/data/dataset_for_grud.py @@ -36,7 +36,7 @@ def __init__(self, data, file_type="h5py"): self.locf = LOCF() - if self.X is not None: + if not isinstance(self.data, str): # data from array self.missing_mask = (~torch.isnan(self.X)).to(torch.float32) self.X_filledLOCF = self.locf.locf_torch(self.X) self.X = torch.nan_to_num(self.X) @@ -110,11 +110,11 @@ def _fetch_data_from_file(self, idx): X = torch.from_numpy(self.file_handle["X"][idx]) missing_mask = (~torch.isnan(X)).to(torch.float32) - X_filledLOCF = self.locf.locf_torch(X) + X_filledLOCF = self.locf.locf_torch(X.unsqueeze(dim=0)).squeeze() X = torch.nan_to_num(X) deltas = parse_delta(missing_mask) - empirical_mean = torch.sum(missing_mask * X, dim=[0, 1]) / torch.sum( - missing_mask, dim=[0, 1] + empirical_mean = torch.sum(missing_mask * X, dim=[0]) / torch.sum( + missing_mask, dim=[0] ) sample = [ @@ -129,6 +129,6 @@ def _fetch_data_from_file(self, idx): if ( "y" in self.file_handle.keys() ): # if the dataset has labels, then fetch it from the file - sample.append(self.file_handle["y"][idx].to(torch.long)) + sample.append(torch.tensor(self.file_handle["y"][idx], dtype=torch.long)) return sample diff --git a/pypots/data/dataset_for_mit.py b/pypots/data/dataset_for_mit.py index 
787e7c9e..7dfc4e4c 100644 --- a/pypots/data/dataset_for_mit.py +++ b/pypots/data/dataset_for_mit.py @@ -119,6 +119,6 @@ def _fetch_data_from_file(self, idx): if ( "y" in self.file_handle.keys() ): # if the dataset has labels, then fetch it from the file - sample.append(self.file_handle["y"][idx].to(torch.long)) + sample.append(torch.tensor(self.file_handle["y"][idx], dtype=torch.long)) return sample diff --git a/pypots/imputation/saits.py b/pypots/imputation/saits.py index 627e2b6a..3870b218 100644 --- a/pypots/imputation/saits.py +++ b/pypots/imputation/saits.py @@ -255,7 +255,7 @@ def fit(self, train_set, val_set=None, file_type="h5py"): import h5py with h5py.File(val_set, "r") as hf: - val_X = hf["X"] + val_X = hf["X"][:] val_set = {"X": val_X} val_X_intact, val_X, val_X_missing_mask, val_X_indicating_mask = mcar( From fdc1459ade7753ce118530e690514d380ea155e9 Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Fri, 31 Mar 2023 10:49:20 +0800 Subject: [PATCH 11/22] fix: update the dependencies; --- environment.yml | 9 ++++----- pypots/tests/environment_test.yml | 7 +++---- requirements.txt | 3 ++- setup.py | 1 + 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/environment.yml b/environment.yml index 396b79b2..c1cb2024 100644 --- a/environment.yml +++ b/environment.yml @@ -9,10 +9,9 @@ dependencies: - conda-forge::scipy - conda-forge::pandas - conda-forge::scikit-learn - - conda-forge::matplotlib - conda-forge::tensorboard - conda-forge::pip - - pytorch::pytorch==1.11.0 - - pip: - - pycorruptor==0.0.4 - - tsdb==0.0.7 + - conda-forge::pycorruptor + - conda-forge::tsdb + - conda-forge::h5py + - pytorch::pytorch==1.11.0 \ No newline at end of file diff --git a/pypots/tests/environment_test.yml b/pypots/tests/environment_test.yml index 44c3a21c..dc4e3316 100644 --- a/pypots/tests/environment_test.yml +++ b/pypots/tests/environment_test.yml @@ -10,14 +10,13 @@ dependencies: - conda-forge::scipy - conda-forge::pandas - conda-forge::scikit-learn - - conda-forge::matplotlib + - conda-forge::h5py - conda-forge::tensorboard - conda-forge::pip - conda-forge::pytest-cov - conda-forge::pytest-xdist - conda-forge::coverage + - conda-forge::pycorruptor + - conda-forge::tsdb - pytorch::pytorch==1.11.0 - pyg::pyg==2.0.4 - - pip: - - pycorruptor==0.0.4 - - tsdb==0.0.7 diff --git a/requirements.txt b/requirements.txt index 59de6847..41a9e125 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,4 +5,5 @@ scipy tensorboard pandas pycorruptor -tsdb \ No newline at end of file +tsdb +h5py diff --git a/setup.py b/setup.py index ba9febff..9cafa889 100644 --- a/setup.py +++ b/setup.py @@ -42,6 +42,7 @@ "pandas", "pycorruptor", "tsdb", + "h5py", ], setup_requires=["setuptools>=38.6.0"], ) From ee5270a5e7f1709d66c20792dda44d6147a4c95f Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Fri, 31 Mar 2023 10:50:14 +0800 Subject: [PATCH 12/22] feat: add testing cases for lazy-loading datasets; --- .../tests/test_data_lazy_loading_from_file.py | 103 ++++++++++++++++++ 1 file changed, 103 insertions(+) create mode 100644 pypots/tests/test_data_lazy_loading_from_file.py diff --git a/pypots/tests/test_data_lazy_loading_from_file.py b/pypots/tests/test_data_lazy_loading_from_file.py new file mode 100644 index 00000000..c9b741f3 --- /dev/null +++ b/pypots/tests/test_data_lazy_loading_from_file.py @@ -0,0 +1,103 @@ +""" +Test cases for data classes with the lazy-loading strategy of reading from files. 
+""" +import os +import sys + +# Created by Wenjie Du +# License: GLP-v3 + +import unittest + +from pypots.tests.unified_data_for_test import DATA +import h5py +from pypots.imputation import SAITS +from pypots.classification import BRITS, GRUD +from pypots.data import DatasetForBRITS, DatasetForMIT, DatasetForGRUD +import numpy as np + +EPOCHS = 1 + +TRAIN_SET = "./train_set.h5" +VAL_SET = "./val_set.h5" +TEST_SET = "./test_set.h5" + +IMPUTATION_TRAIN_SET = "./imputation_train_set.h5" +IMPUTATION_VAL_SET = "./imputation_val_set.h5" + + +def save_data_set_into_h5(data, path): + with h5py.File(path, "w") as hf: + for i in data.keys(): + hf.create_dataset(i, data=data[i].astype(np.float32)) + + +save_data_set_into_h5({"X": DATA["train_X"], "y": DATA["train_y"]}, TRAIN_SET) +save_data_set_into_h5({"X": DATA["val_X"], "y": DATA["val_y"]}, VAL_SET) +save_data_set_into_h5( + { + "X": DATA["test_X"], + "X_intact": DATA["test_X_intact"], + "X_indicating_mask": DATA["test_X_indicating_mask"], + }, + TEST_SET, +) + +save_data_set_into_h5({"X": DATA["train_X"]}, IMPUTATION_TRAIN_SET) +save_data_set_into_h5({"X": DATA["val_X"]}, IMPUTATION_VAL_SET) + + +class TestLazyLoadingClasses(unittest.TestCase): + def setUp(self) -> None: + + assert os.path.exists(TRAIN_SET) + assert os.path.exists(VAL_SET) + assert os.path.exists(TEST_SET) + + assert os.path.exists(IMPUTATION_TRAIN_SET) + assert os.path.exists(IMPUTATION_VAL_SET) + + self.saits = SAITS( + DATA["n_steps"], + DATA["n_features"], + n_layers=2, + d_model=256, + d_inner=128, + n_head=4, + d_k=64, + d_v=64, + dropout=0.1, + epochs=EPOCHS, + ) + + self.brits = BRITS( + DATA["n_steps"], + DATA["n_features"], + 256, + n_classes=DATA["n_classes"], + epochs=EPOCHS, + ) + + self.grud = GRUD( + DATA["n_steps"], + DATA["n_features"], + 256, + n_classes=DATA["n_classes"], + epochs=EPOCHS, + ) + + def test_DatasetForMIT(self): + self.saits.fit(train_set=IMPUTATION_TRAIN_SET, val_set=IMPUTATION_VAL_SET) + _ = self.saits.impute(X=TEST_SET) + + def test_DatasetForBRITS(self): + self.brits.fit(train_set=TRAIN_SET, val_set=VAL_SET) + _ = self.brits.classify(X=TEST_SET) + + def test_DatasetForGRUD(self): + self.grud.fit(train_set=TRAIN_SET, val_set=VAL_SET) + _ = self.grud.classify(X=TEST_SET) + + +if __name__ == "__main__": + unittest.main() From 8a4f68280de00776b3151c615be561ff50c3b0c6 Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Fri, 31 Mar 2023 10:57:49 +0800 Subject: [PATCH 13/22] doc: update README; --- README.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 5f95c52e..615570c3 100644 --- a/README.md +++ b/README.md @@ -66,6 +66,9 @@ Install it with `conda install pypots`, you may need to specify the channel with Install the latest release from PyPI: > pip install pypots +or install from the source code with the latest features not officially released in a version: +> pip install `https://github.com/WenjieDu/PyPOTS/archive/main.zip` +
Below is an example applying SAITS in PyPOTS to impute missing values in the dataset PhysioNet2012: @@ -84,10 +87,11 @@ X = StandardScaler().fit_transform(X.to_numpy()) X = X.reshape(num_samples, 48, -1) X_intact, X, missing_mask, indicating_mask = mcar(X, 0.1) # hold out 10% observed values as ground truth X = masked_fill(X, 1 - missing_mask, np.nan) +dataset = {"X": X} # Model training. This is PyPOTS showtime. 💪 saits = SAITS(n_steps=48, n_features=37, n_layers=2, d_model=256, d_inner=128, n_head=4, d_k=64, d_v=64, dropout=0.1, epochs=10) -saits.fit(X) # train the model. Here I use the whole dataset as the training set, because ground truth is not visible to the model. -imputation = saits.impute(X) # impute the originally-missing values and artificially-missing values +saits.fit(dataset) # train the model. Here I use the whole dataset as the training set, because ground truth is not visible to the model. +imputation = saits.impute(dataset) # impute the originally-missing values and artificially-missing values mae = cal_mae(imputation, X_intact, indicating_mask) # calculate mean absolute error on the ground truth (artificially-missing values) ```
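The patches above also let models train and predict straight from a dataset file rather than an in-memory array, so large datasets do not have to fit in RAM. A minimal sketch of that file-based workflow is shown below, reusing the `X` prepared in the example above; the file name `physionet2012.h5` is only illustrative, and the h5 file must contain at least a dataset named `X` (a `y` dataset is optional and only needed by classification models):

```python
import h5py

# Persist the prepared array into an h5 file once (keys follow the PyPOTS convention:
# "X" is required, "y" is optional and only used by classification models).
with h5py.File("physionet2012.h5", "w") as hf:
    hf.create_dataset("X", data=X.astype("float32"))

# Passing the file path instead of an array or dict activates the lazy-loading strategy:
# samples are read from disk on demand rather than held in RAM.
saits = SAITS(n_steps=48, n_features=37, n_layers=2, d_model=256, d_inner=128,
              n_head=4, d_k=64, d_v=64, dropout=0.1, epochs=10)
saits.fit("physionet2012.h5")                  # train_set may be a dict/array or an h5 file path
imputation = saits.impute("physionet2012.h5")  # the same applies at inference time
```

Because the dataset classes open the h5 file lazily, each batch only reads the requested slice from disk instead of loading the whole dataset into memory.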
From 0fb57d4ba7b5531dccb96f6cadd96b20fa831cee Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Fri, 31 Mar 2023 17:38:50 +0800 Subject: [PATCH 14/22] feat: v0.0.10 is ready; --- pypots/__version__.py | 2 +- pypots/tests/test_data_lazy_loading_from_file.py | 11 +++++------ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/pypots/__version__.py b/pypots/__version__.py index b4069ba5..c6345fc4 100644 --- a/pypots/__version__.py +++ b/pypots/__version__.py @@ -21,4 +21,4 @@ # Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer. # 'X.Y.dev0' is the canonical version of 'X.Y.dev' -version = "0.0.9" +version = "0.0.10" diff --git a/pypots/tests/test_data_lazy_loading_from_file.py b/pypots/tests/test_data_lazy_loading_from_file.py index c9b741f3..6ef6edf8 100644 --- a/pypots/tests/test_data_lazy_loading_from_file.py +++ b/pypots/tests/test_data_lazy_loading_from_file.py @@ -1,21 +1,20 @@ """ Test cases for data classes with the lazy-loading strategy of reading from files. """ -import os -import sys # Created by Wenjie Du # License: GLP-v3 +import os import unittest -from pypots.tests.unified_data_for_test import DATA import h5py -from pypots.imputation import SAITS -from pypots.classification import BRITS, GRUD -from pypots.data import DatasetForBRITS, DatasetForMIT, DatasetForGRUD import numpy as np +from pypots.classification import BRITS, GRUD +from pypots.imputation import SAITS +from pypots.tests.unified_data_for_test import DATA + EPOCHS = 1 TRAIN_SET = "./train_set.h5" From 72eaf20649760a5a21ae96c9cd01d0479b3a75f8 Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Fri, 31 Mar 2023 18:10:13 +0800 Subject: [PATCH 15/22] fix: running testing cases for forecasting models and lazy-loading datasets; --- .github/workflows/testing.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml index 462a555a..fda30e59 100644 --- a/.github/workflows/testing.yml +++ b/.github/workflows/testing.yml @@ -47,6 +47,8 @@ jobs: python -m pytest -rA pypots/tests/test_classification.py -n auto --cov=pypots python -m pytest -rA pypots/tests/test_imputation.py -n auto --cov=pypots --cov-append python -m pytest -rA pypots/tests/test_clustering.py -n auto --cov=pypots --cov-append + python -m pytest -rA pypots/tests/test_forecasting.py -n auto --cov=pypots --cov-append + python -m pytest -rA pypots/tests/test_data_lazy_loading_from_file.py -n auto --cov=pypots --cov-append - name: Generate the LCOV report run: | From fa5f5b6af08aba4a2b38c68fae873ac2ddb53199 Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Fri, 31 Mar 2023 18:53:46 +0800 Subject: [PATCH 16/22] fix: running testing cases for logging; --- .github/workflows/testing.yml | 1 + .../tests/test_data_lazy_loading_from_file.py | 38 ++++--------------- pypots/tests/unified_data_for_test.py | 29 ++++++++++++++ 3 files changed, 37 insertions(+), 31 deletions(-) diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml index fda30e59..6c201f5f 100644 --- a/.github/workflows/testing.yml +++ b/.github/workflows/testing.yml @@ -49,6 +49,7 @@ jobs: python -m pytest -rA pypots/tests/test_clustering.py -n auto --cov=pypots --cov-append python -m pytest -rA pypots/tests/test_forecasting.py -n auto --cov=pypots --cov-append python -m pytest -rA pypots/tests/test_data_lazy_loading_from_file.py -n auto --cov=pypots --cov-append + python -m pytest -rA pypots/tests/test_logging.py -n auto --cov=pypots --cov-append - name: Generate the LCOV report run: | diff --git 
a/pypots/tests/test_data_lazy_loading_from_file.py b/pypots/tests/test_data_lazy_loading_from_file.py index 6ef6edf8..6ec23a86 100644 --- a/pypots/tests/test_data_lazy_loading_from_file.py +++ b/pypots/tests/test_data_lazy_loading_from_file.py @@ -8,42 +8,18 @@ import os import unittest -import h5py -import numpy as np - from pypots.classification import BRITS, GRUD from pypots.imputation import SAITS -from pypots.tests.unified_data_for_test import DATA - -EPOCHS = 1 - -TRAIN_SET = "./train_set.h5" -VAL_SET = "./val_set.h5" -TEST_SET = "./test_set.h5" - -IMPUTATION_TRAIN_SET = "./imputation_train_set.h5" -IMPUTATION_VAL_SET = "./imputation_val_set.h5" - - -def save_data_set_into_h5(data, path): - with h5py.File(path, "w") as hf: - for i in data.keys(): - hf.create_dataset(i, data=data[i].astype(np.float32)) - - -save_data_set_into_h5({"X": DATA["train_X"], "y": DATA["train_y"]}, TRAIN_SET) -save_data_set_into_h5({"X": DATA["val_X"], "y": DATA["val_y"]}, VAL_SET) -save_data_set_into_h5( - { - "X": DATA["test_X"], - "X_intact": DATA["test_X_intact"], - "X_indicating_mask": DATA["test_X_indicating_mask"], - }, +from pypots.tests.unified_data_for_test import ( + DATA, + TRAIN_SET, + VAL_SET, TEST_SET, + IMPUTATION_TRAIN_SET, + IMPUTATION_VAL_SET, ) -save_data_set_into_h5({"X": DATA["train_X"]}, IMPUTATION_TRAIN_SET) -save_data_set_into_h5({"X": DATA["val_X"]}, IMPUTATION_VAL_SET) +EPOCHS = 1 class TestLazyLoadingClasses(unittest.TestCase): diff --git a/pypots/tests/unified_data_for_test.py b/pypots/tests/unified_data_for_test.py index ffb0f395..93e59990 100644 --- a/pypots/tests/unified_data_for_test.py +++ b/pypots/tests/unified_data_for_test.py @@ -5,6 +5,8 @@ # Created by Wenjie Du # License: GLP-v3 +import h5py +import numpy as np import torch from sklearn.model_selection import train_test_split from sklearn.preprocessing import StandardScaler @@ -13,6 +15,12 @@ from pypots.data import load_specific_dataset +def save_data_set_into_h5(data, path): + with h5py.File(path, "w") as hf: + for i in data.keys(): + hf.create_dataset(i, data=data[i].astype(np.float32)) + + def gene_random_walk_data( n_steps=24, n_features=10, n_classes=2, n_samples_each_class=1000 ): @@ -128,3 +136,24 @@ def gene_physionet2012(): # generate and cache data first. # Otherwise, file lock will cause bug if running test parallely with pytest-xdist. 
DATA = gene_random_walk_data() + +TRAIN_SET = "./train_set.h5" +VAL_SET = "./val_set.h5" +TEST_SET = "./test_set.h5" + +IMPUTATION_TRAIN_SET = "./imputation_train_set.h5" +IMPUTATION_VAL_SET = "./imputation_val_set.h5" + +save_data_set_into_h5({"X": DATA["train_X"], "y": DATA["train_y"]}, TRAIN_SET) +save_data_set_into_h5({"X": DATA["val_X"], "y": DATA["val_y"]}, VAL_SET) +save_data_set_into_h5( + { + "X": DATA["test_X"], + "X_intact": DATA["test_X_intact"], + "X_indicating_mask": DATA["test_X_indicating_mask"], + }, + TEST_SET, +) + +save_data_set_into_h5({"X": DATA["train_X"]}, IMPUTATION_TRAIN_SET) +save_data_set_into_h5({"X": DATA["val_X"]}, IMPUTATION_VAL_SET) From e9aea749483a9d363f5ee93ea6e3d705ee1fe3d4 Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Fri, 31 Mar 2023 21:30:31 +0800 Subject: [PATCH 17/22] fix: try to fix the BlockingIOError, see below message for details; BlockingIOError: [Errno 35] Unable to create file (unable to lock file, errno = 35, error message = 'Resource temporarily unavailable') This may be caused by the program creates h5 files for multiple times; --- .../tests/test_data_lazy_loading_from_file.py | 39 +++++++++++++++---- pypots/tests/unified_data_for_test.py | 29 -------------- 2 files changed, 31 insertions(+), 37 deletions(-) diff --git a/pypots/tests/test_data_lazy_loading_from_file.py b/pypots/tests/test_data_lazy_loading_from_file.py index 6ec23a86..bfc1464a 100644 --- a/pypots/tests/test_data_lazy_loading_from_file.py +++ b/pypots/tests/test_data_lazy_loading_from_file.py @@ -8,22 +8,45 @@ import os import unittest +import h5py +import numpy as np + from pypots.classification import BRITS, GRUD from pypots.imputation import SAITS -from pypots.tests.unified_data_for_test import ( - DATA, - TRAIN_SET, - VAL_SET, - TEST_SET, - IMPUTATION_TRAIN_SET, - IMPUTATION_VAL_SET, -) +from pypots.tests.unified_data_for_test import DATA + +TRAIN_SET = "./train_set.h5" +VAL_SET = "./val_set.h5" +TEST_SET = "./test_set.h5" + +IMPUTATION_TRAIN_SET = "./imputation_train_set.h5" +IMPUTATION_VAL_SET = "./imputation_val_set.h5" + + +def save_data_set_into_h5(data, path): + with h5py.File(path, "w") as hf: + for i in data.keys(): + hf.create_dataset(i, data=data[i].astype(np.float32)) + EPOCHS = 1 class TestLazyLoadingClasses(unittest.TestCase): def setUp(self) -> None: + save_data_set_into_h5({"X": DATA["train_X"], "y": DATA["train_y"]}, TRAIN_SET) + save_data_set_into_h5({"X": DATA["val_X"], "y": DATA["val_y"]}, VAL_SET) + save_data_set_into_h5({"X": DATA["train_X"]}, IMPUTATION_TRAIN_SET) + save_data_set_into_h5({"X": DATA["val_X"]}, IMPUTATION_VAL_SET) + + save_data_set_into_h5( + { + "X": DATA["test_X"], + "X_intact": DATA["test_X_intact"], + "X_indicating_mask": DATA["test_X_indicating_mask"], + }, + TEST_SET, + ) assert os.path.exists(TRAIN_SET) assert os.path.exists(VAL_SET) diff --git a/pypots/tests/unified_data_for_test.py b/pypots/tests/unified_data_for_test.py index 93e59990..ffb0f395 100644 --- a/pypots/tests/unified_data_for_test.py +++ b/pypots/tests/unified_data_for_test.py @@ -5,8 +5,6 @@ # Created by Wenjie Du # License: GLP-v3 -import h5py -import numpy as np import torch from sklearn.model_selection import train_test_split from sklearn.preprocessing import StandardScaler @@ -15,12 +13,6 @@ from pypots.data import load_specific_dataset -def save_data_set_into_h5(data, path): - with h5py.File(path, "w") as hf: - for i in data.keys(): - hf.create_dataset(i, data=data[i].astype(np.float32)) - - def gene_random_walk_data( n_steps=24, n_features=10, 
n_classes=2, n_samples_each_class=1000 ): @@ -136,24 +128,3 @@ def gene_physionet2012(): # generate and cache data first. # Otherwise, file lock will cause bug if running test parallely with pytest-xdist. DATA = gene_random_walk_data() - -TRAIN_SET = "./train_set.h5" -VAL_SET = "./val_set.h5" -TEST_SET = "./test_set.h5" - -IMPUTATION_TRAIN_SET = "./imputation_train_set.h5" -IMPUTATION_VAL_SET = "./imputation_val_set.h5" - -save_data_set_into_h5({"X": DATA["train_X"], "y": DATA["train_y"]}, TRAIN_SET) -save_data_set_into_h5({"X": DATA["val_X"], "y": DATA["val_y"]}, VAL_SET) -save_data_set_into_h5( - { - "X": DATA["test_X"], - "X_intact": DATA["test_X_intact"], - "X_indicating_mask": DATA["test_X_indicating_mask"], - }, - TEST_SET, -) - -save_data_set_into_h5({"X": DATA["train_X"]}, IMPUTATION_TRAIN_SET) -save_data_set_into_h5({"X": DATA["val_X"]}, IMPUTATION_VAL_SET) From 46fca4148b39aa01e29c80e3f0150e6dad1c9066 Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Fri, 31 Mar 2023 22:55:29 +0800 Subject: [PATCH 18/22] refactor: test scripts; --- pypots/tests/test_classification.py | 133 +++++++++-------- pypots/tests/test_clustering.py | 78 +++++----- .../tests/test_data_lazy_loading_from_file.py | 98 +++++++------ pypots/tests/test_forecasting.py | 29 ++-- pypots/tests/test_imputation.py | 138 +++++++++--------- 5 files changed, 254 insertions(+), 222 deletions(-) diff --git a/pypots/tests/test_classification.py b/pypots/tests/test_classification.py index 034d65ab..b57b9d37 100644 --- a/pypots/tests/test_classification.py +++ b/pypots/tests/test_classification.py @@ -20,18 +20,33 @@ class TestBRITS(unittest.TestCase): - def setUp(self) -> None: - logger.info("Running test cases for BRITS...") - self.brits = BRITS( - DATA["n_steps"], - DATA["n_features"], - 256, - n_classes=DATA["n_classes"], - epochs=EPOCHS, - ) + logger.info("Running tests for a classification model BRITS...") + + # initialize a BRITS model + brits = BRITS( + DATA["n_steps"], + DATA["n_features"], + 256, + n_classes=DATA["n_classes"], + epochs=EPOCHS, + ) + + def test_0_fit(self): self.brits.fit(TRAIN_SET, VAL_SET) - def test_parameters(self): + def test_1_classify(self): + predictions = self.brits.classify(TEST_SET) + metrics = cal_binary_classification_metrics(predictions, DATA["test_y"]) + logger.info( + f'ROC_AUC: {metrics["roc_auc"]}, \n' + f'PR_AUC: {metrics["pr_auc"]},\n' + f'F1: {metrics["f1"]},\n' + f'Precision: {metrics["precision"]},\n' + f'Recall: {metrics["recall"]},\n' + ) + assert metrics["roc_auc"] >= 0.5, "ROC-AUC < 0.5" + + def test_2_parameters(self): assert hasattr(self.brits, "model") and self.brits.model is not None assert hasattr(self.brits, "optimizer") and self.brits.optimizer is not None @@ -44,8 +59,24 @@ def test_parameters(self): and self.brits.best_model_dict is not None ) - def test_classify(self): - predictions = self.brits.classify(TEST_SET) + +class TestGRUD(unittest.TestCase): + logger.info("Running tests for a classification model GRUD...") + + # initialize a GRUD model + grud = GRUD( + DATA["n_steps"], + DATA["n_features"], + 256, + n_classes=DATA["n_classes"], + epochs=EPOCHS, + ) + + def test_0_fit(self): + self.grud.fit(TRAIN_SET, VAL_SET) + + def test_1_classify(self): + predictions = self.grud.classify(TEST_SET) metrics = cal_binary_classification_metrics(predictions, DATA["test_y"]) logger.info( f'ROC_AUC: {metrics["roc_auc"]}, \n' @@ -56,20 +87,7 @@ def test_classify(self): ) assert metrics["roc_auc"] >= 0.5, "ROC-AUC < 0.5" - -class TestGRUD(unittest.TestCase): - def setUp(self) -> 
None: - logger.info("Running test cases for GRUD...") - self.grud = GRUD( - DATA["n_steps"], - DATA["n_features"], - 256, - n_classes=DATA["n_classes"], - epochs=EPOCHS, - ) - self.grud.fit(TRAIN_SET, VAL_SET) - - def test_parameters(self): + def test_2_parameters(self): assert hasattr(self.grud, "model") and self.grud.model is not None assert hasattr(self.grud, "optimizer") and self.grud.optimizer is not None @@ -82,8 +100,32 @@ def test_parameters(self): and self.grud.best_model_dict is not None ) - def test_classify(self): - predictions = self.grud.classify(TEST_SET) + +class TestRaindrop(unittest.TestCase): + logger.info("Running tests for a classification model Raindrop...") + + # initialize a Raindrop model + raindrop = Raindrop( + DATA["n_features"], + 2, + DATA["n_features"] * 4, + 256, + 2, + DATA["n_classes"], + 0.3, + DATA["n_steps"], + 0, + "mean", + False, + False, + epochs=EPOCHS, + ) + + def test_0_fit(self): + self.raindrop.fit(TRAIN_SET, VAL_SET) + + def test_1_classify(self): + predictions = self.raindrop.classify(TEST_SET) metrics = cal_binary_classification_metrics(predictions, DATA["test_y"]) logger.info( f'ROC_AUC: {metrics["roc_auc"]}, \n' @@ -94,28 +136,7 @@ def test_classify(self): ) assert metrics["roc_auc"] >= 0.5, "ROC-AUC < 0.5" - -class TestRaindrop(unittest.TestCase): - def setUp(self) -> None: - logger.info("Running test cases for Raindrop...") - self.raindrop = Raindrop( - DATA["n_features"], - 2, - DATA["n_features"] * 4, - 256, - 2, - DATA["n_classes"], - 0.3, - DATA["n_steps"], - 0, - "mean", - False, - False, - epochs=EPOCHS, - ) - self.raindrop.fit(TRAIN_SET, VAL_SET) - - def test_parameters(self): + def test_2_parameters(self): assert hasattr(self.raindrop, "model") and self.raindrop.model is not None assert ( @@ -130,18 +151,6 @@ def test_parameters(self): and self.raindrop.best_model_dict is not None ) - def test_classify(self): - predictions = self.raindrop.classify(TEST_SET) - metrics = cal_binary_classification_metrics(predictions, DATA["test_y"]) - logger.info( - f'ROC_AUC: {metrics["roc_auc"]}, \n' - f'PR_AUC: {metrics["pr_auc"]},\n' - f'F1: {metrics["f1"]},\n' - f'Precision: {metrics["precision"]},\n' - f'Recall: {metrics["recall"]},\n' - ) - assert metrics["roc_auc"] >= 0.5, "ROC-AUC < 0.5" - if __name__ == "__main__": unittest.main() diff --git a/pypots/tests/test_clustering.py b/pypots/tests/test_clustering.py index ddc36887..3696d2b5 100644 --- a/pypots/tests/test_clustering.py +++ b/pypots/tests/test_clustering.py @@ -23,19 +23,22 @@ class TestCRLI(unittest.TestCase): - def setUp(self) -> None: - logger.info("Running test cases for CRLI...") - self.crli = CRLI( - n_steps=DATA["n_steps"], - n_features=DATA["n_features"], - n_clusters=DATA["n_classes"], - n_generator_layers=2, - rnn_hidden_size=128, - epochs=EPOCHS, - ) + logger.info("Running tests for a clustering model CRLI...") + + # initialize a CRLI model + crli = CRLI( + n_steps=DATA["n_steps"], + n_features=DATA["n_features"], + n_clusters=DATA["n_classes"], + n_generator_layers=2, + rnn_hidden_size=128, + epochs=EPOCHS, + ) + + def test_0_fit(self): self.crli.fit(TRAIN_SET) - def test_parameters(self): + def test_1_parameters(self): assert hasattr(self.crli, "model") and self.crli.model is not None assert hasattr(self.crli, "G_optimizer") and self.crli.G_optimizer is not None @@ -49,7 +52,7 @@ def test_parameters(self): and self.crli.best_model_dict is not None ) - def test_cluster(self): + def test_2_cluster(self): clustering = self.crli.cluster(TEST_SET) RI = 
cal_rand_index(clustering, DATA["test_y"]) CP = cal_cluster_purity(clustering, DATA["test_y"]) @@ -57,20 +60,35 @@ def test_cluster(self): class TestVaDER(unittest.TestCase): - def setUp(self) -> None: - logger.info("Running test cases for VaDER...") - self.vader = VaDER( - n_steps=DATA["n_steps"], - n_features=DATA["n_features"], - n_clusters=DATA["n_classes"], - rnn_hidden_size=64, - d_mu_stddev=5, - pretrain_epochs=20, - epochs=EPOCHS, - ) + logger.info("Running tests for a clustering model Transformer...") + + # initialize a VaDER model + vader = VaDER( + n_steps=DATA["n_steps"], + n_features=DATA["n_features"], + n_clusters=DATA["n_classes"], + rnn_hidden_size=64, + d_mu_stddev=5, + pretrain_epochs=20, + epochs=EPOCHS, + ) + + def test_0_fit(self): self.vader.fit(TRAIN_SET) - def test_parameters(self): + def test_1_cluster(self): + try: + clustering = self.vader.cluster(TEST_SET) + RI = cal_rand_index(clustering, DATA["test_y"]) + CP = cal_cluster_purity(clustering, DATA["test_y"]) + logger.info(f"RI: {RI}\nCP: {CP}") + except np.linalg.LinAlgError as e: + logger.error( + f"{e}\n" + "Got singular matrix, please try to retrain the model to fix this" + ) + + def test_2_parameters(self): assert hasattr(self.vader, "model") and self.vader.model is not None assert hasattr(self.vader, "optimizer") and self.vader.optimizer is not None @@ -83,18 +101,6 @@ def test_parameters(self): and self.vader.best_model_dict is not None ) - def test_cluster(self): - try: - clustering = self.vader.cluster(TEST_SET) - RI = cal_rand_index(clustering, DATA["test_y"]) - CP = cal_cluster_purity(clustering, DATA["test_y"]) - logger.info(f"RI: {RI}\nCP: {CP}") - except np.linalg.LinAlgError as e: - logger.info( - f"{e}\n" - "Got singular matrix, please try to retrain the model to fix this" - ) - if __name__ == "__main__": unittest.main() diff --git a/pypots/tests/test_data_lazy_loading_from_file.py b/pypots/tests/test_data_lazy_loading_from_file.py index bfc1464a..77b7faca 100644 --- a/pypots/tests/test_data_lazy_loading_from_file.py +++ b/pypots/tests/test_data_lazy_loading_from_file.py @@ -9,11 +9,11 @@ import unittest import h5py -import numpy as np from pypots.classification import BRITS, GRUD from pypots.imputation import SAITS from pypots.tests.unified_data_for_test import DATA +from pypots.utils.logging import logger TRAIN_SET = "./train_set.h5" VAL_SET = "./val_set.h5" @@ -26,28 +26,65 @@ def save_data_set_into_h5(data, path): with h5py.File(path, "w") as hf: for i in data.keys(): - hf.create_dataset(i, data=data[i].astype(np.float32)) + tp = int if i == "y" else "float32" + hf.create_dataset(i, data=data[i].astype(tp)) EPOCHS = 1 +save_data_set_into_h5( + {"X": DATA["train_X"], "y": DATA["train_y"].astype(int)}, TRAIN_SET +) +save_data_set_into_h5({"X": DATA["val_X"], "y": DATA["val_y"].astype(int)}, VAL_SET) +save_data_set_into_h5({"X": DATA["train_X"]}, IMPUTATION_TRAIN_SET) +save_data_set_into_h5({"X": DATA["val_X"]}, IMPUTATION_VAL_SET) + +save_data_set_into_h5( + { + "X": DATA["test_X"], + "X_intact": DATA["test_X_intact"], + "X_indicating_mask": DATA["test_X_indicating_mask"], + }, + TEST_SET, +) + class TestLazyLoadingClasses(unittest.TestCase): - def setUp(self) -> None: - save_data_set_into_h5({"X": DATA["train_X"], "y": DATA["train_y"]}, TRAIN_SET) - save_data_set_into_h5({"X": DATA["val_X"], "y": DATA["val_y"]}, VAL_SET) - save_data_set_into_h5({"X": DATA["train_X"]}, IMPUTATION_TRAIN_SET) - save_data_set_into_h5({"X": DATA["val_X"]}, IMPUTATION_VAL_SET) - - save_data_set_into_h5( - { - "X": 
DATA["test_X"], - "X_intact": DATA["test_X_intact"], - "X_indicating_mask": DATA["test_X_indicating_mask"], - }, - TEST_SET, - ) + logger.info("Running tests for Dataset classes with lazy-loading strategy...") + + # initialize a SAITS model for testing DatasetForMIT and BaseDataset + saits = SAITS( + DATA["n_steps"], + DATA["n_features"], + n_layers=2, + d_model=256, + d_inner=128, + n_head=4, + d_k=64, + d_v=64, + dropout=0.1, + epochs=EPOCHS, + ) + + # initialize a BRITS model for testing DatasetForBRITS + brits = BRITS( + DATA["n_steps"], + DATA["n_features"], + 256, + n_classes=DATA["n_classes"], + epochs=EPOCHS, + ) + + # initialize a GRUD model for testing DatasetForGRUD + grud = GRUD( + DATA["n_steps"], + DATA["n_features"], + 256, + n_classes=DATA["n_classes"], + epochs=EPOCHS, + ) + def setUp(self) -> None: assert os.path.exists(TRAIN_SET) assert os.path.exists(VAL_SET) assert os.path.exists(TEST_SET) @@ -55,35 +92,6 @@ def setUp(self) -> None: assert os.path.exists(IMPUTATION_TRAIN_SET) assert os.path.exists(IMPUTATION_VAL_SET) - self.saits = SAITS( - DATA["n_steps"], - DATA["n_features"], - n_layers=2, - d_model=256, - d_inner=128, - n_head=4, - d_k=64, - d_v=64, - dropout=0.1, - epochs=EPOCHS, - ) - - self.brits = BRITS( - DATA["n_steps"], - DATA["n_features"], - 256, - n_classes=DATA["n_classes"], - epochs=EPOCHS, - ) - - self.grud = GRUD( - DATA["n_steps"], - DATA["n_features"], - 256, - n_classes=DATA["n_classes"], - epochs=EPOCHS, - ) - def test_DatasetForMIT(self): self.saits.fit(train_set=IMPUTATION_TRAIN_SET, val_set=IMPUTATION_VAL_SET) _ = self.saits.impute(X=TEST_SET) diff --git a/pypots/tests/test_forecasting.py b/pypots/tests/test_forecasting.py index 409c7f81..f44d7207 100644 --- a/pypots/tests/test_forecasting.py +++ b/pypots/tests/test_forecasting.py @@ -20,20 +20,21 @@ class TestBTTF(unittest.TestCase): - def setUp(self) -> None: - logger.info("Running test cases for BTTF...") - self.bttf = BTTF( - 100, - 10, - 20, - 2, - 10, - np.asarray([1, 2, 3, 10, 10 + 1, 10 + 2, 20, 20 + 1, 20 + 2]), - 5, - 5, - ) - - def test_forecasting(self): + logger.info("Running tests for a forecasting model BTTF...") + + # initialize a BTTF model + bttf = BTTF( + 100, + 10, + 20, + 2, + 10, + np.asarray([1, 2, 3, 10, 10 + 1, 10 + 2, 20, 20 + 1, 20 + 2]), + 5, + 5, + ) + + def test_0_forecasting(self): predictions = self.bttf.forecast(TEST_SET) mae = cal_mae(predictions, DATA["test_X_intact"][:, 100:]) logger.info(f"prediction MAE: {mae}") diff --git a/pypots/tests/test_imputation.py b/pypots/tests/test_imputation.py index 8367eb21..209b50f4 100644 --- a/pypots/tests/test_imputation.py +++ b/pypots/tests/test_imputation.py @@ -28,23 +28,36 @@ class TestSAITS(unittest.TestCase): - def setUp(self) -> None: - logger.info("Running test cases for SAITS...") - self.saits = SAITS( - DATA["n_steps"], - DATA["n_features"], - n_layers=2, - d_model=256, - d_inner=128, - n_head=4, - d_k=64, - d_v=64, - dropout=0.1, - epochs=EPOCH, - ) + logger.info("Running tests for an imputation model SAITS...") + + # initialize a SAITS model + saits = SAITS( + DATA["n_steps"], + DATA["n_features"], + n_layers=2, + d_model=256, + d_inner=128, + n_head=4, + d_k=64, + d_v=64, + dropout=0.1, + epochs=EPOCH, + ) + + def test_0_fit(self): self.saits.fit(TRAIN_SET, VAL_SET) - def test_parameters(self): + def test_1_impute(self): + imputed_X = self.saits.impute(TEST_SET) + assert not np.isnan( + imputed_X + ).any(), "Output still has missing values after running impute()." 
+ test_MAE = cal_mae( + imputed_X, DATA["test_X_intact"], DATA["test_X_indicating_mask"] + ) + logger.info(f"SAITS test_MAE: {test_MAE}") + + def test_2_parameters(self): assert hasattr(self.saits, "model") and self.saits.model is not None assert hasattr(self.saits, "optimizer") and self.saits.optimizer is not None @@ -57,35 +70,38 @@ def test_parameters(self): and self.saits.best_model_dict is not None ) - def test_impute(self): - imputed_X = self.saits.impute(TEST_SET) + +class TestTransformer(unittest.TestCase): + logger.info("Running tests for an imputation model Transformer...") + + # initialize a Transformer model + transformer = Transformer( + DATA["n_steps"], + DATA["n_features"], + n_layers=2, + d_model=256, + d_inner=128, + n_head=4, + d_k=64, + d_v=64, + dropout=0.1, + epochs=EPOCH, + ) + + def test_0_fit(self): + self.transformer.fit(TRAIN_SET, VAL_SET) + + def test_1_impute(self): + imputed_X = self.transformer.impute(TEST_SET) assert not np.isnan( imputed_X ).any(), "Output still has missing values after running impute()." test_MAE = cal_mae( imputed_X, DATA["test_X_intact"], DATA["test_X_indicating_mask"] ) - logger.info(f"SAITS test_MAE: {test_MAE}") - - -class TestTransformer(unittest.TestCase): - def setUp(self) -> None: - logger.info("Running test cases for Transformer...") - self.transformer = Transformer( - DATA["n_steps"], - DATA["n_features"], - n_layers=2, - d_model=256, - d_inner=128, - n_head=4, - d_k=64, - d_v=64, - dropout=0.1, - epochs=EPOCH, - ) - self.transformer.fit(TRAIN_SET, VAL_SET) + logger.info(f"Transformer test_MAE: {test_MAE}") - def test_parameters(self): + def test_2_parameters(self): assert hasattr(self.transformer, "model") and self.transformer.model is not None assert ( @@ -101,24 +117,27 @@ def test_parameters(self): and self.transformer.best_model_dict is not None ) - def test_impute(self): - imputed_X = self.transformer.impute(TEST_SET) + +class TestBRITS(unittest.TestCase): + logger.info("Running tests for an imputation model BRITS...") + + # initialize a BRITS model + brits = BRITS(DATA["n_steps"], DATA["n_features"], 256, epochs=EPOCH) + + def test_0_fit(self): + self.brits.fit(TRAIN_SET, VAL_SET) + + def test_1_impute(self): + imputed_X = self.brits.impute(TEST_SET) assert not np.isnan( imputed_X ).any(), "Output still has missing values after running impute()." test_MAE = cal_mae( imputed_X, DATA["test_X_intact"], DATA["test_X_indicating_mask"] ) - logger.info(f"Transformer test_MAE: {test_MAE}") - - -class TestBRITS(unittest.TestCase): - def setUp(self) -> None: - logger.info("Running test cases for BRITS...") - self.brits = BRITS(DATA["n_steps"], DATA["n_features"], 256, epochs=EPOCH) - self.brits.fit(TRAIN_SET, VAL_SET) + logger.info(f"BRITS test_MAE: {test_MAE}") - def test_parameters(self): + def test_2_parameters(self): assert hasattr(self.brits, "model") and self.brits.model is not None assert hasattr(self.brits, "optimizer") and self.brits.optimizer is not None @@ -131,26 +150,12 @@ def test_parameters(self): and self.brits.best_model_dict is not None ) - def test_impute(self): - imputed_X = self.brits.impute(TEST_SET) - assert not np.isnan( - imputed_X - ).any(), "Output still has missing values after running impute()." 
- test_MAE = cal_mae( - imputed_X, DATA["test_X_intact"], DATA["test_X_indicating_mask"] - ) - logger.info(f"BRITS test_MAE: {test_MAE}") - class TestLOCF(unittest.TestCase): - def setUp(self) -> None: - logger.info("Running test cases for LOCF...") - self.locf = LOCF(nan=0) - - def test_parameters(self): - assert hasattr(self.locf, "nan") and self.locf.nan is not None + logger.info("Running tests for an imputation model LOCF...") + locf = LOCF(nan=0) - def test_impute(self): + def test_0_impute(self): test_X_imputed = self.locf.impute(TEST_SET) assert not np.isnan( test_X_imputed @@ -160,6 +165,9 @@ def test_impute(self): ) logger.info(f"LOCF test_MAE: {test_MAE}") + def test_1_parameters(self): + assert hasattr(self.locf, "nan") and self.locf.nan is not None + if __name__ == "__main__": unittest.main() From 13a7cd15c123136a64fcda780805cade4327f87c Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Fri, 31 Mar 2023 23:56:40 +0800 Subject: [PATCH 19/22] fix: use annotation @pytest.mark.xdist_group to help pytest-dist execute tasks sequentially; Some test tasks need to be executed sequentially, but we're using pytest-dist to accelerate the testing precess. To solve this problem, refer to https://github.com/pytest-dev/pytest-xdist/issues/385#issuecomment-1304877301. And please note that it need pytest-dist >= v2.5.0. --- .github/workflows/testing.yml | 14 +++--- pypots/tests/environment_test.yml | 2 +- pypots/tests/test_classification.py | 13 +++++- pypots/tests/test_clustering.py | 9 +++- ...lazy_loading_from_file.py => test_data.py} | 13 ++++-- pypots/tests/test_forecasting.py | 2 + pypots/tests/test_imputation.py | 14 +++++- pypots/tests/test_logging.py | 43 +++++++++++-------- 8 files changed, 78 insertions(+), 32 deletions(-) rename pypots/tests/{test_data_lazy_loading_from_file.py => test_data.py} (87%) diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml index 6c201f5f..50cba726 100644 --- a/.github/workflows/testing.yml +++ b/.github/workflows/testing.yml @@ -43,13 +43,13 @@ jobs: - name: Test with pytest run: | # run tests separately here due to Segmentation Fault in test_clustering when run all in - # one command with `pytest` on MacOS. Bugs not catched, so this is a trade-off to avoid SF. - python -m pytest -rA pypots/tests/test_classification.py -n auto --cov=pypots - python -m pytest -rA pypots/tests/test_imputation.py -n auto --cov=pypots --cov-append - python -m pytest -rA pypots/tests/test_clustering.py -n auto --cov=pypots --cov-append - python -m pytest -rA pypots/tests/test_forecasting.py -n auto --cov=pypots --cov-append - python -m pytest -rA pypots/tests/test_data_lazy_loading_from_file.py -n auto --cov=pypots --cov-append - python -m pytest -rA pypots/tests/test_logging.py -n auto --cov=pypots --cov-append + # one command with `pytest` on MacOS. Bugs not caught, so this is a trade-off to avoid SF. 
+ python -m pytest -rA pypots/tests/test_classification.py -n auto --cov=pypots --dist=loadgroup + python -m pytest -rA pypots/tests/test_imputation.py -n auto --cov=pypots --cov-append --dist=loadgroup + python -m pytest -rA pypots/tests/test_clustering.py -n auto --cov=pypots --cov-append --dist=loadgroup + python -m pytest -rA pypots/tests/test_forecasting.py -n auto --cov=pypots --cov-append --dist=loadgroup + python -m pytest -rA pypots/tests/test_data_lazy_loading_from_file.py -n auto --cov=pypots --cov-append --dist=loadgroup + python -m pytest -rA pypots/tests/test_logging.py -n auto --cov=pypots --cov-append --dist=loadgroup - name: Generate the LCOV report run: | diff --git a/pypots/tests/environment_test.yml b/pypots/tests/environment_test.yml index dc4e3316..ceadbe60 100644 --- a/pypots/tests/environment_test.yml +++ b/pypots/tests/environment_test.yml @@ -14,7 +14,7 @@ dependencies: - conda-forge::tensorboard - conda-forge::pip - conda-forge::pytest-cov - - conda-forge::pytest-xdist + - conda-forge::pytest-xdist>=2.5.0 - conda-forge::coverage - conda-forge::pycorruptor - conda-forge::tsdb diff --git a/pypots/tests/test_classification.py b/pypots/tests/test_classification.py index b57b9d37..36f48484 100644 --- a/pypots/tests/test_classification.py +++ b/pypots/tests/test_classification.py @@ -7,10 +7,12 @@ import unittest +import pytest + from pypots.classification import BRITS, GRUD, Raindrop from pypots.tests.unified_data_for_test import DATA -from pypots.utils.metrics import cal_binary_classification_metrics from pypots.utils.logging import logger +from pypots.utils.metrics import cal_binary_classification_metrics EPOCHS = 5 @@ -31,9 +33,11 @@ class TestBRITS(unittest.TestCase): epochs=EPOCHS, ) + @pytest.mark.xdist_group(name="classification-brits") def test_0_fit(self): self.brits.fit(TRAIN_SET, VAL_SET) + @pytest.mark.xdist_group(name="classification-brits") def test_1_classify(self): predictions = self.brits.classify(TEST_SET) metrics = cal_binary_classification_metrics(predictions, DATA["test_y"]) @@ -46,6 +50,7 @@ def test_1_classify(self): ) assert metrics["roc_auc"] >= 0.5, "ROC-AUC < 0.5" + @pytest.mark.xdist_group(name="classification-brits") def test_2_parameters(self): assert hasattr(self.brits, "model") and self.brits.model is not None @@ -72,9 +77,11 @@ class TestGRUD(unittest.TestCase): epochs=EPOCHS, ) + @pytest.mark.xdist_group(name="classification-grud") def test_0_fit(self): self.grud.fit(TRAIN_SET, VAL_SET) + @pytest.mark.xdist_group(name="classification-grud") def test_1_classify(self): predictions = self.grud.classify(TEST_SET) metrics = cal_binary_classification_metrics(predictions, DATA["test_y"]) @@ -87,6 +94,7 @@ def test_1_classify(self): ) assert metrics["roc_auc"] >= 0.5, "ROC-AUC < 0.5" + @pytest.mark.xdist_group(name="classification-grud") def test_2_parameters(self): assert hasattr(self.grud, "model") and self.grud.model is not None @@ -121,9 +129,11 @@ class TestRaindrop(unittest.TestCase): epochs=EPOCHS, ) + @pytest.mark.xdist_group(name="classification-raindrop") def test_0_fit(self): self.raindrop.fit(TRAIN_SET, VAL_SET) + @pytest.mark.xdist_group(name="classification-raindrop") def test_1_classify(self): predictions = self.raindrop.classify(TEST_SET) metrics = cal_binary_classification_metrics(predictions, DATA["test_y"]) @@ -136,6 +146,7 @@ def test_1_classify(self): ) assert metrics["roc_auc"] >= 0.5, "ROC-AUC < 0.5" + @pytest.mark.xdist_group(name="classification-raindrop") def test_2_parameters(self): assert hasattr(self.raindrop, 
"model") and self.raindrop.model is not None diff --git a/pypots/tests/test_clustering.py b/pypots/tests/test_clustering.py index 3696d2b5..15b00736 100644 --- a/pypots/tests/test_clustering.py +++ b/pypots/tests/test_clustering.py @@ -9,10 +9,11 @@ import unittest import numpy as np +import pytest from pypots.clustering import VaDER, CRLI -from pypots.utils.logging import logger from pypots.tests.unified_data_for_test import DATA +from pypots.utils.logging import logger from pypots.utils.metrics import cal_rand_index, cal_cluster_purity EPOCHS = 5 @@ -35,9 +36,11 @@ class TestCRLI(unittest.TestCase): epochs=EPOCHS, ) + @pytest.mark.xdist_group(name="clustering-crli") def test_0_fit(self): self.crli.fit(TRAIN_SET) + @pytest.mark.xdist_group(name="clustering-crli") def test_1_parameters(self): assert hasattr(self.crli, "model") and self.crli.model is not None @@ -52,6 +55,7 @@ def test_1_parameters(self): and self.crli.best_model_dict is not None ) + @pytest.mark.xdist_group(name="clustering-crli") def test_2_cluster(self): clustering = self.crli.cluster(TEST_SET) RI = cal_rand_index(clustering, DATA["test_y"]) @@ -73,9 +77,11 @@ class TestVaDER(unittest.TestCase): epochs=EPOCHS, ) + @pytest.mark.xdist_group(name="clustering-vader") def test_0_fit(self): self.vader.fit(TRAIN_SET) + @pytest.mark.xdist_group(name="clustering-vader") def test_1_cluster(self): try: clustering = self.vader.cluster(TEST_SET) @@ -88,6 +94,7 @@ def test_1_cluster(self): "Got singular matrix, please try to retrain the model to fix this" ) + @pytest.mark.xdist_group(name="clustering-vader") def test_2_parameters(self): assert hasattr(self.vader, "model") and self.vader.model is not None diff --git a/pypots/tests/test_data_lazy_loading_from_file.py b/pypots/tests/test_data.py similarity index 87% rename from pypots/tests/test_data_lazy_loading_from_file.py rename to pypots/tests/test_data.py index 77b7faca..eff222a9 100644 --- a/pypots/tests/test_data_lazy_loading_from_file.py +++ b/pypots/tests/test_data.py @@ -9,6 +9,7 @@ import unittest import h5py +import pytest from pypots.classification import BRITS, GRUD from pypots.imputation import SAITS @@ -92,15 +93,21 @@ def setUp(self) -> None: assert os.path.exists(IMPUTATION_TRAIN_SET) assert os.path.exists(IMPUTATION_VAL_SET) - def test_DatasetForMIT(self): + @pytest.mark.xdist_group(name="data-lazy-loading") + def test_0_DatasetForMIT(self): self.saits.fit(train_set=IMPUTATION_TRAIN_SET, val_set=IMPUTATION_VAL_SET) + + @pytest.mark.xdist_group(name="data-lazy-loading") + def test_1_BaseDataset(self): _ = self.saits.impute(X=TEST_SET) - def test_DatasetForBRITS(self): + @pytest.mark.xdist_group(name="data-lazy-loading") + def test_2_DatasetForBRITS(self): self.brits.fit(train_set=TRAIN_SET, val_set=VAL_SET) _ = self.brits.classify(X=TEST_SET) - def test_DatasetForGRUD(self): + @pytest.mark.xdist_group(name="data-lazy-loading") + def test_3_DatasetForGRUD(self): self.grud.fit(train_set=TRAIN_SET, val_set=VAL_SET) _ = self.grud.classify(X=TEST_SET) diff --git a/pypots/tests/test_forecasting.py b/pypots/tests/test_forecasting.py index f44d7207..7a6bed4d 100644 --- a/pypots/tests/test_forecasting.py +++ b/pypots/tests/test_forecasting.py @@ -8,6 +8,7 @@ import unittest import numpy as np +import pytest from pypots.forecasting import BTTF from pypots.tests.unified_data_for_test import gene_random_walk_data @@ -34,6 +35,7 @@ class TestBTTF(unittest.TestCase): 5, ) + @pytest.mark.xdist_group(name="forecasting-bttf") def test_0_forecasting(self): predictions = 
self.bttf.forecast(TEST_SET) mae = cal_mae(predictions, DATA["test_X_intact"][:, 100:]) diff --git a/pypots/tests/test_imputation.py b/pypots/tests/test_imputation.py index 209b50f4..34d75153 100644 --- a/pypots/tests/test_imputation.py +++ b/pypots/tests/test_imputation.py @@ -9,6 +9,7 @@ import unittest import numpy as np +import pytest from pypots.imputation import ( SAITS, @@ -17,8 +18,8 @@ LOCF, ) from pypots.tests.unified_data_for_test import DATA -from pypots.utils.metrics import cal_mae from pypots.utils.logging import logger +from pypots.utils.metrics import cal_mae EPOCH = 5 @@ -44,9 +45,11 @@ class TestSAITS(unittest.TestCase): epochs=EPOCH, ) + @pytest.mark.xdist_group(name="imputation-saits") def test_0_fit(self): self.saits.fit(TRAIN_SET, VAL_SET) + @pytest.mark.xdist_group(name="imputation-saits") def test_1_impute(self): imputed_X = self.saits.impute(TEST_SET) assert not np.isnan( @@ -57,6 +60,7 @@ def test_1_impute(self): ) logger.info(f"SAITS test_MAE: {test_MAE}") + @pytest.mark.xdist_group(name="imputation-saits") def test_2_parameters(self): assert hasattr(self.saits, "model") and self.saits.model is not None @@ -88,9 +92,11 @@ class TestTransformer(unittest.TestCase): epochs=EPOCH, ) + @pytest.mark.xdist_group(name="imputation-transformer") def test_0_fit(self): self.transformer.fit(TRAIN_SET, VAL_SET) + @pytest.mark.xdist_group(name="imputation-transformer") def test_1_impute(self): imputed_X = self.transformer.impute(TEST_SET) assert not np.isnan( @@ -101,6 +107,7 @@ def test_1_impute(self): ) logger.info(f"Transformer test_MAE: {test_MAE}") + @pytest.mark.xdist_group(name="imputation-transformer") def test_2_parameters(self): assert hasattr(self.transformer, "model") and self.transformer.model is not None @@ -124,9 +131,11 @@ class TestBRITS(unittest.TestCase): # initialize a BRITS model brits = BRITS(DATA["n_steps"], DATA["n_features"], 256, epochs=EPOCH) + @pytest.mark.xdist_group(name="imputation-brits") def test_0_fit(self): self.brits.fit(TRAIN_SET, VAL_SET) + @pytest.mark.xdist_group(name="imputation-brits") def test_1_impute(self): imputed_X = self.brits.impute(TEST_SET) assert not np.isnan( @@ -137,6 +146,7 @@ def test_1_impute(self): ) logger.info(f"BRITS test_MAE: {test_MAE}") + @pytest.mark.xdist_group(name="imputation-brits") def test_2_parameters(self): assert hasattr(self.brits, "model") and self.brits.model is not None @@ -155,6 +165,7 @@ class TestLOCF(unittest.TestCase): logger.info("Running tests for an imputation model LOCF...") locf = LOCF(nan=0) + @pytest.mark.xdist_group(name="imputation-locf") def test_0_impute(self): test_X_imputed = self.locf.impute(TEST_SET) assert not np.isnan( @@ -165,6 +176,7 @@ def test_0_impute(self): ) logger.info(f"LOCF test_MAE: {test_MAE}") + @pytest.mark.xdist_group(name="imputation-locf") def test_1_parameters(self): assert hasattr(self.locf, "nan") and self.locf.nan is not None diff --git a/pypots/tests/test_logging.py b/pypots/tests/test_logging.py index 3ebc3fca..f3c888fe 100644 --- a/pypots/tests/test_logging.py +++ b/pypots/tests/test_logging.py @@ -13,30 +13,37 @@ class TestLogger(unittest.TestCase): - def setUp(self) -> None: - self.logger_creator = Logger(name="PyPOTS testing log", logging_level="debug") - self.logger = self.logger_creator.logger + logger_creator = Logger(name="PyPOTS testing log", logging_level="debug") + logger = logger_creator.logger def test_different_level_logging(self): - self.logger.debug('debug') - self.logger.info('info') - self.logger.warning('warning') - 
self.logger.error('error') + self.logger.debug("debug") + self.logger.info("info") + self.logger.warning("warning") + self.logger.error("error") def test_changing_level(self): - self.logger_creator.set_level('info') - assert self.logger.level == 20, f'the level of logger is {self.logger.level}, not INFO' - self.logger_creator.set_level('warning') - assert self.logger.level == 30, f'the level of logger is {self.logger.level}, not WARNING' - self.logger_creator.set_level('error') - assert self.logger.level == 40, f'the level of logger is {self.logger.level}, not ERROR' - self.logger_creator.set_level('debug') - assert self.logger.level == 10, f'the level of logger is {self.logger.level}, not DEBUG' + self.logger_creator.set_level("info") + assert ( + self.logger.level == 20 + ), f"the level of logger is {self.logger.level}, not INFO" + self.logger_creator.set_level("warning") + assert ( + self.logger.level == 30 + ), f"the level of logger is {self.logger.level}, not WARNING" + self.logger_creator.set_level("error") + assert ( + self.logger.level == 40 + ), f"the level of logger is {self.logger.level}, not ERROR" + self.logger_creator.set_level("debug") + assert ( + self.logger.level == 10 + ), f"the level of logger is {self.logger.level}, not DEBUG" def test_saving_log_into_file(self): - self.logger_creator.set_saving_path('test_log', 'testing.log') - assert os.path.exists('test_log/testing.log') - shutil.rmtree('test_log', ignore_errors=True) + self.logger_creator.set_saving_path("test_log", "testing.log") + assert os.path.exists("test_log/testing.log") + shutil.rmtree("test_log", ignore_errors=True) if __name__ == "__main__": From 9ad9c7ea775bfa40ae469f5bd63006f7790f5009 Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Sat, 1 Apr 2023 00:00:23 +0800 Subject: [PATCH 20/22] fix: fix some warnings while running VaDER; --- pypots/clustering/vader.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pypots/clustering/vader.py b/pypots/clustering/vader.py index 128743a4..9a7a0e1f 100644 --- a/pypots/clustering/vader.py +++ b/pypots/clustering/vader.py @@ -103,7 +103,7 @@ def set_values(self, mu, var, phi): assert phi.shape == self.phi_c_unscaled.shape self.mu_c_unscaled = torch.nn.Parameter(mu) self.var_c_unscaled = torch.nn.Parameter(var) - self.phi_c_unscaled = torch.tensor(phi) + self.phi_c_unscaled = phi def forward(self): mu_c = self.mu_c_unscaled @@ -293,6 +293,7 @@ def forward(self, inputs, pretrain=False): ii, jj = torch.meshgrid( torch.arange(self.n_clusters, dtype=torch.int64, device=device), torch.arange(batch_size, dtype=torch.int64, device=device), + indexing="ij", ) ii = ii.flatten() jj = jj.flatten() From e7bee57223abc2bb8aaa065092d309f75c5f86c0 Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Sat, 1 Apr 2023 01:02:24 +0800 Subject: [PATCH 21/22] fix: move dataset saving into test steps; --- pypots/tests/test_data.py | 47 +++++++++++++++++---------------------- 1 file changed, 20 insertions(+), 27 deletions(-) diff --git a/pypots/tests/test_data.py b/pypots/tests/test_data.py index eff222a9..bf2c238d 100644 --- a/pypots/tests/test_data.py +++ b/pypots/tests/test_data.py @@ -33,22 +33,6 @@ def save_data_set_into_h5(data, path): EPOCHS = 1 -save_data_set_into_h5( - {"X": DATA["train_X"], "y": DATA["train_y"].astype(int)}, TRAIN_SET -) -save_data_set_into_h5({"X": DATA["val_X"], "y": DATA["val_y"].astype(int)}, VAL_SET) -save_data_set_into_h5({"X": DATA["train_X"]}, IMPUTATION_TRAIN_SET) -save_data_set_into_h5({"X": DATA["val_X"]}, IMPUTATION_VAL_SET) - 
-save_data_set_into_h5( - { - "X": DATA["test_X"], - "X_intact": DATA["test_X_intact"], - "X_indicating_mask": DATA["test_X_indicating_mask"], - }, - TEST_SET, -) - class TestLazyLoadingClasses(unittest.TestCase): logger.info("Running tests for Dataset classes with lazy-loading strategy...") @@ -85,20 +69,29 @@ class TestLazyLoadingClasses(unittest.TestCase): epochs=EPOCHS, ) - def setUp(self) -> None: - assert os.path.exists(TRAIN_SET) - assert os.path.exists(VAL_SET) - assert os.path.exists(TEST_SET) - - assert os.path.exists(IMPUTATION_TRAIN_SET) - assert os.path.exists(IMPUTATION_VAL_SET) - @pytest.mark.xdist_group(name="data-lazy-loading") - def test_0_DatasetForMIT(self): - self.saits.fit(train_set=IMPUTATION_TRAIN_SET, val_set=IMPUTATION_VAL_SET) + def test_0_save_datasets_into_files(self): + save_data_set_into_h5( + {"X": DATA["train_X"], "y": DATA["train_y"].astype(int)}, TRAIN_SET + ) + save_data_set_into_h5( + {"X": DATA["val_X"], "y": DATA["val_y"].astype(int)}, VAL_SET + ) + save_data_set_into_h5({"X": DATA["train_X"]}, IMPUTATION_TRAIN_SET) + save_data_set_into_h5({"X": DATA["val_X"]}, IMPUTATION_VAL_SET) + + save_data_set_into_h5( + { + "X": DATA["test_X"], + "X_intact": DATA["test_X_intact"], + "X_indicating_mask": DATA["test_X_indicating_mask"], + }, + TEST_SET, + ) @pytest.mark.xdist_group(name="data-lazy-loading") - def test_1_BaseDataset(self): + def test_1_DatasetForMIT_BaseDataset(self): + self.saits.fit(train_set=IMPUTATION_TRAIN_SET, val_set=IMPUTATION_VAL_SET) _ = self.saits.impute(X=TEST_SET) @pytest.mark.xdist_group(name="data-lazy-loading") From 235c6070268b5041570e4997c90142d3f435075d Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Sat, 1 Apr 2023 01:17:31 +0800 Subject: [PATCH 22/22] fix: the error file name of test_data.py; --- .github/workflows/testing.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml index 50cba726..41b70c44 100644 --- a/.github/workflows/testing.yml +++ b/.github/workflows/testing.yml @@ -48,7 +48,7 @@ jobs: python -m pytest -rA pypots/tests/test_imputation.py -n auto --cov=pypots --cov-append --dist=loadgroup python -m pytest -rA pypots/tests/test_clustering.py -n auto --cov=pypots --cov-append --dist=loadgroup python -m pytest -rA pypots/tests/test_forecasting.py -n auto --cov=pypots --cov-append --dist=loadgroup - python -m pytest -rA pypots/tests/test_data_lazy_loading_from_file.py -n auto --cov=pypots --cov-append --dist=loadgroup + python -m pytest -rA pypots/tests/test_data.py -n auto --cov=pypots --cov-append --dist=loadgroup python -m pytest -rA pypots/tests/test_logging.py -n auto --cov=pypots --cov-append --dist=loadgroup - name: Generate the LCOV report