Merge pull request #39 from WenjieDu/dev

Merge `dev` into `main`
WenjieDu · Mar 31, 2023 · f64dda9 · f64dda9
2 parents 7cdd393 + 235c607
commit f64dda9
Show file tree

Hide file tree

Showing 33 changed files with 1,560 additions and 665 deletions.
diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml
@@ -43,10 +43,13 @@ jobs:
             - name: Test with pytest
               run: |
                   # run tests separately here due to Segmentation Fault in test_clustering when run all in 
-                  # one command with `pytest` on MacOS. Bugs not catched, so this is a trade-off to avoid SF.
-                  python -m pytest -rA pypots/tests/test_classification.py -n auto --cov=pypots 
-                  python -m pytest -rA pypots/tests/test_imputation.py -n auto --cov=pypots --cov-append  
-                  python -m pytest -rA pypots/tests/test_clustering.py -n auto --cov=pypots --cov-append
+                  # one command with `pytest` on MacOS. Bugs not caught, so this is a trade-off to avoid SF.
+                  python -m pytest -rA pypots/tests/test_classification.py -n auto --cov=pypots --dist=loadgroup 
+                  python -m pytest -rA pypots/tests/test_imputation.py -n auto --cov=pypots --cov-append --dist=loadgroup
+                  python -m pytest -rA pypots/tests/test_clustering.py -n auto --cov=pypots --cov-append --dist=loadgroup
+                  python -m pytest -rA pypots/tests/test_forecasting.py -n auto --cov=pypots --cov-append --dist=loadgroup
+                  python -m pytest -rA pypots/tests/test_data.py -n auto --cov=pypots --cov-append --dist=loadgroup
+                  python -m pytest -rA pypots/tests/test_logging.py -n auto --cov=pypots --cov-append --dist=loadgroup
 
             - name: Generate the LCOV report
               run: |

diff --git a/CITATION.cff b/CITATION.cff
@@ -5,7 +5,7 @@ authors:
   given-names: "Wenjie"
   orcid: "https://orcid.org/0000-0003-3046-7835"
 title: "PyPOTS: A Python Toolbox for Data Mining on Partially-Observed Time Series"
-version: 0.0.7
-doi: 10.5281/zenodo.6823222
+version: 0.0.9
+doi: 10.5281/zenodo.6823221
 date-released: 2022-07-12
 url: "https://github.com/WenjieDu/PyPOTS"
diff --git a/README.md b/README.md
@@ -5,44 +5,46 @@
 
 <p align='center'>
     <!-- Python version -->
-    <img src='https://img.shields.io/badge/python-v3-yellow'>
+    <img src='https://img.shields.io/badge/python-v3-yellow?color=a4e2c6'>
     <!-- PyTorch-->
-    <img src='https://img.shields.io/static/v1?label=PyTorch&message=%E2%9D%A4%EF%B8%8F&color=DC583A&logo=pytorch'>
+    <img src='https://img.shields.io/static/v1?label=PyTorch&message=%E2%9D%A4%EF%B8%8F&color=7bcfa6&logo=pytorch'>
     <!-- PyPI version -->
     <a alt='PyPI download number' href='https://pypi.org/project/pypots'>
-        <img alt="PyPI" src="https://img.shields.io/pypi/v/pypots?color=yellowgreen&label=PyPI&logo=pypi&logoColor=white">
+        <img alt="PyPI" src="https://img.shields.io/pypi/v/pypots?color=7fecad&label=PyPI&logo=pypi&logoColor=white">
     </a>
     <!-- on Anaconda -->
     <a alt='on anaconda' href='https://anaconda.org/conda-forge/pypots'>
-        <img alt="on anaconda" src="https://img.shields.io/conda/vn/conda-forge/pypots?color=green&label=Conda&logo=anaconda" />
+        <img alt="on anaconda" src="https://img.shields.io/conda/pn/conda-forge/pypots?color=3de1ad&label=Conda&logo=anaconda" />
     </a>
     <!-- License -->
     <a alt='GPL3 license' href='https://github.com/WenjieDu/PyPOTS/blob/main/LICENSE'>
-        <img src='https://img.shields.io/badge/License-GPL--v3-green?color=79C641'>
+        <img src='https://img.shields.io/badge/License-GPL--v3-00e09e'>
     </a>
-    <!-- GitHub Testing -->
-    <a alt='GitHub Testing' href='https://github.com/WenjieDu/PyPOTS/actions/workflows/testing.yml'> 
-        <img src='https://github.com/WenjieDu/PyPOTS/actions/workflows/testing.yml/badge.svg'>
+    <!-- Repo size -->
+    <img src="https://img.shields.io/github/repo-size/WenjieDu/PyPOTS?color=48c0a3">
+    <!-- Code of Conduct -->
+    <a alt='CODE_OF_CONDUCT' href='https://github.com/WenjieDu/PyPOTS/blob/main/CODE_OF_CONDUCT.md'> 
+        <img src='https://img.shields.io/badge/Contributor%20Covenant-v2.1-21a675.svg'>
     </a>
-    <!-- Coveralls report -->
-    <a alt='Coveralls report' href='https://coveralls.io/github/WenjieDu/PyPOTS'> 
-        <img src='https://img.shields.io/coverallsCoverage/github/WenjieDu/PyPOTS?branch=main&logo=coveralls&labelColor=3F5767'>
+    <!-- Slack Workspace -->
+    <a alt='Slack Workspace' href='https://join.slack.com/t/pypots-dev/shared_invite/zt-1gq6ufwsi-p0OZdW~e9UW_IA4_f1OfxA'> 
+        <img src='https://img.shields.io/badge/Slack-PyPOTS-grey?logo=slack&color=549688'>
+    </a>
+    <!-- Zenodo DOI -->
+    <a alt='Zenodo DOI' href='https://doi.org/10.5281/zenodo.6823221'>
+        <img src='https://zenodo.org/badge/DOI/10.5281/zenodo.6823221.svg'>
     </a>
     <!-- PyPI download number -->
     <a alt='PyPI download number' href='https://pepy.tech/project/pypots'>
-        <img src='https://static.pepy.tech/personalized-badge/pypots?period=total&units=international_system&left_color=grey&right_color=blue&left_text=Downloads'>
+        <img src='https://static.pepy.tech/personalized-badge/pypots?period=total&units=international_system&left_color=grey&right_color=navy&left_text=Downloads'>
     </a>
-    <!-- Zenodo DOI -->
-    <a alt='Zenodo DOI' href='https://zenodo.org/badge/latestdoi/475477908'>
-        <img src='https://zenodo.org/badge/475477908.svg'>
-    </a>
-    <!-- Code of Conduct -->
-    <a alt='CODE_OF_CONDUCT' href='https://github.com/WenjieDu/PyPOTS/blob/main/CODE_OF_CONDUCT.md'> 
-        <img src='https://img.shields.io/badge/Contributor%20Covenant-v2.1-4baaaa.svg'>
+    <!-- GitHub Testing -->
+    <a alt='GitHub Testing' href='https://github.com/WenjieDu/PyPOTS/actions/workflows/testing.yml'> 
+        <img src='https://github.com/WenjieDu/PyPOTS/actions/workflows/testing.yml/badge.svg'>
     </a>
-    <!-- Slack Workspace -->
-    <a alt='Slack Workspace' href='https://join.slack.com/t/pypots-dev/shared_invite/zt-1gq6ufwsi-p0OZdW~e9UW_IA4_f1OfxA'> 
-        <img src='https://img.shields.io/badge/Slack-PyPOTS-grey?logo=slack&labelColor=4A154B&color=62BCE5'>
+    <!-- Coveralls report -->
+    <a alt='Coveralls report' href='https://coveralls.io/github/WenjieDu/PyPOTS'> 
+        <img src='https://img.shields.io/coverallsCoverage/github/WenjieDu/PyPOTS?branch=main&logo=coveralls&labelColor=0aa344'>
     </a>
 </p>
 
@@ -64,6 +66,9 @@ Install it with `conda install pypots`, you may need to specify the channel with
 Install the latest release from PyPI:
 > pip install pypots
 
+or install from the source code to get the latest features that have not yet been officially released:
+> pip install `https://github.com/WenjieDu/PyPOTS/archive/main.zip`
+
 <details open>
 <summary><b>Below is an example applying SAITS in PyPOTS to impute missing values in the dataset PhysioNet2012:</b></summary>
 
@@ -82,10 +87,11 @@ X = StandardScaler().fit_transform(X.to_numpy())
 X = X.reshape(num_samples, 48, -1)
 X_intact, X, missing_mask, indicating_mask = mcar(X, 0.1) # hold out 10% observed values as ground truth
 X = masked_fill(X, 1 - missing_mask, np.nan)
+dataset = {"X": X}
 # Model training. This is PyPOTS showtime. 💪
 saits = SAITS(n_steps=48, n_features=37, n_layers=2, d_model=256, d_inner=128, n_head=4, d_k=64, d_v=64, dropout=0.1, epochs=10)
-saits.fit(X)  # train the model. Here I use the whole dataset as the training set, because ground truth is not visible to the model.
-imputation = saits.impute(X)  # impute the originally-missing values and artificially-missing values
+saits.fit(dataset)  # train the model. Here I use the whole dataset as the training set, because ground truth is not visible to the model.
+imputation = saits.impute(dataset)  # impute the originally-missing values and artificially-missing values
 mae = cal_mae(imputation, X_intact, indicating_mask)  # calculate mean absolute error on the ground truth (artificially-missing values)
 ```
 </details>
@@ -112,13 +118,13 @@ author = {Wenjie Du},
 title = {{PyPOTS: A Python Toolbox for Data Mining on Partially-Observed Time Series}},
 howpublished = {\url{https://github.com/wenjiedu/pypots}},
 year = {2022},
-doi = {10.5281/zenodo.6823222},
+doi = {10.5281/zenodo.6823221},
 }
 ```
 
 or
 
-`Wenjie Du. (2022). PyPOTS: A Python Toolbox for Data Mining on Partially-Observed Time Series. Zenodo. https://doi.org/10.5281/zenodo.6823222`
+`Wenjie Du. (2022). PyPOTS: A Python Toolbox for Data Mining on Partially-Observed Time Series. Zenodo. https://doi.org/10.5281/zenodo.6823221`
 
 ## ❖ Attention 👀
 The documentation and tutorials are under construction. And a short paper introducing PyPOTS is on the way! 🚀 Stay tuned please!

diff --git a/environment.yml b/environment.yml
@@ -9,10 +9,9 @@ dependencies:
     - conda-forge::scipy
     - conda-forge::pandas
     - conda-forge::scikit-learn
-    - conda-forge::matplotlib
     - conda-forge::tensorboard
     - conda-forge::pip
-    - pytorch::pytorch==1.11.0
-    - pip:
-        - pycorruptor==0.0.4
-        - tsdb==0.0.7
+    - conda-forge::pycorruptor
+    - conda-forge::tsdb
+    - conda-forge::h5py
+    - pytorch::pytorch==1.11.0
diff --git a/pypots/__version__.py b/pypots/__version__.py
@@ -21,4 +21,4 @@
 # Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer.
 # 'X.Y.dev0' is the canonical version of 'X.Y.dev'
 
-version = "0.0.9"
+version = "0.0.10"
diff --git a/pypots/base.py b/pypots/base.py
@@ -8,7 +8,6 @@
 import os
 from abc import ABC
 
-import numpy as np
 import torch
 
 from pypots.utils.files import create_dir_if_not_exist
@@ -32,101 +31,6 @@ def __init__(self, device):
         else:
             self.device = device
 
-    def check_input(
-        self, expected_n_steps, expected_n_features, X, y=None, out_dtype="tensor"
-    ):
-        """Check value type and shape of input X and y
-
-        Parameters
-        ----------
-        expected_n_steps : int
-            Number of time steps of input time series (X) that the model expects.
-            This value is the same with the argument `n_steps` used to initialize the model.
-
-        expected_n_features : int
-            Number of feature dimensions of input time series (X) that the model expects.
-            This value is the same with the argument `n_features` used to initialize the model.
-
-        X : array-like,
-            Time-series data that must have a shape like [n_samples, expected_n_steps, expected_n_features].
-
-        y : array-like, default=None
-            Labels of time-series samples (X) that must have a shape like [n_samples] or [n_samples, n_classes].
-
-        out_dtype : str, in ['tensor', 'ndarray'], default='tensor'
-            Data type of the output, should be np.ndarray or torch.Tensor
-
-        Returns
-        -------
-        X : tensor
-
-        y : tensor
-        """
-        assert out_dtype in [
-            "tensor",
-            "ndarray",
-        ], f'out_dtype should be "tensor" or "ndarray", but got {out_dtype}'
-        is_list = isinstance(X, list)
-        is_array = isinstance(X, np.ndarray)
-        is_tensor = isinstance(X, torch.Tensor)
-        assert is_tensor or is_array or is_list, TypeError(
-            "X should be an instance of list/np.ndarray/torch.Tensor, "
-            f"but got {type(X)}"
-        )
-
-        # convert the data type if in need
-        if out_dtype == "tensor":
-            if is_list:
-                X = torch.tensor(X).to(self.device)
-            elif is_array:
-                X = torch.from_numpy(X).to(self.device)
-            else:  # is tensor
-                X = X.to(self.device)
-        else:  # out_dtype is ndarray
-            # convert to np.ndarray first for shape check
-            if is_list:
-                X = np.asarray(X)
-            elif is_tensor:
-                X = X.numpy()
-            else:  # is ndarray
-                pass
-
-        # check the shape of X here
-        X_shape = X.shape
-        assert len(X_shape) == 3, (
-            f"input should have 3 dimensions [n_samples, seq_len, n_features],"
-            f"but got shape={X.shape}"
-        )
-        assert (
-            X_shape[1] == expected_n_steps
-        ), f"expect X.shape[1] to be {expected_n_steps}, but got {X_shape[1]}"
-        assert (
-            X_shape[2] == expected_n_features
-        ), f"expect X.shape[2] to be {expected_n_features}, but got {X_shape[2]}"
-
-        if y is not None:
-            assert len(X) == len(y), (
-                f"lengths of X and y must match, " f"but got f{len(X)} and {len(y)}"
-            )
-            if isinstance(y, torch.Tensor):
-                y = y.to(self.device) if out_dtype == "tensor" else y.numpy()
-            elif isinstance(y, list):
-                y = (
-                    torch.tensor(y).to(self.device)
-                    if out_dtype == "tensor"
-                    else np.asarray(y)
-                )
-            elif isinstance(y, np.ndarray):
-                y = torch.from_numpy(y).to(self.device) if out_dtype == "tensor" else y
-            else:
-                raise TypeError(
-                    "y should be an instance of list/np.ndarray/torch.Tensor, "
-                    f"but got {type(y)}"
-                )
-            return X, y
-        else:
-            return X
-
     def save_logs_to_tensorboard(self, saving_path):
         """Save logs (self.logger) into a tensorboard file.
 

diff --git a/pypots/classification/base.py b/pypots/classification/base.py
@@ -22,19 +22,31 @@ def __init__(self, device):
         super().__init__(device)
 
     @abstractmethod
-    def fit(self, train_X, train_y, val_X=None, val_y=None):
-        """Train the classifier.
+    def fit(self, train_set, val_set=None, file_type="h5py"):
+        """Train the classifier on the given data.
 
         Parameters
         ----------
-        train_X : array-like of shape [n_samples, sequence length (time steps), n_features],
-            Time-series data for training, can contain missing values.
-        train_y : array,
-            Classification labels for training.
-        val_X : array-like of shape [n_samples, sequence length (time steps), n_features],
-            Time-series data for validation, can contain missing values.
-        val_y : array,
-            Classification labels for validation.
+        train_set : dict or str,
+            The dataset for model training, should be a dictionary including keys as 'X' and 'y',
+            or a path string locating a data file.
+            If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features],
+            which is time-series data for training, can contain missing values, and y should be array-like of shape
+            [n_samples], which is classification labels of X.
+            If it is a path string, the path should point to a data file, e.g. a h5 file, which contains
+            key-value pairs like a dict, and it has to include keys as 'X' and 'y'.
+
+        val_set : dict or str, default = None,
+            The dataset for model validation, should be a dictionary containing the keys 'X' and 'y',
+            or a path string locating a data file.
+            If it is a dict, X should be array-like of shape [n_samples, sequence length (time steps), n_features],
+            which is time-series data for validating, can contain missing values, and y should be array-like of shape
+            [n_samples], which is classification labels of X.
+            If it is a path string, the path should point to a data file, e.g. a h5 file, which contains
+            key-value pairs like a dict, and it has to include keys as 'X' and 'y'.
+
+        file_type : str, default = "h5py",
+            The type of the given file if train_set and val_set are path strings.
 
         Returns
         -------
@@ -44,18 +56,22 @@ def fit(self, train_X, train_y, val_X=None, val_y=None):
         return self
 
     @abstractmethod
-    def classify(self, X):
-        """Classify the input with the trained model.
+    def classify(self, X, file_type="h5py"):
+        """Classify the input data with the trained model.
 
         Parameters
         ----------
-        X : array-like of shape [n_samples, sequence length (time steps), n_features],
-            Time-series data contains missing values.
+        X : array-like or str,
+            The data samples for testing, should be array-like of shape [n_samples, sequence length (time steps),
+            n_features], or a path string locating a data file, e.g. h5 file.
+
+        file_type : str, default = "h5py",
+            The type of the given file if X is a path string.
 
         Returns
         -------
-        array-like, shape [n_samples, sequence length (time steps), n_features],
-            Classification results.
+        array-like, shape [n_samples],
+            Classification results of the given samples.
         """
         pass