Merge pull request #64 from kostaleonard/style
Style
kostaleonard authored Apr 16, 2022
2 parents 37bdab7 + 543c90f commit 236d349
Showing 42 changed files with 1,664 additions and 1,358 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/ci.yaml
@@ -13,6 +13,8 @@ jobs:
         run: |
           python -m pip install --upgrade pip
           make install
+      - name: Pre-commit
+        run: pre-commit run --all-files
       - name: Lint
         run: make lint
       - name: Test
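The added Pre-commit step fails the build if any hook reports an error or rewrites a file, so formatting drift is caught before the Lint and Test steps run. The hooks it executes are defined in the .pre-commit-config.yaml introduced below.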
13 changes: 13 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,13 @@
+files: ^mlops/|^tests/
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v2.3.0
+    hooks:
+      - id: check-yaml
+      - id: end-of-file-fixer
+      - id: trailing-whitespace
+  - repo: https://github.com/psf/black
+    rev: 22.3.0
+    hooks:
+      - id: black
+        language_version: python3.9
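The pinned black 22.3.0 hook produces most of the Python changes in this commit: black normalizes string literals to double quotes and rewraps signatures and calls that overflow the line limit. A minimal before/after sketch (illustrative code, not from the repository):

    # Before black
    version = '0.1.7'
    def describe(name,
                 value):
        return f'{name}={value}'

    # After black
    version = "0.1.7"
    def describe(name, value):
        return f"{name}={value}"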
5 changes: 3 additions & 2 deletions .pylintrc
@@ -86,7 +86,8 @@ disable=raw-checker-failed,
         useless-suppression,
         deprecated-pragma,
         use-symbolic-message-instead,
-        duplicate-code
+        duplicate-code,
+        line-too-long
 
 # Enable the message, report, category or checker with the given id(s). You can
 # either give multiple identifier separated by comma (,) or put this option
@@ -282,7 +283,7 @@ indent-after-paren=4
 indent-string='    '
 
 # Maximum number of characters on a single line.
-max-line-length=100
+max-line-length=79
 
 # Maximum number of lines in a module.
 max-module-lines=1000
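With black now enforcing formatting, pylint's own line-length check would only duplicate (or fight) the formatter, so line-too-long is disabled; max-line-length is still tightened from 100 to 79, presumably to match the PEP 8 limit the rest of the tooling targets.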
15 changes: 4 additions & 11 deletions mlops/__init__.py
@@ -1,13 +1,6 @@
-"""mlops is a package for conducting MLOps, including versioning of datasets and
-models."""
+"""mlops is a package for conducting MLOps, including versioning of datasets
+and models."""
 
-__version__ = '0.1.6'
+__version__ = "0.1.7"
 
-__all__ = [
-    '__version__',
-    'artifact',
-    'dataset',
-    'errors',
-    'model',
-    'hashing'
-]
+__all__ = ["__version__", "artifact", "dataset", "errors", "model", "hashing"]
42 changes: 19 additions & 23 deletions mlops/artifact/versioned_artifact.py
@@ -52,7 +52,7 @@ def md5(self) -> str:
         :return: The artifact's MD5 hash.
         """
 
-    def __eq__(self, other: 'VersionedArtifact') -> bool:
+    def __eq__(self, other: "VersionedArtifact") -> bool:
         """Returns True if the two objects have the same loaded MD5 hash code,
         False otherwise.
@@ -81,23 +81,19 @@ def update_metadata(self, updates: dict[str, Any]) -> None:
             a key does not exist in the metadata, it is added; if it does
             exist, its value is overwritten.
         """
-        if self.metadata_path.startswith('s3://'):
+        if self.metadata_path.startswith("s3://"):
             fs = S3FileSystem()
-            with fs.open(self.metadata_path,
-                         'r',
-                         encoding='utf-8') as infile:
+            with fs.open(self.metadata_path, "r", encoding="utf-8") as infile:
                 metadata = json.loads(infile.read())
         else:
-            with open(self.metadata_path, 'r', encoding='utf-8') as infile:
+            with open(self.metadata_path, "r", encoding="utf-8") as infile:
                 metadata = json.loads(infile.read())
         updated_metadata = {**metadata, **updates}
-        if self.metadata_path.startswith('s3://'):
-            with fs.open(self.metadata_path,
-                         'w',
-                         encoding='utf-8') as outfile:
+        if self.metadata_path.startswith("s3://"):
+            with fs.open(self.metadata_path, "w", encoding="utf-8") as outfile:
                 outfile.write(json.dumps(updated_metadata))
         else:
-            with open(self.metadata_path, 'w', encoding='utf-8') as outfile:
+            with open(self.metadata_path, "w", encoding="utf-8") as outfile:
                 outfile.write(json.dumps(updated_metadata))
 
     def republish(self, republication_path: str) -> str:
@@ -114,7 +110,7 @@ def republish(self, republication_path: str) -> str:
             artifacts with the same version.
         :return: The versioned artifact's publication path.
         """
-        if republication_path.startswith('s3://'):
+        if republication_path.startswith("s3://"):
             return self._republish_to_s3(republication_path)
         return self._republish_to_local(republication_path)
@@ -130,7 +126,7 @@ def _republish_to_local(self, republication_path: str) -> str:
         # pylint: disable=protected-access
         publication_path = os.path.join(republication_path, self.version)
         VersionedArtifactBuilder._make_publication_path_local(publication_path)
-        if self.path.startswith('s3://'):
+        if self.path.startswith("s3://"):
             fs = S3FileSystem()
             fs.get(self.path, publication_path, recursive=True)
         else:
@@ -150,18 +146,18 @@ def _republish_to_s3(self, republication_path: str) -> str:
         publication_path = os.path.join(republication_path, self.version)
         fs = S3FileSystem()
         VersionedArtifactBuilder._make_publication_path_s3(
-            publication_path, fs)
-        if self.path.startswith('s3://'):
-            artifact_path_no_prefix = self.path.replace('s3://', '', 1)
-            copy_path_no_prefix = publication_path.replace('s3://', '', 1)
+            publication_path, fs
+        )
+        if self.path.startswith("s3://"):
+            artifact_path_no_prefix = self.path.replace("s3://", "", 1)
+            copy_path_no_prefix = publication_path.replace("s3://", "", 1)
             for current_path, _, filenames in fs.walk(self.path):
-                outfile_prefix = current_path.replace(artifact_path_no_prefix,
-                                                      copy_path_no_prefix, 1)
+                outfile_prefix = current_path.replace(
+                    artifact_path_no_prefix, copy_path_no_prefix, 1
+                )
                 for filename in filenames:
-                    infile_path = os.path.join(current_path,
-                                               filename)
-                    outfile_path = os.path.join(outfile_prefix,
-                                                filename)
+                    infile_path = os.path.join(current_path, filename)
+                    outfile_path = os.path.join(outfile_prefix, filename)
                     fs.copy(infile_path, outfile_path)
         else:
             fs.put(self.path, publication_path, recursive=True)
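The read-merge-write logic in update_metadata reduces to a dictionary merge in which the update wins on key collisions; the {**metadata, **updates} expression above is exactly that. A self-contained sketch of the same merge semantics (standard library only; the class applies it to a metadata JSON file on the local filesystem or S3):

    import json

    def merge_metadata(metadata_json: str, updates: dict) -> str:
        # Keys present in updates overwrite existing keys; new keys are added.
        metadata = json.loads(metadata_json)
        return json.dumps({**metadata, **updates})

    print(merge_metadata('{"version": "v1", "tags": []}', {"tags": ["baseline"]}))
    # {"version": "v1", "tags": ["baseline"]}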
6 changes: 4 additions & 2 deletions mlops/artifact/versioned_artifact_builder.py
@@ -9,6 +9,7 @@
 
 class VersionedArtifactBuilder(ABC):
     """Represents a versioned artifact builder."""
+
     # pylint: disable=too-few-public-methods
 
     @abstractmethod
@@ -43,8 +44,9 @@ def _make_publication_path_local(publication_path: str) -> None:
             raise PublicationPathAlreadyExistsError from err
 
     @staticmethod
-    def _make_publication_path_s3(publication_path: str,
-                                  fs: S3FileSystem) -> None:
+    def _make_publication_path_s3(
+        publication_path: str, fs: S3FileSystem
+    ) -> None:
         """Creates the directories that compose the publication path.
 
         :param publication_path: The path to which to publish the artifact.
61 changes: 37 additions & 24 deletions mlops/dataset/data_processor.py
@@ -9,8 +9,9 @@ class DataProcessor:
     """Transforms a raw dataset into features and labels for downstream model
     training, prediction, etc."""
 
-    def get_preprocessed_features_and_labels(self, dataset_path: str) -> \
-            Tuple[Dict[str, np.ndarray], Dict[str, np.ndarray]]:
+    def get_preprocessed_features_and_labels(
+        self, dataset_path: str
+    ) -> Tuple[Dict[str, np.ndarray], Dict[str, np.ndarray]]:
         """Returns the preprocessed feature and label tensors from the dataset
         path. This method is specifically used for the train/val/test sets and
         not input data for prediction, because in some cases the features and
@@ -23,16 +24,23 @@ def get_preprocessed_features_and_labels(self, dataset_path: str) -> \
         :return: A 2-tuple of the features dictionary and labels dictionary,
             with matching keys and ordered tensors.
         """
-        raw_feature_tensors, raw_label_tensors = \
-            self.get_raw_features_and_labels(dataset_path)
-        features = {name: self.preprocess_features(raw_feature_tensor)
-                    for name, raw_feature_tensor in raw_feature_tensors.items()}
-        labels = {name: self.preprocess_labels(raw_label_tensor)
-                  for name, raw_label_tensor in raw_label_tensors.items()}
+        (
+            raw_feature_tensors,
+            raw_label_tensors,
+        ) = self.get_raw_features_and_labels(dataset_path)
+        features = {
+            name: self.preprocess_features(raw_feature_tensor)
+            for name, raw_feature_tensor in raw_feature_tensors.items()
+        }
+        labels = {
+            name: self.preprocess_labels(raw_label_tensor)
+            for name, raw_label_tensor in raw_label_tensors.items()
+        }
         return features, labels
 
-    def get_preprocessed_features(self, dataset_path: str) -> \
-            Dict[str, np.ndarray]:
+    def get_preprocessed_features(
+        self, dataset_path: str
+    ) -> Dict[str, np.ndarray]:
         """Transforms the raw data at the given file or directory into features
         that can be used by downstream models. The data in the directory may be
         the training/validation/test data, or it may be a batch of user data
@@ -44,21 +52,24 @@ def get_preprocessed_features(self, dataset_path: str) -> \
             remote filesystem containing the dataset.
         :return: A dictionary whose values are feature tensors and whose
             corresponding keys are the names by which those tensors should be
-            referenced. For example, the training features (value) may be called
-            'X_train' (key).
+            referenced. For example, the training features (value) may be
+            called 'X_train' (key).
         """
         raw_feature_tensors = self.get_raw_features(dataset_path)
-        return {name: self.preprocess_features(raw_feature_tensor)
-                for name, raw_feature_tensor in raw_feature_tensors.items()}
+        return {
+            name: self.preprocess_features(raw_feature_tensor)
+            for name, raw_feature_tensor in raw_feature_tensors.items()
+        }
 
     @abstractmethod
-    def get_raw_features_and_labels(self, dataset_path: str) -> \
-            Tuple[Dict[str, np.ndarray], Dict[str, np.ndarray]]:
-        """Returns the raw feature and label tensors from the dataset path. This
-        method is specifically used for the train/val/test sets and not input
-        data for prediction, because in some cases the features and labels need
-        to be read simultaneously to ensure proper ordering of features and
-        labels.
+    def get_raw_features_and_labels(
+        self, dataset_path: str
+    ) -> Tuple[Dict[str, np.ndarray], Dict[str, np.ndarray]]:
+        """Returns the raw feature and label tensors from the dataset path.
+        This method is specifically used for the train/val/test sets and not
+        input data for prediction, because in some cases the features and
+        labels need to be read simultaneously to ensure proper ordering of
+        features and labels.
 
         For example, when handling image data, the raw features would likely be
         tensors of shape m x h x w x c, where m is the number of images, h is
@@ -88,12 +99,14 @@ def get_raw_features(self, dataset_path: str) -> Dict[str, np.ndarray]:
             remote filesystem containing the dataset.
         :return: A dictionary whose values are feature tensors and whose
             corresponding keys are the names by which those tensors should be
-            referenced. For example, the training features (value) may be called
-            'X_train' (key).
+            referenced. For example, the training features (value) may be
+            called 'X_train' (key).
         """
 
     @abstractmethod
-    def preprocess_features(self, raw_feature_tensor: np.ndarray) -> np.ndarray:
+    def preprocess_features(
+        self, raw_feature_tensor: np.ndarray
+    ) -> np.ndarray:
         """Returns the preprocessed feature tensor from the raw tensor. The
         preprocessed features are how training/validation/test as well as
         prediction data are fed into downstream models. For example, when
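To make the template-method structure concrete: a subclass supplies only raw loading and per-tensor preprocessing, and get_preprocessed_features_and_labels composes them. A minimal sketch of a hypothetical subclass (the CSV layout, the scaling, and the identity label preprocessing are assumptions for illustration, not part of the library):

    from typing import Dict, Tuple

    import numpy as np

    from mlops.dataset.data_processor import DataProcessor

    class CSVDataProcessor(DataProcessor):
        """Loads a features.csv/labels.csv pair from dataset_path."""

        def get_raw_features_and_labels(
            self, dataset_path: str
        ) -> Tuple[Dict[str, np.ndarray], Dict[str, np.ndarray]]:
            labels = np.loadtxt(f"{dataset_path}/labels.csv", delimiter=",")
            return self.get_raw_features(dataset_path), {"y_train": labels}

        def get_raw_features(self, dataset_path: str) -> Dict[str, np.ndarray]:
            features = np.loadtxt(f"{dataset_path}/features.csv", delimiter=",")
            return {"X_train": features}

        def preprocess_features(
            self, raw_feature_tensor: np.ndarray
        ) -> np.ndarray:
            # Illustrative scaling into [0, 1].
            return raw_feature_tensor / max(raw_feature_tensor.max(), 1)

        def preprocess_labels(self, raw_label_tensor: np.ndarray) -> np.ndarray:
            # Labels pass through unchanged in this sketch.
            return raw_label_tensor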
19 changes: 11 additions & 8 deletions mlops/dataset/pathless_data_processor.py
@@ -8,9 +8,9 @@
 class PathlessDataProcessor(DataProcessor):
     """Loads preset features and labels."""
 
-    def __init__(self,
-                 features: Dict[str, np.ndarray],
-                 labels: Dict[str, np.ndarray]) -> None:
+    def __init__(
+        self, features: Dict[str, np.ndarray], labels: Dict[str, np.ndarray]
+    ) -> None:
         """Instantiates the object.
 
         :param features: The training features.
@@ -19,8 +19,9 @@ def __init__(self,
         self.features = features
         self.labels = labels
 
-    def get_raw_features_and_labels(self, dataset_path: str) -> \
-            Tuple[Dict[str, np.ndarray], Dict[str, np.ndarray]]:
+    def get_raw_features_and_labels(
+        self, dataset_path: str
+    ) -> Tuple[Dict[str, np.ndarray], Dict[str, np.ndarray]]:
         """Returns the training features and labels.
 
         :param dataset_path: Unused
@@ -35,12 +36,14 @@ def get_raw_features(self, dataset_path: str) -> Dict[str, np.ndarray]:
         :param dataset_path: Unused.
         :return: A dictionary whose values are feature tensors and whose
             corresponding keys are the names by which those tensors should be
-            referenced. For example, the training features (value) may be called
-            'X_train' (key).
+            referenced. For example, the training features (value) may be
+            called 'X_train' (key).
         """
         return self.features
 
-    def preprocess_features(self, raw_feature_tensor: np.ndarray) -> np.ndarray:
+    def preprocess_features(
+        self, raw_feature_tensor: np.ndarray
+    ) -> np.ndarray:
         """Returns the identity function on the input features.
 
         :param raw_feature_tensor: The raw features to be preprocessed.
(Diff truncated; the remaining changed files are not shown.)
