Merge pull request #34 from kostaleonard/backcompat

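Backcompat: restore Python 3.6 compatibility (typing.Dict/List/Set instead of built-in generics, dateutil parsing instead of datetime.fromisoformat, CI/CD workflows run on Python 3.6).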
kostaleonard · Jan 7, 2022 · bd5433e
2 parents bf48324 + 6a5fda7
commit bd5433e
Show file tree

Hide file tree

Showing 16 changed files with 48 additions and 40 deletions.
diff --git a/.github/workflows/cd.yaml b/.github/workflows/cd.yaml
@@ -7,10 +7,10 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v2
-      - name: Set up Python 3.9
+      - name: Set up Python 3.6
         uses: actions/setup-python@v2
         with:
-          python-version: 3.9
+          python-version: 3.6
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
@@ -31,10 +31,10 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v2
-      - name: Set up Python 3.9
+      - name: Set up Python 3.6
         uses: actions/setup-python@v2
         with:
-          python-version: 3.9
+          python-version: 3.6
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
@@ -5,10 +5,10 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v2
-      - name: Set up Python 3.9
+      - name: Set up Python 3.6
         uses: actions/setup-python@v2
         with:
-          python-version: 3.9
+          python-version: 3.6
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip

diff --git a/mlops/__init__.py b/mlops/__init__.py
@@ -1,7 +1,7 @@
 """mlops is a package for conducting MLOps, including versioning of datasets and
 models."""
 
-__version__ = '0.1.0'
+__version__ = '0.1.1'
 
 __all__ = [
     '__version__',

diff --git a/mlops/dataset/data_processor.py b/mlops/dataset/data_processor.py
@@ -1,6 +1,7 @@
 """Contains the DataProcessor class."""
 
 from abc import abstractmethod
+from typing import Dict
 import numpy as np
 
 
@@ -9,7 +10,7 @@ class DataProcessor:
     training, prediction, etc."""
 
     def get_preprocessed_features_and_labels(self, dataset_path: str) -> \
-            (dict[str, np.ndarray], dict[str, np.ndarray]):
+            (Dict[str, np.ndarray], Dict[str, np.ndarray]):
         """Returns the preprocessed feature and label tensors from the dataset
         path. This method is specifically used for the train/val/test sets and
         not input data for prediction, because in some cases the features and
@@ -31,7 +32,7 @@ def get_preprocessed_features_and_labels(self, dataset_path: str) -> \
         return features, labels
 
     def get_preprocessed_features(self, dataset_path: str) -> \
-            dict[str, np.ndarray]:
+            Dict[str, np.ndarray]:
         """Transforms the raw data at the given file or directory into features
         that can be used by downstream models. The data in the directory may be
         the training/validation/test data, or it may be a batch of user data
@@ -52,7 +53,7 @@ def get_preprocessed_features(self, dataset_path: str) -> \
 
     @abstractmethod
     def get_raw_features_and_labels(self, dataset_path: str) -> \
-            (dict[str, np.ndarray], dict[str, np.ndarray]):
+            (Dict[str, np.ndarray], Dict[str, np.ndarray]):
         """Returns the raw feature and label tensors from the dataset path. This
         method is specifically used for the train/val/test sets and not input
         data for prediction, because in some cases the features and labels need
@@ -74,7 +75,7 @@ def get_raw_features_and_labels(self, dataset_path: str) -> \
         """
 
     @abstractmethod
-    def get_raw_features(self, dataset_path: str) -> dict[str, np.ndarray]:
+    def get_raw_features(self, dataset_path: str) -> Dict[str, np.ndarray]:
         """Returns the raw feature tensors from the dataset path. The raw
         features are how training/validation/test as well as prediction data
         enter the data pipeline. For example, when handling image data, the raw

diff --git a/mlops/dataset/versioned_dataset_builder.py b/mlops/dataset/versioned_dataset_builder.py
@@ -4,7 +4,7 @@
 from pathlib import Path
 import shutil
 from tempfile import TemporaryFile
-from typing import Optional
+from typing import Optional, List, Set
 from datetime import datetime
 import json
 import dill as pickle
@@ -52,7 +52,7 @@ def publish(self,
                 path: str,
                 version: Optional[str] = None,
                 dataset_copy_strategy: str = STRATEGY_COPY,
-                tags: Optional[list[str]] = None) -> str:
+                tags: Optional[List[str]] = None) -> str:
         """Saves the versioned dataset files to the given path. If the path and
         appended version already exists, this operation will raise a
         PublicationPathAlreadyExistsError.
@@ -241,7 +241,7 @@ def _make_publication_path_s3(publication_path: str,
             raise PublicationPathAlreadyExistsError
         fs.mkdirs(publication_path)
 
-    def _write_tensors_local(self, publication_path: str) -> set[str]:
+    def _write_tensors_local(self, publication_path: str) -> Set[str]:
         """Writes the feature and label tensors to the publication path
         directory and returns the paths to the created files for hashing.
 
@@ -257,7 +257,7 @@ def _write_tensors_local(self, publication_path: str) -> set[str]:
 
     def _write_tensors_s3(self,
                           publication_path: str,
-                          fs: S3FileSystem) -> set[str]:
+                          fs: S3FileSystem) -> Set[str]:
         """Writes the feature and label tensors to the publication path
         directory and returns the paths to the created files for hashing.
 
@@ -297,7 +297,7 @@ def _write_tensor_s3(tensor: np.ndarray,
             with fs.open(path, 'wb') as outfile:
                 outfile.write(tmp_file.read())
 
-    def _copy_raw_dataset_local(self, copy_path: str) -> set[str]:
+    def _copy_raw_dataset_local(self, copy_path: str) -> Set[str]:
         """Copies the raw dataset to the given path, and returns the paths to
         all created files for hashing.
 
@@ -319,7 +319,7 @@ def _copy_raw_dataset_local(self, copy_path: str) -> set[str]:
 
     def _copy_raw_dataset_s3(self,
                              copy_path: str,
-                             fs: S3FileSystem) -> set[str]:
+                             fs: S3FileSystem) -> Set[str]:
         """Copies the raw dataset to the given path, and returns the paths to
         all created files for hashing.
 

diff --git a/mlops/examples/image/classification/model_prediction.py b/mlops/examples/image/classification/model_prediction.py
@@ -45,7 +45,7 @@ def model_predict(features: np.ndarray,
     return dataset.data_processor.unpreprocess_labels(valid_predictions)
 
 
-def get_best_model(model_paths: Collection[str]) -> VersionedModel:
+def get_best_model(model_paths: Collection) -> VersionedModel:
     """Returns the versioned model with the best performance on the validation
     dataset.
 

diff --git a/mlops/examples/image/classification/pokemon_classification_data_processor.py b/mlops/examples/image/classification/pokemon_classification_data_processor.py
@@ -1,7 +1,7 @@
 """Contains the PokemonClassificationDataProcessor class."""
 
 import os
-from typing import Optional
+from typing import Optional, Dict
 import numpy as np
 import pandas as pd
 from matplotlib.image import imread
@@ -29,7 +29,7 @@ class PokemonClassificationDataProcessor(InvertibleDataProcessor):
     labels for classification."""
 
     def get_raw_features_and_labels(self, dataset_path: str) -> \
-            (dict[str, np.ndarray], dict[str, np.ndarray]):
+            (Dict[str, np.ndarray], Dict[str, np.ndarray]):
         """Returns the raw feature and label tensors from the dataset path. This
         method is specifically used for the train/val/test sets and not input
         data for prediction, because in some cases the features and labels need
@@ -87,7 +87,7 @@ def get_raw_features_and_labels(self, dataset_path: str) -> \
         return ({'X_train': X_train, 'X_val': X_val, 'X_test': X_test},
                 {'y_train': y_train, 'y_val': y_val, 'y_test': y_test})
 
-    def get_raw_features(self, dataset_path: str) -> dict[str, np.ndarray]:
+    def get_raw_features(self, dataset_path: str) -> Dict[str, np.ndarray]:
         """Returns the raw feature tensors from the prediction dataset path. Raw
         features are tensors of shape m x h x w x c, where m is the number of
         images, h is the image height, w is the image width, and c is the number

diff --git a/mlops/examples/image/classification/train_model.py b/mlops/examples/image/classification/train_model.py
@@ -2,7 +2,7 @@
 # pylint: disable=no-name-in-module
 
 import os
-from typing import Optional, Any
+from typing import Optional, Any, List
 from tensorflow.keras.models import Model, Sequential
 from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dense, Flatten, \
     Dropout
@@ -83,6 +83,8 @@ def train_model(model: Model,
             model_checkpoint_filename,
             save_best_only=True)
         callbacks.append(checkpoint_callback)
+        model_dir = os.path.dirname(model_checkpoint_filename)
+        os.makedirs(model_dir, exist_ok=True)
     history = model.fit(x=dataset.X_train,
                         y=dataset.y_train,
                         validation_data=(dataset.X_val, dataset.y_val),
@@ -95,7 +97,7 @@ def publish_model(model: Model,
                   dataset: VersionedDataset,
                   training_config: TrainingConfig,
                   publication_path: str,
-                  tags: Optional[list[str]] = None) -> str:
+                  tags: Optional[List[str]] = None) -> str:
     """Publishes the model to the path on the local or remote filesystem.
 
     :param model: The model to be published, with the exact weights desired for

diff --git a/mlops/hashing/hashing.py b/mlops/hashing/hashing.py
@@ -8,7 +8,7 @@
 CHUNK_SIZE = 2 ** 20
 
 
-def get_hash_local(files_to_hash: Collection[str]) -> str:
+def get_hash_local(files_to_hash: Collection) -> str:
     """Returns the MD5 hex digest string from hashing the content of all the
     given files on the local filesystem. The files are sorted before hashing
     so that the process is reproducible.
@@ -26,7 +26,7 @@ def get_hash_local(files_to_hash: Collection[str]) -> str:
     return hash_md5.hexdigest()
 
 
-def get_hash_s3(files_to_hash: Collection[str]) -> str:
+def get_hash_s3(files_to_hash: Collection) -> str:
     """Returns the MD5 hex digest string from hashing the content of all the
     given files in S3. The files are sorted before hashing so that the
     process is reproducible.

diff --git a/mlops/model/training_config.py b/mlops/model/training_config.py
@@ -1,7 +1,7 @@
 """Contains the TrainingConfig class."""
 # pylint: disable = no-name-in-module
 
-from typing import Any
+from typing import Any, Dict
 from dataclasses import dataclass
 from tensorflow.keras.callbacks import History
 
@@ -14,4 +14,4 @@ class TrainingConfig:
     train_args: The training arguments.
     """
     history: History
-    train_args: dict[str, Any]
+    train_args: Dict[str, Any]
diff --git a/mlops/model/versioned_model_builder.py b/mlops/model/versioned_model_builder.py
@@ -4,7 +4,7 @@
 import os
 import json
 from tempfile import TemporaryDirectory
-from typing import Optional
+from typing import Optional, List
 from datetime import datetime
 from s3fs import S3FileSystem
 from tensorflow.keras.models import Model
@@ -43,7 +43,7 @@ def __init__(self,
     def publish(self,
                 path: str,
                 version: Optional[str] = None,
-                tags: Optional[list[str]] = None) -> str:
+                tags: Optional[List[str]] = None) -> str:
         """Saves the versioned model files to the given path. If the path and
         appended version already exists, this operation will raise a
         PublicationPathAlreadyExistsError.

diff --git a/requirements.txt b/requirements.txt
@@ -3,7 +3,7 @@ pytest
 pytest-cov
 numpy
 pandas
-tensorflow
+tensorflow==2.4.1
 boto3
 botocore
 s3fs
@@ -14,4 +14,5 @@ dill
 sphinx
 sphinx_rtd_theme
 build
-twine
+twine
+python-dateutil
diff --git a/tests/dataset/preset_data_processor.py b/tests/dataset/preset_data_processor.py
@@ -1,6 +1,7 @@
 """Contains the PresetDataProcessor class."""
 # pylint:disable=no-name-in-module
 
+from typing import Dict
 import numpy as np
 from tensorflow.keras.utils import to_categorical
 from mlops.dataset.invertible_data_processor import InvertibleDataProcessor
@@ -27,7 +28,7 @@ class PresetDataProcessor(InvertibleDataProcessor):
     """Processes a preset dataset, with no file I/O."""
 
     def get_raw_features_and_labels(self, dataset_path: str) -> \
-            (dict[str, np.ndarray], dict[str, np.ndarray]):
+            (Dict[str, np.ndarray], Dict[str, np.ndarray]):
         """Returns preset raw feature and label tensors.
 
         :param dataset_path: Unused
@@ -40,7 +41,7 @@ def get_raw_features_and_labels(self, dataset_path: str) -> \
         return self.get_raw_features(dataset_path), labels
 
     def get_raw_features(self,
-                         dataset_path: str) -> dict[str, np.ndarray]:
+                         dataset_path: str) -> Dict[str, np.ndarray]:
         """Returns the preset raw features.
 
         :param dataset_path: Unused.

diff --git a/tests/dataset/test_serialization.py b/tests/dataset/test_serialization.py
@@ -2,6 +2,7 @@
 
 import os
 import shutil
+from typing import Dict
 import numpy as np
 from mlops.dataset.data_processor import DataProcessor
 from mlops.dataset.versioned_dataset_builder import VersionedDatasetBuilder
@@ -16,7 +17,7 @@ class DataProcessorThatWillChange(DataProcessor):
     how data should enter the model pipeline."""
 
     def get_raw_features_and_labels(self, dataset_path: str) -> \
-            (dict[str, np.ndarray], dict[str, np.ndarray]):
+            (Dict[str, np.ndarray], Dict[str, np.ndarray]):
         """Returns dummy features and labels
 
         :param dataset_path: Unused.
@@ -26,7 +27,7 @@ def get_raw_features_and_labels(self, dataset_path: str) -> \
         return ({'X': np.array([1, 2, 3])},
                 {'y': np.array([1, 2, 3])})
 
-    def get_raw_features(self, dataset_path: str) -> dict[str, np.ndarray]:
+    def get_raw_features(self, dataset_path: str) -> Dict[str, np.ndarray]:
         """Returns dummy features.
 
         :param dataset_path: Unused.
@@ -59,7 +60,7 @@ def preprocess_labels(self, raw_label_tensor: np.ndarray) -> np.ndarray:
 def _redefine_class() -> None:
     """Redefines DataProcessorThatWillChange."""
     # pylint: disable=redefined-outer-name
-    # pylint: disable=global-variable-not-assigned
+    # pylint: disable=global-variable-undefined
     # pylint: disable=invalid-name
     # pylint: disable=unused-variable
     global DataProcessorThatWillChange
@@ -69,7 +70,7 @@ class DataProcessorThatWillChange(DataProcessor):
         redefining how data should enter the model pipeline."""
 
         def get_raw_features_and_labels(self, dataset_path: str) -> \
-                (dict[str, np.ndarray], dict[str, np.ndarray]):
+                (Dict[str, np.ndarray], Dict[str, np.ndarray]):
             """Returns dummy features and labels
 
             :param dataset_path: Unused.
@@ -78,7 +79,7 @@ def get_raw_features_and_labels(self, dataset_path: str) -> \
             """
             raise ValueError('The new implementation is different.')
 
-        def get_raw_features(self, dataset_path: str) -> dict[str, np.ndarray]:
+        def get_raw_features(self, dataset_path: str) -> Dict[str, np.ndarray]:
             """Returns dummy features.
 
             :param dataset_path: Unused.

diff --git a/tests/dataset/test_versioned_dataset_builder.py b/tests/dataset/test_versioned_dataset_builder.py
@@ -7,6 +7,7 @@
 import json
 import pickle
 from urllib.parse import urlparse
+import dateutil
 import numpy as np
 import pytest
 import boto3
@@ -98,7 +99,7 @@ def test_publish_appends_version_timestamp() -> None:
     end = datetime.now()
     assert len(os.listdir(TEST_PUBLICATION_PATH_LOCAL)) == 1
     dirname = os.listdir(TEST_PUBLICATION_PATH_LOCAL)[0]
-    publication_time = datetime.fromisoformat(dirname)
+    publication_time = dateutil.parser.parse(dirname)
     assert start < publication_time < end
 
 

diff --git a/tests/model/test_versioned_model_builder.py b/tests/model/test_versioned_model_builder.py
@@ -5,6 +5,7 @@
 import shutil
 from datetime import datetime
 import json
+import dateutil
 import numpy as np
 import pytest
 from s3fs import S3FileSystem
@@ -117,7 +118,7 @@ def test_publish_appends_version_timestamp(
     end = datetime.now()
     assert len(os.listdir(TEST_MODEL_PUBLICATION_PATH_LOCAL)) == 1
     dirname = os.listdir(TEST_MODEL_PUBLICATION_PATH_LOCAL)[0]
-    publication_time = datetime.fromisoformat(dirname)
+    publication_time = dateutil.parser.parse(dirname)
     assert start < publication_time < end