Skip to content

Commit

Permalink
Merge pull request #34 from kostaleonard/backcompat
Browse files Browse the repository at this point in the history
Backcompat
  • Loading branch information
kostaleonard authored Jan 7, 2022
2 parents bf48324 + 6a5fda7 commit bd5433e
Show file tree
Hide file tree
Showing 16 changed files with 48 additions and 40 deletions.
8 changes: 4 additions & 4 deletions .github/workflows/cd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,10 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- name: Set up Python 3.9
- name: Set up Python 3.6
uses: actions/setup-python@v2
with:
python-version: 3.9
python-version: 3.6
- name: Install dependencies
run: |
python -m pip install --upgrade pip
Expand All @@ -31,10 +31,10 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- name: Set up Python 3.9
- name: Set up Python 3.6
uses: actions/setup-python@v2
with:
python-version: 3.9
python-version: 3.6
- name: Install dependencies
run: |
python -m pip install --upgrade pip
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- name: Set up Python 3.9
- name: Set up Python 3.6
uses: actions/setup-python@v2
with:
python-version: 3.9
python-version: 3.6
- name: Install dependencies
run: |
python -m pip install --upgrade pip
Expand Down
2 changes: 1 addition & 1 deletion mlops/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""mlops is a package for conducting MLOps, including versioning of datasets and
models."""

__version__ = '0.1.0'
__version__ = '0.1.1'

__all__ = [
'__version__',
Expand Down
9 changes: 5 additions & 4 deletions mlops/dataset/data_processor.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Contains the DataProcessor class."""

from abc import abstractmethod
from typing import Dict
import numpy as np


Expand All @@ -9,7 +10,7 @@ class DataProcessor:
training, prediction, etc."""

def get_preprocessed_features_and_labels(self, dataset_path: str) -> \
(dict[str, np.ndarray], dict[str, np.ndarray]):
(Dict[str, np.ndarray], Dict[str, np.ndarray]):
"""Returns the preprocessed feature and label tensors from the dataset
path. This method is specifically used for the train/val/test sets and
not input data for prediction, because in some cases the features and
Expand All @@ -31,7 +32,7 @@ def get_preprocessed_features_and_labels(self, dataset_path: str) -> \
return features, labels

def get_preprocessed_features(self, dataset_path: str) -> \
dict[str, np.ndarray]:
Dict[str, np.ndarray]:
"""Transforms the raw data at the given file or directory into features
that can be used by downstream models. The data in the directory may be
the training/validation/test data, or it may be a batch of user data
Expand All @@ -52,7 +53,7 @@ def get_preprocessed_features(self, dataset_path: str) -> \

@abstractmethod
def get_raw_features_and_labels(self, dataset_path: str) -> \
(dict[str, np.ndarray], dict[str, np.ndarray]):
(Dict[str, np.ndarray], Dict[str, np.ndarray]):
"""Returns the raw feature and label tensors from the dataset path. This
method is specifically used for the train/val/test sets and not input
data for prediction, because in some cases the features and labels need
Expand All @@ -74,7 +75,7 @@ def get_raw_features_and_labels(self, dataset_path: str) -> \
"""

@abstractmethod
def get_raw_features(self, dataset_path: str) -> dict[str, np.ndarray]:
def get_raw_features(self, dataset_path: str) -> Dict[str, np.ndarray]:
"""Returns the raw feature tensors from the dataset path. The raw
features are how training/validation/test as well as prediction data
enter the data pipeline. For example, when handling image data, the raw
Expand Down
12 changes: 6 additions & 6 deletions mlops/dataset/versioned_dataset_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from pathlib import Path
import shutil
from tempfile import TemporaryFile
from typing import Optional
from typing import Optional, List, Set
from datetime import datetime
import json
import dill as pickle
Expand Down Expand Up @@ -52,7 +52,7 @@ def publish(self,
path: str,
version: Optional[str] = None,
dataset_copy_strategy: str = STRATEGY_COPY,
tags: Optional[list[str]] = None) -> str:
tags: Optional[List[str]] = None) -> str:
"""Saves the versioned dataset files to the given path. If the path and
appended version already exists, this operation will raise a
PublicationPathAlreadyExistsError.
Expand Down Expand Up @@ -241,7 +241,7 @@ def _make_publication_path_s3(publication_path: str,
raise PublicationPathAlreadyExistsError
fs.mkdirs(publication_path)

def _write_tensors_local(self, publication_path: str) -> set[str]:
def _write_tensors_local(self, publication_path: str) -> Set[str]:
"""Writes the feature and label tensors to the publication path
directory and returns the paths to the created files for hashing.
Expand All @@ -257,7 +257,7 @@ def _write_tensors_local(self, publication_path: str) -> set[str]:

def _write_tensors_s3(self,
publication_path: str,
fs: S3FileSystem) -> set[str]:
fs: S3FileSystem) -> Set[str]:
"""Writes the feature and label tensors to the publication path
directory and returns the paths to the created files for hashing.
Expand Down Expand Up @@ -297,7 +297,7 @@ def _write_tensor_s3(tensor: np.ndarray,
with fs.open(path, 'wb') as outfile:
outfile.write(tmp_file.read())

def _copy_raw_dataset_local(self, copy_path: str) -> set[str]:
def _copy_raw_dataset_local(self, copy_path: str) -> Set[str]:
"""Copies the raw dataset to the given path, and returns the paths to
all created files for hashing.
Expand All @@ -319,7 +319,7 @@ def _copy_raw_dataset_local(self, copy_path: str) -> set[str]:

def _copy_raw_dataset_s3(self,
copy_path: str,
fs: S3FileSystem) -> set[str]:
fs: S3FileSystem) -> Set[str]:
"""Copies the raw dataset to the given path, and returns the paths to
all created files for hashing.
Expand Down
2 changes: 1 addition & 1 deletion mlops/examples/image/classification/model_prediction.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def model_predict(features: np.ndarray,
return dataset.data_processor.unpreprocess_labels(valid_predictions)


def get_best_model(model_paths: Collection[str]) -> VersionedModel:
def get_best_model(model_paths: Collection) -> VersionedModel:
"""Returns the versioned model with the best performance on the validation
dataset.
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""Contains the PokemonClassificationDataProcessor class."""

import os
from typing import Optional
from typing import Optional, Dict
import numpy as np
import pandas as pd
from matplotlib.image import imread
Expand Down Expand Up @@ -29,7 +29,7 @@ class PokemonClassificationDataProcessor(InvertibleDataProcessor):
labels for classification."""

def get_raw_features_and_labels(self, dataset_path: str) -> \
(dict[str, np.ndarray], dict[str, np.ndarray]):
(Dict[str, np.ndarray], Dict[str, np.ndarray]):
"""Returns the raw feature and label tensors from the dataset path. This
method is specifically used for the train/val/test sets and not input
data for prediction, because in some cases the features and labels need
Expand Down Expand Up @@ -87,7 +87,7 @@ def get_raw_features_and_labels(self, dataset_path: str) -> \
return ({'X_train': X_train, 'X_val': X_val, 'X_test': X_test},
{'y_train': y_train, 'y_val': y_val, 'y_test': y_test})

def get_raw_features(self, dataset_path: str) -> dict[str, np.ndarray]:
def get_raw_features(self, dataset_path: str) -> Dict[str, np.ndarray]:
"""Returns the raw feature tensors from the prediction dataset path. Raw
features are tensors of shape m x h x w x c, where m is the number of
images, h is the image height, w is the image width, and c is the number
Expand Down
6 changes: 4 additions & 2 deletions mlops/examples/image/classification/train_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# pylint: disable=no-name-in-module

import os
from typing import Optional, Any
from typing import Optional, Any, List
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dense, Flatten, \
Dropout
Expand Down Expand Up @@ -83,6 +83,8 @@ def train_model(model: Model,
model_checkpoint_filename,
save_best_only=True)
callbacks.append(checkpoint_callback)
model_dir = os.path.dirname(model_checkpoint_filename)
os.makedirs(model_dir, exist_ok=True)
history = model.fit(x=dataset.X_train,
y=dataset.y_train,
validation_data=(dataset.X_val, dataset.y_val),
Expand All @@ -95,7 +97,7 @@ def publish_model(model: Model,
dataset: VersionedDataset,
training_config: TrainingConfig,
publication_path: str,
tags: Optional[list[str]] = None) -> str:
tags: Optional[List[str]] = None) -> str:
"""Publishes the model to the path on the local or remote filesystem.
:param model: The model to be published, with the exact weights desired for
Expand Down
4 changes: 2 additions & 2 deletions mlops/hashing/hashing.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
CHUNK_SIZE = 2 ** 20


def get_hash_local(files_to_hash: Collection[str]) -> str:
def get_hash_local(files_to_hash: Collection) -> str:
"""Returns the MD5 hex digest string from hashing the content of all the
given files on the local filesystem. The files are sorted before hashing
so that the process is reproducible.
Expand All @@ -26,7 +26,7 @@ def get_hash_local(files_to_hash: Collection[str]) -> str:
return hash_md5.hexdigest()


def get_hash_s3(files_to_hash: Collection[str]) -> str:
def get_hash_s3(files_to_hash: Collection) -> str:
"""Returns the MD5 hex digest string from hashing the content of all the
given files in S3. The files are sorted before hashing so that the
process is reproducible.
Expand Down
4 changes: 2 additions & 2 deletions mlops/model/training_config.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""Contains the TrainingConfig class."""
# pylint: disable = no-name-in-module

from typing import Any
from typing import Any, Dict
from dataclasses import dataclass
from tensorflow.keras.callbacks import History

Expand All @@ -14,4 +14,4 @@ class TrainingConfig:
train_args: The training arguments.
"""
history: History
train_args: dict[str, Any]
train_args: Dict[str, Any]
4 changes: 2 additions & 2 deletions mlops/model/versioned_model_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import os
import json
from tempfile import TemporaryDirectory
from typing import Optional
from typing import Optional, List
from datetime import datetime
from s3fs import S3FileSystem
from tensorflow.keras.models import Model
Expand Down Expand Up @@ -43,7 +43,7 @@ def __init__(self,
def publish(self,
path: str,
version: Optional[str] = None,
tags: Optional[list[str]] = None) -> str:
tags: Optional[List[str]] = None) -> str:
"""Saves the versioned model files to the given path. If the path and
appended version already exists, this operation will raise a
PublicationPathAlreadyExistsError.
Expand Down
5 changes: 3 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ pytest
pytest-cov
numpy
pandas
tensorflow
tensorflow==2.4.1
boto3
botocore
s3fs
Expand All @@ -14,4 +14,5 @@ dill
sphinx
sphinx_rtd_theme
build
twine
twine
python-dateutil
5 changes: 3 additions & 2 deletions tests/dataset/preset_data_processor.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Contains the PresetDataProcessor class."""
# pylint:disable=no-name-in-module

from typing import Dict
import numpy as np
from tensorflow.keras.utils import to_categorical
from mlops.dataset.invertible_data_processor import InvertibleDataProcessor
Expand All @@ -27,7 +28,7 @@ class PresetDataProcessor(InvertibleDataProcessor):
"""Processes a preset dataset, with no file I/O."""

def get_raw_features_and_labels(self, dataset_path: str) -> \
(dict[str, np.ndarray], dict[str, np.ndarray]):
(Dict[str, np.ndarray], Dict[str, np.ndarray]):
"""Returns preset raw feature and label tensors.
:param dataset_path: Unused.
Expand All @@ -40,7 +41,7 @@ def get_raw_features_and_labels(self, dataset_path: str) -> \
return self.get_raw_features(dataset_path), labels

def get_raw_features(self,
dataset_path: str) -> dict[str, np.ndarray]:
dataset_path: str) -> Dict[str, np.ndarray]:
"""Returns the preset raw features.
:param dataset_path: Unused.
Expand Down
11 changes: 6 additions & 5 deletions tests/dataset/test_serialization.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import os
import shutil
from typing import Dict
import numpy as np
from mlops.dataset.data_processor import DataProcessor
from mlops.dataset.versioned_dataset_builder import VersionedDatasetBuilder
Expand All @@ -16,7 +17,7 @@ class DataProcessorThatWillChange(DataProcessor):
how data should enter the model pipeline."""

def get_raw_features_and_labels(self, dataset_path: str) -> \
(dict[str, np.ndarray], dict[str, np.ndarray]):
(Dict[str, np.ndarray], Dict[str, np.ndarray]):
"""Returns dummy features and labels.
:param dataset_path: Unused.
Expand All @@ -26,7 +27,7 @@ def get_raw_features_and_labels(self, dataset_path: str) -> \
return ({'X': np.array([1, 2, 3])},
{'y': np.array([1, 2, 3])})

def get_raw_features(self, dataset_path: str) -> dict[str, np.ndarray]:
def get_raw_features(self, dataset_path: str) -> Dict[str, np.ndarray]:
"""Returns dummy features.
:param dataset_path: Unused.
Expand Down Expand Up @@ -59,7 +60,7 @@ def preprocess_labels(self, raw_label_tensor: np.ndarray) -> np.ndarray:
def _redefine_class() -> None:
"""Redefines DataProcessorThatWillChange."""
# pylint: disable=redefined-outer-name
# pylint: disable=global-variable-not-assigned
# pylint: disable=global-variable-undefined
# pylint: disable=invalid-name
# pylint: disable=unused-variable
global DataProcessorThatWillChange
Expand All @@ -69,7 +70,7 @@ class DataProcessorThatWillChange(DataProcessor):
redefining how data should enter the model pipeline."""

def get_raw_features_and_labels(self, dataset_path: str) -> \
(dict[str, np.ndarray], dict[str, np.ndarray]):
(Dict[str, np.ndarray], Dict[str, np.ndarray]):
"""Returns dummy features and labels.
:param dataset_path: Unused.
Expand All @@ -78,7 +79,7 @@ def get_raw_features_and_labels(self, dataset_path: str) -> \
"""
raise ValueError('The new implementation is different.')

def get_raw_features(self, dataset_path: str) -> dict[str, np.ndarray]:
def get_raw_features(self, dataset_path: str) -> Dict[str, np.ndarray]:
"""Returns dummy features.
:param dataset_path: Unused.
Expand Down
3 changes: 2 additions & 1 deletion tests/dataset/test_versioned_dataset_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import json
import pickle
from urllib.parse import urlparse
import dateutil
import numpy as np
import pytest
import boto3
Expand Down Expand Up @@ -98,7 +99,7 @@ def test_publish_appends_version_timestamp() -> None:
end = datetime.now()
assert len(os.listdir(TEST_PUBLICATION_PATH_LOCAL)) == 1
dirname = os.listdir(TEST_PUBLICATION_PATH_LOCAL)[0]
publication_time = datetime.fromisoformat(dirname)
publication_time = dateutil.parser.parse(dirname)
assert start < publication_time < end


Expand Down
3 changes: 2 additions & 1 deletion tests/model/test_versioned_model_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import shutil
from datetime import datetime
import json
import dateutil
import numpy as np
import pytest
from s3fs import S3FileSystem
Expand Down Expand Up @@ -117,7 +118,7 @@ def test_publish_appends_version_timestamp(
end = datetime.now()
assert len(os.listdir(TEST_MODEL_PUBLICATION_PATH_LOCAL)) == 1
dirname = os.listdir(TEST_MODEL_PUBLICATION_PATH_LOCAL)[0]
publication_time = datetime.fromisoformat(dirname)
publication_time = dateutil.parser.parse(dirname)
assert start < publication_time < end


Expand Down

0 comments on commit bd5433e

Please sign in to comment.