Merge pull request #64 from kostaleonard/style
Style
kostaleonard authored Apr 16, 2022
2 parents 37bdab7 + 543c90f commit 236d349
Showing 42 changed files with 1,664 additions and 1,358 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/ci.yaml
@@ -13,6 +13,8 @@ jobs:
         run: |
           python -m pip install --upgrade pip
           make install
+      - name: Pre-commit
+        run: pre-commit run --all-files
       - name: Lint
         run: make lint
       - name: Test
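The added Pre-commit step fails the build if any hook reports an error or rewrites a file, so formatting drift is caught before the Lint and Test steps run. The hooks it executes are defined in the .pre-commit-config.yaml introduced below.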
13 changes: 13 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,13 @@
+files: ^mlops/|^tests/
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v2.3.0
+    hooks:
+      - id: check-yaml
+      - id: end-of-file-fixer
+      - id: trailing-whitespace
+  - repo: https://github.com/psf/black
+    rev: 22.3.0
+    hooks:
+      - id: black
+        language_version: python3.9
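The pinned black 22.3.0 hook produces most of the Python changes in this commit: black normalizes string literals to double quotes and rewraps signatures and calls that overflow the line limit. A minimal before/after sketch (illustrative code, not from the repository):

    # Before black
    version = '0.1.7'
    def describe(name,
                 value):
        return f'{name}={value}'

    # After black
    version = "0.1.7"
    def describe(name, value):
        return f"{name}={value}"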
5 changes: 3 additions & 2 deletions .pylintrc
@@ -86,7 +86,8 @@ disable=raw-checker-failed,
         useless-suppression,
         deprecated-pragma,
         use-symbolic-message-instead,
-        duplicate-code
+        duplicate-code,
+        line-too-long
 
 # Enable the message, report, category or checker with the given id(s). You can
 # either give multiple identifier separated by comma (,) or put this option
@@ -282,7 +283,7 @@ indent-after-paren=4
 indent-string='    '
 
 # Maximum number of characters on a single line.
-max-line-length=100
+max-line-length=79
 
 # Maximum number of lines in a module.
 max-module-lines=1000
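With black now enforcing formatting, pylint's own line-length check would only duplicate (or fight) the formatter, so line-too-long is disabled; max-line-length is still tightened from 100 to 79, presumably to match the PEP 8 limit the rest of the tooling targets.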
15 changes: 4 additions & 11 deletions mlops/__init__.py
@@ -1,13 +1,6 @@
-"""mlops is a package for conducting MLOps, including versioning of datasets and
-models."""
+"""mlops is a package for conducting MLOps, including versioning of datasets
+and models."""
 
-__version__ = '0.1.6'
+__version__ = "0.1.7"
 
-__all__ = [
-    '__version__',
-    'artifact',
-    'dataset',
-    'errors',
-    'model',
-    'hashing'
-]
+__all__ = ["__version__", "artifact", "dataset", "errors", "model", "hashing"]
42 changes: 19 additions & 23 deletions mlops/artifact/versioned_artifact.py
@@ -52,7 +52,7 @@ def md5(self) -> str:
         :return: The artifact's MD5 hash.
         """
 
-    def __eq__(self, other: 'VersionedArtifact') -> bool:
+    def __eq__(self, other: "VersionedArtifact") -> bool:
         """Returns True if the two objects have the same loaded MD5 hash code,
         False otherwise.
@@ -81,23 +81,19 @@ def update_metadata(self, updates: dict[str, Any]) -> None:
             a key does not exist in the metadata, it is added; if it does
             exist, its value is overwritten.
         """
-        if self.metadata_path.startswith('s3://'):
+        if self.metadata_path.startswith("s3://"):
             fs = S3FileSystem()
-            with fs.open(self.metadata_path,
-                         'r',
-                         encoding='utf-8') as infile:
+            with fs.open(self.metadata_path, "r", encoding="utf-8") as infile:
                 metadata = json.loads(infile.read())
         else:
-            with open(self.metadata_path, 'r', encoding='utf-8') as infile:
+            with open(self.metadata_path, "r", encoding="utf-8") as infile:
                 metadata = json.loads(infile.read())
         updated_metadata = {**metadata, **updates}
-        if self.metadata_path.startswith('s3://'):
-            with fs.open(self.metadata_path,
-                         'w',
-                         encoding='utf-8') as outfile:
+        if self.metadata_path.startswith("s3://"):
+            with fs.open(self.metadata_path, "w", encoding="utf-8") as outfile:
                 outfile.write(json.dumps(updated_metadata))
         else:
-            with open(self.metadata_path, 'w', encoding='utf-8') as outfile:
+            with open(self.metadata_path, "w", encoding="utf-8") as outfile:
                 outfile.write(json.dumps(updated_metadata))
 
     def republish(self, republication_path: str) -> str:
@@ -114,7 +110,7 @@ def republish(self, republication_path: str) -> str:
             artifacts with the same version.
         :return: The versioned artifact's publication path.
         """
-        if republication_path.startswith('s3://'):
+        if republication_path.startswith("s3://"):
             return self._republish_to_s3(republication_path)
         return self._republish_to_local(republication_path)
@@ -130,7 +126,7 @@ def _republish_to_local(self, republication_path: str) -> str:
         # pylint: disable=protected-access
         publication_path = os.path.join(republication_path, self.version)
         VersionedArtifactBuilder._make_publication_path_local(publication_path)
-        if self.path.startswith('s3://'):
+        if self.path.startswith("s3://"):
             fs = S3FileSystem()
             fs.get(self.path, publication_path, recursive=True)
         else:
@@ -150,18 +146,18 @@ def _republish_to_s3(self, republication_path: str) -> str:
         publication_path = os.path.join(republication_path, self.version)
         fs = S3FileSystem()
         VersionedArtifactBuilder._make_publication_path_s3(
-            publication_path, fs)
-        if self.path.startswith('s3://'):
-            artifact_path_no_prefix = self.path.replace('s3://', '', 1)
-            copy_path_no_prefix = publication_path.replace('s3://', '', 1)
+            publication_path, fs
+        )
+        if self.path.startswith("s3://"):
+            artifact_path_no_prefix = self.path.replace("s3://", "", 1)
+            copy_path_no_prefix = publication_path.replace("s3://", "", 1)
             for current_path, _, filenames in fs.walk(self.path):
-                outfile_prefix = current_path.replace(artifact_path_no_prefix,
-                                                      copy_path_no_prefix, 1)
+                outfile_prefix = current_path.replace(
+                    artifact_path_no_prefix, copy_path_no_prefix, 1
+                )
                 for filename in filenames:
-                    infile_path = os.path.join(current_path,
-                                               filename)
-                    outfile_path = os.path.join(outfile_prefix,
-                                                filename)
+                    infile_path = os.path.join(current_path, filename)
+                    outfile_path = os.path.join(outfile_prefix, filename)
                     fs.copy(infile_path, outfile_path)
         else:
             fs.put(self.path, publication_path, recursive=True)
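The read-merge-write logic in update_metadata reduces to a dictionary merge in which the update wins on key collisions; the {**metadata, **updates} expression above is exactly that. A self-contained sketch of the same merge semantics (standard library only; the class applies it to a metadata JSON file on the local filesystem or S3):

    import json

    def merge_metadata(metadata_json: str, updates: dict) -> str:
        # Keys present in updates overwrite existing keys; new keys are added.
        metadata = json.loads(metadata_json)
        return json.dumps({**metadata, **updates})

    print(merge_metadata('{"version": "v1", "tags": []}', {"tags": ["baseline"]}))
    # {"version": "v1", "tags": ["baseline"]}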
6 changes: 4 additions & 2 deletions mlops/artifact/versioned_artifact_builder.py
@@ -9,6 +9,7 @@
 
 class VersionedArtifactBuilder(ABC):
     """Represents a versioned artifact builder."""
+
     # pylint: disable=too-few-public-methods
 
     @abstractmethod
@@ -43,8 +44,9 @@ def _make_publication_path_local(publication_path: str) -> None:
             raise PublicationPathAlreadyExistsError from err
 
     @staticmethod
-    def _make_publication_path_s3(publication_path: str,
-                                  fs: S3FileSystem) -> None:
+    def _make_publication_path_s3(
+        publication_path: str, fs: S3FileSystem
+    ) -> None:
         """Creates the directories that compose the publication path.
 
         :param publication_path: The path to which to publish the artifact.
61 changes: 37 additions & 24 deletions mlops/dataset/data_processor.py
@@ -9,8 +9,9 @@ class DataProcessor:
     """Transforms a raw dataset into features and labels for downstream model
     training, prediction, etc."""
 
-    def get_preprocessed_features_and_labels(self, dataset_path: str) -> \
-            Tuple[Dict[str, np.ndarray], Dict[str, np.ndarray]]:
+    def get_preprocessed_features_and_labels(
+        self, dataset_path: str
+    ) -> Tuple[Dict[str, np.ndarray], Dict[str, np.ndarray]]:
         """Returns the preprocessed feature and label tensors from the dataset
         path. This method is specifically used for the train/val/test sets and
         not input data for prediction, because in some cases the features and
@@ -23,16 +24,23 @@ def get_preprocessed_features_and_labels(self, dataset_path: str) -> \
         :return: A 2-tuple of the features dictionary and labels dictionary,
             with matching keys and ordered tensors.
         """
-        raw_feature_tensors, raw_label_tensors = \
-            self.get_raw_features_and_labels(dataset_path)
-        features = {name: self.preprocess_features(raw_feature_tensor)
-                    for name, raw_feature_tensor in raw_feature_tensors.items()}
-        labels = {name: self.preprocess_labels(raw_label_tensor)
-                  for name, raw_label_tensor in raw_label_tensors.items()}
+        (
+            raw_feature_tensors,
+            raw_label_tensors,
+        ) = self.get_raw_features_and_labels(dataset_path)
+        features = {
+            name: self.preprocess_features(raw_feature_tensor)
+            for name, raw_feature_tensor in raw_feature_tensors.items()
+        }
+        labels = {
+            name: self.preprocess_labels(raw_label_tensor)
+            for name, raw_label_tensor in raw_label_tensors.items()
+        }
         return features, labels
 
-    def get_preprocessed_features(self, dataset_path: str) -> \
-            Dict[str, np.ndarray]:
+    def get_preprocessed_features(
+        self, dataset_path: str
+    ) -> Dict[str, np.ndarray]:
         """Transforms the raw data at the given file or directory into features
         that can be used by downstream models. The data in the directory may be
         the training/validation/test data, or it may be a batch of user data
@@ -44,21 +52,24 @@ def get_preprocessed_features(self, dataset_path: str) -> \
             remote filesystem containing the dataset.
         :return: A dictionary whose values are feature tensors and whose
             corresponding keys are the names by which those tensors should be
-            referenced. For example, the training features (value) may be called
-            'X_train' (key).
+            referenced. For example, the training features (value) may be
+            called 'X_train' (key).
         """
         raw_feature_tensors = self.get_raw_features(dataset_path)
-        return {name: self.preprocess_features(raw_feature_tensor)
-                for name, raw_feature_tensor in raw_feature_tensors.items()}
+        return {
+            name: self.preprocess_features(raw_feature_tensor)
+            for name, raw_feature_tensor in raw_feature_tensors.items()
+        }
 
     @abstractmethod
-    def get_raw_features_and_labels(self, dataset_path: str) -> \
-            Tuple[Dict[str, np.ndarray], Dict[str, np.ndarray]]:
-        """Returns the raw feature and label tensors from the dataset path. This
-        method is specifically used for the train/val/test sets and not input
-        data for prediction, because in some cases the features and labels need
-        to be read simultaneously to ensure proper ordering of features and
-        labels.
+    def get_raw_features_and_labels(
+        self, dataset_path: str
+    ) -> Tuple[Dict[str, np.ndarray], Dict[str, np.ndarray]]:
+        """Returns the raw feature and label tensors from the dataset path.
+        This method is specifically used for the train/val/test sets and not
+        input data for prediction, because in some cases the features and
+        labels need to be read simultaneously to ensure proper ordering of
+        features and labels.
 
         For example, when handling image data, the raw features would likely be
         tensors of shape m x h x w x c, where m is the number of images, h is
@@ -88,12 +99,14 @@ def get_raw_features(self, dataset_path: str) -> Dict[str, np.ndarray]:
             remote filesystem containing the dataset.
         :return: A dictionary whose values are feature tensors and whose
             corresponding keys are the names by which those tensors should be
-            referenced. For example, the training features (value) may be called
-            'X_train' (key).
+            referenced. For example, the training features (value) may be
+            called 'X_train' (key).
         """
 
     @abstractmethod
-    def preprocess_features(self, raw_feature_tensor: np.ndarray) -> np.ndarray:
+    def preprocess_features(
+        self, raw_feature_tensor: np.ndarray
+    ) -> np.ndarray:
         """Returns the preprocessed feature tensor from the raw tensor. The
         preprocessed features are how training/validation/test as well as
         prediction data are fed into downstream models. For example, when
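To make the template-method structure concrete: a subclass supplies only raw loading and per-tensor preprocessing, and get_preprocessed_features_and_labels composes them. A minimal sketch of a hypothetical subclass (the CSV layout, the scaling, and the identity label preprocessing are assumptions for illustration, not part of the library):

    from typing import Dict, Tuple

    import numpy as np

    from mlops.dataset.data_processor import DataProcessor

    class CSVDataProcessor(DataProcessor):
        """Loads a features.csv/labels.csv pair from dataset_path."""

        def get_raw_features_and_labels(
            self, dataset_path: str
        ) -> Tuple[Dict[str, np.ndarray], Dict[str, np.ndarray]]:
            labels = np.loadtxt(f"{dataset_path}/labels.csv", delimiter=",")
            return self.get_raw_features(dataset_path), {"y_train": labels}

        def get_raw_features(self, dataset_path: str) -> Dict[str, np.ndarray]:
            features = np.loadtxt(f"{dataset_path}/features.csv", delimiter=",")
            return {"X_train": features}

        def preprocess_features(
            self, raw_feature_tensor: np.ndarray
        ) -> np.ndarray:
            # Illustrative scaling into [0, 1].
            return raw_feature_tensor / max(raw_feature_tensor.max(), 1)

        def preprocess_labels(self, raw_label_tensor: np.ndarray) -> np.ndarray:
            # Labels pass through unchanged in this sketch.
            return raw_label_tensor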
19 changes: 11 additions & 8 deletions mlops/dataset/pathless_data_processor.py
@@ -8,9 +8,9 @@
 class PathlessDataProcessor(DataProcessor):
     """Loads preset features and labels."""
 
-    def __init__(self,
-                 features: Dict[str, np.ndarray],
-                 labels: Dict[str, np.ndarray]) -> None:
+    def __init__(
+        self, features: Dict[str, np.ndarray], labels: Dict[str, np.ndarray]
+    ) -> None:
         """Instantiates the object.
 
         :param features: The training features.
@@ -19,8 +19,9 @@ def __init__(self,
         self.features = features
         self.labels = labels
 
-    def get_raw_features_and_labels(self, dataset_path: str) -> \
-            Tuple[Dict[str, np.ndarray], Dict[str, np.ndarray]]:
+    def get_raw_features_and_labels(
+        self, dataset_path: str
+    ) -> Tuple[Dict[str, np.ndarray], Dict[str, np.ndarray]]:
         """Returns the training features and labels.
 
         :param dataset_path: Unused
@@ -35,12 +36,14 @@ def get_raw_features(self, dataset_path: str) -> Dict[str, np.ndarray]:
         :param dataset_path: Unused.
         :return: A dictionary whose values are feature tensors and whose
             corresponding keys are the names by which those tensors should be
-            referenced. For example, the training features (value) may be called
-            'X_train' (key).
+            referenced. For example, the training features (value) may be
+            called 'X_train' (key).
         """
         return self.features
 
-    def preprocess_features(self, raw_feature_tensor: np.ndarray) -> np.ndarray:
+    def preprocess_features(
+        self, raw_feature_tensor: np.ndarray
+    ) -> np.ndarray:
         """Returns the identity function on the input features.
 
         :param raw_feature_tensor: The raw features to be preprocessed.
(Diff truncated; the remaining changed files are not shown.)
