From c7f90a688380707af56f5edfa6e3e4d13377de03 Mon Sep 17 00:00:00 2001 From: Mikhail Sveshnikov Date: Fri, 12 Nov 2021 01:20:45 +0300 Subject: [PATCH] fix dvc download (#109) * fix dvc download * fix lazy field for dataset add gitpython to deps fix dvc downloading * conditional pull * fix isadir error * fix catboost and upload saving * fix tests * fix #83 * fix requirement duplication requirements tests requirement expand * fix test * spelling --- mlem/cli/apply.py | 12 +++- mlem/contrib/catboost.py | 3 + mlem/contrib/dvc.py | 41 +++++++++-- mlem/contrib/lightgbm.py | 13 ++++ mlem/contrib/sklearn.py | 4 +- mlem/contrib/xgboost.py | 12 ++++ mlem/core/artifacts.py | 16 +++-- mlem/core/errors.py | 8 +++ mlem/core/hooks.py | 10 +-- mlem/core/meta_io.py | 2 +- mlem/core/metadata.py | 3 +- mlem/core/model.py | 5 +- mlem/core/objects.py | 18 +++-- mlem/core/requirements.py | 110 ++++++++++++++++++++-------- mlem/polydantic/core.py | 6 +- mlem/polydantic/lazy.py | 41 ++++++++--- mlem/utils/module.py | 9 ++- setup.py | 1 + tests/cli/test_apply.py | 12 ++++ tests/conftest.py | 6 +- tests/contrib/test_catboost.py | 2 +- tests/contrib/test_lightgbm.py | 3 +- tests/contrib/test_sklearn.py | 51 +++++++++++++ tests/contrib/test_xgboost.py | 3 +- tests/core/test_artifacts.py | 22 +++++- tests/core/test_meta_io.py | 3 +- tests/core/test_metadata.py | 2 + tests/core/test_objects.py | 10 ++- tests/core/test_requirements.py | 124 ++++++++++++++++++++++++++++++++ tests/polydantic/test_lazy.py | 24 ++++++- 30 files changed, 496 insertions(+), 80 deletions(-) create mode 100644 tests/core/test_requirements.py diff --git a/mlem/cli/apply.py b/mlem/cli/apply.py index ab1a0ac4..09a47b13 100644 --- a/mlem/cli/apply.py +++ b/mlem/cli/apply.py @@ -1,4 +1,4 @@ -from typing import Any, Tuple +from typing import Any, Optional, Tuple import click @@ -25,13 +25,19 @@ help="Whether to create links for outputs in .mlem directory.", ) def apply( - model: ModelMeta, output: str, method: str, args: Tuple[Any], link: bool + model: ModelMeta, + output: Optional[str], + method: str, + args: Tuple[Any], + link: bool, ): """Apply a model to supplied data.""" from mlem.api import apply click.echo("applying") - apply(model, *args, method=method, output=output, link=link) + result = apply(model, *args, method=method, output=output, link=link) + if output is None: + click.echo(result) @mlem_command() diff --git a/mlem/contrib/catboost.py b/mlem/contrib/catboost.py index a7b948d2..2e97423a 100644 --- a/mlem/contrib/catboost.py +++ b/mlem/contrib/catboost.py @@ -57,6 +57,9 @@ def _get_model_file_name(self, model): return self.classifier_file_name return self.regressor_file_name + class Config: + use_enum_values = True + class CatBoostModel(ModelType, ModelHook): """ diff --git a/mlem/contrib/dvc.py b/mlem/contrib/dvc.py index 108cba86..a6256f10 100644 --- a/mlem/contrib/dvc.py +++ b/mlem/contrib/dvc.py @@ -1,13 +1,30 @@ import contextlib +import os.path from typing import IO, ClassVar, Iterator, Tuple from urllib.parse import unquote_plus from fsspec import AbstractFileSystem from fsspec.implementations.github import GithubFileSystem +from fsspec.implementations.local import LocalFileSystem from mlem.core.artifacts import LocalArtifact, LocalStorage, Storage from mlem.core.meta_io import get_fs +BATCH_SIZE = 10 ** 5 + + +def find_dvc_repo_root(path: str): + from dvc.exceptions import NotDvcRepoError + + _path = path[:] + while True: + if os.path.isdir(os.path.join(_path, ".dvc")): + return _path + if _path == "/": + break + _path = 
os.path.dirname(_path)
+    raise NotDvcRepoError(f"Path {path} is not in dvc repo")
+
 
 class DVCStorage(LocalStorage):
     """For now this storage is user-managed dvc storage, which means user should
@@ -38,9 +55,13 @@ class DVCArtifact(LocalArtifact):
     uri: str
 
     def download(self, target_path: str) -> LocalArtifact:
-        from dvc.repo import Repo
-
-        Repo.get_url(self.uri, out=target_path)
+        with self.open() as fin, open(
+            os.path.join(target_path, os.path.basename(self.uri)), "wb"
+        ) as fout:
+            batch = fin.read(BATCH_SIZE)
+            while batch:
+                fout.write(batch)
+                batch = fin.read(BATCH_SIZE)
         return LocalArtifact(uri=target_path)
 
     @contextlib.contextmanager
@@ -58,9 +79,17 @@ def open(self) -> Iterator[IO]:
                 mode="rb",
             ) as f:
                 yield f
-        else:
-            with fs.open(path) as f:
-                yield f
+            return
+        elif isinstance(fs, LocalFileSystem):
+            if not os.path.exists(path):
+                root = find_dvc_repo_root(path)
+                # alternative caching impl
+                # Repo(root).pull(os.path.relpath(path, root))
+                with open(os.path.relpath(path, root), mode="rb") as f:
+                    yield f
+                return
+        with fs.open(path) as f:
+            yield f
 
     def relative(self, fs: AbstractFileSystem, path: str) -> "DVCArtifact":
         relative = super().relative(fs, path)
diff --git a/mlem/contrib/lightgbm.py b/mlem/contrib/lightgbm.py
index d8cd65b4..2d8b1861 100644
--- a/mlem/contrib/lightgbm.py
+++ b/mlem/contrib/lightgbm.py
@@ -16,7 +16,9 @@
 from mlem.core.hooks import IsInstanceHookMixin
 from mlem.core.model import ModelHook, ModelIO, ModelType, Signature
 from mlem.core.requirements import (
+    AddRequirementHook,
     InstallableRequirement,
+    Requirement,
     Requirements,
     UnixPackageRequirement,
 )
@@ -131,3 +133,14 @@ def get_requirements(self) -> Requirements:
         + InstallableRequirement.from_module(mod=lgb)
         + LGB_REQUIREMENT
     )
+
+
+class LGBMLibgompHook(AddRequirementHook):
+    to_add = LGB_REQUIREMENT
+
+    @classmethod
+    def is_object_valid(cls, obj: Requirement) -> bool:
+        return (
+            isinstance(obj, InstallableRequirement)
+            and obj.module == "lightgbm"
+        )
diff --git a/mlem/contrib/sklearn.py b/mlem/contrib/sklearn.py
index c4689003..f71688bc 100644
--- a/mlem/contrib/sklearn.py
+++ b/mlem/contrib/sklearn.py
@@ -66,4 +66,6 @@ def get_requirements(self) -> Requirements:
         )  # FIXME: https://github.com/iterative/mlem/issues/34 # optimize methods reqs
         # some sklearn compatible model (either from library or user code) - fallback
-        return super().get_requirements()
+        return super().get_requirements() + InstallableRequirement.from_module(
+            sklearn
+        )
diff --git a/mlem/contrib/xgboost.py b/mlem/contrib/xgboost.py
index d20260cd..ac9f65f5 100644
--- a/mlem/contrib/xgboost.py
+++ b/mlem/contrib/xgboost.py
@@ -12,7 +12,9 @@
 from mlem.core.hooks import IsInstanceHookMixin
 from mlem.core.model import ModelHook, ModelIO, ModelType, Signature
 from mlem.core.requirements import (
+    AddRequirementHook,
     InstallableRequirement,
+    Requirement,
     Requirements,
     UnixPackageRequirement,
     WithRequirements,
@@ -172,3 +174,13 @@ def get_requirements(self) -> Requirements:
         + InstallableRequirement.from_module(xgboost)
         + XGB_REQUIREMENT
     )
+
+
+class XGBLibgompHook(AddRequirementHook):
+    to_add = XGB_REQUIREMENT
+
+    @classmethod
+    def is_object_valid(cls, obj: Requirement) -> bool:
+        return (
+            isinstance(obj, InstallableRequirement) and obj.module == "xgboost"
+        )
diff --git a/mlem/core/artifacts.py b/mlem/core/artifacts.py
index a2dc24fa..d1c3801b 100644
--- a/mlem/core/artifacts.py
+++ b/mlem/core/artifacts.py
@@ -98,9 +98,10 @@ class FSSpecStorage(Storage):
     storage_options: Optional[Dict[str, str]] = {}
 
     def upload(self, local_path: str, target_path: str) -> FSSpecArtifact:
-        self.get_fs().upload(
-            local_path, os.path.join(self.base_path, target_path)
-        )
+        fs = self.get_fs()
+        path = os.path.join(self.base_path, target_path)
+        fs.makedirs(os.path.dirname(path), exist_ok=True)
+        fs.upload(local_path, path)
         return FSSpecArtifact(uri=self.create_uri(target_path))
 
     @contextlib.contextmanager
@@ -146,6 +147,10 @@ class LocalStorage(FSSpecStorage):
     type: ClassVar = "local"
     fs = LocalFileSystem()
 
+    @property
+    def base_path(self):
+        return self.uri
+
     def relative(self, fs: AbstractFileSystem, path: str) -> "Storage":
         if isinstance(fs, LocalFileSystem):
             return LocalStorage(uri=self.create_uri(path))
@@ -158,11 +163,12 @@ def relative(self, fs: AbstractFileSystem, path: str) -> "Storage":
         return storage
 
     def upload(self, local_path: str, target_path: str) -> "LocalArtifact":
-        return LocalArtifact(uri=super().upload(local_path, target_path).uri)
+        super().upload(local_path, target_path)
+        return LocalArtifact(uri=target_path)
 
     @contextlib.contextmanager
     def open(self, path) -> Iterator[Tuple[IO, "LocalArtifact"]]:
-        with super().open(os.path.join(self.uri, path)) as (io, _):
+        with super().open(path) as (io, _):
             yield io, LocalArtifact(uri=path)
 
diff --git a/mlem/core/errors.py b/mlem/core/errors.py
index 4b983c0e..b9771108 100644
--- a/mlem/core/errors.py
+++ b/mlem/core/errors.py
@@ -46,3 +46,11 @@ class MlemObjectNotSavedError(ValueError, MlemError):
 
 class ObjectExistsError(ValueError, MlemError):
     """Thrown if we attempt to write object, but something already exists in the given path"""
+
+
+class HookNotFound(MlemError):
+    """Thrown if no suitable hook is found for an object"""
+
+
+class MultipleHooksFound(MlemError):
+    """Thrown if more than one suitable hook is found for an object"""
diff --git a/mlem/core/hooks.py b/mlem/core/hooks.py
index 542df77d..9f837f5d 100644
--- a/mlem/core/hooks.py
+++ b/mlem/core/hooks.py
@@ -9,6 +9,8 @@
 from abc import ABC, abstractmethod
 from typing import Any, ClassVar, Generic, List, Tuple, Type, TypeVar
 
+from mlem.core.errors import HookNotFound, MultipleHooksFound
+
 logger = logging.getLogger(__name__)
 
 ANALYZER_FIELD = "analyzer"
@@ -40,7 +42,7 @@ def is_object_valid(cls, obj: Any) -> bool:
         :param obj: object to analyze
         :return: True or False
         """
-        raise NotImplementedError()
+        raise NotImplementedError
 
     @classmethod
     @abstractmethod
@@ -52,7 +54,7 @@ def process(cls, obj: Any, **kwargs) -> T:
         :param kwargs: additional information to be used for analysis
         :return: analysis result
         """
-        raise NotImplementedError()
+        raise NotImplementedError
 
     def __init_subclass__(cls, *args, **kwargs):
         if not inspect.isabstract(cls):
@@ -186,10 +188,10 @@ def _find_hook(cls, obj) -> Type[Hook[T]]:
         if len(lp_hooks) == 1:
             return lp_hooks[0]
         if len(lp_hooks) > 1:
-            raise ValueError(
+            raise MultipleHooksFound(
                 f"Multiple suitable hooks for object {obj} ({lp_hooks})"
             )
-        raise ValueError(
+        raise HookNotFound(
             f"No suitable {cls.base_hook_class.__name__} for object of type "
             f'"{type(obj).__name__}". 
Registered hooks: {cls.hooks}' ) diff --git a/mlem/core/meta_io.py b/mlem/core/meta_io.py index 2395dfc1..96a7f800 100644 --- a/mlem/core/meta_io.py +++ b/mlem/core/meta_io.py @@ -64,7 +64,7 @@ def get_path_by_fs_path(fs: AbstractFileSystem, path: str): Another alternative is to support this on fsspec level, but we need to contribute it ourselves""" if isinstance(fs, GithubFileSystem): # here "rev" should be already url encoded - return f"{fs.protocol}://{fs.org}:{fs.repo}@{fs.root}/{path}" + return f"https://github.com/{fs.org}/{fs.repo}/tree/{fs.root}/{path}" protocol = fs.protocol if isinstance(protocol, (list, tuple)): if any(path.startswith(p) for p in protocol): diff --git a/mlem/core/metadata.py b/mlem/core/metadata.py index 1c38ab46..7884a463 100644 --- a/mlem/core/metadata.py +++ b/mlem/core/metadata.py @@ -10,6 +10,7 @@ from typing_extensions import Literal from yaml import safe_load +from mlem.core.errors import HookNotFound from mlem.core.meta_io import get_fs, get_meta_path, get_path_by_repo_path_rev from mlem.core.objects import DatasetMeta, MlemMeta, ModelMeta, find_object from mlem.utils.root import find_mlem_root @@ -19,7 +20,7 @@ def get_object_metadata(obj: Any, tmp_sample_data=None) -> MlemMeta: """Convert given object to appropriate MlemMeta subclass""" try: return DatasetMeta.from_data(obj) - except ValueError: # TODO need separate analysis exception + except HookNotFound: return ModelMeta.from_obj(obj, sample_data=tmp_sample_data) diff --git a/mlem/core/model.py b/mlem/core/model.py index cdfc5b41..88101c4d 100644 --- a/mlem/core/model.py +++ b/mlem/core/model.py @@ -18,6 +18,7 @@ ) from mlem.core.hooks import Analyzer, Hook from mlem.core.requirements import Requirements, WithRequirements +from mlem.utils.module import get_object_requirements class ModelIO(MlemObject): @@ -255,7 +256,7 @@ def get_requirements(self) -> Requirements: for m in self.methods.values() for r in m.get_requirements().__root__ ] - ) + ) + get_object_requirements(self.model) class ModelHook(Hook[ModelType], ABC): @@ -264,7 +265,7 @@ class ModelHook(Hook[ModelType], ABC): def process( # pylint: disable=arguments-differ # so what cls, obj: Any, sample_data: Optional[Any] = None, **kwargs ) -> ModelType: - pass + raise NotImplementedError class ModelAnalyzer(Analyzer[ModelType]): diff --git a/mlem/core/objects.py b/mlem/core/objects.py index df1b3f9f..4255ee28 100644 --- a/mlem/core/objects.py +++ b/mlem/core/objects.py @@ -407,7 +407,9 @@ class ModelMeta(_ExternalMeta): def from_obj(cls, model: Any, sample_data: Any = None) -> "ModelMeta": mt = ModelAnalyzer.analyze(model, sample_data=sample_data) mt.model = model - return ModelMeta(model_type=mt, requirements=mt.get_requirements()) + return ModelMeta( + model_type=mt, requirements=mt.get_requirements().expanded + ) def write_value(self, mlem_root: str) -> Artifacts: if self.model_type.model is not None: @@ -417,7 +419,7 @@ def write_value(self, mlem_root: str) -> Artifacts: self.model_type.model, ) else: - raise NotImplementedError() # TODO: https://github.com/iterative/mlem/issues/37 + raise NotImplementedError # TODO: https://github.com/iterative/mlem/issues/37 # self.get_artifacts().materialize(path) return artifacts @@ -438,7 +440,15 @@ def __getattr__(self, item): class DatasetMeta(_ExternalMeta): __transient_fields__ = {"dataset"} object_type: ClassVar = "dataset" - reader: Optional[DatasetReader] = None + reader_cache: Optional[Dict] + reader: Optional[DatasetReader] + reader, reader_raw, reader_cache = lazy_field( + DatasetReader, + 
"reader", + "reader_cache", + parse_as_type=Optional[DatasetReader], + default=None, + ) dataset: ClassVar[Optional[Dataset]] = None @property @@ -451,7 +461,7 @@ def from_data(cls, data: Any) -> "DatasetMeta": data, ) meta = DatasetMeta( - requirements=dataset.dataset_type.get_requirements() + requirements=dataset.dataset_type.get_requirements().expanded ) meta.dataset = dataset return meta diff --git a/mlem/core/requirements.py b/mlem/core/requirements.py index 206fa017..7dbd9ce8 100644 --- a/mlem/core/requirements.py +++ b/mlem/core/requirements.py @@ -7,7 +7,7 @@ import json import os import zlib -from abc import ABC +from abc import ABC, abstractmethod from pathlib import Path from types import ModuleType from typing import ( @@ -26,6 +26,9 @@ from mlem.core.base import MlemObject # I dont know how to do this better +from mlem.core.errors import HookNotFound +from mlem.core.hooks import Analyzer, Hook + MODULE_PACKAGE_MAPPING = { "sklearn": "scikit-learn", "skimage": "scikit-image", @@ -132,7 +135,6 @@ class CustomRequirement(PythonRequirement): """ type: ClassVar[str] = "custom" - name: str source64zip: str is_package: bool @@ -162,7 +164,10 @@ def from_module(mod: ModuleType) -> "CustomRequirement": Path(mod.__file__).read_text(encoding="utf8") ) return CustomRequirement( - name=mod.__name__, source64zip=src, is_package=is_package + module=mod.__name__.split(".")[0], + name=mod.__name__, + source64zip=src, + is_package=is_package, ) @staticmethod @@ -312,6 +317,10 @@ def modules(self) -> List[str]: """ return [r.module for r in self.of_type(PythonRequirement)] + @property + def expanded(self) -> "Requirements": + return expand_requirements(self) + def _add_installable(self, requirement: InstallableRequirement): for req in self.installable: if req.package == requirement.package: @@ -407,35 +416,40 @@ def resolve_requirements(other: "AnyRequirements") -> Requirements: :param other: requirement in supported format :return: :class:`Requirements` instance """ - if not isinstance(other, Requirements): - if isinstance(other, list): - if len(other) == 0: - return Requirements.new() - if isinstance(other[0], str): - other = Requirements( - __root__=[ - InstallableRequirement.from_str(r) for r in other - ] - ) - elif isinstance(other[0], Requirement): - other = Requirements(__root__=other) - else: - raise TypeError( - "only other Requirements, Requirement, list of Requirement objects, string " - "(or list of strings) can be added" - ) - elif isinstance(other, Requirement): - other = Requirements(__root__=[other]) - elif isinstance(other, str): - other = Requirements( - __root__=[InstallableRequirement.from_str(other)] + if isinstance(other, Requirements): + return other + + if isinstance(other, list): + if len(other) == 0: + return Requirements.new() + + if isinstance(other[0], str): + return Requirements( + __root__=[ + InstallableRequirement.from_str(r) for r in set(other) + ] ) - else: - raise TypeError( - "only other Requirements, Requirement, list of Requirement objects, string " - "(or list of strings) can be added" - ) - return other + + if isinstance(other[0], Requirement): + res = Requirements.new() + for r in other: + res.add(r) + return res + + raise TypeError( + "only other Requirements, Requirement, list of Requirement objects, string " + "(or list of strings) can be added" + ) + if isinstance(other, Requirement): + return Requirements(__root__=[other]) + + if isinstance(other, str): + return Requirements(__root__=[InstallableRequirement.from_str(other)]) + + raise TypeError( + 
"only other Requirements, Requirement, list of Requirement objects, string " + "(or list of strings) can be added" + ) AnyRequirements = Union[ @@ -464,6 +478,40 @@ def get_requirements(self) -> Requirements: ) +class RequirementsHook(Hook[Requirements], ABC): + @classmethod + @abstractmethod + def is_object_valid(cls, obj: Requirement) -> bool: + raise NotImplementedError + + @classmethod + @abstractmethod + def process(cls, obj: Requirement, **kwargs) -> Requirements: + raise NotImplementedError + + +class AddRequirementHook(RequirementsHook, ABC): + to_add: AnyRequirements = [] + + @classmethod + def process(cls, obj: Requirement, **kwargs) -> Requirements: + return resolve_requirements(cls.to_add) + obj + + +class RequirementsAnalyzer(Analyzer[Requirements]): + base_hook_class = RequirementsHook + + +def expand_requirements(requirements: Requirements) -> Requirements: + res = Requirements.new() + for req in requirements.__root__: + try: + res += RequirementsAnalyzer.analyze(req) + except HookNotFound: + res += req + return res + + # Copyright 2019 Zyfra # Copyright 2021 Iterative # diff --git a/mlem/polydantic/core.py b/mlem/polydantic/core.py index e744901c..ff696c5c 100644 --- a/mlem/polydantic/core.py +++ b/mlem/polydantic/core.py @@ -3,8 +3,10 @@ from pydantic import BaseModel +from mlem.polydantic.lazy import LazyModel -class PolyModel(BaseModel): + +class PolyModel(LazyModel): """Base class that enable polymorphism in pydantic models Attributes: @@ -53,7 +55,7 @@ def dict(self, **kwargs): result[self.__type_field__] = alias return result - @wraps(BaseModel._calculate_keys) + @wraps(BaseModel._calculate_keys) # pylint: disable=protected-access def _calculate_keys(self, *args, **kwargs) -> Optional[AbstractSet[str]]: """Exclude transient stuff""" kwargs["exclude"] = ( diff --git a/mlem/polydantic/lazy.py b/mlem/polydantic/lazy.py index 89b207f3..e4e06a8d 100644 --- a/mlem/polydantic/lazy.py +++ b/mlem/polydantic/lazy.py @@ -1,28 +1,53 @@ -from typing import Type +from typing import Any, Type -from pydantic import Field, parse_obj_as +from pydantic import BaseModel, Field, parse_obj_as from pydantic.fields import FieldInfo +class LazyModel(BaseModel): + def __setattr__(self, name, value): + """ + To be able to use properties with setters + """ + prop = getattr(self.__class__, name, None) + if ( + prop is not None + and isinstance(prop, property) + and prop.fset is not None + ): + object.__setattr__(self, name, value) + return + super().__setattr__(name, value) + + def lazy_field( - type_: Type, alias: str, alias_cache: str, *args, **field_kwargs + type_: Type, + alias: str, + alias_cache: str, + *args, + parse_as_type: Any = None, + **field_kwargs ): - field_info: FieldInfo = Field(*args, alias=alias, **field_kwargs) - @property def getter(self): value = getattr(self, alias_cache) if not isinstance(value, type_): - value = parse_obj_as(type_, value) + value = parse_obj_as(parse_as_type or type_, value) setattr(self, alias_cache, value) return value - @property + def setter(self, value): + setattr(self, alias_cache, value) + def getter_raw(self): value = getattr(self, alias_cache) if isinstance(value, type_): value = value.dict() return value - return getter, getter_raw, field_info + return ( + property(fget=getter, fset=setter), + property(fget=getter_raw, fset=setter), + field_info, + ) diff --git a/mlem/utils/module.py b/mlem/utils/module.py index 9b9f4bbb..09e6b971 100644 --- a/mlem/utils/module.py +++ b/mlem/utils/module.py @@ -160,7 +160,8 @@ def __init__(self): [ "opcode", 
"nturl2path", # pytest requirements missed by isort - "pkg_resources", # EBNT-112: workaround for imports from setup.py (see build/builder/docker.py) + "pkg_resources", # workaround for imports from setup.py (see build/builder/docker.py) + "sre_compile", "posixpath", "setuptools", "pydevconsole", @@ -507,6 +508,7 @@ def _should_ignore(self, mod: ModuleType): any(mod.__name__.startswith(i) for i in self.ignoring) or is_private_module(mod) or is_pseudo_module(mod) + or is_builtin_module(mod) ) def add_requirement(self, obj_or_module): @@ -565,7 +567,10 @@ def get_object_requirements(obj) -> Requirements: :param obj: obj to analyze :return: :class:`.Requirements` object containing all required packages """ - a = RequirementAnalyzer(recurse=True) + # there was recurse=True here for some reason, but this caused infinite recursion in some cases + # (def a(): b(); def b(): a(); def func(): a(); get_obj_reqs(func) - smth like that) + # I removed it and none of the tests failed ¯\_(ツ)_/¯ + a = RequirementAnalyzer() a.dump(obj) return a.to_requirements() diff --git a/setup.py b/setup.py index 1662327c..b1b19d3f 100644 --- a/setup.py +++ b/setup.py @@ -66,6 +66,7 @@ def run(self): "appdirs", "python-daemon", "distro", + "gitpython", ] # storage diff --git a/tests/cli/test_apply.py b/tests/cli/test_apply.py index 7cd2d71e..d992b014 100644 --- a/tests/cli/test_apply.py +++ b/tests/cli/test_apply.py @@ -21,6 +21,18 @@ def test_apply(model_path, data_path): assert isinstance(predictions, ndarray) +def test_apply_no_output(model_path, data_path): + runner = CliRunner() + result = runner.invoke( + apply, + [model_path, data_path, "-m", "predict", "--no-link"], + ) + assert result.exit_code == 0, (result.output, result.exception) + applying = "applying\n" + assert result.output.startswith(applying) + assert len(result.output) > len(applying) + + def test_apply_for_multiple_datasets(model_path, data_path): runner = CliRunner() result = runner.invoke( diff --git a/tests/conftest.py b/tests/conftest.py index a19e7479..e1996bdc 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -230,13 +230,17 @@ def dataset_write_read_check( def check_model_type_common_interface( - model_type: ModelType, data_type: DatasetType, returns_type: DatasetType + model_type: ModelType, + data_type: DatasetType, + returns_type: DatasetType, + **kwargs, ): assert PREDICT_METHOD_NAME in model_type.methods assert model_type.methods[PREDICT_METHOD_NAME] == Signature( name=PREDICT_METHOD_NAME, args=[Argument(name=PREDICT_ARG_NAME, type_=data_type)], returns=returns_type, + **kwargs, ) diff --git a/tests/contrib/test_catboost.py b/tests/contrib/test_catboost.py index e79048d7..b785856e 100644 --- a/tests/contrib/test_catboost.py +++ b/tests/contrib/test_catboost.py @@ -47,7 +47,7 @@ def test_catboost_model(catboost_model_fixture, pandas_data, tmpdir, request): ), ) - expected_requirements = {"catboost", "pandas", "numpy"} + expected_requirements = {"catboost", "pandas", "numpy", "scipy"} assert set(cbmw.get_requirements().modules) == expected_requirements assert cbmw.model is catboost_model diff --git a/tests/contrib/test_lightgbm.py b/tests/contrib/test_lightgbm.py index 3b3d2bc3..05b7d17b 100644 --- a/tests/contrib/test_lightgbm.py +++ b/tests/contrib/test_lightgbm.py @@ -147,7 +147,8 @@ def test_model__predict_not_dataset(model): def test_model__dump_load(tmpdir, model, dataset_np, local_fs): - expected_requirements = {"lightgbm", "numpy"} + # pandas is not required, but if it is installed, it is imported by lightgbm + 
expected_requirements = {"lightgbm", "numpy", "scipy", "pandas"} assert set(model.get_requirements().modules) == expected_requirements artifacts = model.dump( diff --git a/tests/contrib/test_sklearn.py b/tests/contrib/test_sklearn.py index 346a15ee..72845229 100644 --- a/tests/contrib/test_sklearn.py +++ b/tests/contrib/test_sklearn.py @@ -1,3 +1,4 @@ +import lightgbm as lgb import numpy as np import pytest from sklearn.linear_model import LinearRegression, LogisticRegression @@ -7,6 +8,7 @@ from mlem.core.artifacts import LOCAL_STORAGE from mlem.core.dataset_type import DatasetAnalyzer from mlem.core.model import ModelAnalyzer +from mlem.core.requirements import UnixPackageRequirement from tests.conftest import check_model_type_common_interface @@ -34,6 +36,12 @@ def regressor(inp_data, out_data): return lr +@pytest.fixture +def lgbm_model(inp_data, out_data): + lgbm_regressor = lgb.LGBMRegressor() + return lgbm_regressor.fit(inp_data, out_data) + + @pytest.mark.parametrize("model_fixture", ["classifier", "regressor"]) def test_hook(model_fixture, inp_data, request): model = request.getfixturevalue(model_fixture) @@ -51,6 +59,22 @@ def test_hook(model_fixture, inp_data, request): ) +def test_hook_lgb(lgbm_model, inp_data): + data_type = DatasetAnalyzer.analyze(inp_data) + model_type = ModelAnalyzer.analyze(lgbm_model, sample_data=inp_data) + + assert isinstance(model_type, SklearnModel) + check_model_type_common_interface( + model_type, + data_type, + NumpyNdarrayType( + shape=(None,), + dtype="float64", + ), + varkw="kwargs", + ) + + @pytest.mark.parametrize("model", ["classifier", "regressor"]) def test_model_type__predict(model, inp_data, request): model = request.getfixturevalue(model) @@ -97,6 +121,33 @@ def test_model_type__dump_load(tmpdir, model, inp_data, request): assert set(model_type.get_requirements().modules) == expected_requirements +def test_model_type_lgb__dump_load(tmpdir, lgbm_model, inp_data): + model_type = ModelAnalyzer.analyze(lgbm_model, sample_data=inp_data) + + expected_requirements = {"sklearn", "lightgbm", "pandas", "numpy", "scipy"} + reqs = model_type.get_requirements().expanded + assert set(reqs.modules) == expected_requirements + assert reqs.of_type(UnixPackageRequirement) == [ + UnixPackageRequirement(package_name="libgomp1") + ] + artifacts = model_type.dump(LOCAL_STORAGE, tmpdir) + model_type.model = None + + with pytest.raises(ValueError): + model_type.call_method("predict", inp_data) + + model_type.load(artifacts) + np.testing.assert_array_almost_equal( + lgbm_model.predict(inp_data), + model_type.call_method("predict", inp_data), + ) + reqs = model_type.get_requirements().expanded + assert set(reqs.modules) == expected_requirements + assert reqs.of_type(UnixPackageRequirement) == [ + UnixPackageRequirement(package_name="libgomp1") + ] + + # Copyright 2019 Zyfra # Copyright 2021 Iterative # diff --git a/tests/contrib/test_xgboost.py b/tests/contrib/test_xgboost.py index a605d828..c7f3958d 100644 --- a/tests/contrib/test_xgboost.py +++ b/tests/contrib/test_xgboost.py @@ -131,7 +131,8 @@ def test_model__predict_not_dmatrix(model): def test_model__dump_load(tmpdir, model, dmatrix_np, local_fs): - expected_requirements = {"xgboost", "numpy"} + # pandas is not required, but it is conditionally imported by some Booster methods + expected_requirements = {"xgboost", "numpy", "scipy", "pandas"} assert set(model.get_requirements().modules) == expected_requirements artifacts = model.dump( diff --git a/tests/core/test_artifacts.py b/tests/core/test_artifacts.py 
index 6a55d269..f3de08c3 100644 --- a/tests/core/test_artifacts.py +++ b/tests/core/test_artifacts.py @@ -4,7 +4,12 @@ from fsspec.implementations.local import LocalFileSystem from s3fs import S3FileSystem -from mlem.core.artifacts import FSSpecArtifact, FSSpecStorage, LocalStorage +from mlem.core.artifacts import ( + FSSpecArtifact, + FSSpecStorage, + LocalArtifact, + LocalStorage, +) from tests.conftest import long, resource_path @@ -53,3 +58,18 @@ def test_relative_storage_local(): assert rel1 != local_storage assert isinstance(rel1, FSSpecStorage) assert rel1.uri == "s3://some_path" + + +def test_local_storage_relative(tmpdir): + storage = LocalStorage(uri=str(tmpdir)) + rstorage = storage.relative(LocalFileSystem(), "subdir") + with rstorage.open("file2") as (f, open_art): + f.write(b"1") + assert isinstance(open_art, LocalArtifact) + assert open_art.uri == "file2" + assert os.path.isfile(os.path.join(tmpdir, "subdir", open_art.uri)) + + upload_art = rstorage.upload(__file__, "file") + assert isinstance(upload_art, LocalArtifact) + assert upload_art.uri == "file" + assert os.path.isfile(os.path.join(tmpdir, "subdir", upload_art.uri)) diff --git a/tests/core/test_meta_io.py b/tests/core/test_meta_io.py index 26c44613..25ea0a2f 100644 --- a/tests/core/test_meta_io.py +++ b/tests/core/test_meta_io.py @@ -1,4 +1,5 @@ import os.path +from json import loads import pytest from fsspec.implementations.github import GithubFileSystem @@ -118,7 +119,7 @@ def test_get_path_by_fs_path_github(): ) uri = get_path_by_fs_path(fs, "path") fs2, path = get_fs(uri) - assert fs2.to_json() == fs.to_json() + assert loads(fs2.to_json()) == loads(fs.to_json()) assert path == "path" diff --git a/tests/core/test_metadata.py b/tests/core/test_metadata.py index c63fb814..d2ea05fe 100644 --- a/tests/core/test_metadata.py +++ b/tests/core/test_metadata.py @@ -128,6 +128,7 @@ def test_load_link_with_fsspec_path(current_test_branch): model.predict(train) +@pytest.mark.xfail # TODO: https://github.com/iterative/mlem/issues/110 @long def test_saving_to_s3(model, s3_storage_fs, s3_tmp_path): path = s3_tmp_path("model_save") @@ -143,6 +144,7 @@ def test_saving_to_s3(model, s3_storage_fs, s3_tmp_path): assert s3_storage_fs.isfile(str(model_path / ART_DIR / "data.pkl")) +@pytest.mark.xfail # TODO: https://github.com/iterative/mlem/issues/110 @long def test_loading_from_s3(model, s3_storage_fs, s3_tmp_path): path = s3_tmp_path("model_load") diff --git a/tests/core/test_objects.py b/tests/core/test_objects.py index 086ae1d4..ba6db8fc 100644 --- a/tests/core/test_objects.py +++ b/tests/core/test_objects.py @@ -18,17 +18,23 @@ from mlem.core.objects import MlemLink, ModelMeta, mlem_dir_path from tests.conftest import MLEM_TEST_REPO, long, need_test_repo_auth +MODEL_NAME = "decision_tree" + def test_model_dump(mlem_root): X, y = load_iris(return_X_y=True) clf = DecisionTreeClassifier().fit(X, y) meta = ModelMeta.from_obj(clf, sample_data=X) - dir = os.path.join(mlem_root, "decision_tree") + dir = os.path.join(mlem_root, MODEL_NAME) meta.dump(dir, link=True) link_path = os.path.join( - mlem_root, MLEM_DIR, "model", "decision_tree" + MLEM_EXT + mlem_root, MLEM_DIR, "model", MODEL_NAME + MLEM_EXT ) assert os.path.exists(link_path) + assert os.path.exists(os.path.join(dir, META_FILE_NAME)) + assert os.path.exists( + os.path.join(mlem_root, MODEL_NAME, ART_DIR, "data.pkl") + ) model = load(link_path, follow_links=True) model.predict(X) diff --git a/tests/core/test_requirements.py b/tests/core/test_requirements.py new file mode 100644 
index 00000000..fe58d535 --- /dev/null +++ b/tests/core/test_requirements.py @@ -0,0 +1,124 @@ +import pytest + +from mlem.core.requirements import ( + CustomRequirement, + InstallableRequirement, + Requirements, + resolve_requirements, +) + + +def test_resolve_requirements_arg(): + requirements = Requirements.new( + [ + InstallableRequirement(module="dumb", version="0.4.1"), + InstallableRequirement(module="art", version="4.0"), + ] + ) + actual_reqs = resolve_requirements(requirements) + assert actual_reqs == requirements + + +def test_resolve_requirement_arg(): + req = InstallableRequirement(module="dumb", version="0.4.1") + actual_reqs = resolve_requirements(req) + assert actual_reqs.installable[0] == req + + +def test_resolve_requirement_list_arg(): + req = [ + InstallableRequirement(module="dumb", version="0.4.1"), + InstallableRequirement(module="art", version="4.0"), + ] + actual_reqs = resolve_requirements(req) + assert len(actual_reqs.installable) == 2 + assert actual_reqs.installable == req + + +def test_resolve_str_arg(): + req = "dumb==0.4.1" + actual_reqs = resolve_requirements(req) + assert actual_reqs.installable[0].to_str() == req + + +def test_resolve_str_list_arg(): + req = ["dumb==0.4.1", "art==4.0"] + actual_reqs = resolve_requirements(req) + assert len(actual_reqs.installable) == 2 + assert sorted(req) == sorted([r.to_str() for r in actual_reqs.installable]) + + +def test_installable_requirement__from_module(): + import pandas as pd + + assert ( + InstallableRequirement.from_module(pd).to_str() + == f"pandas=={pd.__version__}" + ) + + import numpy as np + + assert ( + InstallableRequirement.from_module(np).to_str() + == f"numpy=={np.__version__}" + ) + + import sklearn as sk + + assert ( + InstallableRequirement.from_module(sk).to_str() + == f"scikit-learn=={sk.__version__}" + ) + assert ( + InstallableRequirement.from_module(sk, "xyz").to_str() + == f"xyz=={sk.__version__}" + ) + + +def test_custom_requirement__source(): + from mlem import core + + package = CustomRequirement.from_module(core) + assert package.is_package + assert package.sources is not None + with pytest.raises(AttributeError): + package.source # pylint: disable=pointless-statement + + module = CustomRequirement.from_module(core.requirements) + assert not module.is_package + assert module.source is not None + with pytest.raises(AttributeError): + module.sources # pylint: disable=pointless-statement + + +def test_resolve_unique_str(): + reqs_str = ["a==1", "a==1"] + reqs = Requirements.new(reqs_str) + assert len(reqs.__root__) == 1 + assert reqs.installable[0] == InstallableRequirement( + module="a", version="1" + ) + + +def test_resolve_unique_req(): + req = InstallableRequirement(module="a", version="1") + reqs_list = [req, req] + reqs = Requirements.new(reqs_list) + assert len(reqs.__root__) == 1 + assert reqs.installable[0] == req + + +# Copyright 2019 Zyfra +# Copyright 2021 Iterative +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/tests/polydantic/test_lazy.py b/tests/polydantic/test_lazy.py index c0a5446f..ac4d3998 100644 --- a/tests/polydantic/test_lazy.py +++ b/tests/polydantic/test_lazy.py @@ -1,9 +1,9 @@ -from typing import Any +from typing import Any, Dict, Optional import pytest from pydantic import BaseModel, ValidationError, parse_obj_as, validator -from mlem.polydantic.lazy import lazy_field +from mlem.polydantic.lazy import LazyModel, lazy_field class Payload(BaseModel): @@ -60,3 +60,23 @@ def test_setting_value(): obj.field.value = 2 assert obj.field.value == 2 assert obj.field_raw["value"] == 2 + + +class ModelWithOptional(LazyModel): + field_cache: Optional[Dict] + field, field_raw, field_cache = lazy_field( + Payload, + "field", + "field_cache", + parse_as_type=Optional[Payload], + default=None, + ) + + +def test_setting_optional_field(): + obj = ModelWithOptional() + assert obj.field is None + obj.field = Payload(value=0) + assert obj.field.value == 1 + obj.field_raw = {"value": 5} + assert obj.field.value == 6
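
A minimal sketch (not part of the patch itself) of how the requirement-expansion hooks added in mlem/core/requirements.py are meant to compose; MyLibDepsHook, mylib and libfoo1 are hypothetical stand-ins mirroring LGBMLibgompHook from mlem/contrib/lightgbm.py:

from mlem.core.requirements import (
    AddRequirementHook,
    InstallableRequirement,
    Requirement,
    Requirements,
    UnixPackageRequirement,
)


class MyLibDepsHook(AddRequirementHook):
    # hypothetical hook: whenever "mylib" appears as an installable
    # requirement, also require the libfoo1 system package
    to_add = UnixPackageRequirement(package_name="libfoo1")

    @classmethod
    def is_object_valid(cls, obj: Requirement) -> bool:
        return (
            isinstance(obj, InstallableRequirement) and obj.module == "mylib"
        )


# subclassing registers the hook with RequirementsAnalyzer, so the new
# Requirements.expanded property (which calls expand_requirements) picks it up
reqs = Requirements.new(["mylib==1.0", "numpy==1.21.0"])
expanded = reqs.expanded
assert expanded.of_type(UnixPackageRequirement) == [
    UnixPackageRequirement(package_name="libfoo1")
]
assert set(expanded.modules) == {"mylib", "numpy"}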