diff --git a/.ci/pipeline/build-and-test-lnx.yml b/.ci/pipeline/build-and-test-lnx.yml
index bfc91ffc0f..9d8a904b9f 100644
--- a/.ci/pipeline/build-and-test-lnx.yml
+++ b/.ci/pipeline/build-and-test-lnx.yml
@@ -47,6 +47,7 @@ steps:
       bash .ci/scripts/setup_sklearn.sh $(SKLEARN_VERSION)
       pip install --upgrade -r requirements-test.txt -r requirements-test-optional.txt
       pip install $(python .ci/scripts/get_compatible_scipy_version.py)
+      if [ $(echo $(PYTHON_VERSION) | grep '3.8\|3.9\|3.10') ]; then conda install -q -y -c intel dpnp; fi
       pip list
     displayName: 'Install testing requirements'
   - script: |
diff --git a/.ci/pipeline/ci.yml b/.ci/pipeline/ci.yml
index 7c5c54dd78..5973fc130e 100644
--- a/.ci/pipeline/ci.yml
+++ b/.ci/pipeline/ci.yml
@@ -42,20 +42,6 @@ variables:
   ARGS: '1'
 
 jobs:
-- job: PEP8
-  pool:
-    vmImage: 'ubuntu-22.04'
-  steps:
-  - task: UsePythonVersion@0
-    inputs:
-      versionSpec: '3.9'
-      addToPath: true
-  - script: |
-      python -m pip install --upgrade pip setuptools
-      pip install flake8
-      flake8 --ignore=E265,E722,E402,F401,F403,W503 --max-line-length=90 --count
-    displayName: 'PEP 8 check'
-
 - job: Linux
   timeoutInMinutes: 120
   strategy:
diff --git a/.ci/pipeline/docs.yml b/.ci/pipeline/docs.yml
index 4a9667bb7d..bbf6cb73c4 100644
--- a/.ci/pipeline/docs.yml
+++ b/.ci/pipeline/docs.yml
@@ -41,19 +41,6 @@ variables:
   value: python
 
 jobs:
-- job: PEP8
-  pool:
-    vmImage: 'ubuntu-22.04'
-  steps:
-  - task: UsePythonVersion@0
-    inputs:
-      versionSpec: '3.9'
-      addToPath: true
-  - script: |
-      python -m pip install --upgrade pip setuptools
-      pip install flake8
-      flake8 --ignore=E265,E722,E402,F401,F403,W503 --max-line-length=90 --count
-    displayName: 'PEP 8 check'
 - job: Docs
   pool:
     vmImage: 'ubuntu-22.04'
diff --git a/.circleci/run_xpu_tests.py b/.circleci/run_xpu_tests.py
index 5a35cbad9f..7912e63395 100644
--- a/.circleci/run_xpu_tests.py
+++ b/.circleci/run_xpu_tests.py
@@ -80,7 +80,7 @@ if args.device == "gpu":
     from sklearnex._config import config_context
 
     with config_context(
-            target_offload=args.device, allow_fallback_to_host=True):
+            target_offload=args.device, allow_fallback_to_host=False):
         pytest.main(
             pytest_params + ["--pyargs", "sklearn"] + yml_deselected_tests
         )
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
new file mode 100644
index 0000000000..2b766d4e56
--- /dev/null
+++ b/.github/workflows/lint.yml
@@ -0,0 +1,37 @@
+name: Lint
+
+on: [pull_request, push]
+
+jobs:
+  lint:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+      - id: changedfiles
+        name: Get list of changed .py files
+        run: |
+          set -x
+          echo "CHANGED_FILES=$(git diff --name-only --diff-filter=ACMRT ${{ github.event.pull_request.base.sha }} ${{ github.sha }} | grep .py$ | xargs)" >> "$GITHUB_OUTPUT"
+          if [[ $(git diff --name-only --diff-filter=ACMRT ${{ github.event.pull_request.base.sha }} ${{ github.sha }} | grep .py$ | wc -l) -eq 0 ]]; then
+            echo "HAS_CHANGES=false" >> "$GITHUB_OUTPUT"
+          else
+            echo "HAS_CHANGES=true" >> "$GITHUB_OUTPUT"
+          fi
+      - uses: actions/setup-python@v4
+        name: Setup python for isort
+        if: steps.changedfiles.outputs.HAS_CHANGES == 'true'
+        with:
+          python-version: "3.10"
+      - name: Run isort on changed files
+        if: steps.changedfiles.outputs.HAS_CHANGES == 'true'
+        run: |
+          set -x
+          python -m pip install isort
+          isort --check ${{ steps.changedfiles.outputs.CHANGED_FILES }}
+      - uses: psf/black@stable
+        name: Run black on changed files
+        if: steps.changedfiles.outputs.HAS_CHANGES == 'true'
+        with:
+          src: ${{ steps.changedfiles.outputs.CHANGED_FILES }}
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000000..69b711d5ea
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,27 @@
+#===============================================================================
+# Copyright 2023 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#===============================================================================
+
+repos:
+  - repo: https://github.com/psf/black
+    rev: 23.7.0
+    hooks:
+      - id: black
+        language_version: python3.10
+  - repo: https://github.com/PyCQA/isort
+    rev: 5.12.0
+    hooks:
+      - id: isort
+        language_version: python3.10
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 63d4cf4e93..78a6a595d7 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -41,3 +41,15 @@ To contribute your changes directly to the repository, do the following:
 - [Submit](https://github.com/intel/scikit-learn-intelex/pulls) a pull request into the `master` branch. Provide a brief description of the changes you are contributing.
 
 Public CI is enabled for the repository. Your PR should pass all of our checks. We will review your contribution and, if any additional fixes or modifications are necessary, we may give some feedback to guide you. When accepted, your pull request will be merged into our GitHub repository.
+
+## Code Style
+
+We use the [black](https://black.readthedocs.io/en/stable/) and [isort](https://pycqa.github.io/isort/) formatters for Python* code. The line length is 90 characters; the default options are used otherwise. You can find the linter configuration in our [pyproject.toml](https://github.com/intel/scikit-learn-intelex/blob/master/pyproject.toml).
+A GitHub* Action verifies that your changes comply with the output of the auto-formatting tools.
+
+Optionally, you can install pre-commit hooks that do the formatting for you. To do so, run the following from the top level of the repository:
+
+```bash
+pip install pre-commit
+pre-commit install
+```
\ No newline at end of file
diff --git a/daal4py/__init__.py b/daal4py/__init__.py
index f3b7181c27..19c4cb0410 100644
--- a/daal4py/__init__.py
+++ b/daal4py/__init__.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-#===============================================================================
+# ===============================================================================
 # Copyright 2014 Intel Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -13,44 +13,47 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-#=============================================================================== +# =============================================================================== import platform + if "Windows" in platform.system(): import os - import sys import site + import sys + current_path = os.path.dirname(__file__) path_to_env = site.getsitepackages()[0] path_to_libs = os.path.join(path_to_env, "Library", "bin") path_to_oneapi_backend = os.path.join(current_path, "oneapi") if sys.version_info.minor >= 8: - if 'DALROOT' in os.environ: - dal_root_redist = os.path.join(os.environ['DALROOT'], "redist", "intel64") + if "DALROOT" in os.environ: + dal_root_redist = os.path.join(os.environ["DALROOT"], "redist", "intel64") if os.path.exists(dal_root_redist): os.add_dll_directory(dal_root_redist) - os.environ['PATH'] = dal_root_redist + os.pathsep + os.environ['PATH'] + os.environ["PATH"] = dal_root_redist + os.pathsep + os.environ["PATH"] os.add_dll_directory(path_to_libs) os.add_dll_directory(path_to_oneapi_backend) - os.environ['PATH'] = path_to_libs + os.pathsep + os.environ['PATH'] + os.environ["PATH"] = path_to_libs + os.pathsep + os.environ["PATH"] try: from daal4py._daal4py import * from daal4py._daal4py import ( - _get__version__, + __has_dist__, _get__daal_link_version__, _get__daal_run_version__, - __has_dist__) + _get__version__, + ) except ImportError as e: s = str(e) - if 'libfabric' in s: + if "libfabric" in s: raise ImportError( - s + '\n\nActivating your conda environment or sourcing mpivars.' - '[c]sh/psxevars.[c]sh may solve the issue.\n') + s + "\n\nActivating your conda environment or sourcing mpivars." + "[c]sh/psxevars.[c]sh may solve the issue.\n" + ) raise -from . import mb -from . import sklearn +from . import mb, sklearn -__all__ = ['mb', 'sklearn'] +__all__ = ["mb", "sklearn"] diff --git a/daal4py/__main__.py b/daal4py/__main__.py index 9b3a27a43d..3ded7d1e10 100644 --- a/daal4py/__main__.py +++ b/daal4py/__main__.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2014 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,9 +13,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== import sys + from .sklearn import patch_sklearn @@ -29,27 +30,30 @@ def _main(): Python* patches of scikit-learn, optimizing solvers of scikit-learn with Intel(R) oneAPI Data Analytics Library. 
""", - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - - parser.add_argument('-m', action='store_true', dest='module', - help="Executes following as a module") - parser.add_argument('name', help="Script or module name") - parser.add_argument('args', nargs=argparse.REMAINDER, - help="Command line arguments") + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + + parser.add_argument( + "-m", action="store_true", dest="module", help="Executes following as a module" + ) + parser.add_argument("name", help="Script or module name") + parser.add_argument("args", nargs=argparse.REMAINDER, help="Command line arguments") args = parser.parse_args() try: import sklearn + patch_sklearn() except ImportError: print("Scikit-learn could not be imported. Nothing to patch") sys.argv = [args.name] + args.args - if '_' + args.name in globals(): - return globals()['_' + args.name](*args.args) + if "_" + args.name in globals(): + return globals()["_" + args.name](*args.args) import runpy + runf = runpy.run_module if args.module else runpy.run_path - runf(args.name, run_name='__main__') + runf(args.name, run_name="__main__") sys.exit(_main()) diff --git a/daal4py/mb/__init__.py b/daal4py/mb/__init__.py index 279681ca07..5478f23b0b 100644 --- a/daal4py/mb/__init__.py +++ b/daal4py/mb/__init__.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,8 +13,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from .model_builders import GBTDAALBaseModel, convert_model -__all__ = ['GBTDAALBaseModel', 'convert_model'] +__all__ = ["GBTDAALBaseModel", "convert_model"] diff --git a/daal4py/mb/model_builders.py b/daal4py/mb/model_builders.py index c3f2a99be7..aafbc0be47 100644 --- a/daal4py/mb/model_builders.py +++ b/daal4py/mb/model_builders.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,16 +12,18 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== # daal4py Model builders API import numpy as np + import daal4py as d4p try: from pandas import DataFrame from pandas.core.dtypes.cast import find_common_type + pandas_is_imported = True except (ImportError, ModuleNotFoundError): pandas_is_imported = False @@ -41,7 +43,7 @@ def getFPType(X): dt = find_common_type(X.dtypes.tolist()) return parse_dtype(dt) - dt = getattr(X, 'dtype', None) + dt = getattr(X, "dtype", None) return parse_dtype(dt) @@ -65,9 +67,9 @@ def _get_params_from_xgboost(self, params): self.n_features_in_ = int(params["learner"]["learner_model_param"]["num_feature"]) def _get_params_from_catboost(self, params): - if 'class_params' in params['model_info']: - self.n_classes_ = len(params['model_info']['class_params']['class_to_label']) - self.n_features_in_ = len(params['features_info']['float_features']) + if "class_params" in params["model_info"]: + self.n_classes_ = len(params["model_info"]["class_params"]["class_to_label"]) + self.n_features_in_ = len(params["features_info"]["float_features"]) def _convert_model_from_lightgbm(self, booster): lgbm_params = d4p.get_lightgbm_params(booster) @@ -85,8 +87,10 @@ def _convert_model_from_catboost(self, booster): self._get_params_from_catboost(catboost_params) def _convert_model(self, model): - (submodule_name, class_name) = (model.__class__.__module__, - model.__class__.__name__) + (submodule_name, class_name) = ( + model.__class__.__module__, + model.__class__.__name__, + ) self_class_name = self.__class__.__name__ # Build GBTDAALClassifier from LightGBM @@ -94,82 +98,101 @@ def _convert_model(self, model): if self_class_name == "GBTDAALClassifier": self._convert_model_from_lightgbm(model.booster_) else: - raise TypeError(f"Only GBTDAALClassifier can be created from\ - {submodule_name}.{class_name} (got {self_class_name})") + raise TypeError( + f"Only GBTDAALClassifier can be created from\ + {submodule_name}.{class_name} (got {self_class_name})" + ) # Build GBTDAALClassifier from XGBoost elif (submodule_name, class_name) == ("xgboost.sklearn", "XGBClassifier"): if self_class_name == "GBTDAALClassifier": self._convert_model_from_xgboost(model.get_booster()) else: - raise TypeError(f"Only GBTDAALClassifier can be created from\ - {submodule_name}.{class_name} (got {self_class_name})") + raise TypeError( + f"Only GBTDAALClassifier can be created from\ + {submodule_name}.{class_name} (got {self_class_name})" + ) # Build GBTDAALClassifier from CatBoost elif (submodule_name, class_name) == ("catboost.core", "CatBoostClassifier"): if self_class_name == "GBTDAALClassifier": self._convert_model_from_catboost(model) else: - raise TypeError(f"Only GBTDAALClassifier can be created from\ - {submodule_name}.{class_name} (got {self_class_name})") + raise TypeError( + f"Only GBTDAALClassifier can be created from\ + {submodule_name}.{class_name} (got {self_class_name})" + ) # Build GBTDAALRegressor from LightGBM elif (submodule_name, class_name) == ("lightgbm.sklearn", "LGBMRegressor"): if self_class_name == "GBTDAALRegressor": self._convert_model_from_lightgbm(model.booster_) else: - raise TypeError(f"Only GBTDAALRegressor can be created from\ - {submodule_name}.{class_name} (got {self_class_name})") + raise TypeError( + f"Only GBTDAALRegressor can be created from\ + {submodule_name}.{class_name} (got {self_class_name})" + ) # Build GBTDAALRegressor from XGBoost elif 
(submodule_name, class_name) == ("xgboost.sklearn", "XGBRegressor"): if self_class_name == "GBTDAALRegressor": self._convert_model_from_xgboost(model.get_booster()) else: - raise TypeError(f"Only GBTDAALRegressor can be created from\ - {submodule_name}.{class_name} (got {self_class_name})") + raise TypeError( + f"Only GBTDAALRegressor can be created from\ + {submodule_name}.{class_name} (got {self_class_name})" + ) # Build GBTDAALRegressor from CatBoost elif (submodule_name, class_name) == ("catboost.core", "CatBoostRegressor"): if self_class_name == "GBTDAALRegressor": self._convert_model_from_catboost(model) else: - raise TypeError(f"Only GBTDAALRegressor can be created from\ - {submodule_name}.{class_name} (got {self_class_name})") + raise TypeError( + f"Only GBTDAALRegressor can be created from\ + {submodule_name}.{class_name} (got {self_class_name})" + ) # Build GBTDAALModel from LightGBM elif (submodule_name, class_name) == ("lightgbm.basic", "Booster"): if self_class_name == "GBTDAALModel": self._convert_model_from_lightgbm(model) else: - raise TypeError(f"Only GBTDAALModel can be created from\ - {submodule_name}.{class_name} (got {self_class_name})") + raise TypeError( + f"Only GBTDAALModel can be created from\ + {submodule_name}.{class_name} (got {self_class_name})" + ) # Build GBTDAALModel from XGBoost elif (submodule_name, class_name) == ("xgboost.core", "Booster"): if self_class_name == "GBTDAALModel": self._convert_model_from_xgboost(model) else: - raise TypeError(f"Only GBTDAALModel can be created from\ - {submodule_name}.{class_name} (got {self_class_name})") + raise TypeError( + f"Only GBTDAALModel can be created from\ + {submodule_name}.{class_name} (got {self_class_name})" + ) # Build GBTDAALModel from CatBoost elif (submodule_name, class_name) == ("catboost.core", "CatBoost"): if self_class_name == "GBTDAALModel": self._convert_model_from_catboost(model) else: - raise TypeError(f"Only GBTDAALModel can be created from\ - {submodule_name}.{class_name} (got {self_class_name})") + raise TypeError( + f"Only GBTDAALModel can be created from\ + {submodule_name}.{class_name} (got {self_class_name})" + ) else: raise TypeError(f"Unknown model format {submodule_name}.{class_name}") def _predict_classification(self, X, fptype, resultsToEvaluate): if X.shape[1] != self.n_features_in_: - raise ValueError('Shape of input is different from what was seen in `fit`') + raise ValueError("Shape of input is different from what was seen in `fit`") - if not hasattr(self, 'daal_model_'): - raise ValueError(( - "The class {} instance does not have 'daal_model_' attribute set. " - "Call 'fit' with appropriate arguments before using this method.") - .format(type(self).__name__)) + if not hasattr(self, "daal_model_"): + raise ValueError( + ( + "The class {} instance does not have 'daal_model_' attribute set. " + "Call 'fit' with appropriate arguments before using this method." 
+ ).format(type(self).__name__) + ) # Prediction predict_algo = d4p.gbt_classification_prediction( - fptype=fptype, - nClasses=self.n_classes_, - resultsToEvaluate=resultsToEvaluate) + fptype=fptype, nClasses=self.n_classes_, resultsToEvaluate=resultsToEvaluate + ) predict_result = predict_algo.compute(X, self.daal_model_) if resultsToEvaluate == "computeClassLabels": @@ -179,13 +202,15 @@ def _predict_classification(self, X, fptype, resultsToEvaluate): def _predict_regression(self, X, fptype): if X.shape[1] != self.n_features_in_: - raise ValueError('Shape of input is different from what was seen in `fit`') - - if not hasattr(self, 'daal_model_'): - raise ValueError(( - "The class {} instance does not have 'daal_model_' attribute set. " - "Call 'fit' with appropriate arguments before using this method.").format( - type(self).__name__)) + raise ValueError("Shape of input is different from what was seen in `fit`") + + if not hasattr(self, "daal_model_"): + raise ValueError( + ( + "The class {} instance does not have 'daal_model_' attribute set. " + "Call 'fit' with appropriate arguments before using this method." + ).format(type(self).__name__) + ) # Prediction predict_algo = d4p.gbt_regression_prediction(fptype=fptype) diff --git a/daal4py/oneapi/__init__.py b/daal4py/oneapi/__init__.py index d76d4abb57..973060f7f6 100644 --- a/daal4py/oneapi/__init__.py +++ b/daal4py/oneapi/__init__.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2020 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,9 +13,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== import platform + if "Windows" in platform.system(): import os import sys @@ -23,13 +24,14 @@ current_path = os.path.dirname(__file__) - sitepackages_path = sysconfig.get_paths()['purelib'] - installed_package_path = os.path.join(sitepackages_path, 'daal4py', 'oneapi') + sitepackages_path = sysconfig.get_paths()["purelib"] + installed_package_path = os.path.join(sitepackages_path, "daal4py", "oneapi") if sys.version_info.minor >= 8: - if 'DPCPPROOT' in os.environ: - dpcpp_rt_root_bin = os.path.join(os.environ['DPCPPROOT'], "windows", "bin") - dpcpp_rt_root_redist = os.path.join(os.environ['DPCPPROOT'], "windows", - "redist", "intel64_win", "compiler") + if "DPCPPROOT" in os.environ: + dpcpp_rt_root_bin = os.path.join(os.environ["DPCPPROOT"], "windows", "bin") + dpcpp_rt_root_redist = os.path.join( + os.environ["DPCPPROOT"], "windows", "redist", "intel64_win", "compiler" + ) if os.path.exists(dpcpp_rt_root_bin): os.add_dll_directory(dpcpp_rt_root_bin) if os.path.exists(dpcpp_rt_root_redist): @@ -37,28 +39,29 @@ os.add_dll_directory(current_path) if os.path.exists(installed_package_path): os.add_dll_directory(installed_package_path) - os.environ['PATH'] = current_path + os.pathsep + os.environ['PATH'] - os.environ['PATH'] = installed_package_path + os.pathsep + os.environ['PATH'] + os.environ["PATH"] = current_path + os.pathsep + os.environ["PATH"] + os.environ["PATH"] = installed_package_path + os.pathsep + os.environ["PATH"] try: from daal4py._oneapi import * from daal4py._oneapi import ( - _get_sycl_ctxt, _get_device_name_sycl_ctxt, + _get_in_sycl_ctxt, + _get_sycl_ctxt, _get_sycl_ctxt_params, - _get_in_sycl_ctxt ) except ModuleNotFoundError: raise except ImportError: import daal4py - version = daal4py._get__version__()[1:-1].split(', ') + + version = daal4py._get__version__()[1:-1].split(", ") major_version, minor_version = version[0], version[1] raise ImportError( - f'dpcpp_cpp_rt >= {major_version}.{minor_version} ' - 'has to be installed or upgraded to use this module.\n' - 'You can download or upgrade it using the following commands:\n' - f'`pip install --upgrade dpcpp_cpp_rt>={major_version}.{minor_version}.*` ' - 'or ' - f'`conda install -c intel dpcpp_cpp_rt>={major_version}.{minor_version}.*`' + f"dpcpp_cpp_rt >= {major_version}.{minor_version} " + "has to be installed or upgraded to use this module.\n" + "You can download or upgrade it using the following commands:\n" + f"`pip install --upgrade dpcpp_cpp_rt>={major_version}.{minor_version}.*` " + "or " + f"`conda install -c intel dpcpp_cpp_rt>={major_version}.{minor_version}.*`" ) diff --git a/daal4py/sklearn/__init__.py b/daal4py/sklearn/__init__.py index b10e26370a..92cad6beef 100755 --- a/daal4py/sklearn/__init__.py +++ b/daal4py/sklearn/__init__.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2014 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,17 +12,29 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== -from .monkeypatch.dispatcher import enable as patch_sklearn +from .monkeypatch.dispatcher import _get_map_of_algorithms as sklearn_patch_map +from .monkeypatch.dispatcher import _patch_names as sklearn_patch_names from .monkeypatch.dispatcher import disable as unpatch_sklearn +from .monkeypatch.dispatcher import enable as patch_sklearn from .monkeypatch.dispatcher import patch_is_enabled as sklearn_is_patched -from .monkeypatch.dispatcher import _patch_names as sklearn_patch_names -from .monkeypatch.dispatcher import _get_map_of_algorithms as sklearn_patch_map __all__ = [ - 'cluster', 'decomposition', 'ensemble', 'linear_model', - 'manifold', 'metrics', 'model_selection', 'neighbors', - 'patch_sklearn', 'sklearn_is_patched', 'sklearn_patch_map', - 'sklearn_patch_names', 'svm', 'tree', 'unpatch_sklearn', 'utils' + "cluster", + "decomposition", + "ensemble", + "linear_model", + "manifold", + "metrics", + "model_selection", + "neighbors", + "patch_sklearn", + "sklearn_is_patched", + "sklearn_patch_map", + "sklearn_patch_names", + "svm", + "tree", + "unpatch_sklearn", + "utils", ] diff --git a/daal4py/sklearn/_device_offload.py b/daal4py/sklearn/_device_offload.py index df6b13458b..1fb3bd93f4 100644 --- a/daal4py/sklearn/_device_offload.py +++ b/daal4py/sklearn/_device_offload.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2014 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,21 +12,27 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from functools import wraps try: from sklearnex._config import get_config - from sklearnex._device_offload import (_get_global_queue, - _transfer_to_host, - _copy_to_usm) + from sklearnex._device_offload import ( + _copy_to_usm, + _get_global_queue, + _transfer_to_host, + ) + _sklearnex_available = True except ImportError: import logging - logging.warning('Device support is limited in daal4py patching. ' - 'Use Intel(R) Extension for Scikit-learn* ' - 'for full experience.') + + logging.warning( + "Device support is limited in daal4py patching. " + "Use Intel(R) Extension for Scikit-learn* " + "for full experience." 
+ ) _sklearnex_available = False @@ -42,9 +48,7 @@ def _extract_usm_iface(*args, **kwargs): allargs = (*args, *kwargs.values()) if len(allargs) == 0: return None - return getattr(allargs[0], - '__sycl_usm_array_interface__', - None) + return getattr(allargs[0], "__sycl_usm_array_interface__", None) def _run_on_device(func, queue, obj=None, *args, **kwargs): @@ -54,13 +58,15 @@ def dispatch_by_obj(obj, func, *args, **kwargs): return func(*args, **kwargs) if queue is not None: - from daal4py.oneapi import sycl_context, _get_in_sycl_ctxt + from daal4py.oneapi import _get_in_sycl_ctxt, sycl_context if _get_in_sycl_ctxt() is False: - host_offload = get_config()['allow_fallback_to_host'] + host_offload = get_config()["allow_fallback_to_host"] - with sycl_context('gpu' if queue.sycl_device.is_gpu else 'cpu', - host_offload_on_fail=host_offload): + with sycl_context( + "gpu" if queue.sycl_device.is_gpu else "cpu", + host_offload_on_fail=host_offload, + ): return dispatch_by_obj(obj, func, *args, **kwargs) return dispatch_by_obj(obj, func, *args, **kwargs) @@ -72,19 +78,23 @@ def wrapper_impl(obj, *args, **kwargs): usm_iface = _extract_usm_iface(*args, **kwargs) q, hostargs, hostkwargs = _get_host_inputs(*args, **kwargs) result = _run_on_device(func, q, obj, *hostargs, **hostkwargs) - if usm_iface is not None and hasattr(result, '__array_interface__'): + if usm_iface is not None and hasattr(result, "__array_interface__"): return _copy_to_usm(q, result) return result return _run_on_device(func, None, obj, *args, **kwargs) if freefunc: + @wraps(func) def wrapper_free(*args, **kwargs): return wrapper_impl(None, *args, **kwargs) + return wrapper_free @wraps(func) def wrapper_with_self(self, *args, **kwargs): return wrapper_impl(self, *args, **kwargs) + return wrapper_with_self + return decorator diff --git a/daal4py/sklearn/_utils.py b/daal4py/sklearn/_utils.py index d257185e90..01730e21d9 100644 --- a/daal4py/sklearn/_utils.py +++ b/daal4py/sklearn/_utils.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2014 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,37 +12,41 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== -import numpy as np -import sys import os +import sys import warnings +import numpy as np from numpy.lib.recfunctions import require_fields +from sklearn import __version__ as sklearn_version from daal4py import _get__daal_link_version__ as dv -from sklearn import __version__ as sklearn_version + try: from packaging.version import Version except ImportError: from distutils.version import LooseVersion as Version + import logging try: from pandas import DataFrame from pandas.core.dtypes.cast import find_common_type + pandas_is_imported = True except (ImportError, ModuleNotFoundError): pandas_is_imported = False try: from daal4py.oneapi import is_in_sycl_ctxt as is_in_ctx + ctx_imported = True except (ImportError, ModuleNotFoundError): ctx_imported = False -oneapi_is_available = 'daal4py.oneapi' in sys.modules +oneapi_is_available = "daal4py.oneapi" in sys.modules if oneapi_is_available: from daal4py.oneapi import _get_device_name_sycl_ctxt @@ -53,11 +57,15 @@ def set_idp_sklearn_verbose(): if logLevel is not None: logging.basicConfig( stream=sys.stdout, - format='%(levelname)s: %(message)s', level=logLevel.upper()) + format="%(levelname)s: %(message)s", + level=logLevel.upper(), + ) except Exception: - warnings.warn('Unknown level "{}" for logging.\n' - 'Please, use one of "CRITICAL", "ERROR", ' - '"WARNING", "INFO", "DEBUG".'.format(logLevel)) + warnings.warn( + 'Unknown level "{}" for logging.\n' + 'Please, use one of "CRITICAL", "ERROR", ' + '"WARNING", "INFO", "DEBUG".'.format(logLevel) + ) def daal_check_version(rule): @@ -83,7 +91,7 @@ def daal_check_version(rule): def sklearn_check_version(ver): if ver in sklearn_versions_map.keys(): return sklearn_versions_map[ver] - if hasattr(Version(ver), 'base_version'): + if hasattr(Version(ver), "base_version"): base_sklearn_version = Version(sklearn_version).base_version res = bool(Version(base_sklearn_version) >= Version(ver)) else: @@ -111,7 +119,7 @@ def getFPType(X): dt = find_common_type(X.dtypes.tolist()) return parse_dtype(dt) - dt = getattr(X, 'dtype', None) + dt = getattr(X, "dtype", None) return parse_dtype(dt) @@ -128,15 +136,16 @@ def get_patch_message(s): message = "running accelerated version on " if oneapi_is_available: dev = _get_device_name_sycl_ctxt() - if dev == 'cpu' or dev is None: - message += 'CPU' - elif dev == 'gpu': - message += 'GPU' + if dev == "cpu" or dev is None: + message += "CPU" + elif dev == "gpu": + message += "GPU" else: - raise ValueError(f"Unexpected device name {dev}." - " Supported types are cpu and gpu") + raise ValueError( + f"Unexpected device name {dev}." 
" Supported types are cpu and gpu" + ) else: - message += 'CPU' + message += "CPU" elif s == "sklearn": message = "fallback to original Scikit-learn" @@ -145,7 +154,8 @@ def get_patch_message(s): else: raise ValueError( f"Invalid input - expected one of 'daal','sklearn'," - f" 'sklearn_after_daal', got {s}") + f" 'sklearn_after_daal', got {s}" + ) return message @@ -182,14 +192,18 @@ def check_tree_nodes(tree_nodes): def convert_to_old_tree_nodes(tree_nodes): # conversion from sklearn>=1.3 tree nodes format to previous format: # removal of 'missing_go_to_left' field from node dtype - new_field = 'missing_go_to_left' + new_field = "missing_go_to_left" new_dtype = tree_nodes.dtype - old_dtype = np.dtype([ - (key, value[0]) for key, value in - new_dtype.fields.items() if key != new_field]) + old_dtype = np.dtype( + [ + (key, value[0]) + for key, value in new_dtype.fields.items() + if key != new_field + ] + ) return require_fields(tree_nodes, old_dtype) - if sklearn_check_version('1.3'): + if sklearn_check_version("1.3"): return tree_nodes else: return convert_to_old_tree_nodes(tree_nodes) @@ -200,7 +214,7 @@ def __init__(self, scope_name): self.scope_name = scope_name self.patching_is_enabled = True self.messages = [] - self.logger = logging.getLogger('sklearnex') + self.logger = logging.getLogger("sklearnex") def _iter_conditions(self, conditions_and_messages): result = [] @@ -212,7 +226,8 @@ def _iter_conditions(self, conditions_and_messages): def and_conditions(self, conditions_and_messages, conditions_merging=all): self.patching_is_enabled &= conditions_merging( - self._iter_conditions(conditions_and_messages)) + self._iter_conditions(conditions_and_messages) + ) return self.patching_is_enabled def and_condition(self, condition, message): @@ -220,7 +235,8 @@ def and_condition(self, condition, message): def or_conditions(self, conditions_and_messages, conditions_merging=all): self.patching_is_enabled |= conditions_merging( - self._iter_conditions(conditions_and_messages)) + self._iter_conditions(conditions_and_messages) + ) return self.patching_is_enabled def write_log(self): @@ -228,11 +244,13 @@ def write_log(self): self.logger.info(f"{self.scope_name}: {get_patch_message('daal')}") else: self.logger.debug( - f'{self.scope_name}: debugging for the patch is enabled to track' - ' the usage of IntelĀ® oneAPI Data Analytics Library (oneDAL)') + f"{self.scope_name}: debugging for the patch is enabled to track" + " the usage of IntelĀ® oneAPI Data Analytics Library (oneDAL)" + ) for message in self.messages: self.logger.debug( - f'{self.scope_name}: patching failed with cause - {message}') + f"{self.scope_name}: patching failed with cause - {message}" + ) self.logger.info(f"{self.scope_name}: {get_patch_message('sklearn')}") def get_status(self, logs=False): diff --git a/daal4py/sklearn/cluster/__init__.py b/daal4py/sklearn/cluster/__init__.py index ed774f234f..dd7ceeb93b 100644 --- a/daal4py/sklearn/cluster/__init__.py +++ b/daal4py/sklearn/cluster/__init__.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2014 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,8 +13,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== -from .k_means import KMeans from .dbscan import DBSCAN -__all__ = ['KMeans', 'DBSCAN'] +from .k_means import KMeans + +__all__ = ["KMeans", "DBSCAN"] diff --git a/daal4py/sklearn/cluster/_dbscan.py b/daal4py/sklearn/cluster/_dbscan.py index 8a10284977..0cee113de5 100644 --- a/daal4py/sklearn/cluster/_dbscan.py +++ b/daal4py/sklearn/cluster/_dbscan.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2014 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,25 +12,23 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== -import numpy as np -from scipy import sparse as sp import numbers +import numpy as np +from scipy import sparse as sp +from sklearn.cluster import DBSCAN as DBSCAN_original from sklearn.utils import check_array from sklearn.utils.validation import _check_sample_weight -from sklearn.cluster import DBSCAN as DBSCAN_original - import daal4py -from daal4py.sklearn._utils import ( - make2d, getFPType, PatchingConditionsChain) +from daal4py.sklearn._utils import PatchingConditionsChain, getFPType, make2d from .._device_offload import support_usm_ndarray from .._utils import sklearn_check_version -if sklearn_check_version('1.1') and not sklearn_check_version('1.2'): +if sklearn_check_version("1.1") and not sklearn_check_version("1.2"): from sklearn.utils import check_scalar @@ -40,12 +38,12 @@ def _daal_dbscan(X, eps=0.5, min_samples=5, sample_weight=None): fpt = getFPType(XX) alg = daal4py.dbscan( - method='defaultDense', + method="defaultDense", fptype=fpt, epsilon=float(eps), minObservations=int(min_samples), memorySavingMode=False, - resultsToCompute="computeCoreIndices" + resultsToCompute="computeCoreIndices", ) daal_res = alg.compute(XX, ww) @@ -189,16 +187,17 @@ class DBSCAN(DBSCAN_original): >>> clustering DBSCAN(eps=3, min_samples=2) """ - if sklearn_check_version('1.2'): + + if sklearn_check_version("1.2"): _parameter_constraints: dict = {**DBSCAN_original._parameter_constraints} def __init__( self, eps=0.5, min_samples=5, - metric='euclidean', + metric="euclidean", metric_params=None, - algorithm='auto', + algorithm="auto", leaf_size=30, p=None, n_jobs=None, @@ -282,26 +281,29 @@ def fit(self, X, y=None, sample_weight=None): if sample_weight is not None: sample_weight = _check_sample_weight(sample_weight, X) - _patching_status = PatchingConditionsChain( - "sklearn.cluster.DBSCAN.fit") - _dal_ready = _patching_status.and_conditions([ - (self.algorithm in ['auto', 'brute'], - f"'{self.algorithm}' algorithm is not supported. " - "Only 'auto' and 'brute' algorithms are supported"), - (self.metric == 'euclidean' or (self.metric == 'minkowski' and self.p == 2), - f"'{self.metric}' (p={self.p}) metric is not supported. " - "Only 'euclidean' or 'minkowski' with p=2 metrics are supported."), - (not sp.issparse(X), "X is sparse. 
Sparse input is not supported.") - ]) + _patching_status = PatchingConditionsChain("sklearn.cluster.DBSCAN.fit") + _dal_ready = _patching_status.and_conditions( + [ + ( + self.algorithm in ["auto", "brute"], + f"'{self.algorithm}' algorithm is not supported. " + "Only 'auto' and 'brute' algorithms are supported", + ), + ( + self.metric == "euclidean" + or (self.metric == "minkowski" and self.p == 2), + f"'{self.metric}' (p={self.p}) metric is not supported. " + "Only 'euclidean' or 'minkowski' with p=2 metrics are supported.", + ), + (not sp.issparse(X), "X is sparse. Sparse input is not supported."), + ] + ) _patching_status.write_log() if _dal_ready: - X = check_array(X, accept_sparse='csr', dtype=[np.float64, np.float32]) + X = check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) core_ind, assignments = _daal_dbscan( - X, - self.eps, - self.min_samples, - sample_weight=sample_weight + X, self.eps, self.min_samples, sample_weight=sample_weight ) self.core_sample_indices_ = core_ind self.labels_ = assignments diff --git a/daal4py/sklearn/cluster/_k_means_0_22.py b/daal4py/sklearn/cluster/_k_means_0_22.py index 4856cbb4dd..f70cce8ef9 100644 --- a/daal4py/sklearn/cluster/_k_means_0_22.py +++ b/daal4py/sklearn/cluster/_k_means_0_22.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2014 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,31 +12,28 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== import numpy as np from scipy import sparse as sp - -from sklearn.utils import (check_random_state, check_array) +from sklearn.utils import check_array, check_random_state from sklearn.utils.sparsefuncs import mean_variance_axis -from sklearn.utils.validation import (check_is_fitted, _num_samples) +from sklearn.utils.validation import _num_samples, check_is_fitted try: - from sklearn.cluster._k_means import ( - k_means, _labels_inertia, _validate_center_shape) + from sklearn.cluster._k_means import _labels_inertia, _validate_center_shape, k_means except ModuleNotFoundError: - from sklearn.cluster._kmeans import ( - k_means, _labels_inertia, _validate_center_shape) + from sklearn.cluster._kmeans import k_means, _labels_inertia, _validate_center_shape -from sklearn.utils.extmath import row_norms import warnings from sklearn.cluster import KMeans as KMeans_original +from sklearn.utils.extmath import row_norms import daal4py -from .._utils import ( - getFPType, daal_check_version, PatchingConditionsChain) + from .._device_offload import support_usm_ndarray +from .._utils import PatchingConditionsChain, daal_check_version, getFPType def _tolerance(X, rtol): @@ -52,34 +49,38 @@ def _tolerance(X, rtol): def _daal4py_compute_starting_centroids( - X, X_fptype, nClusters, cluster_centers_0, random_state): - + X, X_fptype, nClusters, cluster_centers_0, random_state +): def is_string(s, target_str): return isinstance(s, str) and s == target_str deterministic = False - if is_string(cluster_centers_0, 'k-means++'): - _seed = random_state.randint(np.iinfo('i').max) + if is_string(cluster_centers_0, "k-means++"): + _seed = 
random_state.randint(np.iinfo("i").max) daal_engine = daal4py.engines_mt19937( - fptype=X_fptype, method='defaultDense', seed=_seed) + fptype=X_fptype, method="defaultDense", seed=_seed + ) _n_local_trials = 2 + int(np.log(nClusters)) - kmeans_init = daal4py.kmeans_init(nClusters, fptype=X_fptype, - nTrials=_n_local_trials, - method='plusPlusDense', engine=daal_engine) + kmeans_init = daal4py.kmeans_init( + nClusters, + fptype=X_fptype, + nTrials=_n_local_trials, + method="plusPlusDense", + engine=daal_engine, + ) kmeans_init_res = kmeans_init.compute(X) centroids_ = kmeans_init_res.centroids - elif is_string(cluster_centers_0, 'random'): - _seed = random_state.randint(np.iinfo('i').max) + elif is_string(cluster_centers_0, "random"): + _seed = random_state.randint(np.iinfo("i").max) daal_engine = daal4py.engines_mt19937( - seed=_seed, fptype=X_fptype, method='defaultDense') + seed=_seed, fptype=X_fptype, method="defaultDense" + ) kmeans_init = daal4py.kmeans_init( - nClusters, - fptype=X_fptype, - method='randomDense', - engine=daal_engine) + nClusters, fptype=X_fptype, method="randomDense", engine=daal_engine + ) kmeans_init_res = kmeans_init.compute(X) centroids_ = kmeans_init_res.centroids - elif hasattr(cluster_centers_0, '__array__'): + elif hasattr(cluster_centers_0, "__array__"): deterministic = True cc_arr = np.ascontiguousarray(cluster_centers_0, dtype=X.dtype) _validate_center_shape(X, nClusters, cc_arr) @@ -89,60 +90,75 @@ def is_string(s, target_str): cc_arr = np.ascontiguousarray(cc_arr, dtype=X.dtype) _validate_center_shape(X, nClusters, cc_arr) centroids_ = cc_arr - elif is_string(cluster_centers_0, 'deterministic'): + elif is_string(cluster_centers_0, "deterministic"): deterministic = True kmeans_init = daal4py.kmeans_init( - nClusters, fptype=X_fptype, method='defaultDense') + nClusters, fptype=X_fptype, method="defaultDense" + ) kmeans_init_res = kmeans_init.compute(X) centroids_ = kmeans_init_res.centroids else: raise ValueError( "Cluster centers should either be 'k-means++'," - " 'random', 'deterministic' or an array") + " 'random', 'deterministic' or an array" + ) return deterministic, centroids_ -def _daal4py_kmeans_compatibility(nClusters, maxIterations, fptype="double", - method="lloydDense", accuracyThreshold=0.0, - resultsToEvaluate="computeCentroids", gamma=1.0): +def _daal4py_kmeans_compatibility( + nClusters, + maxIterations, + fptype="double", + method="lloydDense", + accuracyThreshold=0.0, + resultsToEvaluate="computeCentroids", + gamma=1.0, +): kmeans_algo = None - if daal_check_version(((2020, 'P', 2), (2021, 'B', 107))): - kmeans_algo = daal4py.kmeans(nClusters=nClusters, - maxIterations=maxIterations, - fptype=fptype, - resultsToEvaluate=resultsToEvaluate, - accuracyThreshold=accuracyThreshold, - method=method, - gamma=gamma) + if daal_check_version(((2020, "P", 2), (2021, "B", 107))): + kmeans_algo = daal4py.kmeans( + nClusters=nClusters, + maxIterations=maxIterations, + fptype=fptype, + resultsToEvaluate=resultsToEvaluate, + accuracyThreshold=accuracyThreshold, + method=method, + gamma=gamma, + ) else: - assigFlag = 'computeAssignments' in resultsToEvaluate - kmeans_algo = daal4py.kmeans(nClusters=nClusters, - maxIterations=maxIterations, - fptype=fptype, - assignFlag=assigFlag, - accuracyThreshold=accuracyThreshold, - method=method, - gamma=gamma) + assigFlag = "computeAssignments" in resultsToEvaluate + kmeans_algo = daal4py.kmeans( + nClusters=nClusters, + maxIterations=maxIterations, + fptype=fptype, + assignFlag=assigFlag, + 
accuracyThreshold=accuracyThreshold, + method=method, + gamma=gamma, + ) return kmeans_algo -def _daal4py_k_means_predict(X, nClusters, centroids, - resultsToEvaluate='computeAssignments'): +def _daal4py_k_means_predict( + X, nClusters, centroids, resultsToEvaluate="computeAssignments" +): X_fptype = getFPType(X) kmeans_algo = _daal4py_kmeans_compatibility( nClusters=nClusters, maxIterations=0, fptype=X_fptype, resultsToEvaluate=resultsToEvaluate, - method='defaultDense') + method="defaultDense", + ) res = kmeans_algo.compute(X, centroids) return res.assignments[:, 0], res.objectiveFunction[0, 0] -def _daal4py_k_means_fit(X, nClusters, numIterations, - tol, cluster_centers_0, n_init, random_state): +def _daal4py_k_means_fit( + X, nClusters, numIterations, tol, cluster_centers_0, n_init, random_state +): if numIterations < 0: raise ValueError("Wrong iterations number") @@ -157,12 +173,14 @@ def _daal4py_k_means_fit(X, nClusters, numIterations, maxIterations=numIterations, accuracyThreshold=abs_tol, fptype=X_fptype, - resultsToEvaluate='computeCentroids', - method='defaultDense') + resultsToEvaluate="computeCentroids", + method="defaultDense", + ) for k in range(n_init): deterministic, starting_centroids_ = _daal4py_compute_starting_centroids( - X, X_fptype, nClusters, cluster_centers_0, random_state) + X, X_fptype, nClusters, cluster_centers_0, random_state + ) res = kmeans_algo.compute(X, starting_centroids_) @@ -175,14 +193,17 @@ def _daal4py_k_means_fit(X, nClusters, numIterations, best_n_iter = int(res.nIterations[0, 0]) if deterministic and n_init != 1: warnings.warn( - 'Explicit initial center position passed: ' - 'performing only one init in k-means instead of n_init=%d' - % n_init, RuntimeWarning, stacklevel=2) + "Explicit initial center position passed: " + "performing only one init in k-means instead of n_init=%d" % n_init, + RuntimeWarning, + stacklevel=2, + ) break - flag_compute = 'computeAssignments|computeExactObjectiveFunction' + flag_compute = "computeAssignments|computeExactObjectiveFunction" best_labels, best_inertia = _daal4py_k_means_predict( - X, nClusters, best_cluster_centers, flag_compute) + X, nClusters, best_cluster_centers, flag_compute + ) return best_cluster_centers, best_labels, best_inertia, best_n_iter @@ -205,73 +226,103 @@ def _fit(self, X, y=None, sample_weight=None): """ if self.n_init <= 0: - raise ValueError("Invalid number of initializations." - " n_init=%d must be bigger than zero." % self.n_init) + raise ValueError( + "Invalid number of initializations." + " n_init=%d must be bigger than zero." 
% self.n_init + ) random_state = check_random_state(self.random_state) if self.max_iter <= 0: - raise ValueError('Number of iterations should be a positive number,' - ' got %d instead' % self.max_iter) + raise ValueError( + "Number of iterations should be a positive number," + " got %d instead" % self.max_iter + ) - if self.precompute_distances == 'auto': + if self.precompute_distances == "auto": precompute_distances = False elif isinstance(self.precompute_distances, bool): precompute_distances = self.precompute_distances else: - raise ValueError("precompute_distances should be 'auto' or True/False" - ", but a value of %r was passed" % - self.precompute_distances) + raise ValueError( + "precompute_distances should be 'auto' or True/False" + ", but a value of %r was passed" % self.precompute_distances + ) - _patching_status = PatchingConditionsChain( - "sklearn.cluster.KMeans.fit") - _dal_ready = _patching_status.and_conditions([ - (not sp.issparse(X), "X is sparse. Sparse input is not supported."), - (not precompute_distances, "The precomputing of distances is not supported.") - ]) + _patching_status = PatchingConditionsChain("sklearn.cluster.KMeans.fit") + _dal_ready = _patching_status.and_conditions( + [ + (not sp.issparse(X), "X is sparse. Sparse input is not supported."), + (not precompute_distances, "The precomputing of distances is not supported."), + ] + ) if _dal_ready: X_len = _num_samples(X) - _dal_ready = _patching_status.and_conditions([ - (self.n_clusters <= X_len, - "The number of clusters is larger than the number of samples in X.") - ]) + _dal_ready = _patching_status.and_conditions( + [ + ( + self.n_clusters <= X_len, + "The number of clusters is larger than the number of samples in X.", + ) + ] + ) if _dal_ready and sample_weight is not None: sample_weight = np.asarray(sample_weight) - _dal_ready = _patching_status.and_conditions([ - (sample_weight.shape == (X_len,), - "Sample weights do not have the same length as X."), - (np.allclose(sample_weight, np.ones_like(sample_weight)), - "Sample weights are not ones.") - ]) + _dal_ready = _patching_status.and_conditions( + [ + ( + sample_weight.shape == (X_len,), + "Sample weights do not have the same length as X.", + ), + ( + np.allclose(sample_weight, np.ones_like(sample_weight)), + "Sample weights are not ones.", + ), + ] + ) _patching_status.write_log() if not _dal_ready: - self.cluster_centers_, self.labels_, self.inertia_, self.n_iter_ = \ - k_means( - X, n_clusters=self.n_clusters, sample_weight=sample_weight, - init=self.init, n_init=self.n_init, max_iter=self.max_iter, - verbose=self.verbose, precompute_distances=precompute_distances, - tol=self.tol, random_state=random_state, copy_x=self.copy_x, - n_jobs=self.n_jobs, algorithm=self.algorithm, - return_n_iter=True) + self.cluster_centers_, self.labels_, self.inertia_, self.n_iter_ = k_means( + X, + n_clusters=self.n_clusters, + sample_weight=sample_weight, + init=self.init, + n_init=self.n_init, + max_iter=self.max_iter, + verbose=self.verbose, + precompute_distances=precompute_distances, + tol=self.tol, + random_state=random_state, + copy_x=self.copy_x, + n_jobs=self.n_jobs, + algorithm=self.algorithm, + return_n_iter=True, + ) else: X = check_array( X, - accept_sparse='csr', dtype=[np.float64, np.float32], + accept_sparse="csr", + dtype=[np.float64, np.float32], order="C" if self.copy_x else None, - copy=self.copy_x + copy=self.copy_x, ) self.n_features_in_ = X.shape[1] - self.cluster_centers_, self.labels_, self.inertia_, self.n_iter_ = \ - _daal4py_k_means_fit( - 
X, self.n_clusters, - self.max_iter, - self.tol, - self.init, - self.n_init, - random_state - ) + ( + self.cluster_centers_, + self.labels_, + self.inertia_, + self.n_iter_, + ) = _daal4py_k_means_fit( + X, + self.n_clusters, + self.max_iter, + self.tol, + self.init, + self.n_init, + random_state, + ) return self @@ -300,35 +351,51 @@ def _predict(self, X, sample_weight=None): X = self._check_test_data(X) - _patching_status = PatchingConditionsChain( - "sklearn.cluster.KMeans.predict") - _dal_ready = _patching_status.and_conditions([ - (sample_weight is None, "Sample weights are not supported."), - (hasattr(X, '__array__'), "X does not have '__array__' attribute.") - ]) + _patching_status = PatchingConditionsChain("sklearn.cluster.KMeans.predict") + _dal_ready = _patching_status.and_conditions( + [ + (sample_weight is None, "Sample weights are not supported."), + (hasattr(X, "__array__"), "X does not have '__array__' attribute."), + ] + ) _patching_status.write_log() if _dal_ready: - return _daal4py_k_means_predict( - X, self.n_clusters, self.cluster_centers_)[0] + return _daal4py_k_means_predict(X, self.n_clusters, self.cluster_centers_)[0] x_squared_norms = row_norms(X, squared=True) - return _labels_inertia(X, sample_weight, x_squared_norms, - self.cluster_centers_)[0] + return _labels_inertia(X, sample_weight, x_squared_norms, self.cluster_centers_)[0] class KMeans(KMeans_original): __doc__ = KMeans_original.__doc__ - def __init__(self, n_clusters=8, init='k-means++', n_init=10, - max_iter=300, tol=1e-4, precompute_distances='auto', - verbose=0, random_state=None, copy_x=True, - n_jobs=None, algorithm='auto'): - + def __init__( + self, + n_clusters=8, + init="k-means++", + n_init=10, + max_iter=300, + tol=1e-4, + precompute_distances="auto", + verbose=0, + random_state=None, + copy_x=True, + n_jobs=None, + algorithm="auto", + ): super(KMeans, self).__init__( - n_clusters=n_clusters, init=init, max_iter=max_iter, - tol=tol, precompute_distances=precompute_distances, - n_init=n_init, verbose=verbose, random_state=random_state, - copy_x=copy_x, n_jobs=n_jobs, algorithm=algorithm) + n_clusters=n_clusters, + init=init, + max_iter=max_iter, + tol=tol, + precompute_distances=precompute_distances, + n_init=n_init, + verbose=verbose, + random_state=random_state, + copy_x=copy_x, + n_jobs=n_jobs, + algorithm=algorithm, + ) @support_usm_ndarray() def fit(self, X, y=None, sample_weight=None): diff --git a/daal4py/sklearn/cluster/_k_means_0_23.py b/daal4py/sklearn/cluster/_k_means_0_23.py index 3b2497fcae..48b7fb790e 100755 --- a/daal4py/sklearn/cluster/_k_means_0_23.py +++ b/daal4py/sklearn/cluster/_k_means_0_23.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2014 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,38 +12,33 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== -import numpy as np import numbers -from scipy import sparse as sp - -from sklearn.utils import check_random_state, check_array -from sklearn.utils.sparsefuncs import mean_variance_axis -from sklearn.utils.validation import ( - check_is_fitted, - _num_samples, - _deprecate_positional_args) +import warnings +import numpy as np +from scipy import sparse as sp +from sklearn.cluster import KMeans as KMeans_original from sklearn.cluster._kmeans import _labels_inertia -from sklearn.utils._openmp_helpers import _openmp_effective_n_threads - from sklearn.exceptions import ConvergenceWarning +from sklearn.utils import check_array, check_random_state +from sklearn.utils._openmp_helpers import _openmp_effective_n_threads from sklearn.utils.extmath import row_norms -import warnings - -from sklearn.cluster import KMeans as KMeans_original +from sklearn.utils.sparsefuncs import mean_variance_axis +from sklearn.utils.validation import ( + _deprecate_positional_args, + _num_samples, + check_is_fitted, +) import daal4py -from .._utils import ( - getFPType, - sklearn_check_version, - PatchingConditionsChain) + from .._device_offload import support_usm_ndarray +from .._utils import PatchingConditionsChain, getFPType, sklearn_check_version -if sklearn_check_version('1.1'): - from sklearn.utils.validation import ( - _check_sample_weight, _is_arraylike_not_scalar) +if sklearn_check_version("1.1"): + from sklearn.utils.validation import _check_sample_weight, _is_arraylike_not_scalar def _validate_center_shape(X, n_centers, centers): @@ -51,11 +46,13 @@ def _validate_center_shape(X, n_centers, centers): if centers.shape[0] != n_centers: raise ValueError( f"The shape of the initial centers {centers.shape} does not " - f"match the number of clusters {n_centers}.") + f"match the number of clusters {n_centers}." + ) if centers.shape[1] != X.shape[1]: raise ValueError( f"The shape of the initial centers {centers.shape} does not " - f"match the number of features of the data {X.shape[1]}.") + f"match the number of features of the data {X.shape[1]}." 
+ ) def _tolerance(X, rtol): @@ -71,23 +68,20 @@ def _tolerance(X, rtol): def _daal4py_compute_starting_centroids( - X, - X_fptype, - nClusters, - cluster_centers_0, - verbose, - random_state + X, X_fptype, nClusters, cluster_centers_0, verbose, random_state ): def is_string(s, target_str): return isinstance(s, str) and s == target_str + is_sparse = sp.isspmatrix(X) deterministic = False - if is_string(cluster_centers_0, 'k-means++'): - _seed = random_state.randint(np.iinfo('i').max) + if is_string(cluster_centers_0, "k-means++"): + _seed = random_state.randint(np.iinfo("i").max) plus_plus_method = "plusPlusCSR" if is_sparse else "plusPlusDense" daal_engine = daal4py.engines_mt19937( - fptype=X_fptype, method="defaultDense", seed=_seed) + fptype=X_fptype, method="defaultDense", seed=_seed + ) _n_local_trials = 2 + int(np.log(nClusters)) kmeans_init = daal4py.kmeans_init( nClusters, @@ -98,11 +92,12 @@ def is_string(s, target_str): ) kmeans_init_res = kmeans_init.compute(X) centroids_ = kmeans_init_res.centroids - elif is_string(cluster_centers_0, 'random'): - _seed = random_state.randint(np.iinfo('i').max) + elif is_string(cluster_centers_0, "random"): + _seed = random_state.randint(np.iinfo("i").max) random_method = "randomCSR" if is_sparse else "randomDense" daal_engine = daal4py.engines_mt19937( - seed=_seed, fptype=X_fptype, method="defaultDense") + seed=_seed, fptype=X_fptype, method="defaultDense" + ) kmeans_init = daal4py.kmeans_init( nClusters, fptype=X_fptype, @@ -111,7 +106,7 @@ def is_string(s, target_str): ) kmeans_init_res = kmeans_init.compute(X) centroids_ = kmeans_init_res.centroids - elif hasattr(cluster_centers_0, '__array__'): + elif hasattr(cluster_centers_0, "__array__"): deterministic = True cc_arr = np.ascontiguousarray(cluster_centers_0, dtype=X.dtype) _validate_center_shape(X, nClusters, cc_arr) @@ -121,25 +116,33 @@ def is_string(s, target_str): cc_arr = np.ascontiguousarray(cc_arr, dtype=X.dtype) _validate_center_shape(X, nClusters, cc_arr) centroids_ = cc_arr - elif is_string(cluster_centers_0, 'deterministic'): + elif is_string(cluster_centers_0, "deterministic"): deterministic = True default_method = "lloydCSR" if is_sparse else "defaultDense" kmeans_init = daal4py.kmeans_init( - nClusters, fptype=X_fptype, method=default_method) + nClusters, fptype=X_fptype, method=default_method + ) kmeans_init_res = kmeans_init.compute(X) centroids_ = kmeans_init_res.centroids else: raise ValueError( f"init should be either 'k-means++', 'random', a ndarray or a " - f"callable, got '{cluster_centers_0}' instead.") + f"callable, got '{cluster_centers_0}' instead." 
+ ) if verbose: print("Initialization complete") return deterministic, centroids_ -def _daal4py_kmeans_compatibility(nClusters, maxIterations, fptype="double", - method="lloydDense", accuracyThreshold=0.0, - resultsToEvaluate="computeCentroids", gamma=1.0): +def _daal4py_kmeans_compatibility( + nClusters, + maxIterations, + fptype="double", + method="lloydDense", + accuracyThreshold=0.0, + resultsToEvaluate="computeCentroids", + gamma=1.0, +): kmeans_algo = daal4py.kmeans( nClusters=nClusters, maxIterations=maxIterations, @@ -147,13 +150,14 @@ def _daal4py_kmeans_compatibility(nClusters, maxIterations, fptype="double", resultsToEvaluate=resultsToEvaluate, accuracyThreshold=accuracyThreshold, method=method, - gamma=gamma + gamma=gamma, ) return kmeans_algo -def _daal4py_k_means_predict(X, nClusters, centroids, - resultsToEvaluate='computeAssignments'): +def _daal4py_k_means_predict( + X, nClusters, centroids, resultsToEvaluate="computeAssignments" +): X_fptype = getFPType(X) is_sparse = sp.isspmatrix(X) method = "lloydCSR" if is_sparse else "defaultDense" @@ -170,8 +174,9 @@ def _daal4py_k_means_predict(X, nClusters, centroids, return res.assignments[:, 0], res.objectiveFunction[0, 0] -def _daal4py_k_means_fit(X, nClusters, numIterations, - tol, cluster_centers_0, n_init, verbose, random_state): +def _daal4py_k_means_fit( + X, nClusters, numIterations, tol, cluster_centers_0, n_init, verbose, random_state +): if numIterations < 0: raise ValueError("Wrong iterations number") @@ -179,15 +184,15 @@ def is_string(s, target_str): return isinstance(s, str) and s == target_str default_n_init = 10 - if n_init in ['auto', 'warn']: - if n_init == "warn" and sklearn_check_version('1.2'): + if n_init in ["auto", "warn"]: + if n_init == "warn" and sklearn_check_version("1.2"): warnings.warn( "The default value of `n_init` will change from " f"{default_n_init} to 'auto' in 1.4. 
Set the value of `n_init`" " explicitly to suppress the warning", FutureWarning, ) - if is_string(cluster_centers_0, 'k-means++'): + if is_string(cluster_centers_0, "k-means++"): n_init = 1 else: n_init = default_n_init @@ -202,13 +207,14 @@ def is_string(s, target_str): maxIterations=numIterations, accuracyThreshold=abs_tol, fptype=X_fptype, - resultsToEvaluate='computeCentroids', + resultsToEvaluate="computeCentroids", method=method, ) for k in range(n_init): deterministic, starting_centroids_ = _daal4py_compute_starting_centroids( - X, X_fptype, nClusters, cluster_centers_0, verbose, random_state) + X, X_fptype, nClusters, cluster_centers_0, verbose, random_state + ) res = kmeans_algo.compute(X, starting_centroids_) @@ -224,14 +230,17 @@ def is_string(s, target_str): best_n_iter = int(res.nIterations[0, 0]) if deterministic and n_init != 1: warnings.warn( - 'Explicit initial center position passed: ' - 'performing only one init in k-means instead of n_init=%d' - % n_init, RuntimeWarning, stacklevel=2) + "Explicit initial center position passed: " + "performing only one init in k-means instead of n_init=%d" % n_init, + RuntimeWarning, + stacklevel=2, + ) break - flag_compute = 'computeAssignments|computeExactObjectiveFunction' + flag_compute = "computeAssignments|computeExactObjectiveFunction" best_labels, best_inertia = _daal4py_k_means_predict( - X, nClusters, best_cluster_centers, flag_compute) + X, nClusters, best_cluster_centers, flag_compute + ) distinct_clusters = np.unique(best_labels).size if distinct_clusters < nClusters: @@ -239,7 +248,9 @@ def is_string(s, target_str): "Number of distinct clusters ({}) found smaller than " "n_clusters ({}). Possibly due to duplicate points " "in X.".format(distinct_clusters, nClusters), - ConvergenceWarning, stacklevel=2) + ConvergenceWarning, + stacklevel=2, + ) # for passing test case "test_kmeans_warns_less_centers_than_unique_points" return best_cluster_centers, best_labels, best_inertia, best_n_iter @@ -264,8 +275,8 @@ def _fit(self, X, y=None, sample_weight=None): """ init = self.init - if sklearn_check_version('1.1'): - if sklearn_check_version('1.2'): + if sklearn_check_version("1.1"): + if sklearn_check_version("1.2"): self._validate_params() X = self._validate_data( @@ -277,7 +288,7 @@ def _fit(self, X, y=None, sample_weight=None): accept_large_sparse=False, ) - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): self._check_params_vs_input(X) else: self._check_params(X) @@ -292,99 +303,141 @@ def _fit(self, X, y=None, sample_weight=None): init = check_array(init, dtype=X.dtype, copy=True, order="C") self._validate_center_shape(X, init) else: - if hasattr(self, 'precompute_distances'): - if self.precompute_distances != 'deprecated': - if sklearn_check_version('0.24'): - warnings.warn("'precompute_distances' was deprecated in version " - "0.23 and will be removed in 1.0 (renaming of 0.25)." - " It has no effect", FutureWarning) - elif sklearn_check_version('0.23'): - warnings.warn("'precompute_distances' was deprecated in version " - "0.23 and will be removed in 0.25. It has no " - "effect", FutureWarning) + if hasattr(self, "precompute_distances"): + if self.precompute_distances != "deprecated": + if sklearn_check_version("0.24"): + warnings.warn( + "'precompute_distances' was deprecated in version " + "0.23 and will be removed in 1.0 (renaming of 0.25)." 
+ " It has no effect", + FutureWarning, + ) + elif sklearn_check_version("0.23"): + warnings.warn( + "'precompute_distances' was deprecated in version " + "0.23 and will be removed in 0.25. It has no " + "effect", + FutureWarning, + ) self._n_threads = None - if hasattr(self, 'n_jobs'): - if self.n_jobs != 'deprecated': - if sklearn_check_version('0.24'): - warnings.warn("'n_jobs' was deprecated in version 0.23 and will be" - " removed in 1.0 (renaming of 0.25).", FutureWarning) - elif sklearn_check_version('0.23'): - warnings.warn("'n_jobs' was deprecated in version 0.23 and will be" - " removed in 0.25.", FutureWarning) + if hasattr(self, "n_jobs"): + if self.n_jobs != "deprecated": + if sklearn_check_version("0.24"): + warnings.warn( + "'n_jobs' was deprecated in version 0.23 and will be" + " removed in 1.0 (renaming of 0.25).", + FutureWarning, + ) + elif sklearn_check_version("0.23"): + warnings.warn( + "'n_jobs' was deprecated in version 0.23 and will be" + " removed in 0.25.", + FutureWarning, + ) self._n_threads = self.n_jobs self._n_threads = _openmp_effective_n_threads(self._n_threads) if self.n_init <= 0: - raise ValueError( - f"n_init should be > 0, got {self.n_init} instead.") + raise ValueError(f"n_init should be > 0, got {self.n_init} instead.") random_state = check_random_state(self.random_state) if sklearn_check_version("1.0"): self._check_feature_names(X, reset=True) if self.max_iter <= 0: - raise ValueError( - f"max_iter should be > 0, got {self.max_iter} instead.") + raise ValueError(f"max_iter should be > 0, got {self.max_iter} instead.") algorithm = self.algorithm - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): if algorithm == "elkan" and self.n_clusters == 1: - warnings.warn("algorithm='elkan' doesn't make sense for a single " - "cluster. Using 'full' instead.", RuntimeWarning) + warnings.warn( + "algorithm='elkan' doesn't make sense for a single " + "cluster. Using 'full' instead.", + RuntimeWarning, + ) algorithm = "lloyd" if algorithm == "auto" or algorithm == "full": - warnings.warn("algorithm= {'auto','full'} is deprecated" - "Using 'lloyd' instead.", RuntimeWarning) + warnings.warn( + "algorithm= {'auto','full'} is deprecated" "Using 'lloyd' instead.", + RuntimeWarning, + ) algorithm = "lloyd" if self.n_clusters == 1 else "elkan" if algorithm not in ["lloyd", "full", "elkan"]: - raise ValueError("Algorithm must be 'auto','lloyd', 'full' or 'elkan'," - "got {}".format(str(algorithm))) + raise ValueError( + "Algorithm must be 'auto','lloyd', 'full' or 'elkan'," + "got {}".format(str(algorithm)) + ) else: if algorithm == "elkan" and self.n_clusters == 1: - warnings.warn("algorithm='elkan' doesn't make sense for a single " - "cluster. Using 'full' instead.", RuntimeWarning) + warnings.warn( + "algorithm='elkan' doesn't make sense for a single " + "cluster. 
Using 'full' instead.", + RuntimeWarning, + ) algorithm = "full" if algorithm == "auto": algorithm = "full" if self.n_clusters == 1 else "elkan" if algorithm not in ["full", "elkan"]: - raise ValueError("Algorithm must be 'auto', 'full' or 'elkan', got" - " {}".format(str(algorithm))) + raise ValueError( + "Algorithm must be 'auto', 'full' or 'elkan', got" + " {}".format(str(algorithm)) + ) X_len = _num_samples(X) - _patching_status = PatchingConditionsChain( - "sklearn.cluster.KMeans.fit") - _dal_ready = _patching_status.and_conditions([ - (self.n_clusters <= X_len, - "The number of clusters is larger than the number of samples in X.") - ]) + _patching_status = PatchingConditionsChain("sklearn.cluster.KMeans.fit") + _dal_ready = _patching_status.and_conditions( + [ + ( + self.n_clusters <= X_len, + "The number of clusters is larger than the number of samples in X.", + ) + ] + ) if _dal_ready and sample_weight is not None: if isinstance(sample_weight, numbers.Number): sample_weight = np.full(X_len, sample_weight, dtype=np.float64) else: sample_weight = np.asarray(sample_weight) - _dal_ready = _patching_status.and_conditions([ - (sample_weight.shape == (X_len,), - "Sample weights do not have the same length as X."), - (np.allclose(sample_weight, np.ones_like(sample_weight)), - "Sample weights are not ones.") - ]) + _dal_ready = _patching_status.and_conditions( + [ + ( + sample_weight.shape == (X_len,), + "Sample weights do not have the same length as X.", + ), + ( + np.allclose(sample_weight, np.ones_like(sample_weight)), + "Sample weights are not ones.", + ), + ] + ) _patching_status.write_log() if _dal_ready: - X = check_array(X, accept_sparse='csr', dtype=[np.float64, np.float32]) + X = check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) self.n_features_in_ = X.shape[1] - self.cluster_centers_, self.labels_, self.inertia_, self.n_iter_ = \ - _daal4py_k_means_fit( - X, self.n_clusters, self.max_iter, self.tol, init, self.n_init, - self.verbose, random_state) - if sklearn_check_version('1.1'): + ( + self.cluster_centers_, + self.labels_, + self.inertia_, + self.n_iter_, + ) = _daal4py_k_means_fit( + X, + self.n_clusters, + self.max_iter, + self.tol, + init, + self.n_init, + self.verbose, + random_state, + ) + if sklearn_check_version("1.1"): self._n_features_out = self.cluster_centers_.shape[0] else: super(KMeans, self).fit(X, y=y, sample_weight=sample_weight) @@ -395,15 +448,15 @@ def _daal4py_check_test_data(self, X): if sklearn_check_version("1.0"): self._check_feature_names(X, reset=False) X = check_array( - X, - accept_sparse='csr', - dtype=[np.float64, np.float32], - accept_large_sparse=False + X, accept_sparse="csr", dtype=[np.float64, np.float32], accept_large_sparse=False ) if self.n_features_in_ != X.shape[1]: raise ValueError( - (f'X has {X.shape[1]} features, ' - f'but Kmeans is expecting {self.n_features_in_} features as input')) + ( + f"X has {X.shape[1]} features, " + f"but Kmeans is expecting {self.n_features_in_} features as input" + ) + ) return X @@ -432,22 +485,22 @@ def _predict(self, X, sample_weight=None): X = _daal4py_check_test_data(self, X) - _patching_status = PatchingConditionsChain( - "sklearn.cluster.KMeans.predict") - _patching_status.and_conditions([ - (sample_weight is None, "Sample weights are not supported."), - (hasattr(X, '__array__'), "X does not have '__array__' attribute.") - ]) - _dal_ready = _patching_status.or_conditions([ - (sp.isspmatrix_csr(X), "X is not sparse.") - ]) + _patching_status = 
PatchingConditionsChain("sklearn.cluster.KMeans.predict") + _patching_status.and_conditions( + [ + (sample_weight is None, "Sample weights are not supported."), + (hasattr(X, "__array__"), "X does not have '__array__' attribute."), + ] + ) + _dal_ready = _patching_status.or_conditions( + [(sp.isspmatrix_csr(X), "X is not sparse.")] + ) _patching_status.write_log() if _dal_ready: - return _daal4py_k_means_predict( - X, self.n_clusters, self.cluster_centers_)[0] - if sklearn_check_version('1.2'): - if sklearn_check_version('1.3') and sample_weight is not None: + return _daal4py_k_means_predict(X, self.n_clusters, self.cluster_centers_)[0] + if sklearn_check_version("1.2"): + if sklearn_check_version("1.3") and sample_weight is not None: warnings.warn( "'sample_weight' was deprecated in version 1.3 and " "will be removed in 1.5.", @@ -456,30 +509,30 @@ def _predict(self, X, sample_weight=None): return _labels_inertia(X, sample_weight, self.cluster_centers_)[0] else: x_squared_norms = row_norms(X, squared=True) - return _labels_inertia(X, sample_weight, x_squared_norms, - self.cluster_centers_)[0] + return _labels_inertia(X, sample_weight, x_squared_norms, self.cluster_centers_)[ + 0 + ] class KMeans(KMeans_original): __doc__ = KMeans_original.__doc__ - if sklearn_check_version('1.2'): - _parameter_constraints: dict = { - **KMeans_original._parameter_constraints} + if sklearn_check_version("1.2"): + _parameter_constraints: dict = {**KMeans_original._parameter_constraints} @_deprecate_positional_args def __init__( self, n_clusters=8, *, - init='k-means++', - n_init='auto' if sklearn_check_version('1.4') else 'warn', + init="k-means++", + n_init="auto" if sklearn_check_version("1.4") else "warn", max_iter=300, tol=1e-4, verbose=0, random_state=None, copy_x=True, - algorithm='lloyd', + algorithm="lloyd", ): super(KMeans, self).__init__( n_clusters=n_clusters, @@ -492,20 +545,22 @@ def __init__( copy_x=copy_x, algorithm=algorithm, ) - elif sklearn_check_version('1.0'): + + elif sklearn_check_version("1.0"): + @_deprecate_positional_args def __init__( self, n_clusters=8, *, - init='k-means++', + init="k-means++", n_init=10, max_iter=300, tol=1e-4, verbose=0, random_state=None, copy_x=True, - algorithm='auto', + algorithm="auto", ): super(KMeans, self).__init__( n_clusters=n_clusters, @@ -518,22 +573,24 @@ def __init__( copy_x=copy_x, algorithm=algorithm, ) + else: + @_deprecate_positional_args def __init__( self, n_clusters=8, *, - init='k-means++', + init="k-means++", n_init=10, max_iter=300, tol=1e-4, - precompute_distances='deprecated', + precompute_distances="deprecated", verbose=0, random_state=None, copy_x=True, - n_jobs='deprecated', - algorithm='auto', + n_jobs="deprecated", + algorithm="auto", ): super(KMeans, self).__init__( n_clusters=n_clusters, diff --git a/daal4py/sklearn/cluster/dbscan.py b/daal4py/sklearn/cluster/dbscan.py index 274be166d3..e868a91d63 100644 --- a/daal4py/sklearn/cluster/dbscan.py +++ b/daal4py/sklearn/cluster/dbscan.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2014 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,6 +12,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== from ._dbscan import * diff --git a/daal4py/sklearn/cluster/k_means.py b/daal4py/sklearn/cluster/k_means.py index caa4a69abe..e57c309d7d 100755 --- a/daal4py/sklearn/cluster/k_means.py +++ b/daal4py/sklearn/cluster/k_means.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2014 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,11 +12,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from daal4py.sklearn._utils import sklearn_check_version -if sklearn_check_version('0.23'): +if sklearn_check_version("0.23"): from ._k_means_0_23 import * else: from ._k_means_0_22 import * diff --git a/daal4py/sklearn/cluster/tests/test_dbscan.py b/daal4py/sklearn/cluster/tests/test_dbscan.py index 0b2f88f254..1640e0e746 100755 --- a/daal4py/sklearn/cluster/tests/test_dbscan.py +++ b/daal4py/sklearn/cluster/tests/test_dbscan.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,27 +12,29 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== import numpy as np import pytest from sklearn.cluster import DBSCAN as DBSCAN_SKLEARN + from daal4py.sklearn.cluster import DBSCAN as DBSCAN_DAAL -METRIC = ('euclidean', ) +METRIC = ("euclidean",) USE_WEIGHTS = (True, False) -def generate_data(low: int, high: int, samples_number: int, - sample_dimension: tuple) -> tuple: +def generate_data( + low: int, high: int, samples_number: int, sample_dimension: tuple +) -> tuple: generator = np.random.RandomState() table_size = (samples_number, sample_dimension) - return generator.uniform( - low=low, high=high, size=table_size), generator.uniform(size=samples_number) + return generator.uniform(low=low, high=high, size=table_size), generator.uniform( + size=samples_number + ) -def check_labels_equals(left_labels: np.ndarray, - right_labels: np.ndarray) -> bool: +def check_labels_equals(left_labels: np.ndarray, right_labels: np.ndarray) -> bool: if left_labels.shape != right_labels.shape: raise Exception("Shapes not equals") if len(left_labels.shape) != 1: @@ -42,42 +44,49 @@ def check_labels_equals(left_labels: np.ndarray, dict_checker = {} for index_sample in range(left_labels.shape[0]): if left_labels[index_sample] not in dict_checker: - dict_checker[left_labels[index_sample] - ] = right_labels[index_sample] + dict_checker[left_labels[index_sample]] = right_labels[index_sample] elif dict_checker[left_labels[index_sample]] != right_labels[index_sample]: raise Exception("Wrong clustering") return True -def _test_dbscan_big_data_numpy_gen(eps: float, min_samples: int, metric: str, - use_weights: bool, low=-100.0, high=100.0, - samples_number=1000, sample_dimension=4): +def _test_dbscan_big_data_numpy_gen( + eps: float, + min_samples: int, + metric: str, + use_weights: bool, + low=-100.0, + high=100.0, + samples_number=1000, + sample_dimension=4, +): data, weights = generate_data( - low=low, high=high, samples_number=samples_number, - sample_dimension=sample_dimension) + low=low, + high=high, + samples_number=samples_number, + sample_dimension=sample_dimension, + ) if use_weights is False: weights = None initialized_daal_dbscan = DBSCAN_DAAL( - eps=eps, min_samples=min_samples, metric=metric).fit( - X=data, sample_weight=weights) + eps=eps, min_samples=min_samples, metric=metric + ).fit(X=data, sample_weight=weights) initialized_sklearn_dbscan = DBSCAN_SKLEARN( - metric=metric, eps=eps, min_samples=min_samples).fit( - X=data, sample_weight=weights) + metric=metric, eps=eps, min_samples=min_samples + ).fit(X=data, sample_weight=weights) check_labels_equals( - initialized_daal_dbscan.labels_, - initialized_sklearn_dbscan.labels_) + initialized_daal_dbscan.labels_, initialized_sklearn_dbscan.labels_ + ) -@pytest.mark.parametrize('metric', METRIC) -@pytest.mark.parametrize('use_weights', USE_WEIGHTS) +@pytest.mark.parametrize("metric", METRIC) +@pytest.mark.parametrize("use_weights", USE_WEIGHTS) def test_dbscan_big_data_numpy_gen(metric, use_weights: bool): eps = 35.0 min_samples = 6 _test_dbscan_big_data_numpy_gen( - eps=eps, - min_samples=min_samples, - metric=metric, - use_weights=use_weights) + eps=eps, min_samples=min_samples, metric=metric, use_weights=use_weights + ) def _test_across_grid_parameter_numpy_gen(metric, use_weights: bool): @@ -88,17 +97,13 @@ def _test_across_grid_parameter_numpy_gen(metric, use_weights: bool): min_samples_end = 15 min_samples_step = 1 for eps in 
np.arange(eps_begin, eps_end, eps_step): - for min_samples in range( - min_samples_begin, min_samples_end, min_samples_step): + for min_samples in range(min_samples_begin, min_samples_end, min_samples_step): _test_dbscan_big_data_numpy_gen( - eps=eps, - min_samples=min_samples, - metric=metric, - use_weights=use_weights) + eps=eps, min_samples=min_samples, metric=metric, use_weights=use_weights + ) -@pytest.mark.parametrize('metric', METRIC) -@pytest.mark.parametrize('use_weights', USE_WEIGHTS) +@pytest.mark.parametrize("metric", METRIC) +@pytest.mark.parametrize("use_weights", USE_WEIGHTS) def test_across_grid_parameter_numpy_gen(metric, use_weights: bool): - _test_across_grid_parameter_numpy_gen( - metric=metric, use_weights=use_weights) + _test_across_grid_parameter_numpy_gen(metric=metric, use_weights=use_weights) diff --git a/daal4py/sklearn/decomposition/__init__.py b/daal4py/sklearn/decomposition/__init__.py index 404a5cff95..a58befffe2 100644 --- a/daal4py/sklearn/decomposition/__init__.py +++ b/daal4py/sklearn/decomposition/__init__.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2014 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,8 +13,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from ._pca import PCA -__all__ = ['PCA'] +__all__ = ["PCA"] diff --git a/daal4py/sklearn/decomposition/_pca.py b/daal4py/sklearn/decomposition/_pca.py index e04f23d664..06792795f5 100644 --- a/daal4py/sklearn/decomposition/_pca.py +++ b/daal4py/sklearn/decomposition/_pca.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2014 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,30 +12,30 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== -import numpy as np import numbers from math import sqrt -from scipy.sparse import issparse +import numpy as np +from scipy.sparse import issparse from sklearn.utils import check_array -from sklearn.utils.validation import check_is_fitted from sklearn.utils.extmath import stable_cumsum +from sklearn.utils.validation import check_is_fitted import daal4py -from .._utils import ( - getFPType, sklearn_check_version, PatchingConditionsChain) + from .._device_offload import support_usm_ndarray +from .._utils import PatchingConditionsChain, getFPType, sklearn_check_version -if sklearn_check_version('0.22'): +if sklearn_check_version("0.22"): from sklearn.decomposition._pca import PCA as PCA_original else: from sklearn.decomposition.pca import PCA as PCA_original -if sklearn_check_version('0.23'): +if sklearn_check_version("0.23"): from sklearn.decomposition._pca import _infer_dimension -elif sklearn_check_version('0.22'): +elif sklearn_check_version("0.22"): from sklearn.decomposition._pca import _infer_dimension_ else: from sklearn.decomposition.pca import _infer_dimension_ @@ -49,12 +49,12 @@ def __init__( n_components=None, copy=True, whiten=False, - svd_solver='auto', + svd_solver="auto", tol=0.0, - iterated_power='auto', + iterated_power="auto", n_oversamples=10, power_iteration_normalizer="auto", - random_state=None + random_state=None, ): self.n_components = n_components self.copy = copy @@ -67,27 +67,30 @@ def __init__( self.random_state = random_state def _validate_n_components(self, n_components, n_samples, n_features): - if n_components == 'mle': + if n_components == "mle": if n_samples < n_features: - raise ValueError("n_components='mle' is only supported " - "if n_samples >= n_features") + raise ValueError( + "n_components='mle' is only supported " "if n_samples >= n_features" + ) elif not 0 <= n_components <= min(n_samples, n_features): - raise ValueError("n_components=%r must be between 0 and " - "min(n_samples, n_features)=%r with " - "svd_solver='full'" - % (n_components, min(n_samples, n_features))) + raise ValueError( + "n_components=%r must be between 0 and " + "min(n_samples, n_features)=%r with " + "svd_solver='full'" % (n_components, min(n_samples, n_features)) + ) elif n_components >= 1: if not isinstance(n_components, numbers.Integral): - raise ValueError("n_components=%r must be of type int " - "when greater than or equal to 1, " - "was of type=%r" - % (n_components, type(n_components))) + raise ValueError( + "n_components=%r must be of type int " + "when greater than or equal to 1, " + "was of type=%r" % (n_components, type(n_components)) + ) def _fit_full_daal4py(self, X, n_components): n_samples, n_features = X.shape n_sf_min = min(n_samples, n_features) - if n_components == 'mle': + if n_components == "mle": daal_n_components = n_features elif n_components < 1: daal_n_components = n_sf_min @@ -97,7 +100,8 @@ def _fit_full_daal4py(self, X, n_components): fpType = getFPType(X) covariance_algo = daal4py.covariance( - fptype=fpType, outputMatrixType='covarianceMatrix') + fptype=fpType, outputMatrixType="covarianceMatrix" + ) covariance_res = covariance_algo.compute(X) self.mean_ = covariance_res.mean.ravel() @@ -106,10 +110,10 @@ def _fit_full_daal4py(self, X, n_components): pca_alg = daal4py.pca( fptype=fpType, - method='correlationDense', - resultsToCompute='eigenvalue', + method="correlationDense", + 
resultsToCompute="eigenvalue", isDeterministic=True, - nComponents=daal_n_components + nComponents=daal_n_components, ) pca_res = pca_alg.compute(X, covariance) @@ -118,16 +122,16 @@ def _fit_full_daal4py(self, X, n_components): tot_var = explained_variance_.sum() explained_variance_ratio_ = explained_variance_ / tot_var - if n_components == 'mle': - if sklearn_check_version('0.23'): + if n_components == "mle": + if sklearn_check_version("0.23"): n_components = _infer_dimension(explained_variance_, n_samples) else: - n_components = \ - _infer_dimension_(explained_variance_, n_samples, n_features) + n_components = _infer_dimension_( + explained_variance_, n_samples, n_features + ) elif 0 < n_components < 1.0: ratio_cumsum = stable_cumsum(explained_variance_ratio_) - n_components = np.searchsorted(ratio_cumsum, n_components, - side='right') + 1 + n_components = np.searchsorted(ratio_cumsum, n_components, side="right") + 1 if n_components < n_sf_min: if explained_variance_.shape[0] == n_sf_min: @@ -137,9 +141,9 @@ def _fit_full_daal4py(self, X, n_components): resid_var_ -= explained_variance_[:n_components].sum() self.noise_variance_ = resid_var_ / (n_sf_min - n_components) else: - self.noise_variance_ = 0. + self.noise_variance_ = 0.0 - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): self.n_samples_, self.n_features_in_ = n_samples, n_features else: self.n_samples_, self.n_features_ = n_samples, n_features @@ -159,23 +163,23 @@ def _fit_full(self, X, n_components): V = self.components_ S = self.singular_values_ - if n_components == 'mle': - if sklearn_check_version('0.23'): + if n_components == "mle": + if sklearn_check_version("0.23"): n_components = _infer_dimension(self.explained_variance_, n_samples) else: - n_components = \ - _infer_dimension_(self.explained_variance_, n_samples, n_features) + n_components = _infer_dimension_( + self.explained_variance_, n_samples, n_features + ) elif 0 < n_components < 1.0: ratio_cumsum = stable_cumsum(self.explained_variance_ratio_) - n_components = np.searchsorted(ratio_cumsum, n_components, - side='right') + 1 + n_components = np.searchsorted(ratio_cumsum, n_components, side="right") + 1 if n_components < min(n_features, n_samples): self.noise_variance_ = self.explained_variance_[n_components:].mean() else: - self.noise_variance_ = 0. + self.noise_variance_ = 0.0 - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): self.n_samples_, self.n_features_in_ = n_samples, n_features else: self.n_samples_, self.n_features_ = n_samples, n_features @@ -189,17 +193,20 @@ def _fit_full(self, X, n_components): def _fit(self, X): if issparse(X): - raise TypeError('PCA does not support sparse input. See ' - 'TruncatedSVD for a possible alternative.') - - if sklearn_check_version('0.23'): - X = self._validate_data(X, dtype=[np.float64, np.float32], - ensure_2d=True, copy=False) + raise TypeError( + "PCA does not support sparse input. See " + "TruncatedSVD for a possible alternative." 
+ ) + + if sklearn_check_version("0.23"): + X = self._validate_data( + X, dtype=[np.float64, np.float32], ensure_2d=True, copy=False + ) else: X = check_array(X, dtype=[np.float64, np.float32], ensure_2d=True, copy=False) if self.n_components is None: - if self.svd_solver != 'arpack': + if self.svd_solver != "arpack": n_components = min(X.shape) else: n_components = min(X.shape) - 1 @@ -209,8 +216,8 @@ def _fit(self, X): self._fit_svd_solver = self.svd_solver shape_good_for_daal = X.shape[1] / X.shape[0] < 2 - if self._fit_svd_solver == 'auto': - if sklearn_check_version('1.1'): + if self._fit_svd_solver == "auto": + if sklearn_check_version("1.1"): # Small problem or n_components == 'mle', just call full PCA if max(X.shape) <= 500 or n_components == "mle": self._fit_svd_solver = "full" @@ -220,8 +227,8 @@ def _fit(self, X): else: self._fit_svd_solver = "full" else: - if n_components == 'mle': - self._fit_svd_solver = 'full' + if n_components == "mle": + self._fit_svd_solver = "full" else: n, p, k = X.shape[0], X.shape[1], n_components # These coefficients are result of training of Logistic Regression @@ -230,56 +237,68 @@ def _fit(self, X): # X is a dataset with npk, np^2, and n^2 columns. # And y is speedup of patched scikit-learn's # full PCA against stock scikit-learn's randomized PCA. - regression_coefs = np.array([ - [9.779873e-11, n * p * k], - [-1.122062e-11, n * p * p], - [1.127905e-09, n ** 2], - ]) - - if n_components >= 1 and np.dot( - regression_coefs[:, 0], regression_coefs[:, 1]) <= 0: - self._fit_svd_solver = 'randomized' + regression_coefs = np.array( + [ + [9.779873e-11, n * p * k], + [-1.122062e-11, n * p * p], + [1.127905e-09, n**2], + ] + ) + + if ( + n_components >= 1 + and np.dot(regression_coefs[:, 0], regression_coefs[:, 1]) <= 0 + ): + self._fit_svd_solver = "randomized" else: - self._fit_svd_solver = 'full' + self._fit_svd_solver = "full" - if not shape_good_for_daal or self._fit_svd_solver != 'full': - if sklearn_check_version('0.23'): + if not shape_good_for_daal or self._fit_svd_solver != "full": + if sklearn_check_version("0.23"): X = self._validate_data(X, copy=self.copy) else: X = check_array(X, copy=self.copy) - _patching_status = PatchingConditionsChain( - "sklearn.decomposition.PCA.fit") - _dal_ready = _patching_status.and_conditions([ - (self._fit_svd_solver == 'full', - f"'{self._fit_svd_solver}' SVD solver is not supported. " - "Only 'full' solver is supported.") - ]) + _patching_status = PatchingConditionsChain("sklearn.decomposition.PCA.fit") + _dal_ready = _patching_status.and_conditions( + [ + ( + self._fit_svd_solver == "full", + f"'{self._fit_svd_solver}' SVD solver is not supported. 
" + "Only 'full' solver is supported.", + ) + ] + ) if _dal_ready: - _dal_ready = _patching_status.and_conditions([ - (shape_good_for_daal, - "The shape of X does not satisfy oneDAL requirements: " - "number of features / number of samples >= 2") - ]) + _dal_ready = _patching_status.and_conditions( + [ + ( + shape_good_for_daal, + "The shape of X does not satisfy oneDAL requirements: " + "number of features / number of samples >= 2", + ) + ] + ) if _dal_ready: result = self._fit_full(X, n_components) else: result = PCA_original._fit_full(self, X, n_components) - elif self._fit_svd_solver in ['arpack', 'randomized']: + elif self._fit_svd_solver in ["arpack", "randomized"]: result = self._fit_truncated(X, n_components, self._fit_svd_solver) else: - raise ValueError("Unrecognized svd_solver='{0}'" - "".format(self._fit_svd_solver)) + raise ValueError( + "Unrecognized svd_solver='{0}'" "".format(self._fit_svd_solver) + ) _patching_status.write_log() return result def _transform_daal4py(self, X, whiten=False, scale_eigenvalues=True, check_X=True): - if sklearn_check_version('0.22'): + if sklearn_check_version("0.22"): check_is_fitted(self) else: - check_is_fitted(self, ['mean_', 'components_'], all_or_any=all) + check_is_fitted(self, ["mean_", "components_"], all_or_any=all) if sklearn_check_version("1.0"): self._check_feature_names(X, reset=False) @@ -288,30 +307,36 @@ def _transform_daal4py(self, X, whiten=False, scale_eigenvalues=True, check_X=Tr tr_data = dict() if self.mean_ is not None: - tr_data['mean'] = self.mean_.reshape((1, -1)) + tr_data["mean"] = self.mean_.reshape((1, -1)) if whiten: if scale_eigenvalues: - tr_data['eigenvalue'] = \ - (self.n_samples_ - 1) * self.explained_variance_.reshape((1, -1)) + tr_data["eigenvalue"] = ( + self.n_samples_ - 1 + ) * self.explained_variance_.reshape((1, -1)) else: - tr_data['eigenvalue'] = self.explained_variance_.reshape((1, -1)) + tr_data["eigenvalue"] = self.explained_variance_.reshape((1, -1)) elif scale_eigenvalues: - tr_data['eigenvalue'] = np.full( + tr_data["eigenvalue"] = np.full( (1, self.explained_variance_.shape[0]), - self.n_samples_ - 1.0, dtype=X.dtype) + self.n_samples_ - 1.0, + dtype=X.dtype, + ) - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): expected_n_features = self.n_features_in_ else: expected_n_features = self.n_features_ if X.shape[1] != expected_n_features: raise ValueError( - (f'X has {X.shape[1]} features, ' - f'but PCA is expecting {expected_n_features} features as input')) - - tr_res = daal4py.pca_transform( - fptype=fpType - ).compute(X, self.components_, tr_data) + ( + f"X has {X.shape[1]} features, " + f"but PCA is expecting {expected_n_features} features as input" + ) + ) + + tr_res = daal4py.pca_transform(fptype=fpType).compute( + X, self.components_, tr_data + ) return tr_res.transformedData @@ -335,16 +360,16 @@ def transform(self, X): Projection of X in the first principal components, where `n_samples` is the number of samples and `n_components` is the number of the components. 
""" - _patching_status = PatchingConditionsChain( - "sklearn.decomposition.PCA.transform") - _dal_ready = _patching_status.and_conditions([ - (self.n_components_ > 0, "Number of components <= 0.") - ]) + _patching_status = PatchingConditionsChain("sklearn.decomposition.PCA.transform") + _dal_ready = _patching_status.and_conditions( + [(self.n_components_ > 0, "Number of components <= 0.")] + ) _patching_status.write_log() if _dal_ready: - return self._transform_daal4py(X, whiten=self.whiten, - check_X=True, scale_eigenvalues=False) + return self._transform_daal4py( + X, whiten=self.whiten, check_X=True, scale_eigenvalues=False + ) return PCA_original.transform(self, X) @support_usm_ndarray() @@ -372,32 +397,34 @@ def fit_transform(self, X, y=None): C-ordered array, use 'np.ascontiguousarray'. """ - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): self._validate_params() U, S, Vt = self._fit(X) _patching_status = PatchingConditionsChain( - "sklearn.decomposition.PCA.fit_transform") - _dal_ready = _patching_status.and_conditions([ - (U is None, "Stock fitting was used.") - ]) + "sklearn.decomposition.PCA.fit_transform" + ) + _dal_ready = _patching_status.and_conditions( + [(U is None, "Stock fitting was used.")] + ) if _dal_ready: - _dal_ready = _patching_status.and_conditions([ - (self.n_components_ > 0, "Number of components <= 0.") - ]) + _dal_ready = _patching_status.and_conditions( + [(self.n_components_ > 0, "Number of components <= 0.")] + ) if _dal_ready: result = self._transform_daal4py( - X, whiten=self.whiten, check_X=False, scale_eigenvalues=False) + X, whiten=self.whiten, check_X=False, scale_eigenvalues=False + ) else: result = np.empty((self.n_samples_, 0), dtype=X.dtype) else: - U = U[:, :self.n_components_] + U = U[:, : self.n_components_] if self.whiten: U *= sqrt(X.shape[0] - 1) else: - U *= S[:self.n_components_] + U *= S[: self.n_components_] result = U diff --git a/daal4py/sklearn/ensemble/AdaBoostClassifier.py b/daal4py/sklearn/ensemble/AdaBoostClassifier.py index 320c15dc3e..6871ba26ee 100644 --- a/daal4py/sklearn/ensemble/AdaBoostClassifier.py +++ b/daal4py/sklearn/ensemble/AdaBoostClassifier.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2014 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,20 +12,23 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== # daal4py AdaBoost (Adaptive Boosting) scikit-learn-compatible estimator class -import numpy as np import numbers -from sklearn.base import BaseEstimator, ClassifierMixin -from sklearn.utils.validation import check_X_y, check_array, check_is_fitted + +import numpy as np +from sklearn import __version__ as sklearn_version from sklearn import preprocessing +from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.utils.multiclass import check_classification_targets +from sklearn.utils.validation import check_array, check_is_fitted, check_X_y + import daal4py as d4p + from .._utils import getFPType -from sklearn import __version__ as sklearn_version try: from packaging.version import Version except ImportError: @@ -33,13 +36,15 @@ class AdaBoostClassifier(BaseEstimator, ClassifierMixin): - def __init__(self, - split_criterion='gini', - max_tree_depth=1, - min_observations_in_leaf_node=1, - max_iterations=100, - learning_rate=1.0, - accuracy_threshold=0.01): + def __init__( + self, + split_criterion="gini", + max_tree_depth=1, + min_observations_in_leaf_node=1, + max_iterations=100, + learning_rate=1.0, + accuracy_threshold=0.01, + ): self.split_criterion = split_criterion self.max_tree_depth = max_tree_depth self.min_observations_in_leaf_node = min_observations_in_leaf_node @@ -48,30 +53,44 @@ def __init__(self, self.accuracy_threshold = accuracy_threshold def fit(self, X, y): - if self.split_criterion not in ('gini', 'infoGain'): - raise ValueError('Parameter "split_criterion" must be ' - '"gini" or "infoGain".') - if not isinstance(self.max_tree_depth, numbers.Integral) or \ - self.max_tree_depth < 0: - raise ValueError('Parameter "max_tree_depth" must be ' - 'positive integer value or zero.') - if not isinstance(self.min_observations_in_leaf_node, numbers.Integral) or \ - self.min_observations_in_leaf_node <= 0: - raise ValueError('Parameter "min_observations_in_leaf_node" must be ' - 'non-zero positive integer value.') - if not isinstance(self.max_iterations, numbers.Integral) or \ - self.max_iterations <= 0: - raise ValueError('Parameter "max_iterations" must be ' - 'non-zero positive integer value.') + if self.split_criterion not in ("gini", "infoGain"): + raise ValueError( + 'Parameter "split_criterion" must be ' '"gini" or "infoGain".' + ) + if ( + not isinstance(self.max_tree_depth, numbers.Integral) + or self.max_tree_depth < 0 + ): + raise ValueError( + 'Parameter "max_tree_depth" must be ' "positive integer value or zero." + ) + if ( + not isinstance(self.min_observations_in_leaf_node, numbers.Integral) + or self.min_observations_in_leaf_node <= 0 + ): + raise ValueError( + 'Parameter "min_observations_in_leaf_node" must be ' + "non-zero positive integer value." + ) + if ( + not isinstance(self.max_iterations, numbers.Integral) + or self.max_iterations <= 0 + ): + raise ValueError( + 'Parameter "max_iterations" must be ' "non-zero positive integer value." + ) if self.learning_rate <= 0: - raise ValueError('Parameter "learning_rate" must be ' - 'non-zero positive value.') + raise ValueError( + 'Parameter "learning_rate" must be ' "non-zero positive value." 
+ ) # it is not clear why it is so but we will get error from # Intel(R) oneAPI Data Analytics # Library otherwise if self.accuracy_threshold < 0 and self.accuracy_threshold >= 1: - raise ValueError('Parameter "accuracy_threshold" must be ' - 'more or equal to 0 and less than 1.') + raise ValueError( + 'Parameter "accuracy_threshold" must be ' + "more or equal to 0 and less than 1." + ) # Check that X and y have correct shape X, y = check_X_y(X, y, y_numeric=False, dtype=[np.single, np.double]) @@ -107,11 +126,12 @@ def fit(self, X, y): maxTreeDepth=self.max_tree_depth + 1, minObservationsInLeafNodes=self.min_observations_in_leaf_node, splitCriterion=self.split_criterion, - pruning='none') + pruning="none", + ) pr = d4p.decision_tree_classification_prediction( - fptype=fptype, - nClasses=self.n_classes_) + fptype=fptype, nClasses=self.n_classes_ + ) train_algo = d4p.adaboost_training( fptype=fptype, @@ -120,7 +140,8 @@ def fit(self, X, y): weakLearnerPrediction=pr, maxIterations=self.max_iterations, learningRate=self.learning_rate, - accuracyThreshold=self.accuracy_threshold) + accuracyThreshold=self.accuracy_threshold, + ) train_result = train_algo.compute(X, y_) @@ -135,33 +156,36 @@ def predict(self, X): if Version(sklearn_version) >= Version("0.22"): check_is_fitted(self) else: - check_is_fitted(self, ['n_features_in_', 'n_classes_']) + check_is_fitted(self, ["n_features_in_", "n_classes_"]) # Input validation X = check_array(X, dtype=[np.single, np.double]) if X.shape[1] != self.n_features_in_: - raise ValueError('Shape of input is different from what was seen in `fit`') + raise ValueError("Shape of input is different from what was seen in `fit`") # Trivial case if self.n_classes_ == 1: return np.full(X.shape[0], self.classes_[0]) - if not hasattr(self, 'daal_model_'): - raise ValueError(( - "The class {} instance does not have 'daal_model_' attribute set. " - "Call 'fit' with appropriate arguments before using this method.").format( - type(self).__name__)) + if not hasattr(self, "daal_model_"): + raise ValueError( + ( + "The class {} instance does not have 'daal_model_' attribute set. " + "Call 'fit' with appropriate arguments before using this method." + ).format(type(self).__name__) + ) # Define type of data fptype = getFPType(X) - pr = d4p.decision_tree_classification_prediction(fptype=fptype, - nClasses=self.n_classes_) + pr = d4p.decision_tree_classification_prediction( + fptype=fptype, nClasses=self.n_classes_ + ) # Prediction - predict_algo = d4p.adaboost_prediction(fptype=fptype, - nClasses=self.n_classes_, - weakLearnerPrediction=pr) + predict_algo = d4p.adaboost_prediction( + fptype=fptype, nClasses=self.n_classes_, weakLearnerPrediction=pr + ) predict_result = predict_algo.compute(X, self.daal_model_) prediction = predict_result.prediction diff --git a/daal4py/sklearn/ensemble/GBTDAAL.py b/daal4py/sklearn/ensemble/GBTDAAL.py index 1ea795b2cd..be97722f18 100644 --- a/daal4py/sklearn/ensemble/GBTDAAL.py +++ b/daal4py/sklearn/ensemble/GBTDAAL.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2014 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,36 +12,41 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== # daal4py GBT scikit-learn-compatible estimator class -import numpy as np import numbers -from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin -from sklearn.utils.validation import check_X_y, check_array, check_is_fitted + +import numpy as np from sklearn import preprocessing -from sklearn.utils.multiclass import check_classification_targets +from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin from sklearn.utils import check_random_state +from sklearn.utils.multiclass import check_classification_targets +from sklearn.utils.validation import check_array, check_is_fitted, check_X_y + import daal4py as d4p + from .._utils import getFPType class GBTDAALBase(BaseEstimator, d4p.mb.GBTDAALBaseModel): - def __init__(self, - split_method='inexact', - max_iterations=50, - max_tree_depth=6, - shrinkage=0.3, - min_split_loss=0, - reg_lambda=1, - observations_per_tree_fraction=1, - features_per_node=0, - min_observations_in_leaf_node=5, - memory_saving_mode=False, - max_bins=256, - min_bin_size=5, - random_state=None): + def __init__( + self, + split_method="inexact", + max_iterations=50, + max_tree_depth=6, + shrinkage=0.3, + min_split_loss=0, + reg_lambda=1, + observations_per_tree_fraction=1, + features_per_node=0, + min_observations_in_leaf_node=5, + memory_saving_mode=False, + max_bins=256, + min_bin_size=5, + random_state=None, + ): self.split_method = split_method self.max_iterations = max_iterations self.max_tree_depth = max_tree_depth @@ -57,49 +62,65 @@ def __init__(self, self.random_state = random_state def _check_params(self): - if self.split_method not in ('inexact', 'exact'): - raise ValueError('Parameter "split_method" must be ' - '"inexact" or "exact".') - if not isinstance(self.max_iterations, numbers.Integral) or \ - self.max_iterations <= 0: - raise ValueError('Parameter "max_iterations" must be ' - 'non-zero positive integer value.') - if not isinstance(self.max_tree_depth, numbers.Integral) or \ - self.max_tree_depth < 0: - raise ValueError('Parameter "max_tree_depth" must be ' - 'positive integer value or zero.') + if self.split_method not in ("inexact", "exact"): + raise ValueError('Parameter "split_method" must be ' '"inexact" or "exact".') + if ( + not isinstance(self.max_iterations, numbers.Integral) + or self.max_iterations <= 0 + ): + raise ValueError( + 'Parameter "max_iterations" must be ' "non-zero positive integer value." + ) + if ( + not isinstance(self.max_tree_depth, numbers.Integral) + or self.max_tree_depth < 0 + ): + raise ValueError( + 'Parameter "max_tree_depth" must be ' "positive integer value or zero." + ) if self.shrinkage < 0 or self.shrinkage >= 1: - raise ValueError('Parameter "shrinkage" must be ' - 'more or equal to 0 and less than 1.') + raise ValueError( + 'Parameter "shrinkage" must be ' "more or equal to 0 and less than 1." + ) if self.min_split_loss < 0: - raise ValueError('Parameter "min_split_loss" must be ' - 'more or equal to zero.') + raise ValueError( + 'Parameter "min_split_loss" must be ' "more or equal to zero." 
+ ) if self.reg_lambda < 0: - raise ValueError('Parameter "reg_lambda" must be ' - 'more or equal to zero.') - if self.observations_per_tree_fraction <= 0 or \ - self.observations_per_tree_fraction > 1: - raise ValueError('Parameter "observations_per_tree_fraction" must be ' - 'more than 0 and less or equal to 1.') - if not isinstance(self.features_per_node, numbers.Integral) or \ - self.features_per_node < 0: - raise ValueError('Parameter "features_per_node" must be ' - 'positive integer value or zero.') - if not isinstance(self.min_observations_in_leaf_node, numbers.Integral) or \ - self.min_observations_in_leaf_node <= 0: - raise ValueError('Parameter "min_observations_in_leaf_node" must be ' - 'non-zero positive integer value.') + raise ValueError('Parameter "reg_lambda" must be ' "more or equal to zero.") + if ( + self.observations_per_tree_fraction <= 0 + or self.observations_per_tree_fraction > 1 + ): + raise ValueError( + 'Parameter "observations_per_tree_fraction" must be ' + "more than 0 and less or equal to 1." + ) + if ( + not isinstance(self.features_per_node, numbers.Integral) + or self.features_per_node < 0 + ): + raise ValueError( + 'Parameter "features_per_node" must be ' "positive integer value or zero." + ) + if ( + not isinstance(self.min_observations_in_leaf_node, numbers.Integral) + or self.min_observations_in_leaf_node <= 0 + ): + raise ValueError( + 'Parameter "min_observations_in_leaf_node" must be ' + "non-zero positive integer value." + ) if not (isinstance(self.memory_saving_mode, bool)): - raise ValueError('Parameter "memory_saving_mode" must be ' - 'boolean value.') - if not isinstance(self.max_bins, numbers.Integral) or \ - self.max_bins <= 0: - raise ValueError('Parameter "max_bins" must be ' - 'non-zero positive integer value.') - if not isinstance(self.min_bin_size, numbers.Integral) or \ - self.min_bin_size <= 0: - raise ValueError('Parameter "min_bin_size" must be ' - 'non-zero positive integer value.') + raise ValueError('Parameter "memory_saving_mode" must be ' "boolean value.") + if not isinstance(self.max_bins, numbers.Integral) or self.max_bins <= 0: + raise ValueError( + 'Parameter "max_bins" must be ' "non-zero positive integer value." + ) + if not isinstance(self.min_bin_size, numbers.Integral) or self.min_bin_size <= 0: + raise ValueError( + 'Parameter "min_bin_size" must be ' "non-zero positive integer value." 
+ ) allow_nan_ = False @@ -139,7 +160,7 @@ def fit(self, X, y): # Get random seed rs_ = check_random_state(self.random_state) - seed_ = rs_.randint(0, np.iinfo('i').max) + seed_ = rs_.randint(0, np.iinfo("i").max) # Define type of data fptype = getFPType(X) @@ -160,7 +181,8 @@ def fit(self, X, y): memorySavingMode=self.memory_saving_mode, maxBins=self.max_bins, minBinSize=self.min_bin_size, - engine=d4p.engines_mcg59(seed=seed_)) + engine=d4p.engines_mcg59(seed=seed_), + ) train_result = train_algo.compute(X, y_) # Store the model @@ -174,10 +196,10 @@ def _predict(self, X, resultsToEvaluate): if not self.allow_nan_: X = check_array(X, dtype=[np.single, np.double]) else: - X = check_array(X, dtype=[np.single, np.double], force_all_finite='allow-nan') + X = check_array(X, dtype=[np.single, np.double], force_all_finite="allow-nan") # Check is fit had been called - check_is_fitted(self, ['n_features_in_', 'n_classes_']) + check_is_fitted(self, ["n_features_in_", "n_classes_"]) # Trivial case if self.n_classes_ == 1: @@ -234,7 +256,7 @@ def fit(self, X, y): # Get random seed rs_ = check_random_state(self.random_state) - seed_ = rs_.randint(0, np.iinfo('i').max) + seed_ = rs_.randint(0, np.iinfo("i").max) # Define type of data fptype = getFPType(X) @@ -254,7 +276,8 @@ def fit(self, X, y): memorySavingMode=self.memory_saving_mode, maxBins=self.max_bins, minBinSize=self.min_bin_size, - engine=d4p.engines_mcg59(seed=seed_)) + engine=d4p.engines_mcg59(seed=seed_), + ) train_result = train_algo.compute(X, y_) # Store the model @@ -268,10 +291,10 @@ def predict(self, X): if not self.allow_nan_: X = check_array(X, dtype=[np.single, np.double]) else: - X = check_array(X, dtype=[np.single, np.double], force_all_finite='allow-nan') + X = check_array(X, dtype=[np.single, np.double], force_all_finite="allow-nan") # Check is fit had been called - check_is_fitted(self, ['n_features_in_']) + check_is_fitted(self, ["n_features_in_"]) fptype = getFPType(X) return self._predict_regression(X, fptype) diff --git a/daal4py/sklearn/ensemble/__init__.py b/daal4py/sklearn/ensemble/__init__.py index 15e97b423c..17e0d148d3 100644 --- a/daal4py/sklearn/ensemble/__init__.py +++ b/daal4py/sklearn/ensemble/__init__.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2014 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,11 +13,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== -from ._forest import (RandomForestClassifier, RandomForestRegressor) -from .GBTDAAL import (GBTDAALClassifier, GBTDAALRegressor) +from ._forest import RandomForestClassifier, RandomForestRegressor from .AdaBoostClassifier import AdaBoostClassifier +from .GBTDAAL import GBTDAALClassifier, GBTDAALRegressor -__all__ = ['RandomForestClassifier', 'RandomForestRegressor', - 'GBTDAALClassifier', 'GBTDAALRegressor', 'AdaBoostClassifier'] +__all__ = [ + "RandomForestClassifier", + "RandomForestRegressor", + "GBTDAALClassifier", + "GBTDAALRegressor", + "AdaBoostClassifier", +] diff --git a/daal4py/sklearn/ensemble/_forest.py b/daal4py/sklearn/ensemble/_forest.py index 031c23b050..2c08aa02f3 100755 --- a/daal4py/sklearn/ensemble/_forest.py +++ b/daal4py/sklearn/ensemble/_forest.py @@ -37,10 +37,10 @@ import daal4py from daal4py.sklearn._utils import ( PatchingConditionsChain, + check_tree_nodes, daal_check_version, - sklearn_check_version, getFPType, - check_tree_nodes, + sklearn_check_version, ) from .._device_offload import support_usm_ndarray diff --git a/daal4py/sklearn/linear_model/__init__.py b/daal4py/sklearn/linear_model/__init__.py index 4bfd932017..463e164575 100755 --- a/daal4py/sklearn/linear_model/__init__.py +++ b/daal4py/sklearn/linear_model/__init__.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2014 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,15 +13,18 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== +from .coordinate_descent import ElasticNet, Lasso from .linear import LinearRegression -from .logistic_path import logistic_regression_path, LogisticRegression +from .logistic_path import LogisticRegression, logistic_regression_path from .ridge import Ridge -from .coordinate_descent import ElasticNet, Lasso -__all__ = ['Ridge', 'LinearRegression', - 'LogisticRegression', - 'logistic_regression_path', - 'ElasticNet', - 'Lasso'] +__all__ = [ + "Ridge", + "LinearRegression", + "LogisticRegression", + "logistic_regression_path", + "ElasticNet", + "Lasso", +] diff --git a/daal4py/sklearn/linear_model/_coordinate_descent.py b/daal4py/sklearn/linear_model/_coordinate_descent.py index a02f966ed6..fde5b25c27 100755 --- a/daal4py/sklearn/linear_model/_coordinate_descent.py +++ b/daal4py/sklearn/linear_model/_coordinate_descent.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,26 +12,35 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== -import numpy as np import numbers -import daal4py + +import numpy as np from scipy import sparse as sp -from sklearn.utils import check_array, check_X_y from sklearn.linear_model._coordinate_descent import ElasticNet as ElasticNet_original from sklearn.linear_model._coordinate_descent import Lasso as Lasso_original +from sklearn.utils import check_array, check_X_y + +import daal4py from daal4py.sklearn._utils import ( - make2d, getFPType, get_patch_message, sklearn_check_version, PatchingConditionsChain) -if sklearn_check_version('1.0') and not sklearn_check_version('1.2'): + PatchingConditionsChain, + get_patch_message, + getFPType, + make2d, + sklearn_check_version, +) + +if sklearn_check_version("1.0") and not sklearn_check_version("1.2"): from sklearn.linear_model._base import _deprecate_normalize -if sklearn_check_version('1.1') and not sklearn_check_version('1.2'): +if sklearn_check_version("1.1") and not sklearn_check_version("1.2"): from sklearn.utils import check_scalar import logging # only for compliance with Sklearn import warnings + from sklearn.exceptions import ConvergenceWarning from sklearn.preprocessing import normalize @@ -43,15 +52,22 @@ def _daal4py_check(self, X, y, check_input): # check alpha if self.alpha == 0: - warnings.warn("With alpha=0, this algorithm does not converge " - "well. You are advised to use the LinearRegression " - "estimator", stacklevel=2) + warnings.warn( + "With alpha=0, this algorithm does not converge " + "well. You are advised to use the LinearRegression " + "estimator", + stacklevel=2, + ) # check l1_ratio - if not isinstance(self.l1_ratio, numbers.Number) or \ - self.l1_ratio < 0 or self.l1_ratio > 1: - raise ValueError("l1_ratio must be between 0 and 1; " - f"got l1_ratio={self.l1_ratio}") + if ( + not isinstance(self.l1_ratio, numbers.Number) + or self.l1_ratio < 0 + or self.l1_ratio > 1 + ): + raise ValueError( + "l1_ratio must be between 0 and 1; " f"got l1_ratio={self.l1_ratio}" + ) # check precompute if isinstance(self.precompute, np.ndarray): @@ -59,17 +75,18 @@ def _daal4py_check(self, X, y, check_input): check_array(self.precompute, dtype=_fptype) self.precompute = make2d(self.precompute) else: - if self.precompute not in [False, True, 'auto']: - raise ValueError("precompute should be one of True, False, " - "'auto' or array-like. Got %r" % self.precompute) + if self.precompute not in [False, True, "auto"]: + raise ValueError( + "precompute should be one of True, False, " + "'auto' or array-like. 
Got %r" % self.precompute + ) # check selection - if self.selection not in ['random', 'cyclic']: + if self.selection not in ["random", "cyclic"]: raise ValueError("selection should be either random or cyclic.") def _daal4py_fit_enet(self, X, y_, check_input): - # appropriate checks _daal4py_check(self, X, y_, check_input) X = make2d(X) @@ -79,18 +96,18 @@ def _daal4py_fit_enet(self, X, y_, check_input): # only for dual_gap computation, it is not required for Intel(R) oneAPI # Data Analytics Library self._X = X - if sklearn_check_version('0.23'): + if sklearn_check_version("0.23"): self.n_features_in_ = X.shape[1] self._y = y penalty_L1 = np.asarray(self.alpha * self.l1_ratio, dtype=X.dtype) penalty_L2 = np.asarray(self.alpha * (1.0 - self.l1_ratio), dtype=X.dtype) - if (penalty_L1.size != 1 or penalty_L2.size != 1): + if penalty_L1.size != 1 or penalty_L2.size != 1: raise ValueError("alpha or l1_ratio length is wrong") penalty_L1 = penalty_L1.reshape((1, -1)) penalty_L2 = penalty_L2.reshape((1, -1)) - #normalizing and centering + # normalizing and centering X_offset = np.zeros(X.shape[1], dtype=X.dtype) X_scale = np.ones(X.shape[1], dtype=X.dtype) if y.ndim == 1: @@ -98,10 +115,10 @@ def _daal4py_fit_enet(self, X, y_, check_input): else: y_offset = np.zeros(y.shape[1], dtype=X.dtype) - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): _normalize = False else: - _normalize = self._normalize if sklearn_check_version('1.0') else self.normalize + _normalize = self._normalize if sklearn_check_version("1.0") else self.normalize if self.fit_intercept: X_offset = np.average(X, axis=0) if _normalize: @@ -114,25 +131,29 @@ def _daal4py_fit_enet(self, X, y_, check_input): y = y - y_offset # only for compliance with Sklearn - if isinstance(self.precompute, np.ndarray) and self.fit_intercept and \ - not np.allclose(X_offset, np.zeros(X.shape[1])) or \ - _normalize and not np.allclose(X_scale, np.ones(X.shape[1])): - warnings.warn("Gram matrix was provided but X was centered" - " to fit intercept, " - "or X was normalized : recomputing Gram matrix.", - UserWarning) + if ( + isinstance(self.precompute, np.ndarray) + and self.fit_intercept + and not np.allclose(X_offset, np.zeros(X.shape[1])) + or _normalize + and not np.allclose(X_scale, np.ones(X.shape[1])) + ): + warnings.warn( + "Gram matrix was provided but X was centered" + " to fit intercept, " + "or X was normalized : recomputing Gram matrix.", + UserWarning, + ) mse_alg = daal4py.optimization_solver_mse( - numberOfTerms=X.shape[0], - fptype=_fptype, - method='defaultDense' + numberOfTerms=X.shape[0], fptype=_fptype, method="defaultDense" ) mse_alg.setup(X, y, None) cd_solver = daal4py.optimization_solver_coordinate_descent( function=mse_alg, fptype=_fptype, - method='defaultDense', + method="defaultDense", selection=self.selection, seed=0 if self.random_state is None else self.random_state, nIterations=self.max_iter, @@ -141,36 +162,37 @@ def _daal4py_fit_enet(self, X, y_, check_input): ) # set warm_start - if self.warm_start and hasattr(self, "coef_") and \ - isinstance(self.coef_, np.ndarray): + if self.warm_start and hasattr(self, "coef_") and isinstance(self.coef_, np.ndarray): n_rows = y.shape[1] n_cols = X.shape[1] + 1 inputArgument = np.zeros((n_rows, n_cols), dtype=_fptype) for i in range(n_rows): - inputArgument[i][0] = self.intercept_ if ( - n_rows == 1) else self.intercept_[i] - inputArgument[i][1:] = self.coef_[:].copy(order='C') if ( - n_rows == 1) else self.coef_[i, :].copy(order='C') + inputArgument[i][0] = 
self.intercept_ if (n_rows == 1) else self.intercept_[i] + inputArgument[i][1:] = ( + self.coef_[:].copy(order="C") + if (n_rows == 1) + else self.coef_[i, :].copy(order="C") + ) cd_solver.setup(inputArgument) - doUse_condition = self.copy_X is False or \ - (self.fit_intercept and _normalize and self.copy_X) + doUse_condition = self.copy_X is False or ( + self.fit_intercept and _normalize and self.copy_X + ) elastic_net_alg = daal4py.elastic_net_training( fptype=_fptype, - method='defaultDense', - interceptFlag=( - self.fit_intercept is True), - dataUseInComputation='doUse' if doUse_condition else 'doNotUse', + method="defaultDense", + interceptFlag=(self.fit_intercept is True), + dataUseInComputation="doUse" if doUse_condition else "doNotUse", penaltyL1=penalty_L1, penaltyL2=penalty_L2, - optimizationSolver=cd_solver + optimizationSolver=cd_solver, ) try: if isinstance(self.precompute, np.ndarray): elastic_net_res = elastic_net_alg.compute( - data=X, dependentVariables=y, gramMatrix=self.precompute) + data=X, dependentVariables=y, gramMatrix=self.precompute + ) else: - elastic_net_res = elastic_net_alg.compute( - data=X, dependentVariables=y) + elastic_net_res = elastic_net_alg.compute(data=X, dependentVariables=y) except RuntimeError: return None @@ -182,12 +204,13 @@ def _daal4py_fit_enet(self, X, y_, check_input): if self.fit_intercept and _normalize: elastic_net_model.Beta[:, 1:] = elastic_net_model.Beta[:, 1:] / X_scale elastic_net_model.Beta[:, 0] = ( - y_offset - np.dot(X_offset, elastic_net_model.Beta[:, 1:].T)).T + y_offset - np.dot(X_offset, elastic_net_model.Beta[:, 1:].T) + ).T coefs = elastic_net_model.Beta - self.intercept_ = coefs[:, 0].copy(order='C') - self.coef_ = coefs[:, 1:].copy(order='C') + self.intercept_ = coefs[:, 0].copy(order="C") + self.coef_ = coefs[:, 1:].copy(order="C") # only for compliance with Sklearn if y.shape[1] == 1: @@ -205,8 +228,11 @@ def _daal4py_fit_enet(self, X, y_, check_input): # only for compliance with Sklearn if self.max_iter == n_iter + 1: - warnings.warn("Objective did not converge. You might want to " - "increase the number of iterations.", ConvergenceWarning) + warnings.warn( + "Objective did not converge. 
You might want to " + "increase the number of iterations.", + ConvergenceWarning, + ) return self @@ -216,14 +242,15 @@ def _daal4py_predict_enet(self, X): _fptype = getFPType(self.coef_) elastic_net_palg = daal4py.elastic_net_prediction( - fptype=_fptype, - method='defaultDense' + fptype=_fptype, method="defaultDense" ) - if sklearn_check_version('0.23'): + if sklearn_check_version("0.23"): if self.n_features_in_ != X.shape[1]: - raise ValueError(f'X has {X.shape[1]} features, ' - f'but ElasticNet is expecting ' - f'{self.n_features_in_} features as input') + raise ValueError( + f"X has {X.shape[1]} features, " + f"but ElasticNet is expecting " + f"{self.n_features_in_} features as input" + ) elastic_net_res = elastic_net_palg.compute(X, self.daal_model_) res = elastic_net_res.prediction @@ -234,7 +261,6 @@ def _daal4py_predict_enet(self, X): def _daal4py_fit_lasso(self, X, y_, check_input): - # appropriate checks _daal4py_check(self, X, y_, check_input) X = make2d(X) @@ -244,11 +270,11 @@ def _daal4py_fit_lasso(self, X, y_, check_input): # only for dual_gap computation, it is not required for Intel(R) oneAPI # Data Analytics Library self._X = X - if sklearn_check_version('0.23'): + if sklearn_check_version("0.23"): self.n_features_in_ = X.shape[1] self._y = y - #normalizing and centering + # normalizing and centering X_offset = np.zeros(X.shape[1], dtype=X.dtype) X_scale = np.ones(X.shape[1], dtype=X.dtype) if y.ndim == 1: @@ -256,10 +282,10 @@ def _daal4py_fit_lasso(self, X, y_, check_input): else: y_offset = np.zeros(y.shape[1], dtype=X.dtype) - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): _normalize = False else: - _normalize = self._normalize if sklearn_check_version('1.0') else self.normalize + _normalize = self._normalize if sklearn_check_version("1.0") else self.normalize if self.fit_intercept: X_offset = np.average(X, axis=0) if _normalize: @@ -272,61 +298,65 @@ def _daal4py_fit_lasso(self, X, y_, check_input): y = y - y_offset # only for compliance with Sklearn - if isinstance(self.precompute, np.ndarray) and \ - self.fit_intercept and not np.allclose( - X_offset, np.zeros(X.shape[1])) or \ - _normalize and not np.allclose(X_scale, np.ones(X.shape[1])): - warnings.warn("Gram matrix was provided but X was centered" - " to fit intercept, " - "or X was normalized : recomputing Gram matrix.", - UserWarning) + if ( + isinstance(self.precompute, np.ndarray) + and self.fit_intercept + and not np.allclose(X_offset, np.zeros(X.shape[1])) + or _normalize + and not np.allclose(X_scale, np.ones(X.shape[1])) + ): + warnings.warn( + "Gram matrix was provided but X was centered" + " to fit intercept, " + "or X was normalized : recomputing Gram matrix.", + UserWarning, + ) mse_alg = daal4py.optimization_solver_mse( - numberOfTerms=X.shape[0], - fptype=_fptype, - method='defaultDense' + numberOfTerms=X.shape[0], fptype=_fptype, method="defaultDense" ) mse_alg.setup(X, y, None) cd_solver = daal4py.optimization_solver_coordinate_descent( function=mse_alg, fptype=_fptype, - method='defaultDense', + method="defaultDense", selection=self.selection, seed=0 if self.random_state is None else self.random_state, nIterations=self.max_iter, positive=self.positive, - accuracyThreshold=self.tol + accuracyThreshold=self.tol, ) # set warm_start - if self.warm_start and hasattr(self, "coef_") and \ - isinstance(self.coef_, np.ndarray): + if self.warm_start and hasattr(self, "coef_") and isinstance(self.coef_, np.ndarray): n_rows = y.shape[1] n_cols = X.shape[1] + 1 inputArgument = 
np.zeros((n_rows, n_cols), dtype=_fptype) for i in range(n_rows): - inputArgument[i][0] = self.intercept_ if ( - n_rows == 1) else self.intercept_[i] - inputArgument[i][1:] = self.coef_[:].copy(order='C') if ( - n_rows == 1) else self.coef_[i, :].copy(order='C') + inputArgument[i][0] = self.intercept_ if (n_rows == 1) else self.intercept_[i] + inputArgument[i][1:] = ( + self.coef_[:].copy(order="C") + if (n_rows == 1) + else self.coef_[i, :].copy(order="C") + ) cd_solver.setup(inputArgument) - doUse_condition = self.copy_X is False or \ - (self.fit_intercept and _normalize and self.copy_X) + doUse_condition = self.copy_X is False or ( + self.fit_intercept and _normalize and self.copy_X + ) lasso_alg = daal4py.lasso_regression_training( fptype=_fptype, - method='defaultDense', + method="defaultDense", interceptFlag=(self.fit_intercept is True), - dataUseInComputation='doUse' if doUse_condition else 'doNotUse', - lassoParameters=np.asarray( - self.alpha, dtype=X.dtype - ).reshape((1, -1)), + dataUseInComputation="doUse" if doUse_condition else "doNotUse", + lassoParameters=np.asarray(self.alpha, dtype=X.dtype).reshape((1, -1)), optimizationSolver=cd_solver, ) try: if isinstance(self.precompute, np.ndarray): lasso_res = lasso_alg.compute( - data=X, dependentVariables=y, gramMatrix=self.precompute) + data=X, dependentVariables=y, gramMatrix=self.precompute + ) else: lasso_res = lasso_alg.compute(data=X, dependentVariables=y) except RuntimeError: @@ -339,13 +369,14 @@ def _daal4py_fit_lasso(self, X, y_, check_input): # update coefficients if normalizing and centering if self.fit_intercept and _normalize: lasso_model.Beta[:, 1:] = lasso_model.Beta[:, 1:] / X_scale - lasso_model.Beta[:, 0] = \ - (y_offset - np.dot(X_offset, lasso_model.Beta[:, 1:].T)).T + lasso_model.Beta[:, 0] = ( + y_offset - np.dot(X_offset, lasso_model.Beta[:, 1:].T) + ).T coefs = lasso_model.Beta - self.intercept_ = coefs[:, 0].copy(order='C') - self.coef_ = coefs[:, 1:].copy(order='C') + self.intercept_ = coefs[:, 0].copy(order="C") + self.coef_ = coefs[:, 1:].copy(order="C") # only for compliance with Sklearn if y.shape[1] == 1: @@ -362,9 +393,12 @@ def _daal4py_fit_lasso(self, X, y_, check_input): self.n_iter_ = np.full(y.shape[1], n_iter) # only for compliance with Sklearn - if (self.max_iter == n_iter + 1): - warnings.warn("Objective did not converge. You might want to " - "increase the number of iterations.", ConvergenceWarning) + if self.max_iter == n_iter + 1: + warnings.warn( + "Objective did not converge. 
You might want to " + "increase the number of iterations.", + ConvergenceWarning, + ) return self @@ -374,14 +408,15 @@ def _daal4py_predict_lasso(self, X): _fptype = getFPType(self.coef_) lasso_palg = daal4py.lasso_regression_prediction( - fptype=_fptype, - method='defaultDense' + fptype=_fptype, method="defaultDense" ) - if sklearn_check_version('0.23'): + if sklearn_check_version("0.23"): if self.n_features_in_ != X.shape[1]: - raise ValueError(f'X has {X.shape[1]} features, ' - f'but Lasso is expecting ' - f'{self.n_features_in_} features as input') + raise ValueError( + f"X has {X.shape[1]} features, " + f"but Lasso is expecting " + f"{self.n_features_in_} features as input" + ) lasso_res = lasso_palg.compute(X, self.daal_model_) res = lasso_res.prediction @@ -392,11 +427,11 @@ def _daal4py_predict_lasso(self, X): def _fit(self, X, y, sample_weight=None, check_input=True): - if sklearn_check_version('1.0'): + if sklearn_check_version("1.0"): self._check_feature_names(X, reset=True) if sklearn_check_version("1.2"): self._validate_params() - elif sklearn_check_version('1.1'): + elif sklearn_check_version("1.1"): check_scalar( self.alpha, "alpha", @@ -433,7 +468,7 @@ def _fit(self, X, y, sample_weight=None, check_input=True): X, y, copy=False, - accept_sparse='csc', + accept_sparse="csc", dtype=[np.float64, np.float32], multi_output=True, y_numeric=True, @@ -441,37 +476,44 @@ def _fit(self, X, y, sample_weight=None, check_input=True): y = check_array(y, copy=False, dtype=X.dtype.type, ensure_2d=False) if not sp.issparse(X): - self.fit_shape_good_for_daal_ = \ + self.fit_shape_good_for_daal_ = ( True if X.ndim <= 1 else True if X.shape[0] >= X.shape[1] else False + ) else: self.fit_shape_good_for_daal_ = False class_name = self.__class__.__name__ - class_inst = ElasticNet if class_name == 'ElasticNet' else Lasso + class_inst = ElasticNet if class_name == "ElasticNet" else Lasso _function_name = f"sklearn.linear_model.{class_name}.fit" - _patching_status = PatchingConditionsChain( - _function_name) - _dal_ready = _patching_status.and_conditions([ - (not sp.issparse(X), "X is sparse. Sparse input is not supported."), - (self.fit_shape_good_for_daal_, - "The shape of X does not satisfy oneDAL requirements: " - "number of features > number of samples."), - (X.dtype == np.float64 or X.dtype == np.float32, - f"'{X.dtype}' X data type is not supported. " - "Only np.float32 and np.float64 are supported."), - (sample_weight is None, "Sample weights are not supported.")]) + _patching_status = PatchingConditionsChain(_function_name) + _dal_ready = _patching_status.and_conditions( + [ + (not sp.issparse(X), "X is sparse. Sparse input is not supported."), + ( + self.fit_shape_good_for_daal_, + "The shape of X does not satisfy oneDAL requirements: " + "number of features > number of samples.", + ), + ( + X.dtype == np.float64 or X.dtype == np.float32, + f"'{X.dtype}' X data type is not supported. 
" + "Only np.float32 and np.float64 are supported.", + ), + (sample_weight is None, "Sample weights are not supported."), + ] + ) _patching_status.write_log() if not _dal_ready: - if hasattr(self, 'daal_model_'): + if hasattr(self, "daal_model_"): del self.daal_model_ - if sklearn_check_version('0.23'): + if sklearn_check_version("0.23"): res_new = super(class_inst, self).fit( - X, y, sample_weight=sample_weight, check_input=check_input) + X, y, sample_weight=sample_weight, check_input=check_input + ) else: - res_new = super(class_inst, self).fit( - X, y, check_input=check_input) + res_new = super(class_inst, self).fit(X, y, check_input=check_input) self._gap = res_new.dual_gap_ return res_new self.n_iter_ = None @@ -481,17 +523,14 @@ def _fit(self, X, y, sample_weight=None, check_input=True): # only for compliance with Sklearn, # this assert is not required for Intel(R) oneAPI Data # Analytics Library - print(type(X), X.flags['F_CONTIGUOUS']) - if isinstance(X, np.ndarray) and \ - X.flags['F_CONTIGUOUS'] is False: + print(type(X), X.flags["F_CONTIGUOUS"]) + if isinstance(X, np.ndarray) and X.flags["F_CONTIGUOUS"] is False: # print(X.flags) raise ValueError("ndarray is not Fortran contiguous") - if sklearn_check_version('1.0') and not sklearn_check_version('1.2'): + if sklearn_check_version("1.0") and not sklearn_check_version("1.2"): self._normalize = _deprecate_normalize( - self.normalize, - default=False, - estimator_name=class_name + self.normalize, default=False, estimator_name=class_name ) # only for pass tests @@ -507,29 +546,27 @@ def _fit(self, X, y, sample_weight=None, check_input=True): else: res = _daal4py_fit_lasso(self, X, y, check_input=check_input) if res is None: - if hasattr(self, 'daal_model_'): + if hasattr(self, "daal_model_"): del self.daal_model_ - logging.info( - _function_name + ": " + get_patch_message("sklearn_after_daal") - ) - if sklearn_check_version('0.23'): + logging.info(_function_name + ": " + get_patch_message("sklearn_after_daal")) + if sklearn_check_version("0.23"): res_new = super(class_inst, self).fit( - X, y, sample_weight=sample_weight, check_input=check_input) + X, y, sample_weight=sample_weight, check_input=check_input + ) else: - res_new = super(class_inst, self).fit( - X, y, check_input=check_input) + res_new = super(class_inst, self).fit(X, y, check_input=check_input) self._gap = res_new.dual_gap_ return res_new return res def _dual_gap(self): - if (self._gap is None): + if self._gap is None: l1_reg = self.alpha * self.l1_ratio * self._X.shape[0] l2_reg = self.alpha * (1.0 - self.l1_ratio) * self._X.shape[0] n_targets = self._y.shape[1] - if (n_targets == 1): + if n_targets == 1: self._gap = self.tol + 1.0 X_offset = np.average(self._X, axis=0) y_offset = np.average(self._y, axis=0) @@ -538,11 +575,10 @@ def _dual_gap(self): XtA = np.dot((self._X - X_offset).T, R) - l2_reg * coef R_norm2 = np.dot(R.T, R) coef_norm2 = np.dot(self.coef_, self.coef_) - dual_norm_XtA = np.max( - XtA) if self.positive else np.max(np.abs(XtA)) + dual_norm_XtA = np.max(XtA) if self.positive else np.max(np.abs(XtA)) if dual_norm_XtA > l1_reg: const = l1_reg / dual_norm_XtA - A_norm2 = R_norm2 * (const ** 2) + A_norm2 = R_norm2 * (const**2) self._gap = 0.5 * (R_norm2 + A_norm2) else: const = 1.0 @@ -550,7 +586,7 @@ def _dual_gap(self): l1_norm = np.sum(np.abs(self.coef_)) tmp = l1_reg * l1_norm tmp -= const * np.dot(R.T, (self._y - y_offset)) - tmp += 0.5 * l2_reg * (1 + const ** 2) * coef_norm2 + tmp += 0.5 * l2_reg * (1 + const**2) * coef_norm2 self._gap += tmp 
self._gap = self._gap[0][0] else: @@ -558,17 +594,16 @@ def _dual_gap(self): X_offset = np.average(self._X, axis=0) y_offset = np.average(self._y, axis=0) for k in range(n_targets): - R = (self._y[:, k] - y_offset[k]) - \ - np.dot((self._X - X_offset), self.coef_[k, :].T) - XtA = np.dot((self._X - X_offset).T, R) - \ - l2_reg * self.coef_[k, :].T + R = (self._y[:, k] - y_offset[k]) - np.dot( + (self._X - X_offset), self.coef_[k, :].T + ) + XtA = np.dot((self._X - X_offset).T, R) - l2_reg * self.coef_[k, :].T R_norm2 = np.dot(R.T, R) coef_norm2 = np.dot(self.coef_[k, :], self.coef_[k, :].T) - dual_norm_XtA = np.max( - XtA) if self.positive else np.max(np.abs(XtA)) + dual_norm_XtA = np.max(XtA) if self.positive else np.max(np.abs(XtA)) if dual_norm_XtA > l1_reg: const = l1_reg / dual_norm_XtA - A_norm2 = R_norm2 * (const ** 2) + A_norm2 = R_norm2 * (const**2) self._gap[k] = 0.5 * (R_norm2 + A_norm2) else: const = 1.0 @@ -576,7 +611,7 @@ def _dual_gap(self): l1_norm = np.sum(np.abs(self.coef_[k, :])) tmp = l1_reg * l1_norm tmp -= const * np.dot(R.T, (self._y[:, k] - y_offset[k])) - tmp += 0.5 * l2_reg * (1 + const ** 2) * coef_norm2 + tmp += 0.5 * l2_reg * (1 + const**2) * coef_norm2 self._gap[k] += tmp return self._gap @@ -584,7 +619,7 @@ def _dual_gap(self): class ElasticNet(ElasticNet_original): __doc__ = ElasticNet_original.__doc__ - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): _parameter_constraints: dict = {**ElasticNet_original._parameter_constraints} def __init__( @@ -599,7 +634,7 @@ def __init__( warm_start=False, positive=False, random_state=None, - selection='cyclic', + selection="cyclic", ): super(ElasticNet, self).__init__( alpha=alpha, @@ -614,13 +649,15 @@ def __init__( random_state=random_state, selection=selection, ) + else: + def __init__( self, alpha=1.0, l1_ratio=0.5, fit_intercept=True, - normalize="deprecated" if sklearn_check_version('1.0') else False, + normalize="deprecated" if sklearn_check_version("1.0") else False, precompute=False, max_iter=1000, copy_X=True, @@ -628,7 +665,7 @@ def __init__( warm_start=False, positive=False, random_state=None, - selection='cyclic', + selection="cyclic", ): super(ElasticNet, self).__init__( alpha=alpha, @@ -645,7 +682,8 @@ def __init__( selection=selection, ) - if sklearn_check_version('0.23'): + if sklearn_check_version("0.23"): + @support_usm_ndarray() def fit(self, X, y, sample_weight=None, check_input=True): """ @@ -685,7 +723,9 @@ def fit(self, X, y, sample_weight=None, check_input=True): initial data in memory directly using that format. """ return _fit(self, X, y, sample_weight=sample_weight, check_input=check_input) + else: + @support_usm_ndarray() def fit(self, X, y, check_input=True): """ @@ -730,25 +770,30 @@ def predict(self, X): Returns predicted values. """ - if sklearn_check_version('1.0'): + if sklearn_check_version("1.0"): self._check_feature_names(X, reset=False) X = check_array( - X, - accept_sparse=['csr', 'csc', 'coo'], - dtype=[np.float64, np.float32] + X, accept_sparse=["csr", "csc", "coo"], dtype=[np.float64, np.float32] ) - good_shape_for_daal = \ + good_shape_for_daal = ( True if X.ndim <= 1 else True if X.shape[0] >= X.shape[1] else False + ) _patching_status = PatchingConditionsChain( - "sklearn.linear_model.ElasticNet.predict") - _dal_ready = _patching_status.and_conditions([ - (hasattr(self, 'daal_model_'), "oneDAL model was not trained."), - (not sp.issparse(X), "X is sparse. 
Sparse input is not supported."), - (good_shape_for_daal, - "The shape of X does not satisfy oneDAL requirements: " - "number of features > number of samples.")]) + "sklearn.linear_model.ElasticNet.predict" + ) + _dal_ready = _patching_status.and_conditions( + [ + (hasattr(self, "daal_model_"), "oneDAL model was not trained."), + (not sp.issparse(X), "X is sparse. Sparse input is not supported."), + ( + good_shape_for_daal, + "The shape of X does not satisfy oneDAL requirements: " + "number of features > number of samples.", + ), + ] + ) _patching_status.write_log() if not _dal_ready: @@ -771,7 +816,8 @@ def dual_gap_(self): class Lasso(Lasso_original): __doc__ = Lasso_original.__doc__ - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): + def __init__( self, alpha=1.0, @@ -783,7 +829,7 @@ def __init__( warm_start=False, positive=False, random_state=None, - selection='cyclic', + selection="cyclic", ): self.l1_ratio = 1.0 super().__init__( @@ -798,12 +844,14 @@ def __init__( random_state=random_state, selection=selection, ) + else: + def __init__( self, alpha=1.0, fit_intercept=True, - normalize="deprecated" if sklearn_check_version('1.0') else False, + normalize="deprecated" if sklearn_check_version("1.0") else False, precompute=False, copy_X=True, max_iter=1000, @@ -811,7 +859,7 @@ def __init__( warm_start=False, positive=False, random_state=None, - selection='cyclic', + selection="cyclic", ): self.l1_ratio = 1.0 super().__init__( @@ -828,7 +876,8 @@ def __init__( selection=selection, ) - if sklearn_check_version('0.23'): + if sklearn_check_version("0.23"): + @support_usm_ndarray() def fit(self, X, y, sample_weight=None, check_input=True): """ @@ -868,7 +917,9 @@ def fit(self, X, y, sample_weight=None, check_input=True): initial data in memory directly using that format. """ return _fit(self, X, y, sample_weight, check_input) + else: + @support_usm_ndarray() def fit(self, X, y, check_input=True): """ @@ -912,24 +963,27 @@ def predict(self, X): C : array, shape = (n_samples,) Returns predicted values. """ - if sklearn_check_version('1.0'): + if sklearn_check_version("1.0"): self._check_feature_names(X, reset=False) X = check_array( - X, - accept_sparse=['csr', 'csc', 'coo'], - dtype=[np.float64, np.float32] + X, accept_sparse=["csr", "csc", "coo"], dtype=[np.float64, np.float32] ) - good_shape_for_daal = \ + good_shape_for_daal = ( True if X.ndim <= 1 else True if X.shape[0] >= X.shape[1] else False + ) - _patching_status = PatchingConditionsChain( - "sklearn.linear_model.Lasso.predict") - _dal_ready = _patching_status.and_conditions([ - (hasattr(self, 'daal_model_'), "oneDAL model was not trained."), - (not sp.issparse(X), "X is sparse. Sparse input is not supported."), - (good_shape_for_daal, - "The shape of X does not satisfy oneDAL requirements: " - "number of features > number of samples.")]) + _patching_status = PatchingConditionsChain("sklearn.linear_model.Lasso.predict") + _dal_ready = _patching_status.and_conditions( + [ + (hasattr(self, "daal_model_"), "oneDAL model was not trained."), + (not sp.issparse(X), "X is sparse. 
Sparse input is not supported."), + ( + good_shape_for_daal, + "The shape of X does not satisfy oneDAL requirements: " + "number of features > number of samples.", + ), + ] + ) _patching_status.write_log() if not _dal_ready: diff --git a/daal4py/sklearn/linear_model/_linear.py b/daal4py/sklearn/linear_model/_linear.py index 9883958fdb..d7044b585d 100644 --- a/daal4py/sklearn/linear_model/_linear.py +++ b/daal4py/sklearn/linear_model/_linear.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2014 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,19 +12,19 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== import numpy as np from scipy import sparse as sp +from sklearn.linear_model import LinearRegression as LinearRegression_original +from sklearn.utils import check_array -from ..utils.validation import _daal_check_array, _daal_check_X_y -from ..utils.base import _daal_validate_data -from .._utils import sklearn_check_version from .._device_offload import support_usm_ndarray -from sklearn.utils import check_array +from .._utils import sklearn_check_version +from ..utils.base import _daal_validate_data +from ..utils.validation import _daal_check_array, _daal_check_X_y -from sklearn.linear_model import LinearRegression as LinearRegression_original -if sklearn_check_version('1.0') and not sklearn_check_version('1.2'): +if sklearn_check_version("1.0") and not sklearn_check_version("1.2"): from sklearn.linear_model._base import _deprecate_normalize try: @@ -32,15 +32,18 @@ except ImportError: from sklearn.externals.joblib import Parallel, delayed +import logging + import daal4py + from .._utils import ( - make2d, - getFPType, + PatchingConditionsChain, + get_dtype, get_patch_message, + getFPType, is_DataFrame, - get_dtype, - PatchingConditionsChain) -import logging + make2d, +) def _daal4py_fit(self, X, y_): @@ -49,18 +52,14 @@ def _daal4py_fit(self, X, y_): try: lr_algorithm = daal4py.linear_regression_training( - fptype=X_fptype, - interceptFlag=bool(self.fit_intercept), - method='defaultDense' + fptype=X_fptype, interceptFlag=bool(self.fit_intercept), method="defaultDense" ) lr_res = lr_algorithm.compute(X, y) except RuntimeError: # Normal system is not invertible, try QR try: lr_algorithm = daal4py.linear_regression_training( - fptype=X_fptype, - interceptFlag=bool(self.fit_intercept), - method='qrDense' + fptype=X_fptype, interceptFlag=bool(self.fit_intercept), method="qrDense" ) lr_res = lr_algorithm.compute(X, y) except RuntimeError: @@ -71,8 +70,8 @@ def _daal4py_fit(self, X, y_): self.daal_model_ = lr_model coefs = lr_model.Beta - self.intercept_ = coefs[:, 0].copy(order='C') - self.coef_ = coefs[:, 1:].copy(order='C') + self.intercept_ = coefs[:, 0].copy(order="C") + self.coef_ = coefs[:, 1:].copy(order="C") self.n_features_in_ = X.shape[1] self.rank_ = X.shape[1] self.singular_ = np.full((X.shape[1],), np.nan) @@ -87,21 +86,19 @@ def _daal4py_fit(self, X, y_): def _daal4py_predict(self, X): X = make2d(X) _fptype = getFPType(self.coef_) - lr_pred = daal4py.linear_regression_prediction( - fptype=_fptype, - 
method='defaultDense' - ) - if sklearn_check_version('0.23'): + lr_pred = daal4py.linear_regression_prediction(fptype=_fptype, method="defaultDense") + if sklearn_check_version("0.23"): if X.shape[1] != self.n_features_in_: raise ValueError( - f'X has {X.shape[1]} features, ' - f'but LinearRegression is expecting ' - f'{self.n_features_in_} features as input') + f"X has {X.shape[1]} features, " + f"but LinearRegression is expecting " + f"{self.n_features_in_} features as input" + ) try: lr_res = lr_pred.compute(X, self.daal_model_) except RuntimeError: raise ValueError( - f'Input data shape {X.shape} is inconsistent with the trained model' + f"Input data shape {X.shape} is inconsistent with the trained model" ) res = lr_res.prediction if res.shape[1] == 1 and self.coef_.ndim == 1: @@ -134,13 +131,13 @@ def _fit_linear(self, X, y, sample_weight=None): """ params = { - 'X': X, - 'y': y, - 'accept_sparse': ['csr', 'csc', 'coo'], - 'y_numeric': True, - 'multi_output': True, + "X": X, + "y": y, + "accept_sparse": ["csr", "csc", "coo"], + "y_numeric": True, + "multi_output": True, } - if sklearn_check_version('0.23'): + if sklearn_check_version("0.23"): X, y = _daal_validate_data( self, dtype=[np.float64, np.float32], @@ -151,23 +148,35 @@ def _fit_linear(self, X, y, sample_weight=None): dtype = get_dtype(X) - self.fit_shape_good_for_daal_ = \ - bool(X.shape[0] > X.shape[1] + int(self.fit_intercept)) + self.fit_shape_good_for_daal_ = bool( + X.shape[0] > X.shape[1] + int(self.fit_intercept) + ) _patching_status = PatchingConditionsChain( - "sklearn.linear_model.LinearRegression.fit") - _patching_status.and_conditions([ - (not sp.issparse(X), "X is sparse. Sparse input is not supported."), - (self.fit_shape_good_for_daal_, - "The shape of X does not satisfy oneDAL requirements: " - "number of features + 1 >= number of samples."), - (sample_weight is None, "Sample weights are not supported.")]) - - if sklearn_check_version('0.22') and not sklearn_check_version('0.23'): - _patching_status.and_conditions([ - (dtype in [np.float32, np.float64], - f"'{X.dtype}' X data type is not supported. " - "Only np.float32 and np.float64 are supported.")]) + "sklearn.linear_model.LinearRegression.fit" + ) + _patching_status.and_conditions( + [ + (not sp.issparse(X), "X is sparse. Sparse input is not supported."), + ( + self.fit_shape_good_for_daal_, + "The shape of X does not satisfy oneDAL requirements: " + "number of features + 1 >= number of samples.", + ), + (sample_weight is None, "Sample weights are not supported."), + ] + ) + + if sklearn_check_version("0.22") and not sklearn_check_version("0.23"): + _patching_status.and_conditions( + [ + ( + dtype in [np.float32, np.float64], + f"'{X.dtype}' X data type is not supported. " + "Only np.float32 and np.float64 are supported.", + ) + ] + ) _dal_ready = _patching_status.get_status() _patching_status.write_log() @@ -177,7 +186,8 @@ def _fit_linear(self, X, y, sample_weight=None): return res logging.info( "sklearn.linar_model.LinearRegression." - "fit: " + get_patch_message("sklearn_after_daal")) + "fit: " + get_patch_message("sklearn_after_daal") + ) return super(LinearRegression, self).fit( X, @@ -199,30 +209,44 @@ def _predict_linear(self, X): C : array, shape = (n_samples,) Returns predicted values. 
""" - if sklearn_check_version('1.0'): + if sklearn_check_version("1.0"): self._check_feature_names(X, reset=False) is_df = is_DataFrame(X) - if sklearn_check_version('0.23'): - X = check_array(X, accept_sparse='csr', dtype=[np.float64, np.float32]) + if sklearn_check_version("0.23"): + X = check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) X = np.asarray(X) if not sp.issparse(X) and not is_df else X - good_shape_for_daal = \ + good_shape_for_daal = ( True if X.ndim <= 1 else True if X.shape[0] > X.shape[1] else False + ) _patching_status = PatchingConditionsChain( - "sklearn.linear_model.LinearRegression.predict") - _dal_ready = _patching_status.and_conditions([ - (hasattr(self, 'daal_model_'), 'oneDAL model was not trained.'), - (not sp.issparse(X), "X is sparse. Sparse input is not supported."), - (good_shape_for_daal, - "The shape of X does not satisfy oneDAL requirements: " - "Number of features >= number of samples."), - (not hasattr(self, 'sample_weight_') or self.sample_weight_ is None, - "Sample weights are not supported.")]) - if hasattr(self, 'fit_shape_good_for_daal_'): - _dal_ready = _patching_status.and_conditions([ - (self.fit_shape_good_for_daal_, - "The shape of X (fitting) does not satisfy oneDAL requirements: " - "Number of features + 1 >= number of samples.")]) + "sklearn.linear_model.LinearRegression.predict" + ) + _dal_ready = _patching_status.and_conditions( + [ + (hasattr(self, "daal_model_"), "oneDAL model was not trained."), + (not sp.issparse(X), "X is sparse. Sparse input is not supported."), + ( + good_shape_for_daal, + "The shape of X does not satisfy oneDAL requirements: " + "Number of features >= number of samples.", + ), + ( + not hasattr(self, "sample_weight_") or self.sample_weight_ is None, + "Sample weights are not supported.", + ), + ] + ) + if hasattr(self, "fit_shape_good_for_daal_"): + _dal_ready = _patching_status.and_conditions( + [ + ( + self.fit_shape_good_for_daal_, + "The shape of X (fitting) does not satisfy oneDAL requirements: " + "Number of features + 1 >= number of samples.", + ) + ] + ) _patching_status.write_log() if not _dal_ready: return self._decision_function(X) @@ -233,7 +257,7 @@ def _predict_linear(self, X): class LinearRegression(LinearRegression_original): __doc__ = LinearRegression_original.__doc__ - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): _parameter_constraints: dict = { **LinearRegression_original._parameter_constraints } @@ -251,11 +275,13 @@ def __init__( n_jobs=n_jobs, positive=positive, ) - elif sklearn_check_version('0.24'): + + elif sklearn_check_version("0.24"): + def __init__( self, fit_intercept=True, - normalize='deprecated' if sklearn_check_version('1.0') else False, + normalize="deprecated" if sklearn_check_version("1.0") else False, copy_X=True, n_jobs=None, positive=False, @@ -267,7 +293,9 @@ def __init__( n_jobs=n_jobs, positive=positive, ) + else: + def __init__( self, fit_intercept=True, @@ -279,7 +307,7 @@ def __init__( fit_intercept=fit_intercept, normalize=normalize, copy_X=copy_X, - n_jobs=n_jobs + n_jobs=n_jobs, ) @support_usm_ndarray() @@ -305,23 +333,29 @@ def fit(self, X, y, sample_weight=None): self : object Fitted Estimator. 
""" - if sklearn_check_version('1.0') and not sklearn_check_version('1.2'): + if sklearn_check_version("1.0") and not sklearn_check_version("1.2"): self._normalize = _deprecate_normalize( self.normalize, default=False, estimator_name=self.__class__.__name__, ) - if sklearn_check_version('1.0'): + if sklearn_check_version("1.0"): self._check_feature_names(X, reset=True) if sklearn_check_version("1.2"): self._validate_params() - if sklearn_check_version('0.24'): + if sklearn_check_version("0.24"): _patching_status = PatchingConditionsChain( - "sklearn.linear_model.LinearRegression.fit") - _dal_ready = _patching_status.and_conditions([ - (self.positive is False, - "Forced positive coefficients are not supported.")]) + "sklearn.linear_model.LinearRegression.fit" + ) + _dal_ready = _patching_status.and_conditions( + [ + ( + self.positive is False, + "Forced positive coefficients are not supported.", + ) + ] + ) if not _dal_ready: _patching_status.write_log() return super(LinearRegression, self).fit( diff --git a/daal4py/sklearn/linear_model/_ridge.py b/daal4py/sklearn/linear_model/_ridge.py index b3bf466aca..06fd2ade02 100644 --- a/daal4py/sklearn/linear_model/_ridge.py +++ b/daal4py/sklearn/linear_model/_ridge.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2014 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,25 +12,31 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== +import logging import numbers + import numpy as np from scipy import sparse as sp -from sklearn.utils import check_array, check_X_y -from sklearn.linear_model._ridge import _BaseRidge from sklearn.linear_model._ridge import Ridge as Ridge_original +from sklearn.linear_model._ridge import _BaseRidge +from sklearn.utils import check_array, check_X_y import daal4py -from .._utils import ( - make2d, getFPType, get_patch_message, sklearn_check_version, - PatchingConditionsChain) -from .._device_offload import support_usm_ndarray -import logging -if sklearn_check_version('1.0') and not sklearn_check_version('1.2'): +from .._device_offload import support_usm_ndarray +from .._utils import ( + PatchingConditionsChain, + get_patch_message, + getFPType, + make2d, + sklearn_check_version, +) + +if sklearn_check_version("1.0") and not sklearn_check_version("1.2"): from sklearn.linear_model._base import _deprecate_normalize -if sklearn_check_version('1.1') and not sklearn_check_version('1.2'): +if sklearn_check_version("1.1") and not sklearn_check_version("1.2"): from sklearn.utils import check_scalar @@ -44,14 +50,15 @@ def _daal4py_fit(self, X, y_): if ridge_params.size != 1 and ridge_params.size != y.shape[1]: raise ValueError( "Number of targets and number of penalties do not correspond: " - f"{ridge_params.size} != {y.shape[1]}") + f"{ridge_params.size} != {y.shape[1]}" + ) ridge_params = ridge_params.reshape((1, -1)) ridge_alg = daal4py.ridge_regression_training( fptype=_fptype, - method='defaultDense', + method="defaultDense", interceptFlag=(self.fit_intercept is True), - ridgeParameters=ridge_params + ridgeParameters=ridge_params, ) try: ridge_res = 
ridge_alg.compute(X, y) @@ -62,8 +69,8 @@ def _daal4py_fit(self, X, y_): self.daal_model_ = ridge_model coefs = ridge_model.Beta - self.intercept_ = coefs[:, 0].copy(order='C') - self.coef_ = coefs[:, 1:].copy(order='C') + self.intercept_ = coefs[:, 0].copy(order="C") + self.coef_ = coefs[:, 1:].copy(order="C") if self.coef_.shape[0] == 1 and y_.ndim == 1: self.coef_ = np.ravel(self.coef_) @@ -77,13 +84,12 @@ def _daal4py_predict(self, X): _fptype = getFPType(self.coef_) ridge_palg = daal4py.ridge_regression_prediction( - fptype=_fptype, - method='defaultDense' + fptype=_fptype, method="defaultDense" ) if self.n_features_in_ != X.shape[1]: raise ValueError( - f'X has {X.shape[1]} features, ' - f'but Ridge is expecting {self.n_features_in_} features as input' + f"X has {X.shape[1]} features, " + f"but Ridge is expecting {self.n_features_in_} features as input" ) ridge_res = ridge_palg.compute(X, self.daal_model_) @@ -112,17 +118,15 @@ def _fit_ridge(self, X, y, sample_weight=None): ------- self : returns an instance of self. """ - if sklearn_check_version('1.0') and not sklearn_check_version('1.2'): + if sklearn_check_version("1.0") and not sklearn_check_version("1.2"): self._normalize = _deprecate_normalize( - self.normalize, - default=False, - estimator_name=self.__class__.__name__ + self.normalize, default=False, estimator_name=self.__class__.__name__ ) - if sklearn_check_version('1.0'): + if sklearn_check_version("1.0"): self._check_feature_names(X, reset=True) if sklearn_check_version("1.2"): self._validate_params() - elif sklearn_check_version('1.1'): + elif sklearn_check_version("1.1"): if self.max_iter is not None: self.max_iter = check_scalar( self.max_iter, "max_iter", target_type=numbers.Integral, min_val=1 @@ -137,40 +141,57 @@ def _fit_ridge(self, X, y, sample_weight=None): include_boundaries="left", ) - X, y = check_X_y(X, y, ['csr', 'csc', 'coo'], dtype=[np.float64, np.float32], - multi_output=True, y_numeric=True) + X, y = check_X_y( + X, + y, + ["csr", "csc", "coo"], + dtype=[np.float64, np.float32], + multi_output=True, + y_numeric=True, + ) self.n_features_in_ = X.shape[1] self.sample_weight_ = sample_weight self.fit_shape_good_for_daal_ = True if X.shape[0] >= X.shape[1] else False - _patching_status = PatchingConditionsChain( - "sklearn.linear_model.Ridge.fit") - _dal_ready = _patching_status.and_conditions([ - (self.solver == 'auto', - f"'{self.solver}' solver is not supported. " - "Only 'auto' solver is supported."), - (not sp.issparse(X), "X is sparse. Sparse input is not supported."), - (self.fit_shape_good_for_daal_, - "The shape of X does not satisfy oneDAL requirements: " - "number of features > number of samples."), - (X.dtype == np.float64 or X.dtype == np.float32, - f"'{X.dtype}' X data type is not supported. " - "Only np.float32 and np.float64 are supported."), - (sample_weight is None, "Sample weights are not supported."), - (not (hasattr(self, 'positive') and self.positive), - "Forced positive coefficients are not supported.")]) + _patching_status = PatchingConditionsChain("sklearn.linear_model.Ridge.fit") + _dal_ready = _patching_status.and_conditions( + [ + ( + self.solver == "auto", + f"'{self.solver}' solver is not supported. " + "Only 'auto' solver is supported.", + ), + (not sp.issparse(X), "X is sparse. 
Sparse input is not supported."), + ( + self.fit_shape_good_for_daal_, + "The shape of X does not satisfy oneDAL requirements: " + "number of features > number of samples.", + ), + ( + X.dtype == np.float64 or X.dtype == np.float32, + f"'{X.dtype}' X data type is not supported. " + "Only np.float32 and np.float64 are supported.", + ), + (sample_weight is None, "Sample weights are not supported."), + ( + not (hasattr(self, "positive") and self.positive), + "Forced positive coefficients are not supported.", + ), + ] + ) _patching_status.write_log() if not _dal_ready: - if hasattr(self, 'daal_model_'): + if hasattr(self, "daal_model_"): del self.daal_model_ return super(Ridge, self).fit(X, y, sample_weight=sample_weight) self.n_iter_ = None res = _daal4py_fit(self, X, y) if res is None: logging.info( - "sklearn.linear_model.Ridge.fit: " + get_patch_message("sklearn_after_daal")) - if hasattr(self, 'daal_model_'): + "sklearn.linear_model.Ridge.fit: " + get_patch_message("sklearn_after_daal") + ) + if hasattr(self, "daal_model_"): del self.daal_model_ return super(Ridge, self).fit(X, y, sample_weight=sample_weight) return res @@ -189,30 +210,42 @@ def _predict_ridge(self, X): C : array, shape = (n_samples,) Returns predicted values. """ - if sklearn_check_version('1.0'): + if sklearn_check_version("1.0"): self._check_feature_names(X, reset=False) X = check_array( - X, accept_sparse=['csr', 'csc', 'coo'], dtype=[np.float64, np.float32]) - good_shape_for_daal = \ + X, accept_sparse=["csr", "csc", "coo"], dtype=[np.float64, np.float32] + ) + good_shape_for_daal = ( True if X.ndim <= 1 else True if X.shape[0] >= X.shape[1] else False + ) - _patching_status = PatchingConditionsChain( - "sklearn.linear_model.Ridge.predict") - _dal_ready = _patching_status.and_conditions([ - (self.solver == 'auto', - f"'{self.solver}' solver is not supported. " - "Only 'auto' solver is supported."), - (hasattr(self, 'daal_model_'), "oneDAL model was not trained."), - (not sp.issparse(X), "X is sparse. Sparse input is not supported."), - (good_shape_for_daal, - "The shape of X does not satisfy oneDAL requirements: " - "number of features > number of samples."), - (X.dtype == np.float64 or X.dtype == np.float32, - f"'{X.dtype}' X data type is not supported. " - "Only np.float32 and np.float64 are supported."), - (not hasattr(self, 'sample_weight_') or self.sample_weight_ is None, - "Sample weights are not supported.")]) + _patching_status = PatchingConditionsChain("sklearn.linear_model.Ridge.predict") + _dal_ready = _patching_status.and_conditions( + [ + ( + self.solver == "auto", + f"'{self.solver}' solver is not supported. " + "Only 'auto' solver is supported.", + ), + (hasattr(self, "daal_model_"), "oneDAL model was not trained."), + (not sp.issparse(X), "X is sparse. Sparse input is not supported."), + ( + good_shape_for_daal, + "The shape of X does not satisfy oneDAL requirements: " + "number of features > number of samples.", + ), + ( + X.dtype == np.float64 or X.dtype == np.float32, + f"'{X.dtype}' X data type is not supported. 
" + "Only np.float32 and np.float64 are supported.", + ), + ( + not hasattr(self, "sample_weight_") or self.sample_weight_ is None, + "Sample weights are not supported.", + ), + ] + ) _patching_status.write_log() if not _dal_ready: @@ -223,7 +256,7 @@ def _predict_ridge(self, X): class Ridge(Ridge_original, _BaseRidge): __doc__ = Ridge_original.__doc__ - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): _parameter_constraints: dict = {**Ridge_original._parameter_constraints} def __init__( @@ -245,12 +278,14 @@ def __init__( self.solver = solver self.positive = positive self.random_state = random_state - elif sklearn_check_version('1.0'): + + elif sklearn_check_version("1.0"): + def __init__( self, alpha=1.0, fit_intercept=True, - normalize='deprecated', + normalize="deprecated", copy_X=True, max_iter=None, tol=1e-3, @@ -267,7 +302,9 @@ def __init__( self.solver = solver self.positive = positive self.random_state = random_state + else: + def __init__( self, alpha=1.0, diff --git a/daal4py/sklearn/linear_model/coordinate_descent.py b/daal4py/sklearn/linear_model/coordinate_descent.py index 2519306665..a70fcb3f80 100755 --- a/daal4py/sklearn/linear_model/coordinate_descent.py +++ b/daal4py/sklearn/linear_model/coordinate_descent.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,6 +12,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from ._coordinate_descent import * diff --git a/daal4py/sklearn/linear_model/linear.py b/daal4py/sklearn/linear_model/linear.py index 5f1970460a..5325b86de5 100644 --- a/daal4py/sklearn/linear_model/linear.py +++ b/daal4py/sklearn/linear_model/linear.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2014 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,6 +12,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from ._linear import * diff --git a/daal4py/sklearn/linear_model/logistic_loss.py b/daal4py/sklearn/linear_model/logistic_loss.py index 9aea83cfe8..a1a1c4cba6 100644 --- a/daal4py/sklearn/linear_model/logistic_loss.py +++ b/daal4py/sklearn/linear_model/logistic_loss.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2014 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,29 +12,39 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== import numpy as np import daal4py -from .._utils import (make2d, getFPType) + +from .._utils import getFPType, make2d def _resultsToCompute_string(value=True, gradient=True, hessian=False): results_needed = [] if value: - results_needed.append('value') + results_needed.append("value") if gradient: - results_needed.append('gradient') + results_needed.append("gradient") if hessian: - results_needed.append('hessian') - - return '|'.join(results_needed) - - -def _daal4py_logistic_loss_extra_args(nClasses_unused, beta, X, y, - l1=0.0, l2=0.0, fit_intercept=True, - value=True, gradient=True, hessian=False): + results_needed.append("hessian") + + return "|".join(results_needed) + + +def _daal4py_logistic_loss_extra_args( + nClasses_unused, + beta, + X, + y, + l1=0.0, + l2=0.0, + fit_intercept=True, + value=True, + gradient=True, + hessian=False, +): X = make2d(X) nSamples, nFeatures = X.shape @@ -43,26 +53,35 @@ def _daal4py_logistic_loss_extra_args(nClasses_unused, beta, X, y, n = X.shape[0] results_to_compute = _resultsToCompute_string( - value=value, gradient=gradient, hessian=hessian) - - objective_function_algorithm_instance = \ - daal4py.optimization_solver_logistic_loss( - numberOfTerms=n, - fptype=getFPType(X), - method='defaultDense', - interceptFlag=fit_intercept, - penaltyL1=l1 / n, - penaltyL2=l2 / n, - resultsToCompute=results_to_compute - ) + value=value, gradient=gradient, hessian=hessian + ) + + objective_function_algorithm_instance = daal4py.optimization_solver_logistic_loss( + numberOfTerms=n, + fptype=getFPType(X), + method="defaultDense", + interceptFlag=fit_intercept, + penaltyL1=l1 / n, + penaltyL2=l2 / n, + resultsToCompute=results_to_compute, + ) objective_function_algorithm_instance.setup(X, y, beta) return (objective_function_algorithm_instance, X, y, n) -def _daal4py_cross_entropy_loss_extra_args(nClasses, beta, X, y, - l1=0.0, l2=0.0, fit_intercept=True, - value=True, gradient=True, hessian=False): +def _daal4py_cross_entropy_loss_extra_args( + nClasses, + beta, + X, + y, + l1=0.0, + l2=0.0, + fit_intercept=True, + value=True, + gradient=True, + hessian=False, +): X = make2d(X) nSamples, nFeatures = X.shape y = make2d(y) @@ -70,19 +89,21 @@ def _daal4py_cross_entropy_loss_extra_args(nClasses, beta, X, y, n = X.shape[0] results_to_compute = _resultsToCompute_string( - value=value, gradient=gradient, hessian=hessian) + value=value, gradient=gradient, hessian=hessian + ) - objective_function_algorithm_instance = \ + objective_function_algorithm_instance = ( daal4py.optimization_solver_cross_entropy_loss( nClasses=nClasses, numberOfTerms=n, fptype=getFPType(X), - method='defaultDense', + method="defaultDense", interceptFlag=fit_intercept, penaltyL1=l1 / n, penaltyL2=l2 / n, - resultsToCompute=results_to_compute + resultsToCompute=results_to_compute, ) + ) objective_function_algorithm_instance.setup(X, y, beta) return (objective_function_algorithm_instance, X, y, n) @@ -150,6 +171,7 @@ def hessp(v): res[1:] = np.dot(pp0, X) res[1:] += (2 * l2) * v[1:] return res + else: # dealing with multi-class logistic regression beta__ = beta_.reshape((-1, 1 + X.shape[1])) # (nClasses, nSamples) diff --git a/daal4py/sklearn/linear_model/logistic_path.py b/daal4py/sklearn/linear_model/logistic_path.py index 
85d4165e32..93ada4aada 100755 --- a/daal4py/sklearn/linear_model/logistic_path.py +++ b/daal4py/sklearn/linear_model/logistic_path.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2014 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,39 +12,48 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== -import daal4py as d4p -import numpy as np -import scipy.sparse as sparse -import scipy.optimize as optimize import numbers -from .._utils import ( - getFPType, sklearn_check_version, PatchingConditionsChain) -from .logistic_loss import (_daal4py_loss_and_grad, - _daal4py_logistic_loss_extra_args, - _daal4py_cross_entropy_loss_extra_args, - _daal4py_loss_, _daal4py_grad_, - _daal4py_grad_hess_) +import numpy as np +import scipy.optimize as optimize +import scipy.sparse as sparse import sklearn.linear_model._logistic as logistic_module - -from sklearn.utils import (check_array, - check_consistent_length, - compute_class_weight, - check_random_state) -from sklearn.utils.validation import _check_sample_weight, check_is_fitted from sklearn.linear_model._sag import sag_solver -from sklearn.utils.optimize import _newton_cg, _check_optimize_result -if sklearn_check_version('1.1'): - from sklearn.linear_model._linear_loss import LinearModelLoss +from sklearn.utils import ( + check_array, + check_consistent_length, + check_random_state, + compute_class_weight, +) +from sklearn.utils.optimize import _check_optimize_result, _newton_cg +from sklearn.utils.validation import _check_sample_weight, check_is_fitted + +import daal4py as d4p + +from .._utils import PatchingConditionsChain, getFPType, sklearn_check_version +from .logistic_loss import ( + _daal4py_cross_entropy_loss_extra_args, + _daal4py_grad_, + _daal4py_grad_hess_, + _daal4py_logistic_loss_extra_args, + _daal4py_loss_, + _daal4py_loss_and_grad, +) + +if sklearn_check_version("1.1"): from sklearn._loss.loss import HalfBinomialLoss, HalfMultinomialLoss + from sklearn.linear_model._linear_loss import LinearModelLoss + from sklearn.linear_model._logistic import _LOGISTIC_SOLVER_CONVERGENCE_MSG + from sklearn.linear_model._logistic import ( + LogisticRegression as LogisticRegression_original, + ) from sklearn.linear_model._logistic import ( - _check_solver, _check_multi_class, + _check_solver, _fit_liblinear, - _LOGISTIC_SOLVER_CONVERGENCE_MSG, - LogisticRegression as LogisticRegression_original) + ) else: from sklearn.linear_model._logistic import ( _check_solver, @@ -57,9 +66,12 @@ _multinomial_loss_grad, _multinomial_grad_hess, _LOGISTIC_SOLVER_CONVERGENCE_MSG, - LogisticRegression as LogisticRegression_original) + LogisticRegression as LogisticRegression_original, + ) + from sklearn.linear_model._logistic import _logistic_regression_path as lr_path_original -from sklearn.preprocessing import LabelEncoder, LabelBinarizer +from sklearn.preprocessing import LabelBinarizer, LabelEncoder + from .._device_offload import support_usm_ndarray @@ -73,13 +85,13 @@ def __logistic_regression_path( max_iter=100, tol=1e-4, verbose=0, - solver='lbfgs', + solver="lbfgs", coef=None, 
class_weight=None, dual=False, - penalty='l2', - intercept_scaling=1., - multi_class='warn', + penalty="l2", + intercept_scaling=1.0, + multi_class="warn", random_state=None, check_input=True, max_squared_sum=None, @@ -237,22 +249,35 @@ def __logistic_regression_path( The "copy" parameter was removed. """ _patching_status = PatchingConditionsChain( - "sklearn.linear_model.LogisticRegression.fit") + "sklearn.linear_model.LogisticRegression.fit" + ) # TODO: remove this fallback workaround after # logistic path is reworked to align with sklearn 1.2 - _dal_ready = _patching_status.and_conditions([ - (not (sklearn_check_version('1.2') and solver == 'newton-cholesky'), - f"'{solver}' solver is not supported. " - "Only 'lbfgs' and 'newton-cg' solvers are supported.")]) + _dal_ready = _patching_status.and_conditions( + [ + ( + not (sklearn_check_version("1.2") and solver == "newton-cholesky"), + f"'{solver}' solver is not supported. " + "Only 'lbfgs' and 'newton-cg' solvers are supported.", + ) + ] + ) if not _dal_ready: _patching_status.write_log() return lr_path_original( - X, y, pos_class=pos_class, - Cs=Cs, fit_intercept=fit_intercept, - max_iter=max_iter, tol=tol, verbose=verbose, - solver=solver, coef=coef, + X, + y, + pos_class=pos_class, + Cs=Cs, + fit_intercept=fit_intercept, + max_iter=max_iter, + tol=tol, + verbose=verbose, + solver=solver, + coef=coef, class_weight=class_weight, - dual=dual, penalty=penalty, + dual=dual, + penalty=penalty, intercept_scaling=intercept_scaling, multi_class=multi_class, random_state=random_state, @@ -260,7 +285,7 @@ def __logistic_regression_path( max_squared_sum=max_squared_sum, sample_weight=sample_weight, l1_ratio=l1_ratio, - n_threads=n_threads + n_threads=n_threads, ) if isinstance(Cs, numbers.Integral): @@ -270,19 +295,19 @@ def __logistic_regression_path( # Preprocessing. if check_input: - if sklearn_check_version('1.1'): + if sklearn_check_version("1.1"): X = check_array( X, - accept_sparse='csr', + accept_sparse="csr", dtype=np.float64, accept_large_sparse=solver not in ["liblinear", "sag", "saga"], ) else: X = check_array( X, - accept_sparse='csr', + accept_sparse="csr", dtype=np.float64, - accept_large_sparse=solver != 'liblinear', + accept_large_sparse=solver != "liblinear", ) y = check_array(y, ensure_2d=False, dtype=None) check_consistent_length(X, y) @@ -292,45 +317,50 @@ def __logistic_regression_path( random_state = check_random_state(random_state) multi_class = _check_multi_class(multi_class, solver, len(classes)) - if pos_class is None and multi_class != 'multinomial': - if (classes.size > 2): - raise ValueError('To fit OvR, use the pos_class argument') + if pos_class is None and multi_class != "multinomial": + if classes.size > 2: + raise ValueError("To fit OvR, use the pos_class argument") # np.unique(y) gives labels in sorted order. pos_class = classes[1] - _dal_ready = _patching_status.and_conditions([ - (solver in ['lbfgs', 'newton-cg'], - f"'{solver}' solver is not supported. " - "Only 'lbfgs' and 'newton-cg' solvers are supported."), - (not sparse.issparse(X), "X is sparse. Sparse input is not supported."), - (sample_weight is None, "Sample weights are not supported."), - (class_weight is None, "Class weights are not supported.")]) + _dal_ready = _patching_status.and_conditions( + [ + ( + solver in ["lbfgs", "newton-cg"], + f"'{solver}' solver is not supported. " + "Only 'lbfgs' and 'newton-cg' solvers are supported.", + ), + (not sparse.issparse(X), "X is sparse. 
Sparse input is not supported."), + (sample_weight is None, "Sample weights are not supported."), + (class_weight is None, "Class weights are not supported."), + ] + ) if not _dal_ready: - if sklearn_check_version('0.24'): - sample_weight = _check_sample_weight(sample_weight, X, - dtype=X.dtype, - copy=True) + if sklearn_check_version("0.24"): + sample_weight = _check_sample_weight( + sample_weight, X, dtype=X.dtype, copy=True + ) else: - sample_weight = _check_sample_weight(sample_weight, X, - dtype=X.dtype) + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) # If class_weights is a dict (provided by the user), the weights # are assigned to the original labels. If it is "balanced", then # the class_weights are assigned after masking the labels with a OvR. le = LabelEncoder() - if (isinstance(class_weight, dict) or multi_class == 'multinomial') and \ - not _dal_ready: + if ( + isinstance(class_weight, dict) or multi_class == "multinomial" + ) and not _dal_ready: class_weight_ = compute_class_weight(class_weight, classes=classes, y=y) if not np.allclose(class_weight_, np.ones_like(class_weight_)): sample_weight *= class_weight_[le.fit_transform(y)] # For doing a ovr, we need to mask the labels first. for the # multinomial case this is not necessary. - if multi_class == 'ovr': + if multi_class == "ovr": y_bin = np.ones(y.shape, dtype=X.dtype) - if sklearn_check_version('1.1'): - mask = (y == pos_class) + if sklearn_check_version("1.1"): + mask = y == pos_class y_bin = np.ones(y.shape, dtype=X.dtype) # for compute_class_weight @@ -344,24 +374,25 @@ def __logistic_regression_path( y_bin[~mask] = -1.0 else: mask_classes = np.array([-1, 1]) - mask = (y == pos_class) - y_bin[~mask] = -1. + mask = y == pos_class + y_bin[~mask] = -1.0 # for compute_class_weight if class_weight == "balanced" and not _dal_ready: - class_weight_ = compute_class_weight(class_weight, classes=mask_classes, - y=y_bin) + class_weight_ = compute_class_weight( + class_weight, classes=mask_classes, y=y_bin + ) if not np.allclose(class_weight_, np.ones_like(class_weight_)): sample_weight *= class_weight_[le.fit_transform(y_bin)] if _dal_ready: w0 = np.zeros(n_features + 1, dtype=X.dtype) - y_bin[~mask] = 0. + y_bin[~mask] = 0.0 else: w0 = np.zeros(n_features + int(fit_intercept), dtype=X.dtype) else: - if sklearn_check_version('1.1'): + if sklearn_check_version("1.1"): if solver in ["sag", "saga", "lbfgs", "newton-cg"]: # SAG, lbfgs and newton-cg multinomial solvers need LabelEncoder, # not LabelBinarizer, i.e. y as a 1d-array of integers. 
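The class-weight handling above folds per-class weights into per-sample weights through a LabelEncoder lookup. A minimal standalone sketch of that mapping, using only the scikit-learn utilities already imported in this module (the toy labels below are illustrative, not from this repository):

```python
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import compute_class_weight

y = np.array([0, 0, 0, 1])  # illustrative, imbalanced binary labels
classes = np.unique(y)

# "balanced" weights are inversely proportional to class frequency
class_weight_ = compute_class_weight("balanced", classes=classes, y=y)

# fold per-class weights into per-sample weights, as the hunk above does
le = LabelEncoder()
sample_weight = np.ones_like(y, dtype=np.float64)
sample_weight *= class_weight_[le.fit_transform(y)]

print(class_weight_)   # approx. [0.667 2.0] for this 3:1 split
print(sample_weight)   # approx. [0.667 0.667 0.667 2.0]
```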
@@ -379,7 +410,7 @@ def __logistic_regression_path( if Y_multi.shape[1] == 1: Y_multi = np.hstack([1 - Y_multi, Y_multi]) else: - if solver not in ['sag', 'saga']: + if solver not in ["sag", "saga"]: if _dal_ready: Y_multi = le.fit_transform(y).astype(X.dtype, copy=False) else: @@ -393,24 +424,26 @@ def __logistic_regression_path( Y_multi = le.fit_transform(y).astype(X.dtype, copy=False) if _dal_ready: - w0 = np.zeros((classes.size, n_features + 1), - order='C', dtype=X.dtype) + w0 = np.zeros((classes.size, n_features + 1), order="C", dtype=X.dtype) else: - w0 = np.zeros((classes.size, n_features + int(fit_intercept)), - order='F', dtype=X.dtype) + w0 = np.zeros( + (classes.size, n_features + int(fit_intercept)), order="F", dtype=X.dtype + ) if coef is not None: # it must work both giving the bias term and not - if multi_class == 'ovr': + if multi_class == "ovr": if coef.size not in (n_features, w0.size): raise ValueError( - 'Initialization coef is of shape %d, expected shape ' - '%d or %d' % (coef.size, n_features, w0.size)) + "Initialization coef is of shape %d, expected shape " + "%d or %d" % (coef.size, n_features, w0.size) + ) if _dal_ready: - w0[-coef.size:] = \ + w0[-coef.size :] = ( np.roll(coef, 1, -1) if coef.size != n_features else coef + ) else: - w0[:coef.size] = coef + w0[: coef.size] = coef else: # For binary problems coef.shape[0] should be 1, otherwise it # should be classes.size. @@ -418,49 +451,59 @@ def __logistic_regression_path( if n_classes == 2: n_classes = 1 - if coef.shape[0] != n_classes or \ - coef.shape[1] not in (n_features, n_features + 1): + if coef.shape[0] != n_classes or coef.shape[1] not in ( + n_features, + n_features + 1, + ): raise ValueError( - 'Initialization coef is of shape (%d, %d), expected ' - 'shape (%d, %d) or (%d, %d)' % ( - coef.shape[0], coef.shape[1], classes.size, - n_features, classes.size, n_features + 1)) + "Initialization coef is of shape (%d, %d), expected " + "shape (%d, %d) or (%d, %d)" + % ( + coef.shape[0], + coef.shape[1], + classes.size, + n_features, + classes.size, + n_features + 1, + ) + ) if _dal_ready: - w0[:, -coef.shape[1]:] = \ + w0[:, -coef.shape[1] :] = ( np.roll(coef, 1, -1) if coef.shape[1] != n_features else coef + ) else: if n_classes == 1: - w0[0, :coef.shape[1]] = -coef - w0[1, :coef.shape[1]] = coef + w0[0, : coef.shape[1]] = -coef + w0[1, : coef.shape[1]] = coef else: - w0[:, :coef.shape[1]] = coef + w0[:, : coef.shape[1]] = coef C_daal_multiplier = 1 # commented out because this is Py3 feature - #def _map_to_binary_logistic_regression(): + # def _map_to_binary_logistic_regression(): # nonlocal C_daal_multiplier # nonlocal w0 # C_daal_multiplier = 2 # w0 *= 2 - if multi_class == 'multinomial': + if multi_class == "multinomial": # fmin_l_bfgs_b and newton-cg accepts only ravelled parameters. 
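Further down, the lbfgs branch hands a ravelled coefficient vector and a callable returning (loss, gradient) to scipy's L-BFGS-B driver. A self-contained sketch of that calling convention with a toy quadratic objective (the names and data below are illustrative only, not the real loss functions from this module):

```python
import numpy as np
from scipy import optimize


def loss_and_grad(w, A, b):
    # returns (value, gradient) in one call: the contract that jac=True expects
    r = A @ w - b
    return 0.5 * r @ r, A.T @ r


A = np.array([[3.0, 1.0], [1.0, 2.0]])
b = np.array([1.0, 0.0])
w0 = np.zeros(2)  # ravelled starting point, as in the path above

opt_res = optimize.minimize(
    loss_and_grad,
    w0,
    method="L-BFGS-B",
    jac=True,
    args=(A, b),
    options={"gtol": 1e-8, "maxiter": 100},
)
print(opt_res.x, opt_res.fun)  # minimizer of 0.5 * ||A w - b||^2 and its value
```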
- if solver in ['lbfgs', 'newton-cg']: + if solver in ["lbfgs", "newton-cg"]: if _dal_ready and classes.size == 2: w0 = w0[-1:, :] - if sklearn_check_version('1.1'): + if sklearn_check_version("1.1"): w0 = w0.ravel(order="F") else: w0 = w0.ravel() target = Y_multi loss = None - if sklearn_check_version('1.1'): + if sklearn_check_version("1.1"): loss = LinearModelLoss( base_loss=HalfMultinomialLoss(n_classes=classes.size), fit_intercept=fit_intercept, ) - if solver == 'lbfgs': + if solver == "lbfgs": if _dal_ready: if classes.size == 2: # _map_to_binary_logistic_regression() @@ -471,12 +514,14 @@ def __logistic_regression_path( daal_extra_args_func = _daal4py_cross_entropy_loss_extra_args func = _daal4py_loss_and_grad else: - if sklearn_check_version('1.1') and loss is not None: + if sklearn_check_version("1.1") and loss is not None: func = loss.loss_gradient else: + def func(x, *args): return _multinomial_loss_grad(x, *args)[0:2] - elif solver == 'newton-cg': + + elif solver == "newton-cg": if _dal_ready: if classes.size == 2: # _map_to_binary_logistic_regression() @@ -489,40 +534,42 @@ def func(x, *args): grad = _daal4py_grad_ hess = _daal4py_grad_hess_ else: - if sklearn_check_version('1.1') and loss is not None: + if sklearn_check_version("1.1") and loss is not None: func = loss.loss grad = loss.gradient hess = loss.gradient_hessian_product # hess = [gradient, hessp] else: + def func(x, *args): return _multinomial_loss(x, *args)[0] def grad(x, *args): return _multinomial_loss_grad(x, *args)[1] + hess = _multinomial_grad_hess - warm_start_sag = {'coef': w0.T} + warm_start_sag = {"coef": w0.T} else: target = y_bin - if solver == 'lbfgs': + if solver == "lbfgs": if _dal_ready: func = _daal4py_loss_and_grad daal_extra_args_func = _daal4py_logistic_loss_extra_args else: - if sklearn_check_version('1.1'): + if sklearn_check_version("1.1"): loss = LinearModelLoss( base_loss=HalfBinomialLoss(), fit_intercept=fit_intercept ) func = loss.loss_gradient else: func = _logistic_loss_and_grad - elif solver == 'newton-cg': + elif solver == "newton-cg": if _dal_ready: daal_extra_args_func = _daal4py_logistic_loss_extra_args func = _daal4py_loss_ grad = _daal4py_grad_ hess = _daal4py_grad_hess_ else: - if sklearn_check_version('1.1'): + if sklearn_check_version("1.1"): loss = LinearModelLoss( base_loss=HalfBinomialLoss(), fit_intercept=fit_intercept ) @@ -534,90 +581,105 @@ def grad(x, *args): def grad(x, *args): return _logistic_loss_and_grad(x, *args)[1] + hess = _logistic_grad_hess - warm_start_sag = {'coef': np.expand_dims(w0, axis=1)} + warm_start_sag = {"coef": np.expand_dims(w0, axis=1)} coefs = list() n_iter = np.zeros(len(Cs), dtype=np.int32) for i, C in enumerate(Cs): - if solver == 'lbfgs': + if solver == "lbfgs": if _dal_ready: extra_args = daal_extra_args_func( classes.size, w0, X, target, - 0., - 1. / (2 * C * C_daal_multiplier), + 0.0, + 1.0 / (2 * C * C_daal_multiplier), fit_intercept, value=True, gradient=True, - hessian=False + hessian=False, ) else: - if sklearn_check_version('1.1'): + if sklearn_check_version("1.1"): l2_reg_strength = 1.0 / C extra_args = (X, target, sample_weight, l2_reg_strength, n_threads) else: - extra_args = (X, target, 1. 
/ C, sample_weight) + extra_args = (X, target, 1.0 / C, sample_weight) iprint = [-1, 50, 1, 100, 101][ - np.searchsorted(np.array([0, 1, 2, 3]), verbose)] + np.searchsorted(np.array([0, 1, 2, 3]), verbose) + ] opt_res = optimize.minimize( func, w0, method="L-BFGS-B", jac=True, args=extra_args, - options={"iprint": iprint, "gtol": tol, "maxiter": max_iter} + options={"iprint": iprint, "gtol": tol, "maxiter": max_iter}, ) n_iter_i = _check_optimize_result( solver, opt_res, max_iter, - extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG) + extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG, + ) w0, loss = opt_res.x, opt_res.fun if _dal_ready and C_daal_multiplier == 2: w0 /= 2 - elif solver == 'newton-cg': + elif solver == "newton-cg": if _dal_ready: + def make_ncg_funcs(f, value=False, gradient=False, hessian=False): - daal_penaltyL2 = 1. / (2 * C * C_daal_multiplier) + daal_penaltyL2 = 1.0 / (2 * C * C_daal_multiplier) _obj_, X_, y_, n_samples = daal_extra_args_func( classes.size, w0, X, target, - 0., + 0.0, daal_penaltyL2, fit_intercept, value=value, gradient=gradient, - hessian=hessian + hessian=hessian, ) def _func_(x, *args): return f(x, _obj_, *args) + return _func_, (X_, y_, n_samples, daal_penaltyL2) loss_func, extra_args = make_ncg_funcs(func, value=True) grad_func, _ = make_ncg_funcs(grad, gradient=True) grad_hess_func, _ = make_ncg_funcs(hess, gradient=True) - w0, n_iter_i = _newton_cg(grad_hess_func, loss_func, grad_func, - w0, args=extra_args, - maxiter=max_iter, tol=tol) + w0, n_iter_i = _newton_cg( + grad_hess_func, + loss_func, + grad_func, + w0, + args=extra_args, + maxiter=max_iter, + tol=tol, + ) else: - if sklearn_check_version('1.1'): + if sklearn_check_version("1.1"): l2_reg_strength = 1.0 / C args = (X, target, sample_weight, l2_reg_strength, n_threads) else: - args = (X, target, 1. / C, sample_weight) + args = (X, target, 1.0 / C, sample_weight) w0, n_iter_i = _newton_cg( hess, func, grad, w0, args=args, maxiter=max_iter, tol=tol ) - elif solver == 'liblinear': - coef_, intercept_, n_iter_i, = _fit_liblinear( + elif solver == "liblinear": + ( + coef_, + intercept_, + n_iter_i, + ) = _fit_liblinear( X, target, C, @@ -637,22 +699,22 @@ def _func_(x, *args): else: w0 = coef_.ravel() - elif solver in ['sag', 'saga']: - if multi_class == 'multinomial': + elif solver in ["sag", "saga"]: + if multi_class == "multinomial": target = target.astype(X.dtype, copy=False) - loss = 'multinomial' + loss = "multinomial" else: - loss = 'log' + loss = "log" # alpha is for L2-norm, beta is for L1-norm - if penalty == 'l1': - alpha = 0. - beta = 1. / C - elif penalty == 'l2': - alpha = 1. / C - beta = 0. + if penalty == "l1": + alpha = 0.0 + beta = 1.0 / C + elif penalty == "l2": + alpha = 1.0 / C + beta = 0.0 else: # Elastic-Net penalty - alpha = (1. / C) * (1 - l1_ratio) - beta = (1. 
/ C) * l1_ratio + alpha = (1.0 / C) * (1 - l1_ratio) + beta = (1.0 / C) * l1_ratio w0, n_iter_i, warm_start_sag = sag_solver( X, @@ -668,7 +730,7 @@ def _func_(x, *args): False, max_squared_sum, warm_start_sag, - is_saga=(solver == 'saga') + is_saga=(solver == "saga"), ) else: @@ -677,7 +739,7 @@ def _func_(x, *args): "'newton-cg', 'sag'}, got '%s' instead" % solver ) - if multi_class == 'multinomial': + if multi_class == "multinomial": if _dal_ready: if classes.size == 2: multi_w0 = w0[np.newaxis, :] @@ -685,7 +747,7 @@ def _func_(x, *args): multi_w0 = np.reshape(w0, (classes.size, -1)) else: n_classes = max(2, classes.size) - if sklearn_check_version('1.1'): + if sklearn_check_version("1.1"): if solver in ["lbfgs", "newton-cg"]: multi_w0 = np.reshape(w0, (n_classes, -1), order="F") else: @@ -715,82 +777,102 @@ def _func_(x, *args): def daal4py_predict(self, X, resultsToEvaluate): check_is_fitted(self) - if sklearn_check_version('1.0'): + if sklearn_check_version("1.0"): self._check_feature_names(X, reset=False) - X = check_array(X, accept_sparse='csr', dtype=[np.float64, np.float32]) + X = check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) try: fptype = getFPType(X) except ValueError: fptype = None - if resultsToEvaluate == 'computeClassLabels': - _function_name = 'predict' - elif resultsToEvaluate == 'computeClassProbabilities': - _function_name = 'predict_proba' - elif resultsToEvaluate == 'computeClassLogProbabilities': - _function_name = 'predict_log_proba' + if resultsToEvaluate == "computeClassLabels": + _function_name = "predict" + elif resultsToEvaluate == "computeClassProbabilities": + _function_name = "predict_proba" + elif resultsToEvaluate == "computeClassLogProbabilities": + _function_name = "predict_log_proba" else: - raise ValueError('resultsToEvaluate must be in [computeClassLabels, \ - computeClassProbabilities, computeClassLogProbabilities]') + raise ValueError( + "resultsToEvaluate must be in [computeClassLabels, \ + computeClassProbabilities, computeClassLogProbabilities]" + ) _patching_status = PatchingConditionsChain( - f"sklearn.linear_model.LogisticRegression.{_function_name}") - _patching_status.and_conditions([ - (self.multi_class in ["multinomial", "warn"], - f"{self.multi_class} multiclass option is not supported. " - "Only 'multinomial' or 'warn' options are supported."), - (self.classes_.size == 2, "Number of classes != 2."), - (resultsToEvaluate == 'computeClassLabels', - "resultsToEvaluate != 'computeClassLabels'.")], - conditions_merging=any) - _dal_ready = _patching_status.and_conditions([ - (not sparse.issparse(X), "X is sparse. Sparse input is not supported."), - (not sparse.issparse(self.coef_), - "self.coef_ is sparse. Sparse coefficients are not supported."), - (fptype is not None, "Unable to get dtype.")]) + f"sklearn.linear_model.LogisticRegression.{_function_name}" + ) + _patching_status.and_conditions( + [ + ( + self.multi_class in ["multinomial", "warn"], + f"{self.multi_class} multiclass option is not supported. " + "Only 'multinomial' or 'warn' options are supported.", + ), + (self.classes_.size == 2, "Number of classes != 2."), + ( + resultsToEvaluate == "computeClassLabels", + "resultsToEvaluate != 'computeClassLabels'.", + ), + ], + conditions_merging=any, + ) + _dal_ready = _patching_status.and_conditions( + [ + (not sparse.issparse(X), "X is sparse. Sparse input is not supported."), + ( + not sparse.issparse(self.coef_), + "self.coef_ is sparse. 
Sparse coefficients are not supported.", + ), + (fptype is not None, "Unable to get dtype."), + ] + ) _patching_status.write_log() if _dal_ready: n_features = self.coef_.shape[1] if X.shape[1] != n_features: raise ValueError( - f'X has {X.shape[1]} features, ' - f'but LogisticRegression is expecting {n_features} features as input' + f"X has {X.shape[1]} features, " + f"but LogisticRegression is expecting {n_features} features as input" ) builder = d4p.logistic_regression_model_builder(X.shape[1], len(self.classes_)) builder.set_beta(self.coef_, self.intercept_) predict = d4p.logistic_regression_prediction( nClasses=len(self.classes_), fptype=fptype, - method='defaultDense', - resultsToEvaluate=resultsToEvaluate + method="defaultDense", + resultsToEvaluate=resultsToEvaluate, ) res = predict.compute(X, builder.model) - if resultsToEvaluate == 'computeClassLabels': + if resultsToEvaluate == "computeClassLabels": res = res.prediction - if not np.array_equal(self.classes_, np.arange(0, len(self.classes_))) or \ - self.classes_.dtype != X.dtype: + if ( + not np.array_equal(self.classes_, np.arange(0, len(self.classes_))) + or self.classes_.dtype != X.dtype + ): res = self.classes_.take(np.asarray(res, dtype=np.intp)) - elif resultsToEvaluate == 'computeClassProbabilities': + elif resultsToEvaluate == "computeClassProbabilities": res = res.probabilities - elif resultsToEvaluate == 'computeClassLogProbabilities': + elif resultsToEvaluate == "computeClassLogProbabilities": res = res.logProbabilities else: - raise ValueError('resultsToEvaluate must be in [computeClassLabels, \ - computeClassProbabilities, computeClassLogProbabilities]') + raise ValueError( + "resultsToEvaluate must be in [computeClassLabels, \ + computeClassProbabilities, computeClassLogProbabilities]" + ) if res.shape[1] == 1: res = np.ravel(res) return res - if resultsToEvaluate == 'computeClassLabels': + if resultsToEvaluate == "computeClassLabels": return LogisticRegression_original.predict(self, X) - if resultsToEvaluate == 'computeClassProbabilities': + if resultsToEvaluate == "computeClassProbabilities": return LogisticRegression_original.predict_proba(self, X) - if resultsToEvaluate == 'computeClassLogProbabilities': + if resultsToEvaluate == "computeClassLogProbabilities": return LogisticRegression_original.predict_log_proba(self, X) -if sklearn_check_version('0.24'): +if sklearn_check_version("0.24"): + @support_usm_ndarray() def logistic_regression_path( X, @@ -801,13 +883,13 @@ def logistic_regression_path( max_iter=100, tol=1e-4, verbose=0, - solver='lbfgs', + solver="lbfgs", coef=None, class_weight=None, dual=False, - penalty='l2', - intercept_scaling=1., - multi_class='auto', + penalty="l2", + intercept_scaling=1.0, + multi_class="auto", random_state=None, check_input=True, max_squared_sum=None, @@ -815,14 +897,21 @@ def logistic_regression_path( l1_ratio=None, n_threads=1, ): - if sklearn_check_version('1.1'): + if sklearn_check_version("1.1"): return __logistic_regression_path( - X, y, pos_class=pos_class, - Cs=Cs, fit_intercept=fit_intercept, - max_iter=max_iter, tol=tol, verbose=verbose, - solver=solver, coef=coef, + X, + y, + pos_class=pos_class, + Cs=Cs, + fit_intercept=fit_intercept, + max_iter=max_iter, + tol=tol, + verbose=verbose, + solver=solver, + coef=coef, class_weight=class_weight, - dual=dual, penalty=penalty, + dual=dual, + penalty=penalty, intercept_scaling=intercept_scaling, multi_class=multi_class, random_state=random_state, @@ -830,35 +919,42 @@ def logistic_regression_path( 
max_squared_sum=max_squared_sum, sample_weight=sample_weight, l1_ratio=l1_ratio, - n_threads=n_threads + n_threads=n_threads, ) return __logistic_regression_path( - X, y, pos_class=pos_class, - Cs=Cs, fit_intercept=fit_intercept, - max_iter=max_iter, tol=tol, verbose=verbose, - solver=solver, coef=coef, + X, + y, + pos_class=pos_class, + Cs=Cs, + fit_intercept=fit_intercept, + max_iter=max_iter, + tol=tol, + verbose=verbose, + solver=solver, + coef=coef, class_weight=class_weight, - dual=dual, penalty=penalty, + dual=dual, + penalty=penalty, intercept_scaling=intercept_scaling, multi_class=multi_class, random_state=random_state, check_input=check_input, max_squared_sum=max_squared_sum, sample_weight=sample_weight, - l1_ratio=l1_ratio + l1_ratio=l1_ratio, ) class LogisticRegression(LogisticRegression_original): __doc__ = LogisticRegression_original.__doc__ - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): _parameter_constraints: dict = { **LogisticRegression_original._parameter_constraints } def __init__( self, - penalty='l2', + penalty="l2", dual=False, tol=1e-4, C=1.0, @@ -866,13 +962,13 @@ def __init__( intercept_scaling=1, class_weight=None, random_state=None, - solver='lbfgs', + solver="lbfgs", max_iter=100, - multi_class='auto', + multi_class="auto", verbose=0, warm_start=False, n_jobs=None, - l1_ratio=None + l1_ratio=None, ): self.penalty = penalty self.dual = dual @@ -920,11 +1016,11 @@ def fit(self, X, y, sample_weight=None): ----- The SAGA solver supports both float64 and float32 bit arrays. """ - if sklearn_check_version('1.0'): + if sklearn_check_version("1.0"): self._check_feature_names(X, reset=True) if sklearn_check_version("1.2"): self._validate_params() - which, what = logistic_module, '_logistic_regression_path' + which, what = logistic_module, "_logistic_regression_path" replacer = logistic_regression_path descriptor = getattr(which, what, None) setattr(which, what, replacer) @@ -947,7 +1043,7 @@ def predict(self, X): C : array, shape [n_samples] Predicted class label per sample. """ - return daal4py_predict(self, X, 'computeClassLabels') + return daal4py_predict(self, X, "computeClassLabels") @support_usm_ndarray() def predict_log_proba(self, X): @@ -969,7 +1065,7 @@ def predict_log_proba(self, X): Returns the log-probability of the sample for each class in the model, where classes are ordered as they are in ``self.classes_``. """ - return daal4py_predict(self, X, 'computeClassLogProbabilities') + return daal4py_predict(self, X, "computeClassLogProbabilities") @support_usm_ndarray() def predict_proba(self, X): @@ -998,10 +1094,10 @@ def predict_proba(self, X): Returns the probability of the sample for each class in the model, where classes are ordered as they are in ``self.classes_``. 
""" - return daal4py_predict(self, X, 'computeClassProbabilities') - + return daal4py_predict(self, X, "computeClassProbabilities") else: + @support_usm_ndarray() def logistic_regression_path( X, @@ -1012,13 +1108,13 @@ def logistic_regression_path( max_iter=100, tol=1e-4, verbose=0, - solver='lbfgs', + solver="lbfgs", coef=None, class_weight=None, dual=False, - penalty='l2', - intercept_scaling=1., - multi_class='auto', + penalty="l2", + intercept_scaling=1.0, + multi_class="auto", random_state=None, check_input=True, max_squared_sum=None, @@ -1026,19 +1122,26 @@ def logistic_regression_path( l1_ratio=None, ): return __logistic_regression_path( - X, y, pos_class=pos_class, - Cs=Cs, fit_intercept=fit_intercept, - max_iter=max_iter, tol=tol, verbose=verbose, - solver=solver, coef=coef, + X, + y, + pos_class=pos_class, + Cs=Cs, + fit_intercept=fit_intercept, + max_iter=max_iter, + tol=tol, + verbose=verbose, + solver=solver, + coef=coef, class_weight=class_weight, - dual=dual, penalty=penalty, + dual=dual, + penalty=penalty, intercept_scaling=intercept_scaling, multi_class=multi_class, random_state=random_state, check_input=check_input, max_squared_sum=max_squared_sum, sample_weight=sample_weight, - l1_ratio=l1_ratio + l1_ratio=l1_ratio, ) class LogisticRegression(LogisticRegression_original): @@ -1046,7 +1149,7 @@ class LogisticRegression(LogisticRegression_original): def __init__( self, - penalty='l2', + penalty="l2", dual=False, tol=1e-4, C=1.0, @@ -1054,15 +1157,14 @@ def __init__( intercept_scaling=1, class_weight=None, random_state=None, - solver='lbfgs', + solver="lbfgs", max_iter=100, - multi_class='auto', + multi_class="auto", verbose=0, warm_start=False, n_jobs=None, l1_ratio=None, ): - self.penalty = penalty self.dual = dual self.tol = tol @@ -1109,7 +1211,7 @@ def fit(self, X, y, sample_weight=None): ----- The SAGA solver supports both float64 and float32 bit arrays. """ - which, what = logistic_module, '_logistic_regression_path' + which, what = logistic_module, "_logistic_regression_path" replacer = logistic_regression_path descriptor = getattr(which, what, None) setattr(which, what, replacer) @@ -1132,7 +1234,7 @@ def predict(self, X): C : array, shape [n_samples] Predicted class label per sample. """ - return daal4py_predict(self, X, 'computeClassLabels') + return daal4py_predict(self, X, "computeClassLabels") @support_usm_ndarray() def predict_log_proba(self, X): @@ -1154,7 +1256,7 @@ def predict_log_proba(self, X): Returns the log-probability of the sample for each class in the model, where classes are ordered as they are in ``self.classes_``. """ - return daal4py_predict(self, X, 'computeClassLogProbabilities') + return daal4py_predict(self, X, "computeClassLogProbabilities") @support_usm_ndarray() def predict_proba(self, X): @@ -1183,4 +1285,4 @@ def predict_proba(self, X): Returns the probability of the sample for each class in the model, where classes are ordered as they are in ``self.classes_``. 
""" - return daal4py_predict(self, X, 'computeClassProbabilities') + return daal4py_predict(self, X, "computeClassProbabilities") diff --git a/daal4py/sklearn/linear_model/ridge.py b/daal4py/sklearn/linear_model/ridge.py index 11df4b1aed..1d96dc5fa7 100644 --- a/daal4py/sklearn/linear_model/ridge.py +++ b/daal4py/sklearn/linear_model/ridge.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2014 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,6 +12,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from ._ridge import * diff --git a/daal4py/sklearn/linear_model/tests/test_linear.py b/daal4py/sklearn/linear_model/tests/test_linear.py index 57e34e52ed..34ca62f37e 100644 --- a/daal4py/sklearn/linear_model/tests/test_linear.py +++ b/daal4py/sklearn/linear_model/tests/test_linear.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,18 +12,19 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== -import pytest import numpy as np -from sklearn.utils._testing import assert_array_almost_equal -from sklearn.linear_model import LinearRegression +import pytest from sklearn.datasets import make_regression +from sklearn.linear_model import LinearRegression +from sklearn.utils._testing import assert_array_almost_equal def make_dataset(n_samples, n_features, kind=np.array, random_state=0, types=None): try: from pandas import DataFrame + if kind not in (list, np.array, DataFrame): kind = np.array except ImportError: @@ -62,7 +63,7 @@ def make_dataset(n_samples, n_features, kind=np.array, random_state=0, types=Non def test_linear_array_vs_dataframe_homogen(): - pd = pytest.importorskip('pandas') + pd = pytest.importorskip("pandas") x_train, y_train = make_dataset(100, 20) x_test, _ = make_dataset(100, 20, random_state=1) @@ -77,15 +78,17 @@ def test_linear_array_vs_dataframe_homogen(): df_reg.fit(df_x_train, df_y_train) assert_array_almost_equal( - array_reg.coef_.reshape((-1, 1)), df_reg.coef_.reshape((-1, 1))) + array_reg.coef_.reshape((-1, 1)), df_reg.coef_.reshape((-1, 1)) + ) assert_array_almost_equal(array_reg.intercept_, df_reg.intercept_) assert_array_almost_equal( array_reg.predict(x_test).reshape((-1, 1)), - df_reg.predict(df_x_test).reshape((-1, 1))) + df_reg.predict(df_x_test).reshape((-1, 1)), + ) def test_linear_array_vs_dataframe_heterogen(): - pd = pytest.importorskip('pandas') + pd = pytest.importorskip("pandas") types = (np.float64, np.float32) @@ -102,15 +105,18 @@ def test_linear_array_vs_dataframe_heterogen(): df_reg.fit(df_x_train, df_y_train) assert_array_almost_equal( - 
array_reg.coef_.reshape((-1, 1)), df_reg.coef_.reshape((-1, 1))) + array_reg.coef_.reshape((-1, 1)), df_reg.coef_.reshape((-1, 1)) + ) assert_array_almost_equal(array_reg.intercept_, df_reg.intercept_) assert_array_almost_equal( array_reg.predict(x_test).reshape((-1, 1)), - df_reg.predict(df_x_test).reshape((-1, 1)), decimal=5) + df_reg.predict(df_x_test).reshape((-1, 1)), + decimal=5, + ) def test_linear_array_vs_dataframe_heterogen_double_float(): - pd = pytest.importorskip('pandas') + pd = pytest.importorskip("pandas") types = (np.float64, np.float32) @@ -127,15 +133,17 @@ def test_linear_array_vs_dataframe_heterogen_double_float(): df_reg.fit(df_x_train, df_y_train) assert_array_almost_equal( - array_reg.coef_.reshape((-1, 1)), df_reg.coef_.reshape((-1, 1))) + array_reg.coef_.reshape((-1, 1)), df_reg.coef_.reshape((-1, 1)) + ) assert_array_almost_equal(array_reg.intercept_, df_reg.intercept_) assert_array_almost_equal( array_reg.predict(x_test).reshape((-1, 1)), - df_reg.predict(df_x_test).reshape((-1, 1))) + df_reg.predict(df_x_test).reshape((-1, 1)), + ) def test_linear_array_vs_dataframe_heterogen_double_int(): - pd = pytest.importorskip('pandas') + pd = pytest.importorskip("pandas") types = (np.float64, np.int32) @@ -152,15 +160,17 @@ def test_linear_array_vs_dataframe_heterogen_double_int(): df_reg.fit(df_x_train, df_y_train) assert_array_almost_equal( - array_reg.coef_.reshape((-1, 1)), df_reg.coef_.reshape((-1, 1))) + array_reg.coef_.reshape((-1, 1)), df_reg.coef_.reshape((-1, 1)) + ) assert_array_almost_equal(array_reg.intercept_, df_reg.intercept_) assert_array_almost_equal( array_reg.predict(x_test).reshape((-1, 1)), - df_reg.predict(df_x_test).reshape((-1, 1))) + df_reg.predict(df_x_test).reshape((-1, 1)), + ) def test_linear_array_vs_dataframe_heterogen_float_int(): - pd = pytest.importorskip('pandas') + pd = pytest.importorskip("pandas") types = (np.float32, np.int32) @@ -177,8 +187,10 @@ def test_linear_array_vs_dataframe_heterogen_float_int(): df_reg.fit(df_x_train, df_y_train) assert_array_almost_equal( - array_reg.coef_.reshape((-1, 1)), df_reg.coef_.reshape((-1, 1))) + array_reg.coef_.reshape((-1, 1)), df_reg.coef_.reshape((-1, 1)) + ) assert_array_almost_equal(array_reg.intercept_, df_reg.intercept_) assert_array_almost_equal( array_reg.predict(x_test).reshape((-1, 1)), - df_reg.predict(df_x_test).reshape((-1, 1))) + df_reg.predict(df_x_test).reshape((-1, 1)), + ) diff --git a/daal4py/sklearn/manifold/__init__.py b/daal4py/sklearn/manifold/__init__.py index c2e3047cbf..9ec5be77fc 100644 --- a/daal4py/sklearn/manifold/__init__.py +++ b/daal4py/sklearn/manifold/__init__.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2020 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,8 +13,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
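The linear-model tests above all repeat one pattern: fit on an ndarray and on an equivalent DataFrame, then compare coefficients, intercept, and predictions. A condensed sketch of that pattern (requires pandas; synthetic data):

```python
import numpy as np
import pandas as pd
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.utils._testing import assert_array_almost_equal

x, y = make_regression(n_samples=100, n_features=20, random_state=0)
df_x, df_y = pd.DataFrame(x), pd.DataFrame(y)

array_reg = LinearRegression().fit(x, y)
df_reg = LinearRegression().fit(df_x, df_y)

# ndarray and DataFrame inputs should yield the same fitted model
assert_array_almost_equal(
    array_reg.coef_.reshape((-1, 1)), df_reg.coef_.reshape((-1, 1))
)
assert_array_almost_equal(array_reg.intercept_, df_reg.intercept_)
```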
-#=============================================================================== +# =============================================================================== from ._t_sne import TSNE -__all__ = ['TSNE'] +__all__ = ["TSNE"] diff --git a/daal4py/sklearn/manifold/_t_sne.py b/daal4py/sklearn/manifold/_t_sne.py index bf349431c3..e3fa6f07c2 100755 --- a/daal4py/sklearn/manifold/_t_sne.py +++ b/daal4py/sklearn/manifold/_t_sne.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,33 +12,35 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== # daal4py TSNE scikit-learn-compatible class import warnings from time import time + import numpy as np from scipy.sparse import issparse -import daal4py -from daal4py.sklearn._utils import ( - daal_check_version, sklearn_check_version, PatchingConditionsChain) - -from sklearn.manifold import TSNE as BaseTSNE from sklearn.decomposition import PCA +from sklearn.manifold import TSNE as BaseTSNE from sklearn.metrics.pairwise import pairwise_distances +from sklearn.utils import check_array, check_random_state from sklearn.utils.validation import check_non_negative -from sklearn.utils import check_random_state, check_array -from ..neighbors import NearestNeighbors +import daal4py +from daal4py.sklearn._utils import ( + PatchingConditionsChain, + daal_check_version, + sklearn_check_version, +) + from .._device_offload import support_usm_ndarray +from ..neighbors import NearestNeighbors -if sklearn_check_version('0.22'): - from sklearn.manifold._t_sne import _joint_probabilities - from sklearn.manifold._t_sne import _joint_probabilities_nn +if sklearn_check_version("0.22"): + from sklearn.manifold._t_sne import _joint_probabilities, _joint_probabilities_nn else: - from sklearn.manifold.t_sne import _joint_probabilities - from sklearn.manifold.t_sne import _joint_probabilities_nn + from sklearn.manifold.t_sne import _joint_probabilities, _joint_probabilities_nn class TSNE(BaseTSNE): @@ -101,39 +103,33 @@ def _daal_tsne(self, P, n_samples, X_embedded): # * final optimization with momentum at 0.8 # N, nnz, n_iter_without_progress, n_iter - size_iter = [[n_samples], [P.nnz], - [self.n_iter_without_progress], - [self.n_iter]] + size_iter = [[n_samples], [P.nnz], [self.n_iter_without_progress], [self.n_iter]] # Pass params to daal4py backend - if daal_check_version((2023, 'P', 1)): - size_iter.extend( - [[self._EXPLORATION_N_ITER], - [self._N_ITER_CHECK]] - ) + if daal_check_version((2023, "P", 1)): + size_iter.extend([[self._EXPLORATION_N_ITER], [self._N_ITER_CHECK]]) size_iter = np.array(size_iter, dtype=P.dtype) - params = np.array([[self.early_exaggeration], [self._learning_rate], - [self.min_grad_norm], [self.angle]], dtype=P.dtype) + params = np.array( + [ + [self.early_exaggeration], + [self._learning_rate], + [self.min_grad_norm], + [self.angle], + ], + dtype=P.dtype, + ) results = np.zeros((3, 1), dtype=P.dtype) # curIter, error, gradNorm if P.dtype == np.float64: daal4py.daal_tsne_gradient_descent( - X_embedded, - P, - size_iter, - 
params, - results, - 0) + X_embedded, P, size_iter, params, results, 0 + ) elif P.dtype == np.float32: daal4py.daal_tsne_gradient_descent( - X_embedded, - P, - size_iter, - params, - results, - 1) + X_embedded, P, size_iter, params, results, 1 + ) else: raise ValueError("unsupported dtype of 'P' matrix") @@ -147,40 +143,49 @@ def _daal_tsne(self, P, n_samples, X_embedded): def _fit(self, X, skip_num_points=0): """Private function to fit the model using X as training data.""" - if isinstance(self.init, str) and self.init == 'warn': - warnings.warn("The default initialization in TSNE will change " - "from 'random' to 'pca' in 1.2.", FutureWarning) - self._init = 'random' + if isinstance(self.init, str) and self.init == "warn": + warnings.warn( + "The default initialization in TSNE will change " + "from 'random' to 'pca' in 1.2.", + FutureWarning, + ) + self._init = "random" else: self._init = self.init - if isinstance(self._init, str) and self._init == 'pca' and issparse(X): - raise TypeError("PCA initialization is currently not suported " - "with the sparse input matrix. Use " - "init=\"random\" instead.") + if isinstance(self._init, str) and self._init == "pca" and issparse(X): + raise TypeError( + "PCA initialization is currently not suported " + "with the sparse input matrix. Use " + 'init="random" instead.' + ) - if self.method not in ['barnes_hut', 'exact']: + if self.method not in ["barnes_hut", "exact"]: raise ValueError("'method' must be 'barnes_hut' or 'exact'") if self.angle < 0.0 or self.angle > 1.0: raise ValueError("'angle' must be between 0.0 - 1.0") - if self.learning_rate == 'warn': - warnings.warn("The default learning rate in TSNE will change " - "from 200.0 to 'auto' in 1.2.", FutureWarning) + if self.learning_rate == "warn": + warnings.warn( + "The default learning rate in TSNE will change " + "from 200.0 to 'auto' in 1.2.", + FutureWarning, + ) self._learning_rate = 200.0 else: self._learning_rate = self.learning_rate - if self._learning_rate == 'auto': + if self._learning_rate == "auto": self._learning_rate = X.shape[0] / self.early_exaggeration / 4 self._learning_rate = np.maximum(self._learning_rate, 50) else: if not (self._learning_rate > 0): - raise ValueError("'learning_rate' must be a positive number " - "or 'auto'.") + raise ValueError( + "'learning_rate' must be a positive number " "or 'auto'." 
+ ) # rename attribute for compatibility with sklearn>=1.2 - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): self.learning_rate_ = self._learning_rate - if hasattr(self, 'square_distances'): + if hasattr(self, "square_distances"): if sklearn_check_version("1.1"): if self.square_distances != "deprecated": warnings.warn( @@ -190,8 +195,7 @@ def _fit(self, X, skip_num_points=0): ) else: if self.square_distances not in [True, "legacy"]: - raise ValueError( - "'square_distances' must be True or 'legacy'.") + raise ValueError("'square_distances' must be True or 'legacy'.") if self.metric != "euclidean" and self.square_distances is not True: warnings.warn( "'square_distances' has been introduced in 0.24 to help phase " @@ -204,47 +208,67 @@ def _fit(self, X, skip_num_points=0): FutureWarning, ) - if self.method == 'barnes_hut': - if sklearn_check_version('0.23'): - X = self._validate_data(X, accept_sparse=['csr'], - ensure_min_samples=2, - dtype=[np.float32, np.float64]) + if self.method == "barnes_hut": + if sklearn_check_version("0.23"): + X = self._validate_data( + X, + accept_sparse=["csr"], + ensure_min_samples=2, + dtype=[np.float32, np.float64], + ) else: - X = check_array(X, accept_sparse=['csr'], ensure_min_samples=2, - dtype=[np.float32, np.float64]) + X = check_array( + X, + accept_sparse=["csr"], + ensure_min_samples=2, + dtype=[np.float32, np.float64], + ) else: - if sklearn_check_version('0.23'): - X = self._validate_data(X, accept_sparse=['csr', 'csc', 'coo'], - dtype=[np.float32, np.float64]) + if sklearn_check_version("0.23"): + X = self._validate_data( + X, accept_sparse=["csr", "csc", "coo"], dtype=[np.float32, np.float64] + ) else: - X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], - dtype=[np.float32, np.float64]) + X = check_array( + X, accept_sparse=["csr", "csc", "coo"], dtype=[np.float32, np.float64] + ) if self.metric == "precomputed": - if isinstance(self._init, str) and self._init == 'pca': - raise ValueError("The parameter init=\"pca\" cannot be " - "used with metric=\"precomputed\".") + if isinstance(self._init, str) and self._init == "pca": + raise ValueError( + 'The parameter init="pca" cannot be ' + 'used with metric="precomputed".' + ) if X.shape[0] != X.shape[1]: raise ValueError("X should be a square distance matrix") - check_non_negative(X, "TSNE.fit(). With metric='precomputed', X " - "should contain positive distances.") + check_non_negative( + X, + "TSNE.fit(). With metric='precomputed', X " + "should contain positive distances.", + ) if self.method == "exact" and issparse(X): raise TypeError( 'TSNE with method="exact" does not accept sparse ' 'precomputed distance matrix. Use method="barnes_hut" ' - 'or provide the dense distance matrix.') + "or provide the dense distance matrix." + ) - if self.method == 'barnes_hut' and self.n_components > 3: - raise ValueError("'n_components' should be inferior to 4 for the " - "barnes_hut algorithm as it relies on " - "quad-tree or oct-tree.") + if self.method == "barnes_hut" and self.n_components > 3: + raise ValueError( + "'n_components' should be inferior to 4 for the " + "barnes_hut algorithm as it relies on " + "quad-tree or oct-tree." 
+ ) random_state = check_random_state(self.random_state) if self.early_exaggeration < 1.0: - raise ValueError("early_exaggeration must be at least 1, but is {}" - .format(self.early_exaggeration)) + raise ValueError( + "early_exaggeration must be at least 1, but is {}".format( + self.early_exaggeration + ) + ) if self.n_iter < 250: raise ValueError("n_iter should be at least 250") @@ -267,78 +291,85 @@ def _fit(self, X, skip_num_points=0): # squared distances, and returns np.sqrt(dist) for # squared=False. # Also, Euclidean is slower for n_jobs>1, so don't set here - distances = pairwise_distances(X, metric=self.metric, - squared=True) + distances = pairwise_distances(X, metric=self.metric, squared=True) else: metric_params_ = {} - if sklearn_check_version('1.1'): + if sklearn_check_version("1.1"): metric_params_ = self.metric_params or {} - distances = pairwise_distances(X, metric=self.metric, - n_jobs=self.n_jobs, - **metric_params_) + distances = pairwise_distances( + X, metric=self.metric, n_jobs=self.n_jobs, **metric_params_ + ) if np.any(distances < 0): - raise ValueError("All distances should be positive, the " - "metric given is not correct") + raise ValueError( + "All distances should be positive, the " "metric given is not correct" + ) - if self.metric != "euclidean" and \ - getattr(self, 'square_distances', True) is True: + if ( + self.metric != "euclidean" + and getattr(self, "square_distances", True) is True + ): distances **= 2 # compute the joint probability distribution for the input space P = _joint_probabilities(distances, self.perplexity, self.verbose) assert np.all(np.isfinite(P)), "All probabilities should be finite" assert np.all(P >= 0), "All probabilities should be non-negative" - assert np.all(P <= 1), ("All probabilities should be less " - "or then equal to one") + assert np.all(P <= 1), ( + "All probabilities should be less " "or then equal to one" + ) else: # Compute the number of nearest neighbors to find. # LvdM uses 3 * perplexity as the number of neighbors. # In the event that we have very small # of points # set the neighbors to n - 1. - n_neighbors = min(n_samples - 1, int(3. * self.perplexity + 1)) + n_neighbors = min(n_samples - 1, int(3.0 * self.perplexity + 1)) if self.verbose: - print("[t-SNE] Computing {} nearest neighbors..." 
- .format(n_neighbors)) + print("[t-SNE] Computing {} nearest neighbors...".format(n_neighbors)) # Find the nearest neighbors for every point knn = None if sklearn_check_version("1.1"): knn = NearestNeighbors( - algorithm='auto', + algorithm="auto", n_jobs=self.n_jobs, n_neighbors=n_neighbors, metric=self.metric, - metric_params=self.metric_params + metric_params=self.metric_params, ) else: knn = NearestNeighbors( - algorithm='auto', + algorithm="auto", n_jobs=self.n_jobs, n_neighbors=n_neighbors, - metric=self.metric + metric=self.metric, ) t0 = time() knn.fit(X) duration = time() - t0 if self.verbose: - print("[t-SNE] Indexed {} samples in {:.3f}s...".format( - n_samples, duration)) + print( + "[t-SNE] Indexed {} samples in {:.3f}s...".format(n_samples, duration) + ) t0 = time() - distances_nn = knn.kneighbors_graph(mode='distance') + distances_nn = knn.kneighbors_graph(mode="distance") duration = time() - t0 if self.verbose: - print("[t-SNE] Computed neighbors for {} samples " - "in {:.3f}s...".format(n_samples, duration)) + print( + "[t-SNE] Computed neighbors for {} samples " + "in {:.3f}s...".format(n_samples, duration) + ) # Free the memory used by the ball_tree del knn - if getattr(self, 'square_distances', True) is True or \ - self.metric == "euclidean": + if ( + getattr(self, "square_distances", True) is True + or self.metric == "euclidean" + ): # knn return the euclidean distance but we need it squared # to be consistent with the 'exact' method. Note that the # the method was derived using the euclidean method as in the @@ -347,30 +378,31 @@ def _fit(self, X, skip_num_points=0): distances_nn.data **= 2 # compute the joint probability distribution for the input space - P = _joint_probabilities_nn(distances_nn, self.perplexity, - self.verbose) + P = _joint_probabilities_nn(distances_nn, self.perplexity, self.verbose) if isinstance(self._init, np.ndarray): X_embedded = self._init - elif self._init == 'pca': + elif self._init == "pca": pca = PCA( n_components=self.n_components, - svd_solver='randomized', + svd_solver="randomized", random_state=random_state, ) X_embedded = pca.fit_transform(X).astype(np.float32, copy=False) - warnings.warn("The PCA initialization in TSNE will change to " - "have the standard deviation of PC1 equal to 1e-4 " - "in 1.2. This will ensure better convergence.", - FutureWarning) - elif self._init == 'random': + warnings.warn( + "The PCA initialization in TSNE will change to " + "have the standard deviation of PC1 equal to 1e-4 " + "in 1.2. This will ensure better convergence.", + FutureWarning, + ) + elif self._init == "random": # The embedding is initialized with iid samples from Gaussians with # standard deviation 1e-4. - X_embedded = 1e-4 * random_state.randn( - n_samples, self.n_components).astype(np.float32) + X_embedded = 1e-4 * random_state.randn(n_samples, self.n_components).astype( + np.float32 + ) else: - raise ValueError("'init' must be 'pca', 'random', or " - "a numpy array") + raise ValueError("'init' must be 'pca', 'random', or " "a numpy array") # Degrees of freedom of the Student's t-distribution. The suggestion # degrees_of_freedom = n_components - 1 comes from @@ -378,31 +410,31 @@ def _fit(self, X, skip_num_points=0): # Laurens van der Maaten, 2009. 
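For the barnes_hut branch above, the neighbor graph is built with 3 * perplexity + 1 neighbors (capped at n - 1), returned as a sparse distance graph, and then squared. A standalone sketch of that step with random data (values are illustrative only):

```python
import numpy as np
from sklearn.neighbors import NearestNeighbors

rng = np.random.RandomState(0)
X = rng.rand(50, 5)
perplexity = 10.0

# same neighbor-count rule as in the hunk above
n_neighbors = min(X.shape[0] - 1, int(3.0 * perplexity + 1))

knn = NearestNeighbors(algorithm="auto", n_neighbors=n_neighbors).fit(X)
distances_nn = knn.kneighbors_graph(mode="distance")  # sparse CSR, one row per sample
distances_nn.data **= 2  # squared distances, consistent with the 'exact' method

print(distances_nn.shape, distances_nn.nnz)  # (50, 50), 50 * n_neighbors stored entries
```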
degrees_of_freedom = max(self.n_components - 1, 1) - _patching_status = PatchingConditionsChain( - "sklearn.manifold.TSNE._tsne") - _patching_status.and_conditions([ - (self.method == 'barnes_hut', - 'Used t-SNE method is not "barnes_hut" which is the only supported.'), - (self.n_components == 2, 'Number of components != 2.'), - (self.verbose == 0, 'Verbose mode is set.'), - (daal_check_version((2021, 'P', 600)), - 'oneDAL version is lower than 2021.6.') - ]) + _patching_status = PatchingConditionsChain("sklearn.manifold.TSNE._tsne") + _patching_status.and_conditions( + [ + ( + self.method == "barnes_hut", + 'Used t-SNE method is not "barnes_hut" which is the only supported.', + ), + (self.n_components == 2, "Number of components != 2."), + (self.verbose == 0, "Verbose mode is set."), + ( + daal_check_version((2021, "P", 600)), + "oneDAL version is lower than 2021.6.", + ), + ] + ) _dal_ready = _patching_status.get_status(logs=True) if _dal_ready: - X_embedded = check_array( - X_embedded, dtype=[np.float32, np.float64]) - return self._daal_tsne( - P, - n_samples, - X_embedded=X_embedded - ) + X_embedded = check_array(X_embedded, dtype=[np.float32, np.float64]) + return self._daal_tsne(P, n_samples, X_embedded=X_embedded) return self._tsne( P, degrees_of_freedom, n_samples, X_embedded=X_embedded, neighbors=neighbors_nn, - skip_num_points=skip_num_points + skip_num_points=skip_num_points, ) diff --git a/daal4py/sklearn/metrics/__init__.py b/daal4py/sklearn/metrics/__init__.py index 7695eb680c..3975869648 100644 --- a/daal4py/sklearn/metrics/__init__.py +++ b/daal4py/sklearn/metrics/__init__.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,9 +12,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== -from ._ranking import roc_auc_score from ._pairwise import pairwise_distances +from ._ranking import roc_auc_score -__all__ = ['roc_auc_score', 'pairwise_distances'] +__all__ = ["roc_auc_score", "pairwise_distances"] diff --git a/daal4py/sklearn/metrics/_pairwise.py b/daal4py/sklearn/metrics/_pairwise.py index a4222564e2..5db848fc9a 100755 --- a/daal4py/sklearn/metrics/_pairwise.py +++ b/daal4py/sklearn/metrics/_pairwise.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2014 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,54 +12,63 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
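Before the metrics changes, a minimal end-to-end sketch of the patched t-SNE entry point re-exported from daal4py.sklearn.manifold (shown earlier in this diff). Whether the oneDAL gradient-descent branch is taken depends on the conditions gated above (barnes_hut method, 2 components, verbose off, recent enough oneDAL); otherwise the stock implementation is used:

```python
import numpy as np
from sklearn.datasets import load_digits

# re-exported in daal4py/sklearn/manifold/__init__.py as shown earlier in this diff
from daal4py.sklearn.manifold import TSNE

X, _ = load_digits(return_X_y=True)
emb = TSNE(
    n_components=2,       # required by the oneDAL branch above
    method="barnes_hut",  # likewise
    init="pca",
    random_state=0,
    verbose=0,
).fit_transform(X[:300].astype(np.float64))

print(emb.shape)  # (300, 2)
```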
-#=============================================================================== +# =============================================================================== -import numpy as np +import warnings from functools import partial -from sklearn.metrics.pairwise import _parallel_pairwise, _pairwise_callable -from sklearn.metrics.pairwise import _VALID_METRICS, PAIRWISE_DISTANCE_FUNCTIONS -from sklearn.metrics.pairwise import PAIRWISE_BOOLEAN_FUNCTIONS -from sklearn.metrics.pairwise import check_pairwise_arrays + +import numpy as np +from sklearn.exceptions import DataConversionWarning +from sklearn.metrics.pairwise import ( + _VALID_METRICS, + PAIRWISE_BOOLEAN_FUNCTIONS, + PAIRWISE_DISTANCE_FUNCTIONS, + _pairwise_callable, + _parallel_pairwise, + check_pairwise_arrays, +) from sklearn.utils._joblib import effective_n_jobs from sklearn.utils.validation import check_non_negative -import warnings -from sklearn.exceptions import DataConversionWarning + try: from sklearn.metrics.pairwise import _precompute_metric_params except ImportError: + def _precompute_metric_params(*args, **kwrds): return dict() + from scipy.sparse import issparse from scipy.spatial import distance import daal4py from daal4py.sklearn.utils.validation import _daal_check_array -from .._utils import (getFPType, PatchingConditionsChain, sklearn_check_version) + from .._device_offload import support_usm_ndarray +from .._utils import PatchingConditionsChain, getFPType, sklearn_check_version -if sklearn_check_version('1.3'): - from sklearn.utils._param_validation import ( - validate_params, Integral, StrOptions) +if sklearn_check_version("1.3"): + from sklearn.utils._param_validation import Integral, StrOptions, validate_params def _daal4py_cosine_distance_dense(X): X_fptype = getFPType(X) - alg = daal4py.cosine_distance(fptype=X_fptype, method='defaultDense') + alg = daal4py.cosine_distance(fptype=X_fptype, method="defaultDense") res = alg.compute(X) return res.cosineDistance def _daal4py_correlation_distance_dense(X): X_fptype = getFPType(X) - alg = daal4py.correlation_distance(fptype=X_fptype, method='defaultDense') + alg = daal4py.correlation_distance(fptype=X_fptype, method="defaultDense") res = alg.compute(X) return res.correlationDistance @support_usm_ndarray(freefunc=True) -def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=None, - force_all_finite=True, **kwds): +def pairwise_distances( + X, Y=None, metric="euclidean", n_jobs=None, force_all_finite=True, **kwds +): """ Compute the distance matrix from a vector array X and optional Y. This method takes either a vector array or a distance matrix, and returns @@ -159,46 +168,57 @@ def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=None, elements of two arrays """ if metric not in _VALID_METRICS and not callable(metric) and metric != "precomputed": - raise ValueError("Unknown metric %s. Valid metrics are %s, or 'precomputed', " - "or a callable" % (metric, _VALID_METRICS)) - - X = _daal_check_array(X, accept_sparse=['csr', 'csc', 'coo'], - force_all_finite=force_all_finite) - - _patching_status = PatchingConditionsChain( - "sklearn.metrics.pairwise_distances") - _dal_ready = _patching_status.and_conditions([ - (metric == 'cosine' or metric == 'correlation', - f"'{metric}' metric is not supported. " - "Only 'cosine' and 'correlation' metrics are supported."), - (Y is None, "Second feature array is not supported."), - (not issparse(X), "X is sparse. Sparse input is not supported."), - (X.dtype == np.float64, - f"{X.dtype} X data type is not supported. 
Only np.float64 is supported.") - ]) + raise ValueError( + "Unknown metric %s. Valid metrics are %s, or 'precomputed', " + "or a callable" % (metric, _VALID_METRICS) + ) + + X = _daal_check_array( + X, accept_sparse=["csr", "csc", "coo"], force_all_finite=force_all_finite + ) + + _patching_status = PatchingConditionsChain("sklearn.metrics.pairwise_distances") + _dal_ready = _patching_status.and_conditions( + [ + ( + metric == "cosine" or metric == "correlation", + f"'{metric}' metric is not supported. " + "Only 'cosine' and 'correlation' metrics are supported.", + ), + (Y is None, "Second feature array is not supported."), + (not issparse(X), "X is sparse. Sparse input is not supported."), + ( + X.dtype == np.float64, + f"{X.dtype} X data type is not supported. Only np.float64 is supported.", + ), + ] + ) _patching_status.write_log() if _dal_ready: - if metric == 'cosine': + if metric == "cosine": return _daal4py_cosine_distance_dense(X) - if metric == 'correlation': + if metric == "correlation": return _daal4py_correlation_distance_dense(X) raise ValueError(f"'{metric}' distance is wrong for daal4py.") if metric == "precomputed": - X, _ = check_pairwise_arrays(X, Y, precomputed=True, - force_all_finite=force_all_finite) - whom = ("`pairwise_distances`. Precomputed distance " - " need to have non-negative values.") + X, _ = check_pairwise_arrays( + X, Y, precomputed=True, force_all_finite=force_all_finite + ) + whom = ( + "`pairwise_distances`. Precomputed distance " + " need to have non-negative values." + ) check_non_negative(X, whom=whom) return X if metric in PAIRWISE_DISTANCE_FUNCTIONS: func = PAIRWISE_DISTANCE_FUNCTIONS[metric] elif callable(metric): - func = partial(_pairwise_callable, metric=metric, - force_all_finite=force_all_finite, **kwds) + func = partial( + _pairwise_callable, metric=metric, force_all_finite=force_all_finite, **kwds + ) else: if issparse(X) or issparse(Y): - raise TypeError("scipy distance metrics do not" - " support sparse matrices.") + raise TypeError("scipy distance metrics do not" " support sparse matrices.") dtype = bool if metric in PAIRWISE_BOOLEAN_FUNCTIONS else None @@ -206,22 +226,20 @@ def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=None, msg = "Data was converted to boolean for metric %s" % metric warnings.warn(msg, DataConversionWarning) - X, Y = check_pairwise_arrays(X, Y, dtype=dtype, - force_all_finite=force_all_finite) + X, Y = check_pairwise_arrays(X, Y, dtype=dtype, force_all_finite=force_all_finite) # precompute data-derived metric params params = _precompute_metric_params(X, Y, metric=metric, **kwds) kwds.update(**params) if effective_n_jobs(n_jobs) == 1 and X is Y: - return distance.squareform(distance.pdist(X, metric=metric, - **kwds)) + return distance.squareform(distance.pdist(X, metric=metric, **kwds)) func = partial(distance.cdist, metric=metric, **kwds) return _parallel_pairwise(X, Y, func, n_jobs, **kwds) -if sklearn_check_version('1.3'): +if sklearn_check_version("1.3"): pairwise_distances = validate_params( { "X": ["array-like", "sparse matrix"], @@ -229,5 +247,6 @@ def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=None, "metric": [StrOptions(set(_VALID_METRICS) | {"precomputed"}), callable], "n_jobs": [Integral, None], "force_all_finite": ["boolean", StrOptions({"allow-nan"})], - }, prefer_skip_nested_validation=True + }, + prefer_skip_nested_validation=True, )(pairwise_distances) diff --git a/daal4py/sklearn/metrics/_ranking.py b/daal4py/sklearn/metrics/_ranking.py index 8341dde30b..432e3d3568 100644 
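The pairwise_distances patch above offloads only the 'cosine' and 'correlation' metrics, and only for dense float64 input with Y left as None; everything else falls through to the stock scikit-learn code. A short sketch of the offloaded case (synthetic data; import path taken from the metrics __init__ shown earlier):

```python
import numpy as np

# re-exported in daal4py/sklearn/metrics/__init__.py as shown earlier in this diff
from daal4py.sklearn.metrics import pairwise_distances

X = np.random.RandomState(0).rand(6, 4)     # dense float64, Y is left as None
D = pairwise_distances(X, metric="cosine")  # meets the conditions for the daal4py branch

print(D.shape)                                  # (6, 6)
print(np.allclose(np.diag(D), 0.0, atol=1e-8))  # each row has ~zero distance to itself
```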
--- a/daal4py/sklearn/metrics/_ranking.py +++ b/daal4py/sklearn/metrics/_ranking.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,37 +12,44 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== -import daal4py as d4p -import numpy as np -from functools import partial +import logging from collections.abc import Sequence -from scipy import sparse as sp +from functools import partial +import numpy as np +from scipy import sparse as sp +from sklearn.preprocessing import label_binarize from sklearn.utils import check_array from sklearn.utils.multiclass import is_multilabel -from sklearn.preprocessing import label_binarize -from ..utils.validation import _assert_all_finite -from .._utils import get_patch_message, sklearn_check_version, PatchingConditionsChain +import daal4py as d4p + from .._device_offload import support_usm_ndarray -import logging +from .._utils import PatchingConditionsChain, get_patch_message, sklearn_check_version +from ..utils.validation import _assert_all_finite -if sklearn_check_version('0.22'): - from sklearn.metrics._ranking import _multiclass_roc_auc_score as \ - multiclass_roc_auc_score - from sklearn.metrics._ranking import _binary_roc_auc_score +if sklearn_check_version("0.22"): from sklearn.metrics._base import _average_binary_score + from sklearn.metrics._ranking import _binary_roc_auc_score + from sklearn.metrics._ranking import ( + _multiclass_roc_auc_score as multiclass_roc_auc_score, + ) else: from sklearn.metrics.ranking import roc_auc_score as multiclass_roc_auc_score -if sklearn_check_version('1.3'): +if sklearn_check_version("1.3"): from sklearn.utils._param_validation import ( - validate_params, Interval, Real, StrOptions) + Interval, + Real, + StrOptions, + validate_params, + ) try: import pandas as pd + pandas_is_imported = True except ImportError: pandas_is_imported = False @@ -50,47 +57,52 @@ def _daal_type_of_target(y): valid = ( - isinstance( - y, Sequence) or sp.isspmatrix(y) or hasattr( - y, '__array__')) and not isinstance( - y, str) + isinstance(y, Sequence) or sp.isspmatrix(y) or hasattr(y, "__array__") + ) and not isinstance(y, str) if not valid: - raise ValueError('Expected array-like (array or non-string sequence), ' - 'got %r' % y) + raise ValueError( + "Expected array-like (array or non-string sequence), " "got %r" % y + ) - sparse_pandas = (y.__class__.__name__ in ['SparseSeries', 'SparseArray']) + sparse_pandas = y.__class__.__name__ in ["SparseSeries", "SparseArray"] if sparse_pandas: raise ValueError("y cannot be class 'SparseSeries' or 'SparseArray'") if is_multilabel(y): - return 'multilabel-indicator' + return "multilabel-indicator" try: y = np.asarray(y) except ValueError: # Known to fail in numpy 1.3 for array of arrays - return 'unknown' + return "unknown" # The old sequence of sequences format try: - if not hasattr(y[0], '__array__') and isinstance( - y[0], Sequence) and not isinstance(y[0], str): - raise ValueError('You appear to be using a legacy multi-label data' - ' representation. 
Sequence of sequences are no' - ' longer supported; use a binary array or sparse' - ' matrix instead - the MultiLabelBinarizer' - ' transformer can convert to this format.') + if ( + not hasattr(y[0], "__array__") + and isinstance(y[0], Sequence) + and not isinstance(y[0], str) + ): + raise ValueError( + "You appear to be using a legacy multi-label data" + " representation. Sequence of sequences are no" + " longer supported; use a binary array or sparse" + " matrix instead - the MultiLabelBinarizer" + " transformer can convert to this format." + ) except IndexError: pass # Invalid inputs - if y.ndim > 2 or (y.dtype == object and len( - y) != 0 and not isinstance(y.flat[0], str)): - return 'unknown' # [[[1, 2]]] or [obj_1] and not ["label_1"] + if y.ndim > 2 or ( + y.dtype == object and len(y) != 0 and not isinstance(y.flat[0], str) + ): + return "unknown" # [[[1, 2]]] or [obj_1] and not ["label_1"] if y.ndim == 2 and y.shape[1] == 0: - return 'unknown' # [[]] + return "unknown" # [[]] if y.ndim == 2 and y.shape[1] > 1: suffix = "-multioutput" # [[1, 2], [1, 2]] @@ -98,20 +110,18 @@ def _daal_type_of_target(y): suffix = "" # [1, 2, 3] or [[1], [2], [3]] # check float and contains non-integer float values - if y.dtype.kind == 'f' and np.any(y != y.astype(int)): + if y.dtype.kind == "f" and np.any(y != y.astype(int)): # [.1, .2, 3] or [[.1, .2, 3]] or [[1., .2]] and not [1., 2., 3.] _assert_all_finite(y) - return 'continuous' + suffix + return "continuous" + suffix - unique = np.sort( - pd.unique( - y.ravel())) if pandas_is_imported else np.unique(y) + unique = np.sort(pd.unique(y.ravel())) if pandas_is_imported else np.unique(y) if (len(unique) > 2) or (y.ndim >= 2 and len(y[0]) > 1): # [1, 2, 3] or [[1., 2., 3]] or [[1, 2]] - result = ('multiclass' + suffix, None) + result = ("multiclass" + suffix, None) else: - result = ('binary', unique) # [1, 2] or [["a"], ["b"]] + result = ("binary", unique) # [1, 2] or [["a"], ["b"]] return result @@ -130,46 +140,57 @@ def roc_auc_score( y_true = check_array(y_true, ensure_2d=False, dtype=None) y_score = check_array(y_score, ensure_2d=False) - _patching_status = PatchingConditionsChain( - "sklearn.metrics.roc_auc_score") - _dal_ready = _patching_status.and_conditions([ - (y_type[0] == "binary" and not (y_score.ndim == 2 and y_score.shape[1] > 2), - "y_true type is not one-dimensional binary.") - ]) + _patching_status = PatchingConditionsChain("sklearn.metrics.roc_auc_score") + _dal_ready = _patching_status.and_conditions( + [ + ( + y_type[0] == "binary" + and not (y_score.ndim == 2 and y_score.shape[1] > 2), + "y_true type is not one-dimensional binary.", + ) + ] + ) _patching_status.write_log() if y_type[0] == "multiclass" or ( y_type[0] == "binary" and y_score.ndim == 2 and y_score.shape[1] > 2 ): # do not support partial ROC computation for multiclass - if max_fpr is not None and max_fpr != 1.: - raise ValueError("Partial AUC computation not available in " - "multiclass setting, 'max_fpr' must be" - " set to `None`, received `max_fpr={0}` " - "instead".format(max_fpr)) - if multi_class == 'raise': + if max_fpr is not None and max_fpr != 1.0: + raise ValueError( + "Partial AUC computation not available in " + "multiclass setting, 'max_fpr' must be" + " set to `None`, received `max_fpr={0}` " + "instead".format(max_fpr) + ) + if multi_class == "raise": raise ValueError("multi_class must be in ('ovo', 'ovr')") return multiclass_roc_auc_score( - y_true, y_score, labels, multi_class, average, sample_weight) + y_true, y_score, labels, multi_class, 
average, sample_weight + ) if y_type[0] == "binary": labels = y_type[1] - _dal_ready = _patching_status.and_conditions([ - (len(labels) == 2, "Number of unique labels is not equal to 2."), - (max_fpr is None, "Maximum false-positive rate is not supported."), - (sample_weight is None, "Sample weights are not supported.")]) + _dal_ready = _patching_status.and_conditions( + [ + (len(labels) == 2, "Number of unique labels is not equal to 2."), + (max_fpr is None, "Maximum false-positive rate is not supported."), + (sample_weight is None, "Sample weights are not supported."), + ] + ) if _dal_ready: if not np.array_equal(labels, [0, 1]) or labels.dtype == bool: y_true = label_binarize(y_true, classes=labels)[:, 0] - if hasattr(y_score, 'dtype') and y_score.dtype == bool: + if hasattr(y_score, "dtype") and y_score.dtype == bool: y_score = label_binarize(y_score, classes=labels)[:, 0] - result = d4p.daal_roc_auc_score(y_true.reshape(-1, 1), - y_score.reshape(-1, 1)) + result = d4p.daal_roc_auc_score(y_true.reshape(-1, 1), y_score.reshape(-1, 1)) if result != -1: return result - logging.info("sklearn.metrics.roc_auc_score: " + get_patch_message( - "sklearn_after_daal")) + logging.info( + "sklearn.metrics.roc_auc_score: " + + get_patch_message("sklearn_after_daal") + ) # return to sklearn implementation y_true = label_binarize(y_true, classes=labels)[:, 0] @@ -182,7 +203,7 @@ def roc_auc_score( ) -if sklearn_check_version('1.3'): +if sklearn_check_version("1.3"): roc_auc_score = validate_params( { "y_true": ["array-like"], @@ -192,5 +213,6 @@ def roc_auc_score( "max_fpr": [Interval(Real, 0.0, 1, closed="right"), None], "multi_class": [StrOptions({"raise", "ovr", "ovo"})], "labels": ["array-like", None], - }, prefer_skip_nested_validation=True + }, + prefer_skip_nested_validation=True, )(roc_auc_score) diff --git a/daal4py/sklearn/model_selection/__init__.py b/daal4py/sklearn/model_selection/__init__.py index 4fd1ce1bc3..f1d827da72 100644 --- a/daal4py/sklearn/model_selection/__init__.py +++ b/daal4py/sklearn/model_selection/__init__.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2014 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,8 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from ._split import train_test_split -__all__ = ['train_test_split'] +__all__ = ["train_test_split"] diff --git a/daal4py/sklearn/model_selection/_split.py b/daal4py/sklearn/model_selection/_split.py index b4e2c58107..f60f57af28 100644 --- a/daal4py/sklearn/model_selection/_split.py +++ b/daal4py/sklearn/model_selection/_split.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2014 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,16 +13,19 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== +import platform + +import numpy as np +from sklearn.model_selection import ShuffleSplit, StratifiedShuffleSplit +from sklearn.model_selection._split import _validate_shuffle_split from sklearn.utils import indexable from sklearn.utils.validation import _num_samples -from sklearn.model_selection import StratifiedShuffleSplit, ShuffleSplit -from sklearn.model_selection._split import _validate_shuffle_split + import daal4py as d4p -import numpy as np from daal4py.sklearn._utils import PatchingConditionsChain -import platform + from .._device_offload import support_usm_ndarray from .._utils import sklearn_check_version @@ -33,28 +36,30 @@ try: import mkl_random + mkl_random_is_imported = True except (ImportError, ModuleNotFoundError): mkl_random_is_imported = False try: import pandas as pd + pandas_is_imported = True except (ImportError, ModuleNotFoundError): pandas_is_imported = False -if sklearn_check_version('1.3'): +if sklearn_check_version("1.3"): import numbers - from sklearn.utils._param_validation import ( - validate_params, Interval, RealNotInt) + + from sklearn.utils._param_validation import Interval, RealNotInt, validate_params def get_dtypes(data): - if hasattr(data, 'dtype'): + if hasattr(data, "dtype"): return [data.dtype] - if hasattr(data, 'dtypes'): + if hasattr(data, "dtypes"): return list(data.dtypes) - if hasattr(data, 'values'): + if hasattr(data, "values"): return [data.values.dtype] return None @@ -64,20 +69,32 @@ def train_test_split(*arrays, **options): n_arrays = len(arrays) if n_arrays == 0: raise ValueError("At least one array required as input") - test_size = options.pop('test_size', None) - train_size = options.pop('train_size', None) - random_state = options.pop('random_state', None) - stratify = options.pop('stratify', None) - shuffle = options.pop('shuffle', True) - rng = options.pop('rng', 'OPTIMIZED_MT19937') - - available_rngs = ['default', 'MT19937', 'SFMT19937', 'MT2203', 'R250', - 'WH', 'MCG31', 'MCG59', 'MRG32K3A', 'PHILOX4X32X10', - 'NONDETERM', 'OPTIMIZED_MT19937'] + test_size = options.pop("test_size", None) + train_size = options.pop("train_size", None) + random_state = options.pop("random_state", None) + stratify = options.pop("stratify", None) + shuffle = options.pop("shuffle", True) + rng = options.pop("rng", "OPTIMIZED_MT19937") + + available_rngs = [ + "default", + "MT19937", + "SFMT19937", + "MT2203", + "R250", + "WH", + "MCG31", + "MCG59", + "MRG32K3A", + "PHILOX4X32X10", + "NONDETERM", + "OPTIMIZED_MT19937", + ] if rng not in available_rngs: raise ValueError( "Wrong random numbers generator is chosen. 
" - "Available generators: %s" % str(available_rngs)[1:-1]) + "Available generators: %s" % str(available_rngs)[1:-1] + ) if options: raise TypeError("Invalid parameters passed: %s" % str(options)) @@ -91,83 +108,99 @@ def train_test_split(*arrays, **options): if shuffle is False: if stratify is not None: raise ValueError( - "Stratified train/test split is not implemented for shuffle=False") + "Stratified train/test split is not implemented for shuffle=False" + ) train = np.arange(n_train) test = np.arange(n_train, n_train + n_test) else: if stratify is not None: cv = StratifiedShuffleSplit( - test_size=n_test, - train_size=n_train, - random_state=random_state + test_size=n_test, train_size=n_train, random_state=random_state ) train, test = next(cv.split(X=arrays[0], y=stratify)) else: - if mkl_random_is_imported and \ - rng not in ['default', 'OPTIMIZED_MT19937'] and \ - (isinstance(random_state, int) or random_state is None): + if ( + mkl_random_is_imported + and rng not in ["default", "OPTIMIZED_MT19937"] + and (isinstance(random_state, int) or random_state is None) + ): random_state = mkl_random.RandomState(random_state, rng) indexes = random_state.permutation(n_samples) - test, train = indexes[:n_test], indexes[n_test:( - n_test + n_train)] - elif rng == 'OPTIMIZED_MT19937' and \ - (isinstance(random_state, int) or random_state is None) and \ - platform.system() != 'Windows': + test, train = indexes[:n_test], indexes[n_test : (n_test + n_train)] + elif ( + rng == "OPTIMIZED_MT19937" + and (isinstance(random_state, int) or random_state is None) + and platform.system() != "Windows" + ): indexes = np.empty( shape=(n_samples,), - dtype=np.int64 if n_train + n_test > 2 ** 31 - 1 else np.int32 + dtype=np.int64 if n_train + n_test > 2**31 - 1 else np.int32, ) random_state = np.random.RandomState(random_state) random_state = random_state.get_state()[1] d4p.daal_generate_shuffled_indices([indexes], [random_state]) - test, train = indexes[:n_test], indexes[n_test:( - n_test + n_train)] + test, train = indexes[:n_test], indexes[n_test : (n_test + n_train)] else: cv = ShuffleSplit( - test_size=n_test, - train_size=n_train, - random_state=random_state + test_size=n_test, train_size=n_train, random_state=random_state ) train, test = next(cv.split(X=arrays[0], y=stratify)) res = [] for arr in arrays: _patching_status = PatchingConditionsChain( - "sklearn.model_selection.train_test_split") + "sklearn.model_selection.train_test_split" + ) # input format check - _patching_status.and_conditions([ - (isinstance(arr, np.ndarray), "The input is not a np.ndarray object.")]) + _patching_status.and_conditions( + [(isinstance(arr, np.ndarray), "The input is not a np.ndarray object.")] + ) if pandas_is_imported: - _patching_status.or_conditions([ - (isinstance(arr, pd.core.frame.DataFrame), - "The input is not a pd.DataFrame object."), - (isinstance(arr, pd.core.series.Series), - "The input is not a pd.Series object.") - ], conditions_merging=any) + _patching_status.or_conditions( + [ + ( + isinstance(arr, pd.core.frame.DataFrame), + "The input is not a pd.DataFrame object.", + ), + ( + isinstance(arr, pd.core.series.Series), + "The input is not a pd.Series object.", + ), + ], + conditions_merging=any, + ) # dimensions check - _dal_ready = _patching_status.and_conditions([ - (hasattr(arr, 'ndim'), "The input does not have 'ndim' attribute.")]) - if hasattr(arr, 'ndim'): - _patching_status.and_conditions([ - (arr.ndim <= 2, "The input has more than 2 dimensions.")]) + _dal_ready = _patching_status.and_conditions( 
+ [(hasattr(arr, "ndim"), "The input does not have 'ndim' attribute.")] + ) + if hasattr(arr, "ndim"): + _patching_status.and_conditions( + [(arr.ndim <= 2, "The input has more than 2 dimensions.")] + ) # data types check dtypes = get_dtypes(arr) - _dal_ready = _patching_status.and_conditions([ - (dtypes is not None, "Unable to parse input data types.")]) + _dal_ready = _patching_status.and_conditions( + [(dtypes is not None, "Unable to parse input data types.")] + ) if dtypes is not None: incorrect_dtype = None for i, dtype in enumerate(dtypes): - if 'float' not in str(dtype) and 'int' not in str(dtype): + if "float" not in str(dtype) and "int" not in str(dtype): incorrect_dtype = str(dtype) break - _dal_ready = _patching_status.and_conditions([ - (incorrect_dtype is None, - f"Input has incorrect data type '{incorrect_dtype}'. " - "Only integer and floating point types are supported.")]) + _dal_ready = _patching_status.and_conditions( + [ + ( + incorrect_dtype is None, + f"Input has incorrect data type '{incorrect_dtype}'. " + "Only integer and floating point types are supported.", + ) + ] + ) _patching_status.write_log() if not _dal_ready: @@ -185,10 +218,10 @@ def train_test_split(*arrays, **options): if not isinstance(arr_copy, list): arr_copy = arr_copy.reshape( (arr_copy.shape[0], n_cols), - order='A', + order="A", ) if isinstance(arr_copy, np.ndarray): - order = 'C' if arr_copy.flags['C_CONTIGUOUS'] else 'F' + order = "C" if arr_copy.flags["C_CONTIGUOUS"] else "F" train_arr = np.empty( shape=(n_train, n_cols), dtype=arr_copy.dtype, @@ -199,55 +232,56 @@ def train_test_split(*arrays, **options): dtype=arr_copy.dtype, order=order, ) - d4p.daal_train_test_split( - arr_copy, train_arr, test_arr, [train], [test] - ) + d4p.daal_train_test_split(arr_copy, train_arr, test_arr, [train], [test]) if reshape_later: - train_arr, test_arr = train_arr.reshape( - (n_train,)), test_arr.reshape((n_test,)) + train_arr, test_arr = train_arr.reshape((n_train,)), test_arr.reshape( + (n_test,) + ) elif isinstance(arr_copy, list): train_arr = [ np.empty( shape=(n_train,), dtype=el.dtype, - order='C' if el.flags['C_CONTIGUOUS'] else 'F', - ) for el in arr_copy + order="C" if el.flags["C_CONTIGUOUS"] else "F", + ) + for el in arr_copy ] test_arr = [ np.empty( shape=(n_test,), dtype=el.dtype, - order='C' if el.flags['C_CONTIGUOUS'] else 'F' - ) for el in arr_copy + order="C" if el.flags["C_CONTIGUOUS"] else "F", + ) + for el in arr_copy ] - d4p.daal_train_test_split( - arr_copy, train_arr, test_arr, [train], [test]) - train_arr = {col: train_arr[i] - for i, col in enumerate(arr.columns)} - test_arr = {col: test_arr[i] - for i, col in enumerate(arr.columns)} + d4p.daal_train_test_split(arr_copy, train_arr, test_arr, [train], [test]) + train_arr = {col: train_arr[i] for i, col in enumerate(arr.columns)} + test_arr = {col: test_arr[i] for i, col in enumerate(arr.columns)} else: - raise ValueError('Array can\'t be converted to needed format') + raise ValueError("Array can't be converted to needed format") if pandas_is_imported: if isinstance(arr, pd.core.frame.DataFrame): - train_arr, test_arr = pd.DataFrame(train_arr, columns=arr.columns), \ - pd.DataFrame(test_arr, columns=arr.columns) + train_arr, test_arr = pd.DataFrame( + train_arr, columns=arr.columns + ), pd.DataFrame(test_arr, columns=arr.columns) if isinstance(arr, pd.core.series.Series): - train_arr, test_arr = \ - train_arr.reshape(n_train), test_arr.reshape(n_test) - train_arr, test_arr = pd.Series(train_arr, name=arr.name), \ - pd.Series(test_arr, 
name=arr.name) + train_arr, test_arr = train_arr.reshape(n_train), test_arr.reshape( + n_test + ) + train_arr, test_arr = pd.Series(train_arr, name=arr.name), pd.Series( + test_arr, name=arr.name + ) - if hasattr(arr, 'index'): + if hasattr(arr, "index"): train_arr.index = train test_arr.index = test - if hasattr(arr, 'columns'): + if hasattr(arr, "columns"): train_arr.columns = arr.columns test_arr.columns = arr.columns - if hasattr(arr, 'name'): + if hasattr(arr, "name"): train_arr.name = arr.name test_arr.name = arr.name @@ -257,19 +291,22 @@ def train_test_split(*arrays, **options): return res -if sklearn_check_version('1.3'): - train_test_split = validate_params({ - "test_size": [ - Interval(RealNotInt, 0, 1, closed="neither"), - Interval(numbers.Integral, 1, None, closed="left"), - None, - ], - "train_size": [ - Interval(RealNotInt, 0, 1, closed="neither"), - Interval(numbers.Integral, 1, None, closed="left"), - None, - ], - "random_state": ["random_state"], - "shuffle": ["boolean"], - "stratify": ["array-like", None], - }, prefer_skip_nested_validation=True)(train_test_split) +if sklearn_check_version("1.3"): + train_test_split = validate_params( + { + "test_size": [ + Interval(RealNotInt, 0, 1, closed="neither"), + Interval(numbers.Integral, 1, None, closed="left"), + None, + ], + "train_size": [ + Interval(RealNotInt, 0, 1, closed="neither"), + Interval(numbers.Integral, 1, None, closed="left"), + None, + ], + "random_state": ["random_state"], + "shuffle": ["boolean"], + "stratify": ["array-like", None], + }, + prefer_skip_nested_validation=True, + )(train_test_split) diff --git a/daal4py/sklearn/model_selection/tests/test_split.py b/daal4py/sklearn/model_selection/tests/test_split.py index 8922ac4d0c..037188135d 100644 --- a/daal4py/sklearn/model_selection/tests/test_split.py +++ b/daal4py/sklearn/model_selection/tests/test_split.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,43 +12,45 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== import numpy as np import pytest -from sklearn.model_selection import train_test_split as skl_train_test_split -from daal4py.sklearn.model_selection import train_test_split as d4p_train_test_split -from daal4py.sklearn._utils import daal_check_version from sklearn.datasets import make_classification +from sklearn.model_selection import train_test_split as skl_train_test_split +from daal4py.sklearn._utils import daal_check_version +from daal4py.sklearn.model_selection import train_test_split as d4p_train_test_split -N_SAMPLES = [2 ** i + 1 for i in range(2, 17)] +N_SAMPLES = [2**i + 1 for i in range(2, 17)] RANDOM_STATE = 777 @pytest.mark.skipif( - not daal_check_version((2021, 'P', 400)), - reason='train_test_split has bugfix since 2021.4 release') -@pytest.mark.parametrize('n_samples', N_SAMPLES) + not daal_check_version((2021, "P", 400)), + reason="train_test_split has bugfix since 2021.4 release", +) +@pytest.mark.parametrize("n_samples", N_SAMPLES) def test_results_similarity(n_samples): x, y = make_classification( - n_samples=n_samples, n_features=4, random_state=RANDOM_STATE) + n_samples=n_samples, n_features=4, random_state=RANDOM_STATE + ) d4p_res = d4p_train_test_split( x, y, test_size=n_samples // 2 - 1, train_size=n_samples // 2 - 1, - random_state=RANDOM_STATE) + random_state=RANDOM_STATE, + ) skl_res = skl_train_test_split( x, y, test_size=n_samples // 2 - 1, train_size=n_samples // 2 - 1, - random_state=RANDOM_STATE) + random_state=RANDOM_STATE, + ) - assert len(d4p_res) == len( - skl_res), 'train_test_splits have different output size' + assert len(d4p_res) == len(skl_res), "train_test_splits have different output size" for i, _ in enumerate(d4p_res): - assert np.all(d4p_res[i] == skl_res[i] - ), 'train_test_splits have different output' + assert np.all(d4p_res[i] == skl_res[i]), "train_test_splits have different output" diff --git a/daal4py/sklearn/monkeypatch/dispatcher.py b/daal4py/sklearn/monkeypatch/dispatcher.py index e8c3b02725..93745d66e8 100755 --- a/daal4py/sklearn/monkeypatch/dispatcher.py +++ b/daal4py/sklearn/monkeypatch/dispatcher.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2014 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,90 +12,120 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== -from daal4py.sklearn._utils import set_idp_sklearn_verbose -from ..neighbors import KNeighborsRegressor as KNeighborsRegressor_daal4py -from ..neighbors import NearestNeighbors as NearestNeighbors_daal4py -from ..neighbors import KNeighborsClassifier as KNeighborsClassifier_daal4py -from ..model_selection import train_test_split -from ..utils.validation import _assert_all_finite -from ..svm.svm import SVC as SVC_daal4py -from ..ensemble._forest import RandomForestClassifier as RandomForestClassifier_daal4py -from ..ensemble._forest import RandomForestRegressor as RandomForestRegressor_daal4py -from ..metrics import roc_auc_score -from ..metrics import pairwise_distances -from ..cluster.k_means import KMeans as KMeans_daal4py -from ..cluster.dbscan import DBSCAN as DBSCAN_daal4py -from ..linear_model.coordinate_descent import Lasso as Lasso_daal4py -from ..linear_model.coordinate_descent import ElasticNet as ElasticNet_daal4py -from ..linear_model.linear import LinearRegression as LinearRegression_daal4py -from ..linear_model.ridge import Ridge as Ridge_daal4py -from ..linear_model.logistic_path import LogisticRegression as LogisticRegression_daal4py -from ..linear_model.logistic_path import logistic_regression_path as \ - daal_optimized_logistic_path -from ..decomposition._pca import PCA as PCA_daal4py -from ..manifold import TSNE as TSNE_daal4py -from sklearn import model_selection -from sklearn import metrics -from sklearn.utils import validation import sys +import warnings from functools import lru_cache import sklearn.cluster as cluster_module -import sklearn.ensemble as ensemble_module -import sklearn.svm as svm_module -import sklearn.linear_model._logistic as logistic_module -import sklearn.neighbors as neighbors_module import sklearn.decomposition as decomposition_module +import sklearn.ensemble as ensemble_module import sklearn.linear_model as linear_model_module +import sklearn.linear_model._logistic as logistic_module import sklearn.manifold as manifold_module +import sklearn.neighbors as neighbors_module +import sklearn.svm as svm_module +from sklearn import metrics, model_selection +from sklearn.utils import validation -import warnings +from daal4py.sklearn._utils import set_idp_sklearn_verbose + +from ..cluster.dbscan import DBSCAN as DBSCAN_daal4py +from ..cluster.k_means import KMeans as KMeans_daal4py +from ..decomposition._pca import PCA as PCA_daal4py +from ..ensemble._forest import RandomForestClassifier as RandomForestClassifier_daal4py +from ..ensemble._forest import RandomForestRegressor as RandomForestRegressor_daal4py +from ..linear_model.coordinate_descent import ElasticNet as ElasticNet_daal4py +from ..linear_model.coordinate_descent import Lasso as Lasso_daal4py +from ..linear_model.linear import LinearRegression as LinearRegression_daal4py +from ..linear_model.logistic_path import LogisticRegression as LogisticRegression_daal4py +from ..linear_model.logistic_path import ( + logistic_regression_path as daal_optimized_logistic_path, +) +from ..linear_model.ridge import Ridge as Ridge_daal4py +from ..manifold import TSNE as TSNE_daal4py +from ..metrics import pairwise_distances, roc_auc_score +from ..model_selection import train_test_split +from ..neighbors import KNeighborsClassifier as KNeighborsClassifier_daal4py +from ..neighbors import KNeighborsRegressor as KNeighborsRegressor_daal4py +from 
..neighbors import NearestNeighbors as NearestNeighbors_daal4py +from ..svm.svm import SVC as SVC_daal4py +from ..utils.validation import _assert_all_finite @lru_cache(maxsize=None) def _get_map_of_algorithms(): mapping = { - 'pca': [[(decomposition_module, 'PCA', PCA_daal4py), None]], - 'kmeans': [[(cluster_module, 'KMeans', KMeans_daal4py), None]], - 'dbscan': [[(cluster_module, 'DBSCAN', DBSCAN_daal4py), None]], - 'distances': [[(metrics, 'pairwise_distances', pairwise_distances), None]], - 'linear': [[(linear_model_module, 'LinearRegression', - LinearRegression_daal4py), None]], - 'ridge': [[(linear_model_module, 'Ridge', Ridge_daal4py), None]], - 'elasticnet': [[(linear_model_module, 'ElasticNet', ElasticNet_daal4py), None]], - 'lasso': [[(linear_model_module, 'Lasso', Lasso_daal4py), None]], - 'svm': [[(svm_module, 'SVC', SVC_daal4py), None]], - 'logistic': [[(logistic_module, '_logistic_regression_path', - daal_optimized_logistic_path), None]], - 'log_reg': [[(linear_model_module, 'LogisticRegression', - LogisticRegression_daal4py), None]], - 'knn_classifier': [[(neighbors_module, 'KNeighborsClassifier', - KNeighborsClassifier_daal4py), None]], - 'nearest_neighbors': [[(neighbors_module, 'NearestNeighbors', - NearestNeighbors_daal4py), None]], - 'knn_regressor': [[(neighbors_module, 'KNeighborsRegressor', - KNeighborsRegressor_daal4py), None]], - 'random_forest_classifier': [[(ensemble_module, 'RandomForestClassifier', - RandomForestClassifier_daal4py), None]], - 'random_forest_regressor': [[(ensemble_module, 'RandomForestRegressor', - RandomForestRegressor_daal4py), None]], - 'train_test_split': [[(model_selection, 'train_test_split', - train_test_split), None]], - 'fin_check': [[(validation, '_assert_all_finite', - _assert_all_finite), None]], - 'roc_auc_score': [[(metrics, 'roc_auc_score', - roc_auc_score), None]], - 'tsne': [[(manifold_module, 'TSNE', TSNE_daal4py), None]], + "pca": [[(decomposition_module, "PCA", PCA_daal4py), None]], + "kmeans": [[(cluster_module, "KMeans", KMeans_daal4py), None]], + "dbscan": [[(cluster_module, "DBSCAN", DBSCAN_daal4py), None]], + "distances": [[(metrics, "pairwise_distances", pairwise_distances), None]], + "linear": [ + [(linear_model_module, "LinearRegression", LinearRegression_daal4py), None] + ], + "ridge": [[(linear_model_module, "Ridge", Ridge_daal4py), None]], + "elasticnet": [[(linear_model_module, "ElasticNet", ElasticNet_daal4py), None]], + "lasso": [[(linear_model_module, "Lasso", Lasso_daal4py), None]], + "svm": [[(svm_module, "SVC", SVC_daal4py), None]], + "logistic": [ + [ + ( + logistic_module, + "_logistic_regression_path", + daal_optimized_logistic_path, + ), + None, + ] + ], + "log_reg": [ + [ + (linear_model_module, "LogisticRegression", LogisticRegression_daal4py), + None, + ] + ], + "knn_classifier": [ + [ + (neighbors_module, "KNeighborsClassifier", KNeighborsClassifier_daal4py), + None, + ] + ], + "nearest_neighbors": [ + [(neighbors_module, "NearestNeighbors", NearestNeighbors_daal4py), None] + ], + "knn_regressor": [ + [(neighbors_module, "KNeighborsRegressor", KNeighborsRegressor_daal4py), None] + ], + "random_forest_classifier": [ + [ + ( + ensemble_module, + "RandomForestClassifier", + RandomForestClassifier_daal4py, + ), + None, + ] + ], + "random_forest_regressor": [ + [ + (ensemble_module, "RandomForestRegressor", RandomForestRegressor_daal4py), + None, + ] + ], + "train_test_split": [ + [(model_selection, "train_test_split", train_test_split), None] + ], + "fin_check": [[(validation, "_assert_all_finite", 
_assert_all_finite), None]], + "roc_auc_score": [[(metrics, "roc_auc_score", roc_auc_score), None]], + "tsne": [[(manifold_module, "TSNE", TSNE_daal4py), None]], } - mapping['svc'] = mapping['svm'] - mapping['logisticregression'] = mapping['log_reg'] - mapping['kneighborsclassifier'] = mapping['knn_classifier'] - mapping['nearestneighbors'] = mapping['nearest_neighbors'] - mapping['kneighborsregressor'] = mapping['knn_regressor'] - mapping['randomforestclassifier'] = mapping['random_forest_classifier'] - mapping['randomforestregressor'] = mapping['random_forest_regressor'] + mapping["svc"] = mapping["svm"] + mapping["logisticregression"] = mapping["log_reg"] + mapping["kneighborsclassifier"] = mapping["knn_classifier"] + mapping["nearestneighbors"] = mapping["nearest_neighbors"] + mapping["kneighborsregressor"] = mapping["knn_regressor"] + mapping["randomforestclassifier"] = mapping["random_forest_classifier"] + mapping["randomforestregressor"] = mapping["random_forest_regressor"] mapping["linearregression"] = mapping["linear"] mapping["logisticregression"] = mapping["log_reg"] mapping["_logistic_regression_path"] = mapping["logistic"] @@ -135,23 +165,28 @@ def enable(name=None, verbose=True, deprecation=True, get_map=_get_map_of_algori do_patch(key, get_map) if deprecation: set_idp_sklearn_verbose() - warnings.warn_explicit("\nScikit-learn patching with daal4py is deprecated " - "and will be removed in the future.\n" - "Use Intel(R) Extension " - "for Scikit-learn* module instead " - "(pip install scikit-learn-intelex).\n" - "To enable patching, please use one of the " - "following options:\n" - "1) From the command line:\n" - " python -m sklearnex \n" - "2) From your script:\n" - " from sklearnex import patch_sklearn\n" - " patch_sklearn()", - FutureWarning, "dispatcher.py", 151) + warnings.warn_explicit( + "\nScikit-learn patching with daal4py is deprecated " + "and will be removed in the future.\n" + "Use Intel(R) Extension " + "for Scikit-learn* module instead " + "(pip install scikit-learn-intelex).\n" + "To enable patching, please use one of the " + "following options:\n" + "1) From the command line:\n" + " python -m sklearnex \n" + "2) From your script:\n" + " from sklearnex import patch_sklearn\n" + " patch_sklearn()", + FutureWarning, + "dispatcher.py", + 151, + ) if verbose and deprecation and sys.stderr is not None: sys.stderr.write( "Intel(R) oneAPI Data Analytics Library solvers for sklearn enabled: " - "https://intelpython.github.io/daal4py/sklearn.html\n") + "https://intelpython.github.io/daal4py/sklearn.html\n" + ) def disable(name=None, get_map=_get_map_of_algorithms): diff --git a/daal4py/sklearn/monkeypatch/tests/_models_info.py b/daal4py/sklearn/monkeypatch/tests/_models_info.py index aa1f3b6f81..47dbf138e0 100644 --- a/daal4py/sklearn/monkeypatch/tests/_models_info.py +++ b/daal4py/sklearn/monkeypatch/tests/_models_info.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,105 +12,113 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== import numpy as np - -from sklearn.svm import SVC -from sklearn.ensemble import (RandomForestClassifier, RandomForestRegressor) -from sklearn.neighbors import ( - KNeighborsClassifier, - KNeighborsRegressor, - NearestNeighbors) +from sklearn.cluster import DBSCAN, KMeans +from sklearn.decomposition import PCA +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor from sklearn.linear_model import ( + ElasticNet, + Lasso, + LinearRegression, LogisticRegression, LogisticRegressionCV, - LinearRegression, Ridge, - ElasticNet, - Lasso) -from sklearn.cluster import (KMeans, DBSCAN) +) from sklearn.manifold import TSNE -from sklearn.decomposition import PCA +from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor, NearestNeighbors +from sklearn.svm import SVC + from daal4py.sklearn._utils import daal_check_version MODELS_INFO = [ { - 'model': KNeighborsClassifier(algorithm='brute'), - 'methods': ['kneighbors', 'predict', 'predict_proba', 'score'], - 'dataset': 'classifier', + "model": KNeighborsClassifier(algorithm="brute"), + "methods": ["kneighbors", "predict", "predict_proba", "score"], + "dataset": "classifier", }, { - 'model': KNeighborsRegressor(algorithm='brute'), - 'methods': ['kneighbors', 'predict', 'score'], - 'dataset': 'regression', + "model": KNeighborsRegressor(algorithm="brute"), + "methods": ["kneighbors", "predict", "score"], + "dataset": "regression", }, { - 'model': NearestNeighbors(algorithm='brute'), - 'methods': ['kneighbors'], - 'dataset': 'blobs', + "model": NearestNeighbors(algorithm="brute"), + "methods": ["kneighbors"], + "dataset": "blobs", }, { - 'model': DBSCAN(), - 'methods': ['fit_predict'], - 'dataset': 'blobs', + "model": DBSCAN(), + "methods": ["fit_predict"], + "dataset": "blobs", }, { - 'model': SVC(probability=True), - 'methods': ['decision_function', 'predict', 'predict_proba', 'score'], - 'dataset': 'classifier', + "model": SVC(probability=True), + "methods": ["decision_function", "predict", "predict_proba", "score"], + "dataset": "classifier", }, { - 'model': KMeans(), - 'methods': ['fit_predict', 'fit_transform', 'transform', 'predict', 'score'], - 'dataset': 'blobs', + "model": KMeans(), + "methods": ["fit_predict", "fit_transform", "transform", "predict", "score"], + "dataset": "blobs", }, { - 'model': ElasticNet(), - 'methods': ['predict', 'score'], - 'dataset': 'regression', + "model": ElasticNet(), + "methods": ["predict", "score"], + "dataset": "regression", }, { - 'model': Lasso(), - 'methods': ['predict', 'score'], - 'dataset': 'regression', + "model": Lasso(), + "methods": ["predict", "score"], + "dataset": "regression", }, { - 'model': PCA(), - 'methods': ['fit_transform', 'transform', 'score'], - 'dataset': 'classifier', + "model": PCA(), + "methods": ["fit_transform", "transform", "score"], + "dataset": "classifier", }, { - 'model': RandomForestClassifier(n_estimators=10), - 'methods': ['predict', 'predict_proba', 'predict_log_proba', 'score'], - 'dataset': 'classifier', + "model": RandomForestClassifier(n_estimators=10), + "methods": ["predict", "predict_proba", "predict_log_proba", "score"], + "dataset": "classifier", }, { - 'model': LogisticRegression(max_iter=100, multi_class='multinomial'), - 'methods': ['decision_function', 'predict', 'predict_proba', - 'predict_log_proba', 'score'], - 'dataset': 'classifier', + "model": 
LogisticRegression(max_iter=100, multi_class="multinomial"), + "methods": [ + "decision_function", + "predict", + "predict_proba", + "predict_log_proba", + "score", + ], + "dataset": "classifier", }, { - 'model': LogisticRegressionCV(max_iter=100), - 'methods': ['decision_function', 'predict', 'predict_proba', - 'predict_log_proba', 'score'], - 'dataset': 'classifier', + "model": LogisticRegressionCV(max_iter=100), + "methods": [ + "decision_function", + "predict", + "predict_proba", + "predict_log_proba", + "score", + ], + "dataset": "classifier", }, { - 'model': RandomForestRegressor(n_estimators=10), - 'methods': ['predict', 'score'], - 'dataset': 'regression', + "model": RandomForestRegressor(n_estimators=10), + "methods": ["predict", "score"], + "dataset": "regression", }, { - 'model': LinearRegression(), - 'methods': ['predict', 'score'], - 'dataset': 'regression', + "model": LinearRegression(), + "methods": ["predict", "score"], + "dataset": "regression", }, { - 'model': Ridge(), - 'methods': ['predict', 'score'], - 'dataset': 'regression', + "model": Ridge(), + "methods": ["predict", "score"], + "dataset": "regression", }, ] @@ -130,21 +138,22 @@ TO_SKIP = [ # --------------- NO INFO --------------- - r'KMeans .*transform', - r'KMeans .*score', - r'PCA .*score', - r'LogisticRegression .*decision_function', - r'LogisticRegressionCV .*decision_function', - r'LogisticRegressionCV .*predict', - r'LogisticRegressionCV .*predict_proba', - r'LogisticRegressionCV .*predict_log_proba', - r'LogisticRegressionCV .*score', + r"KMeans .*transform", + r"KMeans .*score", + r"PCA .*score", + r"LogisticRegression .*decision_function", + r"LogisticRegressionCV .*decision_function", + r"LogisticRegressionCV .*predict", + r"LogisticRegressionCV .*predict_proba", + r"LogisticRegressionCV .*predict_log_proba", + r"LogisticRegressionCV .*score", # --------------- Scikit --------------- - r'Ridge float16 predict', - r'Ridge float16 score', - r'RandomForestClassifier .*predict_proba', - r'RandomForestClassifier .*predict_log_proba', - r'pairwise_distances .*pairwise_distances', # except float64 - r'roc_auc_score .*roc_auc_score' \ - if not daal_check_version((2021, 'P', 200)) else None, + r"Ridge float16 predict", + r"Ridge float16 score", + r"RandomForestClassifier .*predict_proba", + r"RandomForestClassifier .*predict_log_proba", + r"pairwise_distances .*pairwise_distances", # except float64 + r"roc_auc_score .*roc_auc_score" + if not daal_check_version((2021, "P", 200)) + else None, ] diff --git a/daal4py/sklearn/monkeypatch/tests/test_monkeypatch.py b/daal4py/sklearn/monkeypatch/tests/test_monkeypatch.py index a9f7ca8de4..d11675eb8d 100755 --- a/daal4py/sklearn/monkeypatch/tests/test_monkeypatch.py +++ b/daal4py/sklearn/monkeypatch/tests/test_monkeypatch.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== import daal4py.sklearn @@ -25,8 +25,9 @@ def test_monkey_patching(): _classes.append(v[0][0]) assert len(_tokens) == len(_classes) - assert isinstance(_tokens, list) and len(_tokens) > 0, \ - "Internal Error: list of patched names has unexcepable format." + assert ( + isinstance(_tokens, list) and len(_tokens) > 0 + ), "Internal Error: list of patched names has unexcepable format." daal4py.sklearn.patch_sklearn() @@ -36,8 +37,7 @@ def test_monkey_patching(): n = _classes[i][1] class_module = getattr(p, n).__module__ - assert class_module.startswith('daal4py'), \ - "Patching has completed with error." + assert class_module.startswith("daal4py"), "Patching has completed with error." for i, _ in enumerate(_tokens): t = _tokens[i] @@ -46,8 +46,7 @@ def test_monkey_patching(): daal4py.sklearn.unpatch_sklearn(t) class_module = getattr(p, n).__module__ - assert class_module.startswith('sklearn'), \ - "Unpatching has completed with error." + assert class_module.startswith("sklearn"), "Unpatching has completed with error." daal4py.sklearn.unpatch_sklearn() @@ -57,8 +56,7 @@ def test_monkey_patching(): n = _classes[i][1] class_module = getattr(p, n).__module__ - assert class_module.startswith('sklearn'), \ - "Unpatching has completed with error." + assert class_module.startswith("sklearn"), "Unpatching has completed with error." for i, _ in enumerate(_tokens): t = _tokens[i] @@ -68,7 +66,6 @@ def test_monkey_patching(): daal4py.sklearn.patch_sklearn(t) class_module = getattr(p, n).__module__ - assert class_module.startswith('daal4py'), \ - "Patching has completed with error." + assert class_module.startswith("daal4py"), "Patching has completed with error." daal4py.sklearn.unpatch_sklearn() diff --git a/daal4py/sklearn/monkeypatch/tests/test_patching.py b/daal4py/sklearn/monkeypatch/tests/test_patching.py index 23d5117007..9a5657d752 100644 --- a/daal4py/sklearn/monkeypatch/tests/test_patching.py +++ b/daal4py/sklearn/monkeypatch/tests/test_patching.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,27 +12,28 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== +import os +import pathlib import re import subprocess import sys -import os -import pathlib + import pytest from _models_info import TO_SKIP def get_branch(s): if len(s) == 0: - return 'NO INFO' + return "NO INFO" for i in s: - if 'failed to run accelerated version, fallback to original Scikit-learn' in i: - return 'was in OPT, but go in Scikit' + if "failed to run accelerated version, fallback to original Scikit-learn" in i: + return "was in OPT, but go in Scikit" for i in s: - if 'running accelerated version' in i: - return 'OPT' - return 'Scikit' + if "running accelerated version" in i: + return "OPT" + return "Scikit" def run_parse(mas, result): @@ -41,8 +42,8 @@ def run_parse(mas, result): INFO_POS = 6 for i in range(1, len(mas)): mas[i] = mas[i][INFO_POS:] # remove 'INFO: ' - if not mas[i].startswith('sklearn'): - ind = name + ' ' + dtype + ' ' + mas[i] + if not mas[i].startswith("sklearn"): + ind = name + " " + dtype + " " + mas[i] result[ind] = get_branch(temp) temp.clear() else: @@ -50,14 +51,11 @@ def run_parse(mas, result): def get_result_log(): - os.environ['IDP_SKLEARN_VERBOSE'] = 'INFO' + os.environ["IDP_SKLEARN_VERBOSE"] = "INFO" absolute_path = str(pathlib.Path(__file__).parent.absolute()) try: process = subprocess.check_output( - [ - sys.executable, - absolute_path + '/utils/_launch_algorithms.py' - ] + [sys.executable, absolute_path + "/utils/_launch_algorithms.py"] ) except subprocess.CalledProcessError as e: print(e) @@ -65,25 +63,25 @@ def get_result_log(): mas = [] result = {} - for i in process.decode().split('\n'): - if not i.startswith('INFO') and len(mas) != 0: + for i in process.decode().split("\n"): + if not i.startswith("INFO") and len(mas) != 0: run_parse(mas, result) mas.clear() mas.append(i.strip()) else: mas.append(i.strip()) - del os.environ['IDP_SKLEARN_VERBOSE'] + del os.environ["IDP_SKLEARN_VERBOSE"] return result result_log = get_result_log() -@pytest.mark.parametrize('configuration', result_log) +@pytest.mark.parametrize("configuration", result_log) def test_patching(configuration): - if 'OPT' in result_log[configuration]: + if "OPT" in result_log[configuration]: return for skip in TO_SKIP: if re.search(skip, configuration) is not None: pytest.skip("SKIPPED", allow_module_level=False) - raise ValueError('Test patching failed: ' + configuration) + raise ValueError("Test patching failed: " + configuration) diff --git a/daal4py/sklearn/monkeypatch/tests/utils/_launch_algorithms.py b/daal4py/sklearn/monkeypatch/tests/utils/_launch_algorithms.py index b1232f1278..ed867b19d9 100644 --- a/daal4py/sklearn/monkeypatch/tests/utils/_launch_algorithms.py +++ b/daal4py/sklearn/monkeypatch/tests/utils/_launch_algorithms.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,25 +12,25 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== -import numpy as np import logging import random +import numpy as np + from daal4py.sklearn import patch_sklearn + patch_sklearn() +import pathlib +import sys + +from sklearn.datasets import load_diabetes, load_iris, make_regression from sklearn.metrics import pairwise_distances, roc_auc_score -from sklearn.datasets import ( - make_regression, - load_iris, - load_diabetes) -import sys -import pathlib absolute_path = str(pathlib.Path(__file__).parent.absolute()) -sys.path.append(absolute_path + '/../') +sys.path.append(absolute_path + "/../") from _models_info import MODELS_INFO, TYPES @@ -39,80 +39,80 @@ def get_class_name(x): def generate_dataset(name, dtype, model_name): - if model_name == 'LinearRegression': + if model_name == "LinearRegression": X, y = make_regression(n_samples=1000, n_features=5) - elif name in ['blobs', 'classifier']: + elif name in ["blobs", "classifier"]: X, y = load_iris(return_X_y=True) - elif name == 'regression': + elif name == "regression": X, y = load_diabetes(return_X_y=True) else: - raise ValueError('Unknown dataset type') + raise ValueError("Unknown dataset type") X = np.array(X, dtype=dtype) y = np.array(y, dtype=dtype) return (X, y) def run_patch(model_info, dtype): - print(get_class_name(model_info['model']), dtype.__name__) - X, y = generate_dataset(model_info['dataset'], - dtype, - get_class_name(model_info['model'])) - model = model_info['model'] + print(get_class_name(model_info["model"]), dtype.__name__) + X, y = generate_dataset( + model_info["dataset"], dtype, get_class_name(model_info["model"]) + ) + model = model_info["model"] model.fit(X, y) - logging.info('fit') - for i in model_info['methods']: - if i == 'predict': + logging.info("fit") + for i in model_info["methods"]: + if i == "predict": model.predict(X) - elif i == 'predict_proba': + elif i == "predict_proba": model.predict_proba(X) - elif i == 'predict_log_proba': + elif i == "predict_log_proba": model.predict_log_proba(X) - elif i == 'decision_function': + elif i == "decision_function": model.decision_function(X) - elif i == 'fit_predict': + elif i == "fit_predict": model.fit_predict(X) - elif i == 'transform': + elif i == "transform": model.transform(X) - elif i == 'fit_transform': + elif i == "fit_transform": model.fit_transform(X) - elif i == 'kneighbors': + elif i == "kneighbors": model.kneighbors(X) - elif i == 'score': + elif i == "score": model.score(X, y) else: - raise ValueError(i + ' is wrong method') + raise ValueError(i + " is wrong method") logging.info(i) def run_algotithms(): for info in MODELS_INFO: for t in TYPES: - model_name = get_class_name(info['model']) - if model_name in ['Ridge', 'LinearRegression'] and t.__name__ == 'uint32': + model_name = get_class_name(info["model"]) + if model_name in ["Ridge", "LinearRegression"] and t.__name__ == "uint32": continue run_patch(info, t) def run_utils(): # pairwise_distances - for metric in ['cosine', 'correlation']: + for metric in ["cosine", "correlation"]: for t in TYPES: X = np.random.rand(1000) X = np.array(X, dtype=t) - print('pairwise_distances', t.__name__) + print("pairwise_distances", t.__name__) _ = pairwise_distances(X.reshape(1, -1), metric=metric) - logging.info('pairwise_distances') + logging.info("pairwise_distances") # roc_auc_score for t in [np.float32, np.float64]: a = [random.randint(0, 1) for i in range(1000)] b = [random.randint(0, 1) for i 
in range(1000)] a = np.array(a, dtype=t) b = np.array(b, dtype=t) - print('roc_auc_score', t.__name__) + print("roc_auc_score", t.__name__) _ = roc_auc_score(a, b) - logging.info('roc_auc_score') + logging.info("roc_auc_score") -if __name__ == '__main__': +if __name__ == "__main__": run_algotithms() run_utils() diff --git a/daal4py/sklearn/neighbors/__init__.py b/daal4py/sklearn/neighbors/__init__.py index 0aecdf94d6..901f7ebce1 100755 --- a/daal4py/sklearn/neighbors/__init__.py +++ b/daal4py/sklearn/neighbors/__init__.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2014 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,10 +13,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from ._classification import KNeighborsClassifier -from ._unsupervised import NearestNeighbors from ._regression import KNeighborsRegressor +from ._unsupervised import NearestNeighbors -__all__ = ['KNeighborsClassifier', 'NearestNeighbors', 'KNeighborsRegressor'] +__all__ = ["KNeighborsClassifier", "NearestNeighbors", "KNeighborsRegressor"] diff --git a/daal4py/sklearn/neighbors/_base.py b/daal4py/sklearn/neighbors/_base.py index df5b511fab..9bc3491fd6 100644 --- a/daal4py/sklearn/neighbors/_base.py +++ b/daal4py/sklearn/neighbors/_base.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,78 +12,75 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== # daal4py KNN scikit-learn-compatible base classes -import numpy as np +import logging import numbers -import daal4py as d4p +import warnings + +import numpy as np from scipy import sparse as sp +from sklearn.base import is_classifier, is_regressor +from sklearn.neighbors import VALID_METRICS +from sklearn.neighbors._ball_tree import BallTree +from sklearn.neighbors._base import KNeighborsMixin as BaseKNeighborsMixin +from sklearn.neighbors._base import NeighborsBase as BaseNeighborsBase +from sklearn.neighbors._base import RadiusNeighborsMixin as BaseRadiusNeighborsMixin +from sklearn.neighbors._kd_tree import KDTree +from sklearn.utils.multiclass import check_classification_targets +from sklearn.utils.validation import check_array, check_is_fitted, check_X_y + +import daal4py as d4p + from .._utils import ( + PatchingConditionsChain, + get_patch_message, getFPType, sklearn_check_version, - get_patch_message, - PatchingConditionsChain) -from sklearn.utils.validation import check_array, check_is_fitted, check_X_y -from sklearn.utils.multiclass import check_classification_targets -from sklearn.base import is_classifier, is_regressor -import logging -import warnings +) -if sklearn_check_version("0.22"): - from sklearn.neighbors._base import KNeighborsMixin as BaseKNeighborsMixin - from sklearn.neighbors._base import RadiusNeighborsMixin as BaseRadiusNeighborsMixin - from sklearn.neighbors._base import NeighborsBase as BaseNeighborsBase - from sklearn.neighbors._ball_tree import BallTree - from sklearn.neighbors._kd_tree import KDTree - if not sklearn_check_version("1.2"): - from sklearn.neighbors._base import _check_weights -else: - from sklearn.neighbors.base import KNeighborsMixin as BaseKNeighborsMixin - from sklearn.neighbors.base import RadiusNeighborsMixin as BaseRadiusNeighborsMixin - from sklearn.neighbors.base import NeighborsBase as BaseNeighborsBase - from sklearn.neighbors.ball_tree import BallTree - from sklearn.neighbors.kd_tree import KDTree - from sklearn.neighbors.base import _check_weights +if not sklearn_check_version("1.2"): + from sklearn.neighbors._base import _check_weights def training_algorithm(method, fptype, params): - if method == 'brute': + if method == "brute": train_alg = d4p.bf_knn_classification_training else: train_alg = d4p.kdtree_knn_classification_training - params['fptype'] = fptype + params["fptype"] = fptype return train_alg(**params) def prediction_algorithm(method, fptype, params): - if method == 'brute': + if method == "brute": predict_alg = d4p.bf_knn_classification_prediction else: predict_alg = d4p.kdtree_knn_classification_prediction - params['fptype'] = fptype + params["fptype"] = fptype return predict_alg(**params) def parse_auto_method(estimator, method, n_samples, n_features): result_method = method - if (method in ['auto', 'ball_tree']): - condition = estimator.n_neighbors is not None and \ - estimator.n_neighbors >= estimator.n_samples_fit_ // 2 - if estimator.metric == 'precomputed' or n_features > 11 or condition: - result_method = 'brute' + if method in ["auto", "ball_tree"]: + condition = ( + estimator.n_neighbors is not None + and estimator.n_neighbors >= estimator.n_samples_fit_ // 2 + ) + if estimator.metric == "precomputed" or n_features > 11 or condition: + result_method = "brute" else: - kdtree_valid_metrics = KDTree.valid_metrics() \ - if sklearn_check_version('1.3') 
else KDTree.valid_metrics - if estimator.effective_metric_ in kdtree_valid_metrics: - result_method = 'kd_tree' + if estimator.effective_metric_ in VALID_METRICS["kd_tree"]: + result_method = "kd_tree" else: - result_method = 'brute' + result_method = "brute" return result_method @@ -91,41 +88,45 @@ def parse_auto_method(estimator, method, n_samples, n_features): def daal4py_fit(estimator, X, fptype): estimator._fit_X = X estimator._fit_method = estimator.algorithm - estimator.effective_metric_ = 'euclidean' + estimator.effective_metric_ = "euclidean" estimator._tree = None - weights = getattr(estimator, 'weights', 'uniform') + weights = getattr(estimator, "weights", "uniform") params = { - 'method': 'defaultDense', - 'k': estimator.n_neighbors, - 'voteWeights': 'voteUniform' if weights == 'uniform' else 'voteDistance', - 'resultsToCompute': 'computeIndicesOfNeighbors|computeDistances', - 'resultsToEvaluate': 'none' if getattr(estimator, '_y', None) is None - else 'computeClassLabels' + "method": "defaultDense", + "k": estimator.n_neighbors, + "voteWeights": "voteUniform" if weights == "uniform" else "voteDistance", + "resultsToCompute": "computeIndicesOfNeighbors|computeDistances", + "resultsToEvaluate": "none" + if getattr(estimator, "_y", None) is None + else "computeClassLabels", } - if hasattr(estimator, 'classes_'): - params['nClasses'] = len(estimator.classes_) + if hasattr(estimator, "classes_"): + params["nClasses"] = len(estimator.classes_) - if getattr(estimator, '_y', None) is None: + if getattr(estimator, "_y", None) is None: labels = None else: labels = estimator._y.reshape(-1, 1) method = parse_auto_method( - estimator, estimator.algorithm, - estimator.n_samples_fit_, estimator.n_features_in_) + estimator, estimator.algorithm, estimator.n_samples_fit_, estimator.n_features_in_ + ) estimator._fit_method = method train_alg = training_algorithm(method, fptype, params) estimator._daal_model = train_alg.compute(X, labels).model -def daal4py_kneighbors(estimator, X=None, n_neighbors=None, - return_distance=True): - n_features = getattr(estimator, 'n_features_in_', None) - shape = getattr(X, 'shape', None) +def daal4py_kneighbors(estimator, X=None, n_neighbors=None, return_distance=True): + n_features = getattr(estimator, "n_features_in_", None) + shape = getattr(X, "shape", None) if n_features and shape and len(shape) > 1 and shape[1] != n_features: - raise ValueError((f'X has {X.shape[1]} features, ' - f'but kneighbors is expecting {n_features} features as input')) + raise ValueError( + ( + f"X has {X.shape[1]} features, " + f"but kneighbors is expecting {n_features} features as input" + ) + ) if sklearn_check_version("0.22"): check_is_fitted(estimator) @@ -135,20 +136,17 @@ def daal4py_kneighbors(estimator, X=None, n_neighbors=None, if n_neighbors is None: n_neighbors = estimator.n_neighbors elif n_neighbors <= 0: - raise ValueError( - "Expected n_neighbors > 0. Got %d" % - n_neighbors - ) + raise ValueError("Expected n_neighbors > 0. 
Got %d" % n_neighbors) else: if not isinstance(n_neighbors, numbers.Integral): raise TypeError( "n_neighbors does not take %s value, " - "enter integer value" % - type(n_neighbors)) + "enter integer value" % type(n_neighbors) + ) if X is not None: query_is_train = False - X = check_array(X, accept_sparse='csr', dtype=[np.float64, np.float32]) + X = check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) else: query_is_train = True X = estimator._fit_X @@ -160,8 +158,7 @@ def daal4py_kneighbors(estimator, X=None, n_neighbors=None, if n_neighbors > n_samples_fit: raise ValueError( "Expected n_neighbors <= n_samples, " - " but n_samples = %d, n_neighbors = %d" % - (n_samples_fit, n_neighbors) + " but n_samples = %d, n_neighbors = %d" % (n_samples_fit, n_neighbors) ) chunked_results = None @@ -171,21 +168,23 @@ def daal4py_kneighbors(estimator, X=None, n_neighbors=None, except ValueError: fptype = None - weights = getattr(estimator, 'weights', 'uniform') + weights = getattr(estimator, "weights", "uniform") params = { - 'method': 'defaultDense', - 'k': n_neighbors, - 'voteWeights': 'voteUniform' if weights == 'uniform' else 'voteDistance', - 'resultsToCompute': 'computeIndicesOfNeighbors|computeDistances', - 'resultsToEvaluate': 'none' if getattr(estimator, '_y', None) is None - else 'computeClassLabels' + "method": "defaultDense", + "k": n_neighbors, + "voteWeights": "voteUniform" if weights == "uniform" else "voteDistance", + "resultsToCompute": "computeIndicesOfNeighbors|computeDistances", + "resultsToEvaluate": "none" + if getattr(estimator, "_y", None) is None + else "computeClassLabels", } - if hasattr(estimator, 'classes_'): - params['nClasses'] = len(estimator.classes_) + if hasattr(estimator, "classes_"): + params["nClasses"] = len(estimator.classes_) method = parse_auto_method( - estimator, estimator._fit_method, estimator.n_samples_fit_, n_features) + estimator, estimator._fit_method, estimator.n_samples_fit_, n_features + ) predict_alg = prediction_algorithm(method, fptype, params) prediction_result = predict_alg.compute(X, estimator._daal_model) @@ -193,7 +192,7 @@ def daal4py_kneighbors(estimator, X=None, n_neighbors=None, distances = prediction_result.distances indices = prediction_result.indices - if method == 'kd_tree': + if method == "kd_tree": for i in range(distances.shape[0]): seq = distances[i].argsort() indices[i] = indices[i][seq] @@ -231,18 +230,17 @@ def daal4py_kneighbors(estimator, X=None, n_neighbors=None, # In that case mask the first duplicate. 
dup_gr_nbrs = np.all(sample_mask, axis=1) sample_mask[:, 0][dup_gr_nbrs] = False - neigh_ind = np.reshape( - neigh_ind[sample_mask], (n_queries, n_neighbors - 1)) + neigh_ind = np.reshape(neigh_ind[sample_mask], (n_queries, n_neighbors - 1)) if return_distance: - neigh_dist = np.reshape( - neigh_dist[sample_mask], (n_queries, n_neighbors - 1)) + neigh_dist = np.reshape(neigh_dist[sample_mask], (n_queries, n_neighbors - 1)) return neigh_dist, neigh_ind return neigh_ind -def validate_data(estimator, X, y=None, reset=True, - validate_separately=False, **check_params): +def validate_data( + estimator, X, y=None, reset=True, validate_separately=False, **check_params +): if y is None: try: requires_y = estimator._get_tags()["requires_y"] @@ -269,37 +267,59 @@ def validate_data(estimator, X, y=None, reset=True, X, y = check_X_y(X, y, **check_params) out = X, y - if sklearn_check_version("0.23") and check_params.get('ensure_2d', True): + if sklearn_check_version("0.23") and check_params.get("ensure_2d", True): estimator._check_n_features(X, reset=reset) return out class NeighborsBase(BaseNeighborsBase): - def __init__(self, n_neighbors=None, radius=None, - algorithm='auto', leaf_size=30, metric='minkowski', - p=2, metric_params=None, n_jobs=None): + def __init__( + self, + n_neighbors=None, + radius=None, + algorithm="auto", + leaf_size=30, + metric="minkowski", + p=2, + metric_params=None, + n_jobs=None, + ): super().__init__( - n_neighbors=n_neighbors, radius=radius, - algorithm=algorithm, leaf_size=leaf_size, metric=metric, - p=p, metric_params=metric_params, n_jobs=n_jobs) + n_neighbors=n_neighbors, + radius=radius, + algorithm=algorithm, + leaf_size=leaf_size, + metric=metric, + p=p, + metric_params=metric_params, + n_jobs=n_jobs, + ) def _fit(self, X, y=None): - if self.metric_params is not None and 'p' in self.metric_params: + if self.metric_params is not None and "p" in self.metric_params: if self.p is not None: - warnings.warn("Parameter p is found in metric_params. " - "The corresponding parameter from __init__ " - "is ignored.", SyntaxWarning, stacklevel=2) + warnings.warn( + "Parameter p is found in metric_params. 
" + "The corresponding parameter from __init__ " + "is ignored.", + SyntaxWarning, + stacklevel=2, + ) - if hasattr(self, 'weights') and sklearn_check_version("1.0") \ - and not sklearn_check_version("1.2"): + if ( + hasattr(self, "weights") + and sklearn_check_version("1.0") + and not sklearn_check_version("1.2") + ): self.weights = _check_weights(self.weights) if sklearn_check_version("1.0"): self._check_feature_names(X, reset=True) X_incorrect_type = isinstance( - X, (KDTree, BallTree, NeighborsBase, BaseNeighborsBase)) + X, (KDTree, BallTree, NeighborsBase, BaseNeighborsBase) + ) single_output = True self._daal_model = None shape = None @@ -313,8 +333,13 @@ def _fit(self, X, y=None): if y is not None or requires_y: if not X_incorrect_type or y is None: X, y = validate_data( - self, X, y, accept_sparse="csr", multi_output=True, - dtype=[np.float64, np.float32]) + self, + X, + y, + accept_sparse="csr", + multi_output=True, + dtype=[np.float64, np.float32], + ) single_output = False if y.ndim > 1 and y.shape[1] > 1 else True shape = y.shape @@ -331,8 +356,7 @@ def _fit(self, X, y=None): self.classes_ = [] self._y = np.empty(y.shape, dtype=int) for k in range(self._y.shape[1]): - classes, self._y[:, k] = np.unique( - y[:, k], return_inverse=True) + classes, self._y[:, k] = np.unique(y[:, k], return_inverse=True) self.classes_.append(classes) if not self.outputs_2d_: @@ -347,7 +371,8 @@ def _fit(self, X, y=None): else: if not X_incorrect_type: X, _ = validate_data( - self, X, accept_sparse='csr', dtype=[np.float64, np.float32]) + self, X, accept_sparse="csr", dtype=[np.float64, np.float32] + ) if not X_incorrect_type: self.n_samples_fit_ = X.shape[0] @@ -358,7 +383,7 @@ def _fit(self, X, y=None): except ValueError: fptype = None - weights = getattr(self, 'weights', 'uniform') + weights = getattr(self, "weights", "uniform") def stock_fit(self, X, y): if sklearn_check_version("0.24"): @@ -369,34 +394,43 @@ def stock_fit(self, X, y): if self.n_neighbors is not None: if self.n_neighbors <= 0: - raise ValueError( - "Expected n_neighbors > 0. Got %d" % - self.n_neighbors - ) + raise ValueError("Expected n_neighbors > 0. Got %d" % self.n_neighbors) if not isinstance(self.n_neighbors, numbers.Integral): raise TypeError( "n_neighbors does not take %s value, " - "enter integer value" % - type(self.n_neighbors)) + "enter integer value" % type(self.n_neighbors) + ) _patching_status = PatchingConditionsChain( - "sklearn.neighbors.KNeighborsMixin.kneighbors") - _dal_ready = _patching_status.and_conditions([ - (self.metric == 'minkowski' and self.p == 2 or self.metric == 'euclidean', - f"'{self.metric}' (p={self.p}) metric is not supported. " - "Only 'euclidean' or 'minkowski' with p=2 metrics are supported."), - (not X_incorrect_type, "X is not Tree or Neighbors instance or array."), - (weights in ['uniform', 'distance'], - f"'{weights}' weights is not supported. " - "Only 'uniform' and 'distance' weights are supported."), - (self.algorithm in ['brute', 'kd_tree', 'auto', 'ball_tree'], - f"'{self.algorithm}' algorithm is not supported. " - "Only 'brute', 'kd_tree', 'auto' and 'ball_tree' " - "algorithms are supported."), - (single_output, "Multiple outputs are not supported."), - (fptype is not None, "Unable to get dtype."), - (not sp.issparse(X), "X is sparse. 
Sparse input is not supported."), - (correct_n_classes, "Number of classes < 2.")]) + "sklearn.neighbors.KNeighborsMixin.kneighbors" + ) + _dal_ready = _patching_status.and_conditions( + [ + ( + self.metric == "minkowski" + and self.p == 2 + or self.metric == "euclidean", + f"'{self.metric}' (p={self.p}) metric is not supported. " + "Only 'euclidean' or 'minkowski' with p=2 metrics are supported.", + ), + (not X_incorrect_type, "X is not Tree or Neighbors instance or array."), + ( + weights in ["uniform", "distance"], + f"'{weights}' weights is not supported. " + "Only 'uniform' and 'distance' weights are supported.", + ), + ( + self.algorithm in ["brute", "kd_tree", "auto", "ball_tree"], + f"'{self.algorithm}' algorithm is not supported. " + "Only 'brute', 'kd_tree', 'auto' and 'ball_tree' " + "algorithms are supported.", + ), + (single_output, "Multiple outputs are not supported."), + (fptype is not None, "Unable to get dtype."), + (not sp.issparse(X), "X is sparse. Sparse input is not supported."), + (correct_n_classes, "Number of classes < 2."), + ] + ) _patching_status.write_log() if _dal_ready: try: @@ -405,7 +439,8 @@ def stock_fit(self, X, y): except RuntimeError: logging.info( "sklearn.neighbors.KNeighborsMixin." - "kneighbors: " + get_patch_message("sklearn_after_daal")) + "kneighbors: " + get_patch_message("sklearn_after_daal") + ) result = stock_fit(self, X, y) else: result = stock_fit(self, X, y) @@ -418,11 +453,9 @@ def stock_fit(self, X, y): class KNeighborsMixin(BaseKNeighborsMixin): def kneighbors(self, X=None, n_neighbors=None, return_distance=True): - daal_model = getattr(self, '_daal_model', None) + daal_model = getattr(self, "_daal_model", None) if X is not None and self.metric != "precomputed": - X = check_array( - X, accept_sparse='csr', dtype=[ - np.float64, np.float32]) + X = check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) x = self._fit_X if X is None else X try: fptype = getFPType(x) @@ -430,44 +463,58 @@ def kneighbors(self, X=None, n_neighbors=None, return_distance=True): fptype = None _patching_status = PatchingConditionsChain( - "sklearn.neighbors.KNeighborsMixin.kneighbors") - _dal_ready = _patching_status.and_conditions([ - (daal_model is not None, "oneDAL model was not trained."), - (fptype is not None, "Unable to get dtype."), - (not sp.issparse(X), "X is sparse. Sparse input is not supported.")]) + "sklearn.neighbors.KNeighborsMixin.kneighbors" + ) + _dal_ready = _patching_status.and_conditions( + [ + (daal_model is not None, "oneDAL model was not trained."), + (fptype is not None, "Unable to get dtype."), + (not sp.issparse(X), "X is sparse. 
Sparse input is not supported."), + ] + ) _patching_status.write_log() if _dal_ready: result = daal4py_kneighbors(self, X, n_neighbors, return_distance) else: - if daal_model is not None or getattr(self, '_tree', 0) is None and \ - self._fit_method == 'kd_tree': + if ( + daal_model is not None + or getattr(self, "_tree", 0) is None + and self._fit_method == "kd_tree" + ): if sklearn_check_version("0.24"): - BaseNeighborsBase._fit(self, self._fit_X, getattr(self, '_y', None)) + BaseNeighborsBase._fit(self, self._fit_X, getattr(self, "_y", None)) else: BaseNeighborsBase._fit(self, self._fit_X) result = super(KNeighborsMixin, self).kneighbors( - X, n_neighbors, return_distance) + X, n_neighbors, return_distance + ) return result class RadiusNeighborsMixin(BaseRadiusNeighborsMixin): - def radius_neighbors(self, X=None, radius=None, return_distance=True, - sort_results=False): - daal_model = getattr(self, '_daal_model', None) - - if daal_model is not None or getattr(self, '_tree', 0) is None and \ - self._fit_method == 'kd_tree': + def radius_neighbors( + self, X=None, radius=None, return_distance=True, sort_results=False + ): + daal_model = getattr(self, "_daal_model", None) + + if ( + daal_model is not None + or getattr(self, "_tree", 0) is None + and self._fit_method == "kd_tree" + ): if sklearn_check_version("0.24"): - BaseNeighborsBase._fit(self, self._fit_X, getattr(self, '_y', None)) + BaseNeighborsBase._fit(self, self._fit_X, getattr(self, "_y", None)) else: BaseNeighborsBase._fit(self, self._fit_X) if sklearn_check_version("0.22"): result = BaseRadiusNeighborsMixin.radius_neighbors( - self, X, radius, return_distance, sort_results) + self, X, radius, return_distance, sort_results + ) else: result = BaseRadiusNeighborsMixin.radius_neighbors( - self, X, radius, return_distance) + self, X, radius, return_distance + ) return result diff --git a/daal4py/sklearn/neighbors/_classification.py b/daal4py/sklearn/neighbors/_classification.py index 73edeae401..75ca98d9a9 100644 --- a/daal4py/sklearn/neighbors/_classification.py +++ b/daal4py/sklearn/neighbors/_classification.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,48 +12,52 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== # daal4py KNN classification scikit-learn-compatible classes -from ._base import NeighborsBase, KNeighborsMixin -from ._base import parse_auto_method, prediction_algorithm -from sklearn.base import ClassifierMixin as BaseClassifierMixin -from .._utils import ( - getFPType, - sklearn_check_version, - PatchingConditionsChain) -from .._device_offload import support_usm_ndarray -from sklearn.utils.validation import check_array import numpy as np from scipy import sparse as sp +from sklearn.base import ClassifierMixin as BaseClassifierMixin +from sklearn.utils.validation import check_array + +from .._device_offload import support_usm_ndarray +from .._utils import PatchingConditionsChain, getFPType, sklearn_check_version +from ._base import KNeighborsMixin, NeighborsBase, parse_auto_method, prediction_algorithm if sklearn_check_version("0.22"): - from sklearn.neighbors._classification import KNeighborsClassifier as \ - BaseKNeighborsClassifier + from sklearn.neighbors._classification import ( + KNeighborsClassifier as BaseKNeighborsClassifier, + ) + if not sklearn_check_version("1.2"): from sklearn.neighbors._base import _check_weights from sklearn.utils.validation import _deprecate_positional_args else: - from sklearn.neighbors.classification import KNeighborsClassifier as \ - BaseKNeighborsClassifier from sklearn.neighbors.base import _check_weights + from sklearn.neighbors.classification import ( + KNeighborsClassifier as BaseKNeighborsClassifier, + ) def _deprecate_positional_args(f): return f def daal4py_classifier_predict(estimator, X, base_predict): - if sklearn_check_version('1.0'): + if sklearn_check_version("1.0"): estimator._check_feature_names(X, reset=False) - X = check_array(X, accept_sparse='csr', dtype=[np.float64, np.float32]) - daal_model = getattr(estimator, '_daal_model', None) - n_features = getattr(estimator, 'n_features_in_', None) - shape = getattr(X, 'shape', None) + X = check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) + daal_model = getattr(estimator, "_daal_model", None) + n_features = getattr(estimator, "n_features_in_", None) + shape = getattr(X, "shape", None) if n_features and shape and len(shape) > 1 and shape[1] != n_features: - raise ValueError((f'X has {X.shape[1]} features, ' - f'but KNNClassifier is expecting ' - f'{n_features} features as input')) + raise ValueError( + ( + f"X has {X.shape[1]} features, " + f"but KNNClassifier is expecting " + f"{n_features} features as input" + ) + ) try: fptype = getFPType(X) @@ -61,30 +65,37 @@ def daal4py_classifier_predict(estimator, X, base_predict): fptype = None _patching_status = PatchingConditionsChain( - "sklearn.neighbors.KNeighborsClassifier.predict") - _dal_ready = _patching_status.and_conditions([ - (daal_model is not None, "oneDAL model was not trained."), - (fptype is not None, "Unable to get dtype."), - (not sp.issparse(X), "X is sparse. Sparse input is not supported.")]) + "sklearn.neighbors.KNeighborsClassifier.predict" + ) + _dal_ready = _patching_status.and_conditions( + [ + (daal_model is not None, "oneDAL model was not trained."), + (fptype is not None, "Unable to get dtype."), + (not sp.issparse(X), "X is sparse. 
Sparse input is not supported."), + ] + ) _patching_status.write_log() if _dal_ready: params = { - 'method': 'defaultDense', - 'k': estimator.n_neighbors, - 'nClasses': len(estimator.classes_), - 'voteWeights': 'voteUniform' - if estimator.weights == 'uniform' else 'voteDistance', - 'resultsToEvaluate': 'computeClassLabels', - 'resultsToCompute': '' + "method": "defaultDense", + "k": estimator.n_neighbors, + "nClasses": len(estimator.classes_), + "voteWeights": "voteUniform" + if estimator.weights == "uniform" + else "voteDistance", + "resultsToEvaluate": "computeClassLabels", + "resultsToCompute": "", } method = parse_auto_method( - estimator, estimator.algorithm, estimator.n_samples_fit_, n_features) + estimator, estimator.algorithm, estimator.n_samples_fit_, n_features + ) predict_alg = prediction_algorithm(method, fptype, params) prediction_result = predict_alg.compute(X, daal_model) result = estimator.classes_.take( - np.asarray(prediction_result.prediction.ravel(), dtype=np.intp)) + np.asarray(prediction_result.prediction.ravel(), dtype=np.intp) + ) else: result = base_predict(estimator, X) @@ -92,55 +103,102 @@ def daal4py_classifier_predict(estimator, X, base_predict): if sklearn_check_version("0.24"): + class KNeighborsClassifier_(KNeighborsMixin, BaseClassifierMixin, NeighborsBase): @_deprecate_positional_args - def __init__(self, n_neighbors=5, *, - weights='uniform', algorithm='auto', leaf_size=30, - p=2, metric='minkowski', metric_params=None, n_jobs=None, - **kwargs): + def __init__( + self, + n_neighbors=5, + *, + weights="uniform", + algorithm="auto", + leaf_size=30, + p=2, + metric="minkowski", + metric_params=None, + n_jobs=None, + **kwargs, + ): super().__init__( n_neighbors=n_neighbors, algorithm=algorithm, - leaf_size=leaf_size, metric=metric, p=p, + leaf_size=leaf_size, + metric=metric, + p=p, metric_params=metric_params, - n_jobs=n_jobs, **kwargs) - self.weights = \ + n_jobs=n_jobs, + **kwargs, + ) + self.weights = ( weights if sklearn_check_version("1.0") else _check_weights(weights) + ) + elif sklearn_check_version("0.22"): - from sklearn.neighbors._base import SupervisedIntegerMixin as \ - BaseSupervisedIntegerMixin + from sklearn.neighbors._base import ( + SupervisedIntegerMixin as BaseSupervisedIntegerMixin, + ) - class KNeighborsClassifier_(NeighborsBase, KNeighborsMixin, - BaseSupervisedIntegerMixin, BaseClassifierMixin): + class KNeighborsClassifier_( + NeighborsBase, KNeighborsMixin, BaseSupervisedIntegerMixin, BaseClassifierMixin + ): @_deprecate_positional_args - def __init__(self, n_neighbors=5, *, - weights='uniform', algorithm='auto', leaf_size=30, - p=2, metric='minkowski', metric_params=None, n_jobs=None, - **kwargs): + def __init__( + self, + n_neighbors=5, + *, + weights="uniform", + algorithm="auto", + leaf_size=30, + p=2, + metric="minkowski", + metric_params=None, + n_jobs=None, + **kwargs, + ): super().__init__( n_neighbors=n_neighbors, algorithm=algorithm, - leaf_size=leaf_size, metric=metric, p=p, + leaf_size=leaf_size, + metric=metric, + p=p, metric_params=metric_params, - n_jobs=n_jobs, **kwargs) + n_jobs=n_jobs, + **kwargs, + ) self.weights = _check_weights(weights) + else: - from sklearn.neighbors.base import SupervisedIntegerMixin as \ - BaseSupervisedIntegerMixin + from sklearn.neighbors.base import ( + SupervisedIntegerMixin as BaseSupervisedIntegerMixin, + ) - class KNeighborsClassifier_(NeighborsBase, KNeighborsMixin, - BaseSupervisedIntegerMixin, BaseClassifierMixin): + class KNeighborsClassifier_( + NeighborsBase, KNeighborsMixin, 
BaseSupervisedIntegerMixin, BaseClassifierMixin + ): @_deprecate_positional_args - def __init__(self, n_neighbors=5, *, - weights='uniform', algorithm='auto', leaf_size=30, - p=2, metric='minkowski', metric_params=None, n_jobs=None, - **kwargs): + def __init__( + self, + n_neighbors=5, + *, + weights="uniform", + algorithm="auto", + leaf_size=30, + p=2, + metric="minkowski", + metric_params=None, + n_jobs=None, + **kwargs, + ): super().__init__( n_neighbors=n_neighbors, algorithm=algorithm, - leaf_size=leaf_size, metric=metric, p=p, + leaf_size=leaf_size, + metric=metric, + p=p, metric_params=metric_params, - n_jobs=n_jobs, **kwargs) + n_jobs=n_jobs, + **kwargs, + ) self.weights = _check_weights(weights) @@ -148,17 +206,30 @@ class KNeighborsClassifier(KNeighborsClassifier_): __doc__ = BaseKNeighborsClassifier.__doc__ @_deprecate_positional_args - def __init__(self, n_neighbors=5, *, - weights='uniform', algorithm='auto', leaf_size=30, - p=2, metric='minkowski', metric_params=None, n_jobs=None, - **kwargs): + def __init__( + self, + n_neighbors=5, + *, + weights="uniform", + algorithm="auto", + leaf_size=30, + p=2, + metric="minkowski", + metric_params=None, + n_jobs=None, + **kwargs, + ): super().__init__( n_neighbors=n_neighbors, weights=weights, algorithm=algorithm, - leaf_size=leaf_size, metric=metric, p=p, + leaf_size=leaf_size, + metric=metric, + p=p, metric_params=metric_params, - n_jobs=n_jobs, **kwargs) + n_jobs=n_jobs, + **kwargs, + ) @support_usm_ndarray() def fit(self, X, y): @@ -217,6 +288,6 @@ def predict_proba(self, X): The class probabilities of the input samples. Classes are ordered by lexicographic order. """ - if sklearn_check_version('1.0'): + if sklearn_check_version("1.0"): self._check_feature_names(X, reset=False) return BaseKNeighborsClassifier.predict_proba(self, X) diff --git a/daal4py/sklearn/neighbors/_regression.py b/daal4py/sklearn/neighbors/_regression.py index a33d5d153a..d7efa48b69 100644 --- a/daal4py/sklearn/neighbors/_regression.py +++ b/daal4py/sklearn/neighbors/_regression.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,80 +12,131 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== # daal4py KNN regression scikit-learn-compatible classes -from ._base import NeighborsBase, KNeighborsMixin from sklearn.base import RegressorMixin -from .._utils import sklearn_check_version -from .._device_offload import support_usm_ndarray +from .._device_offload import support_usm_ndarray +from .._utils import sklearn_check_version +from ._base import KNeighborsMixin, NeighborsBase if sklearn_check_version("0.22"): - from sklearn.neighbors._regression import KNeighborsRegressor as \ - BaseKNeighborsRegressor + from sklearn.neighbors._regression import ( + KNeighborsRegressor as BaseKNeighborsRegressor, + ) + if not sklearn_check_version("1.2"): from sklearn.neighbors._base import _check_weights from sklearn.utils.validation import _deprecate_positional_args else: - from sklearn.neighbors.regression import KNeighborsRegressor as \ - BaseKNeighborsRegressor from sklearn.neighbors.base import _check_weights + from sklearn.neighbors.regression import ( + KNeighborsRegressor as BaseKNeighborsRegressor, + ) def _deprecate_positional_args(f): return f if sklearn_check_version("0.24"): + class KNeighborsRegressor_(KNeighborsMixin, RegressorMixin, NeighborsBase): @_deprecate_positional_args - def __init__(self, n_neighbors=5, *, weights='uniform', - algorithm='auto', leaf_size=30, - p=2, metric='minkowski', metric_params=None, n_jobs=None, - **kwargs): + def __init__( + self, + n_neighbors=5, + *, + weights="uniform", + algorithm="auto", + leaf_size=30, + p=2, + metric="minkowski", + metric_params=None, + n_jobs=None, + **kwargs, + ): super().__init__( n_neighbors=n_neighbors, algorithm=algorithm, - leaf_size=leaf_size, metric=metric, p=p, - metric_params=metric_params, n_jobs=n_jobs, **kwargs) + leaf_size=leaf_size, + metric=metric, + p=p, + metric_params=metric_params, + n_jobs=n_jobs, + **kwargs, + ) + else: if sklearn_check_version("0.22"): - from sklearn.neighbors._base import SupervisedFloatMixin as \ - BaseSupervisedFloatMixin + from sklearn.neighbors._base import ( + SupervisedFloatMixin as BaseSupervisedFloatMixin, + ) else: - from sklearn.neighbors.base import SupervisedFloatMixin as \ - BaseSupervisedFloatMixin + from sklearn.neighbors.base import ( + SupervisedFloatMixin as BaseSupervisedFloatMixin, + ) - class KNeighborsRegressor_(NeighborsBase, KNeighborsMixin, - BaseSupervisedFloatMixin, RegressorMixin): + class KNeighborsRegressor_( + NeighborsBase, KNeighborsMixin, BaseSupervisedFloatMixin, RegressorMixin + ): @_deprecate_positional_args - def __init__(self, n_neighbors=5, *, weights='uniform', - algorithm='auto', leaf_size=30, - p=2, metric='minkowski', metric_params=None, n_jobs=None, - **kwargs): + def __init__( + self, + n_neighbors=5, + *, + weights="uniform", + algorithm="auto", + leaf_size=30, + p=2, + metric="minkowski", + metric_params=None, + n_jobs=None, + **kwargs, + ): super().__init__( n_neighbors=n_neighbors, algorithm=algorithm, - leaf_size=leaf_size, metric=metric, p=p, - metric_params=metric_params, n_jobs=n_jobs, **kwargs) + leaf_size=leaf_size, + metric=metric, + p=p, + metric_params=metric_params, + n_jobs=n_jobs, + **kwargs, + ) class KNeighborsRegressor(KNeighborsRegressor_): __doc__ = BaseKNeighborsRegressor.__doc__ @_deprecate_positional_args - def __init__(self, n_neighbors=5, *, weights='uniform', - algorithm='auto', leaf_size=30, - p=2, metric='minkowski', metric_params=None, n_jobs=None, 
- **kwargs): + def __init__( + self, + n_neighbors=5, + *, + weights="uniform", + algorithm="auto", + leaf_size=30, + p=2, + metric="minkowski", + metric_params=None, + n_jobs=None, + **kwargs, + ): super().__init__( n_neighbors=n_neighbors, algorithm=algorithm, - leaf_size=leaf_size, metric=metric, p=p, - metric_params=metric_params, n_jobs=n_jobs, **kwargs) - self.weights = \ + leaf_size=leaf_size, + metric=metric, + p=p, + metric_params=metric_params, + n_jobs=n_jobs, + **kwargs, + ) + self.weights = ( weights if sklearn_check_version("1.0") else _check_weights(weights) + ) def _more_tags(self): return BaseKNeighborsRegressor._more_tags(self) @@ -127,6 +178,6 @@ def predict(self, X): y : ndarray of shape (n_queries,) or (n_queries, n_outputs), dtype=int Target values. """ - if sklearn_check_version('1.0'): + if sklearn_check_version("1.0"): self._check_feature_names(X, reset=False) return BaseKNeighborsRegressor.predict(self, X) diff --git a/daal4py/sklearn/neighbors/_unsupervised.py b/daal4py/sklearn/neighbors/_unsupervised.py index 71e4839ac7..341dec73ab 100644 --- a/daal4py/sklearn/neighbors/_unsupervised.py +++ b/daal4py/sklearn/neighbors/_unsupervised.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,48 +12,77 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== # daal4py KNN scikit-learn-compatible classes -from ._base import NeighborsBase, KNeighborsMixin, RadiusNeighborsMixin -from .._utils import sklearn_check_version from .._device_offload import support_usm_ndarray +from .._utils import sklearn_check_version +from ._base import KNeighborsMixin, NeighborsBase, RadiusNeighborsMixin if sklearn_check_version("0.22"): from sklearn.utils.validation import _deprecate_positional_args else: + def _deprecate_positional_args(f): return f if sklearn_check_version("0.22") and not sklearn_check_version("0.23"): + class NearestNeighbors(KNeighborsMixin, RadiusNeighborsMixin, NeighborsBase): - def __init__(self, n_neighbors=5, radius=1.0, - algorithm='auto', leaf_size=30, metric='minkowski', - p=2, metric_params=None, n_jobs=None): + def __init__( + self, + n_neighbors=5, + radius=1.0, + algorithm="auto", + leaf_size=30, + metric="minkowski", + p=2, + metric_params=None, + n_jobs=None, + ): super().__init__( n_neighbors=n_neighbors, radius=radius, algorithm=algorithm, - leaf_size=leaf_size, metric=metric, p=p, - metric_params=metric_params, n_jobs=n_jobs) + leaf_size=leaf_size, + metric=metric, + p=p, + metric_params=metric_params, + n_jobs=n_jobs, + ) @support_usm_ndarray() def fit(self, X, y=None): return NeighborsBase._fit(self, X) + else: + class NearestNeighbors(KNeighborsMixin, RadiusNeighborsMixin, NeighborsBase): @_deprecate_positional_args - def __init__(self, *, n_neighbors=5, radius=1.0, - algorithm='auto', leaf_size=30, metric='minkowski', - p=2, metric_params=None, n_jobs=None): + def __init__( + self, + *, + n_neighbors=5, + radius=1.0, + algorithm="auto", + leaf_size=30, + metric="minkowski", + p=2, + metric_params=None, + n_jobs=None, + ): 
super().__init__( n_neighbors=n_neighbors, radius=radius, algorithm=algorithm, - leaf_size=leaf_size, metric=metric, p=p, - metric_params=metric_params, n_jobs=n_jobs) + leaf_size=leaf_size, + metric=metric, + p=p, + metric_params=metric_params, + n_jobs=n_jobs, + ) @support_usm_ndarray() def fit(self, X, y=None): diff --git a/daal4py/sklearn/neighbors/tests/test_kneighbors.py b/daal4py/sklearn/neighbors/tests/test_kneighbors.py index 3a3e77f6a7..47087d32ef 100644 --- a/daal4py/sklearn/neighbors/tests/test_kneighbors.py +++ b/daal4py/sklearn/neighbors/tests/test_kneighbors.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,46 +12,52 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== import pytest -from sklearn.neighbors \ - import KNeighborsClassifier as ScikitKNeighborsClassifier -from daal4py.sklearn.neighbors \ - import KNeighborsClassifier as DaalKNeighborsClassifier from sklearn.datasets import load_iris -from sklearn.metrics import (accuracy_score, log_loss, roc_auc_score) +from sklearn.metrics import accuracy_score, log_loss, roc_auc_score from sklearn.model_selection import train_test_split +from sklearn.neighbors import KNeighborsClassifier as ScikitKNeighborsClassifier + from daal4py.sklearn._utils import daal_check_version +from daal4py.sklearn.neighbors import KNeighborsClassifier as DaalKNeighborsClassifier -DISTANCES = ['minkowski'] -ALGORITHMS = ['brute', 'kd_tree', 'auto'] -WEIGHTS = ['uniform', 'distance'] +DISTANCES = ["minkowski"] +ALGORITHMS = ["brute", "kd_tree", "auto"] +WEIGHTS = ["uniform", "distance"] KS = [1, 3, 7, 15, 31] N_TRIES = 10 -ACCURACY_RATIO = 1.0 if daal_check_version(((2020, 'P', 300))) else 0.9 +ACCURACY_RATIO = 1.0 if daal_check_version(((2020, "P", 300))) else 0.9 LOG_LOSS_RATIO = 1.02 ROC_AUC_RATIO = 0.999 IRIS = load_iris() def _test_determenistic(distance, algorithm, weight, k): - x_train, x_test, y_train, y_test = \ - train_test_split(IRIS.data, IRIS.target, - test_size=0.33, random_state=31) + x_train, x_test, y_train, y_test = train_test_split( + IRIS.data, IRIS.target, test_size=0.33, random_state=31 + ) alg_results = [] for _ in range(N_TRIES): # models - scikit_model = ScikitKNeighborsClassifier(n_neighbors=k, - weights=weight, - algorithm=algorithm, - leaf_size=30, p=2, - metric=distance) - daal_model = DaalKNeighborsClassifier(n_neighbors=k, weights=weight, - algorithm=algorithm, - leaf_size=30, p=2, - metric=distance) + scikit_model = ScikitKNeighborsClassifier( + n_neighbors=k, + weights=weight, + algorithm=algorithm, + leaf_size=30, + p=2, + metric=distance, + ) + daal_model = DaalKNeighborsClassifier( + n_neighbors=k, + weights=weight, + algorithm=algorithm, + leaf_size=30, + p=2, + metric=distance, + ) # training scikit_model.fit(x_train, y_train) daal_model.fit(x_train, y_train) @@ -65,8 +71,9 @@ def _test_determenistic(distance, algorithm, weight, k): scikit_accuracy = accuracy_score(y_test, scikit_predict) daal_accuracy = accuracy_score(y_test, daal_predict) ratio = daal_accuracy / scikit_accuracy - 
reason = ("kNN accuracy: scikit_accuracy={},daal_accuracy={}, ratio={}".format( - scikit_accuracy, daal_accuracy, ratio)) + reason = "kNN accuracy: scikit_accuracy={},daal_accuracy={}, ratio={}".format( + scikit_accuracy, daal_accuracy, ratio + ) assert ratio >= ACCURACY_RATIO, reason # predict proba @@ -77,29 +84,30 @@ def _test_determenistic(distance, algorithm, weight, k): daal_log_loss = log_loss(y_test, daal_predict_proba) ratio = daal_log_loss / scikit_log_loss reason = "kNN log_loss: scikit_log_loss={},daal_log_loss={}, ratio={}".format( - scikit_log_loss, daal_log_loss, ratio) + scikit_log_loss, daal_log_loss, ratio + ) assert ratio <= LOG_LOSS_RATIO, reason # ROC AUC - scikit_roc_auc = roc_auc_score( - y_test, scikit_predict_proba, multi_class='ovr') - daal_roc_auc = roc_auc_score( - y_test, daal_predict_proba, multi_class='ovr') + scikit_roc_auc = roc_auc_score(y_test, scikit_predict_proba, multi_class="ovr") + daal_roc_auc = roc_auc_score(y_test, daal_predict_proba, multi_class="ovr") ratio = daal_roc_auc / scikit_roc_auc reason = "kNN roc_auc: scikit_roc_auc={}, daal_roc_auc={}, ratio={}".format( - scikit_roc_auc, daal_roc_auc, ratio) + scikit_roc_auc, daal_roc_auc, ratio + ) assert ratio >= ROC_AUC_RATIO, reason for i in range(1, N_TRIES): for j, res in enumerate(alg_results[i]): - reason = 'Results are different between runs for {}, {}, {}, k={}'.format( - algorithm, weight, distance, k) + reason = "Results are different between runs for {}, {}, {}, k={}".format( + algorithm, weight, distance, k + ) assert (res == alg_results[0][j]).mean() == 1, reason -@pytest.mark.parametrize('distance', DISTANCES) -@pytest.mark.parametrize('algorithm', ALGORITHMS) -@pytest.mark.parametrize('weight', WEIGHTS) -@pytest.mark.parametrize('k', KS) +@pytest.mark.parametrize("distance", DISTANCES) +@pytest.mark.parametrize("algorithm", ALGORITHMS) +@pytest.mark.parametrize("weight", WEIGHTS) +@pytest.mark.parametrize("k", KS) def test_determenistic(distance, algorithm, weight, k): _test_determenistic(distance, algorithm, weight, k) diff --git a/daal4py/sklearn/svm/__init__.py b/daal4py/sklearn/svm/__init__.py index c0765729b5..8002e241f9 100644 --- a/daal4py/sklearn/svm/__init__.py +++ b/daal4py/sklearn/svm/__init__.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2014 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,8 +13,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from .svm import SVC -__all__ = ['SVC'] +__all__ = ["SVC"] diff --git a/daal4py/sklearn/svm/_svm_0_22.py b/daal4py/sklearn/svm/_svm_0_22.py index 7501e4f900..4b131061ec 100644 --- a/daal4py/sklearn/svm/_svm_0_22.py +++ b/daal4py/sklearn/svm/_svm_0_22.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2014 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,30 +12,26 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from __future__ import print_function -import numpy as np +import warnings +import numpy as np +import sklearn.svm._base as svm_base +import sklearn.svm._classes as svm_classes from scipy import sparse as sp from sklearn.utils import check_random_state, check_X_y -from sklearn.utils.validation import check_is_fitted, _check_sample_weight - -import sklearn.svm._classes as svm_classes -import sklearn.svm._base as svm_base -import warnings +from sklearn.utils.validation import _check_sample_weight, check_is_fitted import daal4py -from .._utils import ( - make2d, - getFPType, - sklearn_check_version, - PatchingConditionsChain) + +from .._utils import PatchingConditionsChain, getFPType, make2d, sklearn_check_version def _get_libsvm_impl(): - return ['c_svc', 'nu_svc', 'one_class', 'epsilon_svr', 'nu_svr'] + return ["c_svc", "nu_svc", "one_class", "epsilon_svr", "nu_svr"] def _dual_coef_getter(self): @@ -48,17 +44,17 @@ def _intercept_getter(self): def _dual_coef_setter(self, val): self._internal_dual_coef_ = val - if hasattr(self, 'daal_model_'): + if hasattr(self, "daal_model_"): del self.daal_model_ - if getattr(self, '_daal_fit', False): + if getattr(self, "_daal_fit", False): self._daal_fit = False def _intercept_setter(self, val): self._internal_intercept_ = val - if hasattr(self, 'daal_model_'): + if hasattr(self, "daal_model_"): del self.daal_model_ - if getattr(self, '_daal_fit', False): + if getattr(self, "_daal_fit", False): self._daal_fit = False @@ -67,8 +63,7 @@ def group_indices_by_class(num_classes, sv_ind_by_clf, labels): sv_ind_counters = np.zeros(num_classes, dtype=np.intp) num_of_sv_per_class = np.bincount(labels[np.hstack(sv_ind_by_clf)]) - sv_ind_by_class = [np.empty(n, dtype=np.int32) - for n in num_of_sv_per_class] + sv_ind_by_class = [np.empty(n, dtype=np.int32) for n in num_of_sv_per_class] for indices_per_clf in sv_ind_by_clf: for sv_index in indices_per_clf: @@ -82,6 +77,7 @@ def group_indices_by_class(num_classes, sv_ind_by_clf, labels): def map_sv_to_columns_in_dual_coef_matrix(sv_ind_by_class): from collections import defaultdict + sv_ind_mapping = defaultdict(lambda: -1) p = 0 for indices_per_class in sv_ind_by_class: @@ -98,13 +94,10 @@ def map_to_lexicographic(n): Returns permutation of reverse lexicographics to lexicographics orders for pairs of n consecutive integer indexes """ - from itertools import (combinations, count) + from itertools import combinations, count + two_class_order_gen = ((j, i) for i in range(n) for j in range(i)) - reverse_lookup = { - key: val for key, - val in zip( - two_class_order_gen, - count(0))} + reverse_lookup = {key: val for key, val in zip(two_class_order_gen, count(0))} perm_iter = (reverse_lookup[pair] for pair in combinations(range(n), 2)) return np.fromiter(perm_iter, dtype=np.intp) @@ -119,8 +112,7 @@ def extract_dual_coef(num_classes, sv_ind_by_clf, sv_coef_by_clf, labels): Construct dual coefficients array in SKLearn peculiar layout, as well corresponding support vector indexes """ - sv_ind_by_class = group_indices_by_class( - num_classes, sv_ind_by_clf, labels) + sv_ind_by_class = group_indices_by_class(num_classes, sv_ind_by_clf, labels) sv_ind_mapping = map_sv_to_columns_in_dual_coef_matrix(sv_ind_by_class) num_unique_sv = len(sv_ind_mapping) @@ -150,14 
+142,15 @@ def extract_dual_coef(num_classes, sv_ind_by_clf, sv_coef_by_clf, labels): def _daal4py_kf(kernel, X_fptype, gamma=1.0): - if kernel == 'rbf': + if kernel == "rbf": sigma_value = np.sqrt(0.5 / gamma) kf = daal4py.kernel_function_rbf(fptype=X_fptype, sigma=sigma_value) - elif kernel == 'linear': + elif kernel == "linear": kf = daal4py.kernel_function_linear(fptype=X_fptype) else: raise ValueError( - "_daal4py_fit received unexpected kernel specifiction {}.".format(kernel)) + "_daal4py_fit received unexpected kernel specifiction {}.".format(kernel) + ) return kf @@ -167,13 +160,13 @@ def _daal4py_check_weight(self, X, y, sample_weight): if sample_weight.shape[0] > 0: sample_weight = _check_sample_weight(sample_weight, X) if np.all(sample_weight <= 0): - raise ValueError( - 'Invalid input - all samples have zero or negative weights.') + raise ValueError("Invalid input - all samples have zero or negative weights.") if np.any(sample_weight <= 0): if len(np.unique(y[sample_weight > 0])) != len(self.classes_): raise ValueError( - 'Invalid input - all samples with positive weights' - ' have the same label.') + "Invalid input - all samples with positive weights" + " have the same label." + ) ww = sample_weight elif self.class_weight is not None: ww = np.ones(X.shape[0], dtype=np.float64) @@ -185,10 +178,19 @@ def _daal4py_check_weight(self, X, y, sample_weight): return ww -def _daal4py_svm(fptype, C, accuracyThreshold, tau, - maxIterations, cacheSize, doShrinking, kernel, nClasses=2): +def _daal4py_svm( + fptype, + C, + accuracyThreshold, + tau, + maxIterations, + cacheSize, + doShrinking, + kernel, + nClasses=2, +): svm_train = daal4py.svm_training( - method='thunder', + method="thunder", fptype=fptype, C=C, accuracyThreshold=accuracyThreshold, @@ -196,7 +198,7 @@ def _daal4py_svm(fptype, C, accuracyThreshold, tau, maxIterations=maxIterations, cacheSize=cacheSize, doShrinking=doShrinking, - kernel=kernel + kernel=kernel, ) if nClasses == 2: algo = svm_train @@ -204,7 +206,7 @@ def _daal4py_svm(fptype, C, accuracyThreshold, tau, algo = daal4py.multi_class_classifier_training( nClasses=nClasses, fptype=fptype, - method='oneAgainstOne', + method="oneAgainstOne", training=svm_train, ) @@ -212,7 +214,6 @@ def _daal4py_svm(fptype, C, accuracyThreshold, tau, def _daal4py_fit(self, X, y_inp, sample_weight, kernel): - if self.C <= 0: raise ValueError("C <= 0") num_classes = len(self.classes_) @@ -230,16 +231,17 @@ def _daal4py_fit(self, X, y_inp, sample_weight, kernel): X_fptype = getFPType(X) kf = _daal4py_kf(kernel, X_fptype, gamma=self._gamma) - algo = _daal4py_svm(fptype=X_fptype, - C=float(self.C), - accuracyThreshold=float(self.tol), - tau=1e-12, - maxIterations=int( - self.max_iter if self.max_iter > 0 else 2**30), - cacheSize=int(self.cache_size * 1024 * 1024), - doShrinking=bool(self.shrinking), - kernel=kf, - nClasses=num_classes) + algo = _daal4py_svm( + fptype=X_fptype, + C=float(self.C), + accuracyThreshold=float(self.tol), + tau=1e-12, + maxIterations=int(self.max_iter if self.max_iter > 0 else 2**30), + cacheSize=int(self.cache_size * 1024 * 1024), + doShrinking=bool(self.shrinking), + kernel=kf, + nClasses=num_classes, + ) res = algo.compute(data=X, labels=y, weights=ww) model = res.model @@ -252,11 +254,13 @@ def _daal4py_fit(self, X, y_inp, sample_weight, kernel): # support indexes need permutation to arrange them into the same layout # as that of Scikit-Learn - tmp = np.empty(two_class_sv_ind_.shape, dtype=np.dtype( - [('label', y.dtype), ('ind', two_class_sv_ind_.dtype)])) - 
tmp['label'][:] = y[two_class_sv_ind_].ravel() - tmp['ind'][:] = two_class_sv_ind_ - perm = np.argsort(tmp, order=['label', 'ind']) + tmp = np.empty( + two_class_sv_ind_.shape, + dtype=np.dtype([("label", y.dtype), ("ind", two_class_sv_ind_.dtype)]), + ) + tmp["label"][:] = y[two_class_sv_ind_].ravel() + tmp["ind"][:] = two_class_sv_ind_ + perm = np.argsort(tmp, order=["label", "ind"]) del tmp self.support_ = two_class_sv_ind_[perm] @@ -284,10 +288,9 @@ def _daal4py_fit(self, X, y_inp, sample_weight, kernel): two_class_sv_ind_ = svm_model.SupportIndices # Map these indexes to indexes of the training data sv_ind = np.take( - np.hstack( - (label_indexes[i1], - label_indexes[i2])), - two_class_sv_ind_.ravel()) + np.hstack((label_indexes[i1], label_indexes[i2])), + two_class_sv_ind_.ravel(), + ) sv_ind_by_clf.append(sv_ind) # svs_ = getArrayFromNumericTable(svm_model.getSupportVectors()) @@ -304,10 +307,10 @@ def _daal4py_fit(self, X, y_inp, sample_weight, kernel): intercepts = permute_list(intercepts, to_lex_perm) self.dual_coef_, self.support_ = extract_dual_coef( - num_classes, # number of classes + num_classes, # number of classes sv_ind_by_clf, # support vector indexes by two-class classifiers sv_coef_by_clf, # classification coefficients by two-class classifiers - y.squeeze().astype(np.intp, copy=False) # integer labels + y.squeeze().astype(np.intp, copy=False), # integer labels ) self.support_vectors_ = X[self.support_] self.intercept_ = np.array(intercepts) @@ -315,10 +318,12 @@ def _daal4py_fit(self, X, y_inp, sample_weight, kernel): indices = y.take(self.support_, axis=0) if num_classes == 2: self._n_support = np.array( - [np.sum(indices == -1), np.sum(indices == 1)], dtype=np.int32) + [np.sum(indices == -1), np.sum(indices == 1)], dtype=np.int32 + ) else: self._n_support = np.array( - [np.sum(indices == i) for i, c in enumerate(self.classes_)], dtype=np.int32) + [np.sum(indices == i) for i, c in enumerate(self.classes_)], dtype=np.int32 + ) try: self.probA_ = np.empty(0) @@ -329,8 +334,7 @@ def _daal4py_fit(self, X, y_inp, sample_weight, kernel): self._probB = np.empty(0) -def __compute_gamma__(gamma, kernel, X, sparse, - use_var=True, deprecation=True): +def __compute_gamma__(gamma, kernel, X, sparse, use_var=True, deprecation=True): """ Computes actual value of 'gamma' parameter of RBF kernel corresponding to SVC keyword values `gamma` and `kernel`, and feature @@ -343,23 +347,25 @@ def __compute_gamma__(gamma, kernel, X, sparse, See: https://github.com/scikit-learn/scikit-learn/pull/13221 """ if deprecation: - _gamma_is_scale = gamma in ('scale', 'auto_deprecated') + _gamma_is_scale = gamma in ("scale", "auto_deprecated") else: - _gamma_is_scale = (gamma == 'scale') + _gamma_is_scale = gamma == "scale" if _gamma_is_scale: - kernel_uses_gamma = (not callable(kernel) and kernel - not in ('linear', 'precomputed')) + kernel_uses_gamma = not callable(kernel) and kernel not in ( + "linear", + "precomputed", + ) if kernel_uses_gamma: if sparse: # var = E[X^2] - E[X]^2 - X_sc = (X.multiply(X)).mean() - (X.mean())**2 + X_sc = (X.multiply(X)).mean() - (X.mean()) ** 2 else: X_sc = X.var() if not use_var: X_sc = np.sqrt(X_sc) else: X_sc = 1.0 / X.shape[1] - if gamma == 'scale': + if gamma == "scale": if X_sc != 0: _gamma = 1.0 / (X.shape[1] * X_sc) else: @@ -370,13 +376,16 @@ def __compute_gamma__(gamma, kernel, X, sparse, # setting `gamma` in examples (also in tests). See # https://github.com/scikit-learn/scikit-learn/pull/10331 # for the examples/tests that need to be reverted. 
- warnings.warn("The default value of gamma will change " - "from 'auto' to 'scale' in version 0.22 to " - "account better for unscaled features. Set " - "gamma explicitly to 'auto' or 'scale' to " - "avoid this warning.", FutureWarning) + warnings.warn( + "The default value of gamma will change " + "from 'auto' to 'scale' in version 0.22 to " + "account better for unscaled features. Set " + "gamma explicitly to 'auto' or 'scale' to " + "avoid this warning.", + FutureWarning, + ) _gamma = 1.0 / X.shape[1] - elif gamma == 'auto': + elif gamma == "auto": _gamma = 1.0 / X.shape[1] elif isinstance(gamma, str) and not deprecation: raise ValueError( @@ -393,7 +402,8 @@ def _compute_gamma(*args): no_older_than_0_20_3 = sklearn_check_version("0.20.3") no_older_than_0_22 = not sklearn_check_version("0.22") return __compute_gamma__( - *args, use_var=no_older_than_0_20_3, deprecation=no_older_than_0_22) + *args, use_var=no_older_than_0_20_3, deprecation=no_older_than_0_22 + ) def fit(self, X, y, sample_weight=None): @@ -434,54 +444,60 @@ def fit(self, X, y, sample_weight=None): raise TypeError("Sparse precomputed kernels are not supported.") self._sparse = sparse and not callable(self.kernel) - X, y = check_X_y(X, y, dtype=np.float64, order='C', accept_sparse='csr', - accept_large_sparse=False) + X, y = check_X_y( + X, y, dtype=np.float64, order="C", accept_sparse="csr", accept_large_sparse=False + ) y = self._validate_targets(y) - sample_weight = np.asarray([] - if sample_weight is None - else sample_weight, dtype=np.float64) + sample_weight = np.asarray( + [] if sample_weight is None else sample_weight, dtype=np.float64 + ) solver_type = _get_libsvm_impl().index(self._impl) # input validation if solver_type != 2 and X.shape[0] != y.shape[0]: raise ValueError( "X and y have incompatible shapes.\n" - "X has %s samples, but y has %s." % (X.shape[0], y.shape[0])) + "X has %s samples, but y has %s." % (X.shape[0], y.shape[0]) + ) if self.kernel == "precomputed" and X.shape[0] != X.shape[1]: raise ValueError("X.shape[0] should be equal to X.shape[1]") if sample_weight.shape[0] > 0 and sample_weight.shape[0] != X.shape[0]: - raise ValueError("sample_weight and X have incompatible shapes: " - "%r vs %r\n" - "Note: Sparse matrices cannot be indexed w/" - "boolean masks (use `indices=True` in CV)." - % (sample_weight.shape, X.shape)) + raise ValueError( + "sample_weight and X have incompatible shapes: " + "%r vs %r\n" + "Note: Sparse matrices cannot be indexed w/" + "boolean masks (use `indices=True` in CV)." % (sample_weight.shape, X.shape) + ) self._gamma = _compute_gamma(self.gamma, self.kernel, X, sparse) kernel = self.kernel if callable(kernel): - kernel = 'precomputed' + kernel = "precomputed" fit = self._sparse_fit if self._sparse else self._dense_fit if self.verbose: # pragma: no cover - print('[LibSVM]', end='') + print("[LibSVM]", end="") # see comment on the other call to np.iinfo in this file - seed = rnd.randint(np.iinfo('i').max) - - _patching_status = PatchingConditionsChain( - "sklearn.svm.SVC.fit") - _dal_ready = _patching_status.and_conditions([ - (not sparse, "X is sparse. Sparse input is not supported."), - (not self.probability, "Probabilities are not supported."), - (not getattr(self, 'break_ties', False), "Breaking ties is not supported."), - (kernel in ['linear', 'rbf'], - f"'{kernel}' kernel is not supported. 
" - "Only 'linear' and 'rbf' kernels are supported.") - ]) + seed = rnd.randint(np.iinfo("i").max) + + _patching_status = PatchingConditionsChain("sklearn.svm.SVC.fit") + _dal_ready = _patching_status.and_conditions( + [ + (not sparse, "X is sparse. Sparse input is not supported."), + (not self.probability, "Probabilities are not supported."), + (not getattr(self, "break_ties", False), "Breaking ties is not supported."), + ( + kernel in ["linear", "rbf"], + f"'{kernel}' kernel is not supported. " + "Only 'linear' and 'rbf' kernels are supported.", + ), + ] + ) _patching_status.write_log() if _dal_ready: self._daal_fit = True @@ -505,8 +521,11 @@ def fit(self, X, y, sample_weight=None): self._internal_dual_coef_ *= -1 self._internal_intercept_ *= -1 - if not self._daal_fit and len(self.classes_) == 2 and self._impl in [ - 'c_svc', 'nu_svc']: + if ( + not self._daal_fit + and len(self.classes_) == 2 + and self._impl in ["c_svc", "nu_svc"] + ): self.intercept_ *= -1 self.dual_coef_ *= -1 @@ -520,9 +539,7 @@ def _daal4py_predict(self, X): kf = _daal4py_kf(self.kernel, X_fptype, gamma=self._gamma) svm_predict = daal4py.svm_prediction( - fptype=X_fptype, - method='defaultDense', - kernel=kf + fptype=X_fptype, method="defaultDense", kernel=kf ) if num_classes == 2: alg = svm_predict @@ -533,8 +550,8 @@ def _daal4py_predict(self, X): maxIterations=int(self.max_iter if self.max_iter > 0 else 2**30), accuracyThreshold=float(self.tol), pmethod="voteBased", - tmethod='oneAgainstOne', - prediction=svm_predict + tmethod="oneAgainstOne", + prediction=svm_predict, ) predictionRes = alg.compute(X, self.daal_model_) @@ -566,27 +583,37 @@ def predict(self, X): y_pred : array, shape (n_samples,) """ check_is_fitted(self) - _break_ties = getattr(self, 'break_ties', False) - if _break_ties and self.decision_function_shape == 'ovo': - raise ValueError("break_ties must be False when " - "decision_function_shape is 'ovo'") - - _patching_status = PatchingConditionsChain( - "sklearn.svm.SVC.predict") - _dal_ready = _patching_status.and_conditions([ - (not _break_ties, "Breaking ties is not supported."), - (self.decision_function_shape != 'ovr', - "'ovr' decision function shape is not supported."), - (len(self.classes_) <= 2, "Number of classes > 2.") - ], conditions_merging=any) + _break_ties = getattr(self, "break_ties", False) + if _break_ties and self.decision_function_shape == "ovo": + raise ValueError( + "break_ties must be False when " "decision_function_shape is 'ovo'" + ) + + _patching_status = PatchingConditionsChain("sklearn.svm.SVC.predict") + _dal_ready = _patching_status.and_conditions( + [ + (not _break_ties, "Breaking ties is not supported."), + ( + self.decision_function_shape != "ovr", + "'ovr' decision function shape is not supported.", + ), + (len(self.classes_) <= 2, "Number of classes > 2."), + ], + conditions_merging=any, + ) _patching_status.write_log() if not _dal_ready: y = np.argmax(self.decision_function(X), axis=1) else: X = self._validate_for_predict(X) - _dal_ready = _patching_status.and_conditions([ - (getattr(self, '_daal_fit', False) and hasattr(self, 'daal_model_'), - "oneDAL model was not trained.")]) + _dal_ready = _patching_status.and_conditions( + [ + ( + getattr(self, "_daal_fit", False) and hasattr(self, "daal_model_"), + "oneDAL model was not trained.", + ) + ] + ) if _dal_ready: y = _daal4py_predict(self, X) else: @@ -612,40 +639,88 @@ def predict(self, X): del __base_svc_init_function_code__ -if 'break_ties' in __base_svc_init_arg_names__: - class SVC(svm_base.BaseSVC): - 
_impl = 'c_svc' - - def __init__(self, C=1.0, kernel='rbf', degree=3, gamma='scale', - coef0=0.0, shrinking=True, probability=False, - tol=1e-3, cache_size=200, class_weight=None, - verbose=False, max_iter=-1, decision_function_shape='ovr', - break_ties=False, random_state=None): +if "break_ties" in __base_svc_init_arg_names__: - super(SVC, self).__init__( - kernel=kernel, degree=degree, gamma=gamma, - coef0=coef0, tol=tol, C=C, nu=0., shrinking=shrinking, - probability=probability, cache_size=cache_size, - class_weight=class_weight, verbose=verbose, max_iter=max_iter, - decision_function_shape=decision_function_shape, break_ties=break_ties, - random_state=random_state) -else: class SVC(svm_base.BaseSVC): - _impl = 'c_svc' + _impl = "c_svc" + + def __init__( + self, + C=1.0, + kernel="rbf", + degree=3, + gamma="scale", + coef0=0.0, + shrinking=True, + probability=False, + tol=1e-3, + cache_size=200, + class_weight=None, + verbose=False, + max_iter=-1, + decision_function_shape="ovr", + break_ties=False, + random_state=None, + ): + super(SVC, self).__init__( + kernel=kernel, + degree=degree, + gamma=gamma, + coef0=coef0, + tol=tol, + C=C, + nu=0.0, + shrinking=shrinking, + probability=probability, + cache_size=cache_size, + class_weight=class_weight, + verbose=verbose, + max_iter=max_iter, + decision_function_shape=decision_function_shape, + break_ties=break_ties, + random_state=random_state, + ) - def __init__(self, C=1.0, kernel='rbf', degree=3, gamma='auto_deprecated', - coef0=0.0, shrinking=True, probability=False, - tol=1e-3, cache_size=200, class_weight=None, - verbose=False, max_iter=-1, decision_function_shape='ovr', - random_state=None): +else: + class SVC(svm_base.BaseSVC): + _impl = "c_svc" + + def __init__( + self, + C=1.0, + kernel="rbf", + degree=3, + gamma="auto_deprecated", + coef0=0.0, + shrinking=True, + probability=False, + tol=1e-3, + cache_size=200, + class_weight=None, + verbose=False, + max_iter=-1, + decision_function_shape="ovr", + random_state=None, + ): super(SVC, self).__init__( - kernel=kernel, degree=degree, gamma=gamma, - coef0=coef0, tol=tol, C=C, nu=0., shrinking=shrinking, - probability=probability, cache_size=cache_size, - class_weight=class_weight, verbose=verbose, max_iter=max_iter, + kernel=kernel, + degree=degree, + gamma=gamma, + coef0=coef0, + tol=tol, + C=C, + nu=0.0, + shrinking=shrinking, + probability=probability, + cache_size=cache_size, + class_weight=class_weight, + verbose=verbose, + max_iter=max_iter, decision_function_shape=decision_function_shape, - random_state=random_state) + random_state=random_state, + ) + SVC.fit = fit SVC.predict = predict diff --git a/daal4py/sklearn/svm/_svm_0_23.py b/daal4py/sklearn/svm/_svm_0_23.py index 92de19688c..96026668ee 100755 --- a/daal4py/sklearn/svm/_svm_0_23.py +++ b/daal4py/sklearn/svm/_svm_0_23.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2014 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,40 +12,42 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== from __future__ import print_function -import numpy as np +import warnings +import numpy as np +import sklearn.svm._base as svm_base +import sklearn.svm._classes as svm_classes from scipy import sparse as sp from sklearn.calibration import CalibratedClassifierCV +from sklearn.exceptions import NotFittedError +from sklearn.model_selection import StratifiedKFold from sklearn.utils import check_random_state +from sklearn.utils.multiclass import _ovr_decision_function from sklearn.utils.validation import ( - check_is_fitted, - check_consistent_length, + _check_sample_weight, _num_samples, - _check_sample_weight) -import sklearn.svm._classes as svm_classes -import sklearn.svm._base as svm_base -import warnings -from sklearn.exceptions import NotFittedError -from sklearn.utils.multiclass import _ovr_decision_function -from sklearn.model_selection import StratifiedKFold + check_consistent_length, + check_is_fitted, +) try: from packaging.version import Version except ImportError: from distutils.version import LooseVersion as Version + from sklearn import __version__ as sklearn_version import daal4py -from .._utils import ( - make2d, getFPType, sklearn_check_version, PatchingConditionsChain) + +from .._utils import PatchingConditionsChain, getFPType, make2d, sklearn_check_version def _get_libsvm_impl(): - return ['c_svc', 'nu_svc', 'one_class', 'epsilon_svr', 'nu_svr'] + return ["c_svc", "nu_svc", "one_class", "epsilon_svr", "nu_svr"] def _dual_coef_getter(self): @@ -58,17 +60,17 @@ def _intercept_getter(self): def _dual_coef_setter(self, val): self._internal_dual_coef_ = val - if hasattr(self, 'daal_model_'): + if hasattr(self, "daal_model_"): del self.daal_model_ - if getattr(self, '_daal_fit', False): + if getattr(self, "_daal_fit", False): self._daal_fit = False def _intercept_setter(self, val): self._internal_intercept_ = val - if hasattr(self, 'daal_model_'): + if hasattr(self, "daal_model_"): del self.daal_model_ - if getattr(self, '_daal_fit', False): + if getattr(self, "_daal_fit", False): self._daal_fit = False @@ -77,8 +79,7 @@ def group_indices_by_class(num_classes, sv_ind_by_clf, labels): sv_ind_counters = np.zeros(num_classes, dtype=np.intp) num_of_sv_per_class = np.bincount(labels[np.hstack(sv_ind_by_clf)]) - sv_ind_by_class = [np.empty(n, dtype=np.int32) - for n in num_of_sv_per_class] + sv_ind_by_class = [np.empty(n, dtype=np.int32) for n in num_of_sv_per_class] for indices_per_clf in sv_ind_by_clf: for sv_index in indices_per_clf: @@ -92,6 +93,7 @@ def group_indices_by_class(num_classes, sv_ind_by_clf, labels): def map_sv_to_columns_in_dual_coef_matrix(sv_ind_by_class): from collections import defaultdict + sv_ind_mapping = defaultdict(lambda: -1) p = 0 for indices_per_class in sv_ind_by_class: @@ -104,13 +106,13 @@ def map_sv_to_columns_in_dual_coef_matrix(sv_ind_by_class): def map_to_lexicographic(n): - """ Returns permutation of reverse lexicographics to + """Returns permutation of reverse lexicographics to lexicographics orders for pairs of n consecutive integer indexes """ - from itertools import (combinations, count) + from itertools import combinations, count + two_class_order_gen = ((j, i) for i in range(n) for j in range(i)) - reverse_lookup = {key: val for key, - val in zip(two_class_order_gen, count(0))} + reverse_lookup = {key: val for key, val in zip(two_class_order_gen, count(0))} perm_iter = 
(reverse_lookup[pair] for pair in combinations(range(n), 2)) return np.fromiter(perm_iter, dtype=np.intp) @@ -121,11 +123,10 @@ def permute_list(li, perm): def extract_dual_coef(num_classes, sv_ind_by_clf, sv_coef_by_clf, labels): - """ Construct dual coefficients array in SKLearn peculiar layout, + """Construct dual coefficients array in SKLearn peculiar layout, as well corresponding support vector indexes """ - sv_ind_by_class = group_indices_by_class( - num_classes, sv_ind_by_clf, labels) + sv_ind_by_class = group_indices_by_class(num_classes, sv_ind_by_clf, labels) sv_ind_mapping = map_sv_to_columns_in_dual_coef_matrix(sv_ind_by_class) num_unique_sv = len(sv_ind_mapping) @@ -156,15 +157,17 @@ def extract_dual_coef(num_classes, sv_ind_by_clf, sv_coef_by_clf, labels): def _daal4py_kf(kernel, X_fptype, gamma=1.0, is_sparse=False): method = "fastCSR" if is_sparse else "defaultDense" - if kernel == 'rbf': + if kernel == "rbf": sigma_value = np.sqrt(0.5 / gamma) kf = daal4py.kernel_function_rbf( - fptype=X_fptype, method=method, sigma=sigma_value) - elif kernel == 'linear': + fptype=X_fptype, method=method, sigma=sigma_value + ) + elif kernel == "linear": kf = daal4py.kernel_function_linear(fptype=X_fptype, method=method) else: raise ValueError( - "_daal4py_fit received unexpected kernel specifiction {}.".format(kernel)) + "_daal4py_fit received unexpected kernel specifiction {}.".format(kernel) + ) return kf @@ -174,13 +177,13 @@ def _daal4py_check_weight(self, X, y, sample_weight): if sample_weight.shape[0] > 0: sample_weight = _check_sample_weight(sample_weight, X) if np.all(sample_weight <= 0): - raise ValueError( - 'Invalid input - all samples have zero or negative weights.') + raise ValueError("Invalid input - all samples have zero or negative weights.") if np.any(sample_weight <= 0): if len(np.unique(y[sample_weight > 0])) != len(self.classes_): raise ValueError( - 'Invalid input - all samples with positive weights ' - 'have the same label.') + "Invalid input - all samples with positive weights " + "have the same label." 
+ ) ww = sample_weight elif self.class_weight is not None: ww = np.ones(X.shape[0], dtype=np.float64) @@ -190,10 +193,19 @@ def _daal4py_check_weight(self, X, y, sample_weight): return ww -def _daal4py_svm(fptype, C, accuracyThreshold, tau, - maxIterations, cacheSize, doShrinking, kernel, nClasses=2): +def _daal4py_svm( + fptype, + C, + accuracyThreshold, + tau, + maxIterations, + cacheSize, + doShrinking, + kernel, + nClasses=2, +): svm_train = daal4py.svm_training( - method='thunder', + method="thunder", fptype=fptype, C=C, accuracyThreshold=accuracyThreshold, @@ -201,7 +213,7 @@ def _daal4py_svm(fptype, C, accuracyThreshold, tau, maxIterations=maxIterations, cacheSize=cacheSize, doShrinking=doShrinking, - kernel=kernel + kernel=kernel, ) if nClasses == 2: algo = svm_train @@ -209,7 +221,7 @@ def _daal4py_svm(fptype, C, accuracyThreshold, tau, algo = daal4py.multi_class_classifier_training( nClasses=nClasses, fptype=fptype, - method='oneAgainstOne', + method="oneAgainstOne", training=svm_train, ) @@ -227,17 +239,17 @@ def _daal4py_fit(self, X, y_inp, sample_weight, kernel, is_sparse=False): y = make2d(y_inp) X_fptype = getFPType(X) kf = _daal4py_kf(kernel, X_fptype, gamma=self._gamma, is_sparse=is_sparse) - algo = _daal4py_svm(fptype=X_fptype, - C=float(self.C), - accuracyThreshold=float(self.tol), - tau=1e-12, - maxIterations=int( - self.max_iter if self.max_iter > 0 else 2**30), - cacheSize=int( - self.cache_size * 1024 * 1024), - doShrinking=bool(self.shrinking), - kernel=kf, - nClasses=num_classes) + algo = _daal4py_svm( + fptype=X_fptype, + C=float(self.C), + accuracyThreshold=float(self.tol), + tau=1e-12, + maxIterations=int(self.max_iter if self.max_iter > 0 else 2**30), + cacheSize=int(self.cache_size * 1024 * 1024), + doShrinking=bool(self.shrinking), + kernel=kf, + nClasses=num_classes, + ) res = algo.compute(data=X, labels=y, weights=sample_weight) @@ -251,11 +263,13 @@ def _daal4py_fit(self, X, y_inp, sample_weight, kernel, is_sparse=False): # support indexes need permutation to arrange them # into the same layout as that of Scikit-Learn - tmp = np.empty(two_class_sv_ind_.shape, dtype=np.dtype( - [('label', y.dtype), ('ind', two_class_sv_ind_.dtype)])) - tmp['label'][:] = y[two_class_sv_ind_].ravel() - tmp['ind'][:] = two_class_sv_ind_ - perm = np.argsort(tmp, order=['label', 'ind']) + tmp = np.empty( + two_class_sv_ind_.shape, + dtype=np.dtype([("label", y.dtype), ("ind", two_class_sv_ind_.dtype)]), + ) + tmp["label"][:] = y[two_class_sv_ind_].ravel() + tmp["ind"][:] = two_class_sv_ind_ + perm = np.argsort(tmp, order=["label", "ind"]) del tmp self.support_ = two_class_sv_ind_[perm] @@ -285,10 +299,9 @@ def _daal4py_fit(self, X, y_inp, sample_weight, kernel, is_sparse=False): two_class_sv_ind_ = svm_model.SupportIndices # Map these indexes to indexes of the training data sv_ind = np.take( - np.hstack( - (label_indexes[i1], - label_indexes[i2])), - two_class_sv_ind_.ravel()) + np.hstack((label_indexes[i1], label_indexes[i2])), + two_class_sv_ind_.ravel(), + ) sv_ind_by_clf.append(sv_ind) # svs_ = getArrayFromNumericTable(svm_model.getSupportVectors()) @@ -305,10 +318,10 @@ def _daal4py_fit(self, X, y_inp, sample_weight, kernel, is_sparse=False): intercepts = permute_list(intercepts, to_lex_perm) self.dual_coef_, self.support_ = extract_dual_coef( - num_classes, # number of classes + num_classes, # number of classes sv_ind_by_clf, # support vector indexes by two-class classifiers sv_coef_by_clf, # classification coefficients by two-class classifiers - 
y.squeeze().astype(np.intp, copy=False) # integer labels + y.squeeze().astype(np.intp, copy=False), # integer labels ) if is_sparse: self.dual_coef_ = sp.csr_matrix(self.dual_coef_) @@ -317,7 +330,8 @@ def _daal4py_fit(self, X, y_inp, sample_weight, kernel, is_sparse=False): indices = y.take(self.support_, axis=0) self._n_support = np.array( - [np.sum(indices == i) for i, c in enumerate(self.classes_)], dtype=np.int32) + [np.sum(indices == i) for i, c in enumerate(self.classes_)], dtype=np.int32 + ) self._probA = np.empty(0) self._probB = np.empty(0) @@ -336,23 +350,25 @@ def __compute_gamma__(gamma, kernel, X, use_var=True, deprecation=True): See: https://github.com/scikit-learn/scikit-learn/pull/13221 """ if deprecation: - _gamma_is_scale = gamma in ('scale', 'auto_deprecated') + _gamma_is_scale = gamma in ("scale", "auto_deprecated") else: - _gamma_is_scale = (gamma == 'scale') + _gamma_is_scale = gamma == "scale" if _gamma_is_scale: - kernel_uses_gamma = (not callable(kernel) and kernel - not in ('linear', 'precomputed')) + kernel_uses_gamma = not callable(kernel) and kernel not in ( + "linear", + "precomputed", + ) if kernel_uses_gamma: if sp.isspmatrix(X): # var = E[X^2] - E[X]^2 - X_sc = (X.multiply(X)).mean() - (X.mean())**2 + X_sc = (X.multiply(X)).mean() - (X.mean()) ** 2 else: X_sc = X.var() if not use_var: X_sc = np.sqrt(X_sc) else: X_sc = 1.0 / X.shape[1] - if gamma == 'scale': + if gamma == "scale": if X_sc != 0: _gamma = 1.0 / (X.shape[1] * X_sc) else: @@ -363,13 +379,16 @@ def __compute_gamma__(gamma, kernel, X, use_var=True, deprecation=True): # setting `gamma` in examples (also in tests). See # https://github.com/scikit-learn/scikit-learn/pull/10331 # for the examples/tests that need to be reverted. - warnings.warn("The default value of gamma will change " - "from 'auto' to 'scale' in version 0.22 to " - "account better for unscaled features. Set " - "gamma explicitly to 'auto' or 'scale' to " - "avoid this warning.", FutureWarning) + warnings.warn( + "The default value of gamma will change " + "from 'auto' to 'scale' in version 0.22 to " + "account better for unscaled features. Set " + "gamma explicitly to 'auto' or 'scale' to " + "avoid this warning.", + FutureWarning, + ) _gamma = 1.0 / X.shape[1] - elif gamma == 'auto': + elif gamma == "auto": _gamma = 1.0 / X.shape[1] elif isinstance(gamma, str) and not deprecation: raise ValueError( @@ -386,9 +405,8 @@ def _compute_gamma(*args): no_older_than_0_20_3 = sklearn_check_version("0.20.3") no_older_than_0_22 = not sklearn_check_version("0.22") return __compute_gamma__( - *args, - use_var=no_older_than_0_20_3, - deprecation=no_older_than_0_22) + *args, use_var=no_older_than_0_20_3, deprecation=no_older_than_0_22 + ) def fit(self, X, y, sample_weight=None): @@ -429,8 +447,8 @@ def fit(self, X, y, sample_weight=None): raise TypeError("Sparse precomputed kernels are not supported.") self._sparse = is_sparse and not callable(self.kernel) - if hasattr(self, 'decision_function_shape'): - if self.decision_function_shape not in ('ovr', 'ovo'): + if hasattr(self, "decision_function_shape"): + if self.decision_function_shape not in ("ovr", "ovo"): raise ValueError( f"decision_function_shape must be either 'ovr' or 'ovo', " f"got {self.decision_function_shape}." 
@@ -439,14 +457,19 @@ def fit(self, X, y, sample_weight=None): if callable(self.kernel): check_consistent_length(X, y) else: - X, y = self._validate_data(X, y, dtype=np.float64, - order='C', accept_sparse='csr', - accept_large_sparse=False) + X, y = self._validate_data( + X, + y, + dtype=np.float64, + order="C", + accept_sparse="csr", + accept_large_sparse=False, + ) y = self._validate_targets(y) - sample_weight = np.asarray([] - if sample_weight is None - else sample_weight, dtype=np.float64) + sample_weight = np.asarray( + [] if sample_weight is None else sample_weight, dtype=np.float64 + ) solver_type = _get_libsvm_impl().index(self._impl) # input validation @@ -454,37 +477,43 @@ def fit(self, X, y, sample_weight=None): if solver_type != 2 and n_samples != y.shape[0]: raise ValueError( "X and y have incompatible shapes.\n" - "X has %s samples, but y has %s." % (n_samples, y.shape[0])) + "X has %s samples, but y has %s." % (n_samples, y.shape[0]) + ) if self.kernel == "precomputed" and n_samples != X.shape[1]: raise ValueError("X.shape[0] should be equal to X.shape[1]") if sample_weight.shape[0] > 0 and sample_weight.shape[0] != n_samples: - raise ValueError("sample_weight and X have incompatible shapes: " - "%r vs %r\n" - "Note: Sparse matrices cannot be indexed w/" - "boolean masks (use `indices=True` in CV)." - % (sample_weight.shape, X.shape)) - - kernel = 'precomputed' if callable(self.kernel) else self.kernel - if kernel == 'precomputed': + raise ValueError( + "sample_weight and X have incompatible shapes: " + "%r vs %r\n" + "Note: Sparse matrices cannot be indexed w/" + "boolean masks (use `indices=True` in CV)." % (sample_weight.shape, X.shape) + ) + + kernel = "precomputed" if callable(self.kernel) else self.kernel + if kernel == "precomputed": self._gamma = 0.0 else: self._gamma = _compute_gamma(self.gamma, kernel, X) fit = self._sparse_fit if self._sparse else self._dense_fit if self.verbose: # pragma: no cover - print('[LibSVM]', end='') + print("[LibSVM]", end="") # see comment on the other call to np.iinfo in this file - seed = rnd.randint(np.iinfo('i').max) - - _patching_status = PatchingConditionsChain( - "sklearn.svm.SVC.fit") - _dal_ready = _patching_status.and_conditions([ - (kernel in ['linear', 'rbf'], - f"'{kernel}' kernel is not supported. " - "Only 'linear' and 'rbf' kernels are supported.")]) + seed = rnd.randint(np.iinfo("i").max) + + _patching_status = PatchingConditionsChain("sklearn.svm.SVC.fit") + _dal_ready = _patching_status.and_conditions( + [ + ( + kernel in ["linear", "rbf"], + f"'{kernel}' kernel is not supported. 
" + "Only 'linear' and 'rbf' kernels are supported.", + ) + ] + ) _patching_status.write_log() if _dal_ready: sample_weight = _daal4py_check_weight(self, X, y, sample_weight) @@ -496,36 +525,37 @@ def fit(self, X, y, sample_weight=None): if self.probability: params = self.get_params() params["probability"] = False - params["decision_function_shape"] = 'ovr' + params["decision_function_shape"] = "ovr" clf_base = SVC(**params) try: n_splits = 5 cv = StratifiedKFold( - n_splits=n_splits, - shuffle=True, - random_state=self.random_state) + n_splits=n_splits, shuffle=True, random_state=self.random_state + ) if Version(sklearn_version) >= Version("0.24"): self.clf_prob = CalibratedClassifierCV( - clf_base, ensemble=False, cv=cv, method='sigmoid', - n_jobs=n_splits) + clf_base, ensemble=False, cv=cv, method="sigmoid", n_jobs=n_splits + ) else: self.clf_prob = CalibratedClassifierCV( - clf_base, cv=cv, method='sigmoid') + clf_base, cv=cv, method="sigmoid" + ) self.clf_prob.fit(X, y, sample_weight) except ValueError: clf_base = clf_base.fit(X, y, sample_weight) self.clf_prob = CalibratedClassifierCV( - clf_base, cv="prefit", method='sigmoid') + clf_base, cv="prefit", method="sigmoid" + ) self.clf_prob.fit(X, y, sample_weight) else: self._daal_fit = False fit(X, y, sample_weight, solver_type, kernel, random_seed=seed) - self.shape_fit_ = X.shape if hasattr(X, "shape") else (n_samples, ) + self.shape_fit_ = X.shape if hasattr(X, "shape") else (n_samples,) # In binary case, we need to flip the sign of coef, intercept and # decision function. Use self._intercept_ and self._dual_coef_ internally. - if not getattr(self, '_daal_fit', False): + if not getattr(self, "_daal_fit", False): self._internal_intercept_ = self.intercept_.copy() self._internal_dual_coef_ = self.dual_coef_.copy() else: @@ -535,13 +565,11 @@ def fit(self, X, y, sample_weight=None): self._internal_dual_coef_ *= -1 self._internal_intercept_ *= -1 - if not getattr( - self, - '_daal_fit', - False) and len( - self.classes_) == 2 and self._impl in [ - 'c_svc', - 'nu_svc']: + if ( + not getattr(self, "_daal_fit", False) + and len(self.classes_) == 2 + and self._impl in ["c_svc", "nu_svc"] + ): self.intercept_ *= -1 self.dual_coef_ *= -1 @@ -552,26 +580,24 @@ def _daal4py_predict(self, X, is_decision_function=False): X_fptype = getFPType(X) num_classes = len(self.classes_) - kf = _daal4py_kf(self.kernel, X_fptype, gamma=self._gamma, - is_sparse=sp.isspmatrix(X)) + kf = _daal4py_kf(self.kernel, X_fptype, gamma=self._gamma, is_sparse=sp.isspmatrix(X)) svm_predict = daal4py.svm_prediction( - fptype=X_fptype, - method='defaultDense', - kernel=kf + fptype=X_fptype, method="defaultDense", kernel=kf ) if num_classes == 2: alg = svm_predict else: - result_to_compute = 'computeDecisionFunction' \ - if is_decision_function else 'computeClassLabels' + result_to_compute = ( + "computeDecisionFunction" if is_decision_function else "computeClassLabels" + ) alg = daal4py.multi_class_classifier_prediction( nClasses=num_classes, fptype=X_fptype, pmethod="voteBased", - tmethod='oneAgainstOne', + tmethod="oneAgainstOne", resultsToEvaluate=result_to_compute, - prediction=svm_predict + prediction=svm_predict, ) predictionRes = alg.compute(X, self.daal_model_) @@ -606,27 +632,37 @@ def predict(self, X): """ check_is_fitted(self) - _break_ties = getattr(self, 'break_ties', False) - if _break_ties and self.decision_function_shape == 'ovo': - raise ValueError("break_ties must be False when " - "decision_function_shape is 'ovo'") - - _patching_status = 
PatchingConditionsChain( - "sklearn.svm.SVC.predict") - _dal_ready = _patching_status.and_conditions([ - (not _break_ties, "Breaking ties is not supported."), - (self.decision_function_shape != 'ovr', - "'ovr' decision function shape is not supported."), - (len(self.classes_) <= 2, "Number of classes > 2.") - ], conditions_merging=any) + _break_ties = getattr(self, "break_ties", False) + if _break_ties and self.decision_function_shape == "ovo": + raise ValueError( + "break_ties must be False when " "decision_function_shape is 'ovo'" + ) + + _patching_status = PatchingConditionsChain("sklearn.svm.SVC.predict") + _dal_ready = _patching_status.and_conditions( + [ + (not _break_ties, "Breaking ties is not supported."), + ( + self.decision_function_shape != "ovr", + "'ovr' decision function shape is not supported.", + ), + (len(self.classes_) <= 2, "Number of classes > 2."), + ], + conditions_merging=any, + ) _patching_status.write_log() if not _dal_ready: y = np.argmax(self.decision_function(X), axis=1) else: X = self._validate_for_predict(X) - _dal_ready = _patching_status.and_conditions([ - (getattr(self, '_daal_fit', False) and hasattr(self, 'daal_model_'), - "oneDAL model was not trained.")]) + _dal_ready = _patching_status.and_conditions( + [ + ( + getattr(self, "_daal_fit", False) and hasattr(self, "daal_model_"), + "oneDAL model was not trained.", + ) + ] + ) if _dal_ready: if self.probability and self.clf_prob is not None: y = self.clf_prob.predict(X) @@ -642,9 +678,10 @@ def predict(self, X): def _daal4py_predict_proba(self, X): X = self._validate_for_predict(X) - if getattr(self, 'clf_prob', None) is None: + if getattr(self, "clf_prob", None) is None: raise NotFittedError( - "predict_proba is not available when fitted with probability=False") + "predict_proba is not available when fitted with probability=False" + ) prob = self.clf_prob.predict_proba(X) return prob @@ -678,10 +715,10 @@ def predict_proba(self): """ self._check_proba() - _patching_status = PatchingConditionsChain( - "sklearn.svm.SVC.predict_proba") - _dal_ready = _patching_status.and_conditions([ - (getattr(self, '_daal_fit', False), "oneDAL model was not trained.")]) + _patching_status = PatchingConditionsChain("sklearn.svm.SVC.predict_proba") + _dal_ready = _patching_status.and_conditions( + [(getattr(self, "_daal_fit", False), "oneDAL model was not trained.")] + ) _patching_status.write_log() if _dal_ready: algo = self._daal4py_predict_proba @@ -717,17 +754,17 @@ def decision_function(self, X): transformation of ovo decision function. 
""" - _patching_status = PatchingConditionsChain( - "sklearn.svm.SVC.decision_function") - _dal_ready = _patching_status.and_conditions([ - (getattr(self, '_daal_fit', False), "oneDAL model was not trained.")]) + _patching_status = PatchingConditionsChain("sklearn.svm.SVC.decision_function") + _dal_ready = _patching_status.and_conditions( + [(getattr(self, "_daal_fit", False), "oneDAL model was not trained.")] + ) _patching_status.write_log() if _dal_ready: X = self._validate_for_predict(X) dec = _daal4py_predict(self, X, is_decision_function=True) else: dec = self._decision_function(X) - if self.decision_function_shape == 'ovr' and len(self.classes_) > 2: + if self.decision_function_shape == "ovr" and len(self.classes_) > 2: return _ovr_decision_function(dec < 0, -dec, len(self.classes_)) return dec @@ -749,14 +786,26 @@ def decision_function(self, X): class SVC(svm_base.BaseSVC): - _impl = 'c_svc' - - def __init__(self, C=1.0, kernel='rbf', degree=3, gamma='scale', - coef0=0.0, shrinking=True, probability=False, - tol=1e-3, cache_size=200, class_weight=None, - verbose=False, max_iter=-1, decision_function_shape='ovr', - break_ties=False, random_state=None): - + _impl = "c_svc" + + def __init__( + self, + C=1.0, + kernel="rbf", + degree=3, + gamma="scale", + coef0=0.0, + shrinking=True, + probability=False, + tol=1e-3, + cache_size=200, + class_weight=None, + verbose=False, + max_iter=-1, + decision_function_shape="ovr", + break_ties=False, + random_state=None, + ): super(SVC, self).__init__( kernel=kernel, degree=degree, @@ -764,7 +813,7 @@ def __init__(self, C=1.0, kernel='rbf', degree=3, gamma='scale', coef0=coef0, tol=tol, C=C, - nu=0., + nu=0.0, shrinking=shrinking, probability=probability, cache_size=cache_size, @@ -773,7 +822,8 @@ def __init__(self, C=1.0, kernel='rbf', degree=3, gamma='scale', max_iter=max_iter, decision_function_shape=decision_function_shape, break_ties=break_ties, - random_state=random_state) + random_state=random_state, + ) SVC.fit = fit diff --git a/daal4py/sklearn/svm/svm.py b/daal4py/sklearn/svm/svm.py index 14ca6f258e..fb15c62fdf 100644 --- a/daal4py/sklearn/svm/svm.py +++ b/daal4py/sklearn/svm/svm.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2014 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,11 +12,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== from daal4py.sklearn._utils import sklearn_check_version -if sklearn_check_version('0.23'): +if sklearn_check_version("0.23"): from ._svm_0_23 import * -elif sklearn_check_version('0.22'): +elif sklearn_check_version("0.22"): from ._svm_0_22 import * diff --git a/daal4py/sklearn/test/test_common.py b/daal4py/sklearn/test/test_common.py index eb22ecda0b..b2b59c9651 100644 --- a/daal4py/sklearn/test/test_common.py +++ b/daal4py/sklearn/test/test_common.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,46 +12,45 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== +import numpy as np import pandas as pd import pytest -import numpy as np -from daal4py.sklearn.ensemble \ - import RandomForestClassifier as DaalRandomForestClassifier -from daal4py.sklearn.ensemble \ - import RandomForestRegressor as DaalRandomForestRegressor -from daal4py.sklearn.neighbors import KNeighborsClassifier from sklearn.datasets import make_classification from sklearn.model_selection import train_test_split +from daal4py.sklearn.ensemble import RandomForestClassifier as DaalRandomForestClassifier +from daal4py.sklearn.ensemble import RandomForestRegressor as DaalRandomForestRegressor +from daal4py.sklearn.neighbors import KNeighborsClassifier + -def convert_data(data, class_name=np.array, order='C', dtype=np.float64): - if order == 'C': +def convert_data(data, class_name=np.array, order="C", dtype=np.float64): + if order == "C": data = np.ascontiguousarray(data, dtype=dtype) else: data = np.asfortranarray(data, dtype=dtype) return class_name(data) -def make_dataset(n_samples=256, n_features=5, n_classes=2, - test_size=0.5, shuffle=True): - x, y = make_classification(n_samples=n_samples, n_features=n_features, - n_classes=n_classes, random_state=777) - return train_test_split(x, y, random_state=777, - test_size=test_size, shuffle=shuffle) +def make_dataset(n_samples=256, n_features=5, n_classes=2, test_size=0.5, shuffle=True): + x, y = make_classification( + n_samples=n_samples, n_features=n_features, n_classes=n_classes, random_state=777 + ) + return train_test_split(x, y, random_state=777, test_size=test_size, shuffle=shuffle) ESTIMATORS = { - 'KNeighborsClassifier': - KNeighborsClassifier(n_neighbors=10), - 'DaalRandomForestClassifier': - DaalRandomForestClassifier(n_estimators=10, random_state=777), - 'DaalRandomForestRegressor': - DaalRandomForestRegressor(n_estimators=10, random_state=777), + "KNeighborsClassifier": KNeighborsClassifier(n_neighbors=10), + "DaalRandomForestClassifier": DaalRandomForestClassifier( + n_estimators=10, random_state=777 + ), + "DaalRandomForestRegressor": DaalRandomForestRegressor( + n_estimators=10, random_state=777 + ), } -ORDERS = ['C', 'F'] +ORDERS = ["C", "F"] DATA_FORMATS = [pd.DataFrame, np.array] @@ -70,10 +69,11 @@ def check_data_formats_diff(name): for i in range(1, len(alg_results)): for 
j, res in enumerate(alg_results[i]): - assert (res == alg_results[0][j]).mean() == 1, \ - ('Results are different between formats: estimator=%s' % (name)) + assert ( + res == alg_results[0][j] + ).mean() == 1, "Results are different between formats: estimator=%s" % (name) -@pytest.mark.parametrize('name', ESTIMATORS) +@pytest.mark.parametrize("name", ESTIMATORS) def test_data_formats_diff(name): check_data_formats_diff(name) diff --git a/daal4py/sklearn/tree/__init__.py b/daal4py/sklearn/tree/__init__.py index 95a262ed20..7ffdab77a7 100644 --- a/daal4py/sklearn/tree/__init__.py +++ b/daal4py/sklearn/tree/__init__.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2014 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,8 +13,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from .decision_tree import DecisionTreeClassifier -__all__ = ['DecisionTreeClassifier'] +__all__ = ["DecisionTreeClassifier"] diff --git a/daal4py/sklearn/tree/decision_tree.py b/daal4py/sklearn/tree/decision_tree.py index b20bc75b90..8ec6f73439 100644 --- a/daal4py/sklearn/tree/decision_tree.py +++ b/daal4py/sklearn/tree/decision_tree.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2014 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,23 +12,24 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== # daal4py DecisionTree scikit-learn-compatible estimator classes -import numpy as np import numbers import warnings + +import numpy as np +from scipy.sparse import issparse from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.exceptions import DataConversionWarning -from sklearn.utils.validation import ( - check_array, check_is_fitted, check_consistent_length -) from sklearn.utils.multiclass import check_classification_targets +from sklearn.utils.validation import check_array, check_consistent_length, check_is_fitted + import daal4py as d4p -from .._utils import (make2d, getFPType) + from .._device_offload import support_usm_ndarray -from scipy.sparse import issparse +from .._utils import getFPType, make2d class DecisionTreeClassifier(BaseEstimator, ClassifierMixin): @@ -71,8 +72,10 @@ class DecisionTreeClassifier(BaseEstimator, ClassifierMixin): nBins is the number of bins used to compute probabilities of the observations belonging to the class. 
""" - def __init__(self, max_depth=None, min_observations_in_leaf_node=1, - split_criterion='gini'): + + def __init__( + self, max_depth=None, min_observations_in_leaf_node=1, split_criterion="gini" + ): self.max_depth = max_depth self.min_observations_in_leaf_node = min_observations_in_leaf_node self.split_criterion = split_criterion @@ -94,9 +97,11 @@ def _daal4py_fit(self, X, y, w, pruning_set=None): _pruning_X = make2d(_pruning_X) _pruning_y = make2d(_pruning_y) else: - raise ValueError("pruning_set parameter is expected to be " - "a tuple of pruning features and pruning " - "dependent variables") + raise ValueError( + "pruning_set parameter is expected to be " + "a tuple of pruning features and pruning " + "dependent variables" + ) if w is not None: w = make2d(np.asarray(w)) @@ -109,11 +114,11 @@ def _daal4py_fit(self, X, y, w, pruning_set=None): splitCriterion=self.split_criterion, maxTreeDepth=daal_max_tree_depth, minObservationsInLeafNodes=int(self.min_observations_in_leaf_node), - pruning=_pruning) - res = alg.compute(X, y, - dataForPruning=_pruning_X, - labelsForPruning=_pruning_y, - weights=w) + pruning=_pruning, + ) + res = alg.compute( + X, y, dataForPruning=_pruning_X, labelsForPruning=_pruning_y, weights=w + ) self.daal_model_ = res.model self._cached_tree_state_ = None @@ -122,7 +127,7 @@ def _get_tree_state(self): Internal utility that returns an array behind scikit-learn's tree object from daal_model_ produced by call to fit """ - check_is_fitted(self, ['daal_model_', '_cached_tree_state_']) + check_is_fitted(self, ["daal_model_", "_cached_tree_state_"]) if self._cached_tree_state_ is None: tree_state_class = d4p.getTreeState(self.daal_model_, int(self.n_classes_)) self._cached_tree_state_ = tree_state_class @@ -170,20 +175,26 @@ def fit(self, X, y, sample_weight=None, pruning_set=None): onedal-documentation.html """ - if self.split_criterion not in ('gini', 'infoGain'): - raise ValueError('Parameter "split_criterion" must be ' - '"gini" or "infoGain".') + if self.split_criterion not in ("gini", "infoGain"): + raise ValueError( + 'Parameter "split_criterion" must be ' '"gini" or "infoGain".' + ) - if not isinstance(self.max_depth, numbers.Integral) or \ - self.max_depth < 0: + if not isinstance(self.max_depth, numbers.Integral) or self.max_depth < 0: if self.max_depth is not None: - raise ValueError('Parameter "max_depth" must be ' - 'a non-negative integer value or None.') - - if not isinstance(self.min_observations_in_leaf_node, numbers.Integral) or \ - self.min_observations_in_leaf_node <= 0: - raise ValueError('Parameter "min_observations_in_leaf_node" must be ' - 'non-zero positive integer value.') + raise ValueError( + 'Parameter "max_depth" must be ' + "a non-negative integer value or None." + ) + + if ( + not isinstance(self.min_observations_in_leaf_node, numbers.Integral) + or self.min_observations_in_leaf_node <= 0 + ): + raise ValueError( + 'Parameter "min_observations_in_leaf_node" must be ' + "non-zero positive integer value." + ) X = check_array(X, dtype=[np.single, np.double]) y = np.asarray(y) @@ -194,7 +205,8 @@ def fit(self, X, y, sample_weight=None, pruning_set=None): "A column-vector y was passed when a 1d array was" " expected. 
Please change the shape of y to " "(n_samples,), for example using ravel().", - DataConversionWarning, stacklevel=2 + DataConversionWarning, + stacklevel=2, ) check_consistent_length(X, y) @@ -207,9 +219,11 @@ def fit(self, X, y, sample_weight=None, pruning_set=None): self.n_outputs_ = y.shape[1] if self.n_outputs_ != 1: _class_name = self.__class__.__name__ - raise ValueError(_class_name + " does not currently support " - "multi-output data. " - "Consider using OneHotEncoder") + raise ValueError( + _class_name + " does not currently support " + "multi-output data. " + "Consider using OneHotEncoder" + ) y = check_array(y, ensure_2d=False, dtype=None) check_classification_targets(y) @@ -221,8 +235,9 @@ def fit(self, X, y, sample_weight=None, pruning_set=None): y_store_unique_indices = np.zeros(y.shape, dtype=np.int) for k in range(self.n_outputs_): - classes_k, y_store_unique_indices[:, k] = \ - np.unique(y[:, k], return_inverse=True) + classes_k, y_store_unique_indices[:, k] = np.unique( + y[:, k], return_inverse=True + ) self.classes_.append(classes_k) self.n_classes_.append(classes_k.shape[0]) y = y_store_unique_indices @@ -243,17 +258,16 @@ def _validate_X_predict(self, X, check_input): """Validate X whenever one tries to predict, apply, predict_proba""" if check_input: X = check_array(X, dtype=[np.single, np.double], accept_sparse="csr") - if issparse(X) and \ - (X.indices.dtype != np.intc or X.indptr.dtype != np.intc): - raise ValueError("No support for np.int64 index based " - "sparse matrices") + if issparse(X) and (X.indices.dtype != np.intc or X.indptr.dtype != np.intc): + raise ValueError("No support for np.int64 index based " "sparse matrices") n_features = X.shape[1] if self.n_features_ != n_features: - raise ValueError("Number of features of the model must " - "match the input. Model n_features is %s and " - "input n_features is %s " - % (self.n_features_, n_features)) + raise ValueError( + "Number of features of the model must " + "match the input. Model n_features is %s and " + "input n_features is %s " % (self.n_features_, n_features) + ) return X @@ -264,21 +278,21 @@ def _daal4py_predict(self, X): method="defaultDense", nBins=1, nClasses=self.n_classes_, - resultsToEvaluate="computeClassLabels" + resultsToEvaluate="computeClassLabels", ) res = alg.compute(X, self.daal_model_) return res.prediction.ravel() @support_usm_ndarray() def predict(self, X, check_input=True): - check_is_fitted(self, 'daal_model_') + check_is_fitted(self, "daal_model_") X = self._validate_X_predict(X, check_input) y = self._daal4py_predict(X) return self.classes_.take(np.asarray(y, dtype=np.intp), axis=0) @support_usm_ndarray() def predict_proba(self, X, check_input=True): - check_is_fitted(self, 'daal_model_') + check_is_fitted(self, "daal_model_") X = self._validate_X_predict(X, check_input) y = self._daal4py_predict(X) return self.classes_.take(np.asarray(y, dtype=np.intp), axis=0) diff --git a/daal4py/sklearn/utils/__init__.py b/daal4py/sklearn/utils/__init__.py index 810c35da04..6b7cf7b664 100644 --- a/daal4py/sklearn/utils/__init__.py +++ b/daal4py/sklearn/utils/__init__.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2014 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,9 +12,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from .validation import _assert_all_finite -__all__ = ['_assert_all_finite', '_daal_check_array', '_daal_check_X_y', - '_daal_validate_data'] +__all__ = [ + "_assert_all_finite", + "_daal_check_array", + "_daal_check_X_y", + "_daal_validate_data", +] diff --git a/daal4py/sklearn/utils/base.py b/daal4py/sklearn/utils/base.py index e7718d8f87..586f137ee0 100644 --- a/daal4py/sklearn/utils/base.py +++ b/daal4py/sklearn/utils/base.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,13 +12,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from .validation import _daal_check_array, _daal_check_X_y -def _daal_validate_data(self, X, y=None, reset=True, - validate_separately=False, **check_params): +def _daal_validate_data( + self, X, y=None, reset=True, validate_separately=False, **check_params +): """Validate input data and set or check the `n_features_in_` attribute. Parameters @@ -49,7 +50,7 @@ def _daal_validate_data(self, X, y=None, reset=True, """ if y is None: - if self._get_tags()['requires_y']: + if self._get_tags()["requires_y"]: raise ValueError( f"This {self.__class__.__name__} estimator " f"requires y to be passed, but the target y is None." @@ -69,6 +70,6 @@ def _daal_validate_data(self, X, y=None, reset=True, X, y = _daal_check_X_y(X, y, **check_params) out = X, y - if check_params.get('ensure_2d', True): + if check_params.get("ensure_2d", True): self._check_n_features(X, reset=reset) return out diff --git a/daal4py/sklearn/utils/validation.py b/daal4py/sklearn/utils/validation.py index b9e6aa959a..a57aea6f18 100644 --- a/daal4py/sklearn/utils/validation.py +++ b/daal4py/sklearn/utils/validation.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2014 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,38 +12,55 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== -import numpy as np -import daal4py as d4p -from sklearn import get_config as _get_config -from sklearn.utils.fixes import _object_dtype_isnan import warnings from contextlib import suppress + +import numpy as np import scipy.sparse as sp from numpy.core.numeric import ComplexWarning -from sklearn.utils.validation import (_num_samples, _ensure_no_complex_data, - _ensure_sparse_format, column_or_1d, - check_consistent_length) -from sklearn.utils.validation import _assert_all_finite as _sklearn_assert_all_finite +from sklearn import get_config as _get_config from sklearn.utils.extmath import _safe_accumulator_op -from .._utils import (is_DataFrame, get_dtype, get_number_of_types, - sklearn_check_version, PatchingConditionsChain) +from sklearn.utils.fixes import _object_dtype_isnan +from sklearn.utils.validation import _assert_all_finite as _sklearn_assert_all_finite +from sklearn.utils.validation import ( + _ensure_no_complex_data, + _ensure_sparse_format, + _num_samples, + check_consistent_length, + column_or_1d, +) +import daal4py as d4p -def _assert_all_finite(X, allow_nan=False, msg_dtype=None, - estimator_name=None, input_name=""): - if _get_config()['assume_finite']: +from .._utils import ( + PatchingConditionsChain, + get_dtype, + get_number_of_types, + is_DataFrame, + sklearn_check_version, +) + + +def _assert_all_finite( + X, allow_nan=False, msg_dtype=None, estimator_name=None, input_name="" +): + if _get_config()["assume_finite"]: return # Data with small size has too big relative overhead # TODO: tune threshold size - if hasattr(X, 'size'): + if hasattr(X, "size"): if X.size < 32768: if sklearn_check_version("1.1"): - _sklearn_assert_all_finite(X, allow_nan=allow_nan, msg_dtype=msg_dtype, - estimator_name=estimator_name, - input_name=input_name) + _sklearn_assert_all_finite( + X, + allow_nan=allow_nan, + msg_dtype=msg_dtype, + estimator_name=estimator_name, + input_name=input_name, + ) else: _sklearn_assert_all_finite(X, allow_nan=allow_nan, msg_dtype=msg_dtype) return @@ -57,25 +74,28 @@ def _assert_all_finite(X, allow_nan=False, msg_dtype=None, lst = [] for idx in X: arr = X[idx].to_numpy() - lst.append(arr if arr.flags['C_CONTIGUOUS'] else np.ascontiguousarray(arr)) + lst.append(arr if arr.flags["C_CONTIGUOUS"] else np.ascontiguousarray(arr)) else: X = np.asanyarray(X) is_df = False dt = np.dtype(get_dtype(X)) - is_float = dt.kind in 'fc' + is_float = dt.kind in "fc" msg_err = "Input {} contains {} or a value too large for {!r}." 
- type_err = 'infinity' if allow_nan else 'NaN, infinity' - err = msg_err.format( - input_name, type_err, msg_dtype if msg_dtype is not None else dt) + type_err = "infinity" if allow_nan else "NaN, infinity" + err = msg_err.format(input_name, type_err, msg_dtype if msg_dtype is not None else dt) _patching_status = PatchingConditionsChain( - 'sklearn.utils.validation._assert_all_finite') - _dal_ready = _patching_status.and_conditions([ - (X.ndim in [1, 2], "X has not 1 or 2 dimensions."), - (not np.any(np.equal(X.shape, 0)), "X shape contains 0."), - (dt in [np.float32, np.float64], "X dtype is not float32 or float64.")]) + "sklearn.utils.validation._assert_all_finite" + ) + _dal_ready = _patching_status.and_conditions( + [ + (X.ndim in [1, 2], "X has not 1 or 2 dimensions."), + (not np.any(np.equal(X.shape, 0)), "X shape contains 0."), + (dt in [np.float32, np.float64], "X dtype is not float32 or float64."), + ] + ) _patching_status.write_log() if _dal_ready: if X.ndim == 1: @@ -96,35 +116,43 @@ def _assert_all_finite(X, allow_nan=False, msg_dtype=None, elif is_float and (np.isfinite(_safe_accumulator_op(np.sum, X))): pass elif is_float: - if allow_nan and np.isinf(X).any() or \ - not allow_nan and not np.isfinite(X).all(): + if allow_nan and np.isinf(X).any() or not allow_nan and not np.isfinite(X).all(): raise ValueError(err) # for object dtype data, we only check for NaNs (GH-13254) - elif dt == np.dtype('object') and not allow_nan: + elif dt == np.dtype("object") and not allow_nan: if _object_dtype_isnan(X).any(): raise ValueError(f"Input {input_name} contains NaN") -def _pandas_check_array(array, array_orig, force_all_finite, ensure_min_samples, - ensure_min_features, copy, context): +def _pandas_check_array( + array, + array_orig, + force_all_finite, + ensure_min_samples, + ensure_min_features, + copy, + context, +): if force_all_finite: - _assert_all_finite(array, allow_nan=force_all_finite == 'allow-nan') + _assert_all_finite(array, allow_nan=force_all_finite == "allow-nan") if ensure_min_samples > 0: n_samples = _num_samples(array) if n_samples < ensure_min_samples: - raise ValueError("Found array with %d sample(s) (shape=%s) while a" - " minimum of %d is required%s." - % (n_samples, array.shape, ensure_min_samples, - context)) + raise ValueError( + "Found array with %d sample(s) (shape=%s) while a" + " minimum of %d is required%s." + % (n_samples, array.shape, ensure_min_samples, context) + ) if ensure_min_features > 0: n_features = array.shape[1] if n_features < ensure_min_features: - raise ValueError("Found array with %d feature(s) (shape=%s) while" - " a minimum of %d is required%s." - % (n_features, array.shape, ensure_min_features, - context)) + raise ValueError( + "Found array with %d feature(s) (shape=%s) while" + " a minimum of %d is required%s." 
+ % (n_features, array.shape, ensure_min_features, context) + ) if copy and np.may_share_memory(array, array_orig): array = array.copy() @@ -132,11 +160,21 @@ def _pandas_check_array(array, array_orig, force_all_finite, ensure_min_samples, return array -def _daal_check_array(array, accept_sparse=False, *, accept_large_sparse=True, - dtype="numeric", order=None, copy=False, force_all_finite=True, - ensure_2d=True, allow_nd=False, ensure_min_samples=1, - ensure_min_features=1, estimator=None): - +def _daal_check_array( + array, + accept_sparse=False, + *, + accept_large_sparse=True, + dtype="numeric", + order=None, + copy=False, + force_all_finite=True, + ensure_2d=True, + allow_nd=False, + ensure_min_samples=1, + ensure_min_features=1, + estimator=None, +): """Input validation on an array, list, sparse matrix or similar. By default, the input is checked to be a non-empty 2D array containing @@ -219,9 +257,11 @@ def _daal_check_array(array, accept_sparse=False, *, accept_large_sparse=True, array_converted : object The converted and validated array. """ - if force_all_finite not in (True, False, 'allow-nan'): - raise ValueError('force_all_finite should be a bool or "allow-nan"' - '. Got {!r} instead'.format(force_all_finite)) + if force_all_finite not in (True, False, "allow-nan"): + raise ValueError( + 'force_all_finite should be a bool or "allow-nan"' + ". Got {!r} instead".format(force_all_finite) + ) if estimator is not None: if isinstance(estimator, str): @@ -237,17 +277,23 @@ def _daal_check_array(array, accept_sparse=False, *, accept_large_sparse=True, # a branch for heterogeneous pandas.DataFrame if is_DataFrame(array) and get_number_of_types(array) > 1: from pandas.api.types import is_sparse - if hasattr(array, 'sparse') or \ - not array.dtypes.apply(is_sparse).any(): - return _pandas_check_array(array, array_orig, force_all_finite, - ensure_min_samples, ensure_min_features, - copy, context) + + if hasattr(array, "sparse") or not array.dtypes.apply(is_sparse).any(): + return _pandas_check_array( + array, + array_orig, + force_all_finite, + ensure_min_samples, + ensure_min_features, + copy, + context, + ) # store whether originally we wanted numeric dtype dtype_numeric = isinstance(dtype, str) and dtype == "numeric" dtype_orig = getattr(array, "dtype", None) - if not hasattr(dtype_orig, 'kind'): + if not hasattr(dtype_orig, "kind"): # not a data type (e.g. a column named dtype in a pandas DataFrame) dtype_orig = None @@ -255,13 +301,13 @@ def _daal_check_array(array, accept_sparse=False, *, accept_large_sparse=True, # DataFrame), and store them. If not, store None. dtypes_orig = None has_pd_integer_array = False - if hasattr(array, "dtypes") and hasattr(array.dtypes, '__array__'): + if hasattr(array, "dtypes") and hasattr(array.dtypes, "__array__"): # throw warning if columns are sparse. If all columns are sparse, then # array.sparse exists and sparsity will be perserved (later). with suppress(ImportError): from pandas.api.types import is_sparse - if not hasattr(array, 'sparse') and \ - array.dtypes.apply(is_sparse).any(): + + if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any(): warnings.warn( "pandas.DataFrame with sparse columns found." "It will be converted to a dense numpy array." 
@@ -270,20 +316,36 @@ def _daal_check_array(array, accept_sparse=False, *, accept_large_sparse=True, dtypes_orig = list(array.dtypes) # pandas boolean dtype __array__ interface coerces bools to objects for i, dtype_iter in enumerate(dtypes_orig): - if dtype_iter.kind == 'b': + if dtype_iter.kind == "b": dtypes_orig[i] = np.dtype(np.object) elif dtype_iter.name.startswith(("Int", "UInt")): # name looks like an Integer Extension Array, now check for # the dtype with suppress(ImportError): - from pandas import (Int8Dtype, Int16Dtype, - Int32Dtype, Int64Dtype, - UInt8Dtype, UInt16Dtype, - UInt32Dtype, UInt64Dtype) - if isinstance(dtype_iter, (Int8Dtype, Int16Dtype, - Int32Dtype, Int64Dtype, - UInt8Dtype, UInt16Dtype, - UInt32Dtype, UInt64Dtype)): + from pandas import ( + Int8Dtype, + Int16Dtype, + Int32Dtype, + Int64Dtype, + UInt8Dtype, + UInt16Dtype, + UInt32Dtype, + UInt64Dtype, + ) + + if isinstance( + dtype_iter, + ( + Int8Dtype, + Int16Dtype, + Int32Dtype, + Int64Dtype, + UInt8Dtype, + UInt16Dtype, + UInt32Dtype, + UInt64Dtype, + ), + ): has_pd_integer_array = True if all(isinstance(dtype, np.dtype) for dtype in dtypes_orig): @@ -310,16 +372,20 @@ def _daal_check_array(array, accept_sparse=False, *, accept_large_sparse=True, array = array.astype(dtype) # When all dataframe columns are sparse, convert to a sparse array - if hasattr(array, 'sparse') and array.ndim > 1: + if hasattr(array, "sparse") and array.ndim > 1: # DataFrame.sparse only supports `to_coo` array = array.sparse.to_coo() if sp.issparse(array): _ensure_no_complex_data(array) - array = _ensure_sparse_format(array, accept_sparse=accept_sparse, - dtype=dtype, copy=copy, - force_all_finite=force_all_finite, - accept_large_sparse=accept_large_sparse) + array = _ensure_sparse_format( + array, + accept_sparse=accept_sparse, + dtype=dtype, + copy=copy, + force_all_finite=force_all_finite, + accept_large_sparse=accept_large_sparse, + ) else: # If np.array(..) gives ComplexWarning, then we convert the warning # to an error. This is needed because specifying a non complex @@ -328,21 +394,19 @@ def _daal_check_array(array, accept_sparse=False, *, accept_large_sparse=True, # of warnings context manager. with warnings.catch_warnings(): try: - warnings.simplefilter('error', ComplexWarning) - if dtype is not None and np.dtype(dtype).kind in 'iu': + warnings.simplefilter("error", ComplexWarning) + if dtype is not None and np.dtype(dtype).kind in "iu": # Conversion float -> int should not contain NaN or # inf (numpy#14412). We cannot use casting='safe' because # then conversion float -> int would be disallowed. array = np.asarray(array, order=order) - if array.dtype.kind == 'f': - _assert_all_finite(array, allow_nan=False, - msg_dtype=dtype) + if array.dtype.kind == "f": + _assert_all_finite(array, allow_nan=False, msg_dtype=dtype) array = array.astype(dtype, casting="unsafe", copy=False) else: array = np.asarray(array, order=order, dtype=dtype) except ComplexWarning: - raise ValueError("Complex data not supported\n" - "{}\n".format(array)) + raise ValueError("Complex data not supported\n" "{}\n".format(array)) # It is possible that the np.array(..) gave no warning. This happens # when no dtype conversion happened, for example dtype = None. 
The @@ -357,14 +421,16 @@ def _daal_check_array(array, accept_sparse=False, *, accept_large_sparse=True, "Expected 2D array, got scalar array instead:\narray={}.\n" "Reshape your data either using array.reshape(-1, 1) if " "your data has a single feature or array.reshape(1, -1) " - "if it contains a single sample.".format(array)) + "if it contains a single sample.".format(array) + ) # If input is 1D raise error if array.ndim == 1: raise ValueError( "Expected 2D array, got 1D array instead:\narray={}.\n" "Reshape your data either using array.reshape(-1, 1) if " "your data has a single feature or array.reshape(1, -1) " - "if it contains a single sample.".format(array)) + "if it contains a single sample.".format(array) + ) # in the future np.flexible dtypes will be handled like object dtypes if dtype_numeric and np.issubdtype(array.dtype, np.flexible): @@ -375,33 +441,39 @@ def _daal_check_array(array, accept_sparse=False, *, accept_large_sparse=True, "a float dtype before using it in scikit-learn, " "for example by using " "your_array = your_array.astype(np.float64).", - FutureWarning, stacklevel=2) + FutureWarning, + stacklevel=2, + ) # make sure we actually converted to numeric: if dtype_numeric and array.dtype.kind == "O": array = array.astype(np.float64) if not allow_nd and array.ndim >= 3: - raise ValueError("Found array with dim %d. %s expected <= 2." - % (array.ndim, estimator_name)) + raise ValueError( + "Found array with dim %d. %s expected <= 2." + % (array.ndim, estimator_name) + ) if force_all_finite: - _assert_all_finite(array, allow_nan=force_all_finite == 'allow-nan') + _assert_all_finite(array, allow_nan=force_all_finite == "allow-nan") if ensure_min_samples > 0: n_samples = _num_samples(array) if n_samples < ensure_min_samples: - raise ValueError("Found array with %d sample(s) (shape=%s) while a" - " minimum of %d is required%s." - % (n_samples, array.shape, ensure_min_samples, - context)) + raise ValueError( + "Found array with %d sample(s) (shape=%s) while a" + " minimum of %d is required%s." + % (n_samples, array.shape, ensure_min_samples, context) + ) if ensure_min_features > 0 and array.ndim == 2: n_features = array.shape[1] if n_features < ensure_min_features: - raise ValueError("Found array with %d feature(s) (shape=%s) while" - " a minimum of %d is required%s." - % (n_features, array.shape, ensure_min_features, - context)) + raise ValueError( + "Found array with %d feature(s) (shape=%s) while" + " a minimum of %d is required%s." + % (n_features, array.shape, ensure_min_features, context) + ) if copy and np.may_share_memory(array, array_orig): array = np.array(array, dtype=dtype, order=order) @@ -409,11 +481,24 @@ def _daal_check_array(array, accept_sparse=False, *, accept_large_sparse=True, return array -def _daal_check_X_y(X, y, accept_sparse=False, *, accept_large_sparse=True, - dtype="numeric", order=None, copy=False, force_all_finite=True, - ensure_2d=True, allow_nd=False, multi_output=False, - ensure_min_samples=1, ensure_min_features=1, y_numeric=False, - estimator=None): +def _daal_check_X_y( + X, + y, + accept_sparse=False, + *, + accept_large_sparse=True, + dtype="numeric", + order=None, + copy=False, + force_all_finite=True, + ensure_2d=True, + allow_nd=False, + multi_output=False, + ensure_min_samples=1, + ensure_min_features=1, + y_numeric=False, + estimator=None, +): """Input validation for standard estimators. Checks X and y for consistent length, enforces X to be 2D and y 1D. 
@@ -516,22 +601,27 @@ def _daal_check_X_y(X, y, accept_sparse=False, *, accept_large_sparse=True, raise ValueError("y cannot be None") X = _daal_check_array( - X, accept_sparse=accept_sparse, + X, + accept_sparse=accept_sparse, accept_large_sparse=accept_large_sparse, - dtype=dtype, order=order, copy=copy, + dtype=dtype, + order=order, + copy=copy, force_all_finite=force_all_finite, - ensure_2d=ensure_2d, allow_nd=allow_nd, + ensure_2d=ensure_2d, + allow_nd=allow_nd, ensure_min_samples=ensure_min_samples, ensure_min_features=ensure_min_features, - estimator=estimator + estimator=estimator, ) if multi_output: - y = _daal_check_array(y, accept_sparse='csr', force_all_finite=True, - ensure_2d=False, dtype=None) + y = _daal_check_array( + y, accept_sparse="csr", force_all_finite=True, ensure_2d=False, dtype=None + ) else: y = column_or_1d(y, warn=True) _assert_all_finite(y) - if y_numeric and hasattr(y, 'dtype') and y.dtype.kind == 'O': + if y_numeric and hasattr(y, "dtype") and y.dtype.kind == "O": y = y.astype(np.float64) check_consistent_length(X, y) diff --git a/deselected_tests.yaml b/deselected_tests.yaml index a76a5ee6dc..0c462ed88c 100755 --- a/deselected_tests.yaml +++ b/deselected_tests.yaml @@ -645,6 +645,7 @@ gpu: - ensemble/tests/test_forest.py::test_forest_classifier_oob - ensemble/tests/test_forest.py::test_forest_regressor_oob - tests/test_common.py::test_search_cv + - manifold/tests/test_t_sne.py::test_n_iter_without_progress # KMeans based (unsupported for GPU) - cluster/tests/test_k_means.py @@ -724,3 +725,380 @@ gpu: - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsClassifier-100-1000-l2-1000-5-100] - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsRegressor-50-500-l2-1000-5-100] - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsRegressor-100-1000-l2-1000-5-100] + # failing due to numeric/code error + - ensemble/tests/test_bagging.py::test_parallel_classification + - linear_model/tests/test_common.py::test_balance_property[42-False-LogisticRegressionCV] + - sklearn/manifold/tests/test_t_sne.py::test_n_iter_without_progress + - model_selection/tests/test_search.py::test_searchcv_raise_warning_with_non_finite_score[RandomizedSearchCV-specialized_params1-False] + - model_selection/tests/test_search.py::test_searchcv_raise_warning_with_non_finite_score[RandomizedSearchCV-specialized_params1-True] + - tests/test_calibration.py::test_calibrated_classifier_cv_double_sample_weights_equivalence + - tests/test_calibration.py::test_calibrated_classifier_cv_zeros_sample_weights_equivalence + - tests/test_common.py::test_estimators[FeatureAgglomeration()-check_parameters_default_constructible] + - tests/test_common.py::test_estimators[LocalOutlierFactor()-check_estimator_sparse_data] + - tests/test_common.py::test_transformers_get_feature_names_out[StackingRegressor(estimators=[('est1',Ridge(alpha=0.1)),('est2',Ridge(alpha=1))])] + - tests/test_common.py::test_transformers_get_feature_names_out[VotingRegressor(estimators=[('est1',Ridge(alpha=0.1)),('est2',Ridge(alpha=1))])] + - tests/test_common.py::test_f_contiguous_array_estimator[TSNE] + - manifold/tests/test_t_sne.py::test_tsne_works_with_pandas_output + + # RuntimeError: Device support is not implemented, failing as a result of fallback to CPU being disabled + # NearestNeighbors + - cluster/tests/test_dbscan.py + - cluster/tests/test_spectral + - 
manifold/tests/test_t_sne.py::test_binary_search_neighbors + - manifold/tests/test_t_sne.py::test_binary_perplexity_stability + - manifold/tests/test_t_sne.py::test_gradient_bh_multithread_match_sequential + - neighbors/tests/test_kde.py::test_kernel_density_sampling + - neighbors/tests/test_lof.py + - tests/test_common.py::test_check_n_features_in_after_fitting[LocalOutlierFactor()] + - tests/test_common.py::test_check_n_features_in_after_fitting[NearestNeighbors()] + - tests/test_common.py::test_f_contiguous_array_estimator[LocalOutlierFactor] + - tests/test_common.py::test_estimators[NearestNeighbors()- + - model_selection/tests/test_search.py::test_search_cv_score_samples_method[search_cv0] + - model_selection/tests/test_search.py::test_search_cv_score_samples_method[search_cv1] + - tests/test_common.py::test_estimators[LocalOutlierFactor()-check_outliers_fit_predict] + - tests/test_common.py::test_estimators[LocalOutlierFactor()-check_fit_idempotent] + - tests/test_common.py::test_estimators[LocalOutlierFactor()-check_fit_check_is_fitted] + - tests/test_common.py::test_estimators[LocalOutlierFactor()-check_n_features_in] + - manifold/tests/test_t_sne.py::test_barnes_hut_angle + # KNeighborsRegressor + - ensemble/tests/test_bagging.py::test_regression + - ensemble/tests/test_bagging.py::test_single_estimator + - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsRegressor-1-100-chebyshev-100-100-10] + - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsRegressor-1-100-chebyshev-1000-5-100] + - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsRegressor-1-100-cityblock-100-100-10] + - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsRegressor-1-100-euclidean-100-100-10] + - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsRegressor-1-100-l1-100-100-10] + - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsRegressor-1-100-l2-100-100-10] + - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsRegressor-1-100-manhattan-100-100-10] + - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsRegressor-1-100-manhattan-1000-5-100] + - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsRegressor-1-100-minkowski-100-100-10] + - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsRegressor-50-500-chebyshev-100-100-10] + - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsRegressor-50-500-chebyshev-1000-5-100] + - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsRegressor-50-500-cityblock-100-100-10] + - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsRegressor-50-500-cityblock-1000-5-100] + - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsRegressor-50-500-euclidean-100-100-10] + - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsRegressor-50-500-euclidean-1000-5-100] + - 
neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsRegressor-50-500-l1-100-100-10] + - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsRegressor-50-500-l1-1000-5-100] + - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsRegressor-50-500-l2-100-100-10] + - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsRegressor-50-500-manhattan-100-100-10] + - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsRegressor-50-500-manhattan-1000-5-100] + - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsRegressor-50-500-minkowski-100-100-10] + - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsRegressor-50-500-minkowski-1000-5-100] + - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsRegressor-100-1000-chebyshev-100-100-10] + - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsRegressor-100-1000-chebyshev-1000-5-100] + - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsRegressor-100-1000-cityblock-100-100-10] + - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsRegressor-100-1000-cityblock-1000-5-100] + - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsRegressor-100-1000-euclidean-100-100-10] + - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsRegressor-100-1000-euclidean-1000-5-100] + - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsRegressor-100-1000-l1-100-100-10] + - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsRegressor-100-1000-l1-1000-5-100] + - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsRegressor-100-1000-l2-100-100-10] + - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsRegressor-100-1000-manhattan-100-100-10] + - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsRegressor-100-1000-manhattan-1000-5-100] + - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsRegressor-100-1000-minkowski-100-100-10] + - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsRegressor-100-1000-minkowski-1000-5-100] + - tests/test_common.py::test_check_n_features_in_after_fitting[KNeighborsRegressor()] + - tests/test_common.py::test_f_contiguous_array_estimator[KNeighborsRegressor] + - tests/test_common.py::test_estimators[KNeighborsRegressor()- + # KNeighborsClassifier + - ensemble/tests/test_bagging.py::test_oob_score_consistency + - ensemble/tests/test_bagging.py::test_max_samples_consistency + - ensemble/tests/test_stacking.py::test_stacking_classifier_multilabel_predict_proba[MLPClassifier] + - ensemble/tests/test_stacking.py::test_stacking_classifier_multilabel_predict_proba[RandomForestClassifier] + - ensemble/tests/test_stacking.py::test_stacking_classifier_multilabel_decision_function + - 
ensemble/tests/test_stacking.py::test_stacking_classifier_multilabel_auto_predict[False-auto] + - ensemble/tests/test_stacking.py::test_stacking_classifier_multilabel_auto_predict[False-predict] + - ensemble/tests/test_stacking.py::test_stacking_classifier_multilabel_auto_predict[True-auto] + - ensemble/tests/test_stacking.py::test_stacking_classifier_multilabel_auto_predict[True-predict] + - metrics/tests/test_score_objects.py::test_multimetric_scorer_calls_method_once_classifier_no_decision + - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsClassifier-1-100-chebyshev-100-100-10] + - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsClassifier-1-100-chebyshev-1000-5-100] + - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsClassifier-1-100-cityblock-100-100-10] + - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsClassifier-1-100-euclidean-100-100-10] + - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsClassifier-1-100-l1-100-100-10] + - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsClassifier-1-100-l2-100-100-10] + - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsClassifier-1-100-manhattan-100-100-10] + - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsClassifier-1-100-manhattan-1000-5-100] + - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsClassifier-1-100-minkowski-100-100-10] + - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsClassifier-50-500-chebyshev-100-100-10] + - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsClassifier-50-500-chebyshev-1000-5-100] + - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsClassifier-50-500-cityblock-100-100-10] + - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsClassifier-50-500-cityblock-1000-5-100] + - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsClassifier-50-500-euclidean-100-100-10] + - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsClassifier-50-500-euclidean-1000-5-100] + - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsClassifier-50-500-l1-100-100-10] + - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsClassifier-50-500-l1-1000-5-100] + - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsClassifier-50-500-l2-100-100-10] + - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsClassifier-50-500-manhattan-100-100-10] + - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsClassifier-50-500-manhattan-1000-5-100] + - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsClassifier-50-500-minkowski-100-100-10] + - 
neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsClassifier-50-500-minkowski-1000-5-100] + - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsClassifier-100-1000-chebyshev-100-100-10] + - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsClassifier-100-1000-chebyshev-1000-5-100] + - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsClassifier-100-1000-cityblock-100-100-10] + - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsClassifier-100-1000-cityblock-1000-5-100] + - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsClassifier-100-1000-euclidean-100-100-10] + - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsClassifier-100-1000-euclidean-1000-5-100] + - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsClassifier-100-1000-l1-100-100-10] + - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsClassifier-100-1000-l1-1000-5-100] + - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsClassifier-100-1000-l2-100-100-10] + - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsClassifier-100-1000-manhattan-100-100-10] + - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsClassifier-100-1000-manhattan-1000-5-100] + - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsClassifier-100-1000-minkowski-100-100-10] + - neighbors/tests/test_neighbors.py::test_neigh_predictions_algorithm_agnosticity[float64-KNeighborsClassifier-100-1000-minkowski-1000-5-100] + - tests/test_common.py::test_check_n_features_in_after_fitting[KNeighborsClassifier()] + - tests/test_common.py::test_f_contiguous_array_estimator[KNeighborsClassifier] + - tests/test_common.py::test_estimators[KNeighborsClassifier()- + - model_selection/tests/test_search.py::test_search_cv_pairwise_property_equivalence_of_precomputed + - model_selection/tests/test_validation.py::test_cross_val_score_multilabel + - neighbors/tests/test_neighbors.py::test_precomputed_cross_validation + # SVR + - ensemble/tests/test_bagging.py::test_sparse_regression + - tests/test_common.py::test_check_n_features_in_after_fitting[NuSVR()] + - tests/test_common.py::test_check_n_features_in_after_fitting[SVR()] + - tests/test_multiclass.py::test_ovr_single_label_predict_proba + - utils/tests/test_validation.py::test_check_is_fitted + - tests/test_common.py::test_estimators[NuSVR()- + - tests/test_common.py::test_estimators[SVR()- + # SVC + - ensemble/tests/test_bagging.py::test_oob_score_classification + - ensemble/tests/test_bagging.py::test_deprecated_base_estimator_has_decision_function + - ensemble/tests/test_stacking.py::test_stacking_classifier_error[y1-params1-ValueError-does + - feature_selection/tests/test_rfe + - metrics/tests/test_classification.py::test_classification_report_dictionary_output + - metrics/tests/test_classification.py::test_multilabel_confusion_matrix_multiclass + - metrics/tests/test_classification.py::test_precision_recall_f1_score_multiclass + - 
metrics/tests/test_classification.py::test_confusion_matrix_multiclass_subset_labels + - metrics/tests/test_classification.py::test_confusion_matrix_error[empty + - metrics/tests/test_classification.py::test_confusion_matrix_error[unknown + - metrics/tests/test_classification.py::test_classification_report_multiclass + - metrics/tests/test_classification.py::test_classification_report_multiclass_with_label_detection + - metrics/tests/test_classification.py::test_classification_report_multiclass_with_digits + - metrics/tests/test_classification.py::test_classification_report_multiclass_with_string_label + - metrics/tests/test_classification.py::test_classification_report_multiclass_with_unicode_label + - metrics/tests/test_classification.py::test_classification_report_multiclass_with_long_string_label + - model_selection/tests/test_validation.py::test_permutation_score + - svm/tests/test_sparse.py::test_unsorted_indices + - svm/tests/test_sparse.py::test_sparse_decision_function + - svm/tests/test_sparse.py::test_weight + - svm/tests/test_sparse.py::test_sparse_svc_clone_with_callable_kernel + - svm/tests/test_sparse.py::test_timeout + - tests/test_common.py::test_check_n_features_in_after_fitting[NuSVC()] + - tests/test_multiclass.py::test_pairwise_indices + - tests/test_multiclass.py::test_pairwise_n_features_in + - tests/test_pipeline.py::test_pipeline_memory + - tests/test_common.py::test_estimators[NuSVC()- + - tests/test_common.py::test_estimators[SVC()- + - model_selection/tests/test_search.py::test_grid_search_precomputed_kernel + - model_selection/tests/test_search.py::test_search_cv_results_rank_tie_breaking + - model_selection/tests/test_split.py::test_kfold_can_detect_dependent_samples_on_digits + - model_selection/tests/test_validation.py::test_cross_val_score_mask + - model_selection/tests/test_validation.py::test_cross_val_score_precomputed + - model_selection/tests/test_validation.py::test_cross_val_score_with_score_func_classification + - svm/tests/test_svm.py::test_unfitted + # part SVC, part KNeighborsClassifier + - semi_supervised/tests/test_self_training + # unsorted NearestNeighbors/KNClassifier/KNRegressor + - neighbors/tests/test_neighbors.py::test_unsupervised_inputs[float64-KNeighborsClassifier] + - neighbors/tests/test_neighbors.py::test_unsupervised_inputs[float64-KNeighborsRegressor] + - neighbors/tests/test_neighbors.py::test_unsupervised_inputs[float64-NearestNeighbors] + - neighbors/tests/test_neighbors.py::test_precomputed_dense + - neighbors/tests/test_neighbors.py::test_precomputed_sparse_knn[csr] + - neighbors/tests/test_neighbors.py::test_precomputed_sparse_knn[lil] + - neighbors/tests/test_neighbors.py::test_precomputed_sparse_radius[csr] + - neighbors/tests/test_neighbors.py::test_precomputed_sparse_radius[lil] + - neighbors/tests/test_neighbors.py::test_precomputed_sparse_invalid + - neighbors/tests/test_neighbors.py::test_unsupervised_radius_neighbors[float64] + - neighbors/tests/test_neighbors.py::test_neighbors_regressors_zero_distance + - neighbors/tests/test_neighbors.py::test_radius_neighbors_boundary_handling + - neighbors/tests/test_neighbors.py::test_radius_neighbors_returns_array_of_objects + - neighbors/tests/test_neighbors.py::test_query_equidistant_kth_nn[kd_tree] + - neighbors/tests/test_neighbors.py::test_radius_neighbors_sort_results[kd_tree-euclidean] + - neighbors/tests/test_neighbors.py::test_radius_neighbors_sort_results[brute-precomputed] + - neighbors/tests/test_neighbors.py::test_kneighbors_regressor + - 
neighbors/tests/test_neighbors.py::test_KNeighborsRegressor_multioutput_uniform_weight + - neighbors/tests/test_neighbors.py::test_kneighbors_regressor_multioutput + - neighbors/tests/test_neighbors.py::test_kneighbors_regressor_sparse + - neighbors/tests/test_neighbors.py::test_neighbors_validate_parameters[KNeighborsClassifier] + - neighbors/tests/test_neighbors.py::test_neighbors_validate_parameters[KNeighborsRegressor] + - neighbors/tests/test_neighbors.py::test_neighbors_minkowski_semimetric_algo_warn[auto-2-KNeighborsClassifier] + - neighbors/tests/test_neighbors.py::test_neighbors_minkowski_semimetric_algo_warn[auto-2-KNeighborsRegressor] + - neighbors/tests/test_neighbors.py::test_neighbors_minkowski_semimetric_algo_warn[auto-100-KNeighborsClassifier] + - neighbors/tests/test_neighbors.py::test_neighbors_minkowski_semimetric_algo_warn[auto-100-KNeighborsRegressor] + - neighbors/tests/test_neighbors.py::test_neighbors_minkowski_semimetric_algo_warn[brute-2-KNeighborsClassifier] + - neighbors/tests/test_neighbors.py::test_neighbors_minkowski_semimetric_algo_warn[brute-2-KNeighborsRegressor] + - neighbors/tests/test_neighbors.py::test_neighbors_minkowski_semimetric_algo_warn[brute-100-KNeighborsClassifier] + - neighbors/tests/test_neighbors.py::test_neighbors_minkowski_semimetric_algo_warn[brute-100-KNeighborsRegressor] + - neighbors/tests/test_neighbors.py::test_neighbors_minkowski_semimetric_algo_error[kd_tree-2-KNeighborsClassifier] + - neighbors/tests/test_neighbors.py::test_neighbors_minkowski_semimetric_algo_error[kd_tree-2-KNeighborsRegressor] + - neighbors/tests/test_neighbors.py::test_neighbors_minkowski_semimetric_algo_error[kd_tree-100-KNeighborsClassifier] + - neighbors/tests/test_neighbors.py::test_neighbors_minkowski_semimetric_algo_error[kd_tree-100-KNeighborsRegressor] + - neighbors/tests/test_neighbors.py::test_neighbors_minkowski_semimetric_algo_error[ball_tree-2-KNeighborsClassifier] + - neighbors/tests/test_neighbors.py::test_neighbors_minkowski_semimetric_algo_error[ball_tree-2-KNeighborsRegressor] + - neighbors/tests/test_neighbors.py::test_neighbors_minkowski_semimetric_algo_error[ball_tree-100-KNeighborsClassifier] + - neighbors/tests/test_neighbors.py::test_neighbors_minkowski_semimetric_algo_error[ball_tree-100-KNeighborsRegressor] + - neighbors/tests/test_neighbors.py::test_kneighbors_brute_backend[float64-braycurtis] + - neighbors/tests/test_neighbors.py::test_kneighbors_brute_backend[float64-canberra] + - neighbors/tests/test_neighbors.py::test_kneighbors_brute_backend[float64-correlation] + - neighbors/tests/test_neighbors.py::test_kneighbors_brute_backend[float64-dice] + - neighbors/tests/test_neighbors.py::test_kneighbors_brute_backend[float64-hamming] + - neighbors/tests/test_neighbors.py::test_kneighbors_brute_backend[float64-haversine] + - neighbors/tests/test_neighbors.py::test_kneighbors_brute_backend[float64-jaccard] + - neighbors/tests/test_neighbors.py::test_kneighbors_brute_backend[float64-mahalanobis] + - neighbors/tests/test_neighbors.py::test_kneighbors_brute_backend[float64-matching] + - neighbors/tests/test_neighbors.py::test_kneighbors_brute_backend[float64-nan_euclidean] + - neighbors/tests/test_neighbors.py::test_kneighbors_brute_backend[float64-rogerstanimoto] + - neighbors/tests/test_neighbors.py::test_kneighbors_brute_backend[float64-russellrao] + - neighbors/tests/test_neighbors.py::test_kneighbors_brute_backend[float64-seuclidean] + - neighbors/tests/test_neighbors.py::test_kneighbors_brute_backend[float64-sokalmichener] + - 
neighbors/tests/test_neighbors.py::test_kneighbors_brute_backend[float64-sokalsneath] + - neighbors/tests/test_neighbors.py::test_kneighbors_brute_backend[float64-sqeuclidean] + - neighbors/tests/test_neighbors.py::test_kneighbors_brute_backend[float64-yule] + - neighbors/tests/test_neighbors.py::test_callable_metric + - neighbors/tests/test_neighbors.py::test_valid_brute_metric_for_auto_algorithm[float64-braycurtis] + - neighbors/tests/test_neighbors.py::test_valid_brute_metric_for_auto_algorithm[float64-canberra] + - neighbors/tests/test_neighbors.py::test_valid_brute_metric_for_auto_algorithm[float64-chebyshev] + - neighbors/tests/test_neighbors.py::test_valid_brute_metric_for_auto_algorithm[float64-cityblock] + - neighbors/tests/test_neighbors.py::test_valid_brute_metric_for_auto_algorithm[float64-correlation] + - neighbors/tests/test_neighbors.py::test_valid_brute_metric_for_auto_algorithm[float64-cosine] + - neighbors/tests/test_neighbors.py::test_valid_brute_metric_for_auto_algorithm[float64-dice] + - neighbors/tests/test_neighbors.py::test_valid_brute_metric_for_auto_algorithm[float64-euclidean] + - neighbors/tests/test_neighbors.py::test_valid_brute_metric_for_auto_algorithm[float64-hamming] + - neighbors/tests/test_neighbors.py::test_valid_brute_metric_for_auto_algorithm[float64-haversine] + - neighbors/tests/test_neighbors.py::test_valid_brute_metric_for_auto_algorithm[float64-jaccard] + - neighbors/tests/test_neighbors.py::test_valid_brute_metric_for_auto_algorithm[float64-kulsinski] + - neighbors/tests/test_neighbors.py::test_valid_brute_metric_for_auto_algorithm[float64-l1] + - neighbors/tests/test_neighbors.py::test_valid_brute_metric_for_auto_algorithm[float64-l2] + - neighbors/tests/test_neighbors.py::test_valid_brute_metric_for_auto_algorithm[float64-mahalanobis] + - neighbors/tests/test_neighbors.py::test_valid_brute_metric_for_auto_algorithm[float64-manhattan] + - neighbors/tests/test_neighbors.py::test_valid_brute_metric_for_auto_algorithm[float64-matching] + - neighbors/tests/test_neighbors.py::test_valid_brute_metric_for_auto_algorithm[float64-minkowski] + - neighbors/tests/test_neighbors.py::test_valid_brute_metric_for_auto_algorithm[float64-nan_euclidean] + - neighbors/tests/test_neighbors.py::test_valid_brute_metric_for_auto_algorithm[float64-precomputed] + - neighbors/tests/test_neighbors.py::test_valid_brute_metric_for_auto_algorithm[float64-rogerstanimoto] + - neighbors/tests/test_neighbors.py::test_valid_brute_metric_for_auto_algorithm[float64-russellrao] + - neighbors/tests/test_neighbors.py::test_valid_brute_metric_for_auto_algorithm[float64-seuclidean] + - neighbors/tests/test_neighbors.py::test_valid_brute_metric_for_auto_algorithm[float64-sokalmichener] + - neighbors/tests/test_neighbors.py::test_valid_brute_metric_for_auto_algorithm[float64-sokalsneath] + - neighbors/tests/test_neighbors.py::test_valid_brute_metric_for_auto_algorithm[float64-sqeuclidean] + - neighbors/tests/test_neighbors.py::test_valid_brute_metric_for_auto_algorithm[float64-yule] + - neighbors/tests/test_neighbors.py::test_predict_sparse_ball_kd_tree + - neighbors/tests/test_neighbors.py::test_k_and_radius_neighbors_train_is_not_query + - neighbors/tests/test_neighbors.py::test_k_and_radius_neighbors_X_None[kd_tree] + - neighbors/tests/test_neighbors.py::test_k_and_radius_neighbors_duplicates[kd_tree] + - neighbors/tests/test_neighbors.py::test_same_knn_parallel[ball_tree] + - neighbors/tests/test_neighbors.py::test_same_knn_parallel[kd_tree] + - 
neighbors/tests/test_neighbors.py::test_same_knn_parallel[auto] + - neighbors/tests/test_neighbors.py::test_knn_forcing_backend[ball_tree-threading] + - neighbors/tests/test_neighbors.py::test_knn_forcing_backend[ball_tree-sequential] + - neighbors/tests/test_neighbors.py::test_knn_forcing_backend[ball_tree-multiprocessing] + - neighbors/tests/test_neighbors.py::test_knn_forcing_backend[ball_tree-loky] + - neighbors/tests/test_neighbors.py::test_knn_forcing_backend[ball_tree-testing] + - neighbors/tests/test_neighbors.py::test_knn_forcing_backend[kd_tree-threading] + - neighbors/tests/test_neighbors.py::test_knn_forcing_backend[kd_tree-sequential] + - neighbors/tests/test_neighbors.py::test_knn_forcing_backend[kd_tree-multiprocessing] + - neighbors/tests/test_neighbors.py::test_knn_forcing_backend[kd_tree-loky] + - neighbors/tests/test_neighbors.py::test_knn_forcing_backend[kd_tree-testing] + - neighbors/tests/test_neighbors.py::test_knn_forcing_backend[auto-threading] + - neighbors/tests/test_neighbors.py::test_knn_forcing_backend[auto-sequential] + - neighbors/tests/test_neighbors.py::test_knn_forcing_backend[auto-multiprocessing] + - neighbors/tests/test_neighbors.py::test_knn_forcing_backend[auto-loky] + - neighbors/tests/test_neighbors.py::test_knn_forcing_backend[auto-testing] + - neighbors/tests/test_neighbors.py::test_dtype_convert + - neighbors/tests/test_neighbors.py::test_sparse_metric_callable + - neighbors/tests/test_neighbors.py::test_pairwise_boolean_distance + - neighbors/tests/test_neighbors.py::test_pipeline_with_nearest_neighbors_transformer + - neighbors/tests/test_neighbors.py::test_auto_algorithm[X0-precomputed-None-brute] + - neighbors/tests/test_neighbors.py::test_auto_algorithm[X3-euclidean-None-kd_tree] + - neighbors/tests/test_neighbors.py::test_auto_algorithm[X4-seuclidean-metric_params4-ball_tree] + - neighbors/tests/test_neighbors.py::test_auto_algorithm[X5-correlation-None-brute] + - neighbors/tests/test_neighbors.py::test_radius_neighbors_brute_backend[braycurtis] + - neighbors/tests/test_neighbors.py::test_radius_neighbors_brute_backend[canberra] + - neighbors/tests/test_neighbors.py::test_radius_neighbors_brute_backend[correlation] + - neighbors/tests/test_neighbors.py::test_radius_neighbors_brute_backend[dice] + - neighbors/tests/test_neighbors.py::test_radius_neighbors_brute_backend[hamming] + - neighbors/tests/test_neighbors.py::test_radius_neighbors_brute_backend[haversine] + - neighbors/tests/test_neighbors.py::test_radius_neighbors_brute_backend[jaccard] + - neighbors/tests/test_neighbors.py::test_radius_neighbors_brute_backend[mahalanobis] + - neighbors/tests/test_neighbors.py::test_radius_neighbors_brute_backend[matching] + - neighbors/tests/test_neighbors.py::test_radius_neighbors_brute_backend[nan_euclidean] + - neighbors/tests/test_neighbors.py::test_radius_neighbors_brute_backend[rogerstanimoto] + - neighbors/tests/test_neighbors.py::test_radius_neighbors_brute_backend[russellrao] + - neighbors/tests/test_neighbors.py::test_radius_neighbors_brute_backend[seuclidean] + - neighbors/tests/test_neighbors.py::test_radius_neighbors_brute_backend[sokalmichener] + - neighbors/tests/test_neighbors.py::test_radius_neighbors_brute_backend[sokalsneath] + - neighbors/tests/test_neighbors.py::test_radius_neighbors_brute_backend[sqeuclidean] + - neighbors/tests/test_neighbors.py::test_radius_neighbors_brute_backend[yule] + - neighbors/tests/test_neighbors.py::test_regressor_predict_on_arraylikes + - 
neighbors/tests/test_neighbors_pipeline.py::test_lof_novelty_false + - neighbors/tests/test_neighbors_pipeline.py::test_lof_novelty_true + - neighbors/tests/test_neighbors_pipeline.py::test_kneighbors_regressor + # unsorted svm + - svm/tests/test_svm.py::test_libsvm_iris + - svm/tests/test_svm.py::test_svr + - svm/tests/test_svm.py::test_linearsvr + - svm/tests/test_svm.py::test_svr_errors + - svm/tests/test_svm.py::test_probability + - svm/tests/test_svm.py::test_decision_function + - svm/tests/test_svm.py::test_decision_function_shape[SVC] + - svm/tests/test_svm.py::test_decision_function_shape[NuSVC] + - svm/tests/test_svm.py::test_svr_predict + - svm/tests/test_svm.py::test_weight + - svm/tests/test_svm.py::test_svm_classifier_sided_sample_weight[estimator1] + - svm/tests/test_svm.py::test_svm_regressor_sided_sample_weight[estimator0] + - svm/tests/test_svm.py::test_svm_regressor_sided_sample_weight[estimator1] + - svm/tests/test_svm.py::test_negative_sample_weights_mask_all_samples[weights-are-zero-NuSVC] + - svm/tests/test_svm.py::test_negative_sample_weights_mask_all_samples[weights-are-zero-SVR] + - svm/tests/test_svm.py::test_negative_sample_weights_mask_all_samples[weights-are-zero-NuSVR] + - svm/tests/test_svm.py::test_negative_sample_weights_mask_all_samples[weights-are-negative-NuSVC] + - svm/tests/test_svm.py::test_negative_sample_weights_mask_all_samples[weights-are-negative-SVR] + - svm/tests/test_svm.py::test_negative_sample_weights_mask_all_samples[weights-are-negative-NuSVR] + - svm/tests/test_svm.py::test_negative_weights_svc_leave_just_one_label[mask-label-1-NuSVC] + - svm/tests/test_svm.py::test_negative_weights_svc_leave_just_one_label[mask-label-2-NuSVC] + - svm/tests/test_svm.py::test_negative_weights_svc_leave_two_labels[partial-mask-label-1-NuSVC] + - svm/tests/test_svm.py::test_negative_weights_svc_leave_two_labels[partial-mask-label-2-NuSVC] + - svm/tests/test_svm.py::test_negative_weight_equal_coeffs[partial-mask-label-1-NuSVC] + - svm/tests/test_svm.py::test_negative_weight_equal_coeffs[partial-mask-label-1-NuSVR] + - svm/tests/test_svm.py::test_negative_weight_equal_coeffs[partial-mask-label-2-NuSVC] + - svm/tests/test_svm.py::test_negative_weight_equal_coeffs[partial-mask-label-2-NuSVR] + - svm/tests/test_svm.py::test_auto_weight + - svm/tests/test_svm.py::test_bad_input + - svm/tests/test_svm.py::test_sparse_precomputed + - svm/tests/test_svm.py::test_sparse_fit_support_vectors_empty + - svm/tests/test_svm.py::test_immutable_coef_property + - svm/tests/test_svm.py::test_svc_bad_kernel + - svm/tests/test_svm.py::test_libsvm_convergence_warnings + - svm/tests/test_svm.py::test_svr_coef_sign + - svm/tests/test_svm.py::test_hasattr_predict_proba + - svm/tests/test_svm.py::test_decision_function_shape_two_class + - svm/tests/test_svm.py::test_ovr_decision_function + - svm/tests/test_svm.py::test_svc_invalid_break_ties_param[SVC] + - svm/tests/test_svm.py::test_svc_invalid_break_ties_param[NuSVC] + - svm/tests/test_svm.py::test_n_support[SVR] + - svm/tests/test_svm.py::test_n_support[NuSVR] + - svm/tests/test_svm.py::test_custom_kernel_not_array_input[SVC] + - svm/tests/test_svm.py::test_n_iter_libsvm[dataset0-NuSVC-ndarray] + - svm/tests/test_svm.py::test_n_iter_libsvm[dataset0-SVR-int] + - svm/tests/test_svm.py::test_n_iter_libsvm[dataset0-NuSVR-int] + - svm/tests/test_svm.py::test_n_iter_libsvm[dataset1-SVC-ndarray] + - svm/tests/test_svm.py::test_n_iter_libsvm[dataset1-NuSVC-ndarray] + - svm/tests/test_svm.py::test_n_iter_libsvm[dataset1-SVR-int] + - 
svm/tests/test_svm.py::test_n_iter_libsvm[dataset1-NuSVR-int] + - svm/tests/test_svm.py::test_n_iter_libsvm[dataset2-SVC-ndarray] + - svm/tests/test_svm.py::test_n_iter_libsvm[dataset2-NuSVC-ndarray] + - svm/tests/test_svm.py::test_n_iter_libsvm[dataset2-SVR-int] + - svm/tests/test_svm.py::test_n_iter_libsvm[dataset2-NuSVR-int] + - svm/tests/test_svm.py::test_svm_class_weights_deprecation[SVR] + - svm/tests/test_svm.py::test_svm_class_weights_deprecation[NuSVR] + # possible cause of timeout + - tests/test_common.py::test_estimators[CalibratedClassifierCV(estimator=LogisticRegression(C=1))- + - tests/test_common.py::test_estimators[LogisticRegression()- + - tests/test_common.py::test_estimators[LogisticRegressionCV()- + - tests/test_common.py::test_estimators[OneVsOneClassifier(estimator=LogisticRegression(C=1))- + - tests/test_common.py::test_estimators[OneVsRestClassifier(estimator=LogisticRegression(C=1))- + - tests/test_common.py::test_estimators[OutputCodeClassifier(estimator=LogisticRegression(C=1))- + - tests/test_common.py::test_estimators[RFE(estimator=LogisticRegression(C=1))- + - tests/test_common.py::test_estimators[RFECV(estimator=LogisticRegression(C=1))- + - tests/test_common.py::test_estimators[SelfTrainingClassifier(base_estimator=LogisticRegression(C=1))- + - tests/test_common.py::test_estimators[SequentialFeatureSelector(estimator=LogisticRegression(C=1))- diff --git a/examples/sklearnex/knn_bf_classification_dpnp_batch.py b/examples/sklearnex/knn_bf_classification_dpnp_batch.py new file mode 100644 index 0000000000..4c8a976400 --- /dev/null +++ b/examples/sklearnex/knn_bf_classification_dpnp_batch.py @@ -0,0 +1,58 @@ +# =============================================================================== +# Copyright 2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# =============================================================================== + +# sklearnex kNN example for GPU offloading with DPNP ndarray: +# python ./knn_bf_classification_dpnp_batch.py + +import dpctl +import dpnp +import numpy as np +from sklearn.datasets import make_classification +from sklearn.metrics import accuracy_score +from sklearn.model_selection import train_test_split + +from sklearnex.neighbors import KNeighborsClassifier + +X, y = make_classification( + n_samples=1000, + n_features=4, + n_informative=2, + n_redundant=0, + random_state=0, + shuffle=False, +) + +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42) + +# Make sure that all DPNP ndarrays use the same device. 
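+# A SYCL GPU device is assumed to be available here; dpctl.SyclQueue("gpu") raises an exception if no GPU device can be selected.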
+q = dpctl.SyclQueue("gpu")  # GPU + +dpnp_X_train = dpnp.asarray(X_train, usm_type="device", sycl_queue=q) +dpnp_y_train = dpnp.asarray(y_train, usm_type="device", sycl_queue=q) +dpnp_X_test = dpnp.asarray(X_test, usm_type="device", sycl_queue=q) + +knn_mdl = KNeighborsClassifier( + algorithm="brute", n_neighbors=20, weights="uniform", p=2, metric="minkowski" +) +knn_mdl.fit(dpnp_X_train, dpnp_y_train) + +y_predict = knn_mdl.predict(dpnp_X_test) + +print("Brute Force Distributed kNN classification results:") +print("Ground truth (first 5 observations):\n{}".format(y_test[:5])) +print("Classification results (first 5 observations):\n{}".format(y_predict[:5])) +print("Accuracy (2 classes): {}\n".format(accuracy_score(y_test, y_predict.asnumpy()))) +print("Are predicted results on GPU: {}".format(y_predict.sycl_device.is_gpu)) diff --git a/examples/sklearnex/random_forest_classifier_dpctl_batch.py b/examples/sklearnex/random_forest_classifier_dpctl_batch.py new file mode 100644 index 0000000000..0a5e0e8e09 --- /dev/null +++ b/examples/sklearnex/random_forest_classifier_dpctl_batch.py @@ -0,0 +1,53 @@ +# =============================================================================== +# Copyright 2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# =============================================================================== + +# sklearnex RF example for GPU offloading with DPCtl tensor: +# python ./random_forest_classifier_dpctl_batch.py + +import dpctl +import dpctl.tensor as dpt +import numpy as np +from sklearn.datasets import make_classification +from sklearn.model_selection import train_test_split + +from sklearnex.preview.ensemble import RandomForestClassifier + +# Make sure that all DPCtl tensors use the same device. 
+q = dpctl.SyclQueue("gpu") # GPU + +X, y = make_classification( + n_samples=1000, + n_features=4, + n_informative=2, + n_redundant=0, + random_state=0, + shuffle=False, +) + +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42) + +dpt_X_train = dpt.asarray(X_train, usm_type="device", sycl_queue=q) +dpt_y_train = dpt.asarray(y_train, usm_type="device", sycl_queue=q) +dpt_X_test = dpt.asarray(X_test, usm_type="device", sycl_queue=q) + +rf = RandomForestClassifier(max_depth=2, random_state=0).fit(dpt_X_train, dpt_y_train) + +pred = rf.predict(dpt_X_test) + +print("Random Forest classification results:") +print("Ground truth (first 5 observations):\n{}".format(y_test[:5])) +print("Classification results (first 5 observations):\n{}".format(pred[:5])) +print("Are predicted results on GPU: {}".format(pred.sycl_device.is_gpu)) diff --git a/examples/sklearnex/random_forest_regressor_dpnp_batch.py b/examples/sklearnex/random_forest_regressor_dpnp_batch.py new file mode 100644 index 0000000000..d741bdaf57 --- /dev/null +++ b/examples/sklearnex/random_forest_regressor_dpnp_batch.py @@ -0,0 +1,46 @@ +# =============================================================================== +# Copyright 2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# =============================================================================== + +# sklearnex RF example for GPU offloading with DPNP ndarray: +# python ./random_forest_regressor_dpnp_batch.py + +import dpnp +import numpy as np +from sklearn.datasets import make_regression +from sklearn.model_selection import train_test_split + +from sklearnex.preview.ensemble import RandomForestRegressor + +sycl_device = "gpu:0" + +X, y = make_regression( + n_samples=1000, n_features=4, n_informative=2, random_state=0, shuffle=False +) + +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42) + +dpnp_X_train = dpnp.asarray(X_train, device=sycl_device) +dpnp_y_train = dpnp.asarray(y_train, device=sycl_device) +dpnp_X_test = dpnp.asarray(X_test, device=sycl_device) + +rf = RandomForestRegressor(max_depth=2, random_state=0).fit(dpnp_X_train, dpnp_y_train) + +pred = rf.predict(dpnp_X_test) + +print("Random Forest regression results:") +print("Ground truth (first 5 observations):\n{}".format(y_test[:5])) +print("Regression results (first 5 observations):\n{}".format(pred[:5])) +print("Are predicted results on GPU: {}".format(pred.sycl_device.is_gpu)) diff --git a/onedal/__init__.py b/onedal/__init__.py index 3a425a8c0a..5704f42461 100644 --- a/onedal/__init__.py +++ b/onedal/__init__.py @@ -15,45 +15,52 @@ # =============================================================================== import platform + from daal4py.sklearn._utils import daal_check_version if "Windows" in platform.system(): import os - import sys import site + import sys + path_to_env = site.getsitepackages()[0] path_to_libs = os.path.join(path_to_env, "Library", "bin") if sys.version_info.minor >= 8: - if 'DALROOT' in os.environ: - dal_root_redist = os.path.join( - os.environ['DALROOT'], "redist", "intel64") + if "DALROOT" in os.environ: + dal_root_redist = os.path.join(os.environ["DALROOT"], "redist", "intel64") if os.path.exists(dal_root_redist): os.add_dll_directory(dal_root_redist) os.add_dll_directory(path_to_libs) - os.environ['PATH'] = path_to_libs + os.pathsep + os.environ['PATH'] + os.environ["PATH"] = path_to_libs + os.pathsep + os.environ["PATH"] try: import onedal._onedal_py_dpc as _backend + _is_dpc_backend = True except ImportError: import onedal._onedal_py_host as _backend + _is_dpc_backend = False -__all__ = ['decomposition', 'ensemble', 'neighbors', 'primitives', 'svm'] +__all__ = ["decomposition", "ensemble", "neighbors", "primitives", "svm"] if _is_dpc_backend: - __all__.append('spmd') + __all__.append("spmd") -if daal_check_version((2023, 'P', 100)): - __all__ += ['basic_statistics', 'linear_model'] +if daal_check_version((2023, "P", 100)): + __all__ += ["basic_statistics", "linear_model"] if _is_dpc_backend: - __all__ += ['spmd.basic_statistics', 'spmd.decomposition', - 'spmd.linear_model', 'spmd.neighbors'] + __all__ += [ + "spmd.basic_statistics", + "spmd.decomposition", + "spmd.linear_model", + "spmd.neighbors", + ] -if daal_check_version((2023, 'P', 200)): - __all__ += ['cluster'] +if daal_check_version((2023, "P", 200)): + __all__ += ["cluster"] if _is_dpc_backend: - __all__ += ['spmd.cluster'] + __all__ += ["spmd.cluster"] diff --git a/onedal/_device_offload.py b/onedal/_device_offload.py index 09cd48f681..6ff1990ebe 100644 --- a/onedal/_device_offload.py +++ b/onedal/_device_offload.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # 
Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,19 +12,22 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from functools import wraps try: - from sklearnex._device_offload import (_get_global_queue, - _transfer_to_host, - _copy_to_usm) + from sklearnex._device_offload import ( + _copy_to_usm, + _get_global_queue, + _transfer_to_host, + ) + _sklearnex_available = True except ImportError: import logging - logging.warning('Device support requires ' - 'Intel(R) Extension for Scikit-learn*.') + + logging.warning("Device support requires " "Intel(R) Extension for Scikit-learn*.") _sklearnex_available = False @@ -40,9 +43,7 @@ def _extract_usm_iface(*args, **kwargs): allargs = (*args, *kwargs.values()) if len(allargs) == 0: return None - return getattr(allargs[0], - '__sycl_usm_array_interface__', - None) + return getattr(allargs[0], "__sycl_usm_array_interface__", None) def _run_on_device(func, obj=None, *args, **kwargs): @@ -57,21 +58,25 @@ def wrapper_impl(obj, *args, **kwargs): if _sklearnex_available: usm_iface = _extract_usm_iface(*args, **kwargs) data_queue, hostargs, hostkwargs = _get_host_inputs(*args, **kwargs) - hostkwargs['queue'] = data_queue + hostkwargs["queue"] = data_queue result = _run_on_device(func, obj, *hostargs, **hostkwargs) - if usm_iface is not None and hasattr(result, '__array_interface__'): + if usm_iface is not None and hasattr(result, "__array_interface__"): return _copy_to_usm(data_queue, result) return result return _run_on_device(func, obj, *args, **kwargs) if freefunc: + @wraps(func) def wrapper_free(*args, **kwargs): return wrapper_impl(None, *args, **kwargs) + return wrapper_free @wraps(func) def wrapper_with_self(self, *args, **kwargs): return wrapper_impl(self, *args, **kwargs) + return wrapper_with_self + return decorator diff --git a/onedal/basic_statistics/__init__.py b/onedal/basic_statistics/__init__.py index 6f45ecfe5c..2b99fdbdb7 100644 --- a/onedal/basic_statistics/__init__.py +++ b/onedal/basic_statistics/__init__.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,8 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from .basic_statistics import BasicStatistics -__all__ = ['BasicStatistics'] +__all__ = ["BasicStatistics"] diff --git a/onedal/basic_statistics/basic_statistics.py b/onedal/basic_statistics/basic_statistics.py index 29ca8673dd..b048f9c02f 100644 --- a/onedal/basic_statistics/basic_statistics.py +++ b/onedal/basic_statistics/basic_statistics.py @@ -14,20 +14,17 @@ # limitations under the License. 
# =============================================================================== -from sklearn.base import BaseEstimator from abc import ABCMeta, abstractmethod - -import numpy as np from numbers import Number -from ..common._policy import _get_policy +import numpy as np +from sklearn.base import BaseEstimator -from ..datatypes._data_conversion import ( - from_table, - to_table, - _convert_to_supported) from onedal import _backend +from ..common._policy import _get_policy +from ..datatypes import _convert_to_supported, from_table, to_table + class BaseBasicStatistics(metaclass=ABCMeta): @abstractmethod @@ -37,10 +34,18 @@ def __init__(self, result_options, algorithm): @staticmethod def get_all_result_options(): - return ["min", "max", "sum", "mean", - "variance", "variation", "sum_squares", - "standard_deviation", "sum_squares_centered", - "second_order_raw_moment"] + return [ + "min", + "max", + "sum", + "mean", + "variance", + "variation", + "sum_squares", + "standard_deviation", + "sum_squares_centered", + "second_order_raw_moment", + ] def _get_policy(self, queue, *data): return _get_policy(queue, *data) @@ -56,8 +61,9 @@ def _get_result_options(self, options): def _get_onedal_params(self, dtype=np.float32): options = self._get_result_options(self.options) return { - 'fptype': 'float' if dtype == np.float32 else 'double', - 'method': self.algorithm, 'result_option': options, + "fptype": "float" if dtype == np.float32 else "double", + "method": self.algorithm, + "result_option": options, } def _compute_raw(self, data_table, weights_table, module, policy, dtype=np.float32): @@ -78,14 +84,12 @@ def _compute(self, data, weights, module, queue): if not (weights is None): weights = np.asarray(weights) - data, weights = _convert_to_supported( - policy, data, weights) + data, weights = _convert_to_supported(policy, data, weights) data_table, weights_table = to_table(data, weights) dtype = data.dtype - res = self._compute_raw(data_table, weights_table, - module, policy, dtype) + res = self._compute_raw(data_table, weights_table, module, policy, dtype) return {k: from_table(v).ravel() for k, v in res.items()} @@ -95,17 +99,13 @@ class BasicStatistics(BaseBasicStatistics): Basic Statistics oneDAL implementation. 
""" - def __init__( - self, - result_options="all", - *, - algorithm="by_default", - **kwargs): + def __init__(self, result_options="all", *, algorithm="by_default", **kwargs): super().__init__(result_options, algorithm) def compute(self, data, weights=None, queue=None): return super()._compute(data, weights, _backend.basic_statistics.compute, queue) def compute_raw(self, data_table, weights_table, policy, dtype=np.float32): - return super()._compute_raw(data_table, weights_table, - _backend.basic_statistics.compute, policy, dtype) + return super()._compute_raw( + data_table, weights_table, _backend.basic_statistics.compute, policy, dtype + ) diff --git a/onedal/basic_statistics/tests/test_basic_statistics.py b/onedal/basic_statistics/tests/test_basic_statistics.py index 0ca4670d49..01bd7b54ac 100644 --- a/onedal/basic_statistics/tests/test_basic_statistics.py +++ b/onedal/basic_statistics/tests/test_basic_statistics.py @@ -16,9 +16,9 @@ from daal4py.sklearn._utils import daal_check_version, sklearn_check_version -if daal_check_version((2023, 'P', 100)): - import pytest +if daal_check_version((2023, "P", 100)): import numpy as np + import pytest from numpy.testing import assert_allclose from onedal.basic_statistics import BasicStatistics @@ -29,18 +29,17 @@ ("min", np.min, (1e-5, 1e-7)), ("max", np.max, (1e-5, 1e-7)), ("mean", np.mean, (1e-5, 1e-7)), - ("standard_deviation", np.std, (3e-5, 3e-5)) + ("standard_deviation", np.std, (3e-5, 3e-5)), ] - @pytest.mark.parametrize('queue', get_queues()) - @pytest.mark.parametrize('dtype', [np.float32, np.float64]) + @pytest.mark.parametrize("queue", get_queues()) + @pytest.mark.parametrize("dtype", [np.float32, np.float64]) def test_basic_uniform(queue, dtype): seed = 42 s_count, f_count = 70000, 29 gen = np.random.default_rng(seed) - data = gen.uniform(low=-0.5, high=+0.6, - size=(s_count, f_count)) + data = gen.uniform(low=-0.5, high=+0.6, size=(s_count, f_count)) data = data.astype(dtype=dtype) alg = BasicStatistics(result_options="mean") @@ -51,9 +50,9 @@ def test_basic_uniform(queue, dtype): tol = 2e-5 if res_mean.dtype == np.float32 else 1e-7 assert_allclose(gtr_mean, res_mean, rtol=tol) - @pytest.mark.parametrize('queue', get_queues()) - @pytest.mark.parametrize('option', options_and_tests) - @pytest.mark.parametrize('dtype', [np.float32, np.float64]) + @pytest.mark.parametrize("queue", get_queues()) + @pytest.mark.parametrize("option", options_and_tests) + @pytest.mark.parametrize("dtype", [np.float32, np.float64]) def test_option_uniform(queue, option, dtype): seed = 77 s_count, f_count = 19999, 31 @@ -62,8 +61,7 @@ def test_option_uniform(queue, option, dtype): fp32tol, fp64tol = tols gen = np.random.default_rng(seed) - data = gen.uniform(low=-0.3, high=+0.7, - size=(s_count, f_count)) + data = gen.uniform(low=-0.3, high=+0.7, size=(s_count, f_count)) data = data.astype(dtype=dtype) alg = BasicStatistics(result_options=result_option) @@ -74,9 +72,9 @@ def test_option_uniform(queue, option, dtype): tol = fp32tol if res.dtype == np.float32 else fp64tol assert_allclose(gtr, res, rtol=tol) - @pytest.mark.parametrize('queue', get_queues()) - @pytest.mark.parametrize('option', options_and_tests) - @pytest.mark.parametrize('dtype', [np.float32, np.float64]) + @pytest.mark.parametrize("queue", get_queues()) + @pytest.mark.parametrize("option", options_and_tests) + @pytest.mark.parametrize("dtype", [np.float32, np.float64]) def test_option_weighted(queue, option, dtype): seed = 999 s_count, f_count = 1024, 127 @@ -86,10 +84,8 @@ def 
test_option_weighted(queue, option, dtype): fp32tol, fp64tol = 30 * fp32tol, 50 * fp64tol gen = np.random.default_rng(seed) - data = gen.uniform(low=-5.0, high=+9.0, - size=(s_count, f_count)) - weights = gen.uniform(low=-0.5, high=+1.0, - size=s_count) + data = gen.uniform(low=-5.0, high=+9.0, size=(s_count, f_count)) + weights = gen.uniform(low=-0.5, high=+1.0, size=s_count) data = data.astype(dtype=dtype) weights = weights.astype(dtype=dtype) diff --git a/onedal/cluster/__init__.py b/onedal/cluster/__init__.py index 609f7670b3..d8e38f9632 100644 --- a/onedal/cluster/__init__.py +++ b/onedal/cluster/__init__.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,14 +12,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from daal4py.sklearn._utils import daal_check_version from .kmeans import KMeans, k_means -__all__ = ['KMeans', 'k_means'] +__all__ = ["KMeans", "k_means"] -if daal_check_version((2023, 'P', 200)): +if daal_check_version((2023, "P", 200)): from .kmeans_init import KMeansInit, kmeans_plusplus - __all__ += ['KMeansInit', 'kmeans_plusplus'] + + __all__ += ["KMeansInit", "kmeans_plusplus"] diff --git a/onedal/cluster/kmeans.py b/onedal/cluster/kmeans.py index 0bbf04a1c2..1519d88e23 100644 --- a/onedal/cluster/kmeans.py +++ b/onedal/cluster/kmeans.py @@ -15,53 +15,45 @@ # =============================================================================== import warnings +from abc import ABC import numpy as np +from daal4py.sklearn._utils import daal_check_version, get_dtype from onedal import _backend -from abc import ABC - -from daal4py.sklearn._utils import get_dtype -from daal4py.sklearn._utils import daal_check_version +from ..datatypes import _convert_to_supported, from_table, to_table -from ..datatypes import _convert_to_supported - -if daal_check_version((2023, 'P', 200)): +if daal_check_version((2023, "P", 200)): from .kmeans_init import KMeansInit else: from sklearn.cluster import _kmeans_plusplus -from onedal.basic_statistics import BasicStatistics - -from ..common._policy import _get_policy -from ..datatypes.validation import _is_arraylike_not_scalar -from ..datatypes._data_conversion import from_table, to_table - +from sklearn.base import BaseEstimator, ClusterMixin, TransformerMixin from sklearn.exceptions import ConvergenceWarning +from sklearn.metrics.pairwise import euclidean_distances +from sklearn.utils import check_array, check_random_state from sklearn.utils.validation import check_is_fitted -from sklearn.utils import check_random_state, check_array -from sklearn.base import ( - BaseEstimator, - ClusterMixin, - TransformerMixin) +from onedal.basic_statistics import BasicStatistics -from sklearn.metrics.pairwise import euclidean_distances +from ..common._policy import _get_policy +from ..utils import _is_arraylike_not_scalar class _BaseKMeans(TransformerMixin, ClusterMixin, BaseEstimator, ABC): def __init__( - self, - n_clusters, - *, - init, - n_init, - max_iter, - tol, - verbose, - random_state, - n_local_trials=None): + self, + 
n_clusters, + *, + init, + n_init, + max_iter, + tol, + verbose, + random_state, + n_local_trials=None, + ): self.n_clusters = n_clusters self.init = init self.max_iter = max_iter @@ -96,11 +88,8 @@ def _tolerance(self, rtol, X_table, policy, dtype=np.float32): return mean_var * rtol def _check_params_vs_input( - self, - X_table, - policy, - default_n_init=10, - dtype=np.float32): + self, X_table, policy, default_n_init=10, dtype=np.float32 + ): # n_clusters if X_table.shape[0] < self.n_clusters: raise ValueError( @@ -153,11 +142,12 @@ def _get_policy(self, queue, *data): def _get_onedal_params(self, dtype=np.float32): thr = self._tol if hasattr(self, "_tol") else self.tol return { - 'fptype': 'float' if dtype == np.float32 else 'double', - 'method': 'by_default', 'seed': -1, - 'max_iteration_count': self.max_iter, - 'cluster_count': self.n_clusters, - 'accuracy_threshold': thr, + "fptype": "float" if dtype == np.float32 else "double", + "method": "by_default", + "seed": -1, + "max_iteration_count": self.max_iter, + "cluster_count": self.n_clusters, + "accuracy_threshold": thr, } def _get_params_and_input(self, X, policy): @@ -178,26 +168,19 @@ def _get_params_and_input(self, X, policy): return (params, X_table, dtype) def _init_centroids_custom( - self, - X_table, - init, - random_seed, - policy, - dtype=np.float32, - n_centroids=None): + self, X_table, init, random_seed, policy, dtype=np.float32, n_centroids=None + ): n_clusters = self.n_clusters if n_centroids is None else n_centroids if isinstance(init, str) and init == "k-means++": alg = KMeansInit( - cluster_count=n_clusters, - seed=random_seed, - algorithm="plus_plus_dense") + cluster_count=n_clusters, seed=random_seed, algorithm="plus_plus_dense" + ) centers_table = alg.compute_raw(X_table, policy, dtype) elif isinstance(init, str) and init == "random": alg = KMeansInit( - cluster_count=n_clusters, - seed=random_seed, - algorithm="random_dense") + cluster_count=n_clusters, seed=random_seed, algorithm="random_dense" + ) centers_table = alg.compute_raw(X_table, policy, dtype) elif _is_arraylike_not_scalar(init): centers = np.asarray(init) @@ -220,11 +203,7 @@ def _init_centroids_generic(self, X, init, random_state, policy, dtype=np.float3 random_state=random_state, ) elif isinstance(init, str) and init == "random": - seeds = random_state.choice( - n_samples, - size=self.n_clusters, - replace=False - ) + seeds = random_state.choice(n_samples, size=self.n_clusters, replace=False) centers = X[seeds] elif callable(init): cc_arr = init(X, self.n_clusters, random_state) @@ -236,7 +215,8 @@ def _init_centroids_generic(self, X, init, random_state, policy, dtype=np.float3 else: raise ValueError( f"init should be either 'k-means++', 'random', a ndarray or a " - f"callable, got '{ init }' instead.") + f"callable, got '{ init }' instead." 
+ ) centers = _convert_to_supported(policy, centers) return to_table(centers) @@ -250,8 +230,12 @@ def _fit_backend(self, X_table, centroids_table, module, policy, dtype=np.float3 result = module.train(policy, params, X_table, centroids_table) - return (result.responses, result.objective_function_value, - result.model, result.iteration_count) + return ( + result.responses, + result.objective_function_value, + result.model, + result.iteration_count, + ) def _fit(self, X, module, queue=None): policy = self._get_policy(queue, X) @@ -269,7 +253,8 @@ def is_better_iteration(inertia, labels): mod = _backend.kmeans_common better_inertia = inertia < best_inertia same_clusters = mod._is_same_clustering( - labels, best_labels, self.n_clusters) + labels, best_labels, self.n_clusters + ) return better_inertia and not same_clusters random_state = check_random_state(self.random_state) @@ -280,12 +265,12 @@ def is_better_iteration(inertia, labels): init = check_array(init, dtype=dtype, copy=True, order="C") self._validate_center_shape(X, init) - use_custom_init = daal_check_version((2023, 'P', 200)) and not callable(self.init) + use_custom_init = daal_check_version((2023, "P", 200)) and not callable(self.init) for _ in range(self._n_init): if use_custom_init: - #random_seed = random_state.tomaxint() - random_seed = random_state.randint(np.iinfo('i').max) + # random_seed = random_state.tomaxint() + random_seed = random_state.randint(np.iinfo("i").max) centroids_table = self._init_centroids_custom( X_table, init, random_seed, policy, dtype=dtype ) @@ -302,9 +287,7 @@ def is_better_iteration(inertia, labels): ) if self.verbose: - print("KMeans iteration completed with " - "inertia {}.".format(inertia) - ) + print("KMeans iteration completed with " "inertia {}.".format(inertia)) if is_better_iteration(inertia, labels): best_model, best_n_iter = model, n_iter @@ -496,7 +479,7 @@ def k_means( copy_x=True, algorithm="lloyd", return_n_iter=False, - queue=None + queue=None, ): est = KMeans( n_clusters=n_clusters, diff --git a/onedal/cluster/kmeans_init.py b/onedal/cluster/kmeans_init.py index 9cf31b5b02..3f5a2b65b5 100755 --- a/onedal/cluster/kmeans_init.py +++ b/onedal/cluster/kmeans_init.py @@ -15,30 +15,28 @@ # =============================================================================== import numpy as np +from sklearn.utils import check_random_state +from daal4py.sklearn._utils import daal_check_version, get_dtype from onedal import _backend -from daal4py.sklearn._utils import get_dtype -from ..datatypes import _convert_to_supported - from ..common._policy import _get_policy -from ..datatypes._data_conversion import from_table, to_table - -from sklearn.utils import check_random_state +from ..datatypes import _convert_to_supported, from_table, to_table -from daal4py.sklearn._utils import daal_check_version +if daal_check_version((2023, "P", 200)): -if daal_check_version((2023, 'P', 200)): class KMeansInit: """ KMeansInit oneDAL implementation. 
""" - def __init__(self, - cluster_count, - seed=777, - local_trials_count=None, - algorithm='plus_plus_dense'): + def __init__( + self, + cluster_count, + seed=777, + local_trials_count=None, + algorithm="plus_plus_dense", + ): self.cluster_count = cluster_count self.seed = seed self.local_trials_count = local_trials_count @@ -54,10 +52,11 @@ def _get_policy(self, queue, *data): def _get_onedal_params(self, dtype=np.float32): return { - 'fptype': 'float' if dtype == np.float32 else 'double', - 'local_trials_count': self.local_trials_count, - 'method': self.algorithm, 'seed': self.seed, - 'cluster_count': self.cluster_count, + "fptype": "float" if dtype == np.float32 else "double", + "local_trials_count": self.local_trials_count, + "method": self.algorithm, + "seed": self.seed, + "cluster_count": self.cluster_count, } def _get_params_and_input(self, X, policy): @@ -94,16 +93,18 @@ def compute(self, X, queue=None): return self._compute(X, _backend.kmeans_init.init, queue) def kmeans_plusplus( - X, - n_clusters, - *, - x_squared_norms=None, - random_state=None, - n_local_trials=None, - queue=None): + X, + n_clusters, + *, + x_squared_norms=None, + random_state=None, + n_local_trials=None, + queue=None, + ): random_seed = check_random_state(random_state).tomaxint() return ( KMeansInit( - n_clusters, seed=random_seed, local_trials_count=n_local_trials).compute( - X, queue), np.full( - n_clusters, -1)) + n_clusters, seed=random_seed, local_trials_count=n_local_trials + ).compute(X, queue), + np.full(n_clusters, -1), + ) diff --git a/onedal/cluster/tests/test_kmeans.py b/onedal/cluster/tests/test_kmeans.py index 61a40962a0..ac3f305353 100644 --- a/onedal/cluster/tests/test_kmeans.py +++ b/onedal/cluster/tests/test_kmeans.py @@ -14,21 +14,20 @@ # limitations under the License. 
# =============================================================================== -import pytest import numpy as np - +import pytest from numpy.testing import assert_array_equal + from daal4py.sklearn._utils import daal_check_version -if daal_check_version((2023, 'P', 200)): +if daal_check_version((2023, "P", 200)): + from sklearn.cluster import kmeans_plusplus as init_external + from sklearn.neighbors import NearestNeighbors + from onedal.cluster import KMeans from onedal.cluster import kmeans_plusplus as init_internal from onedal.tests.utils._device_selection import get_queues - from sklearn.cluster import kmeans_plusplus as init_external - - from sklearn.neighbors import NearestNeighbors - def generate_dataset(n_dim, n_cluster, n_points=None, seed=777, dtype=np.float32): # We need some reference value of points for each cluster n_points = (n_dim * n_cluster) if n_points is None else n_points @@ -46,7 +45,7 @@ def generate_dataset(n_dim, n_cluster, n_points=None, seed=777, dtype=np.float32 # Generating dataset def gen_one(c): - params = {'loc': cs[c, :], 'scale': vs[c], 'size': (n_points, n_dim)} + params = {"loc": cs[c, :], "scale": vs[c], "size": (n_points, n_dim)} return gen.normal(**params) data = [gen_one(c) for c in range(n_cluster)] @@ -57,23 +56,23 @@ def gen_one(c): return (cs, vs, data) - @pytest.mark.parametrize('queue', get_queues()) - @pytest.mark.parametrize('dtype', [np.float32, np.float64]) - @pytest.mark.parametrize('n_dim', [3, 4, 17, 24]) - @pytest.mark.parametrize('n_cluster', [9, 11, 32]) - @pytest.mark.parametrize('pipeline', ['implicit', 'external', 'internal']) + @pytest.mark.parametrize("queue", get_queues()) + @pytest.mark.parametrize("dtype", [np.float32, np.float64]) + @pytest.mark.parametrize("n_dim", [3, 4, 17, 24]) + @pytest.mark.parametrize("n_cluster", [9, 11, 32]) + @pytest.mark.parametrize("pipeline", ["implicit", "external", "internal"]) def test_generated_dataset(queue, dtype, n_dim, n_cluster, pipeline): seed = 777 * n_dim * n_cluster cs, vs, X = generate_dataset(n_dim, n_cluster, seed=seed, dtype=dtype) - if pipeline == 'external': + if pipeline == "external": init_data, _ = init_external(X, n_cluster) m = KMeans(n_cluster, init=init_data, max_iter=5) - elif pipeline == 'internal': + elif pipeline == "internal": init_data, _ = init_internal(X, n_cluster, queue=queue) m = KMeans(n_cluster, init=init_data, max_iter=5) else: - m = KMeans(n_cluster, init='k-means++', max_iter=5) + m = KMeans(n_cluster, init="k-means++", max_iter=5) m.fit(X, queue=queue) diff --git a/onedal/cluster/tests/test_kmeans_init.py b/onedal/cluster/tests/test_kmeans_init.py index 6d92ab9c44..932918aa53 100755 --- a/onedal/cluster/tests/test_kmeans_init.py +++ b/onedal/cluster/tests/test_kmeans_init.py @@ -14,22 +14,22 @@ # limitations under the License. 
# =============================================================================== -import pytest import numpy as np - +import pytest from numpy.testing import assert_array_equal -from daal4py.sklearn._utils import daal_check_version -if daal_check_version((2023, 'P', 200)): - from onedal.cluster import kmeans_plusplus, KMeans - from onedal.tests.utils._device_selection import get_queues +from daal4py.sklearn._utils import daal_check_version +if daal_check_version((2023, "P", 200)): from sklearn.datasets import load_breast_cancer from sklearn.metrics import davies_bouldin_score - @pytest.mark.parametrize('queue', get_queues()) - @pytest.mark.parametrize('dtype', [np.float32, np.float64]) - @pytest.mark.parametrize('n_cluster', [2, 5, 11, 128]) + from onedal.cluster import KMeans, kmeans_plusplus + from onedal.tests.utils._device_selection import get_queues + + @pytest.mark.parametrize("queue", get_queues()) + @pytest.mark.parametrize("dtype", [np.float32, np.float64]) + @pytest.mark.parametrize("n_cluster", [2, 5, 11, 128]) def test_breast_cancer(queue, dtype, n_cluster): X, _ = load_breast_cancer(return_X_y=True) X = np.asarray(X).astype(dtype=dtype) @@ -58,7 +58,7 @@ def generate_dataset(n_dim, n_cluster, n_points=None, seed=777, dtype=np.float32 # Generating dataset def gen_one(c): - params = {'loc': cs[c, :], 'scale': vs[c], 'size': (n_points, n_dim)} + params = {"loc": cs[c, :], "scale": vs[c], "size": (n_points, n_dim)} return gen.normal(**params) data = [gen_one(c) for c in range(n_cluster)] @@ -69,10 +69,10 @@ def gen_one(c): return (cs, vs, data) - @pytest.mark.parametrize('queue', get_queues()) - @pytest.mark.parametrize('dtype', [np.float32, np.float64]) - @pytest.mark.parametrize('n_dim', [3, 12, 17]) - @pytest.mark.parametrize('n_cluster', [2, 15, 61]) + @pytest.mark.parametrize("queue", get_queues()) + @pytest.mark.parametrize("dtype", [np.float32, np.float64]) + @pytest.mark.parametrize("n_dim", [3, 12, 17]) + @pytest.mark.parametrize("n_cluster", [2, 15, 61]) def test_generated_dataset(queue, dtype, n_dim, n_cluster): seed = 777 * n_dim * n_cluster cs, vs, X = generate_dataset(n_dim, n_cluster, seed=seed, dtype=dtype) diff --git a/onedal/common/_estimator_checks.py b/onedal/common/_estimator_checks.py index 034b724b65..e42efea8e4 100755 --- a/onedal/common/_estimator_checks.py +++ b/onedal/common/_estimator_checks.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2022 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,15 +12,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== def _check_is_fitted(estimator, attributes=None, *, msg=None): if msg is None: - msg = ("This %(name)s instance is not fitted yet. Call 'fit' with " - "appropriate arguments before using this estimator.") + msg = ( + "This %(name)s instance is not fitted yet. Call 'fit' with " + "appropriate arguments before using this estimator." + ) - if not hasattr(estimator, 'fit'): + if not hasattr(estimator, "fit"): raise TypeError("%s is not an estimator instance." 
% (estimator)) if attributes is not None: @@ -28,11 +30,10 @@ def _check_is_fitted(estimator, attributes=None, *, msg=None): attributes = [attributes] attrs = all([hasattr(estimator, attr) for attr in attributes]) else: - attrs = [v for v in vars(estimator) - if v.endswith("_") and not v.startswith("__")] + attrs = [v for v in vars(estimator) if v.endswith("_") and not v.startswith("__")] if not attrs: - raise AttributeError(msg % {'name': type(estimator).__name__}) + raise AttributeError(msg % {"name": type(estimator).__name__}) def _is_classifier(estimator): diff --git a/onedal/common/_mixin.py b/onedal/common/_mixin.py index 9b1adeb819..94efb1daf6 100644 --- a/onedal/common/_mixin.py +++ b/onedal/common/_mixin.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,15 +12,18 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== + class ClassifierMixin: _estimator_type = "classifier" def score(self, X, y, sample_weight=None, queue=None): from sklearn.metrics import accuracy_score - return accuracy_score(y, self.predict(X, queue=queue), - sample_weight=sample_weight) + + return accuracy_score( + y, self.predict(X, queue=queue), sample_weight=sample_weight + ) def _more_tags(self): return {"requires_y": True} @@ -31,6 +34,7 @@ class RegressorMixin: def score(self, X, y, sample_weight=None, queue=None): from sklearn.metrics import r2_score + return r2_score(y, self.predict(X, queue=queue), sample_weight=sample_weight) def _more_tags(self): diff --git a/onedal/common/_policy.py b/onedal/common/_policy.py index 49dc4863a5..3de7f769d7 100644 --- a/onedal/common/_policy.py +++ b/onedal/common/_policy.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,12 +12,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== -from onedal import _backend, _is_dpc_backend import sys -oneapi_is_available = 'daal4py.oneapi' in sys.modules +from onedal import _backend, _is_dpc_backend + +oneapi_is_available = "daal4py.oneapi" in sys.modules if oneapi_is_available: from daal4py.oneapi import _get_sycl_ctxt, sycl_execution_context @@ -35,9 +36,9 @@ def _get_policy(queue, *data): def _get_queue(*data): - if len(data) > 0 and hasattr(data[0], '__sycl_usm_array_interface__'): + if len(data) > 0 and hasattr(data[0], "__sycl_usm_array_interface__"): # Assume that all data reside on the same device - return data[0].__sycl_usm_array_interface__['syclobj'] + return data[0].__sycl_usm_array_interface__["syclobj"] return None @@ -47,7 +48,7 @@ def __init__(self): self._host_context = None if oneapi_is_available: self._d4p_context = _get_sycl_ctxt() - self._host_context = sycl_execution_context('cpu') + self._host_context = sycl_execution_context("cpu") self._host_context.apply() def __del__(self): @@ -62,12 +63,14 @@ def __init__(self): if _is_dpc_backend: + class _DataParallelInteropPolicy(_backend.data_parallel_policy): def __init__(self, queue): self._queue = queue self._d4p_interop = _Daal4PyContextReset() - if 'sklearnex' in sys.modules: + if "sklearnex" in sys.modules: from sklearnex._device_offload import DummySyclQueue + if isinstance(queue, DummySyclQueue): super().__init__(self._queue.sycl_device.get_filter_string()) return diff --git a/onedal/common/_spmd_policy.py b/onedal/common/_spmd_policy.py index f829b04363..daea02fc63 100644 --- a/onedal/common/_spmd_policy.py +++ b/onedal/common/_spmd_policy.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,12 +12,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== -from onedal import _backend, _is_dpc_backend import sys +from onedal import _backend, _is_dpc_backend + if _is_dpc_backend: + class _SPMDDataParallelInteropPolicy(_backend.spmd_data_parallel_policy): def __init__(self, queue): self._queue = queue diff --git a/onedal/common/tests/test_policy.py b/onedal/common/tests/test_policy.py index 63783c550d..05350051ca 100644 --- a/onedal/common/tests/test_policy.py +++ b/onedal/common/tests/test_policy.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,23 +12,27 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== -import pytest import numpy as np +import pytest from onedal.common._policy import _get_policy from onedal.tests.utils._device_selection import ( - get_queues, get_memory_usm, is_dpctl_available, device_type_to_str) + device_type_to_str, + get_memory_usm, + get_queues, + is_dpctl_available, +) -@pytest.mark.parametrize('queue', get_queues()) +@pytest.mark.parametrize("queue", get_queues()) def test_queue_passed_directly(queue): device_name = device_type_to_str(queue) assert _get_policy(queue).get_device_name() == device_name -@pytest.mark.parametrize('queue', get_queues()) +@pytest.mark.parametrize("queue", get_queues()) def test_with_numpy_data(queue): X = np.zeros((5, 3)) y = np.zeros(3) @@ -37,27 +41,28 @@ def test_with_numpy_data(queue): assert _get_policy(queue, X, y).get_device_name() == device_name -@pytest.mark.skipif(not is_dpctl_available(), reason='depends on dpctl') -@pytest.mark.parametrize('queue', get_queues('cpu,gpu')) -@pytest.mark.parametrize('memtype', get_memory_usm()) +@pytest.mark.skipif(not is_dpctl_available(), reason="depends on dpctl") +@pytest.mark.parametrize("queue", get_queues("cpu,gpu")) +@pytest.mark.parametrize("memtype", get_memory_usm()) def test_with_usm_ndarray_data(queue, memtype): from dpctl.tensor import usm_ndarray device_name = device_type_to_str(queue) X = usm_ndarray((5, 3), buffer=memtype(5 * 3 * 8, queue=queue)) - y = usm_ndarray((3, ), buffer=memtype(3 * 8, queue=queue)) + y = usm_ndarray((3,), buffer=memtype(3 * 8, queue=queue)) assert _get_policy(None, X, y).get_device_name() == device_name -@pytest.mark.skipif(not is_dpctl_available(['cpu', 'gpu']), - reason='test uses multiple devices') -@pytest.mark.parametrize('memtype', get_memory_usm()) +@pytest.mark.skipif( + not is_dpctl_available(["cpu", "gpu"]), reason="test uses multiple devices" +) +@pytest.mark.parametrize("memtype", get_memory_usm()) def test_queue_parameter_with_usm_ndarray(memtype): from dpctl import SyclQueue from dpctl.tensor import usm_ndarray - q1 = SyclQueue('cpu') - q2 = SyclQueue('gpu') + q1 = SyclQueue("cpu") + q2 = SyclQueue("gpu") X = usm_ndarray((5, 3), buffer=memtype(5 * 3 * 8, queue=q1)) assert _get_policy(q2, X).get_device_name() == device_type_to_str(q2) diff --git a/onedal/datatypes/__init__.py b/onedal/datatypes/__init__.py index 3f98655726..470fda902e 100644 --- a/onedal/datatypes/__init__.py +++ b/onedal/datatypes/__init__.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,29 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== -from .validation import ( - _column_or_1d, - _validate_targets, - _check_X_y, - _check_array, - _check_classification_targets, - _type_of_target, - _is_integral_float, - _is_multilabel, - _check_n_features, - _num_features, - _num_samples, - _is_arraylike, - _is_arraylike_not_scalar -) +from ._data_conversion import _convert_to_supported, from_table, to_table -from ._data_conversion import _convert_to_supported - -__all__ = ['_column_or_1d', '_validate_targets', '_check_X_y', - '_check_array', '_check_classification_targets', - '_type_of_target', '_is_integral_float', - '_is_multilabel', '_check_n_features', '_num_features', - '_num_samples', '_convert_to_supported', - '_is_arraylike', '_is_arraylike_not_scalar'] +__all__ = ["from_table", "to_table", "_convert_to_supported"] diff --git a/onedal/datatypes/_data_conversion.py b/onedal/datatypes/_data_conversion.py index 4e3b0f9cc4..ec5ffc5c08 100644 --- a/onedal/datatypes/_data_conversion.py +++ b/onedal/datatypes/_data_conversion.py @@ -14,17 +14,18 @@ # limitations under the License. # =============================================================================== -import numpy as np import warnings -from onedal import _is_dpc_backend -from onedal import _backend +import numpy as np + from daal4py.sklearn._utils import make2d +from onedal import _backend, _is_dpc_backend try: import dpctl import dpctl.tensor as dpt - dpctl_available = dpctl.__version__ >= '0.14' + + dpctl_available = dpctl.__version__ >= "0.14" except ImportError: dpctl_available = False @@ -67,9 +68,11 @@ def func(x): def convert_or_pass(x): if (x is not None) and (x.dtype == np.float64): - warnings.warn("Data will be converted into float32 from " - "float64 because device does not support it", - RuntimeWarning, ) + warnings.warn( + "Data will be converted into float32 from " + "float64 because device does not support it", + RuntimeWarning, + ) return x.astype(np.float32) else: return x @@ -80,6 +83,7 @@ def convert_or_pass(x): return _apply_and_pass(func, *data) else: + def _convert_to_supported(policy, *data): def func(x): return x diff --git a/onedal/datatypes/tests/test_data.py b/onedal/datatypes/tests/test_data.py index 09ef20b77b..9a91e017e3 100644 --- a/onedal/datatypes/tests/test_data.py +++ b/onedal/datatypes/tests/test_data.py @@ -14,19 +14,19 @@ # limitations under the License. 
# =============================================================================== -import pytest import numpy as np +import pytest from numpy.testing import assert_allclose -from onedal.primitives import linear_kernel from onedal import _backend - +from onedal.primitives import linear_kernel from onedal.tests.utils._device_selection import get_queues try: import dpctl import dpctl.tensor as dpt - dpctl_available = dpctl.__version__ >= '0.14' + + dpctl_available = dpctl.__version__ >= "0.14" except ImportError: dpctl_available = False @@ -35,7 +35,7 @@ def _test_input_format_c_contiguous_numpy(queue, dtype): rng = np.random.RandomState(0) x_default = np.array(5 * rng.random_sample((10, 4)), dtype=dtype) - x_numpy = np.asanyarray(x_default, dtype=dtype, order='C') + x_numpy = np.asanyarray(x_default, dtype=dtype, order="C") assert x_numpy.flags.c_contiguous assert not x_numpy.flags.f_contiguous assert not x_numpy.flags.fnc @@ -46,8 +46,8 @@ def _test_input_format_c_contiguous_numpy(queue, dtype): # TODO: investigate sporadic failures on GPU -@pytest.mark.parametrize('queue', get_queues('cpu')) -@pytest.mark.parametrize('dtype', [np.float32, np.float64]) +@pytest.mark.parametrize("queue", get_queues("cpu")) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) def test_input_format_c_contiguous_numpy(queue, dtype): _test_input_format_c_contiguous_numpy(queue, dtype) @@ -56,7 +56,7 @@ def _test_input_format_f_contiguous_numpy(queue, dtype): rng = np.random.RandomState(0) x_default = np.array(5 * rng.random_sample((10, 4)), dtype=dtype) - x_numpy = np.asanyarray(x_default, dtype=dtype, order='F') + x_numpy = np.asanyarray(x_default, dtype=dtype, order="F") assert not x_numpy.flags.c_contiguous assert x_numpy.flags.f_contiguous assert x_numpy.flags.fnc @@ -67,8 +67,8 @@ def _test_input_format_f_contiguous_numpy(queue, dtype): # TODO: investigate sporadic failures on GPU -@pytest.mark.parametrize('queue', get_queues('cpu')) -@pytest.mark.parametrize('dtype', [np.float32, np.float64]) +@pytest.mark.parametrize("queue", get_queues("cpu")) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) def test_input_format_f_contiguous_numpy(queue, dtype): _test_input_format_f_contiguous_numpy(queue, dtype) @@ -92,18 +92,18 @@ def _test_input_format_c_not_contiguous_numpy(queue, dtype): # TODO: investigate sporadic failures on GPU -@pytest.mark.parametrize('queue', get_queues('cpu')) -@pytest.mark.parametrize('dtype', [np.float32, np.float64]) +@pytest.mark.parametrize("queue", get_queues("cpu")) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) def test_input_format_c_not_contiguous_numpy(queue, dtype): _test_input_format_c_not_contiguous_numpy(queue, dtype) def _test_input_format_c_contiguous_pandas(queue, dtype): - pd = pytest.importorskip('pandas') + pd = pytest.importorskip("pandas") rng = np.random.RandomState(0) x_default = np.array(5 * rng.random_sample((10, 4)), dtype=dtype) - x_numpy = np.asanyarray(x_default, dtype=dtype, order='C') + x_numpy = np.asanyarray(x_default, dtype=dtype, order="C") assert x_numpy.flags.c_contiguous assert not x_numpy.flags.f_contiguous assert not x_numpy.flags.fnc @@ -115,18 +115,18 @@ def _test_input_format_c_contiguous_pandas(queue, dtype): # TODO: investigate sporadic failures on GPU -@pytest.mark.parametrize('queue', get_queues('cpu')) -@pytest.mark.parametrize('dtype', [np.float32, np.float64]) +@pytest.mark.parametrize("queue", get_queues("cpu")) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) def 
test_input_format_c_contiguous_pandas(queue, dtype): _test_input_format_c_contiguous_pandas(queue, dtype) def _test_input_format_f_contiguous_pandas(queue, dtype): - pd = pytest.importorskip('pandas') + pd = pytest.importorskip("pandas") rng = np.random.RandomState(0) x_default = np.array(5 * rng.random_sample((10, 4)), dtype=dtype) - x_numpy = np.asanyarray(x_default, dtype=dtype, order='F') + x_numpy = np.asanyarray(x_default, dtype=dtype, order="F") assert not x_numpy.flags.c_contiguous assert x_numpy.flags.f_contiguous assert x_numpy.flags.fnc @@ -138,31 +138,32 @@ def _test_input_format_f_contiguous_pandas(queue, dtype): # TODO: investigate sporadic failures on GPU -@pytest.mark.parametrize('queue', get_queues('cpu')) -@pytest.mark.parametrize('dtype', [np.float32, np.float64]) +@pytest.mark.parametrize("queue", get_queues("cpu")) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) def test_input_format_f_contiguous_pandas(queue, dtype): _test_input_format_f_contiguous_pandas(queue, dtype) -@pytest.mark.skipif(not dpctl_available, - reason="requires dpctl>=0.14") -@pytest.mark.parametrize('queue', get_queues('cpu,gpu')) -@pytest.mark.parametrize('dtype', [np.float32, np.float64, np.int32, np.int64]) +@pytest.mark.skipif(not dpctl_available, reason="requires dpctl>=0.14") +@pytest.mark.parametrize("queue", get_queues("cpu,gpu")) +@pytest.mark.parametrize("dtype", [np.float32, np.float64, np.int32, np.int64]) def test_input_format_c_contiguous_dpctl(queue, dtype): rng = np.random.RandomState(0) x_default = np.array(5 * rng.random_sample((10, 59)), dtype=dtype) - x_numpy = np.asanyarray(x_default, dtype=dtype, order='C') + x_numpy = np.asanyarray(x_default, dtype=dtype, order="C") x_dpt = dpt.asarray(x_numpy, usm_type="device", sycl_queue=queue) # assert not x_dpt.flags.fnc assert isinstance(x_dpt, dpt.usm_ndarray) x_table = _backend.dpctl_to_table(x_dpt) - assert hasattr(x_table, '__sycl_usm_array_interface__') + assert hasattr(x_table, "__sycl_usm_array_interface__") x_dpt_from_table = dpt.asarray(x_table) - assert x_dpt.__sycl_usm_array_interface__[ - 'data'][0] == x_dpt_from_table.__sycl_usm_array_interface__['data'][0] + assert ( + x_dpt.__sycl_usm_array_interface__["data"][0] + == x_dpt_from_table.__sycl_usm_array_interface__["data"][0] + ) assert x_dpt.shape == x_dpt_from_table.shape assert x_dpt.strides == x_dpt_from_table.strides assert x_dpt.dtype == x_dpt_from_table.dtype @@ -170,25 +171,26 @@ def test_input_format_c_contiguous_dpctl(queue, dtype): assert x_dpt_from_table.flags.c_contiguous -@pytest.mark.skipif(not dpctl_available, - reason="requires dpctl>=0.14") -@pytest.mark.parametrize('queue', get_queues('cpu,gpu')) -@pytest.mark.parametrize('dtype', [np.float32, np.float64, np.int32, np.int64]) +@pytest.mark.skipif(not dpctl_available, reason="requires dpctl>=0.14") +@pytest.mark.parametrize("queue", get_queues("cpu,gpu")) +@pytest.mark.parametrize("dtype", [np.float32, np.float64, np.int32, np.int64]) def test_input_format_f_contiguous_dpctl(queue, dtype): rng = np.random.RandomState(0) x_default = np.array(5 * rng.random_sample((10, 59)), dtype=dtype) - x_numpy = np.asanyarray(x_default, dtype=dtype, order='F') + x_numpy = np.asanyarray(x_default, dtype=dtype, order="F") x_dpt = dpt.asarray(x_numpy, usm_type="device", sycl_queue=queue) # assert not x_dpt.flags.fnc assert isinstance(x_dpt, dpt.usm_ndarray) x_table = _backend.dpctl_to_table(x_dpt) - assert hasattr(x_table, '__sycl_usm_array_interface__') + assert hasattr(x_table, "__sycl_usm_array_interface__") 
x_dpt_from_table = dpt.asarray(x_table) - assert x_dpt.__sycl_usm_array_interface__[ - 'data'][0] == x_dpt_from_table.__sycl_usm_array_interface__['data'][0] + assert ( + x_dpt.__sycl_usm_array_interface__["data"][0] + == x_dpt_from_table.__sycl_usm_array_interface__["data"][0] + ) assert x_dpt.shape == x_dpt_from_table.shape assert x_dpt.strides == x_dpt_from_table.strides assert x_dpt.dtype == x_dpt_from_table.dtype diff --git a/onedal/decomposition/__init__.py b/onedal/decomposition/__init__.py index eda7b9fc14..618e0b9082 100644 --- a/onedal/decomposition/__init__.py +++ b/onedal/decomposition/__init__.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,8 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from .pca import PCA -__all__ = ['PCA'] +__all__ = ["PCA"] diff --git a/onedal/decomposition/pca.py b/onedal/decomposition/pca.py index eda23a1ced..b6834c731b 100644 --- a/onedal/decomposition/pca.py +++ b/onedal/decomposition/pca.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,24 +12,20 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== import numpy as np +from daal4py.sklearn._utils import sklearn_check_version from onedal import _backend + from ..common._policy import _get_policy -from ..datatypes._data_conversion import from_table, to_table -from ..datatypes import _convert_to_supported -from daal4py.sklearn._utils import sklearn_check_version +from ..datatypes import _convert_to_supported, from_table, to_table -class PCA(): +class PCA: def __init__( - self, - n_components=None, - is_deterministic=True, - method='precomputed', - copy=True + self, n_components=None, is_deterministic=True, method="precomputed", copy=True ): self.n_components = n_components self.method = method @@ -37,11 +33,10 @@ def __init__( def get_onedal_params(self, data): return { - 'fptype': - 'float' if data.dtype == np.float32 else 'double', - 'method': self.method, - 'n_components': self.n_components, - 'is_deterministic': self.is_deterministic + "fptype": "float" if data.dtype == np.float32 else "double", + "method": self.method, + "n_components": self.n_components, + "is_deterministic": self.is_deterministic, } def _get_policy(self, queue, *data): @@ -54,34 +49,27 @@ def fit(self, X, queue): policy = self._get_policy(queue, X) # TODO: investigate why np.ndarray with OWNDATA=FALSE flag # fails to be converted to oneDAL table - if isinstance(X, np.ndarray) and not X.flags['OWNDATA']: + if isinstance(X, np.ndarray) and not X.flags["OWNDATA"]: X = X.copy() X = _convert_to_supported(policy, X) params = self.get_onedal_params(X) cov_result = _backend.covariance.compute( - policy, - {'fptype': params['fptype'], 'method': 'dense'}, - to_table(X) + policy, {"fptype": params["fptype"], "method": "dense"}, to_table(X) ) covariance_matrix = from_table(cov_result.cov_matrix) self.mean_ = from_table(cov_result.means) result = _backend.decomposition.dim_reduction.train( - policy, - params, - to_table(covariance_matrix) + policy, params, to_table(covariance_matrix) ) self.n_components_ = self.n_components self.variances_ = from_table(result.variances) self.components_ = from_table(result.eigenvectors) - self.explained_variance_ = \ - np.maximum(from_table(result.eigenvalues).ravel(), 0) + self.explained_variance_ = np.maximum(from_table(result.eigenvalues).ravel(), 0) tot_var = covariance_matrix.trace() self.explained_variance_ratio_ = self.explained_variance_ / tot_var - self.singular_values_ = np.sqrt( - (n_samples - 1) * self.explained_variance_ - ) + self.singular_values_ = np.sqrt((n_samples - 1) * self.explained_variance_) if sklearn_check_version("1.2"): self.n_features_in_ = n_features @@ -94,10 +82,8 @@ def fit(self, X, queue): self.n_samples_ = n_samples if self.n_components < n_sf_min: if self.explained_variance_.shape[0] < n_sf_min: - resid_var_ = tot_var - \ - self.explained_variance_[:self.n_components].sum() - self.noise_variance_ = \ - resid_var_ / (n_sf_min - self.n_components) + resid_var_ = tot_var - self.explained_variance_[: self.n_components].sum() + self.noise_variance_ = resid_var_ / (n_sf_min - self.n_components) return self def _create_model(self): @@ -112,8 +98,7 @@ def predict(self, X, queue): X = _convert_to_supported(policy, X) params = self.get_onedal_params(X) - result = _backend.decomposition.dim_reduction.infer(policy, - params, - model, - to_table(X)) + result = _backend.decomposition.dim_reduction.infer( + policy, params, model, to_table(X) + ) return 
from_table(result.transformed_data) diff --git a/onedal/ensemble/__init__.py b/onedal/ensemble/__init__.py index 30d18d71b4..86e60b233a 100644 --- a/onedal/ensemble/__init__.py +++ b/onedal/ensemble/__init__.py @@ -14,10 +14,16 @@ # limitations under the License. # =============================================================================== -from .forest import RandomForestClassifier, RandomForestRegressor -from .forest import ExtraTreesClassifier, ExtraTreesRegressor +from .forest import ( + ExtraTreesClassifier, + ExtraTreesRegressor, + RandomForestClassifier, + RandomForestRegressor, +) -__all__ = ['RandomForestClassifier', - 'RandomForestRegressor', - 'ExtraTreesClassifier', - 'ExtraTreesRegressor'] +__all__ = [ + "RandomForestClassifier", + "RandomForestRegressor", + "ExtraTreesClassifier", + "ExtraTreesRegressor", +] diff --git a/onedal/ensemble/forest.py b/onedal/ensemble/forest.py index f300d1785f..546bd979d5 100644 --- a/onedal/ensemble/forest.py +++ b/onedal/ensemble/forest.py @@ -14,75 +14,75 @@ # limitations under the License. # =============================================================================== -from daal4py.sklearn._utils import ( - daal_check_version, sklearn_check_version) -from sklearn.ensemble import BaseEnsemble -from abc import ABCMeta, abstractmethod import numbers -from numbers import Number import warnings +from abc import ABCMeta, abstractmethod +from math import ceil +from numbers import Number + +import numpy as np +from scipy import sparse as sp +from sklearn.ensemble import BaseEnsemble from sklearn.exceptions import DataConversionWarning from sklearn.utils import ( + check_array, check_random_state, compute_sample_weight, - check_array, - deprecated) + deprecated, +) from sklearn.utils.validation import ( - check_is_fitted, + _num_samples, check_consistent_length, - _num_samples) -from math import ceil - -import numpy as np -from scipy import sparse as sp -from ..datatypes import ( - _validate_targets, - _check_X_y, - _check_array, - _column_or_1d, - _check_n_features, - _convert_to_supported + check_is_fitted, ) -from ..common._mixin import ClassifierMixin, RegressorMixin -from ..common._policy import _get_policy -from ..common._estimator_checks import _check_is_fitted -from ..datatypes._data_conversion import from_table, to_table +from daal4py.sklearn._utils import daal_check_version, sklearn_check_version from onedal import _backend -from sklearn.tree import DecisionTreeClassifier +from ..common._estimator_checks import _check_is_fitted +from ..common._mixin import ClassifierMixin, RegressorMixin +from ..common._policy import _get_policy +from ..datatypes import _convert_to_supported, from_table, to_table +from ..utils import ( + _check_array, + _check_n_features, + _check_X_y, + _column_or_1d, + _validate_targets, +) class BaseForest(BaseEnsemble, metaclass=ABCMeta): @abstractmethod def __init__( - self, - n_estimators, - criterion, - max_depth, - min_samples_split, - min_samples_leaf, - min_weight_fraction_leaf, - max_features, - max_leaf_nodes, - min_impurity_decrease, - min_impurity_split, - bootstrap, - oob_score, - random_state, - warm_start, - class_weight, - ccp_alpha, - max_samples, - max_bins, - min_bin_size, - infer_mode, - splitter_mode, - voting_mode, - error_metric_mode, - variable_importance_mode, - algorithm, - **kwargs): + self, + n_estimators, + criterion, + max_depth, + min_samples_split, + min_samples_leaf, + min_weight_fraction_leaf, + max_features, + max_leaf_nodes, + min_impurity_decrease, + min_impurity_split, + 
bootstrap, + oob_score, + random_state, + warm_start, + class_weight, + ccp_alpha, + max_samples, + max_bins, + min_bin_size, + infer_mode, + splitter_mode, + voting_mode, + error_metric_mode, + variable_importance_mode, + algorithm, + **kwargs, + ): self.n_estimators = n_estimators self.bootstrap = bootstrap self.oob_score = oob_score @@ -109,32 +109,41 @@ def __init__( self.variable_importance_mode = variable_importance_mode self.algorithm = algorithm - def _to_absolute_max_features(self, max_features, n_features, - is_classification=False): + def _to_absolute_max_features( + self, max_features, n_features, is_classification=False + ): if max_features is None: return n_features if isinstance(max_features, str): if max_features == "auto": - if not sklearn_check_version('1.3'): - if sklearn_check_version('1.1'): + if not sklearn_check_version("1.3"): + if sklearn_check_version("1.1"): warnings.warn( "`max_features='auto'` has been deprecated in 1.1 " "and will be removed in 1.3. To keep the past behaviour, " "explicitly set `max_features=1.0` or remove this " "parameter as it is also the default value for " "RandomForestRegressors and ExtraTreesRegressors.", - FutureWarning, ) - return max(1, int(np.sqrt(n_features)) - ) if is_classification else n_features - if max_features == 'sqrt': + FutureWarning, + ) + return ( + max(1, int(np.sqrt(n_features))) + if is_classification + else n_features + ) + if max_features == "sqrt": return max(1, int(np.sqrt(n_features))) if max_features == "log2": return max(1, int(np.log2(n_features))) - allowed_string_values = '"sqrt" or "log2"' if sklearn_check_version( - '1.3') else '"auto", "sqrt" or "log2"' + allowed_string_values = ( + '"sqrt" or "log2"' + if sklearn_check_version("1.3") + else '"auto", "sqrt" or "log2"' + ) raise ValueError( - 'Invalid value for max_features. Allowed string ' - f'values are {allowed_string_values}.') + "Invalid value for max_features. Allowed string " + f"values are {allowed_string_values}." + ) if isinstance(max_features, (numbers.Integral, np.integer)): return max_features if max_features > 0.0: @@ -143,10 +152,10 @@ def _to_absolute_max_features(self, max_features, n_features, def _get_observations_per_tree_fraction(self, n_samples, max_samples): if max_samples is None: - return 1. 
+ return 1.0 if isinstance(max_samples, numbers.Integral): - if not sklearn_check_version('1.2'): + if not sklearn_check_version("1.2"): if not (1 <= max_samples <= n_samples): msg = "`max_samples` must be in range 1 to {} but got value {}" raise ValueError(msg.format(n_samples, max_samples)) @@ -157,9 +166,9 @@ def _get_observations_per_tree_fraction(self, n_samples, max_samples): return max(float(max_samples / n_samples), 1 / n_samples) if isinstance(max_samples, numbers.Real): - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): pass - elif sklearn_check_version('1.0'): + elif sklearn_check_version("1.0"): if not (0 < float(max_samples) <= 1): msg = "`max_samples` must be in range (0.0, 1.0] but got value {}" raise ValueError(msg.format(max_samples)) @@ -175,12 +184,15 @@ def _get_observations_per_tree_fraction(self, n_samples, max_samples): def _get_onedal_params(self, data): n_samples, n_features = data.shape features_per_node = self._to_absolute_max_features( - self.max_features, n_features, self.is_classification) + self.max_features, n_features, self.is_classification + ) observations_per_tree_fraction = self._get_observations_per_tree_fraction( - n_samples=n_samples, max_samples=self.max_samples) - observations_per_tree_fraction = observations_per_tree_fraction if bool( - self.bootstrap) else 1. + n_samples=n_samples, max_samples=self.max_samples + ) + observations_per_tree_fraction = ( + observations_per_tree_fraction if bool(self.bootstrap) else 1.0 + ) if not self.bootstrap and self.max_samples is not None: raise ValueError( @@ -189,116 +201,126 @@ def _get_onedal_params(self, data): "`max_sample=None`." ) if not self.bootstrap and self.oob_score: - raise ValueError("Out of bag estimation only available" - " if bootstrap=True") + raise ValueError("Out of bag estimation only available" " if bootstrap=True") min_observations_in_leaf_node = ( - self.min_samples_leaf if isinstance( - self.min_samples_leaf, - numbers.Integral) else int( - ceil( - self.min_samples_leaf * n_samples))) + self.min_samples_leaf + if isinstance(self.min_samples_leaf, numbers.Integral) + else int(ceil(self.min_samples_leaf * n_samples)) + ) min_observations_in_split_node = ( - self.min_samples_split if isinstance( - self.min_samples_split, - numbers.Integral) else int( - ceil( - self.min_samples_split * n_samples))) + self.min_samples_split + if isinstance(self.min_samples_split, numbers.Integral) + else int(ceil(self.min_samples_split * n_samples)) + ) onedal_params = { - 'fptype': 'float' if data.dtype == np.float32 else 'double', - 'method': self.algorithm, - 'infer_mode': self.infer_mode, - 'voting_mode': self.voting_mode, - 'observations_per_tree_fraction': observations_per_tree_fraction, - 'impurity_threshold': float( - 0.0 if self.min_impurity_split is None else self.min_impurity_split), - 'min_weight_fraction_in_leaf_node': self.min_weight_fraction_leaf, - 'min_impurity_decrease_in_split_node': self.min_impurity_decrease, - 'tree_count': int(self.n_estimators), - 'features_per_node': features_per_node, - 'max_tree_depth': int(0 if self.max_depth is None else self.max_depth), - 'min_observations_in_leaf_node': min_observations_in_leaf_node, - 'min_observations_in_split_node': min_observations_in_split_node, - 'max_leaf_nodes': (0 if self.max_leaf_nodes is None else self.max_leaf_nodes), - 'max_bins': self.max_bins, - 'min_bin_size': self.min_bin_size, - 'memory_saving_mode': False, - 'bootstrap': bool(self.bootstrap), - 'error_metric_mode': self.error_metric_mode, - 
'variable_importance_mode': self.variable_importance_mode, + "fptype": "float" if data.dtype == np.float32 else "double", + "method": self.algorithm, + "infer_mode": self.infer_mode, + "voting_mode": self.voting_mode, + "observations_per_tree_fraction": observations_per_tree_fraction, + "impurity_threshold": float( + 0.0 if self.min_impurity_split is None else self.min_impurity_split + ), + "min_weight_fraction_in_leaf_node": self.min_weight_fraction_leaf, + "min_impurity_decrease_in_split_node": self.min_impurity_decrease, + "tree_count": int(self.n_estimators), + "features_per_node": features_per_node, + "max_tree_depth": int(0 if self.max_depth is None else self.max_depth), + "min_observations_in_leaf_node": min_observations_in_leaf_node, + "min_observations_in_split_node": min_observations_in_split_node, + "max_leaf_nodes": (0 if self.max_leaf_nodes is None else self.max_leaf_nodes), + "max_bins": self.max_bins, + "min_bin_size": self.min_bin_size, + "memory_saving_mode": False, + "bootstrap": bool(self.bootstrap), + "error_metric_mode": self.error_metric_mode, + "variable_importance_mode": self.variable_importance_mode, } if self.is_classification: - onedal_params['class_count'] = 0 if self.classes_ is None else len( - self.classes_) - if daal_check_version((2023, 'P', 101)): - onedal_params['splitter_mode'] = self.splitter_mode + onedal_params["class_count"] = ( + 0 if self.classes_ is None else len(self.classes_) + ) + if daal_check_version((2023, "P", 101)): + onedal_params["splitter_mode"] = self.splitter_mode return onedal_params def _check_parameters(self): if isinstance(self.min_samples_leaf, numbers.Integral): if not 1 <= self.min_samples_leaf: - raise ValueError("min_samples_leaf must be at least 1 " - "or in (0, 0.5], got %s" - % self.min_samples_leaf) + raise ValueError( + "min_samples_leaf must be at least 1 " + "or in (0, 0.5], got %s" % self.min_samples_leaf + ) else: # float - if not 0. < self.min_samples_leaf <= 0.5: - raise ValueError("min_samples_leaf must be at least 1 " - "or in (0, 0.5], got %s" - % self.min_samples_leaf) + if not 0.0 < self.min_samples_leaf <= 0.5: + raise ValueError( + "min_samples_leaf must be at least 1 " + "or in (0, 0.5], got %s" % self.min_samples_leaf + ) if isinstance(self.min_samples_split, numbers.Integral): if not 2 <= self.min_samples_split: - raise ValueError("min_samples_split must be an integer " - "greater than 1 or a float in (0.0, 1.0]; " - "got the integer %s" - % self.min_samples_split) + raise ValueError( + "min_samples_split must be an integer " + "greater than 1 or a float in (0.0, 1.0]; " + "got the integer %s" % self.min_samples_split + ) else: # float - if not 0. < self.min_samples_split <= 1.: - raise ValueError("min_samples_split must be an integer " - "greater than 1 or a float in (0.0, 1.0]; " - "got the float %s" - % self.min_samples_split) + if not 0.0 < self.min_samples_split <= 1.0: + raise ValueError( + "min_samples_split must be an integer " + "greater than 1 or a float in (0.0, 1.0]; " + "got the float %s" % self.min_samples_split + ) if not 0 <= self.min_weight_fraction_leaf <= 0.5: raise ValueError("min_weight_fraction_leaf must in [0, 0.5]") if self.min_impurity_split is not None: - warnings.warn("The min_impurity_split parameter is deprecated. " - "Its default value has changed from 1e-7 to 0 in " - "version 0.23, and it will be removed in 0.25. 
" - "Use the min_impurity_decrease parameter instead.", - FutureWarning) - - if self.min_impurity_split < 0.: - raise ValueError("min_impurity_split must be greater than " - "or equal to 0") - if self.min_impurity_decrease < 0.: - raise ValueError("min_impurity_decrease must be greater than " - "or equal to 0") + warnings.warn( + "The min_impurity_split parameter is deprecated. " + "Its default value has changed from 1e-7 to 0 in " + "version 0.23, and it will be removed in 0.25. " + "Use the min_impurity_decrease parameter instead.", + FutureWarning, + ) + + if self.min_impurity_split < 0.0: + raise ValueError( + "min_impurity_split must be greater than " "or equal to 0" + ) + if self.min_impurity_decrease < 0.0: + raise ValueError( + "min_impurity_decrease must be greater than " "or equal to 0" + ) if self.max_leaf_nodes is not None: if not isinstance(self.max_leaf_nodes, numbers.Integral): raise ValueError( "max_leaf_nodes must be integral number but was " - "%r" % - self.max_leaf_nodes) + "%r" % self.max_leaf_nodes + ) if self.max_leaf_nodes < 2: raise ValueError( - ("max_leaf_nodes {0} must be either None " - "or larger than 1").format( - self.max_leaf_nodes)) + ("max_leaf_nodes {0} must be either None " "or larger than 1").format( + self.max_leaf_nodes + ) + ) if isinstance(self.max_bins, numbers.Integral): if not 2 <= self.max_bins: - raise ValueError("max_bins must be at least 2, got %s" - % self.max_bins) + raise ValueError("max_bins must be at least 2, got %s" % self.max_bins) else: - raise ValueError("max_bins must be integral number but was " - "%r" % self.max_bins) + raise ValueError( + "max_bins must be integral number but was " "%r" % self.max_bins + ) if isinstance(self.min_bin_size, numbers.Integral): if not 1 <= self.min_bin_size: - raise ValueError("min_bin_size must be at least 1, got %s" - % self.min_bin_size) + raise ValueError( + "min_bin_size must be at least 1, got %s" % self.min_bin_size + ) else: - raise ValueError("min_bin_size must be integral number but was " - "%r" % self.min_bin_size) + raise ValueError( + "min_bin_size must be integral number but was " "%r" % self.min_bin_size + ) def _validate_targets(self, y, dtype): self.class_weight_ = None @@ -311,18 +333,20 @@ def _get_sample_weight(self, X, y, sample_weight): if n_samples == 1: raise ValueError("n_samples=1") - sample_weight = np.asarray([] - if sample_weight is None - else sample_weight, dtype=dtype) + sample_weight = np.asarray( + [] if sample_weight is None else sample_weight, dtype=dtype + ) sample_weight = sample_weight.ravel() sample_weight_count = sample_weight.shape[0] if sample_weight_count != 0 and sample_weight_count != n_samples: - raise ValueError("sample_weight and X have incompatible shapes: " - "%r vs %r\n" - "Note: Sparse matrices cannot be indexed w/" - "boolean masks (use `indices=True` in CV)." - % (len(sample_weight), X.shape)) + raise ValueError( + "sample_weight and X have incompatible shapes: " + "%r vs %r\n" + "Note: Sparse matrices cannot be indexed w/" + "boolean masks (use `indices=True` in CV)." 
+ % (len(sample_weight), X.shape) + ) if sample_weight_count == 0: sample_weight = np.ones(n_samples, dtype=dtype) @@ -330,15 +354,21 @@ def _get_sample_weight(self, X, y, sample_weight): sample_weight = np.full(n_samples, sample_weight, dtype=dtype) else: sample_weight = _check_array( - sample_weight, accept_sparse=False, ensure_2d=False, - dtype=dtype, order="C" + sample_weight, + accept_sparse=False, + ensure_2d=False, + dtype=dtype, + order="C", ) if sample_weight.ndim != 1: raise ValueError("Sample weights must be 1D array or scalar") if sample_weight.shape != (n_samples,): - raise ValueError("sample_weight.shape == {}, expected {}!" - .format(sample_weight.shape, (n_samples,))) + raise ValueError( + "sample_weight.shape == {}, expected {}!".format( + sample_weight.shape, (n_samples,) + ) + ) return sample_weight def _get_policy(self, queue, *data): @@ -346,27 +376,31 @@ def _get_policy(self, queue, *data): def _fit(self, X, y, sample_weight, module, queue): X, y = _check_X_y( - X, y, dtype=[np.float64, np.float32], - force_all_finite=True, accept_sparse='csr') + X, + y, + dtype=[np.float64, np.float32], + force_all_finite=True, + accept_sparse="csr", + ) y = self._validate_targets(y, X.dtype) sample_weight = self._get_sample_weight(X, y, sample_weight) self.n_features_in_ = X.shape[1] - if not sklearn_check_version('1.0'): + if not sklearn_check_version("1.0"): self.n_features_ = self.n_features_in_ policy = self._get_policy(queue, X, y, sample_weight) X, y, sample_weight = _convert_to_supported(policy, X, y, sample_weight) params = self._get_onedal_params(X) - train_result = module.train( - policy, params, *to_table(X, y, sample_weight)) + train_result = module.train(policy, params, *to_table(X, y, sample_weight)) self._onedal_model = train_result.model if self.oob_score: if self.is_classification: self.oob_score_ = from_table(train_result.oob_err_accuracy)[0, 0] self.oob_decision_function_ = from_table( - train_result.oob_err_decision_function) + train_result.oob_err_decision_function + ) if np.any(self.oob_decision_function_ == 0): warnings.warn( "Some inputs do not have OOB scores. This probably means " @@ -377,7 +411,8 @@ def _fit(self, X, y, sample_weight, module, queue): else: self.oob_score_ = from_table(train_result.oob_err_r2)[0, 0] self.oob_prediction_ = from_table( - train_result.oob_err_prediction).reshape(-1) + train_result.oob_err_prediction + ).reshape(-1) if np.any(self.oob_prediction_ == 0): warnings.warn( "Some inputs do not have OOB scores. This probably means " @@ -391,12 +426,13 @@ def _fit(self, X, y, sample_weight, module, queue): def _create_model(self, module): # TODO: # upate error msg. 
- raise NotImplementedError('Creating model is not supported.') + raise NotImplementedError("Creating model is not supported.") def _predict(self, X, module, queue): _check_is_fitted(self) - X = _check_array(X, dtype=[np.float64, np.float32], - force_all_finite=True, accept_sparse=False) + X = _check_array( + X, dtype=[np.float64, np.float32], force_all_finite=True, accept_sparse=False + ) _check_n_features(self, X, False) policy = self._get_policy(queue, X) @@ -409,13 +445,14 @@ def _predict(self, X, module, queue): def _predict_proba(self, X, module, queue): _check_is_fitted(self) - X = _check_array(X, dtype=[np.float64, np.float32], - force_all_finite=True, accept_sparse=False) + X = _check_array( + X, dtype=[np.float64, np.float32], force_all_finite=True, accept_sparse=False + ) _check_n_features(self, X, False) policy = self._get_policy(queue, X) X = _convert_to_supported(policy, X) params = self._get_onedal_params(X) - params['infer_mode'] = 'class_probabilities' + params["infer_mode"] = "class_probabilities" model = self._onedal_model result = module.infer(policy, params, model, to_table(X)) @@ -424,33 +461,35 @@ def _predict_proba(self, X, module, queue): class RandomForestClassifier(ClassifierMixin, BaseForest, metaclass=ABCMeta): - def __init__(self, - n_estimators=100, - criterion="gini", - max_depth=None, - min_samples_split=2, - min_samples_leaf=1, - min_weight_fraction_leaf=0., - max_features='sqrt' if sklearn_check_version('1.1') else 'auto', - max_leaf_nodes=None, - min_impurity_decrease=0., - min_impurity_split=None, - bootstrap=True, - oob_score=False, - random_state=None, - warm_start=False, - class_weight=None, - ccp_alpha=0.0, - max_samples=None, - max_bins=256, - min_bin_size=1, - infer_mode='class_responses', - splitter_mode='best', - voting_mode='weighted', - error_metric_mode='none', - variable_importance_mode='none', - algorithm='hist', - **kwargs): + def __init__( + self, + n_estimators=100, + criterion="gini", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features="sqrt" if sklearn_check_version("1.1") else "auto", + max_leaf_nodes=None, + min_impurity_decrease=0.0, + min_impurity_split=None, + bootstrap=True, + oob_score=False, + random_state=None, + warm_start=False, + class_weight=None, + ccp_alpha=0.0, + max_samples=None, + max_bins=256, + min_bin_size=1, + infer_mode="class_responses", + splitter_mode="best", + voting_mode="weighted", + error_metric_mode="none", + variable_importance_mode="none", + algorithm="hist", + **kwargs, + ): super().__init__( n_estimators=n_estimators, criterion=criterion, @@ -476,12 +515,14 @@ def __init__(self, voting_mode=voting_mode, error_metric_mode=error_metric_mode, variable_importance_mode=variable_importance_mode, - algorithm=algorithm) + algorithm=algorithm, + ) self.is_classification = True def _validate_targets(self, y, dtype): y, self.class_weight_, self.classes_ = _validate_targets( - y, self.class_weight, dtype) + y, self.class_weight, dtype + ) # Decapsulate classes_ attributes # TODO: @@ -491,50 +532,49 @@ def _validate_targets(self, y, dtype): return y def fit(self, X, y, sample_weight=None, queue=None): - return self._fit(X, y, sample_weight, - _backend.decision_forest.classification, queue) + return self._fit( + X, y, sample_weight, _backend.decision_forest.classification, queue + ) def predict(self, X, queue=None): pred = super()._predict(X, _backend.decision_forest.classification, queue) - return np.take( - self.classes_, - pred.ravel().astype( - np.int64, 
- casting='unsafe')) + return np.take(self.classes_, pred.ravel().astype(np.int64, casting="unsafe")) def predict_proba(self, X, queue=None): return super()._predict_proba(X, _backend.decision_forest.classification, queue) class RandomForestRegressor(RegressorMixin, BaseForest, metaclass=ABCMeta): - def __init__(self, - n_estimators=100, - criterion="squared_error", - max_depth=None, - min_samples_split=2, - min_samples_leaf=1, - min_weight_fraction_leaf=0., - max_features=1.0 if sklearn_check_version('1.1') else 'auto', - max_leaf_nodes=None, - min_impurity_decrease=0., - min_impurity_split=None, - bootstrap=True, - oob_score=False, - random_state=None, - warm_start=False, - class_weight=None, - ccp_alpha=0.0, - max_samples=None, - max_bins=256, - min_bin_size=1, - infer_mode='class_responses', - splitter_mode='best', - voting_mode='weighted', - error_metric_mode='none', - variable_importance_mode='none', - algorithm='dense', - **kwargs): + def __init__( + self, + n_estimators=100, + criterion="squared_error", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features=1.0 if sklearn_check_version("1.1") else "auto", + max_leaf_nodes=None, + min_impurity_decrease=0.0, + min_impurity_split=None, + bootstrap=True, + oob_score=False, + random_state=None, + warm_start=False, + class_weight=None, + ccp_alpha=0.0, + max_samples=None, + max_bins=256, + min_bin_size=1, + infer_mode="class_responses", + splitter_mode="best", + voting_mode="weighted", + error_metric_mode="none", + variable_importance_mode="none", + algorithm="hist", + **kwargs, + ): super().__init__( n_estimators=n_estimators, criterion=criterion, @@ -560,49 +600,53 @@ def __init__(self, voting_mode=voting_mode, error_metric_mode=error_metric_mode, variable_importance_mode=variable_importance_mode, - algorithm=algorithm) + algorithm=algorithm, + ) self.is_classification = False def fit(self, X, y, sample_weight=None, queue=None): if sample_weight is not None: - if hasattr(sample_weight, '__array__'): + if hasattr(sample_weight, "__array__"): sample_weight[sample_weight == 0.0] = 1.0 sample_weight = [sample_weight] - return super()._fit(X, y, sample_weight, - _backend.decision_forest.regression, queue) + return super()._fit( + X, y, sample_weight, _backend.decision_forest.regression, queue + ) def predict(self, X, queue=None): return super()._predict(X, _backend.decision_forest.regression, queue).ravel() class ExtraTreesClassifier(ClassifierMixin, BaseForest, metaclass=ABCMeta): - def __init__(self, - n_estimators=100, - criterion="gini", - max_depth=None, - min_samples_split=2, - min_samples_leaf=1, - min_weight_fraction_leaf=0., - max_features="auto", - max_leaf_nodes=None, - min_impurity_decrease=0., - min_impurity_split=None, - bootstrap=False, - oob_score=False, - random_state=None, - warm_start=False, - class_weight=None, - ccp_alpha=0.0, - max_samples=None, - max_bins=256, - min_bin_size=1, - infer_mode='class_responses', - splitter_mode='random', - voting_mode='weighted', - error_metric_mode='none', - variable_importance_mode='none', - algorithm='hist', - **kwargs): + def __init__( + self, + n_estimators=100, + criterion="gini", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features="auto", + max_leaf_nodes=None, + min_impurity_decrease=0.0, + min_impurity_split=None, + bootstrap=False, + oob_score=False, + random_state=None, + warm_start=False, + class_weight=None, + ccp_alpha=0.0, + max_samples=None, + max_bins=256, + 
min_bin_size=1, + infer_mode="class_responses", + splitter_mode="random", + voting_mode="weighted", + error_metric_mode="none", + variable_importance_mode="none", + algorithm="hist", + **kwargs, + ): super().__init__( n_estimators=n_estimators, criterion=criterion, @@ -628,12 +672,14 @@ def __init__(self, voting_mode=voting_mode, error_metric_mode=error_metric_mode, variable_importance_mode=variable_importance_mode, - algorithm=algorithm) + algorithm=algorithm, + ) self.is_classification = True def _validate_targets(self, y, dtype): y, self.class_weight_, self.classes_ = _validate_targets( - y, self.class_weight, dtype) + y, self.class_weight, dtype + ) # Decapsulate classes_ attributes # TODO: @@ -643,50 +689,49 @@ def _validate_targets(self, y, dtype): return y def fit(self, X, y, sample_weight=None, queue=None): - return self._fit(X, y, sample_weight, - _backend.decision_forest.classification, queue) + return self._fit( + X, y, sample_weight, _backend.decision_forest.classification, queue + ) def predict(self, X, queue=None): pred = super()._predict(X, _backend.decision_forest.classification, queue) - return np.take( - self.classes_, - pred.ravel().astype( - np.int64, - casting='unsafe')) + return np.take(self.classes_, pred.ravel().astype(np.int64, casting="unsafe")) def predict_proba(self, X, queue=None): return super()._predict_proba(X, _backend.decision_forest.classification, queue) class ExtraTreesRegressor(RegressorMixin, BaseForest, metaclass=ABCMeta): - def __init__(self, - n_estimators=100, - criterion="squared_error", - max_depth=None, - min_samples_split=2, - min_samples_leaf=1, - min_weight_fraction_leaf=0., - max_features="auto", - max_leaf_nodes=None, - min_impurity_decrease=0., - min_impurity_split=None, - bootstrap=False, - oob_score=False, - random_state=None, - warm_start=False, - class_weight=None, - ccp_alpha=0.0, - max_samples=None, - max_bins=256, - min_bin_size=1, - infer_mode='class_responses', - splitter_mode='random', - voting_mode='weighted', - error_metric_mode='none', - variable_importance_mode='none', - algorithm='hist', - **kwargs): + def __init__( + self, + n_estimators=100, + criterion="squared_error", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features="auto", + max_leaf_nodes=None, + min_impurity_decrease=0.0, + min_impurity_split=None, + bootstrap=False, + oob_score=False, + random_state=None, + warm_start=False, + class_weight=None, + ccp_alpha=0.0, + max_samples=None, + max_bins=256, + min_bin_size=1, + infer_mode="class_responses", + splitter_mode="random", + voting_mode="weighted", + error_metric_mode="none", + variable_importance_mode="none", + algorithm="hist", + **kwargs, + ): super().__init__( n_estimators=n_estimators, criterion=criterion, @@ -712,16 +757,18 @@ def __init__(self, voting_mode=voting_mode, error_metric_mode=error_metric_mode, variable_importance_mode=variable_importance_mode, - algorithm=algorithm) + algorithm=algorithm, + ) self.is_classification = False def fit(self, X, y, sample_weight=None, queue=None): if sample_weight is not None: - if hasattr(sample_weight, '__array__'): + if hasattr(sample_weight, "__array__"): sample_weight[sample_weight == 0.0] = 1.0 sample_weight = [sample_weight] - return super()._fit(X, y, sample_weight, - _backend.decision_forest.regression, queue) + return super()._fit( + X, y, sample_weight, _backend.decision_forest.regression, queue + ) def predict(self, X, queue=None): return super()._predict(X, _backend.decision_forest.regression, 
queue).ravel() diff --git a/onedal/ensemble/tests/test_random_forest.py b/onedal/ensemble/tests/test_random_forest.py index 84fab6ea44..317c63556a 100644 --- a/onedal/ensemble/tests/test_random_forest.py +++ b/onedal/ensemble/tests/test_random_forest.py @@ -14,56 +14,64 @@ # limitations under the License. # =============================================================================== -import pytest import numpy as np +import pytest from numpy.testing import assert_allclose +from sklearn.datasets import make_classification, make_regression from daal4py.sklearn._utils import daal_check_version from onedal.ensemble import RandomForestClassifier, RandomForestRegressor from onedal.tests.utils._device_selection import get_queues -from sklearn.datasets import make_classification, make_regression - -@pytest.mark.parametrize('queue', get_queues()) +@pytest.mark.parametrize("queue", get_queues()) def test_rf_classifier(queue): - X, y = make_classification(n_samples=100, n_features=4, - n_informative=2, n_redundant=0, - random_state=0, shuffle=False) - rf = RandomForestClassifier( - max_depth=2, random_state=0).fit(X, y, queue=queue) + X, y = make_classification( + n_samples=100, + n_features=4, + n_informative=2, + n_redundant=0, + random_state=0, + shuffle=False, + ) + rf = RandomForestClassifier(max_depth=2, random_state=0).fit(X, y, queue=queue) assert_allclose([1], rf.predict([[0, 0, 0, 0]], queue=queue)) -@pytest.mark.parametrize('queue', get_queues()) +@pytest.mark.parametrize("queue", get_queues()) def test_rf_regression(queue): - X, y = make_regression(n_samples=100, n_features=4, n_informative=2, - random_state=0, shuffle=False) - rf = RandomForestRegressor( - max_depth=2, random_state=0).fit(X, y, queue=queue) - assert_allclose( - [-6.83], rf.predict([[0, 0, 0, 0]], queue=queue), atol=1e-2) + X, y = make_regression( + n_samples=100, n_features=4, n_informative=2, random_state=0, shuffle=False + ) + rf = RandomForestRegressor(max_depth=2, random_state=0).fit(X, y, queue=queue) + assert_allclose([-6.83], rf.predict([[0, 0, 0, 0]], queue=queue), atol=1e-2) -@pytest.mark.skipif(not daal_check_version((2023, 'P', 101)), - reason='requires OneDAL 2023.1.1') -@pytest.mark.parametrize('queue', get_queues('gpu')) +@pytest.mark.skipif( + not daal_check_version((2023, "P", 101)), reason="requires OneDAL 2023.1.1" +) +@pytest.mark.parametrize("queue", get_queues("gpu")) def test_rf_classifier_random_splitter(queue): - X, y = make_classification(n_samples=100, n_features=4, - n_informative=2, n_redundant=0, - random_state=0, shuffle=False) - rf = RandomForestClassifier( - max_depth=2, random_state=0, - splitter_mode='random').fit(X, y, queue=queue) + X, y = make_classification( + n_samples=100, + n_features=4, + n_informative=2, + n_redundant=0, + random_state=0, + shuffle=False, + ) + rf = RandomForestClassifier(max_depth=2, random_state=0, splitter_mode="random").fit( + X, y, queue=queue + ) assert_allclose([1], rf.predict([[0, 0, 0, 0]], queue=queue)) -@pytest.mark.parametrize('queue', get_queues('gpu')) +@pytest.mark.parametrize("queue", get_queues("gpu")) def test_rf_regression_random_splitter(queue): - X, y = make_regression(n_samples=100, n_features=4, n_informative=2, - random_state=0, shuffle=False) - rf = RandomForestRegressor( - max_depth=2, random_state=0, - splitter_mode='random').fit(X, y, queue=queue) - assert_allclose( - [-6.83], rf.predict([[0, 0, 0, 0]], queue=queue), atol=1e-2) + X, y = make_regression( + n_samples=100, n_features=4, n_informative=2, random_state=0, shuffle=False + 
) + rf = RandomForestRegressor(max_depth=2, random_state=0, splitter_mode="random").fit( + X, y, queue=queue + ) + assert_allclose([-6.83], rf.predict([[0, 0, 0, 0]], queue=queue), atol=1e-2) diff --git a/onedal/linear_model/__init__.py b/onedal/linear_model/__init__.py index b7be0fbcf4..ee4de6210c 100755 --- a/onedal/linear_model/__init__.py +++ b/onedal/linear_model/__init__.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,8 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from .linear_model import LinearRegression -__all__ = ['LinearRegression'] +__all__ = ["LinearRegression"] diff --git a/onedal/linear_model/linear_model.py b/onedal/linear_model/linear_model.py index 8861effe89..3c9a310bff 100755 --- a/onedal/linear_model/linear_model.py +++ b/onedal/linear_model/linear_model.py @@ -14,25 +14,20 @@ # limitations under the License. # =============================================================================== -from sklearn.base import BaseEstimator from abc import ABCMeta, abstractmethod +from numbers import Number import numpy as np -from numbers import Number +from sklearn.base import BaseEstimator -from daal4py.sklearn._utils import (get_dtype, make2d) -from ..datatypes import ( - _check_X_y, - _num_features, - _check_array, - _check_n_features, - _convert_to_supported) +from daal4py.sklearn._utils import get_dtype, make2d +from onedal import _backend +from ..common._estimator_checks import _check_is_fitted from ..common._mixin import RegressorMixin from ..common._policy import _get_policy -from ..common._estimator_checks import _check_is_fitted -from ..datatypes._data_conversion import from_table, to_table -from onedal import _backend +from ..datatypes import _convert_to_supported, from_table, to_table +from ..utils import _check_array, _check_n_features, _check_X_y, _num_features class BaseLinearRegression(BaseEstimator, metaclass=ABCMeta): @@ -46,11 +41,12 @@ def _get_policy(self, queue, *data): return _get_policy(queue, *data) def _get_onedal_params(self, dtype=np.float32): - intercept = 'intercept|' if self.fit_intercept else '' + intercept = "intercept|" if self.fit_intercept else "" return { - 'fptype': 'float' if dtype == np.float32 else 'double', - 'method': self.algorithm, 'intercept': self.fit_intercept, - 'result_option': (intercept + 'coefficients'), + "fptype": "float" if dtype == np.float32 else "double", + "method": self.algorithm, + "intercept": self.fit_intercept, + "result_option": (intercept + "coefficients"), } def _fit(self, X, y, module, queue): @@ -68,8 +64,7 @@ def _fit(self, X, y, module, queue): y_loc = np.asarray(y_loc).astype(dtype=dtype) # Finiteness is checked in the sklearnex wrapper - X_loc, y_loc = _check_X_y( - X_loc, y_loc, force_all_finite=False, accept_2d_y=True) + X_loc, y_loc = _check_X_y(X_loc, y_loc, force_all_finite=False, accept_2d_y=True) self.n_features_in_ = _num_features(X_loc, fallback_1d=True) @@ -113,14 +108,18 @@ def _create_model(self, module, policy): intercept = np.asarray(intercept, 
dtype=dtype) assert n_targets_in == intercept.size - intercept = _check_array(intercept, dtype=[np.float64, np.float32], - force_all_finite=True, ensure_2d=False) + intercept = _check_array( + intercept, + dtype=[np.float64, np.float32], + force_all_finite=True, + ensure_2d=False, + ) coefficients = _check_array( coefficients, - dtype=[ - np.float64, - np.float32], - force_all_finite=True, ensure_2d=False) + dtype=[np.float64, np.float32], + force_all_finite=True, + ensure_2d=False, + ) coefficients, intercept = make2d(coefficients), make2d(intercept) coefficients = coefficients.T if n_targets_in == 1 else coefficients @@ -154,11 +153,12 @@ def _predict(self, X, module, queue): X_loc = X # Finiteness is checked in the sklearnex wrapper - X_loc = _check_array(X_loc, dtype=[np.float64, np.float32], - force_all_finite=False, ensure_2d=False) + X_loc = _check_array( + X_loc, dtype=[np.float64, np.float32], force_all_finite=False, ensure_2d=False + ) _check_n_features(self, X_loc, False) - if hasattr(self, '_onedal_model'): + if hasattr(self, "_onedal_model"): model = self._onedal_model else: model = self._create_model(module, policy) @@ -188,12 +188,8 @@ class LinearRegression(RegressorMixin, BaseLinearRegression): """ def __init__( - self, - fit_intercept=True, - copy_X=False, - *, - algorithm='norm_eq', - **kwargs): + self, fit_intercept=True, copy_X=False, *, algorithm="norm_eq", **kwargs + ): super().__init__(fit_intercept=fit_intercept, copy_X=copy_X, algorithm=algorithm) def fit(self, X, y, queue=None): diff --git a/onedal/linear_model/tests/test_linear_regression.py b/onedal/linear_model/tests/test_linear_regression.py index 6809ef30b3..5962c530b1 100755 --- a/onedal/linear_model/tests/test_linear_regression.py +++ b/onedal/linear_model/tests/test_linear_regression.py @@ -16,33 +16,32 @@ from daal4py.sklearn._utils import daal_check_version, sklearn_check_version -if daal_check_version((2023, 'P', 100)): - import pytest +if daal_check_version((2023, "P", 100)): import numpy as np + import pytest from numpy.testing import assert_allclose, assert_array_equal - - from onedal.linear_model import LinearRegression - from onedal.tests.utils._device_selection import get_queues - from sklearn.datasets import load_diabetes from sklearn.metrics import mean_squared_error from sklearn.model_selection import train_test_split - @pytest.mark.parametrize('queue', get_queues()) - @pytest.mark.parametrize('dtype', [np.float32, np.float64]) + from onedal.linear_model import LinearRegression + from onedal.tests.utils._device_selection import get_queues + + @pytest.mark.parametrize("queue", get_queues()) + @pytest.mark.parametrize("dtype", [np.float32, np.float64]) def test_diabetes(queue, dtype): X, y = load_diabetes(return_X_y=True) X, y = X.astype(dtype), y.astype(dtype) - X_train, X_test, y_train, y_test = \ - train_test_split(X, y, - train_size=0.8, random_state=777) + X_train, X_test, y_train, y_test = train_test_split( + X, y, train_size=0.8, random_state=777 + ) model = LinearRegression(fit_intercept=True) model.fit(X_train, y_train, queue=queue) y_pred = model.predict(X_test, queue=queue) assert mean_squared_error(y_test, y_pred) < 2396 - @pytest.mark.parametrize('queue', get_queues()) - @pytest.mark.parametrize('dtype', [np.float32, np.float64]) + @pytest.mark.parametrize("queue", get_queues()) + @pytest.mark.parametrize("dtype", [np.float32, np.float64]) def test_pickle(queue, dtype): X, y = load_diabetes(return_X_y=True) X, y = X.astype(dtype), y.astype(dtype) @@ -51,6 +50,7 @@ def 
test_pickle(queue, dtype): expected = model.predict(X, queue=queue) import pickle + dump = pickle.dumps(model) model2 = pickle.loads(dump) @@ -59,8 +59,8 @@ def test_pickle(queue, dtype): assert_array_equal(expected, result) - @pytest.mark.parametrize('queue', get_queues()) - @pytest.mark.parametrize('dtype', [np.float32, np.float64]) + @pytest.mark.parametrize("queue", get_queues()) + @pytest.mark.parametrize("dtype", [np.float32, np.float64]) def test_full_results(queue, dtype): seed = 42 f_count, r_count = 19, 7 @@ -90,8 +90,8 @@ def test_full_results(queue, dtype): tol = 2e-4 if res.dtype == np.float32 else 1e-7 assert_allclose(gtr, res, rtol=tol) - @pytest.mark.parametrize('queue', get_queues()) - @pytest.mark.parametrize('dtype', [np.float32, np.float64]) + @pytest.mark.parametrize("queue", get_queues()) + @pytest.mark.parametrize("dtype", [np.float32, np.float64]) def test_no_intercept_results(queue, dtype): seed = 42 f_count, r_count = 19, 7 @@ -117,8 +117,8 @@ def test_no_intercept_results(queue, dtype): tol = 5e-5 if res.dtype == np.float32 else 1e-7 assert_allclose(gtr, res, rtol=tol) - @pytest.mark.parametrize('queue', get_queues()) - @pytest.mark.parametrize('dtype', [np.float32, np.float64]) + @pytest.mark.parametrize("queue", get_queues()) + @pytest.mark.parametrize("dtype", [np.float32, np.float64]) def test_reconstruct_model(queue, dtype): seed = 42 s_count = 3500 diff --git a/onedal/neighbors/__init__.py b/onedal/neighbors/__init__.py index c535172bb0..a8aede3c4d 100755 --- a/onedal/neighbors/__init__.py +++ b/onedal/neighbors/__init__.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2022 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,8 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from .neighbors import KNeighborsClassifier, KNeighborsRegressor, NearestNeighbors -__all__ = ['KNeighborsClassifier', 'KNeighborsRegressor', 'NearestNeighbors'] +__all__ = ["KNeighborsClassifier", "KNeighborsRegressor", "NearestNeighbors"] diff --git a/onedal/neighbors/neighbors.py b/onedal/neighbors/neighbors.py index a3f84d21c3..52f73f8fa9 100755 --- a/onedal/neighbors/neighbors.py +++ b/onedal/neighbors/neighbors.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2022 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,35 +12,33 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#===============================================================================
+# ===============================================================================
 
 from abc import ABCMeta
-
 from numbers import Integral
 
 import numpy as np
 
-from ..datatypes import (
-    _check_X_y,
-    _check_array,
-    _column_or_1d,
-    _check_n_features,
-    _check_classification_targets,
-    _convert_to_supported,
-    _num_samples
-)
 from daal4py import (
-    bf_knn_classification_training, bf_knn_classification_prediction,
+    bf_knn_classification_prediction,
+    bf_knn_classification_training,
+    kdtree_knn_classification_prediction,
     kdtree_knn_classification_training,
-    kdtree_knn_classification_prediction
 )
 from onedal import _backend
 
+from ..common._estimator_checks import _check_is_fitted, _is_classifier, _is_regressor
 from ..common._mixin import ClassifierMixin, RegressorMixin
 from ..common._policy import _get_policy
-from ..common._estimator_checks import _check_is_fitted, _is_classifier, _is_regressor
-from ..datatypes._data_conversion import from_table, to_table
+from ..datatypes import _convert_to_supported, from_table, to_table
+from ..utils import (
+    _check_array,
+    _check_classification_targets,
+    _check_n_features,
+    _check_X_y,
+    _column_or_1d,
+    _num_samples,
+)
 
 
 class NeighborsCommonBase(metaclass=ABCMeta):
@@ -50,21 +48,23 @@ def _get_policy(self, queue, *data):
 
     def _parse_auto_method(self, method, n_samples, n_features):
         result_method = method
 
-        if (method in ['auto', 'ball_tree']):
-            condition = self.n_neighbors is not None and \
-                self.n_neighbors >= n_samples // 2
-            if self.metric == 'precomputed' or n_features > 15 or condition:
-                result_method = 'brute'
+        if method in ["auto", "ball_tree"]:
+            condition = (
+                self.n_neighbors is not None and self.n_neighbors >= n_samples // 2
+            )
+            if self.metric == "precomputed" or n_features > 15 or condition:
+                result_method = "brute"
             else:
-                if self.metric == 'euclidean':
-                    result_method = 'kd_tree'
+                if self.metric == "euclidean":
+                    result_method = "kd_tree"
                 else:
-                    result_method = 'brute'
+                    result_method = "brute"
 
         return result_method
 
-    def _validate_data(self, X, y=None, reset=True,
-                       validate_separately=False, **check_params):
+    def _validate_data(
+        self, X, y=None, reset=True, validate_separately=False, **check_params
+    ):
         if y is None:
             if self.requires_y:
                 raise ValueError(
@@ -86,7 +86,7 @@ def _validate_data(self, X, y=None, reset=True,
                 X, y = _check_X_y(X, y, **check_params)
             out = X, y
 
-        if check_params.get('ensure_2d', True):
+        if check_params.get("ensure_2d", True):
             _check_n_features(self, X, reset=reset)
 
         return out
@@ -124,42 +124,48 @@ def _get_weights(self, dist, weights):
 
     def _get_onedal_params(self, X, y=None):
         class_count = 0 if self.classes_ is None else len(self.classes_)
-        weights = getattr(self, 'weights', 'uniform')
+        weights = getattr(self, "weights", "uniform")
        return {
-            'fptype': 'float' if X.dtype == np.float32 else 'double',
-            'vote_weights': 'uniform' if weights == 'uniform' else 'distance',
-            'method': self._fit_method,
-            'radius': self.radius,
-            'class_count': class_count,
-            'neighbor_count': self.n_neighbors,
-            'metric': self.effective_metric_,
-            'p': self.p,
-            'metric_params': self.effective_metric_params_,
-            'result_option': 'indices|distances' if y is None else 'responses',
+            "fptype": "float" if X.dtype == np.float32 else "double",
+            "vote_weights": "uniform" if weights == "uniform" else "distance",
+            "method": self._fit_method,
+            "radius": self.radius,
+            "class_count": class_count,
+            "neighbor_count": self.n_neighbors,
+            "metric": self.effective_metric_,
+            "p": 
self.p, + "metric_params": self.effective_metric_params_, + "result_option": "indices|distances" if y is None else "responses", } def _get_daal_params(self, data): class_count = 0 if self.classes_ is None else len(self.classes_) - weights = getattr(self, 'weights', 'uniform') + weights = getattr(self, "weights", "uniform") params = { - 'fptype': 'float' if data.dtype == np.float32 else 'double', - 'method': 'defaultDense', - 'k': self.n_neighbors, - 'voteWeights': 'voteUniform' if weights == 'uniform' else 'voteDistance', - 'resultsToCompute': 'computeIndicesOfNeighbors|computeDistances', - 'resultsToEvaluate': 'none' - if getattr(self, '_y', None) is None or _is_regressor(self) - else 'computeClassLabels' + "fptype": "float" if data.dtype == np.float32 else "double", + "method": "defaultDense", + "k": self.n_neighbors, + "voteWeights": "voteUniform" if weights == "uniform" else "voteDistance", + "resultsToCompute": "computeIndicesOfNeighbors|computeDistances", + "resultsToEvaluate": "none" + if getattr(self, "_y", None) is None or _is_regressor(self) + else "computeClassLabels", } if class_count != 0: - params['nClasses'] = class_count + params["nClasses"] = class_count return params class NeighborsBase(NeighborsCommonBase, metaclass=ABCMeta): - def __init__(self, n_neighbors=None, radius=None, - algorithm='auto', metric='minkowski', - p=2, metric_params=None): + def __init__( + self, + n_neighbors=None, + radius=None, + algorithm="auto", + metric="minkowski", + p=2, + metric_params=None, + ): self.n_neighbors = n_neighbors self.radius = radius self.algorithm = algorithm @@ -179,19 +185,21 @@ def _validate_n_classes(self): if len(self.classes_) < 2: raise ValueError( "The number of classes has to be greater than one; got %d" - " class" % len(self.classes_)) + " class" % len(self.classes_) + ) def _fit(self, X, y, queue): self._onedal_model = None self._tree = None self._shape = None self.classes_ = None - self.effective_metric_ = getattr(self, 'effective_metric_', self.metric) + self.effective_metric_ = getattr(self, "effective_metric_", self.metric) self.effective_metric_params_ = getattr( - self, 'effective_metric_params_', self.metric_params) + self, "effective_metric_params_", self.metric_params + ) if y is not None or self.requires_y: - shape = getattr(y, 'shape', None) + shape = getattr(y, "shape", None) X, y = super()._validate_data(X, y, dtype=[np.float64, np.float32]) self._shape = shape if shape is not None else y.shape @@ -206,8 +214,7 @@ def _fit(self, X, y, queue): self.classes_ = [] self._y = np.empty(y.shape, dtype=int) for k in range(self._y.shape[1]): - classes, self._y[:, k] = np.unique( - y[:, k], return_inverse=True) + classes, self._y[:, k] = np.unique(y[:, k], return_inverse=True) self.classes_.append(classes) if not self.outputs_2d_: @@ -226,19 +233,16 @@ def _fit(self, X, y, queue): if self.n_neighbors is not None: if self.n_neighbors <= 0: - raise ValueError( - "Expected n_neighbors > 0. Got %d" % - self.n_neighbors - ) + raise ValueError("Expected n_neighbors > 0. 
Got %d" % self.n_neighbors) if not isinstance(self.n_neighbors, Integral): raise TypeError( "n_neighbors does not take %s value, " - "enter integer value" % - type(self.n_neighbors)) + "enter integer value" % type(self.n_neighbors) + ) self._fit_method = super()._parse_auto_method( - self.algorithm, - self.n_samples_fit_, self.n_features_in_) + self.algorithm, self.n_samples_fit_, self.n_features_in_ + ) _fit_y = None gpu_device = queue is not None and queue.sycl_device.is_gpu @@ -255,34 +259,34 @@ def _fit(self, X, y, queue): return result - def _kneighbors(self, X=None, n_neighbors=None, - return_distance=True, queue=None): - n_features = getattr(self, 'n_features_in_', None) - shape = getattr(X, 'shape', None) + def _kneighbors(self, X=None, n_neighbors=None, return_distance=True, queue=None): + n_features = getattr(self, "n_features_in_", None) + shape = getattr(X, "shape", None) if n_features and shape and len(shape) > 1 and shape[1] != n_features: - raise ValueError((f'X has {X.shape[1]} features, ' - f'but kneighbors is expecting ' - f'{n_features} features as input')) + raise ValueError( + ( + f"X has {X.shape[1]} features, " + f"but kneighbors is expecting " + f"{n_features} features as input" + ) + ) _check_is_fitted(self) if n_neighbors is None: n_neighbors = self.n_neighbors elif n_neighbors <= 0: - raise ValueError( - "Expected n_neighbors > 0. Got %d" % - n_neighbors - ) + raise ValueError("Expected n_neighbors > 0. Got %d" % n_neighbors) else: if not isinstance(n_neighbors, Integral): raise TypeError( "n_neighbors does not take %s value, " - "enter integer value" % - type(n_neighbors)) + "enter integer value" % type(n_neighbors) + ) if X is not None: query_is_train = False - X = _check_array(X, accept_sparse='csr', dtype=[np.float64, np.float32]) + X = _check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) else: query_is_train = True X = self._fit_X @@ -295,31 +299,32 @@ def _kneighbors(self, X=None, n_neighbors=None, if n_neighbors > n_samples_fit: raise ValueError( "Expected n_neighbors <= n_samples, " - " but n_samples = %d, n_neighbors = %d" % - (n_samples_fit, n_neighbors) + " but n_samples = %d, n_neighbors = %d" % (n_samples_fit, n_neighbors) ) chunked_results = None method = super()._parse_auto_method( - self._fit_method, self.n_samples_fit_, n_features) + self._fit_method, self.n_samples_fit_, n_features + ) gpu_device = queue is not None and queue.sycl_device.is_gpu - if self.effective_metric_ == 'euclidean' and not gpu_device: + if self.effective_metric_ == "euclidean" and not gpu_device: params = super()._get_daal_params(X) else: params = super()._get_onedal_params(X) prediction_results = self._onedal_predict( - self._onedal_model, X, params, queue=queue) + self._onedal_model, X, params, queue=queue + ) - if self.effective_metric_ == 'euclidean' and not gpu_device: + if self.effective_metric_ == "euclidean" and not gpu_device: distances = prediction_results.distances indices = prediction_results.indices else: distances = from_table(prediction_results.distances) indices = from_table(prediction_results.indices) - if method == 'kd_tree': + if method == "kd_tree": for i in range(distances.shape[0]): seq = distances[i].argsort() indices[i] = indices[i][seq] @@ -361,26 +366,34 @@ def _kneighbors(self, X=None, n_neighbors=None, dup_gr_nbrs = np.all(sample_mask, axis=1) sample_mask[:, 0][dup_gr_nbrs] = False - neigh_ind = np.reshape( - neigh_ind[sample_mask], (n_queries, n_neighbors - 1)) + neigh_ind = np.reshape(neigh_ind[sample_mask], (n_queries, 
n_neighbors - 1)) if return_distance: - neigh_dist = np.reshape( - neigh_dist[sample_mask], (n_queries, n_neighbors - 1)) + neigh_dist = np.reshape(neigh_dist[sample_mask], (n_queries, n_neighbors - 1)) return neigh_dist, neigh_ind return neigh_ind class KNeighborsClassifier(NeighborsBase, ClassifierMixin): - def __init__(self, n_neighbors=5, *, - weights='uniform', algorithm='auto', - p=2, metric='minkowski', metric_params=None, **kwargs): + def __init__( + self, + n_neighbors=5, + *, + weights="uniform", + algorithm="auto", + p=2, + metric="minkowski", + metric_params=None, + **kwargs, + ): super().__init__( n_neighbors=n_neighbors, algorithm=algorithm, - metric=metric, p=p, + metric=metric, + p=p, metric_params=metric_params, - **kwargs) + **kwargs, + ) self.weights = weights def _get_onedal_params(self, X, y=None): @@ -389,15 +402,15 @@ def _get_onedal_params(self, X, y=None): def _get_daal_params(self, data): params = super()._get_daal_params(data) - params['resultsToEvaluate'] = 'computeClassLabels' - params['resultsToCompute'] = '' + params["resultsToEvaluate"] = "computeClassLabels" + params["resultsToCompute"] = "" return params def _onedal_fit(self, X, y, queue): gpu_device = queue is not None and queue.sycl_device.is_gpu - if self.effective_metric_ == 'euclidean' and not gpu_device: + if self.effective_metric_ == "euclidean" and not gpu_device: params = self._get_daal_params(X) - if self._fit_method == 'brute': + if self._fit_method == "brute": train_alg = bf_knn_classification_training else: @@ -408,15 +421,16 @@ def _onedal_fit(self, X, y, queue): policy = self._get_policy(queue, X, y) X, y = _convert_to_supported(policy, X, y) params = self._get_onedal_params(X, y) - train_alg = _backend.neighbors.classification.train(policy, params, - *to_table(X, y)) + train_alg = _backend.neighbors.classification.train( + policy, params, *to_table(X, y) + ) return train_alg.model def _onedal_predict(self, model, X, params, queue): gpu_device = queue is not None and queue.sycl_device.is_gpu - if self.effective_metric_ == 'euclidean' and not gpu_device: - if self._fit_method == 'brute': + if self.effective_metric_ == "euclidean" and not gpu_device: + if self._fit_method == "brute": predict_alg = bf_knn_classification_prediction else: @@ -426,15 +440,16 @@ def _onedal_predict(self, model, X, params, queue): policy = self._get_policy(queue, X) X = _convert_to_supported(policy, X) - if hasattr(self, '_onedal_model'): + if hasattr(self, "_onedal_model"): model = self._onedal_model else: model = self._create_model(_backend.neighbors.classification) - if 'responses' not in params['result_option']: - params['result_option'] += '|responses' - params['fptype'] = 'float' if X.dtype == np.float32 else 'double' + if "responses" not in params["result_option"]: + params["result_option"] += "|responses" + params["fptype"] = "float" if X.dtype == np.float32 else "double" result = _backend.neighbors.classification.infer( - policy, params, model, to_table(X)) + policy, params, model, to_table(X) + ) return result @@ -442,37 +457,40 @@ def fit(self, X, y, queue=None): return super()._fit(X, y, queue=queue) def predict(self, X, queue=None): - X = _check_array(X, accept_sparse='csr', dtype=[np.float64, np.float32]) - onedal_model = getattr(self, '_onedal_model', None) - n_features = getattr(self, 'n_features_in_', None) - n_samples_fit_ = getattr(self, 'n_samples_fit_', None) - shape = getattr(X, 'shape', None) + X = _check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) + onedal_model = 
getattr(self, "_onedal_model", None) + n_features = getattr(self, "n_features_in_", None) + n_samples_fit_ = getattr(self, "n_samples_fit_", None) + shape = getattr(X, "shape", None) if n_features and shape and len(shape) > 1 and shape[1] != n_features: - raise ValueError((f'X has {X.shape[1]} features, ' - f'but KNNClassifier is expecting ' - f'{n_features} features as input')) + raise ValueError( + ( + f"X has {X.shape[1]} features, " + f"but KNNClassifier is expecting " + f"{n_features} features as input" + ) + ) _check_is_fitted(self) self._fit_method = super()._parse_auto_method( - self.algorithm, - n_samples_fit_, n_features) + self.algorithm, n_samples_fit_, n_features + ) self._validate_n_classes() gpu_device = queue is not None and queue.sycl_device.is_gpu - if self.effective_metric_ == 'euclidean' and not gpu_device: + if self.effective_metric_ == "euclidean" and not gpu_device: params = self._get_daal_params(X) else: params = self._get_onedal_params(X) prediction_result = self._onedal_predict(onedal_model, X, params, queue=queue) - if self.effective_metric_ == 'euclidean' and not gpu_device: + if self.effective_metric_ == "euclidean" and not gpu_device: responses = prediction_result.prediction else: responses = from_table(prediction_result.responses) - result = self.classes_.take( - np.asarray(responses.ravel(), dtype=np.intp)) + result = self.classes_.take(np.asarray(responses.ravel(), dtype=np.intp)) return result @@ -513,21 +531,30 @@ def predict_proba(self, X, queue=None): return probabilities - def kneighbors(self, X=None, n_neighbors=None, - return_distance=True, queue=None): + def kneighbors(self, X=None, n_neighbors=None, return_distance=True, queue=None): return super()._kneighbors(X, n_neighbors, return_distance, queue=queue) class KNeighborsRegressor(NeighborsBase, RegressorMixin): - def __init__(self, n_neighbors=5, *, - weights='uniform', algorithm='auto', - p=2, metric='minkowski', metric_params=None, **kwargs): + def __init__( + self, + n_neighbors=5, + *, + weights="uniform", + algorithm="auto", + p=2, + metric="minkowski", + metric_params=None, + **kwargs, + ): super().__init__( n_neighbors=n_neighbors, algorithm=algorithm, - metric=metric, p=p, + metric=metric, + p=p, metric_params=metric_params, - **kwargs) + **kwargs, + ) self.weights = weights def _get_onedal_params(self, X, y=None): @@ -536,15 +563,15 @@ def _get_onedal_params(self, X, y=None): def _get_daal_params(self, data): params = super()._get_daal_params(data) - params['resultsToCompute'] = 'computeIndicesOfNeighbors|computeDistances' - params['resultsToEvaluate'] = 'none' + params["resultsToCompute"] = "computeIndicesOfNeighbors|computeDistances" + params["resultsToEvaluate"] = "none" return params def _onedal_fit(self, X, y, queue): gpu_device = queue is not None and queue.sycl_device.is_gpu - if self.effective_metric_ == 'euclidean' and not gpu_device: + if self.effective_metric_ == "euclidean" and not gpu_device: params = self._get_daal_params(X) - if self._fit_method == 'brute': + if self._fit_method == "brute": train_alg = bf_knn_classification_training else: @@ -564,8 +591,8 @@ def _onedal_fit(self, X, y, queue): def _onedal_predict(self, model, X, params, queue): gpu_device = queue is not None and queue.sycl_device.is_gpu - if self.effective_metric_ == 'euclidean' and not gpu_device: - if self._fit_method == 'brute': + if self.effective_metric_ == "euclidean" and not gpu_device: + if self._fit_method == "brute": predict_alg = bf_knn_classification_prediction else: @@ -575,16 +602,17 @@ def 
_onedal_predict(self, model, X, params, queue): policy = self._get_policy(queue, X) X = _convert_to_supported(policy, X) - backend = _backend.neighbors.regression if gpu_device \ - else _backend.neighbors.search + backend = ( + _backend.neighbors.regression if gpu_device else _backend.neighbors.search + ) - if hasattr(self, '_onedal_model'): + if hasattr(self, "_onedal_model"): model = self._onedal_model else: model = self._create_model(backend) - if 'responses' not in params['result_option'] and gpu_device: - params['result_option'] += '|responses' - params['fptype'] = 'float' if X.dtype == np.float32 else 'double' + if "responses" not in params["result_option"] and gpu_device: + params["result_option"] += "|responses" + params["fptype"] = "float" if X.dtype == np.float32 else "double" result = backend.infer(policy, params, model, to_table(X)) return result @@ -592,26 +620,29 @@ def _onedal_predict(self, model, X, params, queue): def fit(self, X, y, queue=None): return super()._fit(X, y, queue=queue) - def kneighbors(self, X=None, n_neighbors=None, - return_distance=True, queue=None): + def kneighbors(self, X=None, n_neighbors=None, return_distance=True, queue=None): return super()._kneighbors(X, n_neighbors, return_distance, queue=queue) def _predict_gpu(self, X, queue=None): - X = _check_array(X, accept_sparse='csr', dtype=[np.float64, np.float32]) - onedal_model = getattr(self, '_onedal_model', None) - n_features = getattr(self, 'n_features_in_', None) - n_samples_fit_ = getattr(self, 'n_samples_fit_', None) - shape = getattr(X, 'shape', None) + X = _check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) + onedal_model = getattr(self, "_onedal_model", None) + n_features = getattr(self, "n_features_in_", None) + n_samples_fit_ = getattr(self, "n_samples_fit_", None) + shape = getattr(X, "shape", None) if n_features and shape and len(shape) > 1 and shape[1] != n_features: - raise ValueError((f'X has {X.shape[1]} features, ' - f'but KNNClassifier is expecting ' - f'{n_features} features as input')) + raise ValueError( + ( + f"X has {X.shape[1]} features, " + f"but KNNClassifier is expecting " + f"{n_features} features as input" + ) + ) _check_is_fitted(self) self._fit_method = super()._parse_auto_method( - self.algorithm, - n_samples_fit_, n_features) + self.algorithm, n_samples_fit_, n_features + ) params = self._get_onedal_params(X) @@ -647,21 +678,34 @@ def _predict_skl(self, X, queue=None): def predict(self, X, queue=None): gpu_device = queue is not None and queue.sycl_device.is_gpu - is_uniform_weights = getattr(self, 'weights', 'uniform') == 'uniform' - return self._predict_gpu(X, queue=queue) \ - if gpu_device and is_uniform_weights else self._predict_skl(X, queue=queue) + is_uniform_weights = getattr(self, "weights", "uniform") == "uniform" + return ( + self._predict_gpu(X, queue=queue) + if gpu_device and is_uniform_weights + else self._predict_skl(X, queue=queue) + ) class NearestNeighbors(NeighborsBase): - def __init__(self, n_neighbors=5, *, - weights='uniform', algorithm='auto', - p=2, metric='minkowski', metric_params=None, **kwargs): + def __init__( + self, + n_neighbors=5, + *, + weights="uniform", + algorithm="auto", + p=2, + metric="minkowski", + metric_params=None, + **kwargs, + ): super().__init__( n_neighbors=n_neighbors, algorithm=algorithm, - metric=metric, p=p, + metric=metric, + p=p, metric_params=metric_params, - **kwargs) + **kwargs, + ) self.weights = weights def _get_onedal_params(self, X, y=None): @@ -670,16 +714,17 @@ def 
_get_onedal_params(self, X, y=None): def _get_daal_params(self, data): params = super()._get_daal_params(data) - params['resultsToCompute'] = 'computeIndicesOfNeighbors|computeDistances' - params['resultsToEvaluate'] = 'none' if getattr(self, '_y', None) is None \ - else 'computeClassLabels' + params["resultsToCompute"] = "computeIndicesOfNeighbors|computeDistances" + params["resultsToEvaluate"] = ( + "none" if getattr(self, "_y", None) is None else "computeClassLabels" + ) return params def _onedal_fit(self, X, y, queue): gpu_device = queue is not None and queue.sycl_device.is_gpu - if self.effective_metric_ == 'euclidean' and not gpu_device: + if self.effective_metric_ == "euclidean" and not gpu_device: params = self._get_daal_params(X) - if self._fit_method == 'brute': + if self._fit_method == "brute": train_alg = bf_knn_classification_training else: @@ -690,15 +735,14 @@ def _onedal_fit(self, X, y, queue): policy = self._get_policy(queue, X, y) X, y = _convert_to_supported(policy, X, y) params = self._get_onedal_params(X, y) - train_alg = _backend.neighbors.search.train(policy, params, - to_table(X)) + train_alg = _backend.neighbors.search.train(policy, params, to_table(X)) return train_alg.model def _onedal_predict(self, model, X, params, queue): gpu_device = queue is not None and queue.sycl_device.is_gpu - if self.effective_metric_ == 'euclidean' and not gpu_device: - if self._fit_method == 'brute': + if self.effective_metric_ == "euclidean" and not gpu_device: + if self._fit_method == "brute": predict_alg = bf_knn_classification_prediction else: @@ -708,12 +752,12 @@ def _onedal_predict(self, model, X, params, queue): policy = self._get_policy(queue, X) X = _convert_to_supported(policy, X) - if hasattr(self, '_onedal_model'): + if hasattr(self, "_onedal_model"): model = self._onedal_model else: model = self._create_model(_backend.neighbors.search) - params['fptype'] = 'float' if X.dtype == np.float32 else 'double' + params["fptype"] = "float" if X.dtype == np.float32 else "double" result = _backend.neighbors.search.infer(policy, params, model, to_table(X)) return result @@ -721,6 +765,5 @@ def _onedal_predict(self, model, X, params, queue): def fit(self, X, y, queue=None): return super()._fit(X, y, queue=queue) - def kneighbors(self, X=None, n_neighbors=None, - return_distance=True, queue=None): + def kneighbors(self, X=None, n_neighbors=None, return_distance=True, queue=None): return super()._kneighbors(X, n_neighbors, return_distance, queue=queue) diff --git a/onedal/neighbors/tests/test_knn_classification.py b/onedal/neighbors/tests/test_knn_classification.py index c44e658e9d..8941f49965 100755 --- a/onedal/neighbors/tests/test_knn_classification.py +++ b/onedal/neighbors/tests/test_knn_classification.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2022 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,19 +12,18 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== -import pytest import numpy as np +import pytest from numpy.testing import assert_array_equal +from sklearn import datasets from onedal.neighbors import KNeighborsClassifier from onedal.tests.utils._device_selection import get_queues -from sklearn import datasets - -@pytest.mark.parametrize('queue', get_queues()) +@pytest.mark.parametrize("queue", get_queues()) def test_iris(queue): iris = datasets.load_iris() clf = KNeighborsClassifier(2).fit(iris.data, iris.target, queue=queue) @@ -33,13 +32,14 @@ def test_iris(queue): # TODO: investigate failures on GPU -@pytest.mark.parametrize('queue', get_queues('cpu')) +@pytest.mark.parametrize("queue", get_queues("cpu")) def test_pickle(queue): iris = datasets.load_iris() clf = KNeighborsClassifier(2).fit(iris.data, iris.target, queue=queue) expected = clf.predict(iris.data, queue=queue) import pickle + dump = pickle.dumps(clf) clf2 = pickle.loads(dump) diff --git a/onedal/primitives/__init__.py b/onedal/primitives/__init__.py index 3b1a72bffc..a409999030 100644 --- a/onedal/primitives/__init__.py +++ b/onedal/primitives/__init__.py @@ -14,13 +14,14 @@ # limitations under the License. # =============================================================================== -from .kernel_functions import linear_kernel, rbf_kernel, poly_kernel, sigmoid_kernel from .get_tree import get_tree_state_cls, get_tree_state_reg +from .kernel_functions import linear_kernel, poly_kernel, rbf_kernel, sigmoid_kernel __all__ = [ - 'get_tree_state_cls', - 'get_tree_state_reg', - 'linear_kernel', - 'rbf_kernel', - 'poly_kernel', - 'sigmoid_kernel'] + "get_tree_state_cls", + "get_tree_state_reg", + "linear_kernel", + "rbf_kernel", + "poly_kernel", + "sigmoid_kernel", +] diff --git a/onedal/primitives/get_tree.py b/onedal/primitives/get_tree.py index 2ba33e1e61..9afd86624b 100644 --- a/onedal/primitives/get_tree.py +++ b/onedal/primitives/get_tree.py @@ -18,8 +18,7 @@ def get_tree_state_cls(model, iTree, n_classes): - return _backend.get_tree.classification.get_tree_state( - model, iTree, n_classes) + return _backend.get_tree.classification.get_tree_state(model, iTree, n_classes) def get_tree_state_reg(model, iTree): diff --git a/onedal/primitives/kernel_functions.py b/onedal/primitives/kernel_functions.py index 6a3800614a..aaa2eb3380 100644 --- a/onedal/primitives/kernel_functions.py +++ b/onedal/primitives/kernel_functions.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,19 +12,21 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== import numpy as np -from onedal.datatypes import _check_array + from onedal import _backend from ..common._policy import _get_policy -from ..datatypes._data_conversion import from_table, to_table, _convert_to_supported +from ..datatypes import _convert_to_supported, from_table, to_table +from ..utils import _check_array def _check_inputs(X, Y): def check_input(data): return _check_array(data, dtype=[np.float64, np.float32], force_all_finite=False) + X = check_input(X) Y = X if Y is None else check_input(Y) return X, Y @@ -33,7 +35,7 @@ def check_input(data): def _compute_kernel(params, submodule, X, Y, queue): policy = _get_policy(queue, X, Y) X, Y = _convert_to_supported(policy, X, Y) - params['fptype'] = 'float' if X.dtype == np.float32 else 'double' + params["fptype"] = "float" if X.dtype == np.float32 else "double" X, Y = to_table(X, Y) result = submodule.compute(policy, params, X, Y) return from_table(result.values) @@ -57,9 +59,13 @@ def linear_kernel(X, Y=None, scale=1.0, shift=0.0, queue=None): kernel_matrix : ndarray of shape (n_samples_X, n_samples_Y) """ X, Y = _check_inputs(X, Y) - return _compute_kernel({'method': 'dense', - 'scale': scale, 'shift': shift}, - _backend.linear_kernel, X, Y, queue) + return _compute_kernel( + {"method": "dense", "scale": scale, "shift": shift}, + _backend.linear_kernel, + X, + Y, + queue, + ) def rbf_kernel(X, Y=None, gamma=None, queue=None): @@ -85,8 +91,9 @@ def rbf_kernel(X, Y=None, gamma=None, queue=None): gamma = 1.0 / X.shape[1] if gamma is None else gamma sigma = np.sqrt(0.5 / gamma) - return _compute_kernel({'method': 'dense', 'sigma': sigma}, - _backend.rbf_kernel, X, Y, queue) + return _compute_kernel( + {"method": "dense", "sigma": sigma}, _backend.rbf_kernel, X, Y, queue + ) def poly_kernel(X, Y=None, gamma=1.0, coef0=0.0, degree=3, queue=None): @@ -109,9 +116,13 @@ def poly_kernel(X, Y=None, gamma=1.0, coef0=0.0, degree=3, queue=None): """ X, Y = _check_inputs(X, Y) - return _compute_kernel({'method': 'dense', - 'scale': gamma, 'shift': coef0, 'degree': degree}, - _backend.polynomial_kernel, X, Y, queue) + return _compute_kernel( + {"method": "dense", "scale": gamma, "shift": coef0, "degree": degree}, + _backend.polynomial_kernel, + X, + Y, + queue, + ) def sigmoid_kernel(X, Y=None, gamma=1.0, coef0=0.0, queue=None): @@ -133,6 +144,10 @@ def sigmoid_kernel(X, Y=None, gamma=1.0, coef0=0.0, queue=None): """ X, Y = _check_inputs(X, Y) - return _compute_kernel({'method': 'dense', - 'scale': gamma, 'shift': coef0}, - _backend.sigmoid_kernel, X, Y, queue) + return _compute_kernel( + {"method": "dense", "scale": gamma, "shift": coef0}, + _backend.sigmoid_kernel, + X, + Y, + queue, + ) diff --git a/onedal/primitives/tests/test_kernel_functions.py b/onedal/primitives/tests/test_kernel_functions.py index e5682605c8..d8589a8e07 100644 --- a/onedal/primitives/tests/test_kernel_functions.py +++ b/onedal/primitives/tests/test_kernel_functions.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,21 +12,22 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== -import pytest import numpy as np +import pytest from numpy.testing import assert_allclose -from onedal.primitives import (linear_kernel, rbf_kernel, - poly_kernel, sigmoid_kernel) from sklearn.metrics.pairwise import rbf_kernel as sklearn_rbf_kernel -from onedal.tests.utils._device_selection import (get_queues, - pass_if_not_implemented_for_gpu) +from onedal.primitives import linear_kernel, poly_kernel, rbf_kernel, sigmoid_kernel +from onedal.tests.utils._device_selection import ( + get_queues, + pass_if_not_implemented_for_gpu, +) # TODO: investigate sporadic failures on GPU -@pytest.mark.parametrize('queue', get_queues('cpu')) +@pytest.mark.parametrize("queue", get_queues("cpu")) def test_dense_self_linear_kernel(queue): rng = np.random.RandomState(0) X = np.array(5 * rng.random_sample((10, 4))) @@ -49,15 +50,15 @@ def _test_dense_small_linear_kernel(queue, scale, shift, dtype): # TODO: investigate sporadic failures on GPU -@pytest.mark.parametrize('queue', get_queues('cpu')) -@pytest.mark.parametrize('scale', [1.0, 2.0]) -@pytest.mark.parametrize('shift', [0.0, 1.0]) -@pytest.mark.parametrize('dtype', [np.float32, np.float64]) +@pytest.mark.parametrize("queue", get_queues("cpu")) +@pytest.mark.parametrize("scale", [1.0, 2.0]) +@pytest.mark.parametrize("shift", [0.0, 1.0]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) def test_dense_small_linear_kernel(queue, scale, shift, dtype): _test_dense_small_linear_kernel(queue, scale, shift, dtype) -@pytest.mark.parametrize('queue', get_queues()) +@pytest.mark.parametrize("queue", get_queues()) def test_dense_self_rbf_kernel(queue): rng = np.random.RandomState(0) X = np.array(5 * rng.random_sample((10, 4))) @@ -80,15 +81,15 @@ def _test_dense_small_rbf_kernel(queue, gamma, dtype): assert_allclose(result, expected, rtol=tol) -@pytest.mark.parametrize('gamma', [0.1, None]) -@pytest.mark.parametrize('dtype', [np.float32, np.float64]) -@pytest.mark.parametrize('queue', get_queues()) +@pytest.mark.parametrize("gamma", [0.1, None]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +@pytest.mark.parametrize("queue", get_queues()) def test_dense_small_rbf_kernel(queue, gamma, dtype): _test_dense_small_rbf_kernel(queue, gamma, dtype) @pass_if_not_implemented_for_gpu(reason="poly kernel is not implemented") -@pytest.mark.parametrize('queue', get_queues()) +@pytest.mark.parametrize("queue", get_queues()) def test_dense_self_poly_kernel(queue): rng = np.random.RandomState(0) X = np.array(2 * rng.random_sample((10, 4))) @@ -113,17 +114,17 @@ def _test_dense_small_poly_kernel(queue, gamma, coef0, degree, dtype): @pass_if_not_implemented_for_gpu(reason="poly kernel is not implemented") -@pytest.mark.parametrize('queue', get_queues()) -@pytest.mark.parametrize('gamma', [0.1, 1.0]) -@pytest.mark.parametrize('coef0', [0.0, 1.0]) -@pytest.mark.parametrize('degree', [2, 3]) -@pytest.mark.parametrize('dtype', [np.float32, np.float64]) +@pytest.mark.parametrize("queue", get_queues()) +@pytest.mark.parametrize("gamma", [0.1, 1.0]) +@pytest.mark.parametrize("coef0", [0.0, 1.0]) +@pytest.mark.parametrize("degree", [2, 3]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) def test_dense_small_poly_kernel(queue, gamma, coef0, degree, dtype): 
_test_dense_small_poly_kernel(queue, gamma, coef0, degree, dtype) @pass_if_not_implemented_for_gpu(reason="sigmoid kernel is not implemented") -@pytest.mark.parametrize('queue', get_queues()) +@pytest.mark.parametrize("queue", get_queues()) def test_dense_self_sigmoid_kernel(queue): rng = np.random.RandomState(0) X = np.array(2 * rng.random_sample((15, 4))) @@ -147,9 +148,9 @@ def _test_dense_small_sigmoid_kernel(queue, gamma, coef0, dtype): @pass_if_not_implemented_for_gpu(reason="sigmoid kernel is not implemented") -@pytest.mark.parametrize('queue', get_queues()) -@pytest.mark.parametrize('gamma', [0.1, 1.0, 2.4]) -@pytest.mark.parametrize('coef0', [0.0, 1.0, 5.5]) -@pytest.mark.parametrize('dtype', [np.float32, np.float64]) +@pytest.mark.parametrize("queue", get_queues()) +@pytest.mark.parametrize("gamma", [0.1, 1.0, 2.4]) +@pytest.mark.parametrize("coef0", [0.0, 1.0, 5.5]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) def test_dense_small_sigmoid_kernel(queue, gamma, coef0, dtype): _test_dense_small_sigmoid_kernel(queue, gamma, coef0, dtype) diff --git a/onedal/spmd/__init__.py b/onedal/spmd/__init__.py index 9099df571a..3c698d694b 100644 --- a/onedal/spmd/__init__.py +++ b/onedal/spmd/__init__.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,12 +12,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== __all__ = [ - 'basic_statistics', - 'cluster', - 'decomposition', - 'ensemble', - 'linear_model', - 'neighbors'] + "basic_statistics", + "cluster", + "decomposition", + "ensemble", + "linear_model", + "neighbors", +] diff --git a/onedal/spmd/basic_statistics/__init__.py b/onedal/spmd/basic_statistics/__init__.py index 6f45ecfe5c..2b99fdbdb7 100644 --- a/onedal/spmd/basic_statistics/__init__.py +++ b/onedal/spmd/basic_statistics/__init__.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,8 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== from .basic_statistics import BasicStatistics -__all__ = ['BasicStatistics'] +__all__ = ["BasicStatistics"] diff --git a/onedal/spmd/basic_statistics/basic_statistics.py b/onedal/spmd/basic_statistics/basic_statistics.py index af4a5e2429..86269277d9 100644 --- a/onedal/spmd/basic_statistics/basic_statistics.py +++ b/onedal/spmd/basic_statistics/basic_statistics.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,13 +12,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from abc import ABC -from ...common._spmd_policy import _get_spmd_policy -from ..._device_offload import support_usm_ndarray + from onedal.basic_statistics import BasicStatistics as BasicStatistics_Batch +from ..._device_offload import support_usm_ndarray +from ...common._spmd_policy import _get_spmd_policy + class BaseBasicStatisticsSPMD(ABC): def _get_policy(self, queue, *data): @@ -26,7 +28,6 @@ def _get_policy(self, queue, *data): class BasicStatistics(BaseBasicStatisticsSPMD, BasicStatistics_Batch): - @support_usm_ndarray() def compute(self, data, weights=None, queue=None): return super().compute(data, weights, queue) diff --git a/onedal/spmd/cluster/__init__.py b/onedal/spmd/cluster/__init__.py index b94f1d3918..0c39935dc2 100644 --- a/onedal/spmd/cluster/__init__.py +++ b/onedal/spmd/cluster/__init__.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,13 +12,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from daal4py.sklearn._utils import daal_check_version -if daal_check_version((2023, 'P', 200)): +if daal_check_version((2023, "P", 200)): from .kmeans import KMeans - __all__ = ['KMeans'] + __all__ = ["KMeans"] else: __all__ = [] diff --git a/onedal/spmd/cluster/kmeans.py b/onedal/spmd/cluster/kmeans.py index 000d265af1..abab681554 100644 --- a/onedal/spmd/cluster/kmeans.py +++ b/onedal/spmd/cluster/kmeans.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,13 +12,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== from abc import ABC -from ...common._spmd_policy import _get_spmd_policy -from ..._device_offload import support_usm_ndarray + from onedal.cluster import KMeans as KMeans_Batch +from ..._device_offload import support_usm_ndarray +from ...common._spmd_policy import _get_spmd_policy + class BaseKMeansSPMD(ABC): def _get_policy(self, queue, *data): diff --git a/onedal/spmd/decomposition/__init__.py b/onedal/spmd/decomposition/__init__.py index eda7b9fc14..618e0b9082 100644 --- a/onedal/spmd/decomposition/__init__.py +++ b/onedal/spmd/decomposition/__init__.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,8 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from .pca import PCA -__all__ = ['PCA'] +__all__ = ["PCA"] diff --git a/onedal/spmd/decomposition/pca.py b/onedal/spmd/decomposition/pca.py index a511170ec2..e150cf8e63 100644 --- a/onedal/spmd/decomposition/pca.py +++ b/onedal/spmd/decomposition/pca.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,13 +12,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== -from ...common._spmd_policy import _get_spmd_policy -from ..._device_offload import support_usm_ndarray from onedal.decomposition.pca import PCA as PCABatch +from ..._device_offload import support_usm_ndarray +from ...common._spmd_policy import _get_spmd_policy + class BasePCASPMD: def _get_policy(self, queue, *data): @@ -26,7 +27,6 @@ def _get_policy(self, queue, *data): class PCA(BasePCASPMD, PCABatch): - @support_usm_ndarray() def fit(self, X, queue): return super().fit(X, queue) diff --git a/onedal/spmd/ensemble/__init__.py b/onedal/spmd/ensemble/__init__.py index 5dcc919355..b53fb8f910 100644 --- a/onedal/spmd/ensemble/__init__.py +++ b/onedal/spmd/ensemble/__init__.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,8 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== from .forest import RandomForestClassifier, RandomForestRegressor -__all__ = ['RandomForestClassifier', 'RandomForestRegressor'] +__all__ = ["RandomForestClassifier", "RandomForestRegressor"] diff --git a/onedal/spmd/ensemble/forest.py b/onedal/spmd/ensemble/forest.py index d2e32be87c..56d18a2a0f 100644 --- a/onedal/spmd/ensemble/forest.py +++ b/onedal/spmd/ensemble/forest.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,15 +12,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from abc import ABC -from ...common._spmd_policy import _get_spmd_policy - from onedal.ensemble import RandomForestClassifier as RandomForestClassifier_Batch from onedal.ensemble import RandomForestRegressor as RandomForestRegressor_Batch +from ...common._spmd_policy import _get_spmd_policy + class BaseForestSPMD(ABC): def _get_policy(self, queue, *data): diff --git a/onedal/spmd/linear_model/__init__.py b/onedal/spmd/linear_model/__init__.py index 33e882bdcb..893243cd56 100644 --- a/onedal/spmd/linear_model/__init__.py +++ b/onedal/spmd/linear_model/__init__.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,8 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from .linear_model import LinearRegression -__all__ = ['LinearRegression'] +__all__ = ["LinearRegression"] diff --git a/onedal/spmd/linear_model/linear_model.py b/onedal/spmd/linear_model/linear_model.py index d07eb7df28..8990a3b1c9 100644 --- a/onedal/spmd/linear_model/linear_model.py +++ b/onedal/spmd/linear_model/linear_model.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,13 +12,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== from abc import ABC -from ...common._spmd_policy import _get_spmd_policy -from ..._device_offload import support_usm_ndarray + from onedal.linear_model import LinearRegression as LinearRegression_Batch +from ..._device_offload import support_usm_ndarray +from ...common._spmd_policy import _get_spmd_policy + class BaseLinearRegressionSPMD(ABC): def _get_policy(self, queue, *data): @@ -26,7 +28,6 @@ def _get_policy(self, queue, *data): class LinearRegression(BaseLinearRegressionSPMD, LinearRegression_Batch): - @support_usm_ndarray() def fit(self, X, y, queue=None): return super().fit(X, y, queue) diff --git a/onedal/spmd/neighbors/__init__.py b/onedal/spmd/neighbors/__init__.py index 99099fa51c..11f104287a 100644 --- a/onedal/spmd/neighbors/__init__.py +++ b/onedal/spmd/neighbors/__init__.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,8 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from .neighbors import KNeighborsClassifier, KNeighborsRegressor, NearestNeighbors -__all__ = ['KNeighborsClassifier', 'KNeighborsRegressor', 'NearestNeighbors'] +__all__ = ["KNeighborsClassifier", "KNeighborsRegressor", "NearestNeighbors"] diff --git a/onedal/spmd/neighbors/neighbors.py b/onedal/spmd/neighbors/neighbors.py index 02981599b9..d3b7cb61c8 100644 --- a/onedal/spmd/neighbors/neighbors.py +++ b/onedal/spmd/neighbors/neighbors.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,14 +12,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== from abc import ABC -from ...common._spmd_policy import _get_spmd_policy -from ..._device_offload import support_usm_ndarray + from onedal.neighbors import KNeighborsClassifier as KNeighborsClassifier_Batch from onedal.neighbors import KNeighborsRegressor as KNeighborsRegressor_Batch +from ..._device_offload import support_usm_ndarray +from ...common._spmd_policy import _get_spmd_policy + class NeighborsCommonBaseSPMD(ABC): def _get_policy(self, queue, *data): @@ -27,7 +29,6 @@ def _get_policy(self, queue, *data): class KNeighborsClassifier(NeighborsCommonBaseSPMD, KNeighborsClassifier_Batch): - @support_usm_ndarray() def fit(self, X, y, queue=None): return super().fit(X, y, queue) @@ -41,8 +42,7 @@ def predict_proba(self, X, queue=None): raise NotImplementedError("predict_proba not supported in distributed mode.") @support_usm_ndarray() - def kneighbors(self, X=None, n_neighbors=None, - return_distance=True, queue=None): + def kneighbors(self, X=None, n_neighbors=None, return_distance=True, queue=None): return super().kneighbors(X, n_neighbors, return_distance, queue) @@ -52,12 +52,13 @@ def fit(self, X, y, queue=None): if queue is not None and queue.sycl_device.is_gpu: return super()._fit(X, y, queue=queue) else: - raise ValueError('SPMD version of kNN is not implemented for ' - 'CPU. Consider running on it on GPU.') + raise ValueError( + "SPMD version of kNN is not implemented for " + "CPU. Consider running on it on GPU." + ) @support_usm_ndarray() - def kneighbors(self, X=None, n_neighbors=None, - return_distance=True, queue=None): + def kneighbors(self, X=None, n_neighbors=None, return_distance=True, queue=None): return super().kneighbors(X, n_neighbors, return_distance, queue) @support_usm_ndarray() @@ -66,18 +67,16 @@ def predict(self, X, queue=None): def _get_onedal_params(self, X, y=None): params = super()._get_onedal_params(X, y) - if 'responses' not in params['result_option']: - params['result_option'] += '|responses' + if "responses" not in params["result_option"]: + params["result_option"] += "|responses" return params class NearestNeighbors(NeighborsCommonBaseSPMD): - @support_usm_ndarray() def fit(self, X, y, queue=None): return super().fit(X, y, queue) @support_usm_ndarray() - def kneighbors(self, X=None, n_neighbors=None, - return_distance=True, queue=None): + def kneighbors(self, X=None, n_neighbors=None, return_distance=True, queue=None): return super().kneighbors(X, n_neighbors, return_distance, queue) diff --git a/onedal/svm/__init__.py b/onedal/svm/__init__.py index c8647cba2a..941048029b 100644 --- a/onedal/svm/__init__.py +++ b/onedal/svm/__init__.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,8 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== from .svm import SVC, SVR, NuSVC, NuSVR, SVMtype -__all__ = ['SVC', 'SVR', 'NuSVC', 'NuSVR', 'SVMtype'] +__all__ = ["SVC", "SVR", "NuSVC", "NuSVR", "SVMtype"] diff --git a/onedal/svm/svm.py b/onedal/svm/svm.py index ebbcbd628c..b851d0178c 100644 --- a/onedal/svm/svm.py +++ b/onedal/svm/svm.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,29 +12,30 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== -from daal4py.sklearn._utils import sklearn_check_version -from sklearn.base import BaseEstimator from abc import ABCMeta, abstractmethod from enum import Enum from numbers import Number, Real import numpy as np from scipy import sparse as sp -from ..datatypes import ( - _validate_targets, - _check_X_y, - _check_array, - _column_or_1d, - _check_n_features -) +from sklearn.base import BaseEstimator +from daal4py.sklearn._utils import sklearn_check_version +from onedal import _backend + +from ..common._estimator_checks import _check_is_fitted from ..common._mixin import ClassifierMixin, RegressorMixin from ..common._policy import _get_policy -from ..common._estimator_checks import _check_is_fitted -from ..datatypes._data_conversion import from_table, to_table -from onedal import _backend +from ..datatypes import from_table, to_table +from ..utils import ( + _check_array, + _check_n_features, + _check_X_y, + _column_or_1d, + _validate_targets, +) class SVMtype(Enum): @@ -46,11 +47,28 @@ class SVMtype(Enum): class BaseSVM(BaseEstimator, metaclass=ABCMeta): @abstractmethod - def __init__(self, C, nu, epsilon, kernel='rbf', *, degree, gamma, - coef0, tol, shrinking, cache_size, max_iter, tau, - class_weight, decision_function_shape, - break_ties, algorithm, svm_type=None, **kwargs): - + def __init__( + self, + C, + nu, + epsilon, + kernel="rbf", + *, + degree, + gamma, + coef0, + tol, + shrinking, + cache_size, + max_iter, + tau, + class_weight, + decision_function_shape, + break_ties, + algorithm, + svm_type=None, + **kwargs, + ): self.C = C self.nu = nu self.epsilon = epsilon @@ -71,14 +89,14 @@ def __init__(self, C, nu, epsilon, kernel='rbf', *, degree, gamma, def _compute_gamma_sigma(self, gamma, X): if isinstance(gamma, str): - if gamma == 'scale': + if gamma == "scale": if sp.isspmatrix(X): # var = E[X^2] - E[X]^2 - X_sc = (X.multiply(X)).mean() - (X.mean())**2 + X_sc = (X.multiply(X)).mean() - (X.mean()) ** 2 else: X_sc = X.var() _gamma = 1.0 / (X.shape[1] * X_sc) if X_sc != 0 else 1.0 - elif gamma == 'auto': + elif gamma == "auto": _gamma = 1.0 / X.shape[1] else: raise ValueError( @@ -86,7 +104,7 @@ def _compute_gamma_sigma(self, gamma, X): "'auto'. 
Got '{}' instead.".format(gamma) ) else: - if sklearn_check_version('1.1') and not sklearn_check_version('1.2'): + if sklearn_check_version("1.1") and not sklearn_check_version("1.2"): if isinstance(gamma, Real): if gamma <= 0: msg = ( @@ -117,17 +135,19 @@ def _get_sample_weight(self, X, y, sample_weight): if n_samples == 1: raise ValueError("n_samples=1") - sample_weight = np.asarray([] - if sample_weight is None - else sample_weight, dtype=np.float64) + sample_weight = np.asarray( + [] if sample_weight is None else sample_weight, dtype=np.float64 + ) sample_weight_count = sample_weight.shape[0] if sample_weight_count != 0 and sample_weight_count != n_samples: - raise ValueError("sample_weight and X have incompatible shapes: " - "%r vs %r\n" - "Note: Sparse matrices cannot be indexed w/" - "boolean masks (use `indices=True` in CV)." - % (len(sample_weight), X.shape)) + raise ValueError( + "sample_weight and X have incompatible shapes: " + "%r vs %r\n" + "Note: Sparse matrices cannot be indexed w/" + "boolean masks (use `indices=True` in CV)." + % (len(sample_weight), X.shape) + ) ww = None if sample_weight_count == 0 and self.class_weight_ is None: @@ -139,40 +159,51 @@ def _get_sample_weight(self, X, y, sample_weight): sample_weight = np.full(n_samples, sample_weight, dtype=dtype) else: sample_weight = _check_array( - sample_weight, accept_sparse=False, ensure_2d=False, - dtype=dtype, order="C" + sample_weight, + accept_sparse=False, + ensure_2d=False, + dtype=dtype, + order="C", ) if sample_weight.ndim != 1: raise ValueError("Sample weights must be 1D array or scalar") if sample_weight.shape != (n_samples,): - raise ValueError("sample_weight.shape == {}, expected {}!" - .format(sample_weight.shape, (n_samples,))) + raise ValueError( + "sample_weight.shape == {}, expected {}!".format( + sample_weight.shape, (n_samples,) + ) + ) if self.svm_type == SVMtype.nu_svc: - weight_per_class = [np.sum(sample_weight[y == class_label]) - for class_label in np.unique(y)] + weight_per_class = [ + np.sum(sample_weight[y == class_label]) for class_label in np.unique(y) + ] for i in range(len(weight_per_class)): for j in range(i + 1, len(weight_per_class)): - if self.nu * (weight_per_class[i] + weight_per_class[j]) / 2 > \ - min(weight_per_class[i], weight_per_class[j]): - raise ValueError('specified nu is infeasible') + if self.nu * (weight_per_class[i] + weight_per_class[j]) / 2 > min( + weight_per_class[i], weight_per_class[j] + ): + raise ValueError("specified nu is infeasible") if np.all(sample_weight <= 0): if self.svm_type == SVMtype.nu_svc: - err_msg = 'negative dimensions are not allowed' + err_msg = "negative dimensions are not allowed" else: - err_msg = 'Invalid input - all samples have zero or negative weights.' + err_msg = "Invalid input - all samples have zero or negative weights." raise ValueError(err_msg) if np.any(sample_weight <= 0): - if self.svm_type == SVMtype.c_svc and \ - len(np.unique(y[sample_weight > 0])) != len(self.classes_): + if self.svm_type == SVMtype.c_svc and len( + np.unique(y[sample_weight > 0]) + ) != len(self.classes_): raise ValueError( - 'Invalid input - all samples with positive weights ' - 'belong to the same class' if sklearn_check_version('1.2') else - 'Invalid input - all samples with positive weights ' - 'have the same label.') + "Invalid input - all samples with positive weights " + "belong to the same class" + if sklearn_check_version("1.2") + else "Invalid input - all samples with positive weights " + "have the same label." 
+ ) ww = sample_weight if self.class_weight_ is not None: for i, v in enumerate(self.class_weight_): @@ -190,39 +221,51 @@ def _get_onedal_params(self, data): self.n_iter_ = 1 if max_iter < 1 else max_iter class_count = 0 if self.classes_ is None else len(self.classes_) return { - 'fptype': 'float' if data.dtype == np.float32 else 'double', - 'method': self.algorithm, - 'kernel': self.kernel, - 'c': self.C, 'nu': self.nu, 'epsilon': self.epsilon, - 'class_count': class_count, 'accuracy_threshold': self.tol, - 'max_iteration_count': int(max_iter), 'scale': self._scale_, - 'sigma': self._sigma_, 'shift': self.coef0, 'degree': self.degree, - 'tau': self.tau, 'shrinking': self.shrinking, 'cache_size': self.cache_size + "fptype": "float" if data.dtype == np.float32 else "double", + "method": self.algorithm, + "kernel": self.kernel, + "c": self.C, + "nu": self.nu, + "epsilon": self.epsilon, + "class_count": class_count, + "accuracy_threshold": self.tol, + "max_iteration_count": int(max_iter), + "scale": self._scale_, + "sigma": self._sigma_, + "shift": self.coef0, + "degree": self.degree, + "tau": self.tau, + "shrinking": self.shrinking, + "cache_size": self.cache_size, } def _fit(self, X, y, sample_weight, module, queue): - if hasattr(self, 'decision_function_shape'): - if self.decision_function_shape not in ('ovr', 'ovo', None): + if hasattr(self, "decision_function_shape"): + if self.decision_function_shape not in ("ovr", "ovo", None): raise ValueError( f"decision_function_shape must be either 'ovr' or 'ovo', " f"got {self.decision_function_shape}." ) if y is None: - if self._get_tags()['requires_y']: + if self._get_tags()["requires_y"]: raise ValueError( f"This {self.__class__.__name__} estimator " f"requires y to be passed, but the target y is None." 
) X, y = _check_X_y( - X, y, dtype=[np.float64, np.float32], - force_all_finite=True, accept_sparse='csr') + X, + y, + dtype=[np.float64, np.float32], + force_all_finite=True, + accept_sparse="csr", + ) y = self._validate_targets(y, X.dtype) sample_weight = self._get_sample_weight(X, y, sample_weight) self._sparse = sp.isspmatrix(X) - if self.kernel == 'linear': + if self.kernel == "linear": self._scale_, self._sigma_ = 1.0, 1.0 self.coef0 = 0.0 else: @@ -240,14 +283,15 @@ def _fit(self, X, y, sample_weight, module, queue): self.support_vectors_ = from_table(result.support_vectors) self.intercept_ = from_table(result.biases).ravel() - self.support_ = from_table(result.support_indices).ravel().astype('int') + self.support_ = from_table(result.support_indices).ravel().astype("int") self.n_features_in_ = X.shape[1] self.shape_fit_ = X.shape - if getattr(self, 'classes_', None) is not None: + if getattr(self, "classes_", None) is not None: indices = y.take(self.support_, axis=0) - self._n_support = np.array([ - np.sum(indices == i) for i, _ in enumerate(self.classes_)]) + self._n_support = np.array( + [np.sum(indices == i) for i, _ in enumerate(self.classes_)] + ) self._gamma = self._scale_ self._onedal_model = result.model @@ -266,22 +310,32 @@ def _create_model(self, module): def _predict(self, X, module, queue): _check_is_fitted(self) - if self.break_ties and self.decision_function_shape == 'ovo': - raise ValueError("break_ties must be False when " - "decision_function_shape is 'ovo'") + if self.break_ties and self.decision_function_shape == "ovo": + raise ValueError( + "break_ties must be False when " "decision_function_shape is 'ovo'" + ) - if (module in [_backend.svm.classification, _backend.svm.nu_classification]): + if module in [_backend.svm.classification, _backend.svm.nu_classification]: sv = self.support_vectors_ if not self._sparse and sv.size > 0 and self._n_support.sum() != sv.shape[0]: - raise ValueError("The internal representation " - f"of {self.__class__.__name__} was altered") + raise ValueError( + "The internal representation " + f"of {self.__class__.__name__} was altered" + ) - if self.break_ties and self.decision_function_shape == 'ovr' and \ - len(self.classes_) > 2: + if ( + self.break_ties + and self.decision_function_shape == "ovr" + and len(self.classes_) > 2 + ): y = np.argmax(self.decision_function(X), axis=1) else: - X = _check_array(X, dtype=[np.float64, np.float32], - force_all_finite=True, accept_sparse='csr') + X = _check_array( + X, + dtype=[np.float64, np.float32], + force_all_finite=True, + accept_sparse="csr", + ) _check_n_features(self, X, False) if self._sparse and not sp.isspmatrix(X): @@ -292,12 +346,13 @@ def _predict(self, X, module, queue): if sp.issparse(X) and not self._sparse and not callable(self.kernel): raise ValueError( "cannot use sparse input in %r trained on dense data" - % type(self).__name__) + % type(self).__name__ + ) policy = _get_policy(queue, X) params = self._get_onedal_params(X) - if hasattr(self, '_onedal_model'): + if hasattr(self, "_onedal_model"): model = self._onedal_model else: model = self._create_model(module) @@ -319,14 +374,16 @@ def _ovr_decision_function(self, predictions, confidences, n_classes): votes[predictions[:, k] == 1, j] += 1 k += 1 - transformed_confidences = \ - sum_of_confidences / (3 * (np.abs(sum_of_confidences) + 1)) + transformed_confidences = sum_of_confidences / ( + 3 * (np.abs(sum_of_confidences) + 1) + ) return votes + transformed_confidences def _decision_function(self, X, module, queue): 
_check_is_fitted(self) - X = _check_array(X, dtype=[np.float64, np.float32], - force_all_finite=False, accept_sparse='csr') + X = _check_array( + X, dtype=[np.float64, np.float32], force_all_finite=False, accept_sparse="csr" + ) _check_n_features(self, X, False) if self._sparse and not sp.isspmatrix(X): @@ -337,18 +394,21 @@ def _decision_function(self, X, module, queue): if sp.issparse(X) and not self._sparse and not callable(self.kernel): raise ValueError( "cannot use sparse input in %r trained on dense data" - % type(self).__name__) + % type(self).__name__ + ) - if (module in [_backend.svm.classification, _backend.svm.nu_classification]): + if module in [_backend.svm.classification, _backend.svm.nu_classification]: sv = self.support_vectors_ if not self._sparse and sv.size > 0 and self._n_support.sum() != sv.shape[0]: - raise ValueError("The internal representation " - f"of {self.__class__.__name__} was altered") + raise ValueError( + "The internal representation " + f"of {self.__class__.__name__} was altered" + ) policy = _get_policy(queue, X) params = self._get_onedal_params(X) - if hasattr(self, '_onedal_model'): + if hasattr(self, "_onedal_model"): model = self._onedal_model else: model = self._create_model(module) @@ -358,9 +418,10 @@ def _decision_function(self, X, module, queue): if len(self.classes_) == 2: decision_function = decision_function.ravel() - if self.decision_function_shape == 'ovr' and len(self.classes_) > 2: + if self.decision_function_shape == "ovr" and len(self.classes_) > 2: decision_function = self._ovr_decision_function( - decision_function < 0, -decision_function, len(self.classes_)) + decision_function < 0, -decision_function, len(self.classes_) + ) return decision_function @@ -369,17 +430,41 @@ class SVR(RegressorMixin, BaseSVM): Epsilon--Support Vector Regression. """ - def __init__(self, C=1.0, epsilon=0.1, kernel='rbf', *, degree=3, - gamma='scale', coef0=0.0, tol=1e-3, shrinking=True, - cache_size=200.0, max_iter=-1, tau=1e-12, - algorithm='thunder', **kwargs): - super().__init__(C=C, nu=0.5, epsilon=epsilon, kernel=kernel, - degree=degree, gamma=gamma, - coef0=coef0, tol=tol, - shrinking=shrinking, cache_size=cache_size, - max_iter=max_iter, tau=tau, class_weight=None, - decision_function_shape=None, - break_ties=False, algorithm=algorithm) + def __init__( + self, + C=1.0, + epsilon=0.1, + kernel="rbf", + *, + degree=3, + gamma="scale", + coef0=0.0, + tol=1e-3, + shrinking=True, + cache_size=200.0, + max_iter=-1, + tau=1e-12, + algorithm="thunder", + **kwargs, + ): + super().__init__( + C=C, + nu=0.5, + epsilon=epsilon, + kernel=kernel, + degree=degree, + gamma=gamma, + coef0=coef0, + tol=tol, + shrinking=shrinking, + cache_size=cache_size, + max_iter=max_iter, + tau=tau, + class_weight=None, + decision_function_shape=None, + break_ties=False, + algorithm=algorithm, + ) self.svm_type = SVMtype.epsilon_svr def fit(self, X, y, sample_weight=None, queue=None): @@ -395,22 +480,49 @@ class SVC(ClassifierMixin, BaseSVM): C-Support Vector Classification. 
""" - def __init__(self, C=1.0, kernel='rbf', *, degree=3, gamma='scale', - coef0=0.0, tol=1e-3, shrinking=True, cache_size=200.0, - max_iter=-1, tau=1e-12, class_weight=None, - decision_function_shape='ovr', break_ties=False, - algorithm='thunder', **kwargs): - super().__init__(C=C, nu=0.5, epsilon=0.0, kernel=kernel, degree=degree, - gamma=gamma, coef0=coef0, tol=tol, - shrinking=shrinking, cache_size=cache_size, - max_iter=max_iter, tau=tau, class_weight=class_weight, - decision_function_shape=decision_function_shape, - break_ties=break_ties, algorithm=algorithm) + def __init__( + self, + C=1.0, + kernel="rbf", + *, + degree=3, + gamma="scale", + coef0=0.0, + tol=1e-3, + shrinking=True, + cache_size=200.0, + max_iter=-1, + tau=1e-12, + class_weight=None, + decision_function_shape="ovr", + break_ties=False, + algorithm="thunder", + **kwargs, + ): + super().__init__( + C=C, + nu=0.5, + epsilon=0.0, + kernel=kernel, + degree=degree, + gamma=gamma, + coef0=coef0, + tol=tol, + shrinking=shrinking, + cache_size=cache_size, + max_iter=max_iter, + tau=tau, + class_weight=class_weight, + decision_function_shape=decision_function_shape, + break_ties=break_ties, + algorithm=algorithm, + ) self.svm_type = SVMtype.c_svc def _validate_targets(self, y, dtype): y, self.class_weight_, self.classes_ = _validate_targets( - y, self.class_weight, dtype) + y, self.class_weight, dtype + ) return y def fit(self, X, y, sample_weight=None, queue=None): @@ -431,17 +543,41 @@ class NuSVR(RegressorMixin, BaseSVM): Nu-Support Vector Regression. """ - def __init__(self, nu=0.5, C=1.0, kernel='rbf', *, degree=3, - gamma='scale', coef0=0.0, tol=1e-3, shrinking=True, - cache_size=200.0, max_iter=-1, tau=1e-12, - algorithm='thunder', **kwargs): - super().__init__(C=C, nu=nu, epsilon=0.0, kernel=kernel, - degree=degree, gamma=gamma, - coef0=coef0, tol=tol, - shrinking=shrinking, cache_size=cache_size, - max_iter=max_iter, tau=tau, class_weight=None, - decision_function_shape=None, - break_ties=False, algorithm=algorithm) + def __init__( + self, + nu=0.5, + C=1.0, + kernel="rbf", + *, + degree=3, + gamma="scale", + coef0=0.0, + tol=1e-3, + shrinking=True, + cache_size=200.0, + max_iter=-1, + tau=1e-12, + algorithm="thunder", + **kwargs, + ): + super().__init__( + C=C, + nu=nu, + epsilon=0.0, + kernel=kernel, + degree=degree, + gamma=gamma, + coef0=coef0, + tol=tol, + shrinking=shrinking, + cache_size=cache_size, + max_iter=max_iter, + tau=tau, + class_weight=None, + decision_function_shape=None, + break_ties=False, + algorithm=algorithm, + ) self.svm_type = SVMtype.nu_svr def fit(self, X, y, sample_weight=None, queue=None): @@ -457,22 +593,49 @@ class NuSVC(ClassifierMixin, BaseSVM): Nu-Support Vector Classification. 
""" - def __init__(self, nu=0.5, kernel='rbf', *, degree=3, gamma='scale', - coef0=0.0, tol=1e-3, shrinking=True, cache_size=200.0, - max_iter=-1, tau=1e-12, class_weight=None, - decision_function_shape='ovr', break_ties=False, - algorithm='thunder', **kwargs): - super().__init__(C=1.0, nu=nu, epsilon=0.0, kernel=kernel, degree=degree, - gamma=gamma, coef0=coef0, tol=tol, - shrinking=shrinking, cache_size=cache_size, - max_iter=max_iter, tau=tau, class_weight=class_weight, - decision_function_shape=decision_function_shape, - break_ties=break_ties, algorithm=algorithm) + def __init__( + self, + nu=0.5, + kernel="rbf", + *, + degree=3, + gamma="scale", + coef0=0.0, + tol=1e-3, + shrinking=True, + cache_size=200.0, + max_iter=-1, + tau=1e-12, + class_weight=None, + decision_function_shape="ovr", + break_ties=False, + algorithm="thunder", + **kwargs, + ): + super().__init__( + C=1.0, + nu=nu, + epsilon=0.0, + kernel=kernel, + degree=degree, + gamma=gamma, + coef0=coef0, + tol=tol, + shrinking=shrinking, + cache_size=cache_size, + max_iter=max_iter, + tau=tau, + class_weight=class_weight, + decision_function_shape=decision_function_shape, + break_ties=break_ties, + algorithm=algorithm, + ) self.svm_type = SVMtype.nu_svc def _validate_targets(self, y, dtype): y, self.class_weight_, self.classes_ = _validate_targets( - y, self.class_weight, dtype) + y, self.class_weight, dtype + ) return y def fit(self, X, y, sample_weight=None, queue=None): diff --git a/onedal/svm/tests/test_csr_svm.py b/onedal/svm/tests/test_csr_svm.py index a623e9c2ea..a1f445868e 100644 --- a/onedal/svm/tests/test_csr_svm.py +++ b/onedal/svm/tests/test_csr_svm.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,26 +12,25 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== -import pytest import numpy as np -from scipy import sparse as sp - -from numpy.testing import assert_array_equal, assert_array_almost_equal - -from onedal.svm import SVC, SVR - -from sklearn.utils.estimator_checks import check_estimator +import pytest import sklearn.utils.estimator_checks +from numpy.testing import assert_array_almost_equal, assert_array_equal +from scipy import sparse as sp from sklearn import datasets, metrics +from sklearn.base import clone as clone_estimator +from sklearn.datasets import make_blobs, make_classification from sklearn.metrics.pairwise import rbf_kernel -from sklearn.datasets import make_classification, make_blobs from sklearn.model_selection import train_test_split -from sklearn.base import clone as clone_estimator +from sklearn.utils.estimator_checks import check_estimator -from onedal.tests.utils._device_selection import (get_queues, - pass_if_not_implemented_for_gpu) +from onedal.svm import SVC, SVR +from onedal.tests.utils._device_selection import ( + get_queues, + pass_if_not_implemented_for_gpu, +) def is_classifier(estimator): @@ -49,18 +48,24 @@ def check_svm_model_equal(queue, svm, X_train, y_train, X_test, decimal=6): sparse_svm.fit(X_train, y_train, queue=queue) assert sp.issparse(sparse_svm.support_vectors_) assert sp.issparse(sparse_svm.dual_coef_) - assert_array_almost_equal(dense_svm.support_vectors_, - sparse_svm.support_vectors_.toarray(), decimal) - assert_array_almost_equal(dense_svm.dual_coef_, - sparse_svm.dual_coef_.toarray(), decimal) + assert_array_almost_equal( + dense_svm.support_vectors_, sparse_svm.support_vectors_.toarray(), decimal + ) + assert_array_almost_equal( + dense_svm.dual_coef_, sparse_svm.dual_coef_.toarray(), decimal + ) assert_array_almost_equal(dense_svm.support_, sparse_svm.support_) - assert_array_almost_equal(dense_svm.predict(X_test_dense, queue=queue), - sparse_svm.predict(X_test, queue=queue)) + assert_array_almost_equal( + dense_svm.predict(X_test_dense, queue=queue), + sparse_svm.predict(X_test, queue=queue), + ) if is_classifier(svm): - assert_array_almost_equal(dense_svm.decision_function(X_test_dense, queue=queue), - sparse_svm.decision_function(X_test, queue=queue), - decimal) + assert_array_almost_equal( + dense_svm.decision_function(X_test_dense, queue=queue), + sparse_svm.decision_function(X_test, queue=queue), + decimal, + ) def _test_simple_dataset(queue, kernel): @@ -77,12 +82,20 @@ def _test_simple_dataset(queue, kernel): @pass_if_not_implemented_for_gpu(reason="csr svm is not implemented") -@pytest.mark.parametrize('queue', get_queues('cpu') + [ - pytest.param(get_queues('gpu'), - marks=pytest.mark.xfail( - reason="raises UnknownError instead of RuntimeError " - "with unimplemented message"))]) -@pytest.mark.parametrize('kernel', ['linear', 'rbf']) +@pytest.mark.parametrize( + "queue", + get_queues("cpu") + + [ + pytest.param( + get_queues("gpu"), + marks=pytest.mark.xfail( + reason="raises UnknownError instead of RuntimeError " + "with unimplemented message" + ), + ) + ], +) +@pytest.mark.parametrize("kernel", ["linear", "rbf"]) def test_simple_dataset(queue, kernel): _test_simple_dataset(queue, kernel) @@ -97,13 +110,21 @@ def _test_binary_dataset(queue, kernel): @pass_if_not_implemented_for_gpu(reason="csr svm is not implemented") -@pytest.mark.parametrize('queue', get_queues('cpu') + [ - pytest.param(get_queues('gpu'), - 
marks=pytest.mark.xfail( - reason="raises UnknownError for linear and rbf, " - "Unimplemented error with inconsistent error message " - "for poly and sigmoid"))]) -@pytest.mark.parametrize('kernel', ['linear', 'rbf', 'poly', 'sigmoid']) +@pytest.mark.parametrize( + "queue", + get_queues("cpu") + + [ + pytest.param( + get_queues("gpu"), + marks=pytest.mark.xfail( + reason="raises UnknownError for linear and rbf, " + "Unimplemented error with inconsistent error message " + "for poly and sigmoid" + ), + ) + ], +) +@pytest.mark.parametrize("kernel", ["linear", "rbf", "poly", "sigmoid"]) def test_binary_dataset(queue, kernel): _test_binary_dataset(queue, kernel) @@ -123,8 +144,8 @@ def _test_iris(queue, kernel): @pass_if_not_implemented_for_gpu(reason="csr svm is not implemented") -@pytest.mark.parametrize('queue', get_queues()) -@pytest.mark.parametrize('kernel', ['linear', 'rbf', 'poly', 'sigmoid']) +@pytest.mark.parametrize("queue", get_queues()) +@pytest.mark.parametrize("kernel", ["linear", "rbf", "poly", "sigmoid"]) def test_iris(queue, kernel): _test_iris(queue, kernel) @@ -140,35 +161,191 @@ def _test_diabetes(queue, kernel): @pass_if_not_implemented_for_gpu(reason="csr svm is not implemented") -@pytest.mark.parametrize('queue', get_queues()) -@pytest.mark.parametrize('kernel', ['linear', 'rbf', 'poly', 'sigmoid']) +@pytest.mark.parametrize("queue", get_queues()) +@pytest.mark.parametrize("kernel", ["linear", "rbf", "poly", "sigmoid"]) def test_diabetes(queue, kernel): _test_diabetes(queue, kernel) @pass_if_not_implemented_for_gpu(reason="csr svm is not implemented") @pytest.mark.xfail(reason="Failed test. Need investigate") -@pytest.mark.parametrize('queue', get_queues()) +@pytest.mark.parametrize("queue", get_queues()) def test_sparse_realdata(queue): data = np.array([0.03771744, 0.1003567, 0.01174647, 0.027069]) indices = np.array([6, 5, 35, 31]) indptr = np.array( - [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4]) + [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 4, + 4, + 4, + ] + ) X = sp.csr_matrix((data, indices, indptr)) y = np.array( - [1., 0., 2., 2., 1., 1., 1., 2., 2., 0., 1., 2., 2., - 0., 2., 0., 3., 0., 3., 0., 1., 1., 3., 2., 3., 2., - 0., 3., 1., 0., 2., 1., 2., 0., 1., 0., 2., 3., 1., - 3., 0., 1., 0., 0., 2., 0., 1., 2., 2., 2., 3., 2., - 0., 3., 2., 1., 2., 3., 2., 2., 0., 1., 0., 1., 2., - 3., 0., 0., 2., 2., 1., 3., 1., 1., 0., 1., 2., 1., - 1., 3.]) - - clf = SVC(kernel='linear').fit(X.toarray(), y, queue=queue) - sp_clf = SVC(kernel='linear').fit(X, y, queue=queue) + [ + 1.0, + 0.0, + 2.0, + 2.0, + 1.0, + 1.0, + 1.0, + 2.0, + 2.0, + 0.0, + 1.0, + 2.0, + 2.0, + 0.0, + 2.0, + 0.0, + 3.0, + 0.0, + 3.0, + 0.0, + 1.0, + 1.0, + 3.0, + 2.0, + 3.0, + 2.0, + 0.0, + 3.0, + 1.0, + 0.0, + 2.0, + 1.0, + 2.0, + 0.0, + 1.0, + 0.0, + 2.0, + 3.0, + 1.0, + 3.0, + 0.0, + 1.0, + 0.0, + 0.0, + 2.0, + 0.0, + 1.0, + 2.0, + 2.0, + 2.0, + 3.0, + 2.0, + 0.0, + 3.0, + 2.0, + 1.0, + 2.0, + 3.0, + 2.0, + 2.0, + 0.0, + 1.0, + 0.0, + 1.0, + 2.0, + 3.0, + 
0.0, + 0.0, + 2.0, + 2.0, + 1.0, + 3.0, + 1.0, + 1.0, + 0.0, + 1.0, + 2.0, + 1.0, + 1.0, + 3.0, + ] + ) + + clf = SVC(kernel="linear").fit(X.toarray(), y, queue=queue) + sp_clf = SVC(kernel="linear").fit(X, y, queue=queue) assert_array_equal(clf.support_vectors_, sp_clf.support_vectors_.toarray()) assert_array_equal(clf.dual_coef_, sp_clf.dual_coef_.toarray()) diff --git a/onedal/svm/tests/test_nusvc.py b/onedal/svm/tests/test_nusvc.py index 68fc0c0390..4fa1d83ddf 100644 --- a/onedal/svm/tests/test_nusvc.py +++ b/onedal/svm/tests/test_nusvc.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,48 +12,48 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== -import pytest import numpy as np -from numpy.testing import assert_array_equal, assert_array_almost_equal - -from onedal.svm import NuSVC -from sklearn.svm import NuSVC as SklearnNuSVC - +import pytest +from numpy.testing import assert_array_almost_equal, assert_array_equal from sklearn import datasets from sklearn.datasets import make_blobs from sklearn.metrics.pairwise import rbf_kernel from sklearn.model_selection import train_test_split +from sklearn.svm import NuSVC as SklearnNuSVC -from onedal.tests.utils._device_selection import (get_queues, - pass_if_not_implemented_for_gpu) +from onedal.svm import NuSVC +from onedal.tests.utils._device_selection import ( + get_queues, + pass_if_not_implemented_for_gpu, +) def _test_libsvm_parameters(queue, array_constr, dtype): - X = array_constr([[-2, -1], [-1, -1], [-1, -2], - [1, 1], [1, 2], [2, 1]], dtype=dtype) + X = array_constr([[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]], dtype=dtype) y = array_constr([1, 1, 1, 2, 2, 2], dtype=dtype) - clf = NuSVC(kernel='linear').fit(X, y, queue=queue) + clf = NuSVC(kernel="linear").fit(X, y, queue=queue) assert_array_almost_equal( - clf.dual_coef_, [[-0.04761905, -0.0952381, 0.0952381, 0.04761905]]) + clf.dual_coef_, [[-0.04761905, -0.0952381, 0.0952381, 0.04761905]] + ) assert_array_equal(clf.support_, [0, 1, 3, 4]) assert_array_equal(clf.support_vectors_, X[clf.support_]) - assert_array_equal(clf.intercept_, [0.]) + assert_array_equal(clf.intercept_, [0.0]) assert_array_equal(clf.predict(X, queue=queue), y) @pass_if_not_implemented_for_gpu(reason="nusvc is not implemented") -@pytest.mark.parametrize('queue', get_queues()) -@pytest.mark.parametrize('array_constr', [np.array]) -@pytest.mark.parametrize('dtype', [np.float32, np.float64]) +@pytest.mark.parametrize("queue", get_queues()) +@pytest.mark.parametrize("array_constr", [np.array]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) def test_libsvm_parameters(queue, array_constr, dtype): _test_libsvm_parameters(queue, array_constr, dtype) @pass_if_not_implemented_for_gpu(reason="nusvc is not implemented") -@pytest.mark.parametrize('queue', get_queues()) +@pytest.mark.parametrize("queue", get_queues()) def test_class_weight(queue): X = np.array([[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]]) y = np.array([1, 1, 1, 2, 2, 2]) @@ -64,23 +64,23 @@ def 
test_class_weight(queue): @pass_if_not_implemented_for_gpu(reason="nusvc is not implemented") -@pytest.mark.parametrize('queue', get_queues()) +@pytest.mark.parametrize("queue", get_queues()) def test_sample_weight(queue): X = np.array([[-2, 0], [-1, -1], [0, -2], [0, 2], [1, 1], [2, 2]]) y = np.array([1, 1, 1, 2, 2, 2]) - clf = NuSVC(kernel='linear') + clf = NuSVC(kernel="linear") clf.fit(X, y, sample_weight=[1] * 6, queue=queue) assert_array_almost_equal(clf.intercept_, [0.0]) @pass_if_not_implemented_for_gpu(reason="nusvc is not implemented") -@pytest.mark.parametrize('queue', get_queues()) +@pytest.mark.parametrize("queue", get_queues()) def test_decision_function(queue): X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]] Y = [1, 1, 1, 2, 2, 2] - clf = NuSVC(kernel='rbf', gamma=1, decision_function_shape='ovo') + clf = NuSVC(kernel="rbf", gamma=1, decision_function_shape="ovo") clf.fit(X, Y, queue=queue) rbfs = rbf_kernel(X, clf.support_vectors_, gamma=clf.gamma) @@ -89,23 +89,24 @@ def test_decision_function(queue): @pass_if_not_implemented_for_gpu(reason="nusvc is not implemented") -@pytest.mark.parametrize('queue', get_queues()) +@pytest.mark.parametrize("queue", get_queues()) def test_iris(queue): iris = datasets.load_iris() - clf = NuSVC(kernel='linear').fit(iris.data, iris.target, queue=queue) + clf = NuSVC(kernel="linear").fit(iris.data, iris.target, queue=queue) assert clf.score(iris.data, iris.target, queue=queue) > 0.9 assert_array_equal(clf.classes_, np.sort(clf.classes_)) @pass_if_not_implemented_for_gpu(reason="nusvc is not implemented") -@pytest.mark.parametrize('queue', get_queues()) +@pytest.mark.parametrize("queue", get_queues()) def test_decision_function_shape(queue): X, y = make_blobs(n_samples=80, centers=5, random_state=0) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) # check shape of ovo_decition_function=True - clf = NuSVC(kernel='linear', - decision_function_shape='ovo').fit(X_train, y_train, queue=queue) + clf = NuSVC(kernel="linear", decision_function_shape="ovo").fit( + X_train, y_train, queue=queue + ) dec = clf.decision_function(X_train, queue=queue) assert dec.shape == (len(X_train), 10) @@ -114,13 +115,14 @@ def test_decision_function_shape(queue): @pass_if_not_implemented_for_gpu(reason="nusvc is not implemented") -@pytest.mark.parametrize('queue', get_queues()) +@pytest.mark.parametrize("queue", get_queues()) def test_pickle(queue): iris = datasets.load_iris() - clf = NuSVC(kernel='linear').fit(iris.data, iris.target, queue=queue) + clf = NuSVC(kernel="linear").fit(iris.data, iris.target, queue=queue) expected = clf.decision_function(iris.data, queue=queue) import pickle + dump = pickle.dumps(clf) clf2 = pickle.loads(dump) @@ -132,11 +134,11 @@ def test_pickle(queue): def _test_cancer_rbf_compare_with_sklearn(queue, nu, gamma): cancer = datasets.load_breast_cancer() - clf = NuSVC(kernel='rbf', gamma=gamma, nu=nu) + clf = NuSVC(kernel="rbf", gamma=gamma, nu=nu) clf.fit(cancer.data, cancer.target, queue=queue) result = clf.score(cancer.data, cancer.target, queue=queue) - clf = SklearnNuSVC(kernel='rbf', gamma=gamma, nu=nu) + clf = SklearnNuSVC(kernel="rbf", gamma=gamma, nu=nu) clf.fit(cancer.data, cancer.target) expected = clf.score(cancer.data, cancer.target) @@ -145,9 +147,9 @@ def _test_cancer_rbf_compare_with_sklearn(queue, nu, gamma): @pass_if_not_implemented_for_gpu(reason="nusvc is not implemented") -@pytest.mark.parametrize('queue', get_queues()) -@pytest.mark.parametrize('gamma', ['scale', 'auto']) 
-@pytest.mark.parametrize('nu', [0.25, 0.5]) +@pytest.mark.parametrize("queue", get_queues()) +@pytest.mark.parametrize("gamma", ["scale", "auto"]) +@pytest.mark.parametrize("nu", [0.25, 0.5]) def test_cancer_rbf_compare_with_sklearn(queue, nu, gamma): _test_cancer_rbf_compare_with_sklearn(queue, nu, gamma) @@ -155,11 +157,11 @@ def test_cancer_rbf_compare_with_sklearn(queue, nu, gamma): def _test_cancer_linear_compare_with_sklearn(queue, nu): cancer = datasets.load_breast_cancer() - clf = NuSVC(kernel='linear', nu=nu) + clf = NuSVC(kernel="linear", nu=nu) clf.fit(cancer.data, cancer.target, queue=queue) result = clf.score(cancer.data, cancer.target, queue=queue) - clf = SklearnNuSVC(kernel='linear', nu=nu) + clf = SklearnNuSVC(kernel="linear", nu=nu) clf.fit(cancer.data, cancer.target) expected = clf.score(cancer.data, cancer.target) @@ -168,8 +170,8 @@ def _test_cancer_linear_compare_with_sklearn(queue, nu): @pass_if_not_implemented_for_gpu(reason="nusvc is not implemented") -@pytest.mark.parametrize('queue', get_queues()) -@pytest.mark.parametrize('nu', [0.25, 0.5]) +@pytest.mark.parametrize("queue", get_queues()) +@pytest.mark.parametrize("nu", [0.25, 0.5]) def test_cancer_linear_compare_with_sklearn(queue, nu): _test_cancer_linear_compare_with_sklearn(queue, nu) @@ -177,11 +179,11 @@ def test_cancer_linear_compare_with_sklearn(queue, nu): def _test_cancer_poly_compare_with_sklearn(queue, params): cancer = datasets.load_breast_cancer() - clf = NuSVC(kernel='poly', **params) + clf = NuSVC(kernel="poly", **params) clf.fit(cancer.data, cancer.target, queue=queue) result = clf.score(cancer.data, cancer.target, queue=queue) - clf = SklearnNuSVC(kernel='poly', **params) + clf = SklearnNuSVC(kernel="poly", **params) clf.fit(cancer.data, cancer.target) expected = clf.score(cancer.data, cancer.target) @@ -190,10 +192,13 @@ def _test_cancer_poly_compare_with_sklearn(queue, params): @pass_if_not_implemented_for_gpu(reason="nusvc is not implemented") -@pytest.mark.parametrize('queue', get_queues()) -@pytest.mark.parametrize('params', [ - {'degree': 2, 'coef0': 0.1, 'gamma': 'scale', 'nu': .25}, - {'degree': 3, 'coef0': 0.0, 'gamma': 'scale', 'nu': .5} -]) +@pytest.mark.parametrize("queue", get_queues()) +@pytest.mark.parametrize( + "params", + [ + {"degree": 2, "coef0": 0.1, "gamma": "scale", "nu": 0.25}, + {"degree": 3, "coef0": 0.0, "gamma": "scale", "nu": 0.5}, + ], +) def test_cancer_poly_compare_with_sklearn(queue, params): _test_cancer_poly_compare_with_sklearn(queue, params) diff --git a/onedal/svm/tests/test_nusvr.py b/onedal/svm/tests/test_nusvr.py index 8dbe608934..fd85317687 100644 --- a/onedal/svm/tests/test_nusvr.py +++ b/onedal/svm/tests/test_nusvr.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,58 +12,54 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== -import pytest import numpy as np -from numpy.testing import assert_array_equal, assert_allclose, assert_array_almost_equal +import pytest +from numpy.testing import assert_allclose, assert_array_almost_equal, assert_array_equal from sklearn import datasets from sklearn.metrics.pairwise import rbf_kernel - -from onedal.svm import NuSVR from sklearn.svm import NuSVR as SklearnNuSVR -from onedal.tests.utils._device_selection import (get_queues, - pass_if_not_implemented_for_gpu) - +from onedal.svm import NuSVR +from onedal.tests.utils._device_selection import ( + get_queues, + pass_if_not_implemented_for_gpu, +) -synth_params = { - 'n_samples': 500, - 'n_features': 100, - 'random_state': 42 -} +synth_params = {"n_samples": 500, "n_features": 100, "random_state": 42} @pass_if_not_implemented_for_gpu(reason="nusvr is not implemented") -@pytest.mark.parametrize('queue', get_queues()) +@pytest.mark.parametrize("queue", get_queues()) def test_diabetes_simple(queue): diabetes = datasets.load_diabetes() - clf = NuSVR(kernel='linear', C=10.) + clf = NuSVR(kernel="linear", C=10.0) clf.fit(diabetes.data, diabetes.target, queue=queue) assert clf.score(diabetes.data, diabetes.target, queue=queue) > 0.02 @pass_if_not_implemented_for_gpu(reason="nusvr is not implemented") -@pytest.mark.parametrize('queue', get_queues()) +@pytest.mark.parametrize("queue", get_queues()) def test_input_format_for_diabetes(queue): diabetes = datasets.load_diabetes() - c_contiguous_numpy = np.asanyarray(diabetes.data, dtype='float', order='C') + c_contiguous_numpy = np.asanyarray(diabetes.data, dtype="float", order="C") assert c_contiguous_numpy.flags.c_contiguous assert not c_contiguous_numpy.flags.f_contiguous assert not c_contiguous_numpy.flags.fnc - clf = NuSVR(kernel='linear', C=10.) + clf = NuSVR(kernel="linear", C=10.0) clf.fit(c_contiguous_numpy, diabetes.target, queue=queue) dual_c_contiguous_numpy = clf.dual_coef_ res_c_contiguous_numpy = clf.predict(c_contiguous_numpy, queue=queue) - f_contiguous_numpy = np.asanyarray(diabetes.data, dtype='float', order='F') + f_contiguous_numpy = np.asanyarray(diabetes.data, dtype="float", order="F") assert not f_contiguous_numpy.flags.c_contiguous assert f_contiguous_numpy.flags.f_contiguous assert f_contiguous_numpy.flags.fnc - clf = NuSVR(kernel='linear', C=10.) 
+ clf = NuSVR(kernel="linear", C=10.0) clf.fit(f_contiguous_numpy, diabetes.target, queue=queue) dual_f_contiguous_numpy = clf.dual_coef_ res_f_contiguous_numpy = clf.predict(f_contiguous_numpy, queue=queue) @@ -72,19 +68,19 @@ def test_input_format_for_diabetes(queue): @pass_if_not_implemented_for_gpu(reason="nusvr is not implemented") -@pytest.mark.parametrize('queue', get_queues()) +@pytest.mark.parametrize("queue", get_queues()) def test_predict(queue): iris = datasets.load_iris() X = iris.data y = iris.target - reg = NuSVR(kernel='linear', C=0.1).fit(X, y, queue=queue) + reg = NuSVR(kernel="linear", C=0.1).fit(X, y, queue=queue) linear = np.dot(X, reg.support_vectors_.T) dec = np.dot(linear, reg.dual_coef_.T) + reg.intercept_ assert_array_almost_equal(dec.ravel(), reg.predict(X, queue=queue).ravel()) - reg = NuSVR(kernel='rbf', gamma=1).fit(X, y, queue=queue) + reg = NuSVR(kernel="rbf", gamma=1).fit(X, y, queue=queue) rbfs = rbf_kernel(X, reg.support_vectors_, gamma=reg.gamma) dec = np.dot(rbfs, reg.dual_coef_.T) + reg.intercept_ @@ -93,24 +89,25 @@ def test_predict(queue): def _test_diabetes_compare_with_sklearn(queue, kernel): diabetes = datasets.load_diabetes() - clf_onedal = NuSVR(kernel=kernel, nu=.25, C=10.) + clf_onedal = NuSVR(kernel=kernel, nu=0.25, C=10.0) clf_onedal.fit(diabetes.data, diabetes.target, queue=queue) result = clf_onedal.score(diabetes.data, diabetes.target, queue=queue) - clf_sklearn = SklearnNuSVR(kernel=kernel, nu=.25, C=10.) + clf_sklearn = SklearnNuSVR(kernel=kernel, nu=0.25, C=10.0) clf_sklearn.fit(diabetes.data, diabetes.target) expected = clf_sklearn.score(diabetes.data, diabetes.target) assert result > expected - 1e-5 assert_allclose(clf_sklearn.intercept_, clf_onedal.intercept_, atol=1e-3) - assert_allclose(clf_sklearn.support_vectors_.shape, - clf_sklearn.support_vectors_.shape) + assert_allclose( + clf_sklearn.support_vectors_.shape, clf_sklearn.support_vectors_.shape + ) assert_allclose(clf_sklearn.dual_coef_, clf_onedal.dual_coef_, atol=1e-2) @pass_if_not_implemented_for_gpu(reason="nusvr is not implemented") -@pytest.mark.parametrize('queue', get_queues()) -@pytest.mark.parametrize('kernel', ['linear', 'rbf', 'poly', 'sigmoid']) +@pytest.mark.parametrize("queue", get_queues()) +@pytest.mark.parametrize("kernel", ["linear", "rbf", "poly", "sigmoid"]) def test_diabetes_compare_with_sklearn(queue, kernel): _test_diabetes_compare_with_sklearn(queue, kernel) @@ -118,11 +115,11 @@ def test_diabetes_compare_with_sklearn(queue, kernel): def _test_synth_rbf_compare_with_sklearn(queue, C, nu, gamma): x, y = datasets.make_regression(**synth_params) - clf = NuSVR(kernel='rbf', gamma=gamma, C=C, nu=nu) + clf = NuSVR(kernel="rbf", gamma=gamma, C=C, nu=nu) clf.fit(x, y, queue=queue) result = clf.score(x, y, queue=queue) - clf = SklearnNuSVR(kernel='rbf', gamma=gamma, C=C, nu=nu) + clf = SklearnNuSVR(kernel="rbf", gamma=gamma, C=C, nu=nu) clf.fit(x, y) expected = clf.score(x, y) @@ -131,10 +128,10 @@ def _test_synth_rbf_compare_with_sklearn(queue, C, nu, gamma): @pass_if_not_implemented_for_gpu(reason="nusvr is not implemented") -@pytest.mark.parametrize('queue', get_queues()) -@pytest.mark.parametrize('gamma', ['scale', 'auto']) -@pytest.mark.parametrize('C', [100.0, 1000.0]) -@pytest.mark.parametrize('nu', [0.25, 0.75]) +@pytest.mark.parametrize("queue", get_queues()) +@pytest.mark.parametrize("gamma", ["scale", "auto"]) +@pytest.mark.parametrize("C", [100.0, 1000.0]) +@pytest.mark.parametrize("nu", [0.25, 0.75]) def test_synth_rbf_compare_with_sklearn(queue, 
C, nu, gamma): _test_synth_rbf_compare_with_sklearn(queue, C, nu, gamma) @@ -142,11 +139,11 @@ def test_synth_rbf_compare_with_sklearn(queue, C, nu, gamma): def _test_synth_linear_compare_with_sklearn(queue, C, nu): x, y = datasets.make_regression(**synth_params) - clf = NuSVR(kernel='linear', C=C, nu=nu) + clf = NuSVR(kernel="linear", C=C, nu=nu) clf.fit(x, y, queue=queue) result = clf.score(x, y, queue=queue) - clf = SklearnNuSVR(kernel='linear', C=C, nu=nu) + clf = SklearnNuSVR(kernel="linear", C=C, nu=nu) clf.fit(x, y) expected = clf.score(x, y) @@ -157,9 +154,9 @@ def _test_synth_linear_compare_with_sklearn(queue, C, nu): @pass_if_not_implemented_for_gpu(reason="nusvr is not implemented") -@pytest.mark.parametrize('queue', get_queues()) -@pytest.mark.parametrize('C', [0.001, 0.1]) -@pytest.mark.parametrize('nu', [0.25, 0.75]) +@pytest.mark.parametrize("queue", get_queues()) +@pytest.mark.parametrize("C", [0.001, 0.1]) +@pytest.mark.parametrize("nu", [0.25, 0.75]) def test_synth_linear_compare_with_sklearn(queue, C, nu): _test_synth_linear_compare_with_sklearn(queue, C, nu) @@ -167,11 +164,11 @@ def test_synth_linear_compare_with_sklearn(queue, C, nu): def _test_synth_poly_compare_with_sklearn(queue, params): x, y = datasets.make_regression(**synth_params) - clf = NuSVR(kernel='poly', **params) + clf = NuSVR(kernel="poly", **params) clf.fit(x, y, queue=queue) result = clf.score(x, y, queue=queue) - clf = SklearnNuSVR(kernel='poly', **params) + clf = SklearnNuSVR(kernel="poly", **params) clf.fit(x, y) expected = clf.score(x, y) @@ -180,25 +177,29 @@ def _test_synth_poly_compare_with_sklearn(queue, params): @pass_if_not_implemented_for_gpu(reason="nusvr is not implemented") -@pytest.mark.parametrize('queue', get_queues()) -@pytest.mark.parametrize('params', [ - {'degree': 2, 'coef0': 0.1, 'gamma': 'scale', 'C': 100, 'nu': .25}, - {'degree': 3, 'coef0': 0.0, 'gamma': 'scale', 'C': 1000, 'nu': .75} -]) +@pytest.mark.parametrize("queue", get_queues()) +@pytest.mark.parametrize( + "params", + [ + {"degree": 2, "coef0": 0.1, "gamma": "scale", "C": 100, "nu": 0.25}, + {"degree": 3, "coef0": 0.0, "gamma": "scale", "C": 1000, "nu": 0.75}, + ], +) def test_synth_poly_compare_with_sklearn(queue, params): _test_synth_poly_compare_with_sklearn(queue, params) @pass_if_not_implemented_for_gpu(reason="nusvr is not implemented") -@pytest.mark.parametrize('queue', get_queues()) +@pytest.mark.parametrize("queue", get_queues()) def test_pickle(queue): diabetes = datasets.load_diabetes() - clf = NuSVR(kernel='rbf', C=10.) + clf = NuSVR(kernel="rbf", C=10.0) clf.fit(diabetes.data, diabetes.target, queue=queue) expected = clf.predict(diabetes.data, queue=queue) import pickle + dump = pickle.dumps(clf) clf2 = pickle.loads(dump) diff --git a/onedal/svm/tests/test_svc.py b/onedal/svm/tests/test_svc.py index a6599a97f0..284a6b20f3 100644 --- a/onedal/svm/tests/test_svc.py +++ b/onedal/svm/tests/test_svc.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,23 +12,23 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== -import pytest import numpy as np -from numpy.testing import assert_array_equal, assert_array_almost_equal - -from onedal.svm import SVC - -from sklearn.utils.estimator_checks import check_estimator +import pytest import sklearn.utils.estimator_checks +from numpy.testing import assert_array_almost_equal, assert_array_equal from sklearn import datasets -from sklearn.metrics.pairwise import rbf_kernel from sklearn.datasets import make_blobs +from sklearn.metrics.pairwise import rbf_kernel from sklearn.model_selection import train_test_split +from sklearn.utils.estimator_checks import check_estimator -from onedal.tests.utils._device_selection import (get_queues, - pass_if_not_implemented_for_gpu) +from onedal.svm import SVC +from onedal.tests.utils._device_selection import ( + get_queues, + pass_if_not_implemented_for_gpu, +) def _replace_and_save(md, fns, replacing_fn): @@ -53,42 +53,52 @@ def dummy(*args, **kwargs): pass md = sklearn.utils.estimator_checks - saved = _replace_and_save(md, [ - 'check_sample_weights_invariance', # Max absolute difference: 0.0008 - 'check_estimators_fit_returns_self', # ValueError: empty metadata - 'check_classifiers_train', # assert y_pred.shape == (n_samples,) - 'check_estimators_unfitted', # Call 'fit' with appropriate arguments - ], dummy) + saved = _replace_and_save( + md, + [ + "check_sample_weights_invariance", # Max absolute difference: 0.0008 + "check_estimators_fit_returns_self", # ValueError: empty metadata + "check_classifiers_train", # assert y_pred.shape == (n_samples,) + "check_estimators_unfitted", # Call 'fit' with appropriate arguments + ], + dummy, + ) check_estimator(SVC()) _restore_from_saved(md, saved) def _test_libsvm_parameters(queue, array_constr, dtype): - X = array_constr([[-2, -1], [-1, -1], [-1, -2], - [1, 1], [1, 2], [2, 1]], dtype=dtype) + X = array_constr([[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]], dtype=dtype) y = array_constr([1, 1, 1, 2, 2, 2], dtype=dtype) - clf = SVC(kernel='linear').fit(X, y, queue=queue) - assert_array_equal(clf.dual_coef_, [[-0.25, .25]]) + clf = SVC(kernel="linear").fit(X, y, queue=queue) + assert_array_equal(clf.dual_coef_, [[-0.25, 0.25]]) assert_array_equal(clf.support_, [1, 3]) assert_array_equal(clf.support_vectors_, (X[1], X[3])) - assert_array_equal(clf.intercept_, [0.]) + assert_array_equal(clf.intercept_, [0.0]) assert_array_equal(clf.predict(X), y) # TODO: investigate sporadic failures on GPU -@pytest.mark.parametrize('queue', get_queues('cpu')) -@pytest.mark.parametrize('array_constr', [np.array]) -@pytest.mark.parametrize('dtype', [np.float32, np.float64]) +@pytest.mark.parametrize("queue", get_queues("cpu")) +@pytest.mark.parametrize("array_constr", [np.array]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) def test_libsvm_parameters(queue, array_constr, dtype): _test_libsvm_parameters(queue, array_constr, dtype) -@pytest.mark.parametrize('queue', get_queues('cpu') + [ - pytest.param(get_queues('gpu'), - marks=pytest.mark.xfail( - reason="class weights are not implemented " - "but the error is not raised"))]) +@pytest.mark.parametrize( + "queue", + get_queues("cpu") + + [ + pytest.param( + get_queues("gpu"), + marks=pytest.mark.xfail( + reason="class weights are not implemented " "but the error is not raised" + ), + ) + ], +) def test_class_weight(queue): X = np.array([[-2, -1], [-1, -1], [-1, -2], [1, 
1], [1, 2], [2, 1]]) y = np.array([1, 1, 1, 2, 2, 2]) @@ -99,22 +109,22 @@ def test_class_weight(queue): # TODO: investigate sporadic failures on GPU -@pytest.mark.parametrize('queue', get_queues('cpu')) +@pytest.mark.parametrize("queue", get_queues("cpu")) def test_sample_weight(queue): X = np.array([[-2, 0], [-1, -1], [0, -2], [0, 2], [1, 1], [2, 2]]) y = np.array([1, 1, 1, 2, 2, 2]) - clf = SVC(kernel='linear') + clf = SVC(kernel="linear") clf.fit(X, y, sample_weight=[1] * 6, queue=queue) assert_array_almost_equal(clf.intercept_, [0.0]) -@pytest.mark.parametrize('queue', get_queues()) +@pytest.mark.parametrize("queue", get_queues()) def test_decision_function(queue): X = np.array([[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]], dtype=np.float32) Y = np.array([1, 1, 1, 2, 2, 2], dtype=np.float32) - clf = SVC(kernel='rbf', gamma=1, decision_function_shape='ovo') + clf = SVC(kernel="rbf", gamma=1, decision_function_shape="ovo") clf.fit(X, Y, queue=queue) rbfs = rbf_kernel(X, clf.support_vectors_, gamma=clf.gamma) @@ -123,38 +133,40 @@ def test_decision_function(queue): @pass_if_not_implemented_for_gpu(reason="multiclass svm is not implemented") -@pytest.mark.parametrize('queue', get_queues()) +@pytest.mark.parametrize("queue", get_queues()) def test_iris(queue): iris = datasets.load_iris() - clf = SVC(kernel='linear').fit(iris.data, iris.target, queue=queue) + clf = SVC(kernel="linear").fit(iris.data, iris.target, queue=queue) assert clf.score(iris.data, iris.target, queue=queue) > 0.9 assert_array_equal(clf.classes_, np.sort(clf.classes_)) @pass_if_not_implemented_for_gpu(reason="multiclass svm is not implemented") -@pytest.mark.parametrize('queue', get_queues()) +@pytest.mark.parametrize("queue", get_queues()) def test_decision_function_shape(queue): X, y = make_blobs(n_samples=80, centers=5, random_state=0) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) # check shape of ovo_decition_function=True - clf = SVC(kernel='linear', - decision_function_shape='ovo').fit(X_train, y_train, queue=queue) + clf = SVC(kernel="linear", decision_function_shape="ovo").fit( + X_train, y_train, queue=queue + ) dec = clf.decision_function(X_train, queue=queue) assert dec.shape == (len(X_train), 10) with pytest.raises(ValueError, match="must be either 'ovr' or 'ovo'"): - SVC(decision_function_shape='bad').fit(X_train, y_train, queue=queue) + SVC(decision_function_shape="bad").fit(X_train, y_train, queue=queue) @pass_if_not_implemented_for_gpu(reason="multiclass svm is not implemented") -@pytest.mark.parametrize('queue', get_queues()) +@pytest.mark.parametrize("queue", get_queues()) def test_pickle(queue): iris = datasets.load_iris() - clf = SVC(kernel='linear').fit(iris.data, iris.target, queue=queue) + clf = SVC(kernel="linear").fit(iris.data, iris.target, queue=queue) expected = clf.decision_function(iris.data, queue=queue) import pickle + dump = pickle.dumps(clf) clf2 = pickle.loads(dump) @@ -164,18 +176,26 @@ def test_pickle(queue): @pass_if_not_implemented_for_gpu(reason="sigmoid kernel is not implemented") -@pytest.mark.parametrize('queue', get_queues('cpu') + [ - pytest.param(get_queues('gpu'), - marks=pytest.mark.xfail(reason="raises Unimplemented error " - "with inconsistent error message"))]) -@pytest.mark.parametrize('dtype', [np.float32, np.float64]) +@pytest.mark.parametrize( + "queue", + get_queues("cpu") + + [ + pytest.param( + get_queues("gpu"), + marks=pytest.mark.xfail( + reason="raises Unimplemented error " "with inconsistent error message" + ), + ) + ], 
+) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) def test_svc_sigmoid(queue, dtype): - X_train = np.array([[-1, 2], [0, 0], [2, -1], - [+1, +1], [+1, +2], [+2, +1]], dtype=dtype) - X_test = np.array([[0, 2], [0.5, 0.5], - [0.3, 0.1], [2, 0], [-1, -1]], dtype=dtype) + X_train = np.array( + [[-1, 2], [0, 0], [2, -1], [+1, +1], [+1, +2], [+2, +1]], dtype=dtype + ) + X_test = np.array([[0, 2], [0.5, 0.5], [0.3, 0.1], [2, 0], [-1, -1]], dtype=dtype) y_train = np.array([1, 1, 1, 2, 2, 2], dtype=dtype) - svc = SVC(kernel='sigmoid').fit(X_train, y_train, queue=queue) + svc = SVC(kernel="sigmoid").fit(X_train, y_train, queue=queue) assert_array_equal(svc.dual_coef_, [[-1, -1, -1, 1, 1, 1]]) assert_array_equal(svc.support_, [0, 1, 2, 3, 4, 5]) diff --git a/onedal/svm/tests/test_svr.py b/onedal/svm/tests/test_svr.py index f03d6813b7..6ad7822d4a 100644 --- a/onedal/svm/tests/test_svr.py +++ b/onedal/svm/tests/test_svr.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,29 +12,24 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== -import pytest import numpy as np -from numpy.testing import assert_array_almost_equal, assert_allclose, assert_array_equal +import pytest +import sklearn.utils.estimator_checks +from numpy.testing import assert_allclose, assert_array_almost_equal, assert_array_equal from sklearn import datasets from sklearn.metrics.pairwise import rbf_kernel - -from onedal.svm import SVR from sklearn.svm import SVR as SklearnSVR - from sklearn.utils.estimator_checks import check_estimator -import sklearn.utils.estimator_checks - -from onedal.tests.utils._device_selection import (get_queues, - pass_if_not_implemented_for_gpu) +from onedal.svm import SVR +from onedal.tests.utils._device_selection import ( + get_queues, + pass_if_not_implemented_for_gpu, +) -synth_params = { - 'n_samples': 500, - 'n_features': 100, - 'random_state': 42 -} +synth_params = {"n_samples": 500, "n_features": 100, "random_state": 42} def _replace_and_save(md, fns, replacing_fn): @@ -59,25 +54,29 @@ def dummy(*args, **kwargs): pass md = sklearn.utils.estimator_checks - saved = _replace_and_save(md, [ - 'check_sample_weights_invariance', # Max absolute difference: 0.0002 - 'check_estimators_fit_returns_self', # ??? - 'check_regressors_train', # Cannot get data type from empty metadata - 'check_estimators_unfitted', # expected NotFittedError from sklearn - ], dummy) + saved = _replace_and_save( + md, + [ + "check_sample_weights_invariance", # Max absolute difference: 0.0002 + "check_estimators_fit_returns_self", # ??? 
+ "check_regressors_train", # Cannot get data type from empty metadata + "check_estimators_unfitted", # expected NotFittedError from sklearn + ], + dummy, + ) check_estimator(SVR()) _restore_from_saved(md, saved) @pass_if_not_implemented_for_gpu(reason="svr is not implemented") -@pytest.mark.parametrize('queue', get_queues()) +@pytest.mark.parametrize("queue", get_queues()) def test_run_to_run_fit(queue): diabetes = datasets.load_diabetes() - clf_first = SVR(kernel='linear', C=10.) + clf_first = SVR(kernel="linear", C=10.0) clf_first.fit(diabetes.data, diabetes.target, queue=queue) for _ in range(10): - clf = SVR(kernel='linear', C=10.) + clf = SVR(kernel="linear", C=10.0) clf.fit(diabetes.data, diabetes.target, queue=queue) assert_allclose(clf_first.intercept_, clf.intercept_) assert_allclose(clf_first.support_vectors_, clf.support_vectors_) @@ -85,35 +84,35 @@ def test_run_to_run_fit(queue): @pass_if_not_implemented_for_gpu(reason="svr is not implemented") -@pytest.mark.parametrize('queue', get_queues()) +@pytest.mark.parametrize("queue", get_queues()) def test_diabetes_simple(queue): diabetes = datasets.load_diabetes() - clf = SVR(kernel='linear', C=10.) + clf = SVR(kernel="linear", C=10.0) clf.fit(diabetes.data, diabetes.target, queue=queue) assert clf.score(diabetes.data, diabetes.target, queue=queue) > 0.02 @pass_if_not_implemented_for_gpu(reason="svr is not implemented") -@pytest.mark.parametrize('queue', get_queues()) +@pytest.mark.parametrize("queue", get_queues()) def test_input_format_for_diabetes(queue): diabetes = datasets.load_diabetes() - c_contiguous_numpy = np.asanyarray(diabetes.data, dtype='float', order='C') + c_contiguous_numpy = np.asanyarray(diabetes.data, dtype="float", order="C") assert c_contiguous_numpy.flags.c_contiguous assert not c_contiguous_numpy.flags.f_contiguous assert not c_contiguous_numpy.flags.fnc - clf = SVR(kernel='linear', C=10.) + clf = SVR(kernel="linear", C=10.0) clf.fit(c_contiguous_numpy, diabetes.target, queue=queue) dual_c_contiguous_numpy = clf.dual_coef_ res_c_contiguous_numpy = clf.predict(c_contiguous_numpy, queue=queue) - f_contiguous_numpy = np.asanyarray(diabetes.data, dtype='float', order='F') + f_contiguous_numpy = np.asanyarray(diabetes.data, dtype="float", order="F") assert not f_contiguous_numpy.flags.c_contiguous assert f_contiguous_numpy.flags.f_contiguous assert f_contiguous_numpy.flags.fnc - clf = SVR(kernel='linear', C=10.) 
+ clf = SVR(kernel="linear", C=10.0) clf.fit(f_contiguous_numpy, diabetes.target, queue=queue) dual_f_contiguous_numpy = clf.dual_coef_ res_f_contiguous_numpy = clf.predict(f_contiguous_numpy, queue=queue) @@ -122,19 +121,19 @@ def test_input_format_for_diabetes(queue): @pass_if_not_implemented_for_gpu(reason="svr is not implemented") -@pytest.mark.parametrize('queue', get_queues()) +@pytest.mark.parametrize("queue", get_queues()) def test_predict(queue): iris = datasets.load_iris() X = iris.data y = iris.target - reg = SVR(kernel='linear', C=0.1).fit(X, y, queue=queue) + reg = SVR(kernel="linear", C=0.1).fit(X, y, queue=queue) linear = np.dot(X, reg.support_vectors_.T) dec = np.dot(linear, reg.dual_coef_.T) + reg.intercept_ assert_array_almost_equal(dec.ravel(), reg.predict(X, queue=queue).ravel()) - reg = SVR(kernel='rbf', gamma=1).fit(X, y, queue=queue) + reg = SVR(kernel="rbf", gamma=1).fit(X, y, queue=queue) rbfs = rbf_kernel(X, reg.support_vectors_, gamma=reg.gamma) dec = np.dot(rbfs, reg.dual_coef_.T) + reg.intercept_ @@ -143,35 +142,36 @@ def test_predict(queue): def _test_diabetes_compare_with_sklearn(queue, kernel): diabetes = datasets.load_diabetes() - clf_onedal = SVR(kernel=kernel, C=10., gamma=2) + clf_onedal = SVR(kernel=kernel, C=10.0, gamma=2) clf_onedal.fit(diabetes.data, diabetes.target, queue=queue) result = clf_onedal.score(diabetes.data, diabetes.target, queue=queue) - clf_sklearn = SklearnSVR(kernel=kernel, C=10., gamma=2) + clf_sklearn = SklearnSVR(kernel=kernel, C=10.0, gamma=2) clf_sklearn.fit(diabetes.data, diabetes.target) expected = clf_sklearn.score(diabetes.data, diabetes.target) assert result > expected - 1e-5 assert_allclose(clf_sklearn.intercept_, clf_onedal.intercept_, atol=1e-3) - assert_allclose(clf_sklearn.support_vectors_.shape, - clf_sklearn.support_vectors_.shape) + assert_allclose( + clf_sklearn.support_vectors_.shape, clf_sklearn.support_vectors_.shape + ) assert_allclose(clf_sklearn.dual_coef_, clf_onedal.dual_coef_, atol=1e-1) @pass_if_not_implemented_for_gpu(reason="svr is not implemented") -@pytest.mark.parametrize('queue', get_queues()) -@pytest.mark.parametrize('kernel', ['linear', 'rbf', 'poly', 'sigmoid']) +@pytest.mark.parametrize("queue", get_queues()) +@pytest.mark.parametrize("kernel", ["linear", "rbf", "poly", "sigmoid"]) def test_diabetes_compare_with_sklearn(queue, kernel): _test_diabetes_compare_with_sklearn(queue, kernel) def _test_synth_rbf_compare_with_sklearn(queue, C, gamma): x, y = datasets.make_regression(**synth_params) - clf = SVR(kernel='rbf', gamma=gamma, C=C) + clf = SVR(kernel="rbf", gamma=gamma, C=C) clf.fit(x, y, queue=queue) result = clf.score(x, y, queue=queue) - clf = SklearnSVR(kernel='rbf', gamma=gamma, C=C) + clf = SklearnSVR(kernel="rbf", gamma=gamma, C=C) clf.fit(x, y) expected = clf.score(x, y) @@ -180,20 +180,20 @@ def _test_synth_rbf_compare_with_sklearn(queue, C, gamma): @pass_if_not_implemented_for_gpu(reason="svr is not implemented") -@pytest.mark.parametrize('queue', get_queues()) -@pytest.mark.parametrize('gamma', ['scale', 'auto']) -@pytest.mark.parametrize('C', [100.0, 1000.0]) +@pytest.mark.parametrize("queue", get_queues()) +@pytest.mark.parametrize("gamma", ["scale", "auto"]) +@pytest.mark.parametrize("C", [100.0, 1000.0]) def test_synth_rbf_compare_with_sklearn(queue, C, gamma): _test_synth_rbf_compare_with_sklearn(queue, C, gamma) def _test_synth_linear_compare_with_sklearn(queue, C): x, y = datasets.make_regression(**synth_params) - clf = SVR(kernel='linear', C=C) + clf = SVR(kernel="linear", 
C=C) clf.fit(x, y, queue=queue) result = clf.score(x, y, queue=queue) - clf = SklearnSVR(kernel='linear', C=C) + clf = SklearnSVR(kernel="linear", C=C) clf.fit(x, y) expected = clf.score(x, y) @@ -204,19 +204,19 @@ def _test_synth_linear_compare_with_sklearn(queue, C): @pass_if_not_implemented_for_gpu(reason="svr is not implemented") -@pytest.mark.parametrize('queue', get_queues()) -@pytest.mark.parametrize('C', [0.001, 0.1]) +@pytest.mark.parametrize("queue", get_queues()) +@pytest.mark.parametrize("C", [0.001, 0.1]) def test_synth_linear_compare_with_sklearn(queue, C): _test_synth_linear_compare_with_sklearn(queue, C) def _test_synth_poly_compare_with_sklearn(queue, params): x, y = datasets.make_regression(**synth_params) - clf = SVR(kernel='poly', **params) + clf = SVR(kernel="poly", **params) clf.fit(x, y, queue=queue) result = clf.score(x, y, queue=queue) - clf = SklearnSVR(kernel='poly', **params) + clf = SklearnSVR(kernel="poly", **params) clf.fit(x, y) expected = clf.score(x, y) @@ -225,48 +225,52 @@ def _test_synth_poly_compare_with_sklearn(queue, params): @pass_if_not_implemented_for_gpu(reason="svr is not implemented") -@pytest.mark.parametrize('queue', get_queues()) -@pytest.mark.parametrize('params', [ - {'degree': 2, 'coef0': 0.1, 'gamma': 'scale', 'C': 100}, - {'degree': 3, 'coef0': 0.0, 'gamma': 'scale', 'C': 1000} -]) +@pytest.mark.parametrize("queue", get_queues()) +@pytest.mark.parametrize( + "params", + [ + {"degree": 2, "coef0": 0.1, "gamma": "scale", "C": 100}, + {"degree": 3, "coef0": 0.0, "gamma": "scale", "C": 1000}, + ], +) def test_synth_poly_compare_with_sklearn(queue, params): _test_synth_poly_compare_with_sklearn(queue, params) @pass_if_not_implemented_for_gpu(reason="svr is not implemented") -@pytest.mark.parametrize('queue', get_queues()) +@pytest.mark.parametrize("queue", get_queues()) def test_sided_sample_weight(queue): - clf = SVR(C=1e-2, kernel='linear') + clf = SVR(C=1e-2, kernel="linear") X = [[-2, 0], [-1, -1], [0, -2], [0, 2], [1, 1], [2, 0]] Y = [1, 1, 1, 2, 2, 2] - sample_weight = [10., .1, .1, .1, .1, 10] + sample_weight = [10.0, 0.1, 0.1, 0.1, 0.1, 10] clf.fit(X, Y, sample_weight=sample_weight, queue=queue) - y_pred = clf.predict([[-1., 1.]], queue=queue) + y_pred = clf.predict([[-1.0, 1.0]], queue=queue) assert y_pred < 1.5 - sample_weight = [1., .1, 10., 10., .1, .1] + sample_weight = [1.0, 0.1, 10.0, 10.0, 0.1, 0.1] clf.fit(X, Y, sample_weight=sample_weight, queue=queue) - y_pred = clf.predict([[-1., 1.]], queue=queue) + y_pred = clf.predict([[-1.0, 1.0]], queue=queue) assert y_pred > 1.5 sample_weight = [1] * 6 clf.fit(X, Y, sample_weight=sample_weight, queue=queue) - y_pred = clf.predict([[-1., 1.]], queue=queue) + y_pred = clf.predict([[-1.0, 1.0]], queue=queue) assert y_pred == pytest.approx(1.5) @pass_if_not_implemented_for_gpu(reason="svr is not implemented") -@pytest.mark.parametrize('queue', get_queues()) +@pytest.mark.parametrize("queue", get_queues()) def test_pickle(queue): diabetes = datasets.load_diabetes() - clf = SVR(kernel='rbf', C=10.) 
+ clf = SVR(kernel="rbf", C=10.0) clf.fit(diabetes.data, diabetes.target, queue=queue) expected = clf.predict(diabetes.data, queue=queue) import pickle + dump = pickle.dumps(clf) clf2 = pickle.loads(dump) diff --git a/onedal/tests/utils/_device_selection.py b/onedal/tests/utils/_device_selection.py index fcac7d12cd..73fc060ead 100644 --- a/onedal/tests/utils/_device_selection.py +++ b/onedal/tests/utils/_device_selection.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,22 +12,23 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== -import pytest import functools +import pytest + -def get_queues(filter_='cpu,gpu'): +def get_queues(filter_="cpu,gpu"): queues = [] try: import dpctl - if dpctl.has_cpu_devices and 'cpu' in filter_: - queues.append(dpctl.SyclQueue('cpu')) - if dpctl.has_gpu_devices and 'gpu' in filter_: - queues.append(dpctl.SyclQueue('gpu')) + if dpctl.has_cpu_devices and "cpu" in filter_: + queues.append(dpctl.SyclQueue("cpu")) + if dpctl.has_gpu_devices and "gpu" in filter_: + queues.append(dpctl.SyclQueue("gpu")) finally: return queues @@ -35,6 +36,7 @@ def get_queues(filter_='cpu,gpu'): def get_memory_usm(): try: from dpctl.memory import MemoryUSMDevice, MemoryUSMShared + return [MemoryUSMDevice, MemoryUSMShared] except ImportError: return [] @@ -47,9 +49,9 @@ def is_dpctl_available(targets=None): if targets is None: return True for device in targets: - if device == 'cpu' and not dpctl.has_cpu_devices(): + if device == "cpu" and not dpctl.has_cpu_devices(): return False - if device == 'gpu' and not dpctl.has_gpu_devices(): + if device == "gpu" and not dpctl.has_gpu_devices(): return False return True except ImportError: @@ -58,14 +60,15 @@ def is_dpctl_available(targets=None): def device_type_to_str(queue): if queue is None: - return 'cpu' + return "cpu" from dpctl import device_type + if queue.sycl_device.device_type == device_type.cpu: - return 'cpu' + return "cpu" if queue.sycl_device.device_type == device_type.gpu: - return 'gpu' - return 'unknown' + return "gpu" + return "unknown" def pass_if_not_implemented_for_gpu(reason=""): @@ -75,9 +78,11 @@ def decorator(test): @functools.wraps(test) def wrapper(queue, *args, **kwargs): if queue is not None and queue.sycl_device.is_gpu: - with pytest.raises(RuntimeError, match='is not implemented for GPU'): + with pytest.raises(RuntimeError, match="is not implemented for GPU"): test(queue, *args, **kwargs) else: test(queue, *args, **kwargs) + return wrapper + return decorator diff --git a/onedal/utils/__init__.py b/onedal/utils/__init__.py new file mode 100644 index 0000000000..ed23be4782 --- /dev/null +++ b/onedal/utils/__init__.py @@ -0,0 +1,47 @@ +# =============================================================================== +# Copyright 2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# =============================================================================== + +from .validation import ( + _check_array, + _check_classification_targets, + _check_n_features, + _check_X_y, + _column_or_1d, + _is_arraylike, + _is_arraylike_not_scalar, + _is_integral_float, + _is_multilabel, + _num_features, + _num_samples, + _type_of_target, + _validate_targets, +) + +__all__ = [ + "_column_or_1d", + "_validate_targets", + "_check_X_y", + "_check_array", + "_check_classification_targets", + "_type_of_target", + "_is_integral_float", + "_is_multilabel", + "_check_n_features", + "_num_features", + "_num_samples", + "_is_arraylike", + "_is_arraylike_not_scalar", +] diff --git a/onedal/datatypes/validation.py b/onedal/utils/validation.py similarity index 69% rename from onedal/datatypes/validation.py rename to onedal/utils/validation.py index ce3fa8a9b7..b163873a13 100644 --- a/onedal/datatypes/validation.py +++ b/onedal/utils/validation.py @@ -14,20 +14,21 @@ # limitations under the License. # =============================================================================== -import numpy as np import warnings +from collections.abc import Sequence +from numbers import Integral + +import numpy as np from scipy import sparse as sp -from scipy.sparse import issparse, dok_matrix, lil_matrix +from scipy.sparse import dok_matrix, issparse, lil_matrix from sklearn.preprocessing import LabelEncoder from sklearn.utils.validation import check_array -from collections.abc import Sequence -from numbers import Integral + from daal4py.sklearn.utils.validation import _assert_all_finite class DataConversionWarning(UserWarning): - """Warning used to notify implicit data conversions happening in the code. - """ + """Warning used to notify implicit data conversions happening in the code.""" def _is_arraylike(x): @@ -52,24 +53,26 @@ def _column_or_1d(y, warn=False): return np.ravel(y) if len(shape) == 2 and shape[1] == 1: if warn: - warnings.warn("A column-vector y was passed when a 1d array was" - " expected. Please change the shape of y to " - "(n_samples, ), for example using ravel().", - DataConversionWarning, stacklevel=2) + warnings.warn( + "A column-vector y was passed when a 1d array was" + " expected. 
Please change the shape of y to " + "(n_samples, ), for example using ravel().", + DataConversionWarning, + stacklevel=2, + ) return np.ravel(y) raise ValueError( - "y should be a 1d array, " - "got an array of shape {} instead.".format(shape)) + "y should be a 1d array, " "got an array of shape {} instead.".format(shape) + ) def _compute_class_weight(class_weight, classes, y): if set(y) - set(classes): - raise ValueError("classes should include all valid labels that can " - "be in y") + raise ValueError("classes should include all valid labels that can " "be in y") if class_weight is None or len(class_weight) == 0: - weight = np.ones(classes.shape[0], dtype=np.float64, order='C') - elif class_weight == 'balanced': + weight = np.ones(classes.shape[0], dtype=np.float64, order="C") + elif class_weight == "balanced": y_ = _column_or_1d(y) classes, _ = np.unique(y_, return_inverse=True) @@ -82,10 +85,12 @@ def _compute_class_weight(class_weight, classes, y): weight = len(y_) / (len(le.classes_) * y_bin) else: # user-defined dictionary - weight = np.ones(classes.shape[0], dtype=np.float64, order='C') + weight = np.ones(classes.shape[0], dtype=np.float64, order="C") if not isinstance(class_weight, dict): - raise ValueError("class_weight must be dict, 'balanced', or None," - " got: %r" % class_weight) + raise ValueError( + "class_weight must be dict, 'balanced', or None," + " got: %r" % class_weight + ) for c in class_weight: i = np.searchsorted(classes, c) if i >= len(classes) or classes[i] != c: @@ -99,23 +104,30 @@ def _validate_targets(y, class_weight, dtype): y_ = _column_or_1d(y, warn=True) _check_classification_targets(y) classes, y = np.unique(y_, return_inverse=True) - class_weight_res = _compute_class_weight(class_weight, - classes=classes, y=y_) + class_weight_res = _compute_class_weight(class_weight, classes=classes, y=y_) if len(classes) < 2: raise ValueError( "The number of classes has to be greater than one; got %d" - " class" % len(classes)) - - return np.asarray(y, dtype=dtype, order='C'), class_weight_res, classes - - -def _check_array(array, dtype="numeric", accept_sparse=False, order=None, - copy=False, force_all_finite=True, - ensure_2d=True, accept_large_sparse=True): + " class" % len(classes) + ) + + return np.asarray(y, dtype=dtype, order="C"), class_weight_res, classes + + +def _check_array( + array, + dtype="numeric", + accept_sparse=False, + order=None, + copy=False, + force_all_finite=True, + ensure_2d=True, + accept_large_sparse=True, +): if force_all_finite: if sp.issparse(array): - if hasattr(array, 'data'): + if hasattr(array, "data"): _assert_all_finite(array.data) force_all_finite = False else: @@ -129,7 +141,8 @@ def _check_array(array, dtype="numeric", accept_sparse=False, order=None, copy=copy, force_all_finite=force_all_finite, ensure_2d=ensure_2d, - accept_large_sparse=accept_large_sparse) + accept_large_sparse=accept_large_sparse, + ) if sp.isspmatrix(array): return array @@ -146,68 +159,82 @@ def _check_array(array, dtype="numeric", accept_sparse=False, order=None, def _check_X_y( - X, - y, - dtype="numeric", - accept_sparse=False, - order=None, - copy=False, - force_all_finite=True, - ensure_2d=True, - accept_large_sparse=True, - y_numeric=False, - accept_2d_y=False): + X, + y, + dtype="numeric", + accept_sparse=False, + order=None, + copy=False, + force_all_finite=True, + ensure_2d=True, + accept_large_sparse=True, + y_numeric=False, + accept_2d_y=False, +): if y is None: raise ValueError("y cannot be None") - X = _check_array(X, 
accept_sparse=accept_sparse, - dtype=dtype, order=order, copy=copy, - force_all_finite=force_all_finite, - ensure_2d=ensure_2d, - accept_large_sparse=accept_large_sparse) + X = _check_array( + X, + accept_sparse=accept_sparse, + dtype=dtype, + order=order, + copy=copy, + force_all_finite=force_all_finite, + ensure_2d=ensure_2d, + accept_large_sparse=accept_large_sparse, + ) if not accept_2d_y: y = _column_or_1d(y, warn=True) - if y_numeric and y.dtype.kind == 'O': + if y_numeric and y.dtype.kind == "O": y = y.astype(np.float64) _assert_all_finite(y) lengths = [X.shape[0], y.shape[0]] uniques = np.unique(lengths) if len(uniques) > 1: - raise ValueError("Found input variables with inconsistent numbers of" - " samples: %r" % [int(length) for length in lengths]) + raise ValueError( + "Found input variables with inconsistent numbers of" + " samples: %r" % [int(length) for length in lengths] + ) return X, y def _check_classification_targets(y): y_type = _type_of_target(y) - if y_type not in ['binary', 'multiclass', 'multiclass-multioutput', - 'multilabel-indicator', 'multilabel-sequences']: + if y_type not in [ + "binary", + "multiclass", + "multiclass-multioutput", + "multilabel-indicator", + "multilabel-sequences", + ]: raise ValueError("Unknown label type: %r" % y_type) def _type_of_target(y): - is_sequence, is_array = isinstance(y, Sequence), hasattr(y, '__array__') + is_sequence, is_array = isinstance(y, Sequence), hasattr(y, "__array__") is_not_string, is_spmatrix = not isinstance(y, str), sp.isspmatrix(y) valid = (is_sequence or is_array or is_spmatrix) and is_not_string if not valid: - raise ValueError('Expected array-like (array or non-string sequence), ' - 'got %r' % y) + raise ValueError( + "Expected array-like (array or non-string sequence), " "got %r" % y + ) - sparse_pandas = (y.__class__.__name__ in ['SparseSeries', 'SparseArray']) + sparse_pandas = y.__class__.__name__ in ["SparseSeries", "SparseArray"] if sparse_pandas: raise ValueError("y cannot be class 'SparseSeries' or 'SparseArray'") if _is_multilabel(y): - return 'multilabel-indicator' + return "multilabel-indicator" # DeprecationWarning will be replaced by ValueError, see NEP 34 # https://numpy.org/neps/nep-0034-infer-dtype-is-object.html with warnings.catch_warnings(): - warnings.simplefilter('error', np.VisibleDeprecationWarning) + warnings.simplefilter("error", np.VisibleDeprecationWarning) try: y = np.asarray(y) except np.VisibleDeprecationWarning: @@ -217,23 +244,27 @@ def _type_of_target(y): # The old sequence of sequences format try: - if not hasattr(y[0], '__array__') and isinstance(y[0], Sequence) \ - and not isinstance(y[0], str): - raise ValueError('You appear to be using a legacy multi-label data' - ' representation. Sequence of sequences are no' - ' longer supported; use a binary array or sparse' - ' matrix instead - the MultiLabelBinarizer' - ' transformer can convert to this format.') + if ( + not hasattr(y[0], "__array__") + and isinstance(y[0], Sequence) + and not isinstance(y[0], str) + ): + raise ValueError( + "You appear to be using a legacy multi-label data" + " representation. Sequence of sequences are no" + " longer supported; use a binary array or sparse" + " matrix instead - the MultiLabelBinarizer" + " transformer can convert to this format." 
+ ) except IndexError: pass # Invalid inputs - if y.ndim > 2 or (y.dtype == object and len( - y) and not isinstance(y.flat[0], str)): - return 'unknown' # [[[1, 2]]] or [obj_1] and not ["label_1"] + if y.ndim > 2 or (y.dtype == object and len(y) and not isinstance(y.flat[0], str)): + return "unknown" # [[[1, 2]]] or [obj_1] and not ["label_1"] if y.ndim == 2 and y.shape[1] == 0: - return 'unknown' # [[]] + return "unknown" # [[]] if y.ndim == 2 and y.shape[1] > 1: suffix = "-multioutput" # [[1, 2], [1, 2]] @@ -241,26 +272,26 @@ def _type_of_target(y): suffix = "" # [1, 2, 3] or [[1], [2], [3]] # check float and contains non-integer float values - if y.dtype.kind == 'f' and np.any(y != y.astype(int)): + if y.dtype.kind == "f" and np.any(y != y.astype(int)): # [.1, .2, 3] or [[.1, .2, 3]] or [[1., .2]] and not [1., 2., 3.] _assert_all_finite(y) - return 'continuous' + suffix + return "continuous" + suffix if (len(np.unique(y)) > 2) or (y.ndim >= 2 and len(y[0]) > 1): - return 'multiclass' + suffix # [1, 2, 3] or [[1., 2., 3]] or [[1, 2]] - return 'binary' # [1, 2] or [["a"], ["b"]] + return "multiclass" + suffix # [1, 2, 3] or [[1., 2., 3]] or [[1, 2]] + return "binary" # [1, 2] or [["a"], ["b"]] def _is_integral_float(y): - return y.dtype.kind == 'f' and np.all(y.astype(int) == y) + return y.dtype.kind == "f" and np.all(y.astype(int) == y) def _is_multilabel(y): - if hasattr(y, '__array__') or isinstance(y, Sequence): + if hasattr(y, "__array__") or isinstance(y, Sequence): # DeprecationWarning will be replaced by ValueError, see NEP 34 # https://numpy.org/neps/nep-0034-infer-dtype-is-object.html with warnings.catch_warnings(): - warnings.simplefilter('error', np.VisibleDeprecationWarning) + warnings.simplefilter("error", np.VisibleDeprecationWarning) try: y = np.asarray(y) except np.VisibleDeprecationWarning: @@ -274,12 +305,14 @@ def _is_multilabel(y): if issparse(y): if isinstance(y, (dok_matrix, lil_matrix)): y = y.tocsr() - return len(y.data) == 0 or np.unique(y.data).size == 1 and \ - (y.dtype.kind in 'biu' or _is_integral_float(np.unique(y.data))) + return ( + len(y.data) == 0 + or np.unique(y.data).size == 1 + and (y.dtype.kind in "biu" or _is_integral_float(np.unique(y.data))) + ) labels = np.unique(y) - return len(labels) < 3 and ( - y.dtype.kind in 'biu' or _is_integral_float(labels)) + return len(labels) < 3 and (y.dtype.kind in "biu" or _is_integral_float(labels)) def _check_n_features(self, X, reset): @@ -309,7 +342,8 @@ def _check_n_features(self, X, reset): if n_features != self.n_features_in_: raise ValueError( f"X has {n_features} features, but {self.__class__.__name__} " - f"is expecting {self.n_features_in_} features as input.") + f"is expecting {self.n_features_in_} features as input." + ) def _num_features(X, fallback_1d=False): @@ -318,20 +352,17 @@ def _num_features(X, fallback_1d=False): type_name = type_.__qualname__ else: type_name = f"{type_.__module__}.{type_.__qualname__}" - message = ( - "Unable to find the number of features from X of type " - f"{type_name}" - ) - if not hasattr(X, '__len__') and not hasattr(X, 'shape'): - if not hasattr(X, '__array__'): + message = "Unable to find the number of features from X of type " f"{type_name}" + if not hasattr(X, "__len__") and not hasattr(X, "shape"): + if not hasattr(X, "__array__"): raise TypeError(message) # Only convert X to a numpy array if there is no cheaper, heuristic # option. 
X = np.asarray(X) - if hasattr(X, 'shape'): + if hasattr(X, "shape"): ndim_thr = 1 if fallback_1d else 2 - if not hasattr(X.shape, '__len__') or len(X.shape) < ndim_thr: + if not hasattr(X.shape, "__len__") or len(X.shape) < ndim_thr: message += f" with shape {X.shape}" raise TypeError(message) return X.shape[-1] @@ -340,15 +371,14 @@ def _num_features(X, fallback_1d=False): # Do not consider an array-like of strings or dicts to be a 2D array if isinstance(first_sample, (str, bytes, dict)): - message += (f" where the samples are of type " - f"{type(first_sample).__qualname__}") + message += f" where the samples are of type " f"{type(first_sample).__qualname__}" raise TypeError(message) try: # If X is a list of lists, for instance, we assume that all nested # lists have the same length without checking or converting to # a numpy array to keep this function call as cheap as possible. - if (not fallback_1d) or hasattr(first_sample, '__len__'): + if (not fallback_1d) or hasattr(first_sample, "__len__"): return len(first_sample) else: return 1 @@ -371,8 +401,8 @@ def _num_samples(x): if hasattr(x, "shape") and x.shape is not None: if len(x.shape) == 0: raise TypeError( - "Singleton array %r cannot be considered a valid collection." % - x) + "Singleton array %r cannot be considered a valid collection." % x + ) # Check that shape is returning an integer or default to len # Dask dataframes may not return numeric shape[0] value if hasattr(x, "shape") and isinstance(x.shape[0], Integral): diff --git a/pyproject.toml b/pyproject.toml index 3723d2ede3..290e5f2056 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,3 +19,7 @@ line-length = 90 target-version = ['py37', 'py38', 'py39', 'py310', 'py311'] extend-ignore = 'E203' + +[tool.isort] +profile = "black" +line_length = 90 diff --git a/requirements-doc.txt b/requirements-doc.txt index 47a00a9ebf..4ebf238164 100644 --- a/requirements-doc.txt +++ b/requirements-doc.txt @@ -5,7 +5,7 @@ Babel==2.12.1 backcall==0.2.0 beautifulsoup4==4.12.2 bleach==6.0.0 -certifi==2023.5.7 +certifi==2023.7.22 charset-normalizer==3.1.0 click==8.1.3 decorator==5.1.1 diff --git a/setup.py b/setup.py index aaac0ab582..8ee45bb2a9 100644 --- a/setup.py +++ b/setup.py @@ -434,7 +434,8 @@ def run(self): 'onedal.ensemble', 'onedal.neighbors', 'onedal.primitives', - 'onedal.svm'] + 'onedal.svm', + 'onedal.utils'] if ONEDAL_VERSION >= 20230100: packages_with_tests += [ diff --git a/sklearnex/__main__.py b/sklearnex/__main__.py index f64f90402f..8fc1bbdaeb 100755 --- a/sklearnex/__main__.py +++ b/sklearnex/__main__.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,9 +13,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== import sys + from sklearnex import patch_sklearn @@ -29,27 +30,30 @@ def _main(): scikit-learn, optimizing solvers of scikit-learn with Intel(R) oneAPI Data Analytics Library. 
""", - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - - parser.add_argument('-m', action='store_true', dest='module', - help="Executes following as a module") - parser.add_argument('name', help="Script or module name") - parser.add_argument('args', nargs=argparse.REMAINDER, - help="Command line arguments") + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + + parser.add_argument( + "-m", action="store_true", dest="module", help="Executes following as a module" + ) + parser.add_argument("name", help="Script or module name") + parser.add_argument("args", nargs=argparse.REMAINDER, help="Command line arguments") args = parser.parse_args() try: import sklearn + patch_sklearn() except ImportError: print("Scikit-learn could not be imported. Nothing to patch") sys.argv = [args.name] + args.args - if '_' + args.name in globals(): - return globals()['_' + args.name](*args.args) + if "_" + args.name in globals(): + return globals()["_" + args.name](*args.args) import runpy + runf = runpy.run_module if args.module else runpy.run_path - runf(args.name, run_name='__main__') + runf(args.name, run_name="__main__") sys.exit(_main()) diff --git a/sklearnex/_config.py b/sklearnex/_config.py index 6bba89145a..fa85762589 100644 --- a/sklearnex/_config.py +++ b/sklearnex/_config.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== import threading from contextlib import contextmanager diff --git a/sklearnex/_device_offload.py b/sklearnex/_device_offload.py index b15d683a0d..e2d3693361 100644 --- a/sklearnex/_device_offload.py +++ b/sklearnex/_device_offload.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,44 +12,56 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== -from ._config import get_config -from ._utils import get_patch_message +import logging +import sys from functools import wraps + import numpy as np -import sys -import logging try: from dpctl import SyclQueue from dpctl.memory import MemoryUSMDevice, as_usm_memory from dpctl.tensor import usm_ndarray + dpctl_available = True except ImportError: dpctl_available = False -oneapi_is_available = 'daal4py.oneapi' in sys.modules +try: + import dpnp + + dpnp_available = True +except ImportError: + dpnp_available = False + +from ._config import get_config +from ._utils import get_patch_message + +oneapi_is_available = "daal4py.oneapi" in sys.modules if oneapi_is_available: from daal4py.oneapi import _get_device_name_sycl_ctxt, _get_sycl_ctxt_params class DummySyclQueue: - '''This class is designed to act like dpctl.SyclQueue - to allow device dispatching in scenarios when dpctl is not available''' + """This class is designed to act like dpctl.SyclQueue + to allow device dispatching in scenarios when dpctl is not available""" class DummySyclDevice: def __init__(self, filter_string): self._filter_string = filter_string - self.is_cpu = 'cpu' in filter_string - self.is_gpu = 'gpu' in filter_string + self.is_cpu = "cpu" in filter_string + self.is_gpu = "gpu" in filter_string # TODO: check for possibility of fp64 support # on other devices in this dummy class self.has_aspect_fp64 = self.is_cpu if not (self.is_cpu): - logging.warning("Device support is limited. " - "Please install dpctl for full experience") + logging.warning( + "Device support is limited. " + "Please install dpctl for full experience" + ) def get_filter_string(self): return self._filter_string @@ -65,23 +77,26 @@ def _get_device_info_from_daal4py(): def _get_global_queue(): - target = get_config()['target_offload'] + target = get_config()["target_offload"] d4p_target, _ = _get_device_info_from_daal4py() - if d4p_target == 'host': - d4p_target = 'cpu' + if d4p_target == "host": + d4p_target = "cpu" QueueClass = DummySyclQueue if not dpctl_available else SyclQueue - if target != 'auto': - if d4p_target is not None and \ - d4p_target != target: + if target != "auto": + if d4p_target is not None and d4p_target != target: if not isinstance(target, str): if d4p_target not in target.sycl_device.get_filter_string(): - raise RuntimeError("Cannot use target offload option " - "inside daal4py.oneapi.sycl_context") + raise RuntimeError( + "Cannot use target offload option " + "inside daal4py.oneapi.sycl_context" + ) else: - raise RuntimeError("Cannot use target offload option " - "inside daal4py.oneapi.sycl_context") + raise RuntimeError( + "Cannot use target offload option " + "inside daal4py.oneapi.sycl_context" + ) if isinstance(target, QueueClass): return target return QueueClass(target) @@ -95,22 +110,25 @@ def _transfer_to_host(queue, *data): host_data = [] for item in data: - usm_iface = getattr(item, '__sycl_usm_array_interface__', None) + usm_iface = getattr(item, "__sycl_usm_array_interface__", None) if usm_iface is not None: if not dpctl_available: - raise RuntimeError("dpctl need to be installed to work " - "with __sycl_usm_array_interface__") + raise RuntimeError( + "dpctl need to be installed to work " + "with __sycl_usm_array_interface__" + ) if queue is not None: - if queue.sycl_device != usm_iface['syclobj'].sycl_device: - raise RuntimeError('Input data shall be located 
' - 'on single target device') + if queue.sycl_device != usm_iface["syclobj"].sycl_device: + raise RuntimeError( + "Input data shall be located " "on single target device" + ) else: - queue = usm_iface['syclobj'] + queue = usm_iface["syclobj"] buffer = as_usm_memory(item).copy_to_host() - item = np.ndarray(shape=usm_iface['shape'], - dtype=usm_iface['typestr'], - buffer=buffer) + item = np.ndarray( + shape=usm_iface["shape"], dtype=usm_iface["typestr"], buffer=buffer + ) has_usm_data = True else: has_host_data = True @@ -119,7 +137,7 @@ def _transfer_to_host(queue, *data): mismatch_usm_item = usm_iface is not None and has_host_data if mismatch_host_item or mismatch_usm_item: - raise RuntimeError('Input data shall be located on single target device') + raise RuntimeError("Input data shall be located on single target device") host_data.append(item) return queue, host_data @@ -128,23 +146,23 @@ def _transfer_to_host(queue, *data): def _get_backend(obj, queue, method_name, *data): cpu_device = queue is None or queue.sycl_device.is_cpu gpu_device = queue is not None and queue.sycl_device.is_gpu - cpu_fallback = False - if (cpu_device and obj._onedal_cpu_supported(method_name, *data)) or \ - (gpu_device and obj._onedal_gpu_supported(method_name, *data)): - return 'onedal', queue, cpu_fallback + if (cpu_device and obj._onedal_cpu_supported(method_name, *data)) or ( + gpu_device and obj._onedal_gpu_supported(method_name, *data) + ): + return "onedal", queue if cpu_device: - return 'sklearn', None, cpu_fallback + return "sklearn", None _, d4p_options = _get_device_info_from_daal4py() - allow_fallback = get_config()['allow_fallback_to_host'] or \ - d4p_options.get('host_offload_on_fail', False) + allow_fallback_to_host = get_config()["allow_fallback_to_host"] or d4p_options.get( + "host_offload_on_fail", False + ) - if gpu_device and allow_fallback: + if gpu_device and allow_fallback_to_host: if obj._onedal_cpu_supported(method_name, *data): - cpu_fallback = True - return 'onedal', None, cpu_fallback - return 'sklearn', None, cpu_fallback + return "onedal", None + return "sklearn", None raise RuntimeError("Device support is not implemented") @@ -155,20 +173,22 @@ def dispatch(obj, method_name, branches, *args, **kwargs): q, hostvalues = _transfer_to_host(q, *kwargs.values()) hostkwargs = dict(zip(kwargs.keys(), hostvalues)) - backend, q, cpu_fallback = _get_backend(obj, q, method_name, *hostargs) + backend, q = _get_backend(obj, q, method_name, *hostargs) - if backend == 'onedal': + if backend == "onedal": return branches[backend](obj, *hostargs, **hostkwargs, queue=q) - if backend == 'sklearn': + if backend == "sklearn": return branches[backend](obj, *hostargs, **hostkwargs) - raise RuntimeError(f'Undefined backend {backend} in ' - f'{obj.__class__.__name__}.{method_name}') + raise RuntimeError( + f"Undefined backend {backend} in " f"{obj.__class__.__name__}.{method_name}" + ) def _copy_to_usm(queue, array): if not dpctl_available: - raise RuntimeError("dpctl need to be installed to work " - "with __sycl_usm_array_interface__") + raise RuntimeError( + "dpctl need to be installed to work " "with __sycl_usm_array_interface__" + ) mem = MemoryUSMDevice(array.nbytes, queue=queue) mem.copy_from_host(array.tobytes()) return usm_ndarray(array.shape, array.dtype, buffer=mem) @@ -181,9 +201,12 @@ def wrapper(self, *args, **kwargs): if len(data) == 0: usm_iface = None else: - usm_iface = getattr(data[0], '__sycl_usm_array_interface__', None) + usm_iface = getattr(data[0], "__sycl_usm_array_interface__", 
None) result = func(self, *args, **kwargs) if usm_iface is not None: - return _copy_to_usm(usm_iface['syclobj'], result) + result = _copy_to_usm(usm_iface["syclobj"], result) + if dpnp_available and isinstance(data[0], dpnp.ndarray): + result = dpnp.array(result, copy=False) return result + return wrapper diff --git a/sklearnex/_utils.py b/sklearnex/_utils.py index 7f0d32a41b..dc7dae5365 100755 --- a/sklearnex/_utils.py +++ b/sklearnex/_utils.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,21 +13,22 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== import logging -import warnings import os import sys +import warnings + from daal4py.sklearn._utils import daal_check_version def set_sklearn_ex_verbose(): log_level = os.environ.get("SKLEARNEX_VERBOSE") - logger = logging.getLogger('sklearnex') + logger = logging.getLogger("sklearnex") logging_channel = logging.StreamHandler() - logging_formatter = logging.Formatter('%(levelname)s:%(name)s: %(message)s') + logging_formatter = logging.Formatter("%(levelname)s:%(name)s: %(message)s") logging_channel.setFormatter(logging_formatter) logger.addHandler(logging_channel) @@ -35,9 +36,11 @@ def set_sklearn_ex_verbose(): if log_level is not None: logger.setLevel(log_level) except Exception: - warnings.warn('Unknown level "{}" for logging.\n' - 'Please, use one of "CRITICAL", "ERROR", ' - '"WARNING", "INFO", "DEBUG".'.format(log_level)) + warnings.warn( + 'Unknown level "{}" for logging.\n' + 'Please, use one of "CRITICAL", "ERROR", ' + '"WARNING", "INFO", "DEBUG".'.format(log_level) + ) def get_patch_message(s, queue=None, cpu_fallback=False): @@ -45,27 +48,29 @@ def get_patch_message(s, queue=None, cpu_fallback=False): message = "running accelerated version on " if queue is not None: if queue.sycl_device.is_gpu: - message += 'GPU' + message += "GPU" elif queue.sycl_device.is_cpu: - message += 'CPU' + message += "CPU" else: - raise RuntimeError('Unsupported device') + raise RuntimeError("Unsupported device") - elif 'daal4py.oneapi' in sys.modules: + elif "daal4py.oneapi" in sys.modules: from daal4py.oneapi import _get_device_name_sycl_ctxt + dev = _get_device_name_sycl_ctxt() - if dev == 'cpu' or dev is None: - message += 'CPU' - elif dev == 'gpu': + if dev == "cpu" or dev is None: + message += "CPU" + elif dev == "gpu": if cpu_fallback: - message += 'CPU' + message += "CPU" else: - message += 'GPU' + message += "GPU" else: - raise ValueError(f"Unexpected device name {dev}." - " Supported types are cpu and gpu") + raise ValueError( + f"Unexpected device name {dev}." 
" Supported types are cpu and gpu" + ) else: - message += 'CPU' + message += "CPU" elif s == "sklearn": message = "fallback to original Scikit-learn" @@ -74,7 +79,8 @@ def get_patch_message(s, queue=None, cpu_fallback=False): else: raise ValueError( f"Invalid input - expected one of 'onedal','sklearn'," - f" 'sklearn_after_onedal', got {s}") + f" 'sklearn_after_onedal', got {s}" + ) return message diff --git a/sklearnex/basic_statistics/__init__.py b/sklearnex/basic_statistics/__init__.py index 623c7cd83f..43c391c96f 100644 --- a/sklearnex/basic_statistics/__init__.py +++ b/sklearnex/basic_statistics/__init__.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,8 +13,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from .basic_statistics import BasicStatistics -__all__ = ['BasicStatistics'] +__all__ = ["BasicStatistics"] diff --git a/sklearnex/basic_statistics/basic_statistics.py b/sklearnex/basic_statistics/basic_statistics.py index 09e298c81b..f2b5b41694 100644 --- a/sklearnex/basic_statistics/basic_statistics.py +++ b/sklearnex/basic_statistics/basic_statistics.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,6 +12,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from onedal.basic_statistics import BasicStatistics diff --git a/sklearnex/cluster/__init__.py b/sklearnex/cluster/__init__.py index 3376349de3..81a8d7046d 100755 --- a/sklearnex/cluster/__init__.py +++ b/sklearnex/cluster/__init__.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,9 +13,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== -from .k_means import KMeans from .dbscan import DBSCAN +from .k_means import KMeans -__all__ = ['KMeans', 'DBSCAN'] +__all__ = ["KMeans", "DBSCAN"] diff --git a/sklearnex/cluster/dbscan.py b/sklearnex/cluster/dbscan.py index 31fea742f3..7e2dc8d1a7 100755 --- a/sklearnex/cluster/dbscan.py +++ b/sklearnex/cluster/dbscan.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,6 +13,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from daal4py.sklearn.cluster import DBSCAN diff --git a/sklearnex/cluster/k_means.py b/sklearnex/cluster/k_means.py index 1f5b4556fc..50f70418d5 100755 --- a/sklearnex/cluster/k_means.py +++ b/sklearnex/cluster/k_means.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,6 +13,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from daal4py.sklearn.cluster import KMeans diff --git a/sklearnex/cluster/tests/test_dbscan.py b/sklearnex/cluster/tests/test_dbscan.py index 5690c8427c..c1e5e7830f 100755 --- a/sklearnex/cluster/tests/test_dbscan.py +++ b/sklearnex/cluster/tests/test_dbscan.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,7 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== import numpy as np from numpy.testing import assert_allclose @@ -21,10 +21,10 @@ def test_sklearnex_import(): from sklearnex.cluster import DBSCAN - X = np.array([[1, 2], [2, 2], [2, 3], - [8, 7], [8, 8], [25, 80]]) + + X = np.array([[1, 2], [2, 2], [2, 3], [8, 7], [8, 8], [25, 80]]) dbscan = DBSCAN(eps=3, min_samples=2).fit(X) - assert 'daal4py' in dbscan.__module__ + assert "daal4py" in dbscan.__module__ result = dbscan.labels_ expected = np.array([0, 0, 0, 1, 1, -1], dtype=np.int32) diff --git a/sklearnex/cluster/tests/test_kmeans.py b/sklearnex/cluster/tests/test_kmeans.py index 78570349cf..69a8787bf4 100755 --- a/sklearnex/cluster/tests/test_kmeans.py +++ b/sklearnex/cluster/tests/test_kmeans.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,7 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== import numpy as np from numpy.testing import assert_allclose @@ -21,10 +21,10 @@ def test_sklearnex_import(): from sklearnex.cluster import KMeans - X = np.array([[1, 2], [1, 4], [1, 0], - [10, 2], [10, 4], [10, 0]]) + + X = np.array([[1, 2], [1, 4], [1, 0], [10, 2], [10, 4], [10, 0]]) kmeans = KMeans(n_clusters=2, random_state=0).fit(X) - assert 'daal4py' in kmeans.__module__ + assert "daal4py" in kmeans.__module__ result = kmeans.predict([[0, 0], [12, 3]]) expected = np.array([1, 0], dtype=np.int32) diff --git a/sklearnex/decomposition/__init__.py b/sklearnex/decomposition/__init__.py index ba84d03dc8..b9dadc237b 100755 --- a/sklearnex/decomposition/__init__.py +++ b/sklearnex/decomposition/__init__.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,8 +13,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== from .pca import PCA -__all__ = ['PCA'] +__all__ = ["PCA"] diff --git a/sklearnex/decomposition/pca.py b/sklearnex/decomposition/pca.py index 317ac5bc7d..b0f374787d 100755 --- a/sklearnex/decomposition/pca.py +++ b/sklearnex/decomposition/pca.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,6 +13,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from daal4py.sklearn.decomposition import PCA diff --git a/sklearnex/decomposition/tests/test_pca.py b/sklearnex/decomposition/tests/test_pca.py index 35c0e686d4..da9d3bc283 100755 --- a/sklearnex/decomposition/tests/test_pca.py +++ b/sklearnex/decomposition/tests/test_pca.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,7 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== import numpy as np from numpy.testing import assert_allclose @@ -21,7 +21,8 @@ def test_sklearnex_import(): from sklearnex.decomposition import PCA + X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]]) - pca = PCA(n_components=2, svd_solver='full').fit(X) - assert 'daal4py' in pca.__module__ + pca = PCA(n_components=2, svd_solver="full").fit(X) + assert "daal4py" in pca.__module__ assert_allclose(pca.singular_values_, [6.30061232, 0.54980396]) diff --git a/sklearnex/dispatcher.py b/sklearnex/dispatcher.py index 8ef12576ea..d19a4d4786 100644 --- a/sklearnex/dispatcher.py +++ b/sklearnex/dispatcher.py @@ -56,33 +56,49 @@ def get_patch_map(): from .neighbors import KNeighborsRegressor as KNeighborsRegressor_sklearnex from .neighbors import LocalOutlierFactor as LocalOutlierFactor_sklearnex from .neighbors import NearestNeighbors as NearestNeighbors_sklearnex - from .svm import SVC as SVC_sklearnex - from .svm import SVR as SVR_sklearnex - from .svm import NuSVC as NuSVC_sklearnex - from .svm import NuSVR as NuSVR_sklearnex # Preview classes for patching from .preview.cluster import KMeans as KMeans_sklearnex from .preview.decomposition import PCA as PCA_sklearnex - from .preview.linear_model import LinearRegression as LinearRegression_sklearnex from .preview.ensemble import ( ExtraTreesClassifier as ExtraTreesClassifier_sklearnex, - ExtraTreesRegressor as ExtraTreesRegressor_sklearnex, + ) + from .preview.ensemble import ExtraTreesRegressor as ExtraTreesRegressor_sklearnex + from .preview.ensemble import ( RandomForestClassifier as RandomForestClassifier_sklearnex, + ) + from .preview.ensemble import ( RandomForestRegressor as RandomForestRegressor_sklearnex, ) + from .preview.linear_model import LinearRegression as LinearRegression_sklearnex + from .svm import SVC as SVC_sklearnex + from .svm import SVR as SVR_sklearnex + from .svm import NuSVC as NuSVC_sklearnex + from .svm import NuSVR as NuSVR_sklearnex # Patch for mapping if _is_preview_enabled(): # Ensemble - mapping["extra_trees_classifier"] = [[(ensemble_module, - "ExtraTreesClassifier", - ExtraTreesClassifier_sklearnex), - None]] - mapping["extra_trees_regressor"] = [[(ensemble_module, - "ExtraTreesRegressor", - ExtraTreesRegressor_sklearnex), - None]] + mapping["extra_trees_classifier"] = [ + [ + ( + ensemble_module, + "ExtraTreesClassifier", + ExtraTreesClassifier_sklearnex, + ), + None, + ] + ] + mapping["extra_trees_regressor"] = [ + [ + ( + ensemble_module, + "ExtraTreesRegressor", + ExtraTreesRegressor_sklearnex, + ), + None, + ] + ] mapping["extratreesclassifier"] = mapping["extra_trees_classifier"] mapping["extratreesregressor"] = mapping["extra_trees_regressor"] mapping.pop("random_forest_classifier") @@ -239,9 +255,7 @@ def patch_sklearn(name=None, verbose=True, global_patch=False, preview=False): algorithm, verbose=False, deprecation=False, get_map=get_patch_map ) else: - patch_sklearn_orig( - name, verbose=False, deprecation=False, get_map=get_patch_map - ) + patch_sklearn_orig(name, verbose=False, deprecation=False, get_map=get_patch_map) if verbose and sys.stderr is not None: sys.stderr.write( @@ -288,9 +302,7 @@ def sklearn_is_patched(name=None, return_map=False): ) return is_patched else: - return sklearn_is_patched_orig( - name, get_map=get_patch_map, return_map=return_map - ) + return sklearn_is_patched_orig(name, get_map=get_patch_map, 
return_map=return_map) def is_patched_instance(instance: object) -> bool: diff --git a/sklearnex/ensemble/__init__.py b/sklearnex/ensemble/__init__.py index e1102d00d9..5c40aa4974 100644 --- a/sklearnex/ensemble/__init__.py +++ b/sklearnex/ensemble/__init__.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,8 +13,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from .forest import RandomForestClassifier, RandomForestRegressor -__all__ = ['RandomForestClassifier', 'RandomForestRegressor'] +__all__ = ["RandomForestClassifier", "RandomForestRegressor"] diff --git a/sklearnex/ensemble/forest.py b/sklearnex/ensemble/forest.py index 5ad02b46cd..d15e32ce6d 100644 --- a/sklearnex/ensemble/forest.py +++ b/sklearnex/ensemble/forest.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,6 +13,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from daal4py.sklearn.ensemble import RandomForestClassifier, RandomForestRegressor diff --git a/sklearnex/ensemble/tests/test_forest.py b/sklearnex/ensemble/tests/test_forest.py index 6437496aae..309b3658a1 100644 --- a/sklearnex/ensemble/tests/test_forest.py +++ b/sklearnex/ensemble/tests/test_forest.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,34 +13,42 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
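The `sklearnex/dispatcher.py` hunks above reorder imports and reflow the preview mapping without altering the public patching workflow: `get_patch_map` assembles the estimator mapping, `patch_sklearn` swaps stock estimators for the accelerated ones, and `sklearn_is_patched` reports the current state. Sketched usage of that workflow, assuming these helpers are exported from the top-level `sklearnex` package:

```python
from sklearnex import patch_sklearn, sklearn_is_patched, unpatch_sklearn

patch_sklearn()              # swap in accelerated estimators from the patch map
print(sklearn_is_patched())  # reports whether the mapping is currently applied

# imports made after patching resolve to the accelerated implementations
from sklearn.ensemble import RandomForestClassifier

unpatch_sklearn()            # restore stock scikit-learn
```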
-#=============================================================================== +# =============================================================================== import numpy as np from numpy.testing import assert_allclose from sklearn.datasets import make_classification, make_regression + from daal4py.sklearn._utils import daal_check_version def test_sklearnex_import_rf_classifier(): from sklearnex.ensemble import RandomForestClassifier - X, y = make_classification(n_samples=1000, n_features=4, - n_informative=2, n_redundant=0, - random_state=0, shuffle=False) + + X, y = make_classification( + n_samples=1000, + n_features=4, + n_informative=2, + n_redundant=0, + random_state=0, + shuffle=False, + ) rf = RandomForestClassifier(max_depth=2, random_state=0).fit(X, y) - assert 'daal4py' in rf.__module__ + assert "daal4py" in rf.__module__ assert_allclose([1], rf.predict([[0, 0, 0, 0]])) def test_sklearnex_import_rf_regression(): from sklearnex.ensemble import RandomForestRegressor - X, y = make_regression(n_features=4, n_informative=2, - random_state=0, shuffle=False) + + X, y = make_regression(n_features=4, n_informative=2, random_state=0, shuffle=False) rf = RandomForestRegressor(max_depth=2, random_state=0).fit(X, y) - assert 'daal4py' in rf.__module__ + assert "daal4py" in rf.__module__ pred = rf.predict([[0, 0, 0, 0]]) - if daal_check_version((2021, 'P', 400)): + if daal_check_version((2021, "P", 400)): # random engine work was changed in sklearnex 2023.1 - assert np.allclose([-6.97], pred, atol=1e-2) \ - or np.allclose([-8.36], pred, atol=1e-2) + assert np.allclose([-6.97], pred, atol=1e-2) or np.allclose( + [-8.36], pred, atol=1e-2 + ) else: assert_allclose([-6.66], pred, atol=1e-2) diff --git a/sklearnex/glob/__main__.py b/sklearnex/glob/__main__.py index 7712a587d6..de51b784e3 100755 --- a/sklearnex/glob/__main__.py +++ b/sklearnex/glob/__main__.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,10 +13,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== -from sklearnex import patch_sklearn -from sklearnex import unpatch_sklearn +from sklearnex import patch_sklearn, unpatch_sklearn def _main(): @@ -34,15 +33,29 @@ def __call__(self, parser, namespace, values, option_string=None): description=""" Patch all your Scikit-learn applications using Intel(R) Extension for scikit-learn.""", - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - - parser.register('action', 'extend', ExtendAction) - parser.add_argument('action', choices=["patch_sklearn", "unpatch_sklearn"], - help="Enable or Disable patching") - parser.add_argument('--no-verbose', '-nv', action='store_false', - help="Disable additional information about enabling patching") - parser.add_argument('--algorithm', '-a', action='extend', type=str, nargs="+", - help="The name of an algorithm to be patched globally") + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + + parser.register("action", "extend", ExtendAction) + parser.add_argument( + "action", + choices=["patch_sklearn", "unpatch_sklearn"], + help="Enable or Disable patching", + ) + parser.add_argument( + "--no-verbose", + "-nv", + action="store_false", + help="Disable additional information about enabling patching", + ) + parser.add_argument( + "--algorithm", + "-a", + action="extend", + type=str, + nargs="+", + help="The name of an algorithm to be patched globally", + ) args = parser.parse_args() if args.action == "patch_sklearn": @@ -50,9 +63,11 @@ def __call__(self, parser, namespace, values, option_string=None): elif args.action == "unpatch_sklearn": unpatch_sklearn(global_unpatch=True) else: - raise RuntimeError("Invalid choice for the action attribute." - " Expected: patch_sklearn or unpatch_sklearn." - f" Got {args.action}") + raise RuntimeError( + "Invalid choice for the action attribute." + " Expected: patch_sklearn or unpatch_sklearn." + f" Got {args.action}" + ) _main() diff --git a/sklearnex/glob/dispatcher.py b/sklearnex/glob/dispatcher.py index 7633832921..631e51c907 100755 --- a/sklearnex/glob/dispatcher.py +++ b/sklearnex/glob/dispatcher.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,7 +13,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== + def get_patch_str(name=None, verbose=True): return f"""try: @@ -36,46 +37,52 @@ def get_patch_str_re(): def patch_sklearn_global(name=None, verbose=True): import os import re + try: import sklearn except ImportError: raise ImportError("Scikit-learn could not be imported. 
Nothing to patch\n") init_file_path = sklearn.__file__ - distributor_file_path = os.path.join(os.path.dirname(init_file_path), - "_distributor_init.py") + distributor_file_path = os.path.join( + os.path.dirname(init_file_path), "_distributor_init.py" + ) - with open(distributor_file_path, 'r', encoding='utf-8') as distributor_file: + with open(distributor_file_path, "r", encoding="utf-8") as distributor_file: lines = distributor_file.read() if re.search(get_patch_str_re(), lines): - lines = re.sub(get_patch_str_re(), '', lines) + lines = re.sub(get_patch_str_re(), "", lines) - with open(distributor_file_path, 'w', encoding='utf-8') as distributor_file: + with open(distributor_file_path, "w", encoding="utf-8") as distributor_file: distributor_file.write(lines + "\n" + get_patch_str(name, verbose) + "\n") - print("Scikit-learn was successfully globally patched" - " by Intel(R) Extension for Scikit-learn") + print( + "Scikit-learn was successfully globally patched" + " by Intel(R) Extension for Scikit-learn" + ) return def unpatch_sklearn_global(): import os import re + try: import sklearn except ImportError: raise ImportError("Scikit-learn could not be imported. Nothing to unpatch\n") init_file_path = sklearn.__file__ - distributor_file_path = os.path.join(os.path.dirname(init_file_path), - "_distributor_init.py") + distributor_file_path = os.path.join( + os.path.dirname(init_file_path), "_distributor_init.py" + ) - with open(distributor_file_path, 'r', encoding='utf-8') as distributor_file: + with open(distributor_file_path, "r", encoding="utf-8") as distributor_file: lines = distributor_file.read() if not re.search(get_patch_str_re(), lines): print("Nothing to unpatch: Scikit-learn is not patched\n") return - lines = re.sub(get_patch_str_re(), '', lines) + lines = re.sub(get_patch_str_re(), "", lines) - with open(distributor_file_path, 'w', encoding='utf-8') as distributor_file: + with open(distributor_file_path, "w", encoding="utf-8") as distributor_file: distributor_file.write(lines) print("Scikit-learn was successfully globally unpatched") diff --git a/sklearnex/linear_model/__init__.py b/sklearnex/linear_model/__init__.py index d04e9cb3cf..012522ca82 100755 --- a/sklearnex/linear_model/__init__.py +++ b/sklearnex/linear_model/__init__.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,18 +13,18 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
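`patch_sklearn_global` and `unpatch_sklearn_global` in the `sklearnex/glob/dispatcher.py` hunk above persist patching by appending the snippet from `get_patch_str` to scikit-learn's `_distributor_init.py`, so the patch survives new interpreter sessions until it is removed again. Both forms below are hedged sketches: the keyword arguments come from the signatures shown in the diff, while the exact module invocation is an assumption based on the `glob/__main__.py` entry point.

```python
from sklearnex import patch_sklearn, unpatch_sklearn

patch_sklearn(global_patch=True)      # write the patch into _distributor_init.py
unpatch_sklearn(global_unpatch=True)  # remove it again

# Command-line equivalent via the __main__ module parsed above
# (supports --algorithm/-a and --no-verbose/-nv):
#   python -m sklearnex.glob patch_sklearn
#   python -m sklearnex.glob unpatch_sklearn
```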
-#=============================================================================== +# =============================================================================== +from .coordinate_descent import ElasticNet, Lasso from .linear import LinearRegression -from .logistic_path import logistic_regression_path, LogisticRegression +from .logistic_path import LogisticRegression, logistic_regression_path from .ridge import Ridge -from .coordinate_descent import ElasticNet, Lasso __all__ = [ - 'Ridge', - 'LinearRegression', - 'LogisticRegression', - 'logistic_regression_path', - 'ElasticNet', - 'Lasso' + "Ridge", + "LinearRegression", + "LogisticRegression", + "logistic_regression_path", + "ElasticNet", + "Lasso", ] diff --git a/sklearnex/linear_model/coordinate_descent.py b/sklearnex/linear_model/coordinate_descent.py index efc8e72e20..731de3dc09 100644 --- a/sklearnex/linear_model/coordinate_descent.py +++ b/sklearnex/linear_model/coordinate_descent.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,6 +13,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from daal4py.sklearn.linear_model import ElasticNet, Lasso diff --git a/sklearnex/linear_model/logistic_path.py b/sklearnex/linear_model/logistic_path.py index ee852748f6..b9274f76f5 100644 --- a/sklearnex/linear_model/logistic_path.py +++ b/sklearnex/linear_model/logistic_path.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,6 +13,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== -from daal4py.sklearn.linear_model import logistic_regression_path, LogisticRegression +from daal4py.sklearn.linear_model import LogisticRegression, logistic_regression_path diff --git a/sklearnex/linear_model/ridge.py b/sklearnex/linear_model/ridge.py index 81a83c5f98..6c00cee3a3 100644 --- a/sklearnex/linear_model/ridge.py +++ b/sklearnex/linear_model/ridge.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,6 +13,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== from daal4py.sklearn.linear_model import Ridge diff --git a/sklearnex/linear_model/tests/test_linear.py b/sklearnex/linear_model/tests/test_linear.py index 7a1e3f52d8..3b8dd9d3ab 100755 --- a/sklearnex/linear_model/tests/test_linear.py +++ b/sklearnex/linear_model/tests/test_linear.py @@ -18,44 +18,49 @@ import numpy as np from numpy.testing import assert_allclose from sklearn.datasets import make_regression + from daal4py.sklearn._utils import daal_check_version def test_sklearnex_import_linear(): from sklearnex.linear_model import LinearRegression + X = np.array([[1, 1], [1, 2], [2, 2], [2, 3]]) y = np.dot(X, np.array([1, 2])) + 3 linreg = LinearRegression().fit(X, y) - assert 'daal4py' in linreg.__module__ + assert "daal4py" in linreg.__module__ assert linreg.n_features_in_ == 2 - assert_allclose(linreg.intercept_, 3.) - assert_allclose(linreg.coef_, [1., 2.]) + assert_allclose(linreg.intercept_, 3.0) + assert_allclose(linreg.coef_, [1.0, 2.0]) def test_sklearnex_import_ridge(): from sklearnex.linear_model import Ridge + X = np.array([[1, 1], [1, 2], [2, 2], [2, 3]]) y = np.dot(X, np.array([1, 2])) + 3 ridgereg = Ridge().fit(X, y) - assert 'daal4py' in ridgereg.__module__ + assert "daal4py" in ridgereg.__module__ assert_allclose(ridgereg.intercept_, 4.5) assert_allclose(ridgereg.coef_, [0.8, 1.4]) def test_sklearnex_import_lasso(): from sklearnex.linear_model import Lasso + X = [[0, 0], [1, 1], [2, 2]] y = [0, 1, 2] lasso = Lasso(alpha=0.1).fit(X, y) - assert 'daal4py' in lasso.__module__ + assert "daal4py" in lasso.__module__ assert_allclose(lasso.intercept_, 0.15) assert_allclose(lasso.coef_, [0.85, 0.0]) def test_sklearnex_import_elastic(): from sklearnex.linear_model import ElasticNet + X, y = make_regression(n_features=2, random_state=0) elasticnet = ElasticNet(random_state=0).fit(X, y) - assert 'daal4py' in elasticnet.__module__ + assert "daal4py" in elasticnet.__module__ assert_allclose(elasticnet.intercept_, 1.451, atol=1e-3) assert_allclose(elasticnet.coef_, [18.838, 64.559], atol=1e-3) diff --git a/sklearnex/linear_model/tests/test_logreg.py b/sklearnex/linear_model/tests/test_logreg.py index 35489b0eff..c361a09a48 100755 --- a/sklearnex/linear_model/tests/test_logreg.py +++ b/sklearnex/linear_model/tests/test_logreg.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,7 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== import numpy as np from numpy.testing import assert_allclose @@ -22,7 +22,8 @@ def test_sklearnex_import(): from sklearnex.linear_model import LogisticRegression + X, y = load_iris(return_X_y=True) logreg = LogisticRegression(random_state=0, max_iter=200).fit(X, y) - assert 'daal4py' in logreg.__module__ + assert "daal4py" in logreg.__module__ assert_allclose(logreg.score(X, y), 0.9733, atol=1e-3) diff --git a/sklearnex/manifold/__init__.py b/sklearnex/manifold/__init__.py index 6310727ed0..9c9fda72f6 100755 --- a/sklearnex/manifold/__init__.py +++ b/sklearnex/manifold/__init__.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,8 +13,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from .t_sne import TSNE -__all__ = ['TSNE'] +__all__ = ["TSNE"] diff --git a/sklearnex/manifold/t_sne.py b/sklearnex/manifold/t_sne.py index 000e1406d6..bb1b72f48c 100755 --- a/sklearnex/manifold/t_sne.py +++ b/sklearnex/manifold/t_sne.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,6 +13,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from daal4py.sklearn.manifold import TSNE diff --git a/sklearnex/manifold/tests/test_tsne.py b/sklearnex/manifold/tests/test_tsne.py index 159cebeba0..a5e5027d97 100755 --- a/sklearnex/manifold/tests/test_tsne.py +++ b/sklearnex/manifold/tests/test_tsne.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,7 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== import numpy as np from numpy.testing import assert_allclose @@ -21,6 +21,7 @@ def test_sklearnex_import(): from sklearnex.manifold import TSNE + X = np.array([[0, 0, 0], [0, 1, 1], [1, 0, 1], [1, 1, 1]]) tsne = TSNE(n_components=2, perplexity=2.0).fit(X) - assert 'daal4py' in tsne.__module__ + assert "daal4py" in tsne.__module__ diff --git a/sklearnex/metrics/__init__.py b/sklearnex/metrics/__init__.py index 1d3a5b8021..37724a8557 100755 --- a/sklearnex/metrics/__init__.py +++ b/sklearnex/metrics/__init__.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,12 +13,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== -from .ranking import roc_auc_score from .pairwise import pairwise_distances +from .ranking import roc_auc_score __all__ = [ - 'roc_auc_score', - 'pairwise_distances', + "roc_auc_score", + "pairwise_distances", ] diff --git a/sklearnex/metrics/pairwise.py b/sklearnex/metrics/pairwise.py index 25f32b5d83..938bad4dd4 100755 --- a/sklearnex/metrics/pairwise.py +++ b/sklearnex/metrics/pairwise.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,6 +13,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from daal4py.sklearn.metrics import pairwise_distances diff --git a/sklearnex/metrics/ranking.py b/sklearnex/metrics/ranking.py index 8982be4e69..14762dd3ef 100755 --- a/sklearnex/metrics/ranking.py +++ b/sklearnex/metrics/ranking.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,6 +13,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== from daal4py.sklearn.metrics import roc_auc_score diff --git a/sklearnex/metrics/tests/test_metrics.py b/sklearnex/metrics/tests/test_metrics.py index cf1d6bda93..85ac15cb3a 100755 --- a/sklearnex/metrics/tests/test_metrics.py +++ b/sklearnex/metrics/tests/test_metrics.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,7 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== import numpy as np from numpy.testing import assert_allclose @@ -21,18 +21,20 @@ def test_sklearnex_import_roc_auc(): - from sklearnex.metrics import roc_auc_score from sklearnex.linear_model import LogisticRegression + from sklearnex.metrics import roc_auc_score + X, y = load_breast_cancer(return_X_y=True) - clf = LogisticRegression(solver='liblinear', random_state=0).fit(X, y) + clf = LogisticRegression(solver="liblinear", random_state=0).fit(X, y) res = roc_auc_score(y, clf.decision_function(X)) assert_allclose(res, 0.99, atol=1e-2) def test_sklearnex_import_pairwise_distances(): from sklearnex.metrics import pairwise_distances + rng = np.random.RandomState(0) x = np.abs(rng.rand(4), dtype=np.float64) x = np.vstack([x, x]) - res = pairwise_distances(x, metric='cosine') - assert_allclose(res, [[0., 0.], [0., 0.]], atol=1e-2) + res = pairwise_distances(x, metric="cosine") + assert_allclose(res, [[0.0, 0.0], [0.0, 0.0]], atol=1e-2) diff --git a/sklearnex/model_selection/__init__.py b/sklearnex/model_selection/__init__.py index b96bd0f4ab..99222cd7f1 100755 --- a/sklearnex/model_selection/__init__.py +++ b/sklearnex/model_selection/__init__.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,10 +13,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== from .split import train_test_split __all__ = [ - 'train_test_split', + "train_test_split", ] diff --git a/sklearnex/model_selection/split.py b/sklearnex/model_selection/split.py index d2278382f2..cd00f112ab 100755 --- a/sklearnex/model_selection/split.py +++ b/sklearnex/model_selection/split.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,6 +13,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from daal4py.sklearn.model_selection import train_test_split diff --git a/sklearnex/model_selection/tests/test_model_selection.py b/sklearnex/model_selection/tests/test_model_selection.py index 1e12c53461..78af6b12e0 100755 --- a/sklearnex/model_selection/tests/test_model_selection.py +++ b/sklearnex/model_selection/tests/test_model_selection.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,7 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== import numpy as np from numpy.testing import assert_allclose @@ -21,10 +21,11 @@ def test_sklearnex_import_train_test_split(): from sklearnex.model_selection import train_test_split + X = np.arange(100).reshape((10, 10)) y = np.arange(10) - split = train_test_split(X, y, test_size=None, train_size=.5) + split = train_test_split(X, y, test_size=None, train_size=0.5) X_train, X_test, y_train, y_test = split assert len(y_test) == len(y_train) diff --git a/sklearnex/neighbors/__init__.py b/sklearnex/neighbors/__init__.py index c84dcd238e..1f9d31c88e 100755 --- a/sklearnex/neighbors/__init__.py +++ b/sklearnex/neighbors/__init__.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,12 +13,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== from .knn_classification import KNeighborsClassifier -from .knn_unsupervised import NearestNeighbors from .knn_regression import KNeighborsRegressor +from .knn_unsupervised import NearestNeighbors from .lof import LocalOutlierFactor -__all__ = ['KNeighborsClassifier', 'KNeighborsRegressor', 'LocalOutlierFactor', - 'NearestNeighbors'] +__all__ = [ + "KNeighborsClassifier", + "KNeighborsRegressor", + "LocalOutlierFactor", + "NearestNeighbors", +] diff --git a/sklearnex/neighbors/common.py b/sklearnex/neighbors/common.py index 81178d555d..e12056d56f 100644 --- a/sklearnex/neighbors/common.py +++ b/sklearnex/neighbors/common.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,20 +13,20 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== -from daal4py.sklearn._utils import PatchingConditionsChain, sklearn_check_version -from onedal.datatypes import _check_array, _num_features, _num_samples +import warnings import numpy as np from scipy import sparse as sp -import warnings - +from sklearn.neighbors._ball_tree import BallTree from sklearn.neighbors._base import VALID_METRICS from sklearn.neighbors._base import NeighborsBase as sklearn_NeighborsBase -from sklearn.neighbors._ball_tree import BallTree from sklearn.neighbors._kd_tree import KDTree +from daal4py.sklearn._utils import PatchingConditionsChain, sklearn_check_version +from onedal.utils import _check_array, _num_features, _num_samples + class KNeighborsDispatchingBase: def _fit_validation(self, X, y=None): @@ -34,11 +34,15 @@ def _fit_validation(self, X, y=None): self._validate_params() if sklearn_check_version("1.0"): self._check_feature_names(X, reset=True) - if self.metric_params is not None and 'p' in self.metric_params: + if self.metric_params is not None and "p" in self.metric_params: if self.p is not None: - warnings.warn("Parameter p is found in metric_params. " - "The corresponding parameter from __init__ " - "is ignored.", SyntaxWarning, stacklevel=2) + warnings.warn( + "Parameter p is found in metric_params. 
" + "The corresponding parameter from __init__ " + "is ignored.", + SyntaxWarning, + stacklevel=2, + ) self.effective_metric_params_ = self.metric_params.copy() effective_p = self.metric_params["p"] else: @@ -59,31 +63,35 @@ def _fit_validation(self, X, y=None): if not isinstance(X, (KDTree, BallTree, sklearn_NeighborsBase)): self._fit_X = _check_array( - X, dtype=[np.float64, np.float32], accept_sparse=True) + X, dtype=[np.float64, np.float32], accept_sparse=True + ) self.n_samples_fit_ = _num_samples(self._fit_X) self.n_features_in_ = _num_features(self._fit_X) if self.algorithm == "auto": # A tree approach is better for small number of neighbors or small # number of features, with KDTree generally faster when available - is_n_neighbors_valid_for_brute = self.n_neighbors is not None and \ - self.n_neighbors >= self._fit_X.shape[0] // 2 + is_n_neighbors_valid_for_brute = ( + self.n_neighbors is not None + and self.n_neighbors >= self._fit_X.shape[0] // 2 + ) if self._fit_X.shape[1] > 15 or is_n_neighbors_valid_for_brute: self._fit_method = "brute" else: if self.effective_metric_ in VALID_METRICS["kd_tree"]: self._fit_method = "kd_tree" - elif callable(self.effective_metric_) or \ - self.effective_metric_ in \ - VALID_METRICS["ball_tree"]: + elif ( + callable(self.effective_metric_) + or self.effective_metric_ in VALID_METRICS["ball_tree"] + ): self._fit_method = "ball_tree" else: self._fit_method = "brute" else: self._fit_method = self.algorithm - if hasattr(self, '_onedal_estimator'): - delattr(self, '_onedal_estimator') + if hasattr(self, "_onedal_estimator"): + delattr(self, "_onedal_estimator") # To cover test case when we pass patched # estimator as an input for other estimator if isinstance(X, sklearn_NeighborsBase): @@ -92,8 +100,8 @@ def _fit_validation(self, X, y=None): self._fit_method = X._fit_method self.n_samples_fit_ = X.n_samples_fit_ self.n_features_in_ = X.n_features_in_ - if hasattr(X, '_onedal_estimator'): - self.effective_metric_params_.pop('p') + if hasattr(X, "_onedal_estimator"): + self.effective_metric_params_.pop("p") if self._fit_method == "ball_tree": X._tree = BallTree( X._fit_X, @@ -116,58 +124,63 @@ def _fit_validation(self, X, y=None): elif isinstance(X, BallTree): self._fit_X = X.data self._tree = X - self._fit_method = 'ball_tree' + self._fit_method = "ball_tree" self.n_samples_fit_ = X.data.shape[0] self.n_features_in_ = X.data.shape[1] elif isinstance(X, KDTree): self._fit_X = X.data self._tree = X - self._fit_method = 'kd_tree' + self._fit_method = "kd_tree" self.n_samples_fit_ = X.data.shape[0] self.n_features_in_ = X.data.shape[1] def _onedal_supported(self, device, method_name, *data): class_name = self.__class__.__name__ - is_classifier = 'Classifier' in class_name - is_regressor = 'Regressor' in class_name + is_classifier = "Classifier" in class_name + is_regressor = "Regressor" in class_name is_unsupervised = not (is_classifier or is_regressor) patching_status = PatchingConditionsChain( - f'sklearn.neighbors.{class_name}.{method_name}') + f"sklearn.neighbors.{class_name}.{method_name}" + ) if not patching_status.and_condition( not isinstance(data[0], (KDTree, BallTree, sklearn_NeighborsBase)), - f'Input type {type(data[0])} is not supported.' 
+ f"Input type {type(data[0])} is not supported.", ): return patching_status.get_status(logs=True) - if self._fit_method in ['auto', 'ball_tree']: - condition = self.n_neighbors is not None and \ - self.n_neighbors >= self.n_samples_fit_ // 2 + if self._fit_method in ["auto", "ball_tree"]: + condition = ( + self.n_neighbors is not None + and self.n_neighbors >= self.n_samples_fit_ // 2 + ) if self.n_features_in_ > 15 or condition: - result_method = 'brute' + result_method = "brute" else: - if self.effective_metric_ in ['euclidean']: - result_method = 'kd_tree' + if self.effective_metric_ in ["euclidean"]: + result_method = "kd_tree" else: - result_method = 'brute' + result_method = "brute" else: result_method = self._fit_method - p_less_than_one = "p" in self.effective_metric_params_.keys() and \ - self.effective_metric_params_["p"] < 1 + p_less_than_one = ( + "p" in self.effective_metric_params_.keys() + and self.effective_metric_params_["p"] < 1 + ) if not patching_status.and_condition( not p_less_than_one, '"p" metric parameter is less than 1' ): return patching_status.get_status(logs=True) if not patching_status.and_condition( - not sp.isspmatrix(data[0]), 'Sparse input is not supported.' + not sp.isspmatrix(data[0]), "Sparse input is not supported." ): return patching_status.get_status(logs=True) if not is_unsupervised: - is_valid_weights = self.weights in ['uniform', "distance"] + is_valid_weights = self.weights in ["uniform", "distance"] if is_classifier: class_count = 1 is_single_output = False @@ -177,65 +190,73 @@ def _onedal_supported(self, device, method_name, *data): y = np.asarray(data[1]) if is_classifier: class_count = len(np.unique(y)) - if hasattr(self, '_onedal_estimator'): + if hasattr(self, "_onedal_estimator"): y = self._onedal_estimator._y - if y is not None and hasattr(y, 'ndim') and hasattr(y, 'shape'): + if y is not None and hasattr(y, "ndim") and hasattr(y, "shape"): is_single_output = y.ndim == 1 or y.ndim == 2 and y.shape[1] == 1 # TODO: add native support for these metric names - metrics_map = { - 'manhattan': ['l1', 'cityblock'], - 'euclidean': ['l2'] - } + metrics_map = {"manhattan": ["l1", "cityblock"], "euclidean": ["l2"]} for origin, aliases in metrics_map.items(): if self.effective_metric_ in aliases: self.effective_metric_ = origin break - if self.effective_metric_ == 'manhattan': - self.effective_metric_params_['p'] = 1 - elif self.effective_metric_ == 'euclidean': - self.effective_metric_params_['p'] = 2 + if self.effective_metric_ == "manhattan": + self.effective_metric_params_["p"] = 1 + elif self.effective_metric_ == "euclidean": + self.effective_metric_params_["p"] = 2 onedal_brute_metrics = [ - 'manhattan', 'minkowski', 'euclidean', 'chebyshev', 'cosine'] - onedal_kdtree_metrics = ['euclidean'] - is_valid_for_brute = result_method == 'brute' and \ - self.effective_metric_ in onedal_brute_metrics - is_valid_for_kd_tree = result_method == 'kd_tree' and \ - self.effective_metric_ in onedal_kdtree_metrics - if result_method == 'kd_tree': + "manhattan", + "minkowski", + "euclidean", + "chebyshev", + "cosine", + ] + onedal_kdtree_metrics = ["euclidean"] + is_valid_for_brute = ( + result_method == "brute" and self.effective_metric_ in onedal_brute_metrics + ) + is_valid_for_kd_tree = ( + result_method == "kd_tree" and self.effective_metric_ in onedal_kdtree_metrics + ) + if result_method == "kd_tree": if not patching_status.and_condition( - device != 'gpu', '"kd_tree" method is not supported on GPU.' 
+ device != "gpu", '"kd_tree" method is not supported on GPU.' ): return patching_status.get_status(logs=True) if not patching_status.and_condition( is_valid_for_kd_tree or is_valid_for_brute, - f'{result_method} with {self.effective_metric_} metric is not supported.' + f"{result_method} with {self.effective_metric_} metric is not supported.", ): return patching_status.get_status(logs=True) if not is_unsupervised: - if not patching_status.and_conditions([ - (is_single_output, 'Only single output is supported.'), - (is_valid_weights, - f'"{type(self.weights)}" weights type is not supported.') - ]): + if not patching_status.and_conditions( + [ + (is_single_output, "Only single output is supported."), + ( + is_valid_weights, + f'"{type(self.weights)}" weights type is not supported.', + ), + ] + ): return patching_status.get_status(logs=True) - if method_name == 'fit': + if method_name == "fit": if is_classifier: patching_status.and_condition( - class_count >= 2, 'One-class case is not supported.' + class_count >= 2, "One-class case is not supported." ) return patching_status.get_status(logs=True) - if method_name in ['predict', 'predict_proba', 'kneighbors']: + if method_name in ["predict", "predict_proba", "kneighbors"]: patching_status.and_condition( - hasattr(self, '_onedal_estimator'), 'oneDAL model was not trained.' + hasattr(self, "_onedal_estimator"), "oneDAL model was not trained." ) return patching_status.get_status(logs=True) - raise RuntimeError(f'Unknown method {method_name} in {class_name}') + raise RuntimeError(f"Unknown method {method_name} in {class_name}") def _onedal_gpu_supported(self, method_name, *data): - return self._onedal_supported('gpu', method_name, *data) + return self._onedal_supported("gpu", method_name, *data) def _onedal_cpu_supported(self, method_name, *data): - return self._onedal_supported('cpu', method_name, *data) + return self._onedal_supported("cpu", method_name, *data) diff --git a/sklearnex/neighbors/knn_classification.py b/sklearnex/neighbors/knn_classification.py index de4d81a09a..423345ed1e 100755 --- a/sklearnex/neighbors/knn_classification.py +++ b/sklearnex/neighbors/knn_classification.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,125 +13,203 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
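`KNeighborsDispatchingBase._onedal_supported` above decides per call whether oneDAL can take the input (dense data, a supported metric, Minkowski `p >= 1`, single output, a trained oneDAL model for prediction) or whether execution falls back to stock scikit-learn. One rough way to observe that decision is the `SKLEARNEX_VERBOSE` switch read by `set_sklearn_ex_verbose`; the snippet below is an illustration under those assumptions, and the exact log text may differ.

```python
import os

# must be set before sklearnex is imported; read by set_sklearn_ex_verbose above
os.environ["SKLEARNEX_VERBOSE"] = "INFO"

import numpy as np
from scipy import sparse

from sklearnex.neighbors import KNeighborsClassifier

X = np.random.rand(200, 4)
y = (X[:, 0] > 0.5).astype(int)

# dense input with a supported metric: expected to pass the conditions above
KNeighborsClassifier(n_neighbors=5).fit(X, y).predict(X[:5])

# sparse input trips the "Sparse input is not supported." condition,
# so this call is expected to fall back to stock scikit-learn
KNeighborsClassifier(n_neighbors=5).fit(sparse.csr_matrix(X), y).predict(X[:5])
```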
-#=============================================================================== +# =============================================================================== -from daal4py.sklearn._utils import sklearn_check_version import warnings -from sklearn.neighbors._base import NeighborsBase as sklearn_NeighborsBase from sklearn.neighbors._ball_tree import BallTree +from sklearn.neighbors._base import NeighborsBase as sklearn_NeighborsBase from sklearn.neighbors._kd_tree import KDTree -if not sklearn_check_version('1.2'): + +from daal4py.sklearn._utils import sklearn_check_version + +if not sklearn_check_version("1.2"): from sklearn.neighbors._base import _check_weights + +import numpy as np from sklearn.neighbors._base import VALID_METRICS -from sklearn.neighbors._classification import KNeighborsClassifier as \ - sklearn_KNeighborsClassifier -from sklearn.neighbors._unsupervised import NearestNeighbors as \ - sklearn_NearestNeighbors +from sklearn.neighbors._classification import ( + KNeighborsClassifier as sklearn_KNeighborsClassifier, +) +from sklearn.neighbors._unsupervised import NearestNeighbors as sklearn_NearestNeighbors from sklearn.utils.validation import _deprecate_positional_args, check_is_fitted -from onedal.datatypes import _check_array, _num_features, _num_samples from onedal.neighbors import KNeighborsClassifier as onedal_KNeighborsClassifier +from onedal.utils import _check_array, _num_features, _num_samples -from .common import KNeighborsDispatchingBase from .._device_offload import dispatch, wrap_output_data -import numpy as np - +from .common import KNeighborsDispatchingBase if sklearn_check_version("0.24"): + class KNeighborsClassifier_(sklearn_KNeighborsClassifier): - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): _parameter_constraints: dict = { - **sklearn_KNeighborsClassifier._parameter_constraints} + **sklearn_KNeighborsClassifier._parameter_constraints + } @_deprecate_positional_args - def __init__(self, n_neighbors=5, *, - weights='uniform', algorithm='auto', leaf_size=30, - p=2, metric='minkowski', metric_params=None, n_jobs=None, - **kwargs): + def __init__( + self, + n_neighbors=5, + *, + weights="uniform", + algorithm="auto", + leaf_size=30, + p=2, + metric="minkowski", + metric_params=None, + n_jobs=None, + **kwargs, + ): super().__init__( n_neighbors=n_neighbors, algorithm=algorithm, - leaf_size=leaf_size, metric=metric, p=p, + leaf_size=leaf_size, + metric=metric, + p=p, metric_params=metric_params, - n_jobs=n_jobs, **kwargs) - self.weights = \ - weights if sklearn_check_version("1.0") \ - else _check_weights(weights) + n_jobs=n_jobs, + **kwargs, + ) + self.weights = ( + weights if sklearn_check_version("1.0") else _check_weights(weights) + ) + elif sklearn_check_version("0.22"): - from sklearn.neighbors._base import SupervisedIntegerMixin as \ - BaseSupervisedIntegerMixin + from sklearn.neighbors._base import ( + SupervisedIntegerMixin as BaseSupervisedIntegerMixin, + ) - class KNeighborsClassifier_(sklearn_KNeighborsClassifier, - BaseSupervisedIntegerMixin): + class KNeighborsClassifier_(sklearn_KNeighborsClassifier, BaseSupervisedIntegerMixin): @_deprecate_positional_args - def __init__(self, n_neighbors=5, *, - weights='uniform', algorithm='auto', leaf_size=30, - p=2, metric='minkowski', metric_params=None, n_jobs=None, - **kwargs): + def __init__( + self, + n_neighbors=5, + *, + weights="uniform", + algorithm="auto", + leaf_size=30, + p=2, + metric="minkowski", + metric_params=None, + n_jobs=None, + **kwargs, + ): super().__init__( 
n_neighbors=n_neighbors, algorithm=algorithm, - leaf_size=leaf_size, metric=metric, p=p, + leaf_size=leaf_size, + metric=metric, + p=p, metric_params=metric_params, - n_jobs=n_jobs, **kwargs) + n_jobs=n_jobs, + **kwargs, + ) self.weights = _check_weights(weights) + else: - from sklearn.neighbors.base import SupervisedIntegerMixin as \ - BaseSupervisedIntegerMixin + from sklearn.neighbors.base import ( + SupervisedIntegerMixin as BaseSupervisedIntegerMixin, + ) - class KNeighborsClassifier_(sklearn_KNeighborsClassifier, - BaseSupervisedIntegerMixin): + class KNeighborsClassifier_(sklearn_KNeighborsClassifier, BaseSupervisedIntegerMixin): @_deprecate_positional_args - def __init__(self, n_neighbors=5, *, - weights='uniform', algorithm='auto', leaf_size=30, - p=2, metric='minkowski', metric_params=None, n_jobs=None, - **kwargs): + def __init__( + self, + n_neighbors=5, + *, + weights="uniform", + algorithm="auto", + leaf_size=30, + p=2, + metric="minkowski", + metric_params=None, + n_jobs=None, + **kwargs, + ): super().__init__( n_neighbors=n_neighbors, algorithm=algorithm, - leaf_size=leaf_size, metric=metric, p=p, + leaf_size=leaf_size, + metric=metric, + p=p, metric_params=metric_params, - n_jobs=n_jobs, **kwargs) + n_jobs=n_jobs, + **kwargs, + ) self.weights = _check_weights(weights) class KNeighborsClassifier(KNeighborsClassifier_, KNeighborsDispatchingBase): - if sklearn_check_version('1.2'): - _parameter_constraints: dict = { - **KNeighborsClassifier_._parameter_constraints} - - if sklearn_check_version('1.0'): - def __init__(self, n_neighbors=5, *, - weights='uniform', algorithm='auto', leaf_size=30, - p=2, metric='minkowski', metric_params=None, n_jobs=None): + if sklearn_check_version("1.2"): + _parameter_constraints: dict = {**KNeighborsClassifier_._parameter_constraints} + + if sklearn_check_version("1.0"): + + def __init__( + self, + n_neighbors=5, + *, + weights="uniform", + algorithm="auto", + leaf_size=30, + p=2, + metric="minkowski", + metric_params=None, + n_jobs=None, + ): super().__init__( n_neighbors=n_neighbors, weights=weights, algorithm=algorithm, - leaf_size=leaf_size, metric=metric, p=p, + leaf_size=leaf_size, + metric=metric, + p=p, metric_params=metric_params, - n_jobs=n_jobs) + n_jobs=n_jobs, + ) + else: + @_deprecate_positional_args - def __init__(self, n_neighbors=5, *, - weights='uniform', algorithm='auto', leaf_size=30, - p=2, metric='minkowski', metric_params=None, n_jobs=None, - **kwargs): + def __init__( + self, + n_neighbors=5, + *, + weights="uniform", + algorithm="auto", + leaf_size=30, + p=2, + metric="minkowski", + metric_params=None, + n_jobs=None, + **kwargs, + ): super().__init__( n_neighbors=n_neighbors, weights=weights, algorithm=algorithm, - leaf_size=leaf_size, metric=metric, p=p, + leaf_size=leaf_size, + metric=metric, + p=p, metric_params=metric_params, - n_jobs=n_jobs, **kwargs) + n_jobs=n_jobs, + **kwargs, + ) def fit(self, X, y): self._fit_validation(X, y) - dispatch(self, 'fit', { - 'onedal': self.__class__._onedal_fit, - 'sklearn': sklearn_KNeighborsClassifier.fit, - }, X, y) + dispatch( + self, + "fit", + { + "onedal": self.__class__._onedal_fit, + "sklearn": sklearn_KNeighborsClassifier.fit, + }, + X, + y, + ) return self @wrap_output_data @@ -139,58 +217,81 @@ def predict(self, X): check_is_fitted(self) if sklearn_check_version("1.0"): self._check_feature_names(X, reset=False) - return dispatch(self, 'predict', { - 'onedal': self.__class__._onedal_predict, - 'sklearn': sklearn_KNeighborsClassifier.predict, - }, X) + return dispatch( + 
self, + "predict", + { + "onedal": self.__class__._onedal_predict, + "sklearn": sklearn_KNeighborsClassifier.predict, + }, + X, + ) @wrap_output_data def predict_proba(self, X): check_is_fitted(self) if sklearn_check_version("1.0"): self._check_feature_names(X, reset=False) - return dispatch(self, 'predict_proba', { - 'onedal': self.__class__._onedal_predict_proba, - 'sklearn': sklearn_KNeighborsClassifier.predict_proba, - }, X) + return dispatch( + self, + "predict_proba", + { + "onedal": self.__class__._onedal_predict_proba, + "sklearn": sklearn_KNeighborsClassifier.predict_proba, + }, + X, + ) @wrap_output_data def kneighbors(self, X=None, n_neighbors=None, return_distance=True): check_is_fitted(self) if sklearn_check_version("1.0"): self._check_feature_names(X, reset=False) - return dispatch(self, 'kneighbors', { - 'onedal': self.__class__._onedal_kneighbors, - 'sklearn': sklearn_KNeighborsClassifier.kneighbors, - }, X, n_neighbors, return_distance) + return dispatch( + self, + "kneighbors", + { + "onedal": self.__class__._onedal_kneighbors, + "sklearn": sklearn_KNeighborsClassifier.kneighbors, + }, + X, + n_neighbors, + return_distance, + ) @wrap_output_data - def radius_neighbors(self, X=None, radius=None, return_distance=True, - sort_results=False): - _onedal_estimator = getattr(self, '_onedal_estimator', None) + def radius_neighbors( + self, X=None, radius=None, return_distance=True, sort_results=False + ): + _onedal_estimator = getattr(self, "_onedal_estimator", None) - if _onedal_estimator is not None or getattr(self, '_tree', 0) is None and \ - self._fit_method == 'kd_tree': + if ( + _onedal_estimator is not None + or getattr(self, "_tree", 0) is None + and self._fit_method == "kd_tree" + ): if sklearn_check_version("0.24"): - sklearn_NearestNeighbors.fit(self, self._fit_X, getattr(self, '_y', None)) + sklearn_NearestNeighbors.fit(self, self._fit_X, getattr(self, "_y", None)) else: sklearn_NearestNeighbors.fit(self, self._fit_X) if sklearn_check_version("0.22"): result = sklearn_NearestNeighbors.radius_neighbors( - self, X, radius, return_distance, sort_results) + self, X, radius, return_distance, sort_results + ) else: result = sklearn_NearestNeighbors.radius_neighbors( - self, X, radius, return_distance) + self, X, radius, return_distance + ) return result def _onedal_fit(self, X, y, queue=None): onedal_params = { - 'n_neighbors': self.n_neighbors, - 'weights': self.weights, - 'algorithm': self.algorithm, - 'metric': self.effective_metric_, - 'p': self.effective_metric_params_['p'], + "n_neighbors": self.n_neighbors, + "weights": self.weights, + "algorithm": self.algorithm, + "metric": self.effective_metric_, + "p": self.effective_metric_params_["p"], } try: @@ -212,10 +313,12 @@ def _onedal_predict(self, X, queue=None): def _onedal_predict_proba(self, X, queue=None): return self._onedal_estimator.predict_proba(X, queue=queue) - def _onedal_kneighbors(self, X=None, n_neighbors=None, - return_distance=True, queue=None): + def _onedal_kneighbors( + self, X=None, n_neighbors=None, return_distance=True, queue=None + ): return self._onedal_estimator.kneighbors( - X, n_neighbors, return_distance, queue=queue) + X, n_neighbors, return_distance, queue=queue + ) def _save_attributes(self): self.classes_ = self._onedal_estimator.classes_ diff --git a/sklearnex/neighbors/knn_regression.py b/sklearnex/neighbors/knn_regression.py index de4fc7070d..efd789f937 100755 --- a/sklearnex/neighbors/knn_regression.py +++ b/sklearnex/neighbors/knn_regression.py @@ -1,5 +1,5 @@ #!/usr/bin/env python 
-#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,125 +13,199 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== -from daal4py.sklearn._utils import sklearn_check_version import warnings -from sklearn.neighbors._base import NeighborsBase as sklearn_NeighborsBase from sklearn.neighbors._ball_tree import BallTree +from sklearn.neighbors._base import NeighborsBase as sklearn_NeighborsBase from sklearn.neighbors._kd_tree import KDTree -if not sklearn_check_version('1.2'): + +from daal4py.sklearn._utils import sklearn_check_version + +if not sklearn_check_version("1.2"): from sklearn.neighbors._base import _check_weights + +import numpy as np from sklearn.neighbors._base import VALID_METRICS -from sklearn.neighbors._regression import KNeighborsRegressor as \ - sklearn_KNeighborsRegressor -from sklearn.neighbors._unsupervised import NearestNeighbors as \ - sklearn_NearestNeighbors +from sklearn.neighbors._regression import ( + KNeighborsRegressor as sklearn_KNeighborsRegressor, +) +from sklearn.neighbors._unsupervised import NearestNeighbors as sklearn_NearestNeighbors from sklearn.utils.validation import _deprecate_positional_args, check_is_fitted -from onedal.datatypes import _check_array, _num_features, _num_samples from onedal.neighbors import KNeighborsRegressor as onedal_KNeighborsRegressor +from onedal.utils import _check_array, _num_features, _num_samples -from .common import KNeighborsDispatchingBase from .._device_offload import dispatch, wrap_output_data -import numpy as np - +from .common import KNeighborsDispatchingBase if sklearn_check_version("0.24"): + class KNeighborsRegressor_(sklearn_KNeighborsRegressor): - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): _parameter_constraints: dict = { - **sklearn_KNeighborsRegressor._parameter_constraints} + **sklearn_KNeighborsRegressor._parameter_constraints + } @_deprecate_positional_args - def __init__(self, n_neighbors=5, *, - weights='uniform', algorithm='auto', leaf_size=30, - p=2, metric='minkowski', metric_params=None, n_jobs=None, - **kwargs): + def __init__( + self, + n_neighbors=5, + *, + weights="uniform", + algorithm="auto", + leaf_size=30, + p=2, + metric="minkowski", + metric_params=None, + n_jobs=None, + **kwargs, + ): super().__init__( n_neighbors=n_neighbors, algorithm=algorithm, - leaf_size=leaf_size, metric=metric, p=p, + leaf_size=leaf_size, + metric=metric, + p=p, metric_params=metric_params, - n_jobs=n_jobs, **kwargs) - self.weights = \ - weights if sklearn_check_version("1.0") \ - else _check_weights(weights) + n_jobs=n_jobs, + **kwargs, + ) + self.weights = ( + weights if sklearn_check_version("1.0") else _check_weights(weights) + ) + elif sklearn_check_version("0.22"): - from sklearn.neighbors._base import SupervisedFloatMixin as \ - BaseSupervisedFloatMixin + from sklearn.neighbors._base import SupervisedFloatMixin as BaseSupervisedFloatMixin - class KNeighborsRegressor_(sklearn_KNeighborsRegressor, - BaseSupervisedFloatMixin): + class KNeighborsRegressor_(sklearn_KNeighborsRegressor, 
BaseSupervisedFloatMixin): @_deprecate_positional_args - def __init__(self, n_neighbors=5, *, - weights='uniform', algorithm='auto', leaf_size=30, - p=2, metric='minkowski', metric_params=None, n_jobs=None, - **kwargs): + def __init__( + self, + n_neighbors=5, + *, + weights="uniform", + algorithm="auto", + leaf_size=30, + p=2, + metric="minkowski", + metric_params=None, + n_jobs=None, + **kwargs, + ): super().__init__( n_neighbors=n_neighbors, algorithm=algorithm, - leaf_size=leaf_size, metric=metric, p=p, + leaf_size=leaf_size, + metric=metric, + p=p, metric_params=metric_params, - n_jobs=n_jobs, **kwargs) + n_jobs=n_jobs, + **kwargs, + ) self.weights = _check_weights(weights) + else: - from sklearn.neighbors.base import SupervisedFloatMixin as \ - BaseSupervisedFloatMixin + from sklearn.neighbors.base import SupervisedFloatMixin as BaseSupervisedFloatMixin - class KNeighborsRegressor_(sklearn_KNeighborsRegressor, - BaseSupervisedFloatMixin): + class KNeighborsRegressor_(sklearn_KNeighborsRegressor, BaseSupervisedFloatMixin): @_deprecate_positional_args - def __init__(self, n_neighbors=5, *, - weights='uniform', algorithm='auto', leaf_size=30, - p=2, metric='minkowski', metric_params=None, n_jobs=None, - **kwargs): + def __init__( + self, + n_neighbors=5, + *, + weights="uniform", + algorithm="auto", + leaf_size=30, + p=2, + metric="minkowski", + metric_params=None, + n_jobs=None, + **kwargs, + ): super().__init__( n_neighbors=n_neighbors, algorithm=algorithm, - leaf_size=leaf_size, metric=metric, p=p, + leaf_size=leaf_size, + metric=metric, + p=p, metric_params=metric_params, - n_jobs=n_jobs, **kwargs) + n_jobs=n_jobs, + **kwargs, + ) self.weights = _check_weights(weights) class KNeighborsRegressor(KNeighborsRegressor_, KNeighborsDispatchingBase): - if sklearn_check_version('1.2'): - _parameter_constraints: dict = { - **KNeighborsRegressor_._parameter_constraints} - - if sklearn_check_version('1.0'): - def __init__(self, n_neighbors=5, *, - weights='uniform', algorithm='auto', leaf_size=30, - p=2, metric='minkowski', metric_params=None, n_jobs=None): + if sklearn_check_version("1.2"): + _parameter_constraints: dict = {**KNeighborsRegressor_._parameter_constraints} + + if sklearn_check_version("1.0"): + + def __init__( + self, + n_neighbors=5, + *, + weights="uniform", + algorithm="auto", + leaf_size=30, + p=2, + metric="minkowski", + metric_params=None, + n_jobs=None, + ): super().__init__( n_neighbors=n_neighbors, weights=weights, algorithm=algorithm, - leaf_size=leaf_size, metric=metric, p=p, + leaf_size=leaf_size, + metric=metric, + p=p, metric_params=metric_params, - n_jobs=n_jobs) + n_jobs=n_jobs, + ) + else: + @_deprecate_positional_args - def __init__(self, n_neighbors=5, *, - weights='uniform', algorithm='auto', leaf_size=30, - p=2, metric='minkowski', metric_params=None, n_jobs=None, - **kwargs): + def __init__( + self, + n_neighbors=5, + *, + weights="uniform", + algorithm="auto", + leaf_size=30, + p=2, + metric="minkowski", + metric_params=None, + n_jobs=None, + **kwargs, + ): super().__init__( n_neighbors=n_neighbors, weights=weights, algorithm=algorithm, - leaf_size=leaf_size, metric=metric, p=p, + leaf_size=leaf_size, + metric=metric, + p=p, metric_params=metric_params, - n_jobs=n_jobs, **kwargs) + n_jobs=n_jobs, + **kwargs, + ) def fit(self, X, y): self._fit_validation(X, y) - dispatch(self, 'fit', { - 'onedal': self.__class__._onedal_fit, - 'sklearn': sklearn_KNeighborsRegressor.fit, - }, X, y) + dispatch( + self, + "fit", + { + "onedal": self.__class__._onedal_fit, + 
"sklearn": sklearn_KNeighborsRegressor.fit, + }, + X, + y, + ) return self @wrap_output_data @@ -139,48 +213,66 @@ def predict(self, X): check_is_fitted(self) if sklearn_check_version("1.0"): self._check_feature_names(X, reset=False) - return dispatch(self, 'predict', { - 'onedal': self.__class__._onedal_predict, - 'sklearn': sklearn_KNeighborsRegressor.predict, - }, X) + return dispatch( + self, + "predict", + { + "onedal": self.__class__._onedal_predict, + "sklearn": sklearn_KNeighborsRegressor.predict, + }, + X, + ) @wrap_output_data def kneighbors(self, X=None, n_neighbors=None, return_distance=True): check_is_fitted(self) if sklearn_check_version("1.0"): self._check_feature_names(X, reset=False) - return dispatch(self, 'kneighbors', { - 'onedal': self.__class__._onedal_kneighbors, - 'sklearn': sklearn_KNeighborsRegressor.kneighbors, - }, X, n_neighbors, return_distance) + return dispatch( + self, + "kneighbors", + { + "onedal": self.__class__._onedal_kneighbors, + "sklearn": sklearn_KNeighborsRegressor.kneighbors, + }, + X, + n_neighbors, + return_distance, + ) @wrap_output_data - def radius_neighbors(self, X=None, radius=None, return_distance=True, - sort_results=False): - _onedal_estimator = getattr(self, '_onedal_estimator', None) + def radius_neighbors( + self, X=None, radius=None, return_distance=True, sort_results=False + ): + _onedal_estimator = getattr(self, "_onedal_estimator", None) - if _onedal_estimator is not None or getattr(self, '_tree', 0) is None and \ - self._fit_method == 'kd_tree': + if ( + _onedal_estimator is not None + or getattr(self, "_tree", 0) is None + and self._fit_method == "kd_tree" + ): if sklearn_check_version("0.24"): - sklearn_NearestNeighbors.fit(self, self._fit_X, getattr(self, '_y', None)) + sklearn_NearestNeighbors.fit(self, self._fit_X, getattr(self, "_y", None)) else: sklearn_NearestNeighbors.fit(self, self._fit_X) if sklearn_check_version("0.22"): result = sklearn_NearestNeighbors.radius_neighbors( - self, X, radius, return_distance, sort_results) + self, X, radius, return_distance, sort_results + ) else: result = sklearn_NearestNeighbors.radius_neighbors( - self, X, radius, return_distance) + self, X, radius, return_distance + ) return result def _onedal_fit(self, X, y, queue=None): onedal_params = { - 'n_neighbors': self.n_neighbors, - 'weights': self.weights, - 'algorithm': self.algorithm, - 'metric': self.effective_metric_, - 'p': self.effective_metric_params_['p'], + "n_neighbors": self.n_neighbors, + "weights": self.weights, + "algorithm": self.algorithm, + "metric": self.effective_metric_, + "p": self.effective_metric_params_["p"], } try: @@ -199,10 +291,12 @@ def _onedal_fit(self, X, y, queue=None): def _onedal_predict(self, X, queue=None): return self._onedal_estimator.predict(X, queue=queue) - def _onedal_kneighbors(self, X=None, n_neighbors=None, - return_distance=True, queue=None): + def _onedal_kneighbors( + self, X=None, n_neighbors=None, return_distance=True, queue=None + ): return self._onedal_estimator.kneighbors( - X, n_neighbors, return_distance, queue=queue) + X, n_neighbors, return_distance, queue=queue + ) def _save_attributes(self): self.n_features_in_ = self._onedal_estimator.n_features_in_ diff --git a/sklearnex/neighbors/knn_unsupervised.py b/sklearnex/neighbors/knn_unsupervised.py index d9d8b8ed63..f6c2cf503e 100755 --- a/sklearnex/neighbors/knn_unsupervised.py +++ b/sklearnex/neighbors/knn_unsupervised.py @@ -1,5 +1,5 @@ #!/usr/bin/env python 
-#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,85 +13,128 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== try: from packaging.version import Version except ImportError: from distutils.version import LooseVersion as Version -from sklearn import __version__ as sklearn_version -from daal4py.sklearn._utils import sklearn_check_version + import warnings -from sklearn.neighbors._base import NeighborsBase as sklearn_NeighborsBase +import numpy as np +from sklearn import __version__ as sklearn_version from sklearn.neighbors._ball_tree import BallTree -from sklearn.neighbors._kd_tree import KDTree from sklearn.neighbors._base import VALID_METRICS -from sklearn.neighbors._unsupervised import NearestNeighbors as \ - sklearn_NearestNeighbors - +from sklearn.neighbors._base import NeighborsBase as sklearn_NeighborsBase +from sklearn.neighbors._kd_tree import KDTree +from sklearn.neighbors._unsupervised import NearestNeighbors as sklearn_NearestNeighbors from sklearn.utils.validation import _deprecate_positional_args, check_is_fitted -from onedal.datatypes import _check_array, _num_features, _num_samples +from daal4py.sklearn._utils import sklearn_check_version from onedal.neighbors import NearestNeighbors as onedal_NearestNeighbors +from onedal.utils import _check_array, _num_features, _num_samples -from .common import KNeighborsDispatchingBase from .._device_offload import dispatch, wrap_output_data -import numpy as np +from .common import KNeighborsDispatchingBase +if sklearn_check_version("0.22") and Version(sklearn_version) < Version("0.23"): -if sklearn_check_version("0.22") and \ - Version(sklearn_version) < Version("0.23"): class NearestNeighbors_(sklearn_NearestNeighbors): - def __init__(self, n_neighbors=5, radius=1.0, - algorithm='auto', leaf_size=30, metric='minkowski', - p=2, metric_params=None, n_jobs=None): + def __init__( + self, + n_neighbors=5, + radius=1.0, + algorithm="auto", + leaf_size=30, + metric="minkowski", + p=2, + metric_params=None, + n_jobs=None, + ): super().__init__( n_neighbors=n_neighbors, radius=radius, algorithm=algorithm, - leaf_size=leaf_size, metric=metric, p=p, - metric_params=metric_params, n_jobs=n_jobs) + leaf_size=leaf_size, + metric=metric, + p=p, + metric_params=metric_params, + n_jobs=n_jobs, + ) + else: + class NearestNeighbors_(sklearn_NearestNeighbors): - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): _parameter_constraints: dict = { - **sklearn_NearestNeighbors._parameter_constraints} + **sklearn_NearestNeighbors._parameter_constraints + } @_deprecate_positional_args - def __init__(self, *, n_neighbors=5, radius=1.0, - algorithm='auto', leaf_size=30, metric='minkowski', - p=2, metric_params=None, n_jobs=None): + def __init__( + self, + *, + n_neighbors=5, + radius=1.0, + algorithm="auto", + leaf_size=30, + metric="minkowski", + p=2, + metric_params=None, + n_jobs=None, + ): super().__init__( n_neighbors=n_neighbors, radius=radius, algorithm=algorithm, - leaf_size=leaf_size, metric=metric, p=p, - 
metric_params=metric_params, n_jobs=n_jobs) + leaf_size=leaf_size, + metric=metric, + p=p, + metric_params=metric_params, + n_jobs=n_jobs, + ) class NearestNeighbors(NearestNeighbors_, KNeighborsDispatchingBase): - if sklearn_check_version('1.2'): - _parameter_constraints: dict = { - **NearestNeighbors_._parameter_constraints} + if sklearn_check_version("1.2"): + _parameter_constraints: dict = {**NearestNeighbors_._parameter_constraints} @_deprecate_positional_args - def __init__(self, n_neighbors=5, radius=1.0, - algorithm='auto', leaf_size=30, metric='minkowski', - p=2, metric_params=None, n_jobs=None): + def __init__( + self, + n_neighbors=5, + radius=1.0, + algorithm="auto", + leaf_size=30, + metric="minkowski", + p=2, + metric_params=None, + n_jobs=None, + ): super().__init__( n_neighbors=n_neighbors, radius=radius, algorithm=algorithm, - leaf_size=leaf_size, metric=metric, p=p, - metric_params=metric_params, n_jobs=n_jobs) + leaf_size=leaf_size, + metric=metric, + p=p, + metric_params=metric_params, + n_jobs=n_jobs, + ) def fit(self, X, y=None): self._fit_validation(X, y) - dispatch(self, 'fit', { - 'onedal': self.__class__._onedal_fit, - 'sklearn': sklearn_NearestNeighbors.fit, - }, X, None) + dispatch( + self, + "fit", + { + "onedal": self.__class__._onedal_fit, + "sklearn": sklearn_NearestNeighbors.fit, + }, + X, + None, + ) return self @wrap_output_data @@ -99,37 +142,50 @@ def kneighbors(self, X=None, n_neighbors=None, return_distance=True): check_is_fitted(self) if sklearn_check_version("1.0") and X is not None: self._check_feature_names(X, reset=False) - return dispatch(self, 'kneighbors', { - 'onedal': self.__class__._onedal_kneighbors, - 'sklearn': sklearn_NearestNeighbors.kneighbors, - }, X, n_neighbors, return_distance) + return dispatch( + self, + "kneighbors", + { + "onedal": self.__class__._onedal_kneighbors, + "sklearn": sklearn_NearestNeighbors.kneighbors, + }, + X, + n_neighbors, + return_distance, + ) @wrap_output_data - def radius_neighbors(self, X=None, radius=None, return_distance=True, - sort_results=False): - _onedal_estimator = getattr(self, '_onedal_estimator', None) - - if _onedal_estimator is not None or getattr(self, '_tree', 0) is None and \ - self._fit_method == 'kd_tree': + def radius_neighbors( + self, X=None, radius=None, return_distance=True, sort_results=False + ): + _onedal_estimator = getattr(self, "_onedal_estimator", None) + + if ( + _onedal_estimator is not None + or getattr(self, "_tree", 0) is None + and self._fit_method == "kd_tree" + ): if sklearn_check_version("0.24"): - sklearn_NearestNeighbors.fit(self, self._fit_X, getattr(self, '_y', None)) + sklearn_NearestNeighbors.fit(self, self._fit_X, getattr(self, "_y", None)) else: sklearn_NearestNeighbors.fit(self, self._fit_X) if sklearn_check_version("0.22"): result = sklearn_NearestNeighbors.radius_neighbors( - self, X, radius, return_distance, sort_results) + self, X, radius, return_distance, sort_results + ) else: result = sklearn_NearestNeighbors.radius_neighbors( - self, X, radius, return_distance) + self, X, radius, return_distance + ) return result def _onedal_fit(self, X, y=None, queue=None): onedal_params = { - 'n_neighbors': self.n_neighbors, - 'algorithm': self.algorithm, - 'metric': self.effective_metric_, - 'p': self.effective_metric_params_['p'], + "n_neighbors": self.n_neighbors, + "algorithm": self.algorithm, + "metric": self.effective_metric_, + "p": self.effective_metric_params_["p"], } try: @@ -148,10 +204,12 @@ def _onedal_fit(self, X, y=None, queue=None): def 
_onedal_predict(self, X, queue=None): return self._onedal_estimator.predict(X, queue=queue) - def _onedal_kneighbors(self, X=None, n_neighbors=None, - return_distance=True, queue=None): + def _onedal_kneighbors( + self, X=None, n_neighbors=None, return_distance=True, queue=None + ): return self._onedal_estimator.kneighbors( - X, n_neighbors, return_distance, queue=queue) + X, n_neighbors, return_distance, queue=queue + ) def _save_attributes(self): self.classes_ = self._onedal_estimator.classes_ diff --git a/sklearnex/neighbors/lof.py b/sklearnex/neighbors/lof.py index b02f98c64d..720be45ab8 100644 --- a/sklearnex/neighbors/lof.py +++ b/sklearnex/neighbors/lof.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,13 +13,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== -import numpy as np import warnings -from sklearn.neighbors._lof import LocalOutlierFactor as \ - sklearn_LocalOutlierFactor +import numpy as np +from sklearn.neighbors._lof import LocalOutlierFactor as sklearn_LocalOutlierFactor + from .knn_unsupervised import NearestNeighbors try: @@ -27,18 +27,21 @@ except ImportError: pass -from sklearn.utils.validation import check_is_fitted from sklearn.utils import check_array +from sklearn.utils.validation import check_is_fitted from daal4py.sklearn._utils import sklearn_check_version -from .._device_offload import dispatch, wrap_output_data + from .._config import config_context +from .._device_offload import dispatch, wrap_output_data if sklearn_check_version("1.0"): + class LocalOutlierFactor(sklearn_LocalOutlierFactor): - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): _parameter_constraints: dict = { - **sklearn_LocalOutlierFactor._parameter_constraints} + **sklearn_LocalOutlierFactor._parameter_constraints + } def __init__( self, @@ -62,7 +65,7 @@ def __init__( metric_params=metric_params, n_jobs=n_jobs, contamination=contamination, - novelty=novelty + novelty=novelty, ) def _fit(self, X, y, queue=None): @@ -76,7 +79,7 @@ def _fit(self, X, y, queue=None): metric=self.metric, p=self.p, metric_params=self.metric_params, - n_jobs=self.n_jobs + n_jobs=self.n_jobs, ) self._knn.fit(X) @@ -98,8 +101,9 @@ def _fit(self, X, y, queue=None): ) self.n_neighbors_ = max(1, min(self.n_neighbors, n_samples - 1)) - self._distances_fit_X_, _neighbors_indices_fit_X_ =\ - self._knn.kneighbors(n_neighbors=self.n_neighbors_) + self._distances_fit_X_, _neighbors_indices_fit_X_ = self._knn.kneighbors( + n_neighbors=self.n_neighbors_ + ) self._lrd = self._local_reachability_density( self._distances_fit_X_, _neighbors_indices_fit_X_ @@ -127,10 +131,16 @@ def _fit(self, X, y, queue=None): return self def fit(self, X, y=None): - return dispatch(self, 'neighbors.LocalOutlierFactor.fit', { - 'onedal': self.__class__._fit, - 'sklearn': None, - }, X, y) + return dispatch( + self, + "neighbors.LocalOutlierFactor.fit", + { + "onedal": self.__class__._fit, + "sklearn": None, + }, + X, + y, + ) def _onedal_predict(self, X, queue=None): with 
config_context(target_offload=queue): @@ -148,10 +158,15 @@ def _onedal_predict(self, X, queue=None): @wrap_output_data def _predict(self, X=None): - return dispatch(self, 'neighbors.LocalOutlierFactor.predict', { - 'onedal': self.__class__._onedal_predict, - 'sklearn': None, - }, X) + return dispatch( + self, + "neighbors.LocalOutlierFactor.predict", + { + "onedal": self.__class__._onedal_predict, + "sklearn": None, + }, + X, + ) def _score_samples(self, X, queue=None): with config_context(target_offload=queue): @@ -183,10 +198,15 @@ def _check_novelty_score_samples(self): @available_if(_check_novelty_score_samples) @wrap_output_data def score_samples(self, X): - return dispatch(self, 'neighbors.LocalOutlierFactor.score_samples', { - 'onedal': self.__class__._score_samples, - 'sklearn': None, - }, X) + return dispatch( + self, + "neighbors.LocalOutlierFactor.score_samples", + { + "onedal": self.__class__._score_samples, + "sklearn": None, + }, + X, + ) def _check_novelty_fit_predict(self): if self.novelty: @@ -204,17 +224,25 @@ def _fit_predict(self, X, y, queue=None): @available_if(_check_novelty_fit_predict) @wrap_output_data def fit_predict(self, X, y=None): - return dispatch(self, 'neighbors.LocalOutlierFactor.fit_predict', { - 'onedal': self.__class__._fit_predict, - 'sklearn': None, - }, X, y) + return dispatch( + self, + "neighbors.LocalOutlierFactor.fit_predict", + { + "onedal": self.__class__._fit_predict, + "sklearn": None, + }, + X, + y, + ) def _onedal_gpu_supported(self, method_name, *data): return True def _onedal_cpu_supported(self, method_name, *data): return True + else: + class LocalOutlierFactor(sklearn_LocalOutlierFactor): def __init__( self, @@ -238,7 +266,7 @@ def __init__( metric_params=metric_params, n_jobs=n_jobs, contamination=contamination, - novelty=novelty + novelty=novelty, ) def _fit(self, X, y=None, queue=None): @@ -250,7 +278,7 @@ def _fit(self, X, y=None, queue=None): metric=self.metric, p=self.p, metric_params=self.metric_params, - n_jobs=self.n_jobs + n_jobs=self.n_jobs, ) self._knn.fit(X) @@ -272,8 +300,9 @@ def _fit(self, X, y=None, queue=None): ) self.n_neighbors_ = max(1, min(self.n_neighbors, n_samples - 1)) - self._distances_fit_X_, _neighbors_indices_fit_X_ =\ - self._knn.kneighbors(n_neighbors=self.n_neighbors_) + self._distances_fit_X_, _neighbors_indices_fit_X_ = self._knn.kneighbors( + n_neighbors=self.n_neighbors_ + ) self._lrd = self._local_reachability_density( self._distances_fit_X_, _neighbors_indices_fit_X_ @@ -301,10 +330,16 @@ def _fit(self, X, y=None, queue=None): return self def fit(self, X, y=None): - return dispatch(self, 'neighbors.LocalOutlierFactor.fit', { - 'onedal': self.__class__._fit, - 'sklearn': None, - }, X, y) + return dispatch( + self, + "neighbors.LocalOutlierFactor.fit", + { + "onedal": self.__class__._fit, + "sklearn": None, + }, + X, + y, + ) def _onedal_predict(self, X, queue=None): with config_context(target_offload=queue): @@ -322,10 +357,15 @@ def _onedal_predict(self, X, queue=None): @wrap_output_data def _predict(self, X=None): - return dispatch(self, 'neighbors.LocalOutlierFactor.predict', { - 'onedal': self.__class__._onedal_predict, - 'sklearn': None, - }, X) + return dispatch( + self, + "neighbors.LocalOutlierFactor.predict", + { + "onedal": self.__class__._onedal_predict, + "sklearn": None, + }, + X, + ) def _onedal_score_samples(self, X, queue=None): with config_context(target_offload=queue): @@ -345,17 +385,24 @@ def _onedal_score_samples(self, X, queue=None): @wrap_output_data def _score_samples(self, 
X): if not self.novelty: - msg = ('score_samples is not available when novelty=False. The ' - 'scores of the training samples are always available ' - 'through the negative_outlier_factor_ attribute. Use ' - 'novelty=True if you want to use LOF for novelty detection ' - 'and compute score_samples for new unseen data.') + msg = ( + "score_samples is not available when novelty=False. The " + "scores of the training samples are always available " + "through the negative_outlier_factor_ attribute. Use " + "novelty=True if you want to use LOF for novelty detection " + "and compute score_samples for new unseen data." + ) raise AttributeError(msg) - return dispatch(self, 'neighbors.LocalOutlierFactor.score_samples', { - 'onedal': self.__class__._onedal_score_samples, - 'sklearn': None, - }, X) + return dispatch( + self, + "neighbors.LocalOutlierFactor.score_samples", + { + "onedal": self.__class__._onedal_score_samples, + "sklearn": None, + }, + X, + ) def _onedal_fit_predict(self, X, y, queue=None): with config_context(target_offload=queue): @@ -363,10 +410,16 @@ def _onedal_fit_predict(self, X, y, queue=None): @wrap_output_data def _fit_predict(self, X, y=None): - return dispatch(self, 'neighbors.LocalOutlierFactor._onedal_fit_predict', { - 'onedal': self.__class__._onedal_fit_predict, - 'sklearn': None, - }, X, y) + return dispatch( + self, + "neighbors.LocalOutlierFactor._onedal_fit_predict", + { + "onedal": self.__class__._onedal_fit_predict, + "sklearn": None, + }, + X, + y, + ) def _onedal_gpu_supported(self, method_name, *data): return True diff --git a/sklearnex/neighbors/tests/test_neighbors.py b/sklearnex/neighbors/tests/test_neighbors.py index e871dc9a3b..735f40a4b8 100755 --- a/sklearnex/neighbors/tests/test_neighbors.py +++ b/sklearnex/neighbors/tests/test_neighbors.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,7 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== import numpy as np from numpy.testing import assert_allclose @@ -21,37 +21,41 @@ def test_sklearnex_import_knn_classifier(): from sklearnex.neighbors import KNeighborsClassifier + X = [[0], [1], [2], [3]] y = [0, 0, 1, 1] neigh = KNeighborsClassifier(n_neighbors=3).fit(X, y) - assert 'sklearnex' in neigh.__module__ + assert "sklearnex" in neigh.__module__ assert_allclose(neigh.predict([[1.1]]), [0]) def test_sklearnex_import_knn_regression(): from sklearnex.neighbors import KNeighborsRegressor + X = [[0], [1], [2], [3]] y = [0, 0, 1, 1] neigh = KNeighborsRegressor(n_neighbors=2).fit(X, y) - assert 'sklearnex' in neigh.__module__ + assert "sklearnex" in neigh.__module__ assert_allclose(neigh.predict([[1.5]]), [0.5]) def test_sklearnex_import_nn(): from sklearnex.neighbors import NearestNeighbors + X = [[0, 0, 2], [1, 0, 0], [0, 0, 1]] neigh = NearestNeighbors(n_neighbors=2).fit(X) - assert 'sklearnex' in neigh.__module__ + assert "sklearnex" in neigh.__module__ result = neigh.kneighbors([[0, 0, 1.3]], 2, return_distance=False) assert_allclose(result, [[2, 0]]) def test_sklearnex_import_lof(): from sklearnex.neighbors import LocalOutlierFactor + X = [[7, 7, 7], [1, 0, 0], [0, 0, 1], [0, 0, 1]] lof = LocalOutlierFactor(n_neighbors=2) result = lof.fit_predict(X) - assert hasattr(lof, '_knn') - assert 'sklearnex' in lof.__module__ - assert 'sklearnex' in lof._knn.__module__ + assert hasattr(lof, "_knn") + assert "sklearnex" in lof.__module__ + assert "sklearnex" in lof._knn.__module__ assert_allclose(result, [-1, 1, 1, 1]) diff --git a/sklearnex/preview/__init__.py b/sklearnex/preview/__init__.py index 63508ba7f4..d6431ee3c7 100644 --- a/sklearnex/preview/__init__.py +++ b/sklearnex/preview/__init__.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,6 +13,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== -__all__ = ['cluster', 'decomposition', 'linear_model', 'ensemble'] +__all__ = ["cluster", "decomposition", "linear_model", "ensemble"] diff --git a/sklearnex/preview/cluster/__init__.py b/sklearnex/preview/cluster/__init__.py index 5a3f8d1447..fe90485107 100644 --- a/sklearnex/preview/cluster/__init__.py +++ b/sklearnex/preview/cluster/__init__.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,8 +13,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== from .k_means import KMeans -__all__ = ['KMeans'] +__all__ = ["KMeans"] diff --git a/sklearnex/preview/cluster/_common.py b/sklearnex/preview/cluster/_common.py index ab2be9efe3..ddcbe87d9e 100644 --- a/sklearnex/preview/cluster/_common.py +++ b/sklearnex/preview/cluster/_common.py @@ -23,7 +23,7 @@ def get_cluster_centers(self): def set_cluster_centers(self, value): self._cluster_centers_ = value - if hasattr(self, '_onedal_estimator'): + if hasattr(self, "_onedal_estimator"): self._onedal_estimator.cluster_centers_ = value @@ -33,7 +33,7 @@ def get_labels(self): def set_labels(self, value): self._labels_ = value - if hasattr(self, '_onedal_estimator'): + if hasattr(self, "_onedal_estimator"): self._onedal_estimator.labels_ = value @@ -43,7 +43,7 @@ def get_inertia(self): def set_inertia(self, value): self._inertia_ = value - if hasattr(self, '_onedal_estimator'): + if hasattr(self, "_onedal_estimator"): self._onedal_estimator.inertia_ = value @@ -53,7 +53,7 @@ def get_n_iter(self): def set_n_iter(self, value): self._n_iter_ = value - if hasattr(self, '_onedal_estimator'): + if hasattr(self, "_onedal_estimator"): self._onedal_estimator.n_iter_ = value diff --git a/sklearnex/preview/cluster/k_means.py b/sklearnex/preview/cluster/k_means.py index c5a9dd4a09..80abad8c8d 100644 --- a/sklearnex/preview/cluster/k_means.py +++ b/sklearnex/preview/cluster/k_means.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,54 +13,50 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== -from daal4py.sklearn._utils import daal_check_version import logging -if daal_check_version((2023, 'P', 200)): +from daal4py.sklearn._utils import daal_check_version + +if daal_check_version((2023, "P", 200)): import numpy as np from scipy.sparse import issparse - - from ._common import BaseKMeans - from ..._device_offload import dispatch, wrap_output_data - - from onedal.cluster import KMeans as onedal_KMeans from sklearn.cluster import KMeans as sklearn_KMeans - - from daal4py.sklearn._utils import ( - sklearn_check_version, - PatchingConditionsChain) - + from sklearn.utils._openmp_helpers import _openmp_effective_n_threads from sklearn.utils.validation import ( + _deprecate_positional_args, _num_samples, check_is_fitted, - _deprecate_positional_args) + ) - from sklearn.utils._openmp_helpers import _openmp_effective_n_threads + from daal4py.sklearn._utils import PatchingConditionsChain, sklearn_check_version + from onedal.cluster import KMeans as onedal_KMeans + + from ..._device_offload import dispatch, wrap_output_data + from ._common import BaseKMeans class KMeans(sklearn_KMeans, BaseKMeans): __doc__ = sklearn_KMeans.__doc__ n_iter_, inertia_ = None, None labels_, cluster_centers_ = None, None - if sklearn_check_version('1.2'): - _parameter_constraints: dict = { - **sklearn_KMeans._parameter_constraints} + if sklearn_check_version("1.2"): + _parameter_constraints: dict = {**sklearn_KMeans._parameter_constraints} @_deprecate_positional_args def __init__( self, n_clusters=8, *, - init='k-means++', - n_init='auto' if sklearn_check_version('1.4') else 'warn', + init="k-means++", + n_init="auto" if sklearn_check_version("1.4") else "warn", max_iter=300, tol=1e-4, verbose=0, random_state=None, copy_x=True, - algorithm='lloyd', + algorithm="lloyd", ): super().__init__( n_clusters=n_clusters, @@ -73,20 +69,22 @@ def __init__( copy_x=copy_x, algorithm=algorithm, ) - elif sklearn_check_version('1.0'): + + elif sklearn_check_version("1.0"): + @_deprecate_positional_args def __init__( self, n_clusters=8, *, - init='k-means++', + init="k-means++", n_init=10, max_iter=300, tol=1e-4, verbose=0, random_state=None, copy_x=True, - algorithm='auto', + algorithm="auto", ): super().__init__( n_clusters=n_clusters, @@ -99,22 +97,24 @@ def __init__( copy_x=copy_x, algorithm=algorithm, ) + else: + @_deprecate_positional_args def __init__( self, n_clusters=8, *, - init='k-means++', + init="k-means++", n_init=10, max_iter=300, tol=1e-4, - precompute_distances='deprecated', + precompute_distances="deprecated", verbose=0, random_state=None, copy_x=True, - n_jobs='deprecated', - algorithm='auto', + n_jobs="deprecated", + algorithm="auto", ): super().__init__( n_clusters=n_clusters, @@ -132,36 +132,40 @@ def __init__( def _initialize_onedal_estimator(self): onedal_params = { - 'n_clusters': self.n_clusters, - 'init': self.init, - 'max_iter': self.max_iter, - 'tol': self.tol, - 'n_init': self.n_init, - 'verbose': self.verbose, - 'random_state': self.random_state, + "n_clusters": self.n_clusters, + "init": self.init, + "max_iter": self.max_iter, + "tol": self.tol, + "n_init": self.n_init, + "verbose": self.verbose, + "random_state": self.random_state, } self._onedal_estimator = onedal_KMeans(**onedal_params) def _onedal_fit_supported(self, method_name, X, y=None, sample_weight=None): - assert method_name == 'fit' + assert method_name == "fit" class_name 
= self.__class__.__name__ - patching_status = PatchingConditionsChain( - f'sklearn.cluster.{class_name}.fit') + patching_status = PatchingConditionsChain(f"sklearn.cluster.{class_name}.fit") sample_count = _num_samples(X) self._algorithm = self.algorithm supported_algs = ["auto", "full", "lloyd"] correct_count = self.n_clusters < sample_count - patching_status.and_conditions([ - (self.algorithm in supported_algs, 'Only lloyd algorithm is supported.'), - (not issparse(self.init), 'Sparse init values are not supported'), - (correct_count, 'n_clusters is smaller than number of samples'), - (sample_weight is None, 'Sample weight is not None.'), - (not issparse(X), 'Sparse input is not supported.'), - ]) + patching_status.and_conditions( + [ + ( + self.algorithm in supported_algs, + "Only lloyd algorithm is supported.", + ), + (not issparse(self.init), "Sparse init values are not supported"), + (correct_count, "n_clusters is smaller than number of samples"), + (sample_weight is None, "Sample weight is not None."), + (not issparse(X), "Sparse input is not supported."), + ] + ) return patching_status.get_status(logs=True) @@ -184,15 +188,22 @@ def fit(self, X, y=None, sample_weight=None): """ - if sklearn_check_version('1.0'): + if sklearn_check_version("1.0"): self._check_feature_names(X, reset=True) if sklearn_check_version("1.2"): self._validate_params() - dispatch(self, 'fit', { - 'onedal': self.__class__._onedal_fit, - 'sklearn': sklearn_KMeans.fit, - }, X, y, sample_weight) + dispatch( + self, + "fit", + { + "onedal": self.__class__._onedal_fit, + "sklearn": sklearn_KMeans.fit, + }, + X, + y, + sample_weight, + ) return self @@ -216,20 +227,26 @@ def _onedal_fit(self, X, _, sample_weight, queue=None): self._save_attributes() def _onedal_predict_supported(self, method_name, X): - assert method_name == 'predict' + assert method_name == "predict" class_name = self.__class__.__name__ patching_status = PatchingConditionsChain( - f'sklearn.cluster.{class_name}.predict') + f"sklearn.cluster.{class_name}.predict" + ) supported_algs = ["auto", "full", "lloyd"] dense_centers = not issparse(self.cluster_centers_) - patching_status.and_conditions([ - (self.algorithm in supported_algs, 'Only lloyd algorithm is supported.'), - (dense_centers, 'Sparse clusters is not supported.'), - (not issparse(X), 'Sparse input is not supported.') - ]) + patching_status.and_conditions( + [ + ( + self.algorithm in supported_algs, + "Only lloyd algorithm is supported.", + ), + (dense_centers, "Sparse clusters is not supported."), + (not issparse(X), "Sparse input is not supported."), + ] + ) return patching_status.get_status(logs=True) @@ -253,31 +270,37 @@ def predict(self, X): """ - if sklearn_check_version('1.0'): + if sklearn_check_version("1.0"): self._check_feature_names(X, reset=True) if sklearn_check_version("1.2"): self._validate_params() - return dispatch(self, 'predict', { - 'onedal': self.__class__._onedal_predict, - 'sklearn': sklearn_KMeans.predict, - }, X) + return dispatch( + self, + "predict", + { + "onedal": self.__class__._onedal_predict, + "sklearn": sklearn_KMeans.predict, + }, + X, + ) def _onedal_predict(self, X, queue=None): X = self._validate_data(X, accept_sparse=False, reset=False) - if not hasattr(self, '_onedal_estimator'): + if not hasattr(self, "_onedal_estimator"): self._initialize_onedal_estimator() self._onedal_estimator.cluster_centers_ = self.cluster_centers_ return self._onedal_estimator.predict(X, queue=queue) def _onedal_supported(self, method_name, *data): - if method_name == 
'fit': + if method_name == "fit": return self._onedal_fit_supported(method_name, *data) - if method_name == 'predict': + if method_name == "predict": return self._onedal_predict_supported(method_name, *data) raise RuntimeError( - f'Unknown method {method_name} in {self.__class__.__name__}') + f"Unknown method {method_name} in {self.__class__.__name__}" + ) def _onedal_gpu_supported(self, method_name, *data): return self._onedal_supported(method_name, *data) @@ -335,5 +358,7 @@ def transform(self, X): else: from daal4py.sklearn.cluster import KMeans - logging.warning('Preview KMeans requires oneDAL version >= 2023.2 ' - 'but it was not found') + + logging.warning( + "Preview KMeans requires oneDAL version >= 2023.2 " "but it was not found" + ) diff --git a/sklearnex/preview/decomposition/__init__.py b/sklearnex/preview/decomposition/__init__.py index 4b78bc0172..02fd05199e 100644 --- a/sklearnex/preview/decomposition/__init__.py +++ b/sklearnex/preview/decomposition/__init__.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,8 +13,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from .pca import PCA -__all__ = ['PCA'] +__all__ = ["PCA"] diff --git a/sklearnex/preview/decomposition/pca.py b/sklearnex/preview/decomposition/pca.py index 460e3b4b41..ae779cfc95 100755 --- a/sklearnex/preview/decomposition/pca.py +++ b/sklearnex/preview/decomposition/pca.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,34 +13,36 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== -import numpy as np import numbers from math import sqrt + +import numpy as np from scipy.sparse import issparse +from sklearn.base import BaseEstimator +from sklearn.utils.extmath import stable_cumsum +from sklearn.utils.validation import check_array, check_is_fitted -from ..._device_offload import dispatch from daal4py.sklearn._utils import sklearn_check_version +from onedal.utils import _check_array -from sklearn.utils.extmath import stable_cumsum -from onedal.datatypes import _check_array -from sklearn.utils.validation import check_array -from sklearn.base import BaseEstimator -from sklearn.utils.validation import check_is_fitted -if sklearn_check_version('1.1') and not sklearn_check_version('1.2'): +from ..._device_offload import dispatch + +if sklearn_check_version("1.1") and not sklearn_check_version("1.2"): from sklearn.utils import check_scalar -if sklearn_check_version('0.23'): +if sklearn_check_version("0.23"): from sklearn.decomposition._pca import _infer_dimension else: from sklearn.decomposition._pca import _infer_dimension_ -from onedal.decomposition import PCA as onedal_PCA from sklearn.decomposition import PCA as sklearn_PCA +from onedal.decomposition import PCA as onedal_PCA + class PCA(sklearn_PCA): - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): _parameter_constraints: dict = {**sklearn_PCA._parameter_constraints} def __init__( @@ -66,13 +68,11 @@ def __init__( self.power_iteration_normalizer = power_iteration_normalizer self.random_state = random_state - def _validate_n_components(self, n_components, n_samples, - n_features, n_sf_min): + def _validate_n_components(self, n_components, n_samples, n_features, n_sf_min): if n_components == "mle": if n_samples < n_features: raise ValueError( - "n_components='mle' is only supported if" - " n_samples >= n_features" + "n_components='mle' is only supported if" " n_samples >= n_features" ) elif not 0 <= n_components <= n_sf_min: raise ValueError( @@ -82,15 +82,16 @@ def _validate_n_components(self, n_components, n_samples, ) elif n_components >= 1: if not isinstance(n_components, numbers.Integral): - raise ValueError("n_components=%r must be of type int " - "when greater than or equal to 1, " - "was of type=%r" - % (n_components, type(n_components))) + raise ValueError( + "n_components=%r must be of type int " + "when greater than or equal to 1, " + "was of type=%r" % (n_components, type(n_components)) + ) def fit(self, X, y=None): - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): self._validate_params() - elif sklearn_check_version('1.1'): + elif sklearn_check_version("1.1"): check_scalar( self.n_oversamples, "n_oversamples", @@ -107,12 +108,14 @@ def _fit(self, X): "TruncatedSVD for a possible alternative." 
) - if sklearn_check_version('0.23'): - X = self._validate_data(X, dtype=[np.float64, np.float32], - ensure_2d=True, copy=False) + if sklearn_check_version("0.23"): + X = self._validate_data( + X, dtype=[np.float64, np.float32], ensure_2d=True, copy=False + ) else: - X = _check_array(X, dtype=[np.float64, np.float32], - ensure_2d=True, copy=False) + X = _check_array( + X, dtype=[np.float64, np.float32], ensure_2d=True, copy=False + ) n_samples, n_features = X.shape n_sf_min = min(n_samples, n_features) @@ -125,13 +128,12 @@ def _fit(self, X): else: n_components = self.n_components - self._validate_n_components(n_components, n_samples, n_features, - n_sf_min) + self._validate_n_components(n_components, n_samples, n_features, n_sf_min) self._fit_svd_solver = self.svd_solver shape_good_for_daal = X.shape[1] / X.shape[0] < 2 if self._fit_svd_solver == "auto": - if sklearn_check_version('1.1'): + if sklearn_check_version("1.1"): if max(X.shape) <= 500 or n_components == "mle": self._fit_svd_solver = "full" elif 1 <= n_components < 0.8 * n_sf_min: @@ -139,69 +141,73 @@ def _fit(self, X): else: self._fit_svd_solver = "full" else: - if n_components == 'mle': - self._fit_svd_solver = 'full' + if n_components == "mle": + self._fit_svd_solver = "full" else: n, p, k = X.shape[0], X.shape[1], n_components # check if sklearnex is faster than randomized sklearn # Refer to daal4py - regression_coefs = np.array([ - [9.779873e-11, n * p * k], - [-1.122062e-11, n * p * p], - [1.127905e-09, n ** 2], - ]) - - if n_components >= 1 and np.dot( - regression_coefs[:, 0], - regression_coefs[:, 1]) <= 0: - self._fit_svd_solver = 'randomized' + regression_coefs = np.array( + [ + [9.779873e-11, n * p * k], + [-1.122062e-11, n * p * p], + [1.127905e-09, n**2], + ] + ) + + if ( + n_components >= 1 + and np.dot(regression_coefs[:, 0], regression_coefs[:, 1]) <= 0 + ): + self._fit_svd_solver = "randomized" else: - self._fit_svd_solver = 'full' + self._fit_svd_solver = "full" - if not shape_good_for_daal or self._fit_svd_solver != 'full': - if sklearn_check_version('0.23'): + if not shape_good_for_daal or self._fit_svd_solver != "full": + if sklearn_check_version("0.23"): X = self._validate_data(X, copy=self.copy) else: X = check_array(X, copy=self.copy) # Call different fits for either full or truncated SVD if shape_good_for_daal and self._fit_svd_solver == "full": - return dispatch(self, 'fit', { - 'onedal': self.__class__._onedal_fit, - 'sklearn': sklearn_PCA._fit_full, - }, X) + return dispatch( + self, + "fit", + { + "onedal": self.__class__._onedal_fit, + "sklearn": sklearn_PCA._fit_full, + }, + X, + ) elif not shape_good_for_daal and self._fit_svd_solver == "full": return sklearn_PCA._fit_full(self, X, n_components) elif self._fit_svd_solver in ["arpack", "randomized"]: return sklearn_PCA._fit_truncated( - self, X, n_components, self._fit_svd_solver, + self, + X, + n_components, + self._fit_svd_solver, ) else: - raise ValueError( - "Unrecognized svd_solver='{0}'".format(self._fit_svd_solver) - ) + raise ValueError("Unrecognized svd_solver='{0}'".format(self._fit_svd_solver)) def _onedal_gpu_supported(self, method_name, *data): - if method_name == 'fit': - return self._fit_svd_solver == 'full' - elif method_name == 'transform': - return hasattr(self, '_onedal_estimator') - raise RuntimeError( - f'Unknown method {method_name} in {self.__class__.__name__}' - ) + if method_name == "fit": + return self._fit_svd_solver == "full" + elif method_name == "transform": + return hasattr(self, "_onedal_estimator") + raise 
RuntimeError(f"Unknown method {method_name} in {self.__class__.__name__}") def _onedal_cpu_supported(self, method_name, *data): - if method_name == 'fit': - return self._fit_svd_solver == 'full' - elif method_name == 'transform': - return hasattr(self, '_onedal_estimator') - raise RuntimeError( - f'Unknown method {method_name} in {self.__class__.__name__}' - ) + if method_name == "fit": + return self._fit_svd_solver == "full" + elif method_name == "transform": + return hasattr(self, "_onedal_estimator") + raise RuntimeError(f"Unknown method {method_name} in {self.__class__.__name__}") def _onedal_fit(self, X, y=None, queue=None): - - if self.n_components == 'mle' or self.n_components is None: + if self.n_components == "mle" or self.n_components is None: onedal_n_components = min(X.shape) elif 0 < self.n_components < 1: onedal_n_components = min(X.shape) @@ -209,9 +215,9 @@ def _onedal_fit(self, X, y=None, queue=None): onedal_n_components = self.n_components onedal_params = { - 'n_components': onedal_n_components, - 'is_deterministic': True, - 'method': "precomputed", + "n_components": onedal_n_components, + "is_deterministic": True, + "method": "precomputed", } self._onedal_estimator = onedal_PCA(**onedal_params) self._onedal_estimator.fit(X, queue=queue) @@ -227,12 +233,7 @@ def _onedal_predict(self, X, queue=None): return self._onedal_estimator.predict(X, queue) def _onedal_transform(self, X): - X = _check_array( - X, - dtype=[np.float64, np.float32], - ensure_2d=True, - copy=False - ) + X = _check_array(X, dtype=[np.float64, np.float32], ensure_2d=True, copy=False) if hasattr(self, "n_features_in_"): if self.n_features_in_ != X.shape[1]: @@ -251,10 +252,15 @@ def _onedal_transform(self, X): # Mean center X_centered = X - self.mean_ - return dispatch(self, 'transform', { - 'onedal': self.__class__._onedal_predict, - 'sklearn': sklearn_PCA.transform, - }, X_centered) + return dispatch( + self, + "transform", + { + "onedal": self.__class__._onedal_predict, + "sklearn": sklearn_PCA.transform, + }, + X_centered, + ) def transform(self, X): check_is_fitted(self) @@ -310,13 +316,12 @@ def _save_attributes(self): self.mean_ = self._onedal_estimator.mean_ self.singular_values_ = self._onedal_estimator.singular_values_ self.explained_variance_ = self._onedal_estimator.explained_variance_ - self.explained_variance_ratio_ = \ - self._onedal_estimator.explained_variance_ratio_ + self.explained_variance_ratio_ = self._onedal_estimator.explained_variance_ratio_ if self.n_components is None: self.n_components_ = self._onedal_estimator.n_components_ - elif self.n_components == 'mle': - if sklearn_check_version('0.23'): + elif self.n_components == "mle": + if sklearn_check_version("0.23"): self.n_components_ = _infer_dimension( self.explained_variance_, self.n_samples_ ) @@ -326,23 +331,25 @@ def _save_attributes(self): ) elif 0 < self.n_components < 1.0: ratio_cumsum = stable_cumsum(self.explained_variance_ratio_) - self.n_components_ = np.searchsorted( - ratio_cumsum, self.n_components, side='right') + 1 + self.n_components_ = ( + np.searchsorted(ratio_cumsum, self.n_components, side="right") + 1 + ) else: self.n_components_ = self._onedal_estimator.n_components_ if self.n_components_ < n_sf_min: if self.explained_variance_.shape[0] == n_sf_min: - self.noise_variance_ = \ - self.explained_variance_[self.n_components_:].mean() + self.noise_variance_ = self.explained_variance_[ + self.n_components_ : + ].mean() else: self.noise_variance_ = self._onedal_estimator.noise_variance_ else: - 
self.noise_variance_ = 0. - - self.explained_variance_ = self.explained_variance_[:self.n_components_] - self.explained_variance_ratio_ = \ - self.explained_variance_ratio_[:self.n_components_] - self.components_ = \ - self._onedal_estimator.components_[:self.n_components_] - self.singular_values_ = self.singular_values_[:self.n_components_] + self.noise_variance_ = 0.0 + + self.explained_variance_ = self.explained_variance_[: self.n_components_] + self.explained_variance_ratio_ = self.explained_variance_ratio_[ + : self.n_components_ + ] + self.components_ = self._onedal_estimator.components_[: self.n_components_] + self.singular_values_ = self.singular_values_[: self.n_components_] diff --git a/sklearnex/preview/decomposition/tests/test_preview_pca.py b/sklearnex/preview/decomposition/tests/test_preview_pca.py index e4b4ad5c18..5a3a891bce 100755 --- a/sklearnex/preview/decomposition/tests/test_preview_pca.py +++ b/sklearnex/preview/decomposition/tests/test_preview_pca.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,7 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== import numpy as np from numpy.testing import assert_allclose @@ -21,8 +21,9 @@ def test_sklearnex_import(): from sklearnex.preview.decomposition import PCA + X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]]) - pca = PCA(n_components=2, svd_solver='full').fit(X) - assert 'sklearnex' in pca.__module__ - assert hasattr(pca, '_onedal_estimator') + pca = PCA(n_components=2, svd_solver="full").fit(X) + assert "sklearnex" in pca.__module__ + assert hasattr(pca, "_onedal_estimator") assert_allclose(pca.singular_values_, [6.30061232, 0.54980396]) diff --git a/sklearnex/preview/ensemble/__init__.py b/sklearnex/preview/ensemble/__init__.py index e7a8fde386..cce939b4cf 100755 --- a/sklearnex/preview/ensemble/__init__.py +++ b/sklearnex/preview/ensemble/__init__.py @@ -15,8 +15,12 @@ # limitations under the License. 
 # ===============================================================================
 
-from .forest import RandomForestClassifier, RandomForestRegressor
 from .extra_trees import ExtraTreesClassifier, ExtraTreesRegressor
+from .forest import RandomForestClassifier, RandomForestRegressor
 
-__all__ = ['ExtraTreesClassifier', 'ExtraTreesRegressor',
-           'RandomForestClassifier', 'RandomForestRegressor']
+__all__ = [
+    "ExtraTreesClassifier",
+    "ExtraTreesRegressor",
+    "RandomForestClassifier",
+    "RandomForestRegressor",
+]
diff --git a/sklearnex/preview/ensemble/extra_trees.py b/sklearnex/preview/ensemble/extra_trees.py
index 5ac48a3e62..4f6a17621b 100644
--- a/sklearnex/preview/ensemble/extra_trees.py
+++ b/sklearnex/preview/ensemble/extra_trees.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-#===============================================================================
+# ===============================================================================
 # Copyright 2021 Intel Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -13,51 +13,44 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-#===============================================================================
-
-from daal4py.sklearn._utils import (
-    daal_check_version, sklearn_check_version,
-    make2d, PatchingConditionsChain, check_tree_nodes
-)
-
-import numpy as np
+# ===============================================================================
 
 import numbers
-
 import warnings
-
 from abc import ABC
 
-from sklearn.exceptions import DataConversionWarning
-
-from ..._config import get_config
-from ..._device_offload import dispatch, wrap_output_data
-
+import numpy as np
+from scipy import sparse as sp
+from sklearn.base import clone
 from sklearn.ensemble import ExtraTreesClassifier as sklearn_ExtraTreesClassifier
 from sklearn.ensemble import ExtraTreesRegressor as sklearn_ExtraTreesRegressor
-
-from sklearn.utils.validation import (
-    check_is_fitted,
-    check_consistent_length,
-    check_array,
-    check_X_y)
-
-from onedal.datatypes import _num_features, _num_samples
-
-from sklearn.utils import check_random_state, deprecated
-
-from sklearn.base import clone
-
+from sklearn.exceptions import DataConversionWarning
 from sklearn.tree import ExtraTreeClassifier, ExtraTreeRegressor
 from sklearn.tree._tree import Tree
+from sklearn.utils import check_random_state, deprecated
+from sklearn.utils.validation import (
+    check_array,
+    check_consistent_length,
+    check_is_fitted,
+    check_X_y,
+)
+
+from daal4py.sklearn._utils import (
+    PatchingConditionsChain,
+    check_tree_nodes,
+    daal_check_version,
+    make2d,
+    sklearn_check_version,
+)
 from onedal.ensemble import ExtraTreesClassifier as onedal_ExtraTreesClassifier
 from onedal.ensemble import ExtraTreesRegressor as onedal_ExtraTreesRegressor
 from onedal.primitives import get_tree_state_cls, get_tree_state_reg
+from onedal.utils import _num_features, _num_samples
 
-from scipy import sparse as sp
+from ..._config import get_config
+from ..._device_offload import dispatch, wrap_output_data
 
-if sklearn_check_version('1.2'):
+if sklearn_check_version("1.2"):
     from sklearn.utils._param_validation import Interval
@@ -69,7 +62,7 @@ def _fit_proba(self, X, y, sample_weight=None, queue=None):
         # We use stock metaestimators below, so the only way
         # to pass a queue is using config_context.
cfg = get_config() - cfg['target_offload'] = queue + cfg["target_offload"] = queue def _save_attributes(self): self._onedal_model = self._onedal_estimator._onedal_model @@ -79,8 +72,9 @@ def _save_attributes(self): if hasattr(self._onedal_estimator, "oob_prediction_"): self.oob_prediction_ = self._onedal_estimator.oob_prediction_ if hasattr(self._onedal_estimator, "oob_decision_function_"): - self.oob_decision_function_ = \ + self.oob_decision_function_ = ( self._onedal_estimator.oob_decision_function_ + ) return self def _onedal_classifier(self, **onedal_params): @@ -92,69 +86,79 @@ def _onedal_regressor(self, **onedal_params): # TODO: # move to onedal modul. def _check_parameters(self): - if isinstance(self.min_samples_leaf, numbers.Integral): if not 1 <= self.min_samples_leaf: - raise ValueError("min_samples_leaf must be at least 1 " - "or in (0, 0.5], got %s" - % self.min_samples_leaf) + raise ValueError( + "min_samples_leaf must be at least 1 " + "or in (0, 0.5], got %s" % self.min_samples_leaf + ) else: # float - if not 0. < self.min_samples_leaf <= 0.5: - raise ValueError("min_samples_leaf must be at least 1 " - "or in (0, 0.5], got %s" - % self.min_samples_leaf) + if not 0.0 < self.min_samples_leaf <= 0.5: + raise ValueError( + "min_samples_leaf must be at least 1 " + "or in (0, 0.5], got %s" % self.min_samples_leaf + ) if isinstance(self.min_samples_split, numbers.Integral): if not 2 <= self.min_samples_split: - raise ValueError("min_samples_split must be an integer " - "greater than 1 or a float in (0.0, 1.0]; " - "got the integer %s" - % self.min_samples_split) + raise ValueError( + "min_samples_split must be an integer " + "greater than 1 or a float in (0.0, 1.0]; " + "got the integer %s" % self.min_samples_split + ) else: # float - if not 0. < self.min_samples_split <= 1.: - raise ValueError("min_samples_split must be an integer " - "greater than 1 or a float in (0.0, 1.0]; " - "got the float %s" - % self.min_samples_split) + if not 0.0 < self.min_samples_split <= 1.0: + raise ValueError( + "min_samples_split must be an integer " + "greater than 1 or a float in (0.0, 1.0]; " + "got the float %s" % self.min_samples_split + ) if not 0 <= self.min_weight_fraction_leaf <= 0.5: raise ValueError("min_weight_fraction_leaf must in [0, 0.5]") if getattr(self, "min_impurity_split", None) is not None: - warnings.warn("The min_impurity_split parameter is deprecated. " - "Its default value has changed from 1e-7 to 0 in " - "version 0.23, and it will be removed in 0.25. " - "Use the min_impurity_decrease parameter instead.", - FutureWarning) - - if getattr(self, "min_impurity_split") < 0.: - raise ValueError("min_impurity_split must be greater than " - "or equal to 0") - if self.min_impurity_decrease < 0.: - raise ValueError("min_impurity_decrease must be greater than " - "or equal to 0") + warnings.warn( + "The min_impurity_split parameter is deprecated. " + "Its default value has changed from 1e-7 to 0 in " + "version 0.23, and it will be removed in 0.25. 
" + "Use the min_impurity_decrease parameter instead.", + FutureWarning, + ) + + if getattr(self, "min_impurity_split") < 0.0: + raise ValueError( + "min_impurity_split must be greater than " "or equal to 0" + ) + if self.min_impurity_decrease < 0.0: + raise ValueError( + "min_impurity_decrease must be greater than " "or equal to 0" + ) if self.max_leaf_nodes is not None: if not isinstance(self.max_leaf_nodes, numbers.Integral): raise ValueError( "max_leaf_nodes must be integral number but was " - "%r" % - self.max_leaf_nodes) + "%r" % self.max_leaf_nodes + ) if self.max_leaf_nodes < 2: raise ValueError( - ("max_leaf_nodes {0} must be either None " - "or larger than 1").format( - self.max_leaf_nodes)) + ("max_leaf_nodes {0} must be either None " "or larger than 1").format( + self.max_leaf_nodes + ) + ) if isinstance(self.max_bins, numbers.Integral): if not 2 <= self.max_bins: - raise ValueError("max_bins must be at least 2, got %s" - % self.max_bins) + raise ValueError("max_bins must be at least 2, got %s" % self.max_bins) else: - raise ValueError("max_bins must be integral number but was " - "%r" % self.max_bins) + raise ValueError( + "max_bins must be integral number but was " "%r" % self.max_bins + ) if isinstance(self.min_bin_size, numbers.Integral): if not 1 <= self.min_bin_size: - raise ValueError("min_bin_size must be at least 1, got %s" - % self.min_bin_size) + raise ValueError( + "min_bin_size must be at least 1, got %s" % self.min_bin_size + ) else: - raise ValueError("min_bin_size must be integral number but was " - "%r" % self.min_bin_size) + raise ValueError( + "min_bin_size must be integral number but was " "%r" % self.min_bin_size + ) def check_sample_weight(self, sample_weight, X, dtype=None): n_samples = _num_samples(X) @@ -174,49 +178,55 @@ def check_sample_weight(self, sample_weight, X, dtype=None): accept_sparse=False, ensure_2d=False, dtype=dtype, - order="C") + order="C", + ) if sample_weight.ndim != 1: raise ValueError("Sample weights must be 1D array or scalar") if sample_weight.shape != (n_samples,): - raise ValueError("sample_weight.shape == {}, expected {}!" 
- .format(sample_weight.shape, (n_samples,))) + raise ValueError( + "sample_weight.shape == {}, expected {}!".format( + sample_weight.shape, (n_samples,) + ) + ) return sample_weight class ExtraTreesClassifier(sklearn_ExtraTreesClassifier, BaseTree): __doc__ = sklearn_ExtraTreesClassifier.__doc__ - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): _parameter_constraints: dict = { **sklearn_ExtraTreesClassifier._parameter_constraints, "max_bins": [Interval(numbers.Integral, 2, None, closed="left")], - "min_bin_size": [Interval(numbers.Integral, 1, None, closed="left")] + "min_bin_size": [Interval(numbers.Integral, 1, None, closed="left")], } - if sklearn_check_version('1.0'): + if sklearn_check_version("1.0"): + def __init__( - self, - n_estimators=100, - criterion="gini", - max_depth=None, - min_samples_split=2, - min_samples_leaf=1, - min_weight_fraction_leaf=0., - max_features='sqrt' if sklearn_check_version('1.1') else 'auto', - max_leaf_nodes=None, - min_impurity_decrease=0., - bootstrap=False, - oob_score=False, - n_jobs=None, - random_state=None, - verbose=0, - warm_start=False, - class_weight=None, - ccp_alpha=0.0, - max_samples=None, - max_bins=256, - min_bin_size=1): + self, + n_estimators=100, + criterion="gini", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features="sqrt" if sklearn_check_version("1.1") else "auto", + max_leaf_nodes=None, + min_impurity_decrease=0.0, + bootstrap=False, + oob_score=False, + n_jobs=None, + random_state=None, + verbose=0, + warm_start=False, + class_weight=None, + ccp_alpha=0.0, + max_samples=None, + max_bins=256, + min_bin_size=1, + ): super(ExtraTreesClassifier, self).__init__( n_estimators=n_estimators, criterion=criterion, @@ -233,7 +243,7 @@ def __init__( random_state=random_state, verbose=verbose, warm_start=warm_start, - class_weight=class_weight + class_weight=class_weight, ) self.warm_start = warm_start self.ccp_alpha = ccp_alpha @@ -242,28 +252,31 @@ def __init__( self.min_bin_size = min_bin_size else: - def __init__(self, - n_estimators=100, - criterion="gini", - max_depth=None, - min_samples_split=2, - min_samples_leaf=1, - min_weight_fraction_leaf=0., - max_features="auto", - max_leaf_nodes=None, - min_impurity_decrease=0., - min_impurity_split=None, - bootstrap=False, - oob_score=False, - n_jobs=None, - random_state=None, - verbose=0, - warm_start=False, - class_weight=None, - ccp_alpha=0.0, - max_samples=None, - max_bins=256, - min_bin_size=1): + + def __init__( + self, + n_estimators=100, + criterion="gini", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features="auto", + max_leaf_nodes=None, + min_impurity_decrease=0.0, + min_impurity_split=None, + bootstrap=False, + oob_score=False, + n_jobs=None, + random_state=None, + verbose=0, + warm_start=False, + class_weight=None, + ccp_alpha=0.0, + max_samples=None, + max_bins=256, + min_bin_size=1, + ): super(ExtraTreesClassifier, self).__init__( n_estimators=n_estimators, criterion=criterion, @@ -283,7 +296,7 @@ def __init__(self, warm_start=warm_start, class_weight=class_weight, ccp_alpha=ccp_alpha, - max_samples=max_samples + max_samples=max_samples, ) self.warm_start = warm_start self.ccp_alpha = ccp_alpha @@ -317,17 +330,22 @@ def fit(self, X, y, sample_weight=None): ------- self : object """ - dispatch(self, 'fit', { - 'onedal': self.__class__._onedal_fit, - 'sklearn': sklearn_ExtraTreesClassifier.fit, - }, X, y, sample_weight) + dispatch( + self, + "fit", 
+ { + "onedal": self.__class__._onedal_fit, + "sklearn": sklearn_ExtraTreesClassifier.fit, + }, + X, + y, + sample_weight, + ) return self def _onedal_fit_ready(self, patching_status, X, y, sample_weight): if sp.issparse(y): - raise ValueError( - "sparse multilabel-indicator for y is not supported." - ) + raise ValueError("sparse multilabel-indicator for y is not supported.") if sklearn_check_version("1.2"): self._validate_params() @@ -335,22 +353,33 @@ def _onedal_fit_ready(self, patching_status, X, y, sample_weight): self._check_parameters() if not self.bootstrap and self.oob_score: - raise ValueError("Out of bag estimation only available" - " if bootstrap=True") - - ready = patching_status.and_conditions([ - (self.oob_score and daal_check_version((2021, 'P', 500)) or not - self.oob_score, - "OOB score is only supported starting from 2021.5 version of oneDAL."), - (not sp.issparse(X), "X is sparse. Sparse input is not supported."), - (self.ccp_alpha == 0.0, - f"Non-zero 'ccp_alpha' ({self.ccp_alpha}) is not supported."), - (self.criterion == "gini", - f"'{self.criterion}' criterion is not supported. " - "Only 'gini' criterion is supported."), - (self.warm_start is False, "Warm start is not supported."), - (self.n_estimators <= 6024, "More than 6024 estimators is not supported.") - ]) + raise ValueError("Out of bag estimation only available" " if bootstrap=True") + + ready = patching_status.and_conditions( + [ + ( + self.oob_score + and daal_check_version((2021, "P", 500)) + or not self.oob_score, + "OOB score is only supported starting from 2021.5 version of oneDAL.", + ), + (not sp.issparse(X), "X is sparse. Sparse input is not supported."), + ( + self.ccp_alpha == 0.0, + f"Non-zero 'ccp_alpha' ({self.ccp_alpha}) is not supported.", + ), + ( + self.criterion == "gini", + f"'{self.criterion}' criterion is not supported. " + "Only 'gini' criterion is supported.", + ), + (self.warm_start is False, "Warm start is not supported."), + ( + self.n_estimators <= 6024, + "More than 6024 estimators is not supported.", + ), + ] + ) if ready: if sklearn_check_version("1.0"): @@ -364,22 +393,29 @@ def _onedal_fit_ready(self, patching_status, X, y, sample_weight): " expected. 
Please change the shape of y to " "(n_samples,), for example using ravel().", DataConversionWarning, - stacklevel=2) + stacklevel=2, + ) check_consistent_length(X, y) y = make2d(y) self.n_outputs_ = y.shape[1] - ready = patching_status.and_conditions([ - (self.n_outputs_ == 1, - f"Number of outputs ({self.n_outputs_}) is not 1."), - (y.dtype in [np.float32, np.float64, np.int32, np.int64], - f"Datatype ({y.dtype}) for y is not supported.") - ]) + ready = patching_status.and_conditions( + [ + ( + self.n_outputs_ == 1, + f"Number of outputs ({self.n_outputs_}) is not 1.", + ), + ( + y.dtype in [np.float32, np.float64, np.int32, np.int64], + f"Datatype ({y.dtype}) for y is not supported.", + ), + ] + ) # TODO: Fix to support integers as input n_samples = X.shape[0] if isinstance(self.max_samples, numbers.Integral): - if not sklearn_check_version('1.2'): + if not sklearn_check_version("1.2"): if not (1 <= self.max_samples <= n_samples): msg = "`max_samples` must be in range 1 to {} but got value {}" raise ValueError(msg.format(n_samples, self.max_samples)) @@ -388,9 +424,9 @@ def _onedal_fit_ready(self, patching_status, X, y, sample_weight): msg = "`max_samples` must be <= n_samples={} but got value {}" raise ValueError(msg.format(n_samples, self.max_samples)) elif isinstance(self.max_samples, numbers.Real): - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): pass - elif sklearn_check_version('1.0'): + elif sklearn_check_version("1.0"): if not (0 < float(self.max_samples) <= 1): msg = "`max_samples` must be in range (0.0, 1.0] but got value {}" raise ValueError(msg.format(self.max_samples)) @@ -433,10 +469,15 @@ def predict(self, X): y : ndarray of shape (n_samples,) or (n_samples, n_outputs) The predicted classes. """ - return dispatch(self, 'predict', { - 'onedal': self.__class__._onedal_predict, - 'sklearn': sklearn_ExtraTreesClassifier.predict, - }, X) + return dispatch( + self, + "predict", + { + "onedal": self.__class__._onedal_predict, + "sklearn": sklearn_ExtraTreesClassifier.predict, + }, + X, + ) @wrap_output_data def predict_proba(self, X): @@ -467,54 +508,64 @@ def predict_proba(self, X): # self._check_proba() if sklearn_check_version("1.0"): self._check_feature_names(X, reset=False) - if hasattr(self, 'n_features_in_'): + if hasattr(self, "n_features_in_"): try: num_features = _num_features(X) except TypeError: num_features = _num_samples(X) if num_features != self.n_features_in_: raise ValueError( - (f'X has {num_features} features, ' - f'but ExtraTreesClassifier is expecting ' - f'{self.n_features_in_} features as input')) - return dispatch(self, 'predict_proba', { - 'onedal': self.__class__._onedal_predict_proba, - 'sklearn': sklearn_ExtraTreesClassifier.predict_proba, - }, X) - - if sklearn_check_version('1.0'): + ( + f"X has {num_features} features, " + f"but ExtraTreesClassifier is expecting " + f"{self.n_features_in_} features as input" + ) + ) + return dispatch( + self, + "predict_proba", + { + "onedal": self.__class__._onedal_predict_proba, + "sklearn": sklearn_ExtraTreesClassifier.predict_proba, + }, + X, + ) + + if sklearn_check_version("1.0"): + @deprecated( "Attribute `n_features_` was deprecated in version 1.0 and will be " - "removed in 1.2. Use `n_features_in_` instead.") + "removed in 1.2. Use `n_features_in_` instead." 
+ ) @property def n_features_(self): return self.n_features_in_ @property def _estimators_(self): - if hasattr(self, '_cached_estimators_'): + if hasattr(self, "_cached_estimators_"): if self._cached_estimators_: return self._cached_estimators_ - if sklearn_check_version('0.22'): + if sklearn_check_version("0.22"): check_is_fitted(self) else: - check_is_fitted(self, '_onedal_model') + check_is_fitted(self, "_onedal_model") classes_ = self.classes_[0] n_classes_ = self.n_classes_[0] # convert model to estimators params = { - 'criterion': self.criterion, - 'max_depth': self.max_depth, - 'min_samples_split': self.min_samples_split, - 'min_samples_leaf': self.min_samples_leaf, - 'min_weight_fraction_leaf': self.min_weight_fraction_leaf, - 'max_features': self.max_features, - 'max_leaf_nodes': self.max_leaf_nodes, - 'min_impurity_decrease': self.min_impurity_decrease, - 'random_state': None, + "criterion": self.criterion, + "max_depth": self.max_depth, + "min_samples_split": self.min_samples_split, + "min_samples_leaf": self.min_samples_leaf, + "min_weight_fraction_leaf": self.min_weight_fraction_leaf, + "max_features": self.max_features, + "max_leaf_nodes": self.max_leaf_nodes, + "min_impurity_decrease": self.min_impurity_decrease, + "random_state": None, } - if not sklearn_check_version('1.0'): - params['min_impurity_split'] = self.min_impurity_split + if not sklearn_check_version("1.0"): + params["min_impurity_split"] = self.min_impurity_split est = ExtraTreeClassifier(**params) # we need to set est.tree_ field with Trees constructed from Intel(R) # oneAPI Data Analytics Library solution @@ -523,29 +574,27 @@ def _estimators_(self): for i in range(self.n_estimators): est_i = clone(est) est_i.set_params( - random_state=random_state_checked.randint( - np.iinfo( - np.int32).max)) - if sklearn_check_version('1.0'): + random_state=random_state_checked.randint(np.iinfo(np.int32).max) + ) + if sklearn_check_version("1.0"): est_i.n_features_in_ = self.n_features_in_ else: est_i.n_features_ = self.n_features_in_ est_i.n_outputs_ = self.n_outputs_ est_i.classes_ = classes_ est_i.n_classes_ = n_classes_ - tree_i_state_class = get_tree_state_cls( - self._onedal_model, i, n_classes_) + tree_i_state_class = get_tree_state_cls(self._onedal_model, i, n_classes_) tree_i_state_dict = { - 'max_depth': tree_i_state_class.max_depth, - 'node_count': tree_i_state_class.node_count, - 'nodes': check_tree_nodes(tree_i_state_class.node_ar), - 'values': tree_i_state_class.value_ar} + "max_depth": tree_i_state_class.max_depth, + "node_count": tree_i_state_class.node_count, + "nodes": check_tree_nodes(tree_i_state_class.node_ar), + "values": tree_i_state_class.value_ar, + } est_i.tree_ = Tree( self.n_features_in_, - np.array( - [n_classes_], - dtype=np.intp), - self.n_outputs_) + np.array([n_classes_], dtype=np.intp), + self.n_outputs_, + ) est_i.tree_.__setstate__(tree_i_state_dict) estimators_.append(est_i) @@ -555,48 +604,64 @@ def _estimators_(self): def _onedal_cpu_supported(self, method_name, *data): class_name = self.__class__.__name__ _patching_status = PatchingConditionsChain( - f'sklearn.ensemble.{class_name}.{method_name}') + f"sklearn.ensemble.{class_name}.{method_name}" + ) - if method_name == 'fit': + if method_name == "fit": ready, X, y, sample_weight = self._onedal_fit_ready(_patching_status, *data) - dal_ready = ready and _patching_status.and_conditions([ - (daal_check_version((2023, 'P', 200)), - "ExtraTrees only supported starting from oneDAL version 2023.2"), - (not sp.issparse(sample_weight), 
"sample_weight is sparse. " - "Sparse input is not supported."), - ]) + dal_ready = ready and _patching_status.and_conditions( + [ + ( + daal_check_version((2023, "P", 200)), + "ExtraTrees only supported starting from oneDAL version 2023.2", + ), + ( + not sp.issparse(sample_weight), + "sample_weight is sparse. " "Sparse input is not supported.", + ), + ] + ) - dal_ready = dal_ready and not hasattr(self, 'estimators_') + dal_ready = dal_ready and not hasattr(self, "estimators_") if dal_ready and (self.random_state is not None): - warnings.warn("Setting 'random_state' value is not supported. " - "State set by oneDAL to default value (777).", - RuntimeWarning) - - elif method_name in ['predict', - 'predict_proba']: + warnings.warn( + "Setting 'random_state' value is not supported. " + "State set by oneDAL to default value (777).", + RuntimeWarning, + ) + elif method_name in ["predict", "predict_proba"]: X = data[0] - dal_ready = _patching_status.and_conditions([ - (hasattr(self, '_onedal_model'), "oneDAL model was not trained."), - (not sp.issparse(X), "X is sparse. Sparse input is not supported."), - (self.warm_start is False, "Warm start is not supported."), - (daal_check_version((2023, 'P', 100)), - "ExtraTrees only supported starting from oneDAL version 2023.2") - ]) - if hasattr(self, 'n_outputs_'): - dal_ready = dal_ready and _patching_status.and_conditions([ - (self.n_outputs_ == 1, - f"Number of outputs ({self.n_outputs_}) is not 1."), - ]) + dal_ready = _patching_status.and_conditions( + [ + (hasattr(self, "_onedal_model"), "oneDAL model was not trained."), + (not sp.issparse(X), "X is sparse. Sparse input is not supported."), + (self.warm_start is False, "Warm start is not supported."), + ( + daal_check_version((2023, "P", 100)), + "ExtraTrees only supported starting from oneDAL version 2023.2", + ), + ] + ) + if hasattr(self, "n_outputs_"): + dal_ready = dal_ready and _patching_status.and_conditions( + [ + ( + self.n_outputs_ == 1, + f"Number of outputs ({self.n_outputs_}) is not 1.", + ), + ] + ) else: dal_ready = False else: raise RuntimeError( - f'Unknown method {method_name} in {self.__class__.__name__}') + f"Unknown method {method_name} in {self.__class__.__name__}" + ) _patching_status.write_log() return dal_ready @@ -604,62 +669,85 @@ def _onedal_cpu_supported(self, method_name, *data): def _onedal_gpu_supported(self, method_name, *data): class_name = self.__class__.__name__ _patching_status = PatchingConditionsChain( - f'sklearn.ensemble.{class_name}.{method_name}') + f"sklearn.ensemble.{class_name}.{method_name}" + ) - if method_name == 'fit': + if method_name == "fit": ready, X, y, sample_weight = self._onedal_fit_ready(_patching_status, *data) - dal_ready = ready and _patching_status.and_conditions([ - (daal_check_version((2023, 'P', 100)), - "ExtraTrees only supported starting from oneDAL version 2023.1"), - (sample_weight is not None, "sample_weight is not supported.") - ]) + dal_ready = ready and _patching_status.and_conditions( + [ + ( + daal_check_version((2023, "P", 100)), + "ExtraTrees only supported starting from oneDAL version 2023.1", + ), + (sample_weight is not None, "sample_weight is not supported."), + ] + ) - dal_ready &= not hasattr(self, 'estimators_') + dal_ready &= not hasattr(self, "estimators_") if dal_ready and (self.random_state is not None): - warnings.warn("Setting 'random_state' value is not supported. 
" - "State set by oneDAL to default value (777).", - RuntimeWarning) - - elif method_name in ['predict', - 'predict_proba']: + warnings.warn( + "Setting 'random_state' value is not supported. " + "State set by oneDAL to default value (777).", + RuntimeWarning, + ) + elif method_name in ["predict", "predict_proba"]: X = data[0] - dal_ready = hasattr(self, '_onedal_model') and hasattr(self, 'n_outputs_') + dal_ready = hasattr(self, "_onedal_model") and hasattr(self, "n_outputs_") if dal_ready: - dal_ready = _patching_status.and_conditions([ - (not sp.issparse(X), "X is sparse. Sparse input is not supported."), - (self.warm_start is False, "Warm start is not supported."), - (daal_check_version((2023, 'P', 100)), - "ExtraTrees only supported starting from oneDAL version 2023.1") - ]) + dal_ready = _patching_status.and_conditions( + [ + ( + not sp.issparse(X), + "X is sparse. Sparse input is not supported.", + ), + (self.warm_start is False, "Warm start is not supported."), + ( + daal_check_version((2023, "P", 100)), + "ExtraTrees only supported starting from oneDAL version 2023.1", + ), + ] + ) - if hasattr(self, 'n_outputs_'): - dal_ready &= _patching_status.and_conditions([ - (self.n_outputs_ == 1, - f"Number of outputs ({self.n_outputs_}) is not 1."), - ]) + if hasattr(self, "n_outputs_"): + dal_ready &= _patching_status.and_conditions( + [ + ( + self.n_outputs_ == 1, + f"Number of outputs ({self.n_outputs_}) is not 1.", + ), + ] + ) else: raise RuntimeError( - f'Unknown method {method_name} in {self.__class__.__name__}') + f"Unknown method {method_name} in {self.__class__.__name__}" + ) _patching_status.write_log() return dal_ready def _onedal_fit(self, X, y, sample_weight=None, queue=None): - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): X, y = self._validate_data( - X, y, multi_output=False, accept_sparse=False, - dtype=[np.float64, np.float32] + X, + y, + multi_output=False, + accept_sparse=False, + dtype=[np.float64, np.float32], ) else: X, y = check_X_y( - X, y, accept_sparse=False, dtype=[np.float64, np.float32], - multi_output=False + X, + y, + accept_sparse=False, + dtype=[np.float64, np.float32], + multi_output=False, ) if sample_weight is not None: @@ -683,7 +771,7 @@ def _onedal_fit(self, X, y, sample_weight=None, queue=None): n_classes_ = self.n_classes_[0] self.n_features_in_ = X.shape[1] - if not sklearn_check_version('1.0'): + if not sklearn_check_version("1.0"): self.n_features_ = self.n_features_in_ if expanded_class_weight is not None: @@ -695,43 +783,42 @@ def _onedal_fit(self, X, y, sample_weight=None, queue=None): sample_weight = [sample_weight] if n_classes_ < 2: - raise ValueError( - "Training data only contain information about one class.") + raise ValueError("Training data only contain information about one class.") if self.oob_score: - err = 'out_of_bag_error_accuracy|out_of_bag_error_decision_function' + err = "out_of_bag_error_accuracy|out_of_bag_error_decision_function" else: - err = 'none' + err = "none" onedal_params = { - 'n_estimators': self.n_estimators, - 'criterion': self.criterion, - 'max_depth': self.max_depth, - 'min_samples_split': self.min_samples_split, - 'min_samples_leaf': self.min_samples_leaf, - 'min_weight_fraction_leaf': self.min_weight_fraction_leaf, - 'max_features': self.max_features, - 'max_leaf_nodes': self.max_leaf_nodes, - 'min_impurity_decrease': self.min_impurity_decrease, - 'bootstrap': self.bootstrap, - 'oob_score': self.oob_score, - 'n_jobs': self.n_jobs, - 'random_state': self.random_state, - 'verbose': 
self.verbose, - 'warm_start': self.warm_start, - 'error_metric_mode': err, - 'variable_importance_mode': 'mdi', - 'class_weight': self.class_weight, - 'max_bins': self.max_bins, - 'min_bin_size': self.min_bin_size, - 'max_samples': self.max_samples + "n_estimators": self.n_estimators, + "criterion": self.criterion, + "max_depth": self.max_depth, + "min_samples_split": self.min_samples_split, + "min_samples_leaf": self.min_samples_leaf, + "min_weight_fraction_leaf": self.min_weight_fraction_leaf, + "max_features": self.max_features, + "max_leaf_nodes": self.max_leaf_nodes, + "min_impurity_decrease": self.min_impurity_decrease, + "bootstrap": self.bootstrap, + "oob_score": self.oob_score, + "n_jobs": self.n_jobs, + "random_state": self.random_state, + "verbose": self.verbose, + "warm_start": self.warm_start, + "error_metric_mode": err, + "variable_importance_mode": "mdi", + "class_weight": self.class_weight, + "max_bins": self.max_bins, + "min_bin_size": self.min_bin_size, + "max_samples": self.max_samples, } - if daal_check_version((2023, 'P', 101)): - onedal_params['splitter_mode'] = "random" - if not sklearn_check_version('1.0'): - onedal_params['min_impurity_split'] = self.min_impurity_split + if daal_check_version((2023, "P", 101)): + onedal_params["splitter_mode"] = "random" + if not sklearn_check_version("1.0"): + onedal_params["min_impurity_split"] = self.min_impurity_split else: - onedal_params['min_impurity_split'] = None + onedal_params["min_impurity_split"] = None self._cached_estimators_ = None # Compute @@ -754,13 +841,12 @@ def _onedal_predict(self, X, queue=None): self._check_feature_names(X, reset=False) res = self._onedal_estimator.predict(X, queue=queue) - return np.take(self.classes_, - res.ravel().astype(np.int64, casting='unsafe')) + return np.take(self.classes_, res.ravel().astype(np.int64, casting="unsafe")) def _onedal_predict_proba(self, X, queue=None): X = check_array(X, dtype=[np.float64, np.float32]) check_is_fitted(self) - if sklearn_check_version('0.23'): + if sklearn_check_version("0.23"): self._check_n_features(X, reset=False) if sklearn_check_version("1.0"): self._check_feature_names(X, reset=False) @@ -770,36 +856,38 @@ def _onedal_predict_proba(self, X, queue=None): class ExtraTreesRegressor(sklearn_ExtraTreesRegressor, BaseTree): __doc__ = sklearn_ExtraTreesRegressor.__doc__ - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): _parameter_constraints: dict = { **sklearn_ExtraTreesRegressor._parameter_constraints, "max_bins": [Interval(numbers.Integral, 2, None, closed="left")], - "min_bin_size": [Interval(numbers.Integral, 1, None, closed="left")] + "min_bin_size": [Interval(numbers.Integral, 1, None, closed="left")], } - if sklearn_check_version('1.0'): + if sklearn_check_version("1.0"): + def __init__( - self, - n_estimators=100, - *, - criterion="squared_error", - max_depth=None, - min_samples_split=2, - min_samples_leaf=1, - min_weight_fraction_leaf=0., - max_features=1.0 if sklearn_check_version('1.1') else 'auto', - max_leaf_nodes=None, - min_impurity_decrease=0., - bootstrap=False, - oob_score=False, - n_jobs=None, - random_state=None, - verbose=0, - warm_start=False, - ccp_alpha=0.0, - max_samples=None, - max_bins=256, - min_bin_size=1): + self, + n_estimators=100, + *, + criterion="squared_error", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features=1.0 if sklearn_check_version("1.1") else "auto", + max_leaf_nodes=None, + min_impurity_decrease=0.0, + bootstrap=False, + 
oob_score=False, + n_jobs=None, + random_state=None, + verbose=0, + warm_start=False, + ccp_alpha=0.0, + max_samples=None, + max_bins=256, + min_bin_size=1, + ): super(ExtraTreesRegressor, self).__init__( n_estimators=n_estimators, criterion=criterion, @@ -815,36 +903,40 @@ def __init__( n_jobs=n_jobs, random_state=random_state, verbose=verbose, - warm_start=warm_start + warm_start=warm_start, ) self.warm_start = warm_start self.ccp_alpha = ccp_alpha self.max_samples = max_samples self.max_bins = max_bins self.min_bin_size = min_bin_size + else: - def __init__(self, - n_estimators=100, *, - criterion="mse", - max_depth=None, - min_samples_split=2, - min_samples_leaf=1, - min_weight_fraction_leaf=0., - max_features="auto", - max_leaf_nodes=None, - min_impurity_decrease=0., - min_impurity_split=None, - bootstrap=False, - oob_score=False, - n_jobs=None, - random_state=None, - verbose=0, - warm_start=False, - ccp_alpha=0.0, - max_samples=None, - max_bins=256, - min_bin_size=1 - ): + + def __init__( + self, + n_estimators=100, + *, + criterion="mse", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features="auto", + max_leaf_nodes=None, + min_impurity_decrease=0.0, + min_impurity_split=None, + bootstrap=False, + oob_score=False, + n_jobs=None, + random_state=None, + verbose=0, + warm_start=False, + ccp_alpha=0.0, + max_samples=None, + max_bins=256, + min_bin_size=1, + ): super(ExtraTreesRegressor, self).__init__( n_estimators=n_estimators, criterion=criterion, @@ -863,7 +955,7 @@ def __init__(self, verbose=verbose, warm_start=warm_start, ccp_alpha=ccp_alpha, - max_samples=max_samples + max_samples=max_samples, ) self.warm_start = warm_start self.ccp_alpha = ccp_alpha @@ -873,27 +965,27 @@ def __init__(self, @property def _estimators_(self): - if hasattr(self, '_cached_estimators_'): + if hasattr(self, "_cached_estimators_"): if self._cached_estimators_: return self._cached_estimators_ - if sklearn_check_version('0.22'): + if sklearn_check_version("0.22"): check_is_fitted(self) else: - check_is_fitted(self, '_onedal_model') + check_is_fitted(self, "_onedal_model") # convert model to estimators params = { - 'criterion': self.criterion, - 'max_depth': self.max_depth, - 'min_samples_split': self.min_samples_split, - 'min_samples_leaf': self.min_samples_leaf, - 'min_weight_fraction_leaf': self.min_weight_fraction_leaf, - 'max_features': self.max_features, - 'max_leaf_nodes': self.max_leaf_nodes, - 'min_impurity_decrease': self.min_impurity_decrease, - 'random_state': None, + "criterion": self.criterion, + "max_depth": self.max_depth, + "min_samples_split": self.min_samples_split, + "min_samples_leaf": self.min_samples_leaf, + "min_weight_fraction_leaf": self.min_weight_fraction_leaf, + "max_features": self.max_features, + "max_leaf_nodes": self.max_leaf_nodes, + "min_impurity_decrease": self.min_impurity_decrease, + "random_state": None, } - if not sklearn_check_version('1.0'): - params['min_impurity_split'] = self.min_impurity_split + if not sklearn_check_version("1.0"): + params["min_impurity_split"] = self.min_impurity_split est = ExtraTreeRegressor(**params) # we need to set est.tree_ field with Trees constructed from Intel(R) # oneAPI Data Analytics Library solution @@ -902,26 +994,25 @@ def _estimators_(self): for i in range(self.n_estimators): est_i = clone(est) est_i.set_params( - random_state=random_state_checked.randint( - np.iinfo( - np.int32).max)) - if sklearn_check_version('1.0'): + 
random_state=random_state_checked.randint(np.iinfo(np.int32).max) + ) + if sklearn_check_version("1.0"): est_i.n_features_in_ = self.n_features_in_ else: est_i.n_features_ = self.n_features_in_ est_i.n_classes_ = 1 est_i.n_outputs_ = self.n_outputs_ - tree_i_state_class = get_tree_state_reg( - self._onedal_model, i) + tree_i_state_class = get_tree_state_reg(self._onedal_model, i) tree_i_state_dict = { - 'max_depth': tree_i_state_class.max_depth, - 'node_count': tree_i_state_class.node_count, - 'nodes': check_tree_nodes(tree_i_state_class.node_ar), - 'values': tree_i_state_class.value_ar} + "max_depth": tree_i_state_class.max_depth, + "node_count": tree_i_state_class.node_count, + "nodes": check_tree_nodes(tree_i_state_class.node_ar), + "values": tree_i_state_class.value_ar, + } est_i.tree_ = Tree( - self.n_features_in_, np.array( - [1], dtype=np.intp), self.n_outputs_) + self.n_features_in_, np.array([1], dtype=np.intp), self.n_outputs_ + ) est_i.tree_.__setstate__(tree_i_state_dict) estimators_.append(est_i) @@ -929,9 +1020,7 @@ def _estimators_(self): def _onedal_fit_ready(self, patching_status, X, y, sample_weight): if sp.issparse(y): - raise ValueError( - "sparse multilabel-indicator for y is not supported." - ) + raise ValueError("sparse multilabel-indicator for y is not supported.") if sklearn_check_version("1.2"): self._validate_params() @@ -939,30 +1028,41 @@ def _onedal_fit_ready(self, patching_status, X, y, sample_weight): self._check_parameters() if not self.bootstrap and self.oob_score: - raise ValueError("Out of bag estimation only available" - " if bootstrap=True") + raise ValueError("Out of bag estimation only available" " if bootstrap=True") - if sklearn_check_version('1.0') and self.criterion == "mse": + if sklearn_check_version("1.0") and self.criterion == "mse": warnings.warn( "Criterion 'mse' was deprecated in v1.0 and will be " "removed in version 1.2. Use `criterion='squared_error'` " "which is equivalent.", - FutureWarning + FutureWarning, ) - ready = patching_status.and_conditions([ - (self.oob_score and daal_check_version((2021, 'P', 500)) or not - self.oob_score, - "OOB score is only supported starting from 2021.5 version of oneDAL."), - (self.warm_start is False, "Warm start is not supported."), - (self.criterion in ["mse", "squared_error"], - f"'{self.criterion}' criterion is not supported. " - "Only 'mse' and 'squared_error' criteria are supported."), - (self.ccp_alpha == 0.0, - f"Non-zero 'ccp_alpha' ({self.ccp_alpha}) is not supported."), - (not sp.issparse(X), "X is sparse. Sparse input is not supported."), - (self.n_estimators <= 6024, "More than 6024 estimators is not supported.") - ]) + ready = patching_status.and_conditions( + [ + ( + self.oob_score + and daal_check_version((2021, "P", 500)) + or not self.oob_score, + "OOB score is only supported starting from 2021.5 version of oneDAL.", + ), + (self.warm_start is False, "Warm start is not supported."), + ( + self.criterion in ["mse", "squared_error"], + f"'{self.criterion}' criterion is not supported. " + "Only 'mse' and 'squared_error' criteria are supported.", + ), + ( + self.ccp_alpha == 0.0, + f"Non-zero 'ccp_alpha' ({self.ccp_alpha}) is not supported.", + ), + (not sp.issparse(X), "X is sparse. 
Sparse input is not supported."), + ( + self.n_estimators <= 6024, + "More than 6024 estimators is not supported.", + ), + ] + ) if ready: if sklearn_check_version("1.0"): @@ -972,10 +1072,13 @@ def _onedal_fit_ready(self, patching_status, X, y, sample_weight): y = np.atleast_1d(y) if y.ndim == 2 and y.shape[1] == 1: - warnings.warn("A column-vector y was passed when a 1d array was" - " expected. Please change the shape of y to " - "(n_samples,), for example using ravel().", - DataConversionWarning, stacklevel=2) + warnings.warn( + "A column-vector y was passed when a 1d array was" + " expected. Please change the shape of y to " + "(n_samples,), for example using ravel().", + DataConversionWarning, + stacklevel=2, + ) y = check_array(y, ensure_2d=False, dtype=X.dtype) check_consistent_length(X, y) @@ -986,14 +1089,18 @@ def _onedal_fit_ready(self, patching_status, X, y, sample_weight): y = np.reshape(y, (-1, 1)) self.n_outputs_ = y.shape[1] - ready = patching_status.and_conditions([ - (self.n_outputs_ == 1, - f"Number of outputs ({self.n_outputs_}) is not 1.") - ]) + ready = patching_status.and_conditions( + [ + ( + self.n_outputs_ == 1, + f"Number of outputs ({self.n_outputs_}) is not 1.", + ) + ] + ) n_samples = X.shape[0] if isinstance(self.max_samples, numbers.Integral): - if not sklearn_check_version('1.2'): + if not sklearn_check_version("1.2"): if not (1 <= self.max_samples <= n_samples): msg = "`max_samples` must be in range 1 to {} but got value {}" raise ValueError(msg.format(n_samples, self.max_samples)) @@ -1002,9 +1109,9 @@ def _onedal_fit_ready(self, patching_status, X, y, sample_weight): msg = "`max_samples` must be <= n_samples={} but got value {}" raise ValueError(msg.format(n_samples, self.max_samples)) elif isinstance(self.max_samples, numbers.Real): - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): pass - elif sklearn_check_version('1.0'): + elif sklearn_check_version("1.0"): if not (0 < float(self.max_samples) <= 1): msg = "`max_samples` must be in range (0.0, 1.0] but got value {}" raise ValueError(msg.format(self.max_samples)) @@ -1028,48 +1135,64 @@ def _onedal_fit_ready(self, patching_status, X, y, sample_weight): def _onedal_cpu_supported(self, method_name, *data): class_name = self.__class__.__name__ _patching_status = PatchingConditionsChain( - f'sklearn.ensemble.{class_name}.{method_name}') + f"sklearn.ensemble.{class_name}.{method_name}" + ) - if method_name == 'fit': + if method_name == "fit": ready, X, y, sample_weight = self._onedal_fit_ready(_patching_status, *data) - dal_ready = ready and _patching_status.and_conditions([ - (daal_check_version((2023, 'P', 200)), - "ExtraTrees only supported starting from oneDAL version 2023.2"), - (not sp.issparse(sample_weight), "sample_weight is sparse. " - "Sparse input is not supported."), - ]) + dal_ready = ready and _patching_status.and_conditions( + [ + ( + daal_check_version((2023, "P", 200)), + "ExtraTrees only supported starting from oneDAL version 2023.2", + ), + ( + not sp.issparse(sample_weight), + "sample_weight is sparse. " "Sparse input is not supported.", + ), + ] + ) - dal_ready &= not hasattr(self, 'estimators_') + dal_ready &= not hasattr(self, "estimators_") if dal_ready and (self.random_state is not None): - warnings.warn("Setting 'random_state' value is not supported. " - "State set by oneDAL to default value (777).", - RuntimeWarning) - - elif method_name in ['predict', - 'predict_proba']: + warnings.warn( + "Setting 'random_state' value is not supported. 
" + "State set by oneDAL to default value (777).", + RuntimeWarning, + ) + elif method_name in ["predict", "predict_proba"]: X = data[0] - dal_ready = _patching_status.and_conditions([ - (hasattr(self, '_onedal_model'), "oneDAL model was not trained."), - (not sp.issparse(X), "X is sparse. Sparse input is not supported."), - (self.warm_start is False, "Warm start is not supported."), - (daal_check_version((2023, 'P', 200)), - "ExtraTrees only supported starting from oneDAL version 2023.2") - ]) - if hasattr(self, 'n_outputs_'): - dal_ready &= _patching_status.and_conditions([ - (self.n_outputs_ == 1, - f"Number of outputs ({self.n_outputs_}) is not 1."), - ]) + dal_ready = _patching_status.and_conditions( + [ + (hasattr(self, "_onedal_model"), "oneDAL model was not trained."), + (not sp.issparse(X), "X is sparse. Sparse input is not supported."), + (self.warm_start is False, "Warm start is not supported."), + ( + daal_check_version((2023, "P", 200)), + "ExtraTrees only supported starting from oneDAL version 2023.2", + ), + ] + ) + if hasattr(self, "n_outputs_"): + dal_ready &= _patching_status.and_conditions( + [ + ( + self.n_outputs_ == 1, + f"Number of outputs ({self.n_outputs_}) is not 1.", + ), + ] + ) else: dal_ready = False else: raise RuntimeError( - f'Unknown method {method_name} in {self.__class__.__name__}') + f"Unknown method {method_name} in {self.__class__.__name__}" + ) _patching_status.write_log() return dal_ready @@ -1077,55 +1200,66 @@ def _onedal_cpu_supported(self, method_name, *data): def _onedal_gpu_supported(self, method_name, *data): class_name = self.__class__.__name__ _patching_status = PatchingConditionsChain( - f'sklearn.ensemble.{class_name}.{method_name}') + f"sklearn.ensemble.{class_name}.{method_name}" + ) - if method_name == 'fit': + if method_name == "fit": ready, X, y, sample_weight = self._onedal_fit_ready(_patching_status, *data) - dal_ready = ready and _patching_status.and_conditions([ - (daal_check_version((2023, 'P', 100)), - "ExtraTrees only supported starting from oneDAL version 2023.1"), - (sample_weight is not None, "sample_weight is not supported."), - ]) + dal_ready = ready and _patching_status.and_conditions( + [ + ( + daal_check_version((2023, "P", 100)), + "ExtraTrees only supported starting from oneDAL version 2023.1", + ), + (sample_weight is not None, "sample_weight is not supported."), + ] + ) - dal_ready &= not hasattr(self, 'estimators_') + dal_ready &= not hasattr(self, "estimators_") if dal_ready and (self.random_state is not None): - warnings.warn("Setting 'random_state' value is not supported. " - "State set by oneDAL to default value (777).", - RuntimeWarning) - - elif method_name in ['predict', - 'predict_proba']: + warnings.warn( + "Setting 'random_state' value is not supported. " + "State set by oneDAL to default value (777).", + RuntimeWarning, + ) + elif method_name in ["predict", "predict_proba"]: X = data[0] - dal_ready = _patching_status.and_conditions([ - (hasattr(self, '_onedal_model'), "oneDAL model was not trained."), - (not sp.issparse(X), "X is sparse. 
Sparse input is not supported."), - (self.warm_start is False, "Warm start is not supported."), - - (daal_check_version((2023, 'P', 100)), - "ExtraTrees only supported starting from oneDAL version 2023.1") - ]) - if hasattr(self, 'n_outputs_'): - dal_ready &= _patching_status.and_conditions([ - (self.n_outputs_ == 1, - f"Number of outputs ({self.n_outputs_}) is not 1."), - ]) + dal_ready = _patching_status.and_conditions( + [ + (hasattr(self, "_onedal_model"), "oneDAL model was not trained."), + (not sp.issparse(X), "X is sparse. Sparse input is not supported."), + (self.warm_start is False, "Warm start is not supported."), + ( + daal_check_version((2023, "P", 100)), + "ExtraTrees only supported starting from oneDAL version 2023.1", + ), + ] + ) + if hasattr(self, "n_outputs_"): + dal_ready &= _patching_status.and_conditions( + [ + ( + self.n_outputs_ == 1, + f"Number of outputs ({self.n_outputs_}) is not 1.", + ), + ] + ) else: raise RuntimeError( - f'Unknown method {method_name} in {self.__class__.__name__}') + f"Unknown method {method_name} in {self.__class__.__name__}" + ) _patching_status.write_log() return dal_ready def _onedal_fit(self, X, y, sample_weight=None, queue=None): if sp.issparse(y): - raise ValueError( - "sparse multilabel-indicator for y is not supported." - ) + raise ValueError("sparse multilabel-indicator for y is not supported.") if sklearn_check_version("1.2"): self._validate_params() else: @@ -1142,41 +1276,42 @@ def _onedal_fit(self, X, y, sample_weight=None, queue=None): " expected. Please change the shape of y to " "(n_samples,), for example using ravel().", DataConversionWarning, - stacklevel=2) + stacklevel=2, + ) y = check_array(y, ensure_2d=False, dtype=X.dtype) check_consistent_length(X, y) self.n_features_in_ = X.shape[1] - if not sklearn_check_version('1.0'): + if not sklearn_check_version("1.0"): self.n_features_ = self.n_features_in_ rs_ = check_random_state(self.random_state) if self.oob_score: - err = 'out_of_bag_error_r2|out_of_bag_error_prediction' + err = "out_of_bag_error_r2|out_of_bag_error_prediction" else: - err = 'none' + err = "none" onedal_params = { - 'n_estimators': self.n_estimators, - 'criterion': self.criterion, - 'max_depth': self.max_depth, - 'min_samples_split': self.min_samples_split, - 'min_samples_leaf': self.min_samples_leaf, - 'min_weight_fraction_leaf': self.min_weight_fraction_leaf, - 'max_features': self.max_features, - 'max_leaf_nodes': self.max_leaf_nodes, - 'min_impurity_decrease': self.min_impurity_decrease, - 'bootstrap': self.bootstrap, - 'oob_score': self.oob_score, - 'n_jobs': self.n_jobs, - 'random_state': rs_, - 'verbose': self.verbose, - 'warm_start': self.warm_start, - 'error_metric_mode': err, - 'variable_importance_mode': 'mdi', - 'max_samples': self.max_samples + "n_estimators": self.n_estimators, + "criterion": self.criterion, + "max_depth": self.max_depth, + "min_samples_split": self.min_samples_split, + "min_samples_leaf": self.min_samples_leaf, + "min_weight_fraction_leaf": self.min_weight_fraction_leaf, + "max_features": self.max_features, + "max_leaf_nodes": self.max_leaf_nodes, + "min_impurity_decrease": self.min_impurity_decrease, + "bootstrap": self.bootstrap, + "oob_score": self.oob_score, + "n_jobs": self.n_jobs, + "random_state": rs_, + "verbose": self.verbose, + "warm_start": self.warm_start, + "error_metric_mode": err, + "variable_importance_mode": "mdi", + "max_samples": self.max_samples, } - if daal_check_version((2023, 'P', 101)): - onedal_params['splitter_mode'] = "random" + if 
daal_check_version((2023, "P", 101)): + onedal_params["splitter_mode"] = "random" self._cached_estimators_ = None self._onedal_estimator = self._onedal_regressor(**onedal_params) self._onedal_estimator.fit(X, y, sample_weight, queue=queue) @@ -1219,10 +1354,17 @@ def fit(self, X, y, sample_weight=None): ------- self : object """ - dispatch(self, 'fit', { - 'onedal': self.__class__._onedal_fit, - 'sklearn': sklearn_ExtraTreesRegressor.fit, - }, X, y, sample_weight) + dispatch( + self, + "fit", + { + "onedal": self.__class__._onedal_fit, + "sklearn": sklearn_ExtraTreesRegressor.fit, + }, + X, + y, + sample_weight, + ) return self @wrap_output_data @@ -1247,15 +1389,22 @@ def predict(self, X): y : ndarray of shape (n_samples,) or (n_samples, n_outputs) The predicted classes. """ - return dispatch(self, 'predict', { - 'onedal': self.__class__._onedal_predict, - 'sklearn': sklearn_ExtraTreesRegressor.predict, - }, X) + return dispatch( + self, + "predict", + { + "onedal": self.__class__._onedal_predict, + "sklearn": sklearn_ExtraTreesRegressor.predict, + }, + X, + ) + + if sklearn_check_version("1.0"): - if sklearn_check_version('1.0'): @deprecated( "Attribute `n_features_` was deprecated in version 1.0 and will be " - "removed in 1.2. Use `n_features_in_` instead.") + "removed in 1.2. Use `n_features_in_` instead." + ) @property def n_features_(self): return self.n_features_in_ diff --git a/sklearnex/preview/ensemble/forest.py b/sklearnex/preview/ensemble/forest.py index 4ff88fc6f9..99d36a2c2f 100755 --- a/sklearnex/preview/ensemble/forest.py +++ b/sklearnex/preview/ensemble/forest.py @@ -15,49 +15,41 @@ # limitations under the License. # =============================================================================== -from daal4py.sklearn._utils import ( - daal_check_version, sklearn_check_version, - make2d, check_tree_nodes -) - -import numpy as np - import numbers - import warnings - from abc import ABC -from sklearn.exceptions import DataConversionWarning - -from ..._config import get_config -from ..._device_offload import dispatch, wrap_output_data - +import numpy as np +from scipy import sparse as sp +from sklearn.base import clone from sklearn.ensemble import RandomForestClassifier as sklearn_RandomForestClassifier from sklearn.ensemble import RandomForestRegressor as sklearn_RandomForestRegressor - -from sklearn.utils.validation import ( - check_is_fitted, - check_consistent_length, - check_array, - check_X_y) - -from onedal.datatypes import _num_features, _num_samples - -from sklearn.utils import check_random_state, deprecated - -from sklearn.base import clone - +from sklearn.exceptions import DataConversionWarning from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor from sklearn.tree._tree import Tree +from sklearn.utils import check_random_state, deprecated +from sklearn.utils.validation import ( + check_array, + check_consistent_length, + check_is_fitted, + check_X_y, +) +from daal4py.sklearn._utils import ( + check_tree_nodes, + daal_check_version, + make2d, + sklearn_check_version, +) from onedal.ensemble import RandomForestClassifier as onedal_RandomForestClassifier from onedal.ensemble import RandomForestRegressor as onedal_RandomForestRegressor from onedal.primitives import get_tree_state_cls, get_tree_state_reg +from onedal.utils import _num_features, _num_samples -from scipy import sparse as sp +from ..._config import get_config +from ..._device_offload import dispatch, wrap_output_data -if sklearn_check_version('1.2'): +if sklearn_check_version("1.2"): from 
sklearn.utils._param_validation import Interval, StrOptions @@ -69,7 +61,7 @@ def _fit_proba(self, X, y, sample_weight=None, queue=None): # We use stock metaestimators below, so the only way # to pass a queue is using config_context. cfg = get_config() - cfg['target_offload'] = queue + cfg["target_offload"] = queue def _save_attributes(self): self._onedal_model = self._onedal_estimator._onedal_model @@ -97,66 +89,77 @@ def _check_parameters(self): ) if isinstance(self.min_samples_leaf, numbers.Integral): if not 1 <= self.min_samples_leaf: - raise ValueError("min_samples_leaf must be at least 1 " - "or in (0, 0.5], got %s" - % self.min_samples_leaf) + raise ValueError( + "min_samples_leaf must be at least 1 " + "or in (0, 0.5], got %s" % self.min_samples_leaf + ) else: # float - if not 0. < self.min_samples_leaf <= 0.5: - raise ValueError("min_samples_leaf must be at least 1 " - "or in (0, 0.5], got %s" - % self.min_samples_leaf) + if not 0.0 < self.min_samples_leaf <= 0.5: + raise ValueError( + "min_samples_leaf must be at least 1 " + "or in (0, 0.5], got %s" % self.min_samples_leaf + ) if isinstance(self.min_samples_split, numbers.Integral): if not 2 <= self.min_samples_split: - raise ValueError("min_samples_split must be an integer " - "greater than 1 or a float in (0.0, 1.0]; " - "got the integer %s" - % self.min_samples_split) + raise ValueError( + "min_samples_split must be an integer " + "greater than 1 or a float in (0.0, 1.0]; " + "got the integer %s" % self.min_samples_split + ) else: # float - if not 0. < self.min_samples_split <= 1.: - raise ValueError("min_samples_split must be an integer " - "greater than 1 or a float in (0.0, 1.0]; " - "got the float %s" - % self.min_samples_split) + if not 0.0 < self.min_samples_split <= 1.0: + raise ValueError( + "min_samples_split must be an integer " + "greater than 1 or a float in (0.0, 1.0]; " + "got the float %s" % self.min_samples_split + ) if not 0 <= self.min_weight_fraction_leaf <= 0.5: raise ValueError("min_weight_fraction_leaf must in [0, 0.5]") if self.min_impurity_split is not None: - warnings.warn("The min_impurity_split parameter is deprecated. " - "Its default value has changed from 1e-7 to 0 in " - "version 0.23, and it will be removed in 0.25. " - "Use the min_impurity_decrease parameter instead.", - FutureWarning) - - if self.min_impurity_split < 0.: - raise ValueError("min_impurity_split must be greater than " - "or equal to 0") - if self.min_impurity_decrease < 0.: - raise ValueError("min_impurity_decrease must be greater than " - "or equal to 0") + warnings.warn( + "The min_impurity_split parameter is deprecated. " + "Its default value has changed from 1e-7 to 0 in " + "version 0.23, and it will be removed in 0.25. 
" + "Use the min_impurity_decrease parameter instead.", + FutureWarning, + ) + + if self.min_impurity_split < 0.0: + raise ValueError( + "min_impurity_split must be greater than " "or equal to 0" + ) + if self.min_impurity_decrease < 0.0: + raise ValueError( + "min_impurity_decrease must be greater than " "or equal to 0" + ) if self.max_leaf_nodes is not None: if not isinstance(self.max_leaf_nodes, numbers.Integral): raise ValueError( "max_leaf_nodes must be integral number but was " - "%r" % - self.max_leaf_nodes) + "%r" % self.max_leaf_nodes + ) if self.max_leaf_nodes < 2: raise ValueError( - ("max_leaf_nodes {0} must be either None " - "or larger than 1").format( - self.max_leaf_nodes)) + ("max_leaf_nodes {0} must be either None " "or larger than 1").format( + self.max_leaf_nodes + ) + ) if isinstance(self.max_bins, numbers.Integral): if not 2 <= self.max_bins: - raise ValueError("max_bins must be at least 2, got %s" - % self.max_bins) + raise ValueError("max_bins must be at least 2, got %s" % self.max_bins) else: - raise ValueError("max_bins must be integral number but was " - "%r" % self.max_bins) + raise ValueError( + "max_bins must be integral number but was " "%r" % self.max_bins + ) if isinstance(self.min_bin_size, numbers.Integral): if not 1 <= self.min_bin_size: - raise ValueError("min_bin_size must be at least 1, got %s" - % self.min_bin_size) + raise ValueError( + "min_bin_size must be at least 1, got %s" % self.min_bin_size + ) else: - raise ValueError("min_bin_size must be integral number but was " - "%r" % self.min_bin_size) + raise ValueError( + "min_bin_size must be integral number but was " "%r" % self.min_bin_size + ) def check_sample_weight(self, sample_weight, X, dtype=None): n_samples = _num_samples(X) @@ -176,51 +179,57 @@ def check_sample_weight(self, sample_weight, X, dtype=None): accept_sparse=False, ensure_2d=False, dtype=dtype, - order="C") + order="C", + ) if sample_weight.ndim != 1: raise ValueError("Sample weights must be 1D array or scalar") if sample_weight.shape != (n_samples,): - raise ValueError("sample_weight.shape == {}, expected {}!" 
- .format(sample_weight.shape, (n_samples,))) + raise ValueError( + "sample_weight.shape == {}, expected {}!".format( + sample_weight.shape, (n_samples,) + ) + ) return sample_weight class RandomForestClassifier(sklearn_RandomForestClassifier, BaseRandomForest): __doc__ = sklearn_RandomForestClassifier.__doc__ - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): _parameter_constraints: dict = { **sklearn_RandomForestClassifier._parameter_constraints, "max_bins": [Interval(numbers.Integral, 2, None, closed="left")], "min_bin_size": [Interval(numbers.Integral, 1, None, closed="left")], - "splitter_mode": [StrOptions({"best", "random"})] + "splitter_mode": [StrOptions({"best", "random"})], } - if sklearn_check_version('1.0'): + if sklearn_check_version("1.0"): + def __init__( - self, - n_estimators=100, - criterion="gini", - max_depth=None, - min_samples_split=2, - min_samples_leaf=1, - min_weight_fraction_leaf=0., - max_features='sqrt' if sklearn_check_version('1.1') else 'auto', - max_leaf_nodes=None, - min_impurity_decrease=0., - bootstrap=True, - oob_score=False, - n_jobs=None, - random_state=None, - verbose=0, - warm_start=False, - class_weight=None, - ccp_alpha=0.0, - max_samples=None, - max_bins=256, - min_bin_size=1, - splitter_mode='best'): + self, + n_estimators=100, + criterion="gini", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features="sqrt" if sklearn_check_version("1.1") else "auto", + max_leaf_nodes=None, + min_impurity_decrease=0.0, + bootstrap=True, + oob_score=False, + n_jobs=None, + random_state=None, + verbose=0, + warm_start=False, + class_weight=None, + ccp_alpha=0.0, + max_samples=None, + max_bins=256, + min_bin_size=1, + splitter_mode="best", + ): super(RandomForestClassifier, self).__init__( n_estimators=n_estimators, criterion=criterion, @@ -237,7 +246,7 @@ def __init__( random_state=random_state, verbose=verbose, warm_start=warm_start, - class_weight=class_weight + class_weight=class_weight, ) self.warm_start = warm_start self.ccp_alpha = ccp_alpha @@ -247,30 +256,34 @@ def __init__( self.min_impurity_split = None self.splitter_mode = splitter_mode # self._estimator = DecisionTreeClassifier() + else: - def __init__(self, - n_estimators=100, - criterion="gini", - max_depth=None, - min_samples_split=2, - min_samples_leaf=1, - min_weight_fraction_leaf=0., - max_features="auto", - max_leaf_nodes=None, - min_impurity_decrease=0., - min_impurity_split=None, - bootstrap=True, - oob_score=False, - n_jobs=None, - random_state=None, - verbose=0, - warm_start=False, - class_weight=None, - ccp_alpha=0.0, - max_samples=None, - max_bins=256, - min_bin_size=1, - splitter_mode='best'): + + def __init__( + self, + n_estimators=100, + criterion="gini", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features="auto", + max_leaf_nodes=None, + min_impurity_decrease=0.0, + min_impurity_split=None, + bootstrap=True, + oob_score=False, + n_jobs=None, + random_state=None, + verbose=0, + warm_start=False, + class_weight=None, + ccp_alpha=0.0, + max_samples=None, + max_bins=256, + min_bin_size=1, + splitter_mode="best", + ): super(RandomForestClassifier, self).__init__( n_estimators=n_estimators, criterion=criterion, @@ -290,7 +303,7 @@ def __init__(self, warm_start=warm_start, class_weight=class_weight, ccp_alpha=ccp_alpha, - max_samples=max_samples + max_samples=max_samples, ) self.warm_start = warm_start self.ccp_alpha = ccp_alpha @@ -327,17 +340,22 @@ def 
fit(self, X, y, sample_weight=None): ------- self : object """ - dispatch(self, 'fit', { - 'onedal': self.__class__._onedal_fit, - 'sklearn': sklearn_RandomForestClassifier.fit, - }, X, y, sample_weight) + dispatch( + self, + "fit", + { + "onedal": self.__class__._onedal_fit, + "sklearn": sklearn_RandomForestClassifier.fit, + }, + X, + y, + sample_weight, + ) return self def _onedal_ready(self, X, y, sample_weight): if sp.issparse(y): - raise ValueError( - "sparse multilabel-indicator for y is not supported." - ) + raise ValueError("sparse multilabel-indicator for y is not supported.") if not self.bootstrap and self.max_samples is not None: raise ValueError( "`max_sample` cannot be set if `bootstrap=False`. " @@ -345,8 +363,7 @@ def _onedal_ready(self, X, y, sample_weight): "`max_sample=None`." ) if not self.bootstrap and self.oob_score: - raise ValueError("Out of bag estimation only available" - " if bootstrap=True") + raise ValueError("Out of bag estimation only available" " if bootstrap=True") if sklearn_check_version("1.2"): self._validate_params() else: @@ -357,16 +374,20 @@ def _onedal_ready(self, X, y, sample_weight): correct_criterion = self.criterion == "gini" correct_warm_start = self.warm_start is False - if daal_check_version((2021, 'P', 500)): + if daal_check_version((2021, "P", 500)): correct_oob_score = not self.oob_score else: correct_oob_score = self.oob_score - ready = all([correct_oob_score, - correct_sparsity, - correct_ccp_alpha, - correct_criterion, - correct_warm_start]) + ready = all( + [ + correct_oob_score, + correct_sparsity, + correct_ccp_alpha, + correct_criterion, + correct_warm_start, + ] + ) if ready: if sklearn_check_version("1.0"): self._check_feature_names(X, reset=True) @@ -379,7 +400,8 @@ def _onedal_ready(self, X, y, sample_weight): " expected. Please change the shape of y to " "(n_samples,), for example using ravel().", DataConversionWarning, - stacklevel=2) + stacklevel=2, + ) check_consistent_length(X, y) y = make2d(y) @@ -412,10 +434,15 @@ def predict(self, X): y : ndarray of shape (n_samples,) or (n_samples, n_outputs) The predicted classes. 
""" - return dispatch(self, 'predict', { - 'onedal': self.__class__._onedal_predict, - 'sklearn': sklearn_RandomForestClassifier.predict, - }, X) + return dispatch( + self, + "predict", + { + "onedal": self.__class__._onedal_predict, + "sklearn": sklearn_RandomForestClassifier.predict, + }, + X, + ) @wrap_output_data def predict_proba(self, X): @@ -446,54 +473,64 @@ def predict_proba(self, X): # self._check_proba() if sklearn_check_version("1.0"): self._check_feature_names(X, reset=False) - if hasattr(self, 'n_features_in_'): + if hasattr(self, "n_features_in_"): try: num_features = _num_features(X) except TypeError: num_features = _num_samples(X) if num_features != self.n_features_in_: raise ValueError( - (f'X has {num_features} features, ' - f'but RandomForestClassifier is expecting ' - f'{self.n_features_in_} features as input')) - return dispatch(self, 'predict_proba', { - 'onedal': self.__class__._onedal_predict_proba, - 'sklearn': sklearn_RandomForestClassifier.predict_proba, - }, X) - - if sklearn_check_version('1.0'): + ( + f"X has {num_features} features, " + f"but RandomForestClassifier is expecting " + f"{self.n_features_in_} features as input" + ) + ) + return dispatch( + self, + "predict_proba", + { + "onedal": self.__class__._onedal_predict_proba, + "sklearn": sklearn_RandomForestClassifier.predict_proba, + }, + X, + ) + + if sklearn_check_version("1.0"): + @deprecated( "Attribute `n_features_` was deprecated in version 1.0 and will be " - "removed in 1.2. Use `n_features_in_` instead.") + "removed in 1.2. Use `n_features_in_` instead." + ) @property def n_features_(self): return self.n_features_in_ @property def _estimators_(self): - if hasattr(self, '_cached_estimators_'): + if hasattr(self, "_cached_estimators_"): if self._cached_estimators_: return self._cached_estimators_ - if sklearn_check_version('0.22'): + if sklearn_check_version("0.22"): check_is_fitted(self) else: - check_is_fitted(self, '_onedal_model') + check_is_fitted(self, "_onedal_model") classes_ = self.classes_[0] n_classes_ = self.n_classes_[0] # convert model to estimators params = { - 'criterion': self.criterion, - 'max_depth': self.max_depth, - 'min_samples_split': self.min_samples_split, - 'min_samples_leaf': self.min_samples_leaf, - 'min_weight_fraction_leaf': self.min_weight_fraction_leaf, - 'max_features': self.max_features, - 'max_leaf_nodes': self.max_leaf_nodes, - 'min_impurity_decrease': self.min_impurity_decrease, - 'random_state': None, + "criterion": self.criterion, + "max_depth": self.max_depth, + "min_samples_split": self.min_samples_split, + "min_samples_leaf": self.min_samples_leaf, + "min_weight_fraction_leaf": self.min_weight_fraction_leaf, + "max_features": self.max_features, + "max_leaf_nodes": self.max_leaf_nodes, + "min_impurity_decrease": self.min_impurity_decrease, + "random_state": None, } - if not sklearn_check_version('1.0'): - params['min_impurity_split'] = self.min_impurity_split + if not sklearn_check_version("1.0"): + params["min_impurity_split"] = self.min_impurity_split est = DecisionTreeClassifier(**params) # we need to set est.tree_ field with Trees constructed from Intel(R) # oneAPI Data Analytics Library solution @@ -502,29 +539,27 @@ def _estimators_(self): for i in range(self.n_estimators): est_i = clone(est) est_i.set_params( - random_state=random_state_checked.randint( - np.iinfo( - np.int32).max)) - if sklearn_check_version('1.0'): + random_state=random_state_checked.randint(np.iinfo(np.int32).max) + ) + if sklearn_check_version("1.0"): est_i.n_features_in_ = 
self.n_features_in_ else: est_i.n_features_ = self.n_features_in_ est_i.n_outputs_ = self.n_outputs_ est_i.classes_ = classes_ est_i.n_classes_ = n_classes_ - tree_i_state_class = get_tree_state_cls( - self._onedal_model, i, n_classes_) + tree_i_state_class = get_tree_state_cls(self._onedal_model, i, n_classes_) tree_i_state_dict = { - 'max_depth': tree_i_state_class.max_depth, - 'node_count': tree_i_state_class.node_count, - 'nodes': check_tree_nodes(tree_i_state_class.node_ar), - 'values': tree_i_state_class.value_ar} + "max_depth": tree_i_state_class.max_depth, + "node_count": tree_i_state_class.node_count, + "nodes": check_tree_nodes(tree_i_state_class.node_ar), + "values": tree_i_state_class.value_ar, + } est_i.tree_ = Tree( self.n_features_in_, - np.array( - [n_classes_], - dtype=np.intp), - self.n_outputs_) + np.array([n_classes_], dtype=np.intp), + self.n_outputs_, + ) est_i.tree_.__setstate__(tree_i_state_dict) estimators_.append(est_i) @@ -532,13 +567,16 @@ def _estimators_(self): return estimators_ def _onedal_cpu_supported(self, method_name, *data): - if method_name == 'fit': + if method_name == "fit": ready, X, y, sample_weight = self._onedal_ready(*data) - if self.splitter_mode == 'random': - warnings.warn("'random' splitter mode supports GPU devices only " - "and requires oneDAL version >= 2023.1.1. " - "Using 'best' mode instead.", RuntimeWarning) - self.splitter_mode = 'best' + if self.splitter_mode == "random": + warnings.warn( + "'random' splitter mode supports GPU devices only " + "and requires oneDAL version >= 2023.1.1. " + "Using 'best' mode instead.", + RuntimeWarning, + ) + self.splitter_mode = "best" if not ready: return False elif sp.issparse(X): @@ -551,39 +589,42 @@ def _onedal_cpu_supported(self, method_name, *data): return False elif self.warm_start: return False - elif self.oob_score and not daal_check_version((2023, 'P', 101)): + elif self.oob_score and not daal_check_version((2023, "P", 101)): return False elif not self.n_outputs_ == 1: return False - elif hasattr(self, 'estimators_'): + elif hasattr(self, "estimators_"): return False else: return True - if method_name in ['predict', 'predict_proba']: + if method_name in ["predict", "predict_proba"]: X = data[0] - if not hasattr(self, '_onedal_model'): + if not hasattr(self, "_onedal_model"): return False elif sp.issparse(X): return False - elif not (hasattr(self, 'n_outputs_') and self.n_outputs_ == 1): + elif not (hasattr(self, "n_outputs_") and self.n_outputs_ == 1): return False - elif not daal_check_version((2021, 'P', 400)): + elif not daal_check_version((2021, "P", 400)): return False elif self.warm_start: return False else: return True - raise RuntimeError( - f'Unknown method {method_name} in {self.__class__.__name__}') + raise RuntimeError(f"Unknown method {method_name} in {self.__class__.__name__}") def _onedal_gpu_supported(self, method_name, *data): - if method_name == 'fit': + if method_name == "fit": ready, X, y, sample_weight = self._onedal_ready(*data) - if self.splitter_mode == 'random' and \ - not daal_check_version((2023, 'P', 101)): - warnings.warn("'random' splitter mode requires OneDAL >= 2023.1.1. " - "Using 'best' mode instead.", RuntimeWarning) - self.splitter_mode = 'best' + if self.splitter_mode == "random" and not daal_check_version( + (2023, "P", 101) + ): + warnings.warn( + "'random' splitter mode requires OneDAL >= 2023.1.1. 
" + "Using 'best' mode instead.", + RuntimeWarning, + ) + self.splitter_mode = "best" if not ready: return False elif sp.issparse(X): @@ -602,37 +643,42 @@ def _onedal_gpu_supported(self, method_name, *data): return False elif not self.n_outputs_ == 1: return False - elif hasattr(self, 'estimators_'): + elif hasattr(self, "estimators_"): return False else: return True - if method_name in ['predict', 'predict_proba']: + if method_name in ["predict", "predict_proba"]: X = data[0] - if not hasattr(self, '_onedal_model'): + if not hasattr(self, "_onedal_model"): return False elif sp.issparse(X): return False - elif not (hasattr(self, 'n_outputs_') and self.n_outputs_ == 1): + elif not (hasattr(self, "n_outputs_") and self.n_outputs_ == 1): return False - elif not daal_check_version((2021, 'P', 400)): + elif not daal_check_version((2021, "P", 400)): return False elif self.warm_start: return False else: return True - raise RuntimeError( - f'Unknown method {method_name} in {self.__class__.__name__}') + raise RuntimeError(f"Unknown method {method_name} in {self.__class__.__name__}") def _onedal_fit(self, X, y, sample_weight=None, queue=None): - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): X, y = self._validate_data( - X, y, multi_output=False, accept_sparse=False, - dtype=[np.float64, np.float32] + X, + y, + multi_output=False, + accept_sparse=False, + dtype=[np.float64, np.float32], ) else: X, y = check_X_y( - X, y, accept_sparse=False, dtype=[np.float64, np.float32], - multi_output=False + X, + y, + accept_sparse=False, + dtype=[np.float64, np.float32], + multi_output=False, ) if sample_weight is not None: @@ -656,7 +702,7 @@ def _onedal_fit(self, X, y, sample_weight=None, queue=None): n_classes_ = self.n_classes_[0] self.n_features_in_ = X.shape[1] - if not sklearn_check_version('1.0'): + if not sklearn_check_version("1.0"): self.n_features_ = self.n_features_in_ if expanded_class_weight is not None: @@ -668,40 +714,39 @@ def _onedal_fit(self, X, y, sample_weight=None, queue=None): sample_weight = [sample_weight] if n_classes_ < 2: - raise ValueError( - "Training data only contain information about one class.") + raise ValueError("Training data only contain information about one class.") if self.oob_score: - err = 'out_of_bag_error_accuracy|out_of_bag_error_decision_function' + err = "out_of_bag_error_accuracy|out_of_bag_error_decision_function" else: - err = 'none' + err = "none" onedal_params = { - 'n_estimators': self.n_estimators, - 'criterion': self.criterion, - 'max_depth': self.max_depth, - 'min_samples_split': self.min_samples_split, - 'min_samples_leaf': self.min_samples_leaf, - 'min_weight_fraction_leaf': self.min_weight_fraction_leaf, - 'max_features': self.max_features, - 'max_leaf_nodes': self.max_leaf_nodes, - 'min_impurity_decrease': self.min_impurity_decrease, - 'min_impurity_split': self.min_impurity_split, - 'bootstrap': self.bootstrap, - 'oob_score': self.oob_score, - 'n_jobs': self.n_jobs, - 'random_state': self.random_state, - 'verbose': self.verbose, - 'warm_start': self.warm_start, - 'error_metric_mode': err, - 'variable_importance_mode': 'mdi', - 'class_weight': self.class_weight, - 'max_bins': self.max_bins, - 'min_bin_size': self.min_bin_size, - 'max_samples': self.max_samples + "n_estimators": self.n_estimators, + "criterion": self.criterion, + "max_depth": self.max_depth, + "min_samples_split": self.min_samples_split, + "min_samples_leaf": self.min_samples_leaf, + "min_weight_fraction_leaf": self.min_weight_fraction_leaf, + "max_features": 
self.max_features, + "max_leaf_nodes": self.max_leaf_nodes, + "min_impurity_decrease": self.min_impurity_decrease, + "min_impurity_split": self.min_impurity_split, + "bootstrap": self.bootstrap, + "oob_score": self.oob_score, + "n_jobs": self.n_jobs, + "random_state": self.random_state, + "verbose": self.verbose, + "warm_start": self.warm_start, + "error_metric_mode": err, + "variable_importance_mode": "mdi", + "class_weight": self.class_weight, + "max_bins": self.max_bins, + "min_bin_size": self.min_bin_size, + "max_samples": self.max_samples, } - if daal_check_version((2023, 'P', 101)): - onedal_params['splitter_mode'] = self.splitter_mode + if daal_check_version((2023, "P", 101)): + onedal_params["splitter_mode"] = self.splitter_mode self._cached_estimators_ = None # Compute @@ -724,13 +769,12 @@ def _onedal_predict(self, X, queue=None): self._check_feature_names(X, reset=False) res = self._onedal_estimator.predict(X, queue=queue) - return np.take(self.classes_, - res.ravel().astype(np.int64, casting='unsafe')) + return np.take(self.classes_, res.ravel().astype(np.int64, casting="unsafe")) def _onedal_predict_proba(self, X, queue=None): X = check_array(X, dtype=[np.float64, np.float32]) check_is_fitted(self) - if sklearn_check_version('0.23'): + if sklearn_check_version("0.23"): self._check_n_features(X, reset=False) if sklearn_check_version("1.0"): self._check_feature_names(X, reset=False) @@ -740,38 +784,40 @@ def _onedal_predict_proba(self, X, queue=None): class RandomForestRegressor(sklearn_RandomForestRegressor, BaseRandomForest): __doc__ = sklearn_RandomForestRegressor.__doc__ - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): _parameter_constraints: dict = { **sklearn_RandomForestRegressor._parameter_constraints, "max_bins": [Interval(numbers.Integral, 2, None, closed="left")], "min_bin_size": [Interval(numbers.Integral, 1, None, closed="left")], - "splitter_mode": [StrOptions({"best", "random"})] + "splitter_mode": [StrOptions({"best", "random"})], } - if sklearn_check_version('1.0'): + if sklearn_check_version("1.0"): + def __init__( - self, - n_estimators=100, - *, - criterion="squared_error", - max_depth=None, - min_samples_split=2, - min_samples_leaf=1, - min_weight_fraction_leaf=0., - max_features=1.0 if sklearn_check_version('1.1') else 'auto', - max_leaf_nodes=None, - min_impurity_decrease=0., - bootstrap=True, - oob_score=False, - n_jobs=None, - random_state=None, - verbose=0, - warm_start=False, - ccp_alpha=0.0, - max_samples=None, - max_bins=256, - min_bin_size=1, - splitter_mode='best'): + self, + n_estimators=100, + *, + criterion="squared_error", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features=1.0 if sklearn_check_version("1.1") else "auto", + max_leaf_nodes=None, + min_impurity_decrease=0.0, + bootstrap=True, + oob_score=False, + n_jobs=None, + random_state=None, + verbose=0, + warm_start=False, + ccp_alpha=0.0, + max_samples=None, + max_bins=256, + min_bin_size=1, + splitter_mode="best", + ): super(RandomForestRegressor, self).__init__( n_estimators=n_estimators, criterion=criterion, @@ -787,7 +833,7 @@ def __init__( n_jobs=n_jobs, random_state=random_state, verbose=verbose, - warm_start=warm_start + warm_start=warm_start, ) self.warm_start = warm_start self.ccp_alpha = ccp_alpha @@ -796,29 +842,34 @@ def __init__( self.min_bin_size = min_bin_size self.min_impurity_split = None self.splitter_mode = splitter_mode + else: - def __init__(self, - n_estimators=100, *, - criterion="mse", - 
max_depth=None, - min_samples_split=2, - min_samples_leaf=1, - min_weight_fraction_leaf=0., - max_features="auto", - max_leaf_nodes=None, - min_impurity_decrease=0., - min_impurity_split=None, - bootstrap=True, - oob_score=False, - n_jobs=None, - random_state=None, - verbose=0, - warm_start=False, - ccp_alpha=0.0, - max_samples=None, - max_bins=256, - min_bin_size=1, - splitter_mode='best'): + + def __init__( + self, + n_estimators=100, + *, + criterion="mse", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features="auto", + max_leaf_nodes=None, + min_impurity_decrease=0.0, + min_impurity_split=None, + bootstrap=True, + oob_score=False, + n_jobs=None, + random_state=None, + verbose=0, + warm_start=False, + ccp_alpha=0.0, + max_samples=None, + max_bins=256, + min_bin_size=1, + splitter_mode="best", + ): super(RandomForestRegressor, self).__init__( n_estimators=n_estimators, criterion=criterion, @@ -837,7 +888,7 @@ def __init__(self, verbose=verbose, warm_start=warm_start, ccp_alpha=ccp_alpha, - max_samples=max_samples + max_samples=max_samples, ) self.warm_start = warm_start self.ccp_alpha = ccp_alpha @@ -849,27 +900,27 @@ def __init__(self, @property def _estimators_(self): - if hasattr(self, '_cached_estimators_'): + if hasattr(self, "_cached_estimators_"): if self._cached_estimators_: return self._cached_estimators_ - if sklearn_check_version('0.22'): + if sklearn_check_version("0.22"): check_is_fitted(self) else: - check_is_fitted(self, '_onedal_model') + check_is_fitted(self, "_onedal_model") # convert model to estimators params = { - 'criterion': self.criterion, - 'max_depth': self.max_depth, - 'min_samples_split': self.min_samples_split, - 'min_samples_leaf': self.min_samples_leaf, - 'min_weight_fraction_leaf': self.min_weight_fraction_leaf, - 'max_features': self.max_features, - 'max_leaf_nodes': self.max_leaf_nodes, - 'min_impurity_decrease': self.min_impurity_decrease, - 'random_state': None, + "criterion": self.criterion, + "max_depth": self.max_depth, + "min_samples_split": self.min_samples_split, + "min_samples_leaf": self.min_samples_leaf, + "min_weight_fraction_leaf": self.min_weight_fraction_leaf, + "max_features": self.max_features, + "max_leaf_nodes": self.max_leaf_nodes, + "min_impurity_decrease": self.min_impurity_decrease, + "random_state": None, } - if not sklearn_check_version('1.0'): - params['min_impurity_split'] = self.min_impurity_split + if not sklearn_check_version("1.0"): + params["min_impurity_split"] = self.min_impurity_split est = DecisionTreeRegressor(**params) # we need to set est.tree_ field with Trees constructed from Intel(R) # oneAPI Data Analytics Library solution @@ -878,26 +929,25 @@ def _estimators_(self): for i in range(self.n_estimators): est_i = clone(est) est_i.set_params( - random_state=random_state_checked.randint( - np.iinfo( - np.int32).max)) - if sklearn_check_version('1.0'): + random_state=random_state_checked.randint(np.iinfo(np.int32).max) + ) + if sklearn_check_version("1.0"): est_i.n_features_in_ = self.n_features_in_ else: est_i.n_features_ = self.n_features_in_ est_i.n_classes_ = 1 est_i.n_outputs_ = self.n_outputs_ - tree_i_state_class = get_tree_state_reg( - self._onedal_model, i) + tree_i_state_class = get_tree_state_reg(self._onedal_model, i) tree_i_state_dict = { - 'max_depth': tree_i_state_class.max_depth, - 'node_count': tree_i_state_class.node_count, - 'nodes': check_tree_nodes(tree_i_state_class.node_ar), - 'values': tree_i_state_class.value_ar} + "max_depth": 
tree_i_state_class.max_depth, + "node_count": tree_i_state_class.node_count, + "nodes": check_tree_nodes(tree_i_state_class.node_ar), + "values": tree_i_state_class.value_ar, + } est_i.tree_ = Tree( - self.n_features_in_, np.array( - [1], dtype=np.intp), self.n_outputs_) + self.n_features_in_, np.array([1], dtype=np.intp), self.n_outputs_ + ) est_i.tree_.__setstate__(tree_i_state_dict) estimators_.append(est_i) @@ -920,17 +970,23 @@ def _onedal_ready(self, X, y, sample_weight): return ready, X, y, sample_weight def _onedal_cpu_supported(self, method_name, *data): - if method_name == 'fit': + if method_name == "fit": ready, X, y, sample_weight = self._onedal_ready(*data) - if self.splitter_mode == 'random': - warnings.warn("'random' splitter mode supports GPU devices only " - "and requires oneDAL version >= 2023.1.1. " - "Using 'best' mode instead.", RuntimeWarning) - self.splitter_mode = 'best' + if self.splitter_mode == "random": + warnings.warn( + "'random' splitter mode supports GPU devices only " + "and requires oneDAL version >= 2023.1.1. " + "Using 'best' mode instead.", + RuntimeWarning, + ) + self.splitter_mode = "best" if not ready: return False - elif not (self.oob_score and daal_check_version( - (2021, 'P', 500)) or not self.oob_score): + elif not ( + self.oob_score + and daal_check_version((2021, "P", 500)) + or not self.oob_score + ): return False elif self.criterion not in ["mse", "squared_error"]: return False @@ -944,42 +1000,48 @@ def _onedal_cpu_supported(self, method_name, *data): return False elif self.warm_start: return False - elif self.oob_score and not daal_check_version((2023, 'P', 101)): + elif self.oob_score and not daal_check_version((2023, "P", 101)): return False elif not self.n_outputs_ == 1: return False - elif hasattr(self, 'estimators_'): + elif hasattr(self, "estimators_"): return False else: return True - if method_name == 'predict': - if not hasattr(self, '_onedal_model'): + if method_name == "predict": + if not hasattr(self, "_onedal_model"): return False elif sp.issparse(data[0]): return False - elif not (hasattr(self, 'n_outputs_') and self.n_outputs_ == 1): + elif not (hasattr(self, "n_outputs_") and self.n_outputs_ == 1): return False - elif not daal_check_version((2021, 'P', 400)): + elif not daal_check_version((2021, "P", 400)): return False elif self.warm_start: return False else: return True - raise RuntimeError( - f'Unknown method {method_name} in {self.__class__.__name__}') + raise RuntimeError(f"Unknown method {method_name} in {self.__class__.__name__}") def _onedal_gpu_supported(self, method_name, *data): - if method_name == 'fit': + if method_name == "fit": ready, X, y, sample_weight = self._onedal_ready(*data) - if self.splitter_mode == 'random' and \ - not daal_check_version((2023, 'P', 101)): - warnings.warn("'random' splitter mode requires OneDAL >= 2023.1.1. " - "Using 'best' mode instead.", RuntimeWarning) - self.splitter_mode = 'best' + if self.splitter_mode == "random" and not daal_check_version( + (2023, "P", 101) + ): + warnings.warn( + "'random' splitter mode requires OneDAL >= 2023.1.1. 
" + "Using 'best' mode instead.", + RuntimeWarning, + ) + self.splitter_mode = "best" if not ready: return False - elif not (self.oob_score and daal_check_version( - (2021, 'P', 500)) or not self.oob_score): + elif not ( + self.oob_score + and daal_check_version((2021, "P", 500)) + or not self.oob_score + ): return False elif self.criterion not in ["mse", "squared_error"]: return False @@ -995,32 +1057,29 @@ def _onedal_gpu_supported(self, method_name, *data): return False elif self.oob_score: return False - elif hasattr(self, 'estimators_'): + elif hasattr(self, "estimators_"): return False else: return True - if method_name == 'predict': + if method_name == "predict": X = data[0] - if not hasattr(self, '_onedal_model'): + if not hasattr(self, "_onedal_model"): return False elif sp.issparse(X): return False - elif not (hasattr(self, 'n_outputs_') and self.n_outputs_ == 1): + elif not (hasattr(self, "n_outputs_") and self.n_outputs_ == 1): return False - elif not daal_check_version((2021, 'P', 400)): + elif not daal_check_version((2021, "P", 400)): return False elif self.warm_start: return False else: return True - raise RuntimeError( - f'Unknown method {method_name} in {self.__class__.__name__}') + raise RuntimeError(f"Unknown method {method_name} in {self.__class__.__name__}") def _onedal_fit(self, X, y, sample_weight=None, queue=None): if sp.issparse(y): - raise ValueError( - "sparse multilabel-indicator for y is not supported." - ) + raise ValueError("sparse multilabel-indicator for y is not supported.") if sklearn_check_version("1.2"): self._validate_params() else: @@ -1034,37 +1093,37 @@ def _onedal_fit(self, X, y, sample_weight=None, queue=None): y = check_array(y, ensure_2d=False, dtype=X.dtype) check_consistent_length(X, y) self.n_features_in_ = X.shape[1] - if not sklearn_check_version('1.0'): + if not sklearn_check_version("1.0"): self.n_features_ = self.n_features_in_ rs_ = check_random_state(self.random_state) if self.oob_score: - err = 'out_of_bag_error_r2|out_of_bag_error_prediction' + err = "out_of_bag_error_r2|out_of_bag_error_prediction" else: - err = 'none' + err = "none" onedal_params = { - 'n_estimators': self.n_estimators, - 'criterion': self.criterion, - 'max_depth': self.max_depth, - 'min_samples_split': self.min_samples_split, - 'min_samples_leaf': self.min_samples_leaf, - 'min_weight_fraction_leaf': self.min_weight_fraction_leaf, - 'max_features': self.max_features, - 'max_leaf_nodes': self.max_leaf_nodes, - 'min_impurity_decrease': self.min_impurity_decrease, - 'bootstrap': self.bootstrap, - 'oob_score': self.oob_score, - 'n_jobs': self.n_jobs, - 'random_state': rs_, - 'verbose': self.verbose, - 'warm_start': self.warm_start, - 'error_metric_mode': err, - 'variable_importance_mode': 'mdi', - 'max_samples': self.max_samples + "n_estimators": self.n_estimators, + "criterion": self.criterion, + "max_depth": self.max_depth, + "min_samples_split": self.min_samples_split, + "min_samples_leaf": self.min_samples_leaf, + "min_weight_fraction_leaf": self.min_weight_fraction_leaf, + "max_features": self.max_features, + "max_leaf_nodes": self.max_leaf_nodes, + "min_impurity_decrease": self.min_impurity_decrease, + "bootstrap": self.bootstrap, + "oob_score": self.oob_score, + "n_jobs": self.n_jobs, + "random_state": rs_, + "verbose": self.verbose, + "warm_start": self.warm_start, + "error_metric_mode": err, + "variable_importance_mode": "mdi", + "max_samples": self.max_samples, } - if daal_check_version((2023, 'P', 101)): - onedal_params['splitter_mode'] = self.splitter_mode 
+ if daal_check_version((2023, "P", 101)): + onedal_params["splitter_mode"] = self.splitter_mode self._cached_estimators_ = None self._onedal_estimator = self._onedal_regressor(**onedal_params) self._onedal_estimator.fit(X, y, sample_weight, queue=queue) @@ -1113,10 +1172,17 @@ def fit(self, X, y, sample_weight=None): "Either switch to `bootstrap=True` or set " "`max_sample=None`." ) - dispatch(self, 'fit', { - 'onedal': self.__class__._onedal_fit, - 'sklearn': sklearn_RandomForestRegressor.fit, - }, X, y, sample_weight) + dispatch( + self, + "fit", + { + "onedal": self.__class__._onedal_fit, + "sklearn": sklearn_RandomForestRegressor.fit, + }, + X, + y, + sample_weight, + ) return self @wrap_output_data @@ -1141,15 +1207,22 @@ def predict(self, X): y : ndarray of shape (n_samples,) or (n_samples, n_outputs) The predicted classes. """ - return dispatch(self, 'predict', { - 'onedal': self.__class__._onedal_predict, - 'sklearn': sklearn_RandomForestRegressor.predict, - }, X) + return dispatch( + self, + "predict", + { + "onedal": self.__class__._onedal_predict, + "sklearn": sklearn_RandomForestRegressor.predict, + }, + X, + ) + + if sklearn_check_version("1.0"): - if sklearn_check_version('1.0'): @deprecated( "Attribute `n_features_` was deprecated in version 1.0 and will be " - "removed in 1.2. Use `n_features_in_` instead.") + "removed in 1.2. Use `n_features_in_` instead." + ) @property def n_features_(self): return self.n_features_in_ diff --git a/sklearnex/preview/ensemble/tests/test_preview_ensemble.py b/sklearnex/preview/ensemble/tests/test_preview_ensemble.py index 25bd992e60..0a064e8ed0 100755 --- a/sklearnex/preview/ensemble/tests/test_preview_ensemble.py +++ b/sklearnex/preview/ensemble/tests/test_preview_ensemble.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,55 +13,68 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== from numpy.testing import assert_allclose from sklearn.datasets import make_classification, make_regression + from daal4py.sklearn._utils import daal_check_version def test_sklearnex_import_rf_classifier(): from sklearnex.preview.ensemble import RandomForestClassifier - X, y = make_classification(n_samples=1000, n_features=4, - n_informative=2, n_redundant=0, - random_state=0, shuffle=False) + + X, y = make_classification( + n_samples=1000, + n_features=4, + n_informative=2, + n_redundant=0, + random_state=0, + shuffle=False, + ) rf = RandomForestClassifier(max_depth=2, random_state=0).fit(X, y) - assert 'sklearnex.preview' in rf.__module__ + assert "sklearnex.preview" in rf.__module__ assert_allclose([1], rf.predict([[0, 0, 0, 0]])) def test_sklearnex_import_rf_regression(): from sklearnex.preview.ensemble import RandomForestRegressor - X, y = make_regression(n_features=4, n_informative=2, - random_state=0, shuffle=False) + + X, y = make_regression(n_features=4, n_informative=2, random_state=0, shuffle=False) rf = RandomForestRegressor(max_depth=2, random_state=0).fit(X, y) - assert 'sklearnex.preview' in rf.__module__ + assert "sklearnex.preview" in rf.__module__ pred = rf.predict([[0, 0, 0, 0]]) assert_allclose([-6.839], pred, atol=1e-2) def test_sklearnex_import_et_classifier(): from sklearnex.preview.ensemble import ExtraTreesClassifier - X, y = make_classification(n_samples=1000, n_features=4, - n_informative=2, n_redundant=0, - random_state=0, shuffle=False) + + X, y = make_classification( + n_samples=1000, + n_features=4, + n_informative=2, + n_redundant=0, + random_state=0, + shuffle=False, + ) # For the 2023.2 release, random_state is not supported # defaults to seed=777, although it is set to 0 rf = ExtraTreesClassifier(max_depth=2, random_state=0).fit(X, y) - assert 'sklearnex' in rf.__module__ + assert "sklearnex" in rf.__module__ assert_allclose([1], rf.predict([[0, 0, 0, 0]])) def test_sklearnex_import_et_regression(): from sklearnex.preview.ensemble import ExtraTreesRegressor - X, y = make_regression(n_features=4, n_informative=2, - random_state=0, shuffle=False) + + X, y = make_regression(n_features=4, n_informative=2, random_state=0, shuffle=False) # For the 2023.2 release, random_state is not supported # defaults to seed=777, although it is set to 0 rf = ExtraTreesRegressor(max_depth=2, random_state=0).fit(X, y) - assert 'sklearnex' in rf.__module__ + assert "sklearnex" in rf.__module__ pred = rf.predict([[0, 0, 0, 0]]) - if daal_check_version((2023, 'P', 200)): + if daal_check_version((2023, "P", 200)): assert_allclose([27.138], pred, atol=1e-2) else: assert_allclose([-2.826], pred, atol=1e-2) diff --git a/sklearnex/preview/linear_model/__init__.py b/sklearnex/preview/linear_model/__init__.py index 463003bb1d..a244f823a8 100755 --- a/sklearnex/preview/linear_model/__init__.py +++ b/sklearnex/preview/linear_model/__init__.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,10 +13,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from .linear import LinearRegression -__all__ = [ - 'LinearRegression' -] +__all__ = ["LinearRegression"] diff --git a/sklearnex/preview/linear_model/_common.py b/sklearnex/preview/linear_model/_common.py index 29f6061630..c93241874c 100644 --- a/sklearnex/preview/linear_model/_common.py +++ b/sklearnex/preview/linear_model/_common.py @@ -15,14 +15,14 @@ # =============================================================================== from abc import ABC -import numpy as np -from daal4py.sklearn._utils import sklearn_check_version +import numpy as np +from sklearn.calibration import CalibratedClassifierCV from sklearn.model_selection import StratifiedKFold from sklearn.preprocessing import LabelEncoder -from sklearn.calibration import CalibratedClassifierCV -from onedal.datatypes.validation import _column_or_1d +from daal4py.sklearn._utils import sklearn_check_version +from onedal.utils import _column_or_1d def get_coef(self): @@ -31,7 +31,7 @@ def get_coef(self): def set_coef(self, value): self._coef_ = value - if hasattr(self, '_onedal_estimator'): + if hasattr(self, "_onedal_estimator"): self._onedal_estimator.coef_ = value if not self._is_in_fit: del self._onedal_estimator._onedal_model @@ -43,7 +43,7 @@ def get_intercept(self): def set_intercept(self, value): self._intercept_ = value - if hasattr(self, '_onedal_estimator'): + if hasattr(self, "_onedal_estimator"): self._onedal_estimator.intercept_ = value if not self._is_in_fit: del self._onedal_estimator._onedal_model diff --git a/sklearnex/preview/linear_model/linear.py b/sklearnex/preview/linear_model/linear.py index 8c1bd70244..1f0d69ea48 100644 --- a/sklearnex/preview/linear_model/linear.py +++ b/sklearnex/preview/linear_model/linear.py @@ -14,35 +14,40 @@ # limitations under the License. 
# =============================================================================== -from daal4py.sklearn._utils import daal_check_version import logging -if daal_check_version((2023, 'P', 100)): +from daal4py.sklearn._utils import daal_check_version + +if daal_check_version((2023, "P", 100)): import numpy as np + from sklearn.linear_model import LinearRegression as sklearn_LinearRegression - from ._common import BaseLinearRegression - from ..._device_offload import dispatch, wrap_output_data + from daal4py.sklearn._utils import ( + PatchingConditionsChain, + get_dtype, + make2d, + sklearn_check_version, + ) + from ..._device_offload import dispatch, wrap_output_data from ...utils.validation import _assert_all_finite - from daal4py.sklearn._utils import ( - get_dtype, make2d, sklearn_check_version, PatchingConditionsChain) - from sklearn.linear_model import LinearRegression as sklearn_LinearRegression + from ._common import BaseLinearRegression - if sklearn_check_version('1.0') and not sklearn_check_version('1.2'): + if sklearn_check_version("1.0") and not sklearn_check_version("1.2"): from sklearn.linear_model._base import _deprecate_normalize - from sklearn.utils.validation import _deprecate_positional_args, check_X_y - from sklearn.exceptions import NotFittedError from scipy.sparse import issparse + from sklearn.exceptions import NotFittedError + from sklearn.utils.validation import _deprecate_positional_args, check_X_y from onedal.linear_model import LinearRegression as onedal_LinearRegression - from onedal.datatypes import (_num_features, _num_samples) + from onedal.utils import _num_features, _num_samples class LinearRegression(sklearn_LinearRegression, BaseLinearRegression): __doc__ = sklearn_LinearRegression.__doc__ intercept_, coef_ = None, None - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): _parameter_constraints: dict = { **sklearn_LinearRegression._parameter_constraints } @@ -60,11 +65,13 @@ def __init__( n_jobs=n_jobs, positive=positive, ) - elif sklearn_check_version('0.24'): + + elif sklearn_check_version("0.24"): + def __init__( self, fit_intercept=True, - normalize='deprecated' if sklearn_check_version('1.0') else False, + normalize="deprecated" if sklearn_check_version("1.0") else False, copy_X=True, n_jobs=None, positive=False, @@ -76,7 +83,9 @@ def __init__( n_jobs=n_jobs, positive=positive, ) + else: + def __init__( self, fit_intercept=True, @@ -88,7 +97,7 @@ def __init__( fit_intercept=fit_intercept, normalize=normalize, copy_X=copy_X, - n_jobs=n_jobs + n_jobs=n_jobs, ) def fit(self, X, y, sample_weight=None): @@ -109,15 +118,22 @@ def fit(self, X, y, sample_weight=None): self : object Fitted Estimator. 
""" - if sklearn_check_version('1.0'): + if sklearn_check_version("1.0"): self._check_feature_names(X, reset=True) if sklearn_check_version("1.2"): self._validate_params() - dispatch(self, 'fit', { - 'onedal': self.__class__._onedal_fit, - 'sklearn': sklearn_LinearRegression.fit, - }, X, y, sample_weight) + dispatch( + self, + "fit", + { + "onedal": self.__class__._onedal_fit, + "sklearn": sklearn_LinearRegression.fit, + }, + X, + y, + sample_weight, + ) return self @wrap_output_data @@ -135,16 +151,21 @@ def predict(self, X): """ if sklearn_check_version("1.0"): self._check_feature_names(X, reset=False) - return dispatch(self, 'predict', { - 'onedal': self.__class__._onedal_predict, - 'sklearn': sklearn_LinearRegression.predict, - }, X) + return dispatch( + self, + "predict", + { + "onedal": self.__class__._onedal_predict, + "sklearn": sklearn_LinearRegression.predict, + }, + X, + ) def _test_type_and_finiteness(self, X_in): X = X_in if isinstance(X_in, np.ndarray) else np.asarray(X_in) dtype = X.dtype - if 'complex' in str(type(dtype)): + if "complex" in str(type(dtype)): return False try: @@ -154,79 +175,99 @@ def _test_type_and_finiteness(self, X_in): return True def _onedal_fit_supported(self, method_name, *data): - assert method_name == 'fit' + assert method_name == "fit" assert len(data) == 3 X, y, sample_weight = data class_name = self.__class__.__name__ patching_status = PatchingConditionsChain( - f'sklearn.linear_model.{class_name}.fit') + f"sklearn.linear_model.{class_name}.fit" + ) - normalize_is_set = hasattr(self, 'normalize') and self.normalize \ - and self.normalize != 'deprecated' - positive_is_set = hasattr(self, 'positive') and self.positive + normalize_is_set = ( + hasattr(self, "normalize") + and self.normalize + and self.normalize != "deprecated" + ) + positive_is_set = hasattr(self, "positive") and self.positive n_samples = _num_samples(X) n_features = _num_features(X, fallback_1d=True) # Check if equations are well defined - is_good_for_onedal = n_samples > \ - (n_features + int(self.fit_intercept)) - - dal_ready = patching_status.and_conditions([ - (sample_weight is None, 'Sample weight is not supported.'), - (not issparse(X) and not issparse(y), 'Sparse input is not supported.'), - (not normalize_is_set, 'Normalization is not supported.'), - (not positive_is_set, 'Forced positive coefficients are not supported.'), - (is_good_for_onedal, - 'The shape of X (fitting) does not satisfy oneDAL requirements:.' - 'Number of features + 1 >= number of samples.') - ]) + is_good_for_onedal = n_samples > (n_features + int(self.fit_intercept)) + + dal_ready = patching_status.and_conditions( + [ + (sample_weight is None, "Sample weight is not supported."), + ( + not issparse(X) and not issparse(y), + "Sparse input is not supported.", + ), + (not normalize_is_set, "Normalization is not supported."), + ( + not positive_is_set, + "Forced positive coefficients are not supported.", + ), + ( + is_good_for_onedal, + "The shape of X (fitting) does not satisfy oneDAL requirements:." + "Number of features + 1 >= number of samples.", + ), + ] + ) if not dal_ready: return patching_status.get_status(logs=True) if not patching_status.and_condition( - self._test_type_and_finiteness(X), 'Input X is not supported.' + self._test_type_and_finiteness(X), "Input X is not supported." ): return patching_status.get_status(logs=True) patching_status.and_condition( - self._test_type_and_finiteness(y), 'Input y is not supported.') + self._test_type_and_finiteness(y), "Input y is not supported." 
+ ) return patching_status.get_status(logs=True) def _onedal_predict_supported(self, method_name, *data): - assert method_name == 'predict' + assert method_name == "predict" assert len(data) == 1 class_name = self.__class__.__name__ patching_status = PatchingConditionsChain( - f'sklearn.linear_model.{class_name}.predict') + f"sklearn.linear_model.{class_name}.predict" + ) n_samples = _num_samples(*data) - model_is_sparse = issparse(self.coef_) or \ - (self.fit_intercept and issparse(self.intercept_)) - dal_ready = patching_status.and_conditions([ - (n_samples > 0, 'Number of samples is less than 1.'), - (not issparse(*data), 'Sparse input is not supported.'), - (not model_is_sparse, 'Sparse coefficients are not supported.'), - (hasattr(self, '_onedal_estimator'), 'oneDAL model was not trained.') - ]) + model_is_sparse = issparse(self.coef_) or ( + self.fit_intercept and issparse(self.intercept_) + ) + dal_ready = patching_status.and_conditions( + [ + (n_samples > 0, "Number of samples is less than 1."), + (not issparse(*data), "Sparse input is not supported."), + (not model_is_sparse, "Sparse coefficients are not supported."), + (hasattr(self, "_onedal_estimator"), "oneDAL model was not trained."), + ] + ) if not dal_ready: return patching_status.get_status(logs=True) patching_status.and_condition( - self._test_type_and_finiteness(*data), 'Input X is not supported.') + self._test_type_and_finiteness(*data), "Input X is not supported." + ) return patching_status.get_status(logs=True) def _onedal_supported(self, method_name, *data): - if method_name == 'fit': + if method_name == "fit": return self._onedal_fit_supported(method_name, *data) - if method_name == 'predict': + if method_name == "predict": return self._onedal_predict_supported(method_name, *data) raise RuntimeError( - f'Unknown method {method_name} in {self.__class__.__name__}') + f"Unknown method {method_name} in {self.__class__.__name__}" + ) def _onedal_gpu_supported(self, method_name, *data): return self._onedal_supported(method_name, *data) @@ -235,30 +276,27 @@ def _onedal_cpu_supported(self, method_name, *data): return self._onedal_supported(method_name, *data) def _initialize_onedal_estimator(self): - onedal_params = { - 'fit_intercept': self.fit_intercept, - 'copy_X': self.copy_X} + onedal_params = {"fit_intercept": self.fit_intercept, "copy_X": self.copy_X} self._onedal_estimator = onedal_LinearRegression(**onedal_params) def _onedal_fit(self, X, y, sample_weight, queue=None): assert sample_weight is None check_params = { - 'X': X, - 'y': y, - 'dtype': [np.float64, np.float32], - 'accept_sparse': ['csr', 'csc', 'coo'], - 'y_numeric': True, - 'multi_output': True, - 'force_all_finite': False + "X": X, + "y": y, + "dtype": [np.float64, np.float32], + "accept_sparse": ["csr", "csc", "coo"], + "y_numeric": True, + "multi_output": True, + "force_all_finite": False, } - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): X, y = self._validate_data(**check_params) else: X, y = check_X_y(**check_params) - if sklearn_check_version( - '1.0') and not sklearn_check_version('1.2'): + if sklearn_check_version("1.0") and not sklearn_check_version("1.2"): self._normalize = _deprecate_normalize( self.normalize, default=False, @@ -272,7 +310,7 @@ def _onedal_fit(self, X, y, sample_weight, queue=None): def _onedal_predict(self, X, queue=None): X = self._validate_data(X, accept_sparse=False, reset=False) - if not hasattr(self, '_onedal_estimator'): + if not hasattr(self, "_onedal_estimator"): 
self._initialize_onedal_estimator() self._onedal_estimator.coef_ = self.coef_ self._onedal_estimator.intercept_ = self.intercept_ @@ -281,5 +319,8 @@ def _onedal_predict(self, X, queue=None): else: from daal4py.sklearn.linear_model import LinearRegression - logging.warning('Preview LinearRegression requires oneDAL version >= 2023.1 ' - 'but it was not found') + + logging.warning( + "Preview LinearRegression requires oneDAL version >= 2023.1 " + "but it was not found" + ) diff --git a/sklearnex/preview/linear_model/tests/test_preview_linear.py b/sklearnex/preview/linear_model/tests/test_preview_linear.py index fe39460136..80d00324e4 100755 --- a/sklearnex/preview/linear_model/tests/test_preview_linear.py +++ b/sklearnex/preview/linear_model/tests/test_preview_linear.py @@ -18,19 +18,21 @@ import numpy as np from numpy.testing import assert_allclose from sklearn.datasets import make_regression + from daal4py.sklearn._utils import daal_check_version def test_sklearnex_import_linear(): from sklearnex.preview.linear_model import LinearRegression + X = np.array([[1, 1], [1, 2], [2, 2], [2, 3]]) y = np.dot(X, np.array([1, 2])) + 3 linreg = LinearRegression().fit(X, y) - if daal_check_version((2023, 'P', 100)): - assert 'sklearnex' in linreg.__module__ - assert hasattr(linreg, '_onedal_estimator') + if daal_check_version((2023, "P", 100)): + assert "sklearnex" in linreg.__module__ + assert hasattr(linreg, "_onedal_estimator") else: - assert 'daal4py' in linreg.__module__ + assert "daal4py" in linreg.__module__ assert linreg.n_features_in_ == 2 - assert_allclose(linreg.intercept_, 3.) - assert_allclose(linreg.coef_, [1., 2.]) + assert_allclose(linreg.intercept_, 3.0) + assert_allclose(linreg.coef_, [1.0, 2.0]) diff --git a/sklearnex/spmd/__init__.py b/sklearnex/spmd/__init__.py index 9099df571a..3c698d694b 100644 --- a/sklearnex/spmd/__init__.py +++ b/sklearnex/spmd/__init__.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,12 +12,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== __all__ = [ - 'basic_statistics', - 'cluster', - 'decomposition', - 'ensemble', - 'linear_model', - 'neighbors'] + "basic_statistics", + "cluster", + "decomposition", + "ensemble", + "linear_model", + "neighbors", +] diff --git a/sklearnex/spmd/basic_statistics/__init__.py b/sklearnex/spmd/basic_statistics/__init__.py index 6f45ecfe5c..2b99fdbdb7 100644 --- a/sklearnex/spmd/basic_statistics/__init__.py +++ b/sklearnex/spmd/basic_statistics/__init__.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,8 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== from .basic_statistics import BasicStatistics -__all__ = ['BasicStatistics'] +__all__ = ["BasicStatistics"] diff --git a/sklearnex/spmd/basic_statistics/basic_statistics.py b/sklearnex/spmd/basic_statistics/basic_statistics.py index fadc1686d2..9073c3d941 100644 --- a/sklearnex/spmd/basic_statistics/basic_statistics.py +++ b/sklearnex/spmd/basic_statistics/basic_statistics.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from onedal.spmd.basic_statistics import BasicStatistics diff --git a/sklearnex/spmd/cluster/__init__.py b/sklearnex/spmd/cluster/__init__.py index b94f1d3918..0c39935dc2 100644 --- a/sklearnex/spmd/cluster/__init__.py +++ b/sklearnex/spmd/cluster/__init__.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,13 +12,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from daal4py.sklearn._utils import daal_check_version -if daal_check_version((2023, 'P', 200)): +if daal_check_version((2023, "P", 200)): from .kmeans import KMeans - __all__ = ['KMeans'] + __all__ = ["KMeans"] else: __all__ = [] diff --git a/sklearnex/spmd/cluster/kmeans.py b/sklearnex/spmd/cluster/kmeans.py index cf614343e2..e8f97c576d 100644 --- a/sklearnex/spmd/cluster/kmeans.py +++ b/sklearnex/spmd/cluster/kmeans.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== from onedal.spmd.cluster import KMeans diff --git a/sklearnex/spmd/decomposition/__init__.py b/sklearnex/spmd/decomposition/__init__.py index eda7b9fc14..618e0b9082 100644 --- a/sklearnex/spmd/decomposition/__init__.py +++ b/sklearnex/spmd/decomposition/__init__.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,8 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from .pca import PCA -__all__ = ['PCA'] +__all__ = ["PCA"] diff --git a/sklearnex/spmd/decomposition/pca.py b/sklearnex/spmd/decomposition/pca.py index 5bf6eb63ab..bef34e3bbb 100644 --- a/sklearnex/spmd/decomposition/pca.py +++ b/sklearnex/spmd/decomposition/pca.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from onedal.spmd.decomposition import PCA diff --git a/sklearnex/spmd/ensemble/__init__.py b/sklearnex/spmd/ensemble/__init__.py index 5dcc919355..b53fb8f910 100644 --- a/sklearnex/spmd/ensemble/__init__.py +++ b/sklearnex/spmd/ensemble/__init__.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,8 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== from .forest import RandomForestClassifier, RandomForestRegressor -__all__ = ['RandomForestClassifier', 'RandomForestRegressor'] +__all__ = ["RandomForestClassifier", "RandomForestRegressor"] diff --git a/sklearnex/spmd/ensemble/forest.py b/sklearnex/spmd/ensemble/forest.py index cfb711f3d2..8eb77ac75a 100644 --- a/sklearnex/spmd/ensemble/forest.py +++ b/sklearnex/spmd/ensemble/forest.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,17 +12,19 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from abc import ABC from onedal.spmd.ensemble import RandomForestClassifier as onedal_RandomForestClassifier from onedal.spmd.ensemble import RandomForestRegressor as onedal_RandomForestRegressor -from ...preview.ensemble.forest import RandomForestClassifier as \ - RandomForestClassifier_Batch -from ...preview.ensemble.forest import RandomForestRegressor as \ - RandomForestRegressor_Batch +from ...preview.ensemble.forest import ( + RandomForestClassifier as RandomForestClassifier_Batch, +) +from ...preview.ensemble.forest import ( + RandomForestRegressor as RandomForestRegressor_Batch, +) class BaseForestSPMD(ABC): @@ -42,16 +44,18 @@ def _onedal_cpu_supported(self, method_name, *data): ready = super()._onedal_cpu_supported(method_name, *data) if not ready: raise RuntimeError( - f'Method {method_name} in {self.__class__.__name__} ' - 'is not supported with given inputs.') + f"Method {method_name} in {self.__class__.__name__} " + "is not supported with given inputs." + ) return ready def _onedal_gpu_supported(self, method_name, *data): ready = super()._onedal_gpu_supported(method_name, *data) if not ready: raise RuntimeError( - f'Method {method_name} in {self.__class__.__name__} ' - 'is not supported with given inputs.') + f"Method {method_name} in {self.__class__.__name__} " + "is not supported with given inputs." + ) return ready @@ -64,14 +68,16 @@ def _onedal_cpu_supported(self, method_name, *data): ready = super()._onedal_cpu_supported(method_name, *data) if not ready: raise RuntimeError( - f'Method {method_name} in {self.__class__.__name__} ' - 'is not supported with given inputs.') + f"Method {method_name} in {self.__class__.__name__} " + "is not supported with given inputs." + ) return ready def _onedal_gpu_supported(self, method_name, *data): ready = super()._onedal_gpu_supported(method_name, *data) if not ready: raise RuntimeError( - f'Method {method_name} in {self.__class__.__name__} ' - 'is not supported with given inputs.') + f"Method {method_name} in {self.__class__.__name__} " + "is not supported with given inputs." 
+ ) return ready diff --git a/sklearnex/spmd/linear_model/__init__.py b/sklearnex/spmd/linear_model/__init__.py index 33e882bdcb..893243cd56 100644 --- a/sklearnex/spmd/linear_model/__init__.py +++ b/sklearnex/spmd/linear_model/__init__.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,8 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from .linear_model import LinearRegression -__all__ = ['LinearRegression'] +__all__ = ["LinearRegression"] diff --git a/sklearnex/spmd/linear_model/linear_model.py b/sklearnex/spmd/linear_model/linear_model.py index e179b3fdad..bf0814ca02 100644 --- a/sklearnex/spmd/linear_model/linear_model.py +++ b/sklearnex/spmd/linear_model/linear_model.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from onedal.spmd.linear_model import LinearRegression diff --git a/sklearnex/spmd/neighbors/__init__.py b/sklearnex/spmd/neighbors/__init__.py index 99099fa51c..11f104287a 100644 --- a/sklearnex/spmd/neighbors/__init__.py +++ b/sklearnex/spmd/neighbors/__init__.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,8 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== from .neighbors import KNeighborsClassifier, KNeighborsRegressor, NearestNeighbors -__all__ = ['KNeighborsClassifier', 'KNeighborsRegressor', 'NearestNeighbors'] +__all__ = ["KNeighborsClassifier", "KNeighborsRegressor", "NearestNeighbors"] diff --git a/sklearnex/spmd/neighbors/neighbors.py b/sklearnex/spmd/neighbors/neighbors.py index 7eaa5e9f62..1fbe9c1bd1 100644 --- a/sklearnex/spmd/neighbors/neighbors.py +++ b/sklearnex/spmd/neighbors/neighbors.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,12 +12,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from onedal.spmd.neighbors import ( KNeighborsClassifier, KNeighborsRegressor, - NearestNeighbors + NearestNeighbors, ) # TODO: diff --git a/sklearnex/svm/__init__.py b/sklearnex/svm/__init__.py index 3a9aa066d9..1ec77833b7 100755 --- a/sklearnex/svm/__init__.py +++ b/sklearnex/svm/__init__.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,15 +13,18 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from .._utils import get_sklearnex_version -if get_sklearnex_version((2021, 'P', 300)): - from .svr import SVR - from .svc import SVC - from .nusvr import NuSVR + +if get_sklearnex_version((2021, "P", 300)): from .nusvc import NuSVC - __all__ = ['SVR', 'SVC', 'NuSVC', 'NuSVR'] + from .nusvr import NuSVR + from .svc import SVC + from .svr import SVR + + __all__ = ["SVR", "SVC", "NuSVC", "NuSVR"] else: from daal4py.sklearn.svm import SVC - __all__ = ['SVC'] + + __all__ = ["SVC"] diff --git a/sklearnex/svm/_common.py b/sklearnex/svm/_common.py index 9a86e6413a..36c4d6becf 100644 --- a/sklearnex/svm/_common.py +++ b/sklearnex/svm/_common.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,17 +12,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== from abc import ABC -import numpy as np -from daal4py.sklearn._utils import sklearn_check_version, PatchingConditionsChain +import numpy as np +from sklearn.calibration import CalibratedClassifierCV from sklearn.model_selection import StratifiedKFold from sklearn.preprocessing import LabelEncoder -from sklearn.calibration import CalibratedClassifierCV -from onedal.datatypes.validation import _column_or_1d +from daal4py.sklearn._utils import PatchingConditionsChain, sklearn_check_version +from onedal.utils import _column_or_1d def get_dual_coef(self): @@ -31,7 +31,7 @@ def get_dual_coef(self): def set_dual_coef(self, value): self.dual_coef_ = value - if hasattr(self, '_onedal_estimator'): + if hasattr(self, "_onedal_estimator"): self._onedal_estimator.dual_coef_ = value if not self._is_in_fit: del self._onedal_estimator._onedal_model @@ -43,7 +43,7 @@ def get_intercept(self): def set_intercept(self, value): self._intercept_ = value - if hasattr(self, '_onedal_estimator'): + if hasattr(self, "_onedal_estimator"): self._onedal_estimator.intercept_ = value if not self._is_in_fit: del self._onedal_estimator._onedal_model @@ -51,31 +51,37 @@ def set_intercept(self, value): class BaseSVM(ABC): def _onedal_gpu_supported(self, method_name, *data): - patching_status = PatchingConditionsChain(f'sklearn.{method_name}') - patching_status.and_conditions([ - (False, 'GPU offloading is not supported.') - ]) + patching_status = PatchingConditionsChain(f"sklearn.{method_name}") + patching_status.and_conditions([(False, "GPU offloading is not supported.")]) return patching_status.get_status(logs=True) def _onedal_cpu_supported(self, method_name, *data): class_name = self.__class__.__name__ patching_status = PatchingConditionsChain( - f'sklearn.svm.{class_name}.{method_name}') - if method_name == 'fit': - patching_status.and_conditions([ - (self.kernel in ['linear', 'rbf', 'poly', 'sigmoid'], - f'Kernel is "{self.kernel}" while ' - '"linear", "rbf", "poly" and "sigmoid" are only supported.') - ]) + f"sklearn.svm.{class_name}.{method_name}" + ) + if method_name == "fit": + patching_status.and_conditions( + [ + ( + self.kernel in ["linear", "rbf", "poly", "sigmoid"], + f'Kernel is "{self.kernel}" while ' + '"linear", "rbf", "poly" and "sigmoid" are only supported.', + ) + ] + ) return patching_status.get_status(logs=True) - inference_methods = ['predict'] if class_name.endswith('R') \ - else ['predict', 'predict_proba', 'decision_function'] + inference_methods = ( + ["predict"] + if class_name.endswith("R") + else ["predict", "predict_proba", "decision_function"] + ) if method_name in inference_methods: - patching_status.and_conditions([ - (hasattr(self, '_onedal_estimator'), 'oneDAL model was not trained.') - ]) + patching_status.and_conditions( + [(hasattr(self, "_onedal_estimator"), "oneDAL model was not trained.")] + ) return patching_status.get_status(logs=True) - raise RuntimeError(f'Unknown method {method_name} in {class_name}') + raise RuntimeError(f"Unknown method {method_name} in {class_name}") class BaseSVC(BaseSVM): @@ -92,37 +98,38 @@ def _compute_balanced_class_weight(self, y): return recip_freq[le.transform(classes)] def _fit_proba(self, X, y, sample_weight=None, queue=None): - from .._config import get_config, config_context + from .._config import config_context, get_config params = self.get_params() params["probability"] = False - 
params["decision_function_shape"] = 'ovr' + params["decision_function_shape"] = "ovr" clf_base = self.__class__(**params) # We use stock metaestimators below, so the only way # to pass a queue is using config_context. cfg = get_config() - cfg['target_offload'] = queue + cfg["target_offload"] = queue with config_context(**cfg): try: n_splits = 5 n_jobs = n_splits if queue is None or queue.sycl_device.is_cpu else 1 cv = StratifiedKFold( - n_splits=n_splits, - shuffle=True, - random_state=self.random_state) + n_splits=n_splits, shuffle=True, random_state=self.random_state + ) if sklearn_check_version("0.24"): self.clf_prob = CalibratedClassifierCV( - clf_base, ensemble=False, cv=cv, method='sigmoid', - n_jobs=n_jobs) + clf_base, ensemble=False, cv=cv, method="sigmoid", n_jobs=n_jobs + ) else: self.clf_prob = CalibratedClassifierCV( - clf_base, cv=cv, method='sigmoid') + clf_base, cv=cv, method="sigmoid" + ) self.clf_prob.fit(X, y, sample_weight) except ValueError: clf_base = clf_base.fit(X, y, sample_weight) self.clf_prob = CalibratedClassifierCV( - clf_base, cv="prefit", method='sigmoid') + clf_base, cv="prefit", method="sigmoid" + ) self.clf_prob.fit(X, y, sample_weight) def _save_attributes(self): @@ -157,7 +164,7 @@ def _save_attributes(self): if sklearn_check_version("1.1"): length = int(len(self.classes_) * (len(self.classes_) - 1) / 2) - self.n_iter_ = np.full((length, ), self._onedal_estimator.n_iter_) + self.n_iter_ = np.full((length,), self._onedal_estimator.n_iter_) class BaseSVR(BaseSVM): diff --git a/sklearnex/svm/nusvc.py b/sklearnex/svm/nusvc.py index cba5aba42e..1eec55763a 100644 --- a/sklearnex/svm/nusvc.py +++ b/sklearnex/svm/nusvc.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,16 +12,18 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== + +from sklearn.exceptions import NotFittedError +from sklearn.svm import NuSVC as sklearn_NuSVC +from sklearn.utils.validation import _deprecate_positional_args from daal4py.sklearn._utils import sklearn_check_version -from ._common import BaseSVC + from .._device_offload import dispatch, wrap_output_data +from ._common import BaseSVC -from sklearn.svm import NuSVC as sklearn_NuSVC -from sklearn.utils.validation import _deprecate_positional_args -from sklearn.exceptions import NotFittedError -if sklearn_check_version('1.0'): +if sklearn_check_version("1.0"): from sklearn.utils.metaestimators import available_if from onedal.svm import NuSVC as onedal_NuSVC @@ -30,21 +32,46 @@ class NuSVC(sklearn_NuSVC, BaseSVC): __doc__ = sklearn_NuSVC.__doc__ - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): _parameter_constraints: dict = {**sklearn_NuSVC._parameter_constraints} @_deprecate_positional_args - def __init__(self, *, nu=0.5, kernel='rbf', degree=3, gamma='scale', - coef0=0.0, shrinking=True, probability=False, - tol=1e-3, cache_size=200, class_weight=None, - verbose=False, max_iter=-1, decision_function_shape='ovr', - break_ties=False, random_state=None): + def __init__( + self, + *, + nu=0.5, + kernel="rbf", + degree=3, + gamma="scale", + coef0=0.0, + shrinking=True, + probability=False, + tol=1e-3, + cache_size=200, + class_weight=None, + verbose=False, + max_iter=-1, + decision_function_shape="ovr", + break_ties=False, + random_state=None, + ): super().__init__( - nu=nu, kernel=kernel, degree=degree, gamma=gamma, coef0=coef0, - shrinking=shrinking, probability=probability, tol=tol, cache_size=cache_size, - class_weight=class_weight, verbose=verbose, max_iter=max_iter, - decision_function_shape=decision_function_shape, break_ties=break_ties, - random_state=random_state) + nu=nu, + kernel=kernel, + degree=degree, + gamma=gamma, + coef0=coef0, + shrinking=shrinking, + probability=probability, + tol=tol, + cache_size=cache_size, + class_weight=class_weight, + verbose=verbose, + max_iter=max_iter, + decision_function_shape=decision_function_shape, + break_ties=break_ties, + random_state=random_state, + ) def fit(self, X, y, sample_weight=None): """ @@ -84,10 +111,17 @@ def fit(self, X, y, sample_weight=None): self._validate_params() if sklearn_check_version("1.0"): self._check_feature_names(X, reset=True) - dispatch(self, 'fit', { - 'onedal': self.__class__._onedal_fit, - 'sklearn': sklearn_NuSVC.fit, - }, X, y, sample_weight) + dispatch( + self, + "fit", + { + "onedal": self.__class__._onedal_fit, + "sklearn": sklearn_NuSVC.fit, + }, + X, + y, + sample_weight, + ) return self @@ -111,12 +145,18 @@ def predict(self, X): """ if sklearn_check_version("1.0"): self._check_feature_names(X, reset=False) - return dispatch(self, 'predict', { - 'onedal': self.__class__._onedal_predict, - 'sklearn': sklearn_NuSVC.predict, - }, X) + return dispatch( + self, + "predict", + { + "onedal": self.__class__._onedal_predict, + "sklearn": sklearn_NuSVC.predict, + }, + X, + ) + + if sklearn_check_version("1.0"): - if sklearn_check_version('1.0'): @available_if(sklearn_NuSVC._check_proba) def predict_proba(self, X): """ @@ -146,7 +186,9 @@ def predict_proba(self, X): datasets. 
""" return self._predict_proba(X) + else: + @property def predict_proba(self): self._check_proba() @@ -156,38 +198,50 @@ def predict_proba(self): def _predict_proba(self, X): if sklearn_check_version("1.0"): self._check_feature_names(X, reset=False) - sklearn_pred_proba = (sklearn_NuSVC.predict_proba - if sklearn_check_version("1.0") - else sklearn_NuSVC._predict_proba) - - return dispatch(self, 'predict_proba', { - 'onedal': self.__class__._onedal_predict_proba, - 'sklearn': sklearn_pred_proba, - }, X) + sklearn_pred_proba = ( + sklearn_NuSVC.predict_proba + if sklearn_check_version("1.0") + else sklearn_NuSVC._predict_proba + ) + + return dispatch( + self, + "predict_proba", + { + "onedal": self.__class__._onedal_predict_proba, + "sklearn": sklearn_pred_proba, + }, + X, + ) @wrap_output_data def decision_function(self, X): if sklearn_check_version("1.0"): self._check_feature_names(X, reset=False) - return dispatch(self, 'decision_function', { - 'onedal': self.__class__._onedal_decision_function, - 'sklearn': sklearn_NuSVC.decision_function, - }, X) + return dispatch( + self, + "decision_function", + { + "onedal": self.__class__._onedal_decision_function, + "sklearn": sklearn_NuSVC.decision_function, + }, + X, + ) def _onedal_fit(self, X, y, sample_weight=None, queue=None): onedal_params = { - 'nu': self.nu, - 'kernel': self.kernel, - 'degree': self.degree, - 'gamma': self.gamma, - 'coef0': self.coef0, - 'tol': self.tol, - 'shrinking': self.shrinking, - 'cache_size': self.cache_size, - 'max_iter': self.max_iter, - 'class_weight': self.class_weight, - 'break_ties': self.break_ties, - 'decision_function_shape': self.decision_function_shape, + "nu": self.nu, + "kernel": self.kernel, + "degree": self.degree, + "gamma": self.gamma, + "coef0": self.coef0, + "tol": self.tol, + "shrinking": self.shrinking, + "cache_size": self.cache_size, + "max_iter": self.max_iter, + "class_weight": self.class_weight, + "break_ties": self.break_ties, + "decision_function_shape": self.decision_function_shape, } self._onedal_estimator = onedal_NuSVC(**onedal_params) @@ -201,15 +255,16 @@ def _onedal_predict(self, X, queue=None): return self._onedal_estimator.predict(X, queue=queue) def _onedal_predict_proba(self, X, queue=None): - if getattr(self, 'clf_prob', None) is None: + if getattr(self, "clf_prob", None) is None: raise NotFittedError( - "predict_proba is not available when fitted with probability=False") - from .._config import get_config, config_context + "predict_proba is not available when fitted with probability=False" + ) + from .._config import config_context, get_config # We use stock metaestimators below, so the only way # to pass a queue is using config_context. cfg = get_config() - cfg['target_offload'] = queue + cfg["target_offload"] = queue with config_context(**cfg): return self.clf_prob.predict_proba(X) diff --git a/sklearnex/svm/nusvr.py b/sklearnex/svm/nusvr.py index 75f14f1e69..837da54beb 100644 --- a/sklearnex/svm/nusvr.py +++ b/sklearnex/svm/nusvr.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,31 +12,53 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== - -from daal4py.sklearn._utils import sklearn_check_version -from ._common import BaseSVR -from .._device_offload import dispatch, wrap_output_data +# =============================================================================== from sklearn.svm import NuSVR as sklearn_NuSVR from sklearn.utils.validation import _deprecate_positional_args + +from daal4py.sklearn._utils import sklearn_check_version from onedal.svm import NuSVR as onedal_NuSVR +from .._device_offload import dispatch, wrap_output_data +from ._common import BaseSVR + class NuSVR(sklearn_NuSVR, BaseSVR): __doc__ = sklearn_NuSVR.__doc__ - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): _parameter_constraints: dict = {**sklearn_NuSVR._parameter_constraints} @_deprecate_positional_args - def __init__(self, *, kernel='rbf', degree=3, gamma='scale', - coef0=0.0, tol=1e-3, C=1.0, nu=0.5, shrinking=True, - cache_size=200, verbose=False, max_iter=-1): + def __init__( + self, + *, + kernel="rbf", + degree=3, + gamma="scale", + coef0=0.0, + tol=1e-3, + C=1.0, + nu=0.5, + shrinking=True, + cache_size=200, + verbose=False, + max_iter=-1, + ): super().__init__( - kernel=kernel, degree=degree, gamma=gamma, coef0=coef0, tol=tol, C=C, nu=nu, - shrinking=shrinking, cache_size=cache_size, verbose=verbose, - max_iter=max_iter) + kernel=kernel, + degree=degree, + gamma=gamma, + coef0=coef0, + tol=tol, + C=C, + nu=nu, + shrinking=shrinking, + cache_size=cache_size, + verbose=verbose, + max_iter=max_iter, + ) def fit(self, X, y, sample_weight=None): """ @@ -76,10 +98,17 @@ def fit(self, X, y, sample_weight=None): self._validate_params() if sklearn_check_version("1.0"): self._check_feature_names(X, reset=True) - dispatch(self, 'fit', { - 'onedal': self.__class__._onedal_fit, - 'sklearn': sklearn_NuSVR.fit, - }, X, y, sample_weight) + dispatch( + self, + "fit", + { + "onedal": self.__class__._onedal_fit, + "sklearn": sklearn_NuSVR.fit, + }, + X, + y, + sample_weight, + ) return self @wrap_output_data @@ -102,23 +131,28 @@ def predict(self, X): """ if sklearn_check_version("1.0"): self._check_feature_names(X, reset=False) - return dispatch(self, 'predict', { - 'onedal': self.__class__._onedal_predict, - 'sklearn': sklearn_NuSVR.predict, - }, X) + return dispatch( + self, + "predict", + { + "onedal": self.__class__._onedal_predict, + "sklearn": sklearn_NuSVR.predict, + }, + X, + ) def _onedal_fit(self, X, y, sample_weight=None, queue=None): onedal_params = { - 'C': self.C, - 'nu': self.nu, - 'kernel': self.kernel, - 'degree': self.degree, - 'gamma': self.gamma, - 'coef0': self.coef0, - 'tol': self.tol, - 'shrinking': self.shrinking, - 'cache_size': self.cache_size, - 'max_iter': self.max_iter, + "C": self.C, + "nu": self.nu, + "kernel": self.kernel, + "degree": self.degree, + "gamma": self.gamma, + "coef0": self.coef0, + "tol": self.tol, + "shrinking": self.shrinking, + "cache_size": self.cache_size, + "max_iter": self.max_iter, } self._onedal_estimator = onedal_NuSVR(**onedal_params) diff --git a/sklearnex/svm/svc.py b/sklearnex/svm/svc.py index ff4f1c3466..816502cc20 100644 --- a/sklearnex/svm/svc.py +++ b/sklearnex/svm/svc.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,19 +12,20 @@ # WITHOUT WARRANTIES OR 
CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== import numpy as np from scipy import sparse as sp +from sklearn.exceptions import NotFittedError +from sklearn.svm import SVC as sklearn_SVC +from sklearn.utils.validation import _deprecate_positional_args + +from daal4py.sklearn._utils import PatchingConditionsChain, sklearn_check_version -from daal4py.sklearn._utils import sklearn_check_version, PatchingConditionsChain -from ._common import BaseSVC from .._device_offload import dispatch, wrap_output_data +from ._common import BaseSVC -from sklearn.svm import SVC as sklearn_SVC -from sklearn.utils.validation import _deprecate_positional_args -from sklearn.exceptions import NotFittedError -if sklearn_check_version('1.0'): +if sklearn_check_version("1.0"): from sklearn.utils.metaestimators import available_if from onedal.svm import SVC as onedal_SVC @@ -33,21 +34,46 @@ class SVC(sklearn_SVC, BaseSVC): __doc__ = sklearn_SVC.__doc__ - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): _parameter_constraints: dict = {**sklearn_SVC._parameter_constraints} @_deprecate_positional_args - def __init__(self, *, C=1.0, kernel='rbf', degree=3, gamma='scale', - coef0=0.0, shrinking=True, probability=False, - tol=1e-3, cache_size=200, class_weight=None, - verbose=False, max_iter=-1, decision_function_shape='ovr', - break_ties=False, random_state=None): + def __init__( + self, + *, + C=1.0, + kernel="rbf", + degree=3, + gamma="scale", + coef0=0.0, + shrinking=True, + probability=False, + tol=1e-3, + cache_size=200, + class_weight=None, + verbose=False, + max_iter=-1, + decision_function_shape="ovr", + break_ties=False, + random_state=None, + ): super().__init__( - C=C, kernel=kernel, degree=degree, gamma=gamma, coef0=coef0, - shrinking=shrinking, probability=probability, tol=tol, cache_size=cache_size, - class_weight=class_weight, verbose=verbose, max_iter=max_iter, - decision_function_shape=decision_function_shape, break_ties=break_ties, - random_state=random_state) + C=C, + kernel=kernel, + degree=degree, + gamma=gamma, + coef0=coef0, + shrinking=shrinking, + probability=probability, + tol=tol, + cache_size=cache_size, + class_weight=class_weight, + verbose=verbose, + max_iter=max_iter, + decision_function_shape=decision_function_shape, + break_ties=break_ties, + random_state=random_state, + ) def fit(self, X, y, sample_weight=None): """ @@ -87,10 +113,17 @@ def fit(self, X, y, sample_weight=None): self._validate_params() if sklearn_check_version("1.0"): self._check_feature_names(X, reset=True) - dispatch(self, 'fit', { - 'onedal': self.__class__._onedal_fit, - 'sklearn': sklearn_SVC.fit, - }, X, y, sample_weight) + dispatch( + self, + "fit", + { + "onedal": self.__class__._onedal_fit, + "sklearn": sklearn_SVC.fit, + }, + X, + y, + sample_weight, + ) return self @wrap_output_data @@ -113,12 +146,18 @@ def predict(self, X): """ if sklearn_check_version("1.0"): self._check_feature_names(X, reset=False) - return dispatch(self, 'predict', { - 'onedal': self.__class__._onedal_predict, - 'sklearn': sklearn_SVC.predict, - }, X) + return dispatch( + self, + "predict", + { + "onedal": self.__class__._onedal_predict, + "sklearn": sklearn_SVC.predict, + }, + X, + ) + + if sklearn_check_version("1.0"): - if 
sklearn_check_version('1.0'): @available_if(sklearn_SVC._check_proba) def predict_proba(self, X): """ @@ -148,7 +187,9 @@ def predict_proba(self, X): datasets. """ return self._predict_proba(X) + else: + @property def predict_proba(self): self._check_proba() @@ -156,64 +197,88 @@ def predict_proba(self): @wrap_output_data def _predict_proba(self, X): - sklearn_pred_proba = (sklearn_SVC.predict_proba - if sklearn_check_version("1.0") - else sklearn_SVC._predict_proba) - - return dispatch(self, 'predict_proba', { - 'onedal': self.__class__._onedal_predict_proba, - 'sklearn': sklearn_pred_proba, - }, X) + sklearn_pred_proba = ( + sklearn_SVC.predict_proba + if sklearn_check_version("1.0") + else sklearn_SVC._predict_proba + ) + + return dispatch( + self, + "predict_proba", + { + "onedal": self.__class__._onedal_predict_proba, + "sklearn": sklearn_pred_proba, + }, + X, + ) @wrap_output_data def decision_function(self, X): if sklearn_check_version("1.0"): self._check_feature_names(X, reset=False) - return dispatch(self, 'decision_function', { - 'onedal': self.__class__._onedal_decision_function, - 'sklearn': sklearn_SVC.decision_function, - }, X) + return dispatch( + self, + "decision_function", + { + "onedal": self.__class__._onedal_decision_function, + "sklearn": sklearn_SVC.decision_function, + }, + X, + ) def _onedal_gpu_supported(self, method_name, *data): class_name = self.__class__.__name__ patching_status = PatchingConditionsChain( - f'sklearn.svm.{class_name}.{method_name}') - if method_name == 'fit': + f"sklearn.svm.{class_name}.{method_name}" + ) + if method_name == "fit": if len(data) > 1: self._class_count = len(np.unique(data[1])) self._is_sparse = sp.isspmatrix(data[0]) - patching_status.and_conditions([ - (self.kernel in ['linear', 'rbf'], - f'Kernel is "{self.kernel}" while ' - '"linear" and "rbf" are only supported on GPU.'), - (self.class_weight is None, 'Class weight is not supported on GPU.'), - (self._class_count == 2, 'Multiclassification is not supported on GPU.'), - (not self._is_sparse, 'Sparse input is not supported on GPU.') - ]) + patching_status.and_conditions( + [ + ( + self.kernel in ["linear", "rbf"], + f'Kernel is "{self.kernel}" while ' + '"linear" and "rbf" are only supported on GPU.', + ), + (self.class_weight is None, "Class weight is not supported on GPU."), + ( + self._class_count == 2, + "Multiclassification is not supported on GPU.", + ), + (not self._is_sparse, "Sparse input is not supported on GPU."), + ] + ) return patching_status.get_status(logs=True) - if method_name in ['predict', 'predict_proba', 'decision_function']: - patching_status.and_conditions([ - (hasattr(self, '_onedal_estimator') and self._onedal_gpu_supported( - 'fit', *data), - 'oneDAL model was not trained on GPU.') - ]) + if method_name in ["predict", "predict_proba", "decision_function"]: + patching_status.and_conditions( + [ + ( + hasattr(self, "_onedal_estimator") + and self._onedal_gpu_supported("fit", *data), + "oneDAL model was not trained on GPU.", + ) + ] + ) return patching_status.get_status(logs=True) - raise RuntimeError(f'Unknown method {method_name} in {class_name}') + raise RuntimeError(f"Unknown method {method_name} in {class_name}") def _onedal_fit(self, X, y, sample_weight=None, queue=None): onedal_params = { - 'C': self.C, - 'kernel': self.kernel, - 'degree': self.degree, - 'gamma': self.gamma, - 'coef0': self.coef0, - 'tol': self.tol, - 'shrinking': self.shrinking, - 'cache_size': self.cache_size, - 'max_iter': self.max_iter, - 'class_weight': self.class_weight, - 
'break_ties': self.break_ties, - 'decision_function_shape': self.decision_function_shape, + "C": self.C, + "kernel": self.kernel, + "degree": self.degree, + "gamma": self.gamma, + "coef0": self.coef0, + "tol": self.tol, + "shrinking": self.shrinking, + "cache_size": self.cache_size, + "max_iter": self.max_iter, + "class_weight": self.class_weight, + "break_ties": self.break_ties, + "decision_function_shape": self.decision_function_shape, } self._onedal_estimator = onedal_SVC(**onedal_params) @@ -227,15 +292,16 @@ def _onedal_predict(self, X, queue=None): return self._onedal_estimator.predict(X, queue=queue) def _onedal_predict_proba(self, X, queue=None): - if getattr(self, 'clf_prob', None) is None: + if getattr(self, "clf_prob", None) is None: raise NotFittedError( - "predict_proba is not available when fitted with probability=False") - from .._config import get_config, config_context + "predict_proba is not available when fitted with probability=False" + ) + from .._config import config_context, get_config # We use stock metaestimators below, so the only way # to pass a queue is using config_context. cfg = get_config() - cfg['target_offload'] = queue + cfg["target_offload"] = queue with config_context(**cfg): return self.clf_prob.predict_proba(X) diff --git a/sklearnex/svm/svr.py b/sklearnex/svm/svr.py index c47bc3334c..b841a1a512 100644 --- a/sklearnex/svm/svr.py +++ b/sklearnex/svm/svr.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,31 +12,53 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== - -from daal4py.sklearn._utils import sklearn_check_version -from ._common import BaseSVR -from .._device_offload import dispatch, wrap_output_data +# =============================================================================== from sklearn.svm import SVR as sklearn_SVR from sklearn.utils.validation import _deprecate_positional_args + +from daal4py.sklearn._utils import sklearn_check_version from onedal.svm import SVR as onedal_SVR +from .._device_offload import dispatch, wrap_output_data +from ._common import BaseSVR + class SVR(sklearn_SVR, BaseSVR): __doc__ = sklearn_SVR.__doc__ - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): _parameter_constraints: dict = {**sklearn_SVR._parameter_constraints} @_deprecate_positional_args - def __init__(self, *, kernel='rbf', degree=3, gamma='scale', - coef0=0.0, tol=1e-3, C=1.0, epsilon=0.1, shrinking=True, - cache_size=200, verbose=False, max_iter=-1): + def __init__( + self, + *, + kernel="rbf", + degree=3, + gamma="scale", + coef0=0.0, + tol=1e-3, + C=1.0, + epsilon=0.1, + shrinking=True, + cache_size=200, + verbose=False, + max_iter=-1, + ): super().__init__( - kernel=kernel, degree=degree, gamma=gamma, coef0=coef0, tol=tol, C=C, - epsilon=epsilon, shrinking=shrinking, cache_size=cache_size, verbose=verbose, - max_iter=max_iter) + kernel=kernel, + degree=degree, + gamma=gamma, + coef0=coef0, + tol=tol, + C=C, + epsilon=epsilon, + shrinking=shrinking, + cache_size=cache_size, + verbose=verbose, + max_iter=max_iter, + ) def fit(self, X, y, sample_weight=None): """ @@ -76,10 +98,17 @@ def fit(self, X, y, sample_weight=None): self._validate_params() if sklearn_check_version("1.0"): self._check_feature_names(X, reset=True) - dispatch(self, 'fit', { - 'onedal': self.__class__._onedal_fit, - 'sklearn': sklearn_SVR.fit, - }, X, y, sample_weight) + dispatch( + self, + "fit", + { + "onedal": self.__class__._onedal_fit, + "sklearn": sklearn_SVR.fit, + }, + X, + y, + sample_weight, + ) return self @@ -103,23 +132,28 @@ def predict(self, X): """ if sklearn_check_version("1.0"): self._check_feature_names(X, reset=False) - return dispatch(self, 'predict', { - 'onedal': self.__class__._onedal_predict, - 'sklearn': sklearn_SVR.predict, - }, X) + return dispatch( + self, + "predict", + { + "onedal": self.__class__._onedal_predict, + "sklearn": sklearn_SVR.predict, + }, + X, + ) def _onedal_fit(self, X, y, sample_weight=None, queue=None): onedal_params = { - 'C': self.C, - 'epsilon': self.epsilon, - 'kernel': self.kernel, - 'degree': self.degree, - 'gamma': self.gamma, - 'coef0': self.coef0, - 'tol': self.tol, - 'shrinking': self.shrinking, - 'cache_size': self.cache_size, - 'max_iter': self.max_iter, + "C": self.C, + "epsilon": self.epsilon, + "kernel": self.kernel, + "degree": self.degree, + "gamma": self.gamma, + "coef0": self.coef0, + "tol": self.tol, + "shrinking": self.shrinking, + "cache_size": self.cache_size, + "max_iter": self.max_iter, } self._onedal_estimator = onedal_SVR(**onedal_params) diff --git a/sklearnex/svm/tests/test_svm.py b/sklearnex/svm/tests/test_svm.py index 9ce49fc67d..6fcfb3718c 100755 --- a/sklearnex/svm/tests/test_svm.py +++ b/sklearnex/svm/tests/test_svm.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, 
Version 2.0 (the "License"); @@ -13,7 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== import numpy as np from numpy.testing import assert_allclose @@ -21,43 +21,43 @@ def test_sklearnex_import_svc(): from sklearnex.svm import SVC - X = np.array([[-2, -1], [-1, -1], [-1, -2], - [+1, +1], [+1, +2], [+2, +1]]) + + X = np.array([[-2, -1], [-1, -1], [-1, -2], [+1, +1], [+1, +2], [+2, +1]]) y = np.array([1, 1, 1, 2, 2, 2]) - svc = SVC(kernel='linear').fit(X, y) - assert 'daal4py' in svc.__module__ or 'sklearnex' in svc.__module__ - assert_allclose(svc.dual_coef_, [[-0.25, .25]]) + svc = SVC(kernel="linear").fit(X, y) + assert "daal4py" in svc.__module__ or "sklearnex" in svc.__module__ + assert_allclose(svc.dual_coef_, [[-0.25, 0.25]]) assert_allclose(svc.support_, [1, 3]) def test_sklearnex_import_nusvc(): from sklearnex.svm import NuSVC - X = np.array([[-2, -1], [-1, -1], [-1, -2], - [+1, +1], [+1, +2], [+2, +1]]) + + X = np.array([[-2, -1], [-1, -1], [-1, -2], [+1, +1], [+1, +2], [+2, +1]]) y = np.array([1, 1, 1, 2, 2, 2]) - svc = NuSVC(kernel='linear').fit(X, y) - assert 'daal4py' in svc.__module__ or 'sklearnex' in svc.__module__ + svc = NuSVC(kernel="linear").fit(X, y) + assert "daal4py" in svc.__module__ or "sklearnex" in svc.__module__ assert_allclose(svc.dual_coef_, [[-0.04761905, -0.0952381, 0.0952381, 0.04761905]]) assert_allclose(svc.support_, [0, 1, 3, 4]) def test_sklearnex_import_svr(): from sklearnex.svm import SVR - X = np.array([[-2, -1], [-1, -1], [-1, -2], - [+1, +1], [+1, +2], [+2, +1]]) + + X = np.array([[-2, -1], [-1, -1], [-1, -2], [+1, +1], [+1, +2], [+2, +1]]) y = np.array([1, 1, 1, 2, 2, 2]) - svc = SVR(kernel='linear').fit(X, y) - assert 'daal4py' in svc.__module__ or 'sklearnex' in svc.__module__ + svc = SVR(kernel="linear").fit(X, y) + assert "daal4py" in svc.__module__ or "sklearnex" in svc.__module__ assert_allclose(svc.dual_coef_, [[-0.1, 0.1]]) assert_allclose(svc.support_, [1, 3]) def test_sklearnex_import_nusvr(): from sklearnex.svm import NuSVR - X = np.array([[-2, -1], [-1, -1], [-1, -2], - [+1, +1], [+1, +2], [+2, +1]]) + + X = np.array([[-2, -1], [-1, -1], [-1, -2], [+1, +1], [+1, +2], [+2, +1]]) y = np.array([1, 1, 1, 2, 2, 2]) - svc = NuSVR(kernel='linear', nu=0.9).fit(X, y) - assert 'daal4py' in svc.__module__ or 'sklearnex' in svc.__module__ - assert_allclose(svc.dual_coef_, [[-1., 0.611111, 1., -0.611111]], rtol=1e-3) + svc = NuSVR(kernel="linear", nu=0.9).fit(X, y) + assert "daal4py" in svc.__module__ or "sklearnex" in svc.__module__ + assert_allclose(svc.dual_coef_, [[-1.0, 0.611111, 1.0, -0.611111]], rtol=1e-3) assert_allclose(svc.support_, [1, 2, 3, 5]) diff --git a/sklearnex/tests/_models_info.py b/sklearnex/tests/_models_info.py index 309feb78a6..afe213d569 100755 --- a/sklearnex/tests/_models_info.py +++ b/sklearnex/tests/_models_info.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,115 +12,126 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== import numpy as np - -from sklearn.svm import SVC -from sklearn.ensemble import (RandomForestClassifier, RandomForestRegressor) -from sklearn.neighbors import ( - KNeighborsClassifier, - KNeighborsRegressor, - NearestNeighbors, - LocalOutlierFactor) +from sklearn.cluster import DBSCAN, KMeans +from sklearn.decomposition import PCA +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor from sklearn.linear_model import ( + ElasticNet, + Lasso, + LinearRegression, LogisticRegression, LogisticRegressionCV, - LinearRegression, Ridge, - ElasticNet, - Lasso) -from sklearn.cluster import (KMeans, DBSCAN) +) from sklearn.manifold import TSNE -from sklearn.decomposition import PCA +from sklearn.neighbors import ( + KNeighborsClassifier, + KNeighborsRegressor, + LocalOutlierFactor, + NearestNeighbors, +) +from sklearn.svm import SVC MODELS_INFO = [ { - 'model': KNeighborsClassifier(algorithm='brute'), - 'methods': ['kneighbors', 'predict', 'predict_proba', 'score'], - 'dataset': 'classifier', + "model": KNeighborsClassifier(algorithm="brute"), + "methods": ["kneighbors", "predict", "predict_proba", "score"], + "dataset": "classifier", }, { - 'model': KNeighborsRegressor(algorithm='brute'), - 'methods': ['kneighbors', 'predict', 'score'], - 'dataset': 'regression', + "model": KNeighborsRegressor(algorithm="brute"), + "methods": ["kneighbors", "predict", "score"], + "dataset": "regression", }, { - 'model': NearestNeighbors(algorithm='brute'), - 'methods': ['kneighbors'], - 'dataset': 'blobs', + "model": NearestNeighbors(algorithm="brute"), + "methods": ["kneighbors"], + "dataset": "blobs", }, { - 'model': LocalOutlierFactor(novelty=False), - 'methods': ['fit_predict'], - 'dataset': 'blobs', + "model": LocalOutlierFactor(novelty=False), + "methods": ["fit_predict"], + "dataset": "blobs", }, { - 'model': LocalOutlierFactor(novelty=True), - 'methods': ['predict'], - 'dataset': 'blobs', + "model": LocalOutlierFactor(novelty=True), + "methods": ["predict"], + "dataset": "blobs", }, { - 'model': DBSCAN(), - 'methods': ['fit_predict'], - 'dataset': 'blobs', + "model": DBSCAN(), + "methods": ["fit_predict"], + "dataset": "blobs", }, { - 'model': SVC(probability=True), - 'methods': ['decision_function', 'predict', 'predict_proba', 'score'], - 'dataset': 'classifier', + "model": SVC(probability=True), + "methods": ["decision_function", "predict", "predict_proba", "score"], + "dataset": "classifier", }, { - 'model': KMeans(), - 'methods': ['fit_predict', 'fit_transform', 'transform', 'predict', 'score'], - 'dataset': 'blobs', + "model": KMeans(), + "methods": ["fit_predict", "fit_transform", "transform", "predict", "score"], + "dataset": "blobs", }, { - 'model': ElasticNet(), - 'methods': ['predict', 'score'], - 'dataset': 'regression', + "model": ElasticNet(), + "methods": ["predict", "score"], + "dataset": "regression", }, { - 'model': Lasso(), - 'methods': ['predict', 'score'], - 'dataset': 'regression', + "model": Lasso(), + "methods": ["predict", "score"], + "dataset": "regression", }, { - 'model': PCA(), - 'methods': ['fit_transform', 'transform', 'score'], - 'dataset': 'classifier', + "model": PCA(), + "methods": ["fit_transform", "transform", "score"], + "dataset": "classifier", }, { - 'model': 
LogisticRegression(max_iter=100, multi_class='multinomial'), - 'methods': ['decision_function', 'predict', 'predict_proba', - 'predict_log_proba', 'score'], - 'dataset': 'classifier', + "model": LogisticRegression(max_iter=100, multi_class="multinomial"), + "methods": [ + "decision_function", + "predict", + "predict_proba", + "predict_log_proba", + "score", + ], + "dataset": "classifier", }, { - 'model': LogisticRegressionCV(max_iter=100), - 'methods': ['decision_function', 'predict', 'predict_proba', - 'predict_log_proba', 'score'], - 'dataset': 'classifier', + "model": LogisticRegressionCV(max_iter=100), + "methods": [ + "decision_function", + "predict", + "predict_proba", + "predict_log_proba", + "score", + ], + "dataset": "classifier", }, { - 'model': RandomForestClassifier(n_estimators=10), - 'methods': ['predict', 'predict_proba', 'predict_log_proba', 'score'], - 'dataset': 'classifier', + "model": RandomForestClassifier(n_estimators=10), + "methods": ["predict", "predict_proba", "predict_log_proba", "score"], + "dataset": "classifier", }, { - 'model': RandomForestRegressor(n_estimators=10), - 'methods': ['predict', 'score'], - 'dataset': 'regression', + "model": RandomForestRegressor(n_estimators=10), + "methods": ["predict", "score"], + "dataset": "regression", }, { - 'model': LinearRegression(), - 'methods': ['predict', 'score'], - 'dataset': 'regression', + "model": LinearRegression(), + "methods": ["predict", "score"], + "dataset": "regression", }, { - 'model': Ridge(), - 'methods': ['predict', 'score'], - 'dataset': 'regression', + "model": Ridge(), + "methods": ["predict", "score"], + "dataset": "regression", }, ] @@ -140,20 +151,20 @@ TO_SKIP = [ # --------------- NO INFO --------------- - r'KMeans .*transform', - r'KMeans .*score', - r'PCA .*score', - r'LogisticRegression .*decision_function', - r'LogisticRegressionCV .*decision_function', - r'LogisticRegressionCV .*predict', - r'LogisticRegressionCV .*predict_proba', - r'LogisticRegressionCV .*predict_log_proba', - r'LogisticRegressionCV .*score', + r"KMeans .*transform", + r"KMeans .*score", + r"PCA .*score", + r"LogisticRegression .*decision_function", + r"LogisticRegressionCV .*decision_function", + r"LogisticRegressionCV .*predict", + r"LogisticRegressionCV .*predict_proba", + r"LogisticRegressionCV .*predict_log_proba", + r"LogisticRegressionCV .*score", # --------------- Scikit --------------- - r'Ridge float16 predict', - r'Ridge float16 score', - r'RandomForestClassifier .*predict_proba', - r'RandomForestClassifier .*predict_log_proba', - r'pairwise_distances .*pairwise_distances', # except float64 - r'roc_auc_score .*roc_auc_score' + r"Ridge float16 predict", + r"Ridge float16 score", + r"RandomForestClassifier .*predict_proba", + r"RandomForestClassifier .*predict_log_proba", + r"pairwise_distances .*pairwise_distances", # except float64 + r"roc_auc_score .*roc_auc_score", ] diff --git a/sklearnex/tests/test_config.py b/sklearnex/tests/test_config.py index 0a18b00dd3..36f5e82b0a 100644 --- a/sklearnex/tests/test_config.py +++ b/sklearnex/tests/test_config.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,9 +12,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== import sklearn + import sklearnex @@ -27,12 +28,12 @@ def test_get_config_contains_sklearn_params(): def test_set_config_works(): default_config = sklearnex.get_config() - sklearnex.set_config(assume_finite=True, - target_offload='cpu:0', - allow_fallback_to_host=True) + sklearnex.set_config( + assume_finite=True, target_offload="cpu:0", allow_fallback_to_host=True + ) config = sklearnex.get_config() - assert config['target_offload'] == 'cpu:0' - assert config['allow_fallback_to_host'] - assert config['assume_finite'] + assert config["target_offload"] == "cpu:0" + assert config["allow_fallback_to_host"] + assert config["assume_finite"] sklearnex.set_config(**default_config) diff --git a/sklearnex/tests/test_memory_usage.py b/sklearnex/tests/test_memory_usage.py index fa02df6f5b..bd7d87bd51 100644 --- a/sklearnex/tests/test_memory_usage.py +++ b/sklearnex/tests/test_memory_usage.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,31 +12,35 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== -import pytest -import types +import gc +import logging import tracemalloc +import types + +import numpy as np +import pandas as pd +import pytest +from scipy.stats import pearsonr +from sklearn.base import BaseEstimator +from sklearn.datasets import make_classification +from sklearn.model_selection import KFold + from sklearnex import get_patch_map -from sklearnex.model_selection import train_test_split -from sklearnex.utils import _assert_all_finite from sklearnex.metrics import pairwise_distances, roc_auc_score +from sklearnex.model_selection import train_test_split from sklearnex.preview.decomposition import PCA as PreviewPCA -from sklearnex.preview.linear_model import LinearRegression as PreviewLinearRegression +from sklearnex.preview.ensemble import ExtraTreesClassifier as PreviewExtraTreesClassifier +from sklearnex.preview.ensemble import ExtraTreesRegressor as PreviewExtraTreesRegressor from sklearnex.preview.ensemble import ( RandomForestClassifier as PreviewRandomForestClassifier, +) +from sklearnex.preview.ensemble import ( RandomForestRegressor as PreviewRandomForestRegressor, - ExtraTreesClassifier as PreviewExtraTreesClassifier, - ExtraTreesRegressor as PreviewExtraTreesRegressor ) -from sklearn.base import BaseEstimator -from sklearn.model_selection import KFold -from sklearn.datasets import make_classification -from scipy.stats import pearsonr -import pandas as pd -import numpy as np -import gc -import logging +from sklearnex.preview.linear_model import LinearRegression as PreviewLinearRegression +from sklearnex.utils import _assert_all_finite class TrainTestSplitEstimator: @@ -63,12 +67,12 @@ def fit(self, x, y): class CosineDistancesEstimator(PairwiseDistancesEstimator): def __init__(self): - 
self.metric = 'cosine' + self.metric = "cosine" class CorrelationDistancesEstimator(PairwiseDistancesEstimator): def __init__(self): - self.metric = 'correlation' + self.metric = "correlation" class RocAucEstimator: @@ -87,21 +91,21 @@ def get_patched_estimators(ban_list, output_list): if not isinstance(estimator, types.FunctionType): if name not in ban_list: if isinstance(estimator(), BaseEstimator): - if hasattr(estimator, 'fit'): + if hasattr(estimator, "fit"): output_list.append(estimator) def remove_duplicated_estimators(estimators_list): estimators_map = {} for estimator in estimators_list: - full_name = f'{estimator.__module__}.{estimator.__name__}' + full_name = f"{estimator.__module__}.{estimator.__name__}" estimators_map[full_name] = estimator return estimators_map.values() BANNED_ESTIMATORS = ( - 'LocalOutlierFactor', # fails on ndarray_c for sklearn > 1.0 - 'TSNE', # too slow for using in testing on common data size + "LocalOutlierFactor", # fails on ndarray_c for sklearn > 1.0 + "TSNE", # too slow for using in testing on common data size ) estimators = [ PreviewPCA, @@ -114,7 +118,7 @@ def remove_duplicated_estimators(estimators_list): FiniteCheckEstimator, CosineDistancesEstimator, CorrelationDistancesEstimator, - RocAucEstimator + RocAucEstimator, ] get_patched_estimators(BANNED_ESTIMATORS, estimators) estimators = remove_duplicated_estimators(estimators) @@ -136,17 +140,9 @@ def dataframe_f(x, y): return pd.DataFrame(np.asfortranarray(x)), pd.Series(y) -data_transforms = [ - ndarray_c, - ndarray_f, - dataframe_c, - dataframe_f -] +data_transforms = [ndarray_c, ndarray_f, dataframe_c, dataframe_f] -data_shapes = [ - (1000, 100), - (2000, 50) -] +data_shapes = [(1000, 100), (2000, 50)] EXTRA_MEMORY_THRESHOLD = 0.15 N_SPLITS = 10 @@ -154,9 +150,13 @@ def dataframe_f(x, y): def gen_clsf_data(n_samples, n_features): data, label = make_classification( - n_classes=2, n_samples=n_samples, n_features=n_features, random_state=777) - return data, label, \ - data.size * data.dtype.itemsize + label.size * label.dtype.itemsize + n_classes=2, n_samples=n_samples, n_features=n_features, random_state=777 + ) + return ( + data, + label, + data.size * data.dtype.itemsize + label.size * label.dtype.itemsize, + ) def split_train_inference(kf, x, y, estimator): @@ -172,11 +172,11 @@ def split_train_inference(kf, x, y, estimator): # fallback to stock scikit-learn with default parameters alg = estimator() alg.fit(x_train, y_train) - if hasattr(alg, 'predict'): + if hasattr(alg, "predict"): alg.predict(x_test) - elif hasattr(alg, 'transform'): + elif hasattr(alg, "transform"): alg.transform(x_test) - elif hasattr(alg, 'kneighbors'): + elif hasattr(alg, "kneighbors"): alg.kneighbors(x_test) del alg, x_train, x_test, y_train, y_test mem_tracks.append(tracemalloc.get_traced_memory()[0]) @@ -194,38 +194,46 @@ def _kfold_function_template(estimator, data_transform_function, data_shape): mem_before, _ = tracemalloc.get_traced_memory() mem_tracks = split_train_inference(kf, x, y, estimator) - mem_iter_diffs = (np.array(mem_tracks[1:]) - np.array(mem_tracks[:-1])) + mem_iter_diffs = np.array(mem_tracks[1:]) - np.array(mem_tracks[:-1]) mem_incr_mean, mem_incr_std = mem_iter_diffs.mean(), mem_iter_diffs.std() mem_incr_mean, mem_incr_std = round(mem_incr_mean), round(mem_incr_std) mem_iter_corr, _ = pearsonr(mem_tracks, list(range(len(mem_tracks)))) if mem_iter_corr > 0.95: - logging.warning('Memory usage is steadily increasing with iterations ' - '(Pearson correlation coefficient between ' - f'memory tracks 
and iterations is {mem_iter_corr})\n' - 'Memory usage increase per iteration: ' - f'{mem_incr_mean}±{mem_incr_std} bytes') + logging.warning( + "Memory usage is steadily increasing with iterations " + "(Pearson correlation coefficient between " + f"memory tracks and iterations is {mem_iter_corr})\n" + "Memory usage increase per iteration: " + f"{mem_incr_mean}±{mem_incr_std} bytes" + ) mem_before_gc, _ = tracemalloc.get_traced_memory() mem_diff = mem_before_gc - mem_before - message = 'Size of extra allocated memory {} using garbage collector ' \ - f'is greater than {EXTRA_MEMORY_THRESHOLD * 100}% of input data' \ - f'\n\tAlgorithm: {estimator.__name__}' \ - f'\n\tInput data size: {data_memory_size} bytes' \ - '\n\tExtra allocated memory size: {} bytes' \ - ' / {} %' + message = ( + "Size of extra allocated memory {} using garbage collector " + f"is greater than {EXTRA_MEMORY_THRESHOLD * 100}% of input data" + f"\n\tAlgorithm: {estimator.__name__}" + f"\n\tInput data size: {data_memory_size} bytes" + "\n\tExtra allocated memory size: {} bytes" + " / {} %" + ) if mem_diff >= EXTRA_MEMORY_THRESHOLD * data_memory_size: - logging.warning(message.format( - 'before', mem_diff, round((mem_diff) / data_memory_size * 100, 2))) + logging.warning( + message.format( + "before", mem_diff, round((mem_diff) / data_memory_size * 100, 2) + ) + ) gc.collect() mem_after, _ = tracemalloc.get_traced_memory() tracemalloc.stop() mem_diff = mem_after - mem_before - assert mem_diff < EXTRA_MEMORY_THRESHOLD * data_memory_size, \ - message.format('after', mem_diff, round((mem_diff) / data_memory_size * 100, 2)) + assert mem_diff < EXTRA_MEMORY_THRESHOLD * data_memory_size, message.format( + "after", mem_diff, round((mem_diff) / data_memory_size * 100, 2) + ) -@pytest.mark.parametrize('data_transform_function', data_transforms) -@pytest.mark.parametrize('estimator', estimators) -@pytest.mark.parametrize('data_shape', data_shapes) +@pytest.mark.parametrize("data_transform_function", data_transforms) +@pytest.mark.parametrize("estimator", estimators) +@pytest.mark.parametrize("data_shape", data_shapes) def test_memory_leaks(estimator, data_transform_function, data_shape): _kfold_function_template(estimator, data_transform_function, data_shape) diff --git a/sklearnex/tests/test_monkeypatch.py b/sklearnex/tests/test_monkeypatch.py index 96de29f698..bcb91f0f99 100755 --- a/sklearnex/tests/test_monkeypatch.py +++ b/sklearnex/tests/test_monkeypatch.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== import sklearnex from daal4py.sklearn._utils import daal_check_version @@ -35,9 +35,9 @@ def test_monkey_patching(): n = _classes[i][1] class_module = getattr(p, n).__module__ - assert \ - class_module.startswith('daal4py') or class_module.startswith('sklearnex'), \ - "Patching has completed with error." + assert class_module.startswith("daal4py") or class_module.startswith( + "sklearnex" + ), "Patching has completed with error."
for i, _ in enumerate(_tokens): t = _tokens[i] @@ -46,8 +46,7 @@ def test_monkey_patching(): sklearnex.unpatch_sklearn(t) class_module = getattr(p, n).__module__ - assert class_module.startswith('sklearn'), \ - "Unpatching has completed with error." + assert class_module.startswith("sklearn"), "Unpatching has completed with error." sklearnex.unpatch_sklearn() @@ -57,8 +56,7 @@ def test_monkey_patching(): n = _classes[i][1] class_module = getattr(p, n).__module__ - assert class_module.startswith('sklearn'), \ - "Unpatching has completed with error." + assert class_module.startswith("sklearn"), "Unpatching has completed with error." sklearnex.unpatch_sklearn() @@ -70,9 +68,9 @@ def test_monkey_patching(): sklearnex.patch_sklearn(t) class_module = getattr(p, n).__module__ - assert \ - class_module.startswith('daal4py') or class_module.startswith('sklearnex'), \ - "Patching has completed with error." + assert class_module.startswith("daal4py") or class_module.startswith( + "sklearnex" + ), "Patching has completed with error." sklearnex.unpatch_sklearn() @@ -81,14 +79,14 @@ def test_patch_by_list_simple(): sklearnex.patch_sklearn(["LogisticRegression"]) from sklearn.ensemble import RandomForestRegressor - from sklearn.neighbors import KNeighborsRegressor from sklearn.linear_model import LogisticRegression + from sklearn.neighbors import KNeighborsRegressor from sklearn.svm import SVC - assert RandomForestRegressor.__module__.startswith('sklearn') - assert KNeighborsRegressor.__module__.startswith('sklearn') - assert LogisticRegression.__module__.startswith('daal4py') - assert SVC.__module__.startswith('sklearn') + assert RandomForestRegressor.__module__.startswith("sklearn") + assert KNeighborsRegressor.__module__.startswith("sklearn") + assert LogisticRegression.__module__.startswith("daal4py") + assert SVC.__module__.startswith("sklearn") sklearnex.unpatch_sklearn() @@ -97,14 +95,14 @@ def test_patch_by_list_many_estimators(): sklearnex.patch_sklearn(["LogisticRegression", "SVC"]) from sklearn.ensemble import RandomForestRegressor - from sklearn.neighbors import KNeighborsRegressor from sklearn.linear_model import LogisticRegression + from sklearn.neighbors import KNeighborsRegressor from sklearn.svm import SVC - assert RandomForestRegressor.__module__.startswith('sklearn') - assert KNeighborsRegressor.__module__.startswith('sklearn') - assert LogisticRegression.__module__.startswith('daal4py') - assert SVC.__module__.startswith('daal4py') or SVC.__module__.startswith('sklearnex') + assert RandomForestRegressor.__module__.startswith("sklearn") + assert KNeighborsRegressor.__module__.startswith("sklearn") + assert LogisticRegression.__module__.startswith("daal4py") + assert SVC.__module__.startswith("daal4py") or SVC.__module__.startswith("sklearnex") sklearnex.unpatch_sklearn() @@ -113,31 +111,32 @@ def test_unpatch_by_list_many_estimators(): sklearnex.patch_sklearn() from sklearn.ensemble import RandomForestRegressor - from sklearn.neighbors import KNeighborsRegressor from sklearn.linear_model import LogisticRegression + from sklearn.neighbors import KNeighborsRegressor from sklearn.svm import SVC - assert RandomForestRegressor.__module__.startswith('daal4py') - assert KNeighborsRegressor.__module__.startswith('daal4py') or \ - KNeighborsRegressor.__module__.startswith('sklearnex') - assert LogisticRegression.__module__.startswith('daal4py') - assert SVC.__module__.startswith('daal4py') or SVC.__module__.startswith('sklearnex') + assert RandomForestRegressor.__module__.startswith("daal4py") 
+ assert KNeighborsRegressor.__module__.startswith( + "daal4py" + ) or KNeighborsRegressor.__module__.startswith("sklearnex") + assert LogisticRegression.__module__.startswith("daal4py") + assert SVC.__module__.startswith("daal4py") or SVC.__module__.startswith("sklearnex") sklearnex.unpatch_sklearn(["KNeighborsRegressor", "RandomForestRegressor"]) from sklearn.ensemble import RandomForestRegressor - from sklearn.neighbors import KNeighborsRegressor from sklearn.linear_model import LogisticRegression + from sklearn.neighbors import KNeighborsRegressor from sklearn.svm import SVC - assert RandomForestRegressor.__module__.startswith('sklearn') - assert KNeighborsRegressor.__module__.startswith('sklearn') - assert LogisticRegression.__module__.startswith('daal4py') - assert SVC.__module__.startswith('daal4py') or SVC.__module__.startswith('sklearnex') + assert RandomForestRegressor.__module__.startswith("sklearn") + assert KNeighborsRegressor.__module__.startswith("sklearn") + assert LogisticRegression.__module__.startswith("daal4py") + assert SVC.__module__.startswith("daal4py") or SVC.__module__.startswith("sklearnex") def test_patching_checker(): - for name in [None, 'SVC', 'PCA']: + for name in [None, "SVC", "PCA"]: sklearnex.patch_sklearn(name=name) assert sklearnex.sklearn_is_patched(name=name) @@ -159,10 +158,10 @@ def test_patching_checker(): def test_preview_namespace(): def get_estimators(): + from sklearn.cluster import DBSCAN + from sklearn.decomposition import PCA from sklearn.ensemble import RandomForestClassifier from sklearn.linear_model import LinearRegression - from sklearn.decomposition import PCA - from sklearn.cluster import DBSCAN from sklearn.svm import SVC return LinearRegression(), PCA(), DBSCAN(), SVC(), RandomForestClassifier() @@ -175,34 +174,34 @@ def get_estimators(): assert sklearnex.dispatcher._is_preview_enabled() lr, pca, dbscan, svc, rfc = get_estimators() - assert 'sklearnex.preview' in rfc.__module__ + assert "sklearnex.preview" in rfc.__module__ - if daal_check_version((2023, 'P', 100)): - assert 'sklearnex.preview' in lr.__module__ + if daal_check_version((2023, "P", 100)): + assert "sklearnex.preview" in lr.__module__ else: - assert 'daal4py' in lr.__module__ + assert "daal4py" in lr.__module__ - assert 'sklearnex.preview' in pca.__module__ - assert 'daal4py' in dbscan.__module__ - assert 'sklearnex' in svc.__module__ + assert "sklearnex.preview" in pca.__module__ + assert "daal4py" in dbscan.__module__ + assert "sklearnex" in svc.__module__ sklearnex.unpatch_sklearn() # no patching behavior lr, pca, dbscan, svc, rfc = get_estimators() - assert 'sklearn.' in lr.__module__ - assert 'sklearn.' in pca.__module__ - assert 'sklearn.' in dbscan.__module__ - assert 'sklearn.' in svc.__module__ - assert 'sklearn.' in rfc.__module__ + assert "sklearn." in lr.__module__ + assert "sklearn." in pca.__module__ + assert "sklearn." in dbscan.__module__ + assert "sklearn." in svc.__module__ + assert "sklearn." 
in rfc.__module__ # default patching behavior sklearnex.patch_sklearn() assert not sklearnex.dispatcher._is_preview_enabled() lr, pca, dbscan, svc, rfc = get_estimators() - assert 'daal4py' in lr.__module__ - assert 'daal4py' in pca.__module__ - assert 'daal4py' in rfc.__module__ - assert 'daal4py' in dbscan.__module__ - assert 'sklearnex' in svc.__module__ + assert "daal4py" in lr.__module__ + assert "daal4py" in pca.__module__ + assert "daal4py" in rfc.__module__ + assert "daal4py" in dbscan.__module__ + assert "sklearnex" in svc.__module__ sklearnex.unpatch_sklearn() diff --git a/sklearnex/tests/test_patching.py b/sklearnex/tests/test_patching.py index c739f3b6d1..cd0c5d2fca 100755 --- a/sklearnex/tests/test_patching.py +++ b/sklearnex/tests/test_patching.py @@ -99,8 +99,11 @@ def _load_all_models(patched): models = [] for patch_infos in get_patch_map().values(): maybe_class = getattr(patch_infos[0][0][0], patch_infos[0][0][1]) - if maybe_class is not None and isclass(maybe_class) and \ - issubclass(maybe_class, BaseEstimator): + if ( + maybe_class is not None + and isclass(maybe_class) + and issubclass(maybe_class, BaseEstimator) + ): models.append(maybe_class()) if patched: @@ -113,9 +116,7 @@ def _load_all_models(patched): UNPATCHED_MODELS = _load_all_models(patched=False) -@pytest.mark.parametrize( - ("patched", "unpatched"), zip(PATCHED_MODELS, UNPATCHED_MODELS) -) +@pytest.mark.parametrize(("patched", "unpatched"), zip(PATCHED_MODELS, UNPATCHED_MODELS)) def test_is_patched_instance(patched, unpatched): assert is_patched_instance(patched), f"{patched} is a patched instance" assert not is_patched_instance(unpatched), f"{unpatched} is an unpatched instance" diff --git a/sklearnex/tests/test_run_to_run_stability_tests.py b/sklearnex/tests/test_run_to_run_stability_tests.py index 91d84ec283..33f39bea79 100755 --- a/sklearnex/tests/test_run_to_run_stability_tests.py +++ b/sklearnex/tests/test_run_to_run_stability_tests.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,31 +12,48 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== + +import random -import daal4py as d4p import numpy as np import pytest -import random +import daal4py as d4p from sklearnex import patch_sklearn + patch_sklearn() -from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor -from sklearn.linear_model import LogisticRegression, LogisticRegressionCV -from sklearn.neighbors import (KNeighborsClassifier, KNeighborsRegressor, - NearestNeighbors, LocalOutlierFactor) -from sklearn.linear_model import LinearRegression, Ridge, ElasticNet, Lasso -from sklearn.cluster import KMeans, DBSCAN +from scipy import sparse +from sklearn.cluster import DBSCAN, KMeans +from sklearn.datasets import ( + load_breast_cancer, + load_diabetes, + load_iris, + make_classification, + make_regression, +) from sklearn.decomposition import PCA -from sklearn.svm import SVC, NuSVC, SVR, NuSVR +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor +from sklearn.linear_model import ( + ElasticNet, + Lasso, + LinearRegression, + LogisticRegression, + LogisticRegressionCV, + Ridge, +) from sklearn.manifold import TSNE +from sklearn.metrics import pairwise_distances, roc_auc_score from sklearn.model_selection import train_test_split +from sklearn.neighbors import ( + KNeighborsClassifier, + KNeighborsRegressor, + LocalOutlierFactor, + NearestNeighbors, +) +from sklearn.svm import SVC, SVR, NuSVC, NuSVR -from sklearn.datasets import (make_classification, make_regression, - load_breast_cancer, load_diabetes, load_iris) -from sklearn.metrics import pairwise_distances, roc_auc_score -from scipy import sparse from daal4py.sklearn._utils import daal_check_version # to reproduce errors even in CI @@ -51,40 +68,40 @@ def method_processing(X, clf, methods): res = [] name = [] for i in methods: - if i == 'predict': + if i == "predict": res.append(clf.predict(X)) - name.append(get_class_name(clf) + '.predict(X)') - elif i == 'predict_proba': + name.append(get_class_name(clf) + ".predict(X)") + elif i == "predict_proba": res.append(clf.predict_proba(X)) - name.append(get_class_name(clf) + '.predict_proba(X)') - elif i == 'decision_function': + name.append(get_class_name(clf) + ".predict_proba(X)") + elif i == "decision_function": res.append(clf.decision_function(X)) - name.append(get_class_name(clf) + '.decision_function(X)') - elif i == 'kneighbors': + name.append(get_class_name(clf) + ".decision_function(X)") + elif i == "kneighbors": dist, idx = clf.kneighbors(X) res.append(dist) - name.append('dist') + name.append("dist") res.append(idx) - name.append('idx') - elif i == 'fit_predict': + name.append("idx") + elif i == "fit_predict": predict = clf.fit_predict(X) res.append(predict) - name.append(get_class_name(clf) + '.fit_predict') - elif i == 'fit_transform': + name.append(get_class_name(clf) + ".fit_predict") + elif i == "fit_transform": res.append(clf.fit_transform(X)) - name.append(get_class_name(clf) + '.fit_transform') - elif i == 'transform': + name.append(get_class_name(clf) + ".fit_transform") + elif i == "transform": res.append(clf.transform(X)) - name.append(get_class_name(clf) + '.transform(X)') - elif i == 'get_covariance': + name.append(get_class_name(clf) + ".transform(X)") + elif i == "get_covariance": res.append(clf.get_covariance()) - name.append(get_class_name(clf) + '.get_covariance()') - elif i == 'get_precision': + name.append(get_class_name(clf) + 
".get_covariance()") + elif i == "get_precision": res.append(clf.get_precision()) - name.append(get_class_name(clf) + '.get_precision()') - elif i == 'score_samples': + name.append(get_class_name(clf) + ".get_precision()") + elif i == "score_samples": res.append(clf.score_samples(X)) - name.append(get_class_name(clf) + '.score_samples(X)') + name.append(get_class_name(clf) + ".score_samples(X)") return res, name @@ -98,29 +115,30 @@ def func(X, Y, clf, methods): if isinstance(ans, np.ndarray) and None in ans: continue res.append(ans) - name.append(get_class_name(clf) + '.' + i) + name.append(get_class_name(clf) + "." + i) return res, name def _run_test(model, methods, dataset): datasets = [] - if dataset in ['blobs', 'classifier', 'sparse']: + if dataset in ["blobs", "classifier", "sparse"]: X1, y1 = load_iris(return_X_y=True) - if dataset == 'sparse': + if dataset == "sparse": X1 = sparse.csr_matrix(X1) datasets.append((X1, y1)) X2, y2 = load_breast_cancer(return_X_y=True) - if dataset == 'sparse': + if dataset == "sparse": X2 = sparse.csr_matrix(X2) datasets.append((X2, y2)) - elif dataset == 'regression': - X1, y1 = make_regression(n_samples=500, n_features=10, - noise=64.0, random_state=42) + elif dataset == "regression": + X1, y1 = make_regression( + n_samples=500, n_features=10, noise=64.0, random_state=42 + ) datasets.append((X1, y1)) X2, y2 = load_diabetes(return_X_y=True) datasets.append((X2, y2)) else: - raise ValueError('Unknown dataset type') + raise ValueError("Unknown dataset type") for X, y in datasets: baseline, name = func(X, y, model, methods) @@ -128,239 +146,264 @@ def _run_test(model, methods, dataset): res, _ = func(X, y, model, methods) for a, b, n in zip(res, baseline, name): - np.testing.assert_allclose(a, b, rtol=0.0, atol=0.0, - err_msg=str(n + " is incorrect")) + np.testing.assert_allclose( + a, b, rtol=0.0, atol=0.0, err_msg=str(n + " is incorrect") + ) MODELS_INFO = [ { - 'model': KNeighborsClassifier(n_neighbors=10, algorithm='brute', - weights="uniform"), - 'methods': ['predict', 'predict_proba', 'kneighbors'], - 'dataset': 'classifier', + "model": KNeighborsClassifier( + n_neighbors=10, algorithm="brute", weights="uniform" + ), + "methods": ["predict", "predict_proba", "kneighbors"], + "dataset": "classifier", }, { - 'model': KNeighborsClassifier(n_neighbors=10, algorithm='brute', - weights="distance"), - 'methods': ['predict', 'predict_proba', 'kneighbors'], - 'dataset': 'classifier', + "model": KNeighborsClassifier( + n_neighbors=10, algorithm="brute", weights="distance" + ), + "methods": ["predict", "predict_proba", "kneighbors"], + "dataset": "classifier", }, { - 'model': KNeighborsClassifier(n_neighbors=10, algorithm='kd_tree', - weights="uniform"), - 'methods': ['predict', 'predict_proba', 'kneighbors'], - 'dataset': 'classifier', + "model": KNeighborsClassifier( + n_neighbors=10, algorithm="kd_tree", weights="uniform" + ), + "methods": ["predict", "predict_proba", "kneighbors"], + "dataset": "classifier", }, { - 'model': KNeighborsClassifier(n_neighbors=10, algorithm='kd_tree', - weights="distance"), - 'methods': ['predict', 'predict_proba', 'kneighbors'], - 'dataset': 'classifier', + "model": KNeighborsClassifier( + n_neighbors=10, algorithm="kd_tree", weights="distance" + ), + "methods": ["predict", "predict_proba", "kneighbors"], + "dataset": "classifier", }, { - 'model': KNeighborsRegressor(n_neighbors=10, algorithm='kd_tree', - weights="distance"), - 'methods': ['predict', 'kneighbors'], - 'dataset': 'regression', + "model": KNeighborsRegressor( 
+ n_neighbors=10, algorithm="kd_tree", weights="distance" + ), + "methods": ["predict", "kneighbors"], + "dataset": "regression", }, { - 'model': KNeighborsRegressor(n_neighbors=10, algorithm='kd_tree', - weights="uniform"), - 'methods': ['predict', 'kneighbors'], - 'dataset': 'regression', + "model": KNeighborsRegressor( + n_neighbors=10, algorithm="kd_tree", weights="uniform" + ), + "methods": ["predict", "kneighbors"], + "dataset": "regression", }, { - 'model': KNeighborsRegressor(n_neighbors=10, algorithm='brute', - weights="distance"), - 'methods': ['predict', 'kneighbors'], - 'dataset': 'regression', + "model": KNeighborsRegressor( + n_neighbors=10, algorithm="brute", weights="distance" + ), + "methods": ["predict", "kneighbors"], + "dataset": "regression", }, { - 'model': KNeighborsRegressor(n_neighbors=10, algorithm='brute', - weights="uniform"), - 'methods': ['predict', 'kneighbors'], - 'dataset': 'regression', + "model": KNeighborsRegressor( + n_neighbors=10, algorithm="brute", weights="uniform" + ), + "methods": ["predict", "kneighbors"], + "dataset": "regression", }, { - 'model': NearestNeighbors(n_neighbors=10, algorithm='brute'), - 'methods': ['kneighbors'], - 'dataset': 'blobs', + "model": NearestNeighbors(n_neighbors=10, algorithm="brute"), + "methods": ["kneighbors"], + "dataset": "blobs", }, { - 'model': NearestNeighbors(n_neighbors=10, algorithm='kd_tree'), - 'methods': ['kneighbors'], - 'dataset': 'blobs', + "model": NearestNeighbors(n_neighbors=10, algorithm="kd_tree"), + "methods": ["kneighbors"], + "dataset": "blobs", }, { - 'model': LocalOutlierFactor(n_neighbors=10, novelty=False), - 'methods': ['fit_predict'], - 'dataset': 'blobs', + "model": LocalOutlierFactor(n_neighbors=10, novelty=False), + "methods": ["fit_predict"], + "dataset": "blobs", }, { - 'model': LocalOutlierFactor(n_neighbors=10, novelty=True), - 'methods': ['predict'], - 'dataset': 'blobs', + "model": LocalOutlierFactor(n_neighbors=10, novelty=True), + "methods": ["predict"], + "dataset": "blobs", }, { - 'model': DBSCAN(algorithm="brute", n_jobs=-1), - 'methods': [], - 'dataset': 'blobs', + "model": DBSCAN(algorithm="brute", n_jobs=-1), + "methods": [], + "dataset": "blobs", }, { - 'model': SVC(kernel='rbf'), - 'methods': ['predict', 'decision_function'], - 'dataset': 'classifier', + "model": SVC(kernel="rbf"), + "methods": ["predict", "decision_function"], + "dataset": "classifier", }, { - 'model': SVC(kernel='rbf'), - 'methods': ['predict', 'decision_function'], - 'dataset': 'sparse', + "model": SVC(kernel="rbf"), + "methods": ["predict", "decision_function"], + "dataset": "sparse", }, { - 'model': NuSVC(kernel='rbf'), - 'methods': ['predict', 'decision_function'], - 'dataset': 'classifier', + "model": NuSVC(kernel="rbf"), + "methods": ["predict", "decision_function"], + "dataset": "classifier", }, { - 'model': SVR(kernel='rbf'), - 'methods': ['predict'], - 'dataset': 'regression', + "model": SVR(kernel="rbf"), + "methods": ["predict"], + "dataset": "regression", }, { - 'model': NuSVR(kernel='rbf'), - 'methods': ['predict'], - 'dataset': 'regression', + "model": NuSVR(kernel="rbf"), + "methods": ["predict"], + "dataset": "regression", }, { - 'model': TSNE(random_state=0), - 'methods': ['fit_transform'], - 'dataset': 'classifier', + "model": TSNE(random_state=0), + "methods": ["fit_transform"], + "dataset": "classifier", }, { - 'model': KMeans(random_state=0, init="k-means++"), - 'methods': ['predict'], - 'dataset': 'blobs', + "model": KMeans(random_state=0, init="k-means++"), + "methods": 
["predict"], + "dataset": "blobs", }, { - 'model': KMeans(random_state=0, init="random"), - 'methods': ['predict'], - 'dataset': 'blobs', + "model": KMeans(random_state=0, init="random"), + "methods": ["predict"], + "dataset": "blobs", }, { - 'model': KMeans(random_state=0, init="k-means++"), - 'methods': ['predict'], - 'dataset': 'sparse', + "model": KMeans(random_state=0, init="k-means++"), + "methods": ["predict"], + "dataset": "sparse", }, { - 'model': KMeans(random_state=0, init="random"), - 'methods': ['predict'], - 'dataset': 'sparse', + "model": KMeans(random_state=0, init="random"), + "methods": ["predict"], + "dataset": "sparse", }, { - 'model': ElasticNet(random_state=0), - 'methods': ['predict'], - 'dataset': 'regression', + "model": ElasticNet(random_state=0), + "methods": ["predict"], + "dataset": "regression", }, { - 'model': Lasso(random_state=0), - 'methods': ['predict'], - 'dataset': 'regression', + "model": Lasso(random_state=0), + "methods": ["predict"], + "dataset": "regression", }, { - 'model': PCA(n_components=0.5, svd_solver="full", random_state=0), - 'methods': ['transform', 'get_covariance', 'get_precision', 'score_samples'], - 'dataset': 'classifier', + "model": PCA(n_components=0.5, svd_solver="full", random_state=0), + "methods": ["transform", "get_covariance", "get_precision", "score_samples"], + "dataset": "classifier", }, { - 'model': RandomForestClassifier(random_state=0, oob_score=True, - max_samples=0.5, max_features='sqrt'), - 'methods': ['predict', 'predict_proba'], - 'dataset': 'classifier', + "model": RandomForestClassifier( + random_state=0, oob_score=True, max_samples=0.5, max_features="sqrt" + ), + "methods": ["predict", "predict_proba"], + "dataset": "classifier", }, { - 'model': LogisticRegression(random_state=0, solver="newton-cg", max_iter=1000), - 'methods': ['predict', 'predict_proba'], - 'dataset': 'classifier', + "model": LogisticRegression(random_state=0, solver="newton-cg", max_iter=1000), + "methods": ["predict", "predict_proba"], + "dataset": "classifier", }, { - 'model': LogisticRegression(random_state=0, solver="lbfgs", max_iter=1000), - 'methods': ['predict', 'predict_proba'], - 'dataset': 'classifier', + "model": LogisticRegression(random_state=0, solver="lbfgs", max_iter=1000), + "methods": ["predict", "predict_proba"], + "dataset": "classifier", }, { - 'model': LogisticRegressionCV(random_state=0, solver="newton-cg", - n_jobs=-1, max_iter=1000), - 'methods': ['predict', 'predict_proba'], - 'dataset': 'classifier', + "model": LogisticRegressionCV( + random_state=0, solver="newton-cg", n_jobs=-1, max_iter=1000 + ), + "methods": ["predict", "predict_proba"], + "dataset": "classifier", }, { - 'model': LogisticRegressionCV(random_state=0, solver="lbfgs", - n_jobs=-1, max_iter=1000), - 'methods': ['predict', 'predict_proba'], - 'dataset': 'classifier', + "model": LogisticRegressionCV( + random_state=0, solver="lbfgs", n_jobs=-1, max_iter=1000 + ), + "methods": ["predict", "predict_proba"], + "dataset": "classifier", }, { - 'model': RandomForestRegressor(random_state=0, oob_score=True, - max_samples=0.5, max_features='sqrt'), - 'methods': ['predict'], - 'dataset': 'regression', + "model": RandomForestRegressor( + random_state=0, oob_score=True, max_samples=0.5, max_features="sqrt" + ), + "methods": ["predict"], + "dataset": "regression", }, { - 'model': LinearRegression(), - 'methods': ['predict'], - 'dataset': 'regression', + "model": LinearRegression(), + "methods": ["predict"], + "dataset": "regression", }, { - 'model': 
Ridge(random_state=0), - 'methods': ['predict'], - 'dataset': 'regression', + "model": Ridge(random_state=0), + "methods": ["predict"], + "dataset": "regression", }, ] TO_SKIP = [ - 'TSNE', # Absolute diff is 1e-10, potential problem in KNN, - # will be fixed for next release. (UPD. KNN is fixed but there is a problem - # with stability of stock sklearn. It is already stable in master, so, we - # need to wait for the next sklearn release) - 'LogisticRegression', # Absolute diff is 1e-8, will be fixed for next release - 'LogisticRegressionCV', # Absolute diff is 1e-10, will be fixed for next release - 'RandomForestRegressor', # Absolute diff is 1e-14 in OOB score, - # will be fixed for next release + "TSNE", # Absolute diff is 1e-10, potential problem in KNN, + # will be fixed for next release. (UPD. KNN is fixed but there is a problem + # with stability of stock sklearn. It is already stable in master, so, we + # need to wait for the next sklearn release) + "LogisticRegression", # Absolute diff is 1e-8, will be fixed for next release + "LogisticRegressionCV", # Absolute diff is 1e-10, will be fixed for next release + "RandomForestRegressor", # Absolute diff is 1e-14 in OOB score, + # will be fixed for next release ] -@pytest.mark.parametrize('model_head', MODELS_INFO) +@pytest.mark.parametrize("model_head", MODELS_INFO) def test_models(model_head): stable_algos = [] - if get_class_name(model_head['model']) in stable_algos \ - and daal_check_version((2021, 'P', 300)): + if get_class_name(model_head["model"]) in stable_algos and daal_check_version( + (2021, "P", 300) + ): try: - TO_SKIP.remove(get_class_name(model_head['model'])) + TO_SKIP.remove(get_class_name(model_head["model"])) except ValueError: pass - if get_class_name(model_head['model']) in TO_SKIP: + if get_class_name(model_head["model"]) in TO_SKIP: pytest.skip("Unstable", allow_module_level=False) - _run_test(model_head['model'], model_head['methods'], model_head['dataset']) + _run_test(model_head["model"], model_head["methods"], model_head["dataset"]) -@pytest.mark.parametrize('features', range(5, 10)) +@pytest.mark.parametrize("features", range(5, 10)) def test_train_test_split(features): - X, y = make_classification(n_samples=4000, n_features=features, - n_informative=features, n_redundant=0, - n_clusters_per_class=8, random_state=0) - baseline_X_train, baseline_X_test, baseline_y_train, baseline_y_test = \ - train_test_split(X, y, test_size=0.33, random_state=0) + X, y = make_classification( + n_samples=4000, + n_features=features, + n_informative=features, + n_redundant=0, + n_clusters_per_class=8, + random_state=0, + ) + ( + baseline_X_train, + baseline_X_test, + baseline_y_train, + baseline_y_test, + ) = train_test_split(X, y, test_size=0.33, random_state=0) baseline = [baseline_X_train, baseline_X_test, baseline_y_train, baseline_y_test] for _ in range(10): - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, - random_state=0) + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.33, random_state=0 + ) res = [X_train, X_test, y_train, y_test] for a, b in zip(res, baseline): - np.testing.assert_allclose(a, b, rtol=0.0, atol=0.0, - err_msg=str("train_test_split is incorrect")) + np.testing.assert_allclose( + a, b, rtol=0.0, atol=0.0, err_msg=str("train_test_split is incorrect") + ) -@pytest.mark.parametrize('metric', ['cosine', 'correlation']) +@pytest.mark.parametrize("metric", ["cosine", "correlation"]) def test_pairwise_distances(metric): X = np.random.rand(1000) X = 
np.array(X, dtype=np.float64) @@ -368,16 +411,18 @@ def test_pairwise_distances(metric): for _ in range(5): res = pairwise_distances(X.reshape(1, -1), metric=metric) for a, b in zip(res, baseline): - np.testing.assert_allclose(a, b, rtol=0.0, atol=0.0, - err_msg=str("pairwise_distances is incorrect")) + np.testing.assert_allclose( + a, b, rtol=0.0, atol=0.0, err_msg=str("pairwise_distances is incorrect") + ) -@pytest.mark.parametrize('array_size', [100, 1000, 10000]) +@pytest.mark.parametrize("array_size", [100, 1000, 10000]) def test_roc_auc(array_size): a = [random.randint(0, 1) for i in range(array_size)] b = [random.randint(0, 1) for i in range(array_size)] baseline = roc_auc_score(a, b) for _ in range(5): res = roc_auc_score(a, b) - np.testing.assert_allclose(baseline, res, rtol=0.0, atol=0.0, - err_msg=str("roc_auc is incorrect")) + np.testing.assert_allclose( + baseline, res, rtol=0.0, atol=0.0, err_msg=str("roc_auc is incorrect") + ) diff --git a/sklearnex/tests/utils/_launch_algorithms.py b/sklearnex/tests/utils/_launch_algorithms.py index cc6038eb9e..dfd2cb49fe 100755 --- a/sklearnex/tests/utils/_launch_algorithms.py +++ b/sklearnex/tests/utils/_launch_algorithms.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,25 +12,25 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== -import numpy as np import logging import random +import numpy as np + from sklearnex import patch_sklearn + patch_sklearn() +import pathlib +import sys + +from sklearn.datasets import load_diabetes, load_iris, make_regression from sklearn.metrics import pairwise_distances, roc_auc_score -from sklearn.datasets import ( - make_regression, - load_iris, - load_diabetes) -import sys -import pathlib absolute_path = str(pathlib.Path(__file__).parent.absolute()) -sys.path.append(absolute_path + '/../') +sys.path.append(absolute_path + "/../") from _models_info import MODELS_INFO, TYPES @@ -39,80 +39,80 @@ def get_class_name(x): def generate_dataset(name, dtype, model_name): - if model_name == 'LinearRegression': + if model_name == "LinearRegression": X, y = make_regression(n_samples=1000, n_features=5) - elif name in ['blobs', 'classifier']: + elif name in ["blobs", "classifier"]: X, y = load_iris(return_X_y=True) - elif name == 'regression': + elif name == "regression": X, y = load_diabetes(return_X_y=True) else: - raise ValueError('Unknown dataset type') + raise ValueError("Unknown dataset type") X = np.array(X, dtype=dtype) y = np.array(y, dtype=dtype) return (X, y) def run_patch(model_info, dtype): - print(get_class_name(model_info['model']), dtype.__name__) - X, y = generate_dataset(model_info['dataset'], - dtype, - get_class_name(model_info['model'])) - model = model_info['model'] + print(get_class_name(model_info["model"]), dtype.__name__) + X, y = generate_dataset( + model_info["dataset"], dtype, get_class_name(model_info["model"]) + ) + model = model_info["model"] model.fit(X, y) - logging.info('fit') - for i in model_info['methods']: - if i == 'predict': + logging.info("fit") + 
for i in model_info["methods"]: + if i == "predict": model.predict(X) - elif i == 'predict_proba': + elif i == "predict_proba": model.predict_proba(X) - elif i == 'predict_log_proba': + elif i == "predict_log_proba": model.predict_log_proba(X) - elif i == 'decision_function': + elif i == "decision_function": model.decision_function(X) - elif i == 'fit_predict': + elif i == "fit_predict": model.fit_predict(X) - elif i == 'transform': + elif i == "transform": model.transform(X) - elif i == 'fit_transform': + elif i == "fit_transform": model.fit_transform(X) - elif i == 'kneighbors': + elif i == "kneighbors": model.kneighbors(X) - elif i == 'score': + elif i == "score": model.score(X, y) else: - raise ValueError(i + ' is wrong method') + raise ValueError(i + " is wrong method") logging.info(i) def run_algotithms(): for info in MODELS_INFO: for t in TYPES: - model_name = get_class_name(info['model']) - if model_name in ['Ridge', 'LinearRegression'] and t.__name__ == 'uint32': + model_name = get_class_name(info["model"]) + if model_name in ["Ridge", "LinearRegression"] and t.__name__ == "uint32": continue run_patch(info, t) def run_utils(): # pairwise_distances - for metric in ['cosine', 'correlation']: + for metric in ["cosine", "correlation"]: for t in TYPES: X = np.random.rand(1000) X = np.array(X, dtype=t) - print('pairwise_distances', t.__name__) + print("pairwise_distances", t.__name__) _ = pairwise_distances(X.reshape(1, -1), metric=metric) - logging.info('pairwise_distances') + logging.info("pairwise_distances") # roc_auc_score for t in [np.float32, np.float64]: a = [random.randint(0, 1) for i in range(1000)] b = [random.randint(0, 1) for i in range(1000)] a = np.array(a, dtype=t) b = np.array(b, dtype=t) - print('roc_auc_score', t.__name__) + print("roc_auc_score", t.__name__) _ = roc_auc_score(a, b) - logging.info('roc_auc_score') + logging.info("roc_auc_score") -if __name__ == '__main__': +if __name__ == "__main__": run_algotithms() run_utils() diff --git a/sklearnex/utils/__init__.py b/sklearnex/utils/__init__.py index eb5355bc4f..4c3fe21154 100755 --- a/sklearnex/utils/__init__.py +++ b/sklearnex/utils/__init__.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2022 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,8 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from .validation import _assert_all_finite -__all__ = ['_assert_all_finite'] +__all__ = ["_assert_all_finite"] diff --git a/sklearnex/utils/validation.py b/sklearnex/utils/validation.py index 3e75d0fac5..8457e46314 100755 --- a/sklearnex/utils/validation.py +++ b/sklearnex/utils/validation.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2022 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,6 +13,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from daal4py.sklearn.utils.validation import _assert_all_finite diff --git a/tests/run_examples.py b/tests/run_examples.py index bd68c8d0c6..d9a8a092ae 100755 --- a/tests/run_examples.py +++ b/tests/run_examples.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2014 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,71 +12,72 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== import os import struct import subprocess import sys +from collections import defaultdict +from os.path import join as jp +from time import gmtime, strftime from daal4py import __has_dist__ from daal4py.sklearn._utils import get_daal_version -from os.path import join as jp -from time import gmtime, strftime -from collections import defaultdict -print('Starting examples validation') +print("Starting examples validation") # First item is major version - 2021, # second is minor+patch - 0110, # third item is status - B -print('DAAL version:', get_daal_version()) +print("DAAL version:", get_daal_version()) runner_path = os.path.realpath(__file__) runner_dir = os.path.dirname(runner_path) examples_rootdir = jp( - os.path.dirname(os.path.abspath(os.path.join(runner_path, - os.pardir))), - 'examples') + os.path.dirname(os.path.abspath(os.path.join(runner_path, os.pardir))), "examples" +) IS_WIN = False IS_MAC = False IS_LIN = False system_os = "not_supported" -if 'linux' in sys.platform: +if "linux" in sys.platform: IS_LIN = True system_os = "lnx" -elif sys.platform == 'darwin': +elif sys.platform == "darwin": IS_MAC = True system_os = "mac" -elif sys.platform in ['win32', 'cygwin']: +elif sys.platform in ["win32", "cygwin"]: IS_WIN = True system_os = "win" else: - assert False, sys.platform + ' not supported' + assert False, sys.platform + " not supported" -assert 8 * struct.calcsize('P') in [32, 64] +assert 8 * struct.calcsize("P") in [32, 64] -if 8 * struct.calcsize('P') == 32: - logdir = jp(runner_dir, '_results', 'ia32') +if 8 * struct.calcsize("P") == 32: + logdir = jp(runner_dir, "_results", "ia32") else: - logdir = jp(runner_dir, '_results', 'intel64') + logdir = jp(runner_dir, "_results", "intel64") ex_log_dirs = [ - (jp(examples_rootdir, 'daal4py'), jp(logdir, 'daal4py')), - (jp(examples_rootdir, 'sklearnex'), jp(logdir, 'sklearnex'))] + (jp(examples_rootdir, "daal4py"), jp(logdir, "daal4py")), + (jp(examples_rootdir, "sklearnex"), jp(logdir, "sklearnex")), +] availabe_devices = [] try: from daal4py.oneapi import sycl_context + sycl_extention_available = True except ModuleNotFoundError: sycl_extention_available = False -print('Sycl extensions available: {}'.format(sycl_extention_available)) +print("Sycl extensions available: {}".format(sycl_extention_available)) if sycl_extention_available: try: - with sycl_context('gpu'): + with sycl_context("gpu"): gpu_available = True 
availabe_devices.append("gpu") except RuntimeError: @@ -84,7 +85,7 @@ availabe_devices.append("cpu") # validate that host and cpu devices avaialbe for logging reasons. Examples and # vaidaton logic assumes that host and cpu devices are always available - print('Sycl gpu device: {}'.format(gpu_available)) + print("Sycl gpu device: {}".format(gpu_available)) def check_version(rule, target): @@ -118,56 +119,72 @@ def check_library(rule): for rule_item in rule: try: import importlib + importlib.import_module(rule_item, package=None) except ImportError: return False return True -req_version = defaultdict(lambda: (2019, 'P', 0)) -req_version['sycl/dbscan.py'] = \ - (2021, 'P', 100) # hangs in beta08, need to be fixed -req_version['sycl/linear_regression.py'] = \ - (2021, 'P', 100) # hangs in beta08, need to be fixed -req_version['sycl/kmeans.py'] = \ - (2021, 'P', 200) # not equal results for host and gpu runs -req_version['sycl/pca_transform.py'] = (2021, 'P', 200) -req_version['sycl/decision_forest_classification_hist.py'] = (2021, 'P', 200) -req_version['sycl/decision_forest_regression_hist.py'] = (2021, 'P', 200) -req_version['decision_forest_classification_hist.py'] = (2023, 'P', 1) -req_version['decision_forest_classification_default_dense.py'] = (2023, 'P', 1) -req_version['decision_forest_classification_traverse.py'] = (2023, 'P', 1) -req_version['decision_forest_regression_hist.py'] = (2021, 'P', 200) -req_version['basic_statistics_spmd.py'] = (2023, 'P', 1) -req_version['kmeans_spmd.py'] = (2023, 'P', 2) -req_version['knn_bf_classification_spmd.py'] = (2023, 'P', 1) -req_version['knn_bf_regression_spmd.py'] = (2023, 'P', 1) -req_version['linear_regression_spmd.py'] = (2023, 'P', 1) +req_version = defaultdict(lambda: (2019, "P", 0)) +req_version["sycl/dbscan.py"] = ( + 2021, + "P", + 100, +) # hangs in beta08, need to be fixed +req_version["sycl/linear_regression.py"] = ( + 2021, + "P", + 100, +) # hangs in beta08, need to be fixed +req_version["sycl/kmeans.py"] = ( + 2021, + "P", + 200, +) # not equal results for host and gpu runs +req_version["sycl/pca_transform.py"] = (2021, "P", 200) +req_version["sycl/decision_forest_classification_hist.py"] = (2021, "P", 200) +req_version["sycl/decision_forest_regression_hist.py"] = (2021, "P", 200) +req_version["decision_forest_classification_hist.py"] = (2023, "P", 1) +req_version["decision_forest_classification_default_dense.py"] = (2023, "P", 1) +req_version["decision_forest_classification_traverse.py"] = (2023, "P", 1) +req_version["decision_forest_regression_hist.py"] = (2021, "P", 200) +req_version["basic_statistics_spmd.py"] = (2023, "P", 1) +req_version["kmeans_spmd.py"] = (2023, "P", 2) +req_version["knn_bf_classification_spmd.py"] = (2023, "P", 1) +req_version["knn_bf_regression_spmd.py"] = (2023, "P", 1) +req_version["linear_regression_spmd.py"] = (2023, "P", 1) req_device = defaultdict(lambda: []) -req_device['basic_statistics_spmd.py'] = ["gpu"] -req_device['kmeans_spmd.py'] = ["gpu"] -req_device['knn_bf_classification_spmd.py'] = ["gpu"] -req_device['knn_bf_regression_spmd.py'] = ["gpu"] -req_device['linear_regression_spmd.py'] = ["gpu"] -req_device['pca_spmd.py'] = ["gpu"] -req_device['random_forest_classifier_spmd.py'] = ["gpu"] -req_device['random_forest_regressor_spmd.py'] = ["gpu"] -req_device['sycl/gradient_boosted_regression.py'] = ["gpu"] +req_device["basic_statistics_spmd.py"] = ["gpu"] +req_device["kmeans_spmd.py"] = ["gpu"] +req_device["knn_bf_classification_dpnp.py"] = ["gpu"] +req_device["knn_bf_classification_spmd.py"] 
= ["gpu"] +req_device["knn_bf_regression_spmd.py"] = ["gpu"] +req_device["linear_regression_spmd.py"] = ["gpu"] +req_device["pca_spmd.py"] = ["gpu"] +req_device["random_forest_classifier_dpctl.py"] = ["gpu"] +req_device["random_forest_classifier_spmd.py"] = ["gpu"] +req_device["random_forest_regressor_dpnp.py"] = ["gpu"] +req_device["random_forest_regressor_spmd.py"] = ["gpu"] +req_device["sycl/gradient_boosted_regression.py"] = ["gpu"] req_library = defaultdict(lambda: []) -req_library['basic_statistics_spmd.py'] = ['dpctl', 'mpi4py'] -req_library['model_builders_lightgbm.py'] = ['lightgbm'] -req_library['model_builders_xgboost.py'] = ['xgboost'] -req_library['model_builders_catboost.py'] = ['catboost'] -req_library['basic_statistics_spmd.py'] = ['dpctl', 'mpi4py'] -req_library['kmeans_spmd.py'] = ['dpctl', 'mpi4py'] -req_library['knn_bf_classification_spmd.py'] = ['dpctl', 'mpi4py'] -req_library['knn_bf_regression_spmd.py'] = ['dpctl', 'mpi4py'] -req_library['linear_regression_spmd.py'] = ['dpctl', 'mpi4py'] -req_library['pca_spmd.py'] = ['dpctl', 'mpi4py'] -req_library['random_forest_classifier_spmd.py'] = ['dpctl', 'mpi4py'] -req_library['random_forest_regressor_spmd.py'] = ['dpctl', 'mpi4py'] +req_library["basic_statistics_spmd.py"] = ["dpctl", "mpi4py"] +req_library["model_builders_lightgbm.py"] = ["lightgbm"] +req_library["model_builders_xgboost.py"] = ["xgboost"] +req_library["model_builders_catboost.py"] = ["catboost"] +req_library["basic_statistics_spmd.py"] = ["dpctl", "mpi4py"] +req_library["kmeans_spmd.py"] = ["dpctl", "mpi4py"] +req_library["knn_bf_classification_dpnp.py"] = ["dpctl", "dpnp"] +req_library["knn_bf_classification_spmd.py"] = ["dpctl", "mpi4py"] +req_library["knn_bf_regression_spmd.py"] = ["dpctl", "mpi4py"] +req_library["linear_regression_spmd.py"] = ["dpctl", "mpi4py"] +req_library["pca_spmd.py"] = ["dpctl", "mpi4py"] +req_library["random_forest_classifier_dpctl.py"] = ["dpctl"] +req_library["random_forest_classifier_spmd.py"] = ["dpctl", "mpi4py"] +req_library["random_forest_regressor_dpnp.py"] = ["dpnp"] +req_library["random_forest_regressor_spmd.py"] = ["dpctl", "mpi4py"] req_os = defaultdict(lambda: []) @@ -181,11 +198,11 @@ def get_exe_cmd(ex, nodist, nostream): if os.path.dirname(ex).endswith("sycl"): if not sycl_extention_available: return None - if not check_version(req_version["sycl/" + os.path.basename(ex)], - get_daal_version()): + if not check_version( + req_version["sycl/" + os.path.basename(ex)], get_daal_version() + ): return None - if not check_device( - req_device["sycl/" + os.path.basename(ex)], availabe_devices): + if not check_device(req_device["sycl/" + os.path.basename(ex)], availabe_devices): return None if not check_os(req_os["sycl/" + os.path.basename(ex)], system_os): return None @@ -195,17 +212,15 @@ def get_exe_cmd(ex, nodist, nostream): return None if not check_library(req_library[os.path.basename(ex)]): return None - if os.path.dirname(ex).endswith("sklearnex") and not nodist and \ - ex.endswith('spmd.py'): + + if os.path.dirname(ex).endswith("sklearnex"): if not check_device(req_device[os.path.basename(ex)], availabe_devices): return None if not check_version(req_version[os.path.basename(ex)], get_daal_version()): return None if not check_library(req_library[os.path.basename(ex)]): return None - if not nostream and ex.endswith('streaming.py'): - return '"' + sys.executable + '" "' + ex + '"' - elif not nodist and ex.endswith('spmd.py'): + if not nodist and ex.endswith('spmd.py'): if IS_WIN: return 'mpiexec -localonly -n 4 "' + 
sys.executable + '" "' + ex + '"' return 'mpirun -n 4 "' + sys.executable + '" "' + ex + '"' @@ -218,44 +233,42 @@ def run(exdir, logdir, nodist=False, nostream=False): n = 0 if not os.path.exists(logdir): os.makedirs(logdir) - for (dirpath, dirnames, filenames) in os.walk(exdir): + for dirpath, dirnames, filenames in os.walk(exdir): for script in filenames: if script.endswith('.py') and script not in ['__init__.py']: n += 1 if script in skiped_files: print(strftime("%H:%M:%S", gmtime()) + '\tKNOWN BUG IN EXAMPLES\t' + script) else: - logfn = jp(logdir, script.replace('.py', '.res')) - with open(logfn, 'w') as logfile: - print('\n##### ' + jp(dirpath, script)) - execute_string = get_exe_cmd(jp(dirpath, script), - nodist, nostream) - if execute_string: - os.chdir(dirpath) - proc = subprocess.Popen( - execute_string if IS_WIN else ['/bin/bash', - '-c', - execute_string], - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - shell=False - ) - out = proc.communicate()[0] - logfile.write(out.decode('ascii')) - if proc.returncode: - print(out) - print( - strftime("%H:%M:%S", gmtime()) + '\tFAILED' - '\t' + script + '\twith errno' - '\t' + str(proc.returncode) - ) - else: - success += 1 - print(strftime("%H:%M:%S", gmtime()) + '\t' - 'PASSED\t' + script) - else: - success += 1 - print(strftime("%H:%M:%S", gmtime()) + '\tSKIPPED\t' + script) + logfn = jp(logdir, script.replace(".py", ".res")) + with open(logfn, "w") as logfile: + print("\n##### " + jp(dirpath, script)) + execute_string = get_exe_cmd(jp(dirpath, script), nodist, nostream) + if execute_string: + os.chdir(dirpath) + proc = subprocess.Popen( + execute_string + if IS_WIN + else ["/bin/bash", "-c", execute_string], + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + shell=False, + ) + out = proc.communicate()[0] + logfile.write(out.decode("ascii")) + if proc.returncode: + print(out) + print( + strftime("%H:%M:%S", gmtime()) + "\tFAILED" + "\t" + script + "\twith errno" + "\t" + str(proc.returncode) + ) + else: + success += 1 + print(strftime("%H:%M:%S", gmtime()) + "\t" "PASSED\t" + script) + else: + success += 1 + print(strftime("%H:%M:%S", gmtime()) + "\tSKIPPED\t" + script) return success, n @@ -267,13 +290,15 @@ def run_all(nodist=False, nostream=False): success += s num += n if success != num: - print('{}/{} examples passed/skipped, ' - '{} failed'.format(success, num, num - success)) - print('Error(s) occured. Logs can be found in ' + logdir) + print( + "{}/{} examples passed/skipped, " + "{} failed".format(success, num, num - success) + ) + print("Error(s) occured.
Logs can be found in " + logdir) return 4711 - print('{}/{} examples passed/skipped'.format(success, num)) + print("{}/{} examples passed/skipped".format(success, num)) return 0 -if __name__ == '__main__': - sys.exit(run_all('nodist' in sys.argv or not __has_dist__, 'nostream' in sys.argv)) +if __name__ == "__main__": + sys.exit(run_all("nodist" in sys.argv or not __has_dist__, "nostream" in sys.argv)) diff --git a/tests/test_examples_sklearnex.py b/tests/test_examples_sklearnex.py index 2b7e8e4f54..92e752a29f 100644 --- a/tests/test_examples_sklearnex.py +++ b/tests/test_examples_sklearnex.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,27 +12,29 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== import os import subprocess import unittest + from daal4py.sklearn._utils import get_daal_version + test_path = os.path.abspath(os.path.dirname(__file__)) unittest_data_path = os.path.join(test_path, "unittest_data") -examples_path = os.path.join( - os.path.dirname(test_path), "examples", "sklearnex") +examples_path = os.path.join(os.path.dirname(test_path), "examples", "sklearnex") -print('Testing examples_sklearnex') +print("Testing examples_sklearnex") # First item is major version - 2021, # second is minor+patch - 0110, # third item is status - B sklearnex_version = get_daal_version() -print('oneDAL version:', sklearnex_version) +print("oneDAL version:", sklearnex_version) class TestsklearnexExamples(unittest.TestCase): - '''Class for testing sklernex examples''' + """Class for testing sklernex examples""" + # Get a list of all Python files in the examples directory pass @@ -41,22 +43,28 @@ def test_generator(file): def testit(self): # Run the script and capture its exit code process = subprocess.run( - ['python', os.path.join(examples_path, file)], - stdout=subprocess.PIPE, stderr=subprocess.PIPE, - check=True) # nosec + ["python", os.path.join(examples_path, file)], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + check=True, + ) # nosec exit_code = process.returncode # Assert that the exit code is 0 self.assertEqual(exit_code, 0) - setattr(TestsklearnexExamples, 'test_' + os.path.splitext(file)[0], testit) + setattr(TestsklearnexExamples, "test_" + os.path.splitext(file)[0], testit) print("Generating tests for " + os.path.splitext(file)[0]) -files = [f for f in os.listdir(examples_path) if f.endswith(".py") and 'spmd' not in f] +files = [ + f + for f in os.listdir(examples_path) + if f.endswith(".py") and "spmd" not in f and "batch" not in f +] for file in files: test_generator(file) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/tests/test_xgboost_mb.py b/tests/test_xgboost_mb.py new file mode 100644 index 0000000000..cd1c1fb063 --- /dev/null +++ b/tests/test_xgboost_mb.py @@ -0,0 +1,101 @@ +# =============================================================================== +# Copyright 2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in 
compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# =============================================================================== + +import importlib.util +import unittest + +import numpy as np +from sklearn.datasets import make_classification +from sklearn.model_selection import train_test_split + +import daal4py as d4p +from daal4py import _get__daal_link_version__ as dv +from daal4py.sklearn._utils import daal_check_version + +# First item is major version - 2021, +# second is minor+patch - 0110, +# third item is status - B +daal_version = (int(dv()[0:4]), dv()[10:11], int(dv()[4:8])) +reason = str(((2021, "P", 1))) + " not supported in this library version " +reason += str(daal_version) + + +class XgboostModelBuilder(unittest.TestCase): + @unittest.skipUnless( + all( + [ + hasattr(d4p, "get_gbt_model_from_xgboost"), + hasattr(d4p, "gbt_classification_prediction"), + daal_check_version(((2021, "P", 1))), + ] + ), + reason, + ) + @unittest.skipUnless( + importlib.util.find_spec("xgboost") is not None, + "xgboost library is not installed", + ) + def test_earlystop(self): + import xgboost as xgb + + num_classes = 3 + X, y = make_classification( + n_samples=1000, + n_features=10, + n_informative=3, + n_classes=num_classes, + random_state=42, + ) + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.3, random_state=42 + ) + + # training parameters setting + params = { + "n_estimators": 100, + "max_bin": 256, + "scale_pos_weight": 2, + "lambda_l2": 1, + "alpha": 0.9, + "max_depth": 8, + "num_leaves": 2**8, + "verbosity": 0, + "objective": "multi:softproba", + "learning_rate": 0.3, + "num_class": num_classes, + "early_stopping_rounds": 5, + } + + xgb_clf = xgb.XGBClassifier(**params) + xgb_clf.fit(X_train, y_train, eval_set=[(X_test, y_test)]) + booster = xgb_clf.get_booster() + + xgb_prediction = xgb_clf.predict(X_test) + xgb_proba = xgb_clf.predict_proba(X_test) + xgb_errors_count = np.count_nonzero(xgb_prediction - np.ravel(y_test)) + + daal_model = d4p.mb.convert_model(booster) + + daal_prediction = daal_model.predict(X_test) + daal_proba = daal_model.predict_proba(X_test) + daal_errors_count = np.count_nonzero(daal_prediction - np.ravel(y_test)) + + self.assertTrue(np.absolute(xgb_errors_count - daal_errors_count) == 0) + self.assertTrue(np.allclose(xgb_proba, daal_proba)) + + +if __name__ == "__main__": + unittest.main()
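
For reference, the flow exercised by the new `tests/test_xgboost_mb.py` case above is: train an `xgboost.XGBClassifier`, hand its booster to `daal4py.mb.convert_model`, and check that the converted model reproduces the XGBoost predictions. The sketch below is a minimal, stand-alone illustration of that flow; the dataset shape and XGBoost hyperparameters are illustrative assumptions, while the `convert_model`, `predict`, and `predict_proba` calls mirror the test.

```python
# Minimal sketch (not the test itself): convert a trained XGBoost booster to a
# daal4py model-builder model and check that both models predict the same labels.
import numpy as np
import xgboost as xgb
from sklearn.datasets import make_classification

import daal4py as d4p

# Illustrative dataset and hyperparameters; the real test uses its own values.
X, y = make_classification(n_samples=500, n_features=10, random_state=0)
clf = xgb.XGBClassifier(n_estimators=50, max_depth=4)
clf.fit(X, y)

# Convert the fitted booster into a daal4py (oneDAL) model for faster inference.
daal_model = d4p.mb.convert_model(clf.get_booster())

# The converted model should reproduce the original predictions.
assert np.count_nonzero(daal_model.predict(X) - clf.predict(X)) == 0
assert np.allclose(daal_model.predict_proba(X), clf.predict_proba(X))
```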