diff --git a/.flake8 b/.flake8
new file mode 100644
index 00000000..fa01e179
--- /dev/null
+++ b/.flake8
@@ -0,0 +1,40 @@
+[flake8]
+max-line-length = 88
+inline-quotes = "
+ignore =
+    C408
+    C417
+    E121
+    E123
+    E126
+    E203
+    E226
+    E24
+    E704
+    W503
+    W504
+    W605
+    I
+    N
+    B001
+    B002
+    B003
+    B004
+    B005
+    B007
+    B008
+    B009
+    B010
+    B011
+    B012
+    B013
+    B014
+    B015
+    B016
+    B017
+avoid-escape = no
+# Error E731 is ignored because of the migration from YAPF to Black.
+# See https://github.com/ray-project/ray/issues/21315 for more information.
+per-file-ignores =
+    rllib/evaluation/worker_set.py:E731
+    rllib/evaluation/sampler.py:E731
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 6fef9352..e0210637 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -20,12 +20,13 @@ jobs:
       run: |
         python -m pip install --upgrade pip
         python -m pip install codecov
-        python -m pip install -U yapf==0.23.0 flake8==3.7.7 flake8-comprehensions flake8-quotes==2.0.0
+        if [ -f requirements/lint-requirements.txt ]; then python -m pip install -r requirements/lint-requirements.txt; fi
     - name: Print environment info
       run: |
         ./xgboost_ray/tests/env_info.sh
     - name: Run format script
       run: |
+        ls -alp
         ./format.sh --all

   test_linux_ray_master:
@@ -52,7 +53,7 @@ jobs:
         python -m pip install --upgrade pip
         python -m pip install codecov
         python -m pip install -U ${{ matrix.ray-wheel }}
-        if [ -f requirements-test.txt ]; then python -m pip install -r requirements-test.txt; fi
+        if [ -f requirements/test-requirements.txt ]; then python -m pip install -r requirements/test-requirements.txt; fi
     - name: Install package
       run: |
         python -m pip install -e .
@@ -89,7 +90,7 @@ jobs:
         python -m pip install --upgrade pip
         python -m pip install codecov
         python -m pip install -U ray
-        if [ -f requirements-test.txt ]; then python -m pip install -r requirements-test.txt; fi
+        if [ -f requirements/test-requirements.txt ]; then python -m pip install -r requirements/test-requirements.txt; fi
     - name: Install package
       run: |
         python -m pip install -e .
@@ -128,7 +129,7 @@ jobs:
         python -m pip install --upgrade pip
         python -m pip install codecov
         python -m pip install -U ray
-        if [ -f requirements-test.txt ]; then python -m pip install -r requirements-test.txt; fi
+        if [ -f requirements/test-requirements.txt ]; then python -m pip install -r requirements/test-requirements.txt; fi
     - name: Uninstall unavailable dependencies
       # Disables modin and Ray Tune (via tabulate)
       run: |
@@ -179,7 +180,7 @@ jobs:
         python -m pip install --upgrade pip
         python -m pip install codecov
         python -m pip install -U ${{ matrix.ray-wheel }}
-        if [ -f requirements-test.txt ]; then python -m pip install -r requirements-test.txt; fi
+        if [ -f requirements/test-requirements.txt ]; then python -m pip install -r requirements/test-requirements.txt; fi
     - name: Install Ubuntu system dependencies
       run: |
         sudo apt-get install -y --no-install-recommends ninja-build
@@ -234,7 +235,7 @@ jobs:
         python -m pip install --upgrade pip
         python -m pip install codecov
         python -m pip install -U ray
-        if [ -f requirements-test.txt ]; then python -m pip install -r requirements-test.txt; fi
+        if [ -f requirements/test-requirements.txt ]; then python -m pip install -r requirements/test-requirements.txt; fi
     - name: Install package
       run: |
         python -m pip install -e .
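With the pinned linters now collected in requirements/lint-requirements.txt, the CI lint job above can be reproduced locally. A minimal sketch, assuming a checkout of this branch (the package directory xgboost_ray is the lint target):

    python -m pip install -r requirements/lint-requirements.txt
    isort xgboost_ray                       # fix import order first
    black xgboost_ray                       # then normalize formatting
    flake8 --config=.flake8 xgboost_ray     # lint against the shared config above

Running isort before black matters; as a comment in format.sh below puts it, isort fixes the imports and black then deals with the file format.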
diff --git a/.style.yapf b/.style.yapf
deleted file mode 100644
index a7f6ecef..00000000
--- a/.style.yapf
+++ /dev/null
@@ -1,6 +0,0 @@
-[style]
-based_on_style=pep8
-allow_split_before_dict_value=False
-join_multiple_lines=False
-allow_multiline_lambdas=True
-
diff --git a/format.sh b/format.sh
index 87dae3bb..29e30821 100755
--- a/format.sh
+++ b/format.sh
@@ -1,40 +1,55 @@
 #!/usr/bin/env bash
-# YAPF + Clang formatter (if installed). This script formats all changed files from the last mergebase.
+# Black + Clang formatter (if installed). This script formats all changed files from the last mergebase.
 # You are encouraged to run this locally before pushing changes for review.

 # Cause the script to exit if a single command fails
-set -eo pipefail
+set -euo pipefail

-FLAKE8_VERSION_REQUIRED="3.7.7"
-YAPF_VERSION_REQUIRED="0.23.0"
+FLAKE8_VERSION_REQUIRED="3.9.1"
+BLACK_VERSION_REQUIRED="22.10.0"
+SHELLCHECK_VERSION_REQUIRED="0.7.1"
+ISORT_VERSION_REQUIRED="5.10.1"

-check_command_exist() {
+check_python_command_exist() {
     VERSION=""
     case "$1" in
-        yapf)
-            VERSION=$YAPF_VERSION_REQUIRED
+        black)
+            VERSION=$BLACK_VERSION_REQUIRED
             ;;
         flake8)
            VERSION=$FLAKE8_VERSION_REQUIRED
            ;;
+        isort)
+            VERSION=$ISORT_VERSION_REQUIRED
+            ;;
        *)
            echo "$1 is not a required dependency"
            exit 1
    esac
-    if ! [ -x "$(command -v $1)" ]; then
-        echo "$1 not installed. pip install $1==$VERSION"
+    if ! [ -x "$(command -v "$1")" ]; then
+        echo "$1 not installed. Install the python package with: pip install $1==$VERSION"
         exit 1
     fi
 }

-check_command_exist yapf
-check_command_exist flake8
+check_docstyle() {
+    echo "Checking docstyle..."
+    violations=$(git ls-files | grep '.py$' | xargs grep -E '^[ ]+[a-z_]+ ?\([a-zA-Z]+\): ' | grep -v 'str(' | grep -v noqa || true)
+    if [[ -n "$violations" ]]; then
+        echo
+        echo "=== Found Ray docstyle violations ==="
+        echo "$violations"
+        echo
+        echo "Per the Google pydoc style, omit types from pydoc args as they are redundant: https://docs.ray.io/en/latest/ray-contribute/getting-involved.html#code-style "
+        echo "If this is a false positive, you can add a '# noqa' comment to the line to ignore."
+        exit 1
+    fi
+    return 0
+}

-ver=$(yapf --version)
-if ! echo $ver | grep -q 0.23.0; then
-    echo "Wrong YAPF version installed: 0.23.0 is required, not $ver. $YAPF_DOWNLOAD_COMMAND_MSG"
-    exit 1
-fi
+check_python_command_exist black
+check_python_command_exist flake8
+check_python_command_exist isort

 # this stops git rev-parse from failing if we run this from the .git directory
 builtin cd "$(dirname "${BASH_SOURCE:-$0}")"

@@ -42,58 +57,174 @@ builtin cd "$(dirname "${BASH_SOURCE:-$0}")"
 ROOT="$(git rev-parse --show-toplevel)"
 builtin cd "$ROOT" || exit 1

-# Add the upstream remote if it doesn't exist
-if ! git remote -v | grep -q upstream; then
-    git remote add 'upstream' 'https://github.com/ray-project/xgboost_ray.git'
+# NOTE(edoakes): black version differs based on installation method:
+# Option 1) 'black, 21.12b0 (compiled: no)'
+# Option 2) 'black, version 21.12b0'
+# For newer versions (at least 22.10.0), a second line is printed which must be dropped:
+#
+#   black, 22.10.0 (compiled: yes)
+#   Python (CPython) 3.9.13
+BLACK_VERSION_STR=$(black --version)
+if [[ "$BLACK_VERSION_STR" == *"compiled"* ]]
+then
+    BLACK_VERSION=$(echo "$BLACK_VERSION_STR" | head -n 1 | awk '{print $2}')
+else
+    BLACK_VERSION=$(echo "$BLACK_VERSION_STR" | head -n 1 | awk '{print $3}')
 fi
-
-FLAKE8_VERSION=$(flake8 --version | awk '{print $1}')
-YAPF_VERSION=$(yapf --version | awk '{print $2}')
+FLAKE8_VERSION=$(flake8 --version | head -n 1 | awk '{print $1}')
+ISORT_VERSION=$(isort --version | grep VERSION | awk '{print $2}')

 # params: tool name, tool version, required version
 tool_version_check() {
-    if [[ $2 != $3 ]]; then
+    if [ "$2" != "$3" ]; then
         echo "WARNING: Ray uses $1 $3, You currently are using $2. This might generate different results."
     fi
 }

-tool_version_check "flake8" $FLAKE8_VERSION $FLAKE8_VERSION_REQUIRED
-tool_version_check "yapf" $YAPF_VERSION $YAPF_VERSION_REQUIRED
+tool_version_check "flake8" "$FLAKE8_VERSION" "$FLAKE8_VERSION_REQUIRED"
+tool_version_check "black" "$BLACK_VERSION" "$BLACK_VERSION_REQUIRED"
+tool_version_check "isort" "$ISORT_VERSION" "$ISORT_VERSION_REQUIRED"
+
+if command -v shellcheck >/dev/null; then
+    SHELLCHECK_VERSION=$(shellcheck --version | awk '/^version:/ {print $2}')
+    tool_version_check "shellcheck" "$SHELLCHECK_VERSION" "$SHELLCHECK_VERSION_REQUIRED"
+else
+    echo "INFO: Ray uses shellcheck for shell scripts, which is not installed. You may install shellcheck=$SHELLCHECK_VERSION_REQUIRED with your system package manager."
+fi

-if which clang-format >/dev/null; then
+if command -v clang-format >/dev/null; then
     CLANG_FORMAT_VERSION=$(clang-format --version | awk '{print $3}')
-    tool_version_check "clang-format" $CLANG_FORMAT_VERSION "7.0.0"
+    tool_version_check "clang-format" "$CLANG_FORMAT_VERSION" "12.0.0"
 else
     echo "WARNING: clang-format is not installed!"
 fi

-# Only fetch master since that's the branch we're diffing against.
-git fetch upstream master || true
+if [[ $(flake8 --version) != *"flake8_quotes"* ]]; then
+    echo "WARNING: Ray uses flake8 with flake8_quotes. Might error without it. Install with: pip install flake8-quotes"
+fi

-YAPF_FLAGS=(
-    '--style' "$ROOT/.style.yapf"
-    '--recursive'
-    '--parallel'
+if [[ $(flake8 --version) != *"flake8-bugbear"* ]]; then
+    echo "WARNING: Ray uses flake8 with flake8-bugbear. Might error without it. Install with: pip install flake8-bugbear"
+fi
+
+SHELLCHECK_FLAGS=(
+    --exclude=1090  # "Can't follow non-constant source. Use a directive to specify location."
+    --exclude=1091  # "Not following {file} due to some error"
+    --exclude=2207  # "Prefer mapfile or read -a to split command output (or quote to avoid splitting)." -- these aren't compatible with macOS's old Bash
 )
+
+
+BLACK_EXCLUDES=(
+    '--force-exclude'
+    'python/ray/cloudpickle/*|'`
+    `'python/build/*|'`
+    `'python/ray/core/src/ray/gcs/*|'`
+    `'python/ray/thirdparty_files/*|'`
+    `'python/ray/_private/thirdparty/*|'`
+    `'python/ray/serve/tests/test_config_files/syntax_error\.py'
+)

-YAPF_EXCLUDES=(
-    # '--exclude' 'python/ray/cloudpickle/*'
-    # '--exclude' 'python/build/*'
-    # '--exclude' 'python/ray/core/src/ray/gcs/*'
-    # '--exclude' 'python/ray/thirdparty_files/*'
+GIT_LS_EXCLUDES=(
+    ':(exclude)python/ray/cloudpickle/'
+    ':(exclude)python/ray/_private/runtime_env/_clonevirtualenv.py'
 )

+# TODO(barakmich): This should be cleaned up. I've at least excised the copies
+# of these arguments to this location, but the long-term answer is to actually
+# make a flake8 config file
+FLAKE8_PYX_IGNORES="--ignore=C408,E121,E123,E126,E211,E225,E226,E227,E24,E704,E999,W503,W504,W605"
+
+shellcheck_scripts() {
+    shellcheck "${SHELLCHECK_FLAGS[@]}" "$@"
+}
+
 # Format specified files
-format() {
-    yapf --in-place "${YAPF_FLAGS[@]}" -- "$@"
+format_files() {
+    local shell_files=() python_files=() bazel_files=()
+
+    local name
+    for name in "$@"; do
+        local base="${name%.*}"
+        local suffix="${name#"${base}"}"
+
+        local shebang=""
+        read -r shebang < "${name}" || true
+        case "${shebang}" in
+        '#!'*)
+            shebang="${shebang#/usr/bin/env }"
+            shebang="${shebang%% *}"
+            shebang="${shebang##*/}"
+            ;;
+        esac
+
+        if [ "${base}" = "WORKSPACE" ] || [ "${base}" = "BUILD" ] || [ "${suffix}" = ".BUILD" ] || [ "${suffix}" = ".bazel" ] || [ "${suffix}" = ".bzl" ]; then
+            bazel_files+=("${name}")
+        elif [ -z "${suffix}" ] && [ "${shebang}" != "${shebang#python}" ] || [ "${suffix}" != "${suffix#.py}" ]; then
+            python_files+=("${name}")
+        elif [ -z "${suffix}" ] && [ "${shebang}" != "${shebang%sh}" ] || [ "${suffix}" != "${suffix#.sh}" ]; then
+            shell_files+=("${name}")
+        else
+            echo "error: failed to determine file type: ${name}" 1>&2
+            return 1
+        fi
+    done
+
+    if [ 0 -lt "${#python_files[@]}" ]; then
+        isort "${python_files[@]}"
+        black "${python_files[@]}"
+    fi
+
+    if command -v shellcheck >/dev/null; then
+        if shellcheck --shell=sh --format=diff - < /dev/null; then
+            if [ 0 -lt "${#shell_files[@]}" ]; then
+                local difference
+                difference="$(shellcheck_scripts --format=diff "${shell_files[@]}" || true && printf "-")"
+                difference="${difference%-}"
+                printf "%s" "${difference}" | patch -p1
+            fi
+        else
+            echo "error: this version of shellcheck does not support diffs"
+        fi
+    fi
+}
+
+format_all_scripts() {
+    command -v flake8 &> /dev/null
+    HAS_FLAKE8=$?
+
+    # Run isort before black to fix imports and let black deal with file format.
+    echo "$(date)" "isort...."
+    git ls-files -- '*.py' "${GIT_LS_EXCLUDES[@]}" | xargs -P 10 \
+        isort
+    echo "$(date)" "Black...."
+    git ls-files -- '*.py' "${GIT_LS_EXCLUDES[@]}" | xargs -P 10 \
+        black "${BLACK_EXCLUDES[@]}"
+    if [ "$HAS_FLAKE8" -eq 0 ]; then
+        echo "$(date)" "Flake8...."
+        git ls-files -- '*.py' "${GIT_LS_EXCLUDES[@]}" | xargs -P 5 \
+            flake8 --config=.flake8
+    fi
+
+    if command -v shellcheck >/dev/null; then
+        local shell_files non_shell_files
+        non_shell_files=($(git ls-files -- ':(exclude)*.sh'))
+        shell_files=($(git ls-files -- '*.sh'))
+        if [ 0 -lt "${#non_shell_files[@]}" ]; then
+            shell_files+=($(git --no-pager grep -l -- '^#!\(/usr\)\?/bin/\(env \+\)\?\(ba\)\?sh' "${non_shell_files[@]}" || true))
+        fi
+        if [ 0 -lt "${#shell_files[@]}" ]; then
+            echo "$(date)" "shellcheck scripts...."
+            shellcheck_scripts "${shell_files[@]}"
+        fi
+    fi
 }

 # Format files that differ from main branch. Ignores dirs that are not slated
 # for autoformat yet.
 format_changed() {
     # The `if` guard ensures that the list of filenames is not empty, which
-    # could cause yapf to receive 0 positional arguments, making it hang
-    # waiting for STDIN.
+    # could cause the formatter to receive 0 positional arguments, making
+    # Black error out.
     #
     # `diff-filter=ACRM` and $MERGEBASE is to ensure we only format files that
     # exist on both branches.
@@ -101,40 +232,73 @@ format_changed() {

     if ! git diff --diff-filter=ACRM --quiet --exit-code "$MERGEBASE" -- '*.py' &>/dev/null; then
         git diff --name-only --diff-filter=ACRM "$MERGEBASE" -- '*.py' | xargs -P 5 \
-            yapf --in-place "${YAPF_EXCLUDES[@]}" "${YAPF_FLAGS[@]}"
+            isort
+    fi
+
+    if ! git diff --diff-filter=ACRM --quiet --exit-code "$MERGEBASE" -- '*.py' &>/dev/null; then
+        git diff --name-only --diff-filter=ACRM "$MERGEBASE" -- '*.py' | xargs -P 5 \
+            black "${BLACK_EXCLUDES[@]}"
         if which flake8 >/dev/null; then
             git diff --name-only --diff-filter=ACRM "$MERGEBASE" -- '*.py' | xargs -P 5 \
-                flake8 --inline-quotes '"' --no-avoid-escape --ignore=N,I,C408,E121,E123,E126,E226,E24,E704,W503,W504,W605
+                flake8 --config=.flake8
         fi
     fi

     if ! git diff --diff-filter=ACRM --quiet --exit-code "$MERGEBASE" -- '*.pyx' '*.pxd' '*.pxi' &>/dev/null; then
         if which flake8 >/dev/null; then
             git diff --name-only --diff-filter=ACRM "$MERGEBASE" -- '*.pyx' '*.pxd' '*.pxi' | xargs -P 5 \
-                flake8 --inline-quotes '"' --no-avoid-escape --ignore=N,I,C408,E121,E123,E126,E211,E225,E226,E227,E24,E704,E999,W503,W504,W605
+                flake8 --config=.flake8 "$FLAKE8_PYX_IGNORES"
+        fi
+    fi
+
+    if which clang-format >/dev/null; then
+        if ! git diff --diff-filter=ACRM --quiet --exit-code "$MERGEBASE" -- '*.cc' '*.h' &>/dev/null; then
+            git diff --name-only --diff-filter=ACRM "$MERGEBASE" -- '*.cc' '*.h' | xargs -P 5 \
+                clang-format -i
         fi
     fi
-}

-# Format all files, and print the diff to stdout for travis.
-format_all() {
-    yapf --diff "${YAPF_FLAGS[@]}" "${YAPF_EXCLUDES[@]}" xgboost_ray
-    flake8 --inline-quotes '"' --no-avoid-escape --ignore=N,I,C408,E121,E123,E126,E211,E225,E226,E227,E24,E704,E999,W503,W504,W605 xgboost_ray
+    if command -v shellcheck >/dev/null; then
+        local shell_files non_shell_files
+        non_shell_files=($(git diff --name-only --diff-filter=ACRM "$MERGEBASE" -- ':(exclude)*.sh'))
+        shell_files=($(git diff --name-only --diff-filter=ACRM "$MERGEBASE" -- '*.sh'))
+        if [ 0 -lt "${#non_shell_files[@]}" ]; then
+            shell_files+=($(git --no-pager grep -l -- '^#!\(/usr\)\?/bin/\(env \+\)\?\(ba\)\?sh' "${non_shell_files[@]}" || true))
+        fi
+        if [ 0 -lt "${#shell_files[@]}" ]; then
+            shellcheck_scripts "${shell_files[@]}"
+        fi
+    fi
 }

 # This flag formats individual files. --files *must* be the first command line
 # arg to use this option.
-if [[ "$1" == '--files' ]]; then
-    format "${@:2}"
-    # If `--all` is passed, then any further arguments are ignored and the
-    # entire python directory is formatted.
-elif [[ "$1" == '--all' ]]; then
-    format_all
+if [ "${1-}" == '--files' ]; then
+    format_files "${@:2}"
+# If `--all` or `--scripts` are passed, then any further arguments are ignored.
+# Format the entire python directory and other scripts.
+elif [ "${1-}" == '--all-scripts' ]; then
+    format_all_scripts "${@}"
+    if [ -n "${FORMAT_SH_PRINT_DIFF-}" ]; then git --no-pager diff; fi
+# Format all Python, C++, Java and other script files.
+elif [ "${1-}" == '--all' ]; then
+    format_all_scripts "${@}"
+    if [ -n "${FORMAT_SH_PRINT_DIFF-}" ]; then git --no-pager diff; fi
 else
+    # Add the upstream remote if it doesn't exist
+    if ! git remote -v | grep -q upstream; then
+        git remote add 'upstream' 'https://github.com/ray-project/xgboost_ray.git'
+    fi
+
+    # Only fetch master since that's the branch we're diffing against.
+    git fetch upstream master || true
+
     # Format only the files that changed in last commit.
     format_changed
 fi

+check_docstyle
+
 if ! git diff --quiet &>/dev/null; then
     echo 'Reformatted changed files. Please review and stage the changes.'
     echo 'Files updated:'
@@ -144,5 +308,3 @@ if ! git diff --quiet &>/dev/null; then

     exit 1
 fi
-
-echo 'Linting check finished successfully.'
\ No newline at end of file
diff --git a/requirements/lint-requirements.txt b/requirements/lint-requirements.txt
new file mode 100644
index 00000000..f7a1df17
--- /dev/null
+++ b/requirements/lint-requirements.txt
@@ -0,0 +1,7 @@
+flake8==3.9.1
+flake8-comprehensions==3.10.1
+flake8-quotes==2.0.0
+flake8-bugbear==21.9.2
+black==22.10.0
+isort==5.10.1
+importlib-metadata==4.13.0
diff --git a/requirements-test.txt b/requirements/test-requirements.txt
similarity index 64%
rename from requirements-test.txt
rename to requirements/test-requirements.txt
index 40055799..3b698191 100644
--- a/requirements-test.txt
+++ b/requirements/test-requirements.txt
@@ -1,7 +1,3 @@
-flake8==3.7.7
-flake8-comprehensions
-flake8-quotes==2.0.0
-yapf==0.23.0
 packaging
 petastorm
 pytest
diff --git a/run_ci_examples.sh b/run_ci_examples.sh
index f03d7707..1283c735 100755
--- a/run_ci_examples.sh
+++ b/run_ci_examples.sh
@@ -1,3 +1,4 @@
+#!/bin/bash
 set -e

 TUNE=1
diff --git a/run_ci_tests.sh b/run_ci_tests.sh
index 70e13a1d..4c3a52cc 100755
--- a/run_ci_tests.sh
+++ b/run_ci_tests.sh
@@ -1,3 +1,4 @@
+#!/bin/bash
 TUNE=1

 for i in "$@"
diff --git a/setup.py b/setup.py
index a2804e80..762e7de9 100644
--- a/setup.py
+++ b/setup.py
@@ -11,6 +11,11 @@
     "distributed computing framework Ray.",
     url="https://github.com/ray-project/xgboost_ray",
     install_requires=[
-        "ray>=1.10", "numpy>=1.16", "pandas", "wrapt>=1.12.1", "xgboost>=0.90",
-        'packaging'
-    ])
+        "ray>=1.10",
+        "numpy>=1.16",
+        "pandas",
+        "wrapt>=1.12.1",
+        "xgboost>=0.90",
+        "packaging",
+    ],
+)
diff --git a/xgboost_ray/__init__.py b/xgboost_ray/__init__.py
index e064ef24..9e4565c4 100644
--- a/xgboost_ray/__init__.py
+++ b/xgboost_ray/__init__.py
@@ -1,19 +1,41 @@
-from xgboost_ray.main import RayParams, train, predict
-from xgboost_ray.matrix import RayDMatrix, RayDeviceQuantileDMatrix,\
-    RayFileType, RayShardingMode, \
-    Data, combine_data
+from xgboost_ray.main import RayParams, predict, train
+from xgboost_ray.matrix import (
+    Data,
+    RayDeviceQuantileDMatrix,
+    RayDMatrix,
+    RayFileType,
+    RayShardingMode,
+    combine_data,
+)
+
 # workaround for legacy xgboost==0.9.0
 try:
-    from xgboost_ray.sklearn import RayXGBClassifier, RayXGBRegressor, \
-        RayXGBRFClassifier, RayXGBRFRegressor, RayXGBRanker
+    from xgboost_ray.sklearn import (
+        RayXGBClassifier,
+        RayXGBRanker,
+        RayXGBRegressor,
+        RayXGBRFClassifier,
+        RayXGBRFRegressor,
+    )
 except ImportError:
     pass

 __version__ = "0.1.16"

 __all__ = [
-    "__version__", "RayParams", "RayDMatrix", "RayDeviceQuantileDMatrix",
-    "RayFileType", "RayShardingMode", "Data", "combine_data", "train",
-    "predict", "RayXGBClassifier", "RayXGBRegressor", "RayXGBRFClassifier",
-    "RayXGBRFRegressor", "RayXGBRanker"
+    "__version__",
+    "RayParams",
+    "RayDMatrix",
+    "RayDeviceQuantileDMatrix",
+
"RayFileType", + "RayShardingMode", + "Data", + "combine_data", + "train", + "predict", + "RayXGBClassifier", + "RayXGBRegressor", + "RayXGBRFClassifier", + "RayXGBRFRegressor", + "RayXGBRanker", ] diff --git a/xgboost_ray/callback.py b/xgboost_ray/callback.py index a08aa39a..50f20488 100644 --- a/xgboost_ray/callback.py +++ b/xgboost_ray/callback.py @@ -1,9 +1,9 @@ +import os from abc import ABC -from typing import Dict, Sequence, TYPE_CHECKING, Any, Union +from typing import TYPE_CHECKING, Any, Dict, Sequence, Union -import os import pandas as pd -from ray.util.annotations import PublicAPI, DeveloperAPI +from ray.util.annotations import DeveloperAPI, PublicAPI if TYPE_CHECKING: from xgboost_ray.main import RayXGBoostActor @@ -29,27 +29,32 @@ class DistributedCallback(ABC): def on_init(self, actor: "RayXGBoostActor", *args, **kwargs): pass - def before_data_loading(self, actor: "RayXGBoostActor", data: "RayDMatrix", - *args, **kwargs): + def before_data_loading( + self, actor: "RayXGBoostActor", data: "RayDMatrix", *args, **kwargs + ): pass - def after_data_loading(self, actor: "RayXGBoostActor", data: "RayDMatrix", - *args, **kwargs): + def after_data_loading( + self, actor: "RayXGBoostActor", data: "RayDMatrix", *args, **kwargs + ): pass def before_train(self, actor: "RayXGBoostActor", *args, **kwargs): pass - def after_train(self, actor: "RayXGBoostActor", result_dict: Dict, *args, - **kwargs): + def after_train(self, actor: "RayXGBoostActor", result_dict: Dict, *args, **kwargs): pass def before_predict(self, actor: "RayXGBoostActor", *args, **kwargs): pass - def after_predict(self, actor: "RayXGBoostActor", - predictions: Union[pd.Series, pd.DataFrame], *args, - **kwargs): + def after_predict( + self, + actor: "RayXGBoostActor", + predictions: Union[pd.Series, pd.DataFrame], + *args, + **kwargs + ): pass @@ -62,13 +67,15 @@ def on_init(self, actor: "RayXGBoostActor", *args, **kwargs): for callback in self.callbacks: callback.on_init(actor, *args, **kwargs) - def before_data_loading(self, actor: "RayXGBoostActor", data: "RayDMatrix", - *args, **kwargs): + def before_data_loading( + self, actor: "RayXGBoostActor", data: "RayDMatrix", *args, **kwargs + ): for callback in self.callbacks: callback.before_data_loading(actor, data, *args, **kwargs) - def after_data_loading(self, actor: "RayXGBoostActor", data: "RayDMatrix", - *args, **kwargs): + def after_data_loading( + self, actor: "RayXGBoostActor", data: "RayDMatrix", *args, **kwargs + ): for callback in self.callbacks: callback.after_data_loading(actor, data, *args, **kwargs) @@ -76,8 +83,7 @@ def before_train(self, actor: "RayXGBoostActor", *args, **kwargs): for callback in self.callbacks: callback.before_train(actor, *args, **kwargs) - def after_train(self, actor: "RayXGBoostActor", result_dict: Dict, *args, - **kwargs): + def after_train(self, actor: "RayXGBoostActor", result_dict: Dict, *args, **kwargs): for callback in self.callbacks: callback.after_train(actor, result_dict, *args, **kwargs) @@ -85,9 +91,13 @@ def before_predict(self, actor: "RayXGBoostActor", *args, **kwargs): for callback in self.callbacks: callback.before_predict(actor, *args, **kwargs) - def after_predict(self, actor: "RayXGBoostActor", - predictions: Union[pd.Series, pd.DataFrame], *args, - **kwargs): + def after_predict( + self, + actor: "RayXGBoostActor", + predictions: Union[pd.Series, pd.DataFrame], + *args, + **kwargs + ): for callback in self.callbacks: callback.after_predict(actor, predictions, *args, **kwargs) diff --git 
a/xgboost_ray/compat/__init__.py b/xgboost_ray/compat/__init__.py index 2a75b9ea..81db4b2b 100644 --- a/xgboost_ray/compat/__init__.py +++ b/xgboost_ray/compat/__init__.py @@ -5,6 +5,7 @@ try: from xgboost.callback import TrainingCallback + LEGACY_CALLBACK = False except ImportError: @@ -23,13 +24,15 @@ def __call__(self, callback_env: "xgb.core.CallbackEnv"): self._before_iteration( model=callback_env.model, epoch=callback_env.iteration, - evals_log=callback_env.evaluation_result_list) + evals_log=callback_env.evaluation_result_list, + ) if hasattr(self, "after_iteration"): self.after_iteration( model=callback_env.model, epoch=callback_env.iteration, - evals_log=callback_env.evaluation_result_list) + evals_log=callback_env.evaluation_result_list, + ) def before_training(self, model): pass diff --git a/xgboost_ray/compat/tracker.py b/xgboost_ray/compat/tracker.py index 9e6846e3..5fac513b 100644 --- a/xgboost_ray/compat/tracker.py +++ b/xgboost_ray/compat/tracker.py @@ -17,12 +17,13 @@ # File copied from: # https://github.com/dmlc/xgboost/blob/8760ec48277b345aaaa895b82570c25566fc0503/python-package/xgboost/tracker.py +import logging + # License: # https://github.com/dmlc/xgboost/blob/8760ec48277b345aaaa895b82570c25566fc0503/LICENSE import socket import struct import time -import logging from threading import Thread @@ -41,13 +42,13 @@ def recvall(self, nbytes): chunk = self.sock.recv(min(nbytes - nread, 1024)) nread += len(chunk) res.append(chunk) - return b''.join(res) + return b"".join(res) def recvint(self): - return struct.unpack('@i', self.recvall(4))[0] + return struct.unpack("@i", self.recvall(4))[0] def sendint(self, n): - self.sock.sendall(struct.pack('@i', n)) + self.sock.sendall(struct.pack("@i", n)) def sendstr(self, s): self.sendint(len(s)) @@ -59,7 +60,7 @@ def recvstr(self): # magic number used to verify existence of data -kMagic = 0xff99 +kMagic = 0xFF99 def get_some_ip(host): @@ -67,24 +68,25 @@ def get_some_ip(host): def get_host_ip(hostIP=None): - if hostIP is None or hostIP == 'auto': - hostIP = 'ip' + if hostIP is None or hostIP == "auto": + hostIP = "ip" - if hostIP == 'dns': + if hostIP == "dns": hostIP = socket.getfqdn() - elif hostIP == 'ip': + elif hostIP == "ip": from socket import gaierror + try: hostIP = socket.gethostbyname(socket.getfqdn()) except gaierror: logging.debug( - 'gethostbyname(socket.getfqdn()) failed... trying on hostname()' + "gethostbyname(socket.getfqdn()) failed... 
trying on hostname()" ) hostIP = socket.gethostbyname(socket.gethostname()) if hostIP.startswith("127."): s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) # doesn't have to be reachable - s.connect(('10.255.255.255', 1)) + s.connect(("10.255.255.255", 1)) hostIP = s.getsockname()[0] return hostIP @@ -99,8 +101,7 @@ def __init__(self, sock, s_addr): self.sock = slave self.host = get_some_ip(s_addr[0]) magic = slave.recvint() - assert magic == kMagic, 'invalid magic number=%d from %s' % (magic, - self.host) + assert magic == kMagic, "invalid magic number=%d from %s" % (magic, self.host) slave.sendint(kMagic) self.rank = slave.recvint() self.world_size = slave.recvint() @@ -112,7 +113,7 @@ def __init__(self, sock, s_addr): def decide_rank(self, job_map): if self.rank >= 0: return self.rank - if self.jobid != 'NULL' and self.jobid in job_map: + if self.jobid != "NULL" and self.jobid in job_map: return job_map[self.jobid] return -1 @@ -197,7 +198,7 @@ def __init__(self, hostIP, nslave, port=9091, port_end=9999): self.start_time = None self.end_time = None self.nslave = nslave - logging.info('start listen on %s:%d', hostIP, self.port) + logging.info("start listen on %s:%d", hostIP, self.port) def __del__(self): self.sock.close() @@ -219,10 +220,7 @@ def slave_envs(self): get enviroment variables for slaves can be passed in as args or envs """ - return { - 'DMLC_TRACKER_URI': self.hostIP, - 'DMLC_TRACKER_PORT': self.port - } + return {"DMLC_TRACKER_URI": self.hostIP, "DMLC_TRACKER_PORT": self.port} def get_tree(self, nslave): tree_map = {} @@ -308,20 +306,20 @@ def accept_slaves(self, nslave): while len(shutdown) != nslave: fd, s_addr = self.sock.accept() s = SlaveEntry(fd, s_addr) - if s.cmd == 'print': + if s.cmd == "print": msg = s.sock.recvstr() print(msg.strip(), flush=True) continue - if s.cmd == 'shutdown': + if s.cmd == "shutdown": assert s.rank >= 0 and s.rank not in shutdown assert s.rank not in wait_conn shutdown[s.rank] = s - logging.debug('Received %s signal from %d', s.cmd, s.rank) + logging.debug("Received %s signal from %d", s.cmd, s.rank) continue - assert s.cmd == 'start' or s.cmd == 'recover' + assert s.cmd == "start" or s.cmd == "recover" # lazily initialize the slaves if tree_map is None: - assert s.cmd == 'start' + assert s.cmd == "start" if s.world_size > 0: nslave = s.world_size tree_map, parent_map, ring_map = self.get_link_map(nslave) @@ -329,7 +327,7 @@ def accept_slaves(self, nslave): todo_nodes = list(range(nslave)) else: assert s.world_size == -1 or s.world_size == nslave - if s.cmd == 'recover': + if s.cmd == "recover": assert s.rank >= 0 rank = s.decide_rank(job_map) @@ -341,28 +339,31 @@ def accept_slaves(self, nslave): pending.sort(key=lambda x: x.host) for s in pending: rank = todo_nodes.pop(0) - if s.jobid != 'NULL': + if s.jobid != "NULL": job_map[s.jobid] = rank - s.assign_rank(rank, wait_conn, tree_map, parent_map, - ring_map) + s.assign_rank(rank, wait_conn, tree_map, parent_map, ring_map) if s.wait_accept > 0: wait_conn[rank] = s logging.debug( - 'Received %s signal from %s; assign rank %d', - s.cmd, s.host, s.rank) + "Received %s signal from %s; assign rank %d", + s.cmd, + s.host, + s.rank, + ) if not todo_nodes: - logging.info('@tracker All of %d nodes getting started', - nslave) + logging.info("@tracker All of %d nodes getting started", nslave) self.start_time = time.time() else: s.assign_rank(rank, wait_conn, tree_map, parent_map, ring_map) - logging.debug('Received %s signal from %d', s.cmd, s.rank) + logging.debug("Received %s signal from %d", 
s.cmd, s.rank) if s.wait_accept > 0: wait_conn[rank] = s - logging.info('@tracker All nodes finishes job') + logging.info("@tracker All nodes finishes job") self.end_time = time.time() - logging.info('@tracker %s secs between node start and job finish', - str(self.end_time - self.start_time)) + logging.info( + "@tracker %s secs between node start and job finish", + str(self.end_time - self.start_time), + ) def start(self, nslave): def run(): diff --git a/xgboost_ray/data_sources/__init__.py b/xgboost_ray/data_sources/__init__.py index a834014f..bddc6e41 100644 --- a/xgboost_ray/data_sources/__init__.py +++ b/xgboost_ray/data_sources/__init__.py @@ -1,21 +1,39 @@ +from xgboost_ray.data_sources.csv import CSV +from xgboost_ray.data_sources.dask import Dask from xgboost_ray.data_sources.data_source import DataSource, RayFileType +from xgboost_ray.data_sources.modin import Modin from xgboost_ray.data_sources.numpy import Numpy +from xgboost_ray.data_sources.object_store import ObjectStore from xgboost_ray.data_sources.pandas import Pandas -from xgboost_ray.data_sources.modin import Modin -from xgboost_ray.data_sources.dask import Dask -from xgboost_ray.data_sources.petastorm import Petastorm -from xgboost_ray.data_sources.csv import CSV from xgboost_ray.data_sources.parquet import Parquet -from xgboost_ray.data_sources.object_store import ObjectStore -from xgboost_ray.data_sources.ray_dataset import RayDataset from xgboost_ray.data_sources.partitioned import Partitioned +from xgboost_ray.data_sources.petastorm import Petastorm +from xgboost_ray.data_sources.ray_dataset import RayDataset data_sources = [ - Numpy, Pandas, Partitioned, Modin, Dask, Petastorm, CSV, Parquet, - ObjectStore, RayDataset + Numpy, + Pandas, + Partitioned, + Modin, + Dask, + Petastorm, + CSV, + Parquet, + ObjectStore, + RayDataset, ] __all__ = [ - "DataSource", "RayFileType", "Numpy", "Pandas", "Modin", "Dask", - "Petastorm", "CSV", "Parquet", "ObjectStore", "RayDataset", "Partitioned" + "DataSource", + "RayFileType", + "Numpy", + "Pandas", + "Modin", + "Dask", + "Petastorm", + "CSV", + "Parquet", + "ObjectStore", + "RayDataset", + "Partitioned", ] diff --git a/xgboost_ray/data_sources/_distributed.py b/xgboost_ray/data_sources/_distributed.py index 11086880..c024be66 100644 --- a/xgboost_ray/data_sources/_distributed.py +++ b/xgboost_ray/data_sources/_distributed.py @@ -1,7 +1,7 @@ import itertools import math from collections import defaultdict -from typing import Dict, Any, Sequence +from typing import Any, Dict, Sequence import ray from ray.actor import ActorHandle @@ -13,16 +13,17 @@ def get_actor_rank_ips(actors: Sequence[ActorHandle]) -> Dict[int, str]: # Build a dict mapping actor ranks to their IP addresses actor_rank_ips: Dict[int, str] = dict( enumerate( - ray.get([ - actor.ip.remote() if actor is not None else no_obj - for actor in actors - ]))) + ray.get( + [actor.ip.remote() if actor is not None else no_obj for actor in actors] + ) + ) + ) return actor_rank_ips def assign_partitions_to_actors( - ip_to_parts: Dict[int, Any], - actor_rank_ips: Dict[int, str]) -> Dict[int, Sequence[Any]]: + ip_to_parts: Dict[int, Any], actor_rank_ips: Dict[int, str] +) -> Dict[int, Sequence[Any]]: """Assign partitions from a distributed dataframe to actors. 
This function collects distributed partitions and evenly distributes @@ -72,8 +73,7 @@ def assign_partitions_to_actors( num_parts_left_on_ip = len(ip_to_parts[actor_ip]) num_actor_parts = len(actor_to_partitions[rank]) - if num_parts_left_on_ip > 0 and \ - num_actor_parts < max_parts_per_actor: + if num_parts_left_on_ip > 0 and num_actor_parts < max_parts_per_actor: if num_actor_parts >= min_parts_per_actor: # Only allow up to `num_actors_with_max_parts actors to # have the maximum number of partitions assigned. @@ -106,6 +106,7 @@ def assign_partitions_to_actors( raise RuntimeError( "There are still partitions left to assign, but no actor " "has capacity for more. This is probably a bug. Please go " - "to https://github.com/ray-project/xgboost_ray to report it.") + "to https://github.com/ray-project/xgboost_ray to report it." + ) return actor_to_partitions diff --git a/xgboost_ray/data_sources/csv.py b/xgboost_ray/data_sources/csv.py index 338a772b..4c080b0f 100644 --- a/xgboost_ray/data_sources/csv.py +++ b/xgboost_ray/data_sources/csv.py @@ -1,4 +1,4 @@ -from typing import Any, Optional, Sequence, Iterable, Union +from typing import Any, Iterable, Optional, Sequence, Union import pandas as pd @@ -8,12 +8,12 @@ class CSV(DataSource): """Read one or many CSV files.""" + supports_central_loading = True supports_distributed_loading = True @staticmethod - def is_data_type(data: Any, - filetype: Optional[RayFileType] = None) -> bool: + def is_data_type(data: Any, filetype: Optional[RayFileType] = None) -> bool: return filetype == RayFileType.CSV @staticmethod @@ -23,10 +23,12 @@ def get_filetype(data: Any) -> Optional[RayFileType]: return None @staticmethod - def load_data(data: Union[str, Sequence[str]], - ignore: Optional[Sequence[str]] = None, - indices: Optional[Sequence[int]] = None, - **kwargs): + def load_data( + data: Union[str, Sequence[str]], + ignore: Optional[Sequence[str]] = None, + indices: Optional[Sequence[int]] = None, + **kwargs + ): if isinstance(data, Iterable) and not isinstance(data, str): shards = [] diff --git a/xgboost_ray/data_sources/dask.py b/xgboost_ray/data_sources/dask.py index 024f645e..b8e61ca1 100644 --- a/xgboost_ray/data_sources/dask.py +++ b/xgboost_ray/data_sources/dask.py @@ -1,19 +1,21 @@ from collections import defaultdict -from typing import Any, List, Optional, Sequence, Dict, Union, Tuple -import wrapt +from typing import Any, Dict, List, Optional, Sequence, Tuple, Union import pandas as pd - import ray +import wrapt from ray.actor import ActorHandle -from xgboost_ray.data_sources._distributed import \ - assign_partitions_to_actors, get_actor_rank_ips +from xgboost_ray.data_sources._distributed import ( + assign_partitions_to_actors, + get_actor_rank_ips, +) from xgboost_ray.data_sources.data_source import DataSource, RayFileType try: import dask # noqa: F401 from ray.util.dask import ray_dask_get + DASK_INSTALLED = True except ImportError: DASK_INSTALLED = False @@ -27,12 +29,14 @@ def _assert_dask_installed(): "\nFIX THIS by installing dask: `pip install dask`. " "\nPlease also raise an issue on our GitHub: " "https://github.com/ray-project/xgboost_ray as this part of " - "the code should not have been reached.") + "the code should not have been reached." 
+ ) @wrapt.decorator -def ensure_ray_dask_initialized(func: Any, instance: Any, args: List[Any], - kwargs: Any) -> Any: +def ensure_ray_dask_initialized( + func: Any, instance: Any, args: List[Any], kwargs: Any +) -> Any: _assert_dask_installed() dask.config.set(scheduler=ray_dask_get) return func(*args, **kwargs) @@ -47,35 +51,34 @@ class Dask(DataSource): Dask dataframes are stored on multiple actors, making them suitable for distributed loading. """ + supports_central_loading = True supports_distributed_loading = True @staticmethod - def is_data_type(data: Any, - filetype: Optional[RayFileType] = None) -> bool: + def is_data_type(data: Any, filetype: Optional[RayFileType] = None) -> bool: if not DASK_INSTALLED: return False - from dask.dataframe import DataFrame as DaskDataFrame, \ - Series as DaskSeries + from dask.dataframe import DataFrame as DaskDataFrame + from dask.dataframe import Series as DaskSeries return isinstance(data, (DaskDataFrame, DaskSeries)) @ensure_ray_dask_initialized @staticmethod def load_data( - data: Any, # dask.pandas.DataFrame - ignore: Optional[Sequence[str]] = None, - indices: Optional[Union[Sequence[int], Sequence[int]]] = None, - **kwargs) -> pd.DataFrame: + data: Any, # dask.pandas.DataFrame + ignore: Optional[Sequence[str]] = None, + indices: Optional[Union[Sequence[int], Sequence[int]]] = None, + **kwargs + ) -> pd.DataFrame: _assert_dask_installed() import dask.dataframe as dd - if indices is not None and len(indices) > 0 and isinstance( - indices[0], Tuple): + if indices is not None and len(indices) > 0 and isinstance(indices[0], Tuple): # We got a list of partition IDs belonging to Dask partitions - return dd.concat( - [data.partitions[i] for (i, ) in indices]).compute() + return dd.concat([data.partitions[i] for (i,) in indices]).compute() # Dask does not support iloc() for row selection, so we have to # compute a local pandas dataframe first @@ -93,9 +96,9 @@ def load_data( @staticmethod def convert_to_series(data: Any) -> pd.Series: _assert_dask_installed() - from dask.dataframe import DataFrame as DaskDataFrame, \ - Series as DaskSeries from dask.array import Array as DaskArray + from dask.dataframe import DataFrame as DaskDataFrame + from dask.dataframe import Series as DaskSeries if isinstance(data, DaskDataFrame): return pd.Series(data.compute().squeeze()) @@ -109,9 +112,8 @@ def convert_to_series(data: Any) -> pd.Series: @ensure_ray_dask_initialized @staticmethod def get_actor_shards( - data: Any, # dask.dataframe.DataFrame - actors: Sequence[ActorHandle]) -> \ - Tuple[Any, Optional[Dict[int, Any]]]: + data: Any, actors: Sequence[ActorHandle] # dask.dataframe.DataFrame + ) -> Tuple[Any, Optional[Dict[int, Any]]]: _assert_dask_installed() actor_rank_ips = get_actor_rank_ips(actors) @@ -141,16 +143,15 @@ def get_ip_to_parts(data: Any) -> Dict[int, Sequence[Any]]: # 100% accurate as the map task could get scheduled on a different node # (though Ray tries to keep locality). We need to use that until # ray.state.objects() or something like it is available again. 
- partition_locations_df = persisted.map_partitions(lambda df: pd.DataFrame( - [ray.get_runtime_context().node_id.hex()])).compute() + partition_locations_df = persisted.map_partitions( + lambda df: pd.DataFrame([ray.get_runtime_context().node_id.hex()]) + ).compute() partition_locations = [ - partition_locations_df[0].iloc[i] - for i in range(partition_locations_df.size) + partition_locations_df[0].iloc[i] for i in range(partition_locations_df.size) ] ip_to_parts = defaultdict(list) - for (obj_name, - pid), obj_ref in dask.base.collections_to_dsk([persisted]).items(): + for (obj_name, pid), obj_ref in dask.base.collections_to_dsk([persisted]).items(): assert obj_name == name if isinstance(obj_ref, ray.ObjectRef): @@ -161,6 +162,6 @@ def get_ip_to_parts(data: Any) -> Dict[int, Sequence[Any]]: ip = "_no_ip" # Pass tuples here (integers can be misinterpreted as row numbers) - ip_to_parts[ip].append((pid, )) + ip_to_parts[ip].append((pid,)) return ip_to_parts diff --git a/xgboost_ray/data_sources/data_source.py b/xgboost_ray/data_sources/data_source.py index 1eea93de..89a2dc33 100644 --- a/xgboost_ray/data_sources/data_source.py +++ b/xgboost_ray/data_sources/data_source.py @@ -1,9 +1,7 @@ -from typing import Any, Optional, Sequence, Tuple, Dict, List, TYPE_CHECKING - from enum import Enum +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence, Tuple import pandas as pd - from ray.actor import ActorHandle from ray.util.annotations import PublicAPI @@ -14,6 +12,7 @@ @PublicAPI(stability="beta") class RayFileType(Enum): """Enum for different file types (used for overrides).""" + CSV = 1 PARQUET = 2 PETASTORM = 3 @@ -32,17 +31,17 @@ class DataSource: ``DataSource`` classes are not instantiated. Instead, static and class methods are called directly. """ + supports_central_loading = True supports_distributed_loading = False @staticmethod - def is_data_type(data: Any, - filetype: Optional[RayFileType] = None) -> bool: + def is_data_type(data: Any, filetype: Optional[RayFileType] = None) -> bool: """Check if the supplied data matches this data source. Args: - data (Any): Dataset. - filetype (Optional[RayFileType]): RayFileType of the provided + data: Dataset. + filetype: RayFileType of the provided dataset. Some DataSource implementations might require that this is explicitly set (e.g. if multiple sources can read CSV files). @@ -62,7 +61,7 @@ def get_filetype(data: Any) -> Optional[RayFileType]: is returned. Args: - data (Any): Data set + data: Data set Returns: RayFileType or None. @@ -70,19 +69,21 @@ def get_filetype(data: Any) -> Optional[RayFileType]: return None @staticmethod - def load_data(data: Any, - ignore: Optional[Sequence[str]] = None, - indices: Optional[Sequence[Any]] = None, - **kwargs) -> pd.DataFrame: + def load_data( + data: Any, + ignore: Optional[Sequence[str]] = None, + indices: Optional[Sequence[Any]] = None, + **kwargs + ) -> pd.DataFrame: """ Load data into a pandas dataframe. Ignore specific columns, and optionally select specific indices. Args: - data (Any): Input data - ignore (Optional[Sequence[str]]): Column names to ignore - indices (Optional[Sequence[Any]]): Indices to select. What an + data: Input data + ignore: Column names to ignore + indices: Indices to select. What an index indicates depends on the data source. 
Returns: @@ -91,13 +92,12 @@ def load_data(data: Any, raise NotImplementedError @staticmethod - def update_feature_names(matrix: "xgb.DMatrix", - feature_names: Optional[List[str]]): + def update_feature_names(matrix: "xgb.DMatrix", feature_names: Optional[List[str]]): """Optionally update feature names before training/prediction Args: - matrix (xgb.DMatrix): xgboost DMatrix object. - feature_names (List[str]): Feature names manually passed to the + matrix: xgboost DMatrix object. + feature_names: Feature names manually passed to the ``RayDMatrix`` object. """ @@ -115,8 +115,9 @@ def convert_to_series(data: Any) -> pd.Series: return data @classmethod - def get_column(cls, data: pd.DataFrame, - column: Any) -> Tuple[pd.Series, Optional[str]]: + def get_column( + cls, data: pd.DataFrame, column: Any + ) -> Tuple[pd.Series, Optional[str]]: """Helper method wrapping around convert to series. This method should usually not be overwritten. @@ -134,13 +135,12 @@ def get_n(data: Any): @staticmethod def get_actor_shards( - data: Any, - actors: Sequence[ActorHandle]) -> \ - Tuple[Any, Optional[Dict[int, Any]]]: + data: Any, actors: Sequence[ActorHandle] + ) -> Tuple[Any, Optional[Dict[int, Any]]]: """Get a dict mapping actor ranks to shards. Args: - data (Any): Data to shard. + data: Data to shard. Returns: Returns a tuple of which the first element indicates the new diff --git a/xgboost_ray/data_sources/modin.py b/xgboost_ray/data_sources/modin.py index 78e229c7..56fe68f3 100644 --- a/xgboost_ray/data_sources/modin.py +++ b/xgboost_ray/data_sources/modin.py @@ -1,28 +1,26 @@ -from typing import Any, Optional, Sequence, Dict, Union, Tuple - from collections import defaultdict -import pandas as pd +from typing import Any, Dict, Optional, Sequence, Tuple, Union +import pandas as pd import ray from ray import ObjectRef from ray.actor import ActorHandle -from xgboost_ray.data_sources._distributed import \ - assign_partitions_to_actors, get_actor_rank_ips +from xgboost_ray.data_sources._distributed import ( + assign_partitions_to_actors, + get_actor_rank_ips, +) from xgboost_ray.data_sources.data_source import DataSource, RayFileType from xgboost_ray.data_sources.object_store import ObjectStore try: import modin # noqa: F401 from modin.config.envvars import Engine + from modin.distributed.dataframe.pandas import unwrap_partitions # noqa: F401 + from modin.pandas import DataFrame as ModinDataFrame # noqa: F401 + from modin.pandas import Series as ModinSeries # noqa: F401 from packaging.version import Version - from modin.pandas import ( # noqa: F401 - DataFrame as ModinDataFrame, # noqa: F401 - Series as ModinSeries # noqa: F401 - ) - from modin.distributed.dataframe.pandas import ( # noqa: F401 - unwrap_partitions # noqa: F401 - ) + MODIN_INSTALLED = Version(modin.__version__) >= Version("0.9.0") # Check if importing the Ray engine leads to errors @@ -43,7 +41,8 @@ def _assert_modin_installed(): "supported by modin." "\nPlease also raise an issue on our GitHub: " "https://github.com/ray-project/xgboost_ray as this part of " - "the code should not have been reached.") + "the code should not have been reached." + ) class Modin(DataSource): @@ -55,36 +54,36 @@ class Modin(DataSource): Modin dataframes are stored on multiple actors, making them suitable for distributed loading. 
""" + supports_central_loading = True supports_distributed_loading = True @staticmethod - def is_data_type(data: Any, - filetype: Optional[RayFileType] = None) -> bool: + def is_data_type(data: Any, filetype: Optional[RayFileType] = None) -> bool: if not MODIN_INSTALLED: return False # Has to be imported again. - from modin.pandas import ( # noqa: F811 - DataFrame as ModinDataFrame, # noqa: F811 - Series as ModinSeries # noqa: F811 - ) + from modin.pandas import DataFrame as ModinDataFrame # noqa: F811 + from modin.pandas import Series as ModinSeries # noqa: F811 return isinstance(data, (ModinDataFrame, ModinSeries)) @staticmethod def load_data( - data: Any, # modin.pandas.DataFrame - ignore: Optional[Sequence[str]] = None, - indices: Optional[Union[Sequence[int], Sequence[ - ObjectRef]]] = None, - **kwargs) -> pd.DataFrame: + data: Any, # modin.pandas.DataFrame + ignore: Optional[Sequence[str]] = None, + indices: Optional[Union[Sequence[int], Sequence[ObjectRef]]] = None, + **kwargs + ) -> pd.DataFrame: _assert_modin_installed() - if indices is not None and len(indices) > 0 and isinstance( - indices[0], ObjectRef): + if ( + indices is not None + and len(indices) > 0 + and isinstance(indices[0], ObjectRef) + ): # We got a list of ObjectRefs belonging to Modin partitions - return ObjectStore.load_data( - data=indices, indices=None, ignore=ignore) + return ObjectStore.load_data(data=indices, indices=None, ignore=ignore) local_df = data if indices: @@ -101,10 +100,8 @@ def load_data( def convert_to_series(data: Any) -> pd.Series: _assert_modin_installed() # Has to be imported again. - from modin.pandas import ( # noqa: F811 - DataFrame as ModinDataFrame, # noqa: F811 - Series as ModinSeries # noqa: F811 - ) + from modin.pandas import DataFrame as ModinDataFrame # noqa: F811 + from modin.pandas import Series as ModinSeries # noqa: F811 if isinstance(data, ModinDataFrame): return pd.Series(data._to_pandas().squeeze()) @@ -115,15 +112,12 @@ def convert_to_series(data: Any) -> pd.Series: @staticmethod def get_actor_shards( - data: Any, # modin.pandas.DataFrame - actors: Sequence[ActorHandle]) -> \ - Tuple[Any, Optional[Dict[int, Any]]]: + data: Any, actors: Sequence[ActorHandle] # modin.pandas.DataFrame + ) -> Tuple[Any, Optional[Dict[int, Any]]]: _assert_modin_installed() # Has to be imported again. 
- from modin.distributed.dataframe.pandas import ( # noqa: F811 - unwrap_partitions # noqa: F811 - ) + from modin.distributed.dataframe.pandas import unwrap_partitions # noqa: F811 actor_rank_ips = get_actor_rank_ips(actors) diff --git a/xgboost_ray/data_sources/numpy.py b/xgboost_ray/data_sources/numpy.py index bfd632c5..59916477 100644 --- a/xgboost_ray/data_sources/numpy.py +++ b/xgboost_ray/data_sources/numpy.py @@ -1,4 +1,4 @@ -from typing import Any, Optional, Sequence, List, TYPE_CHECKING +from typing import TYPE_CHECKING, Any, List, Optional, Sequence import numpy as np import pandas as pd @@ -14,21 +14,20 @@ class Numpy(DataSource): """Read from numpy arrays.""" @staticmethod - def is_data_type(data: Any, - filetype: Optional[RayFileType] = None) -> bool: + def is_data_type(data: Any, filetype: Optional[RayFileType] = None) -> bool: return isinstance(data, np.ndarray) @staticmethod - def update_feature_names(matrix: "xgb.DMatrix", - feature_names: Optional[List[str]]): + def update_feature_names(matrix: "xgb.DMatrix", feature_names: Optional[List[str]]): # Potentially unset feature names matrix.feature_names = feature_names @staticmethod - def load_data(data: np.ndarray, - ignore: Optional[Sequence[str]] = None, - indices: Optional[Sequence[int]] = None, - **kwargs) -> pd.DataFrame: - local_df = pd.DataFrame( - data, columns=[f"f{i}" for i in range(data.shape[1])]) + def load_data( + data: np.ndarray, + ignore: Optional[Sequence[str]] = None, + indices: Optional[Sequence[int]] = None, + **kwargs, + ) -> pd.DataFrame: + local_df = pd.DataFrame(data, columns=[f"f{i}" for i in range(data.shape[1])]) return Pandas.load_data(local_df, ignore=ignore, indices=indices) diff --git a/xgboost_ray/data_sources/object_store.py b/xgboost_ray/data_sources/object_store.py index bf99f62e..e06e0679 100644 --- a/xgboost_ray/data_sources/object_store.py +++ b/xgboost_ray/data_sources/object_store.py @@ -1,7 +1,6 @@ from typing import Any, Optional, Sequence import pandas as pd - import ray from ray import ObjectRef @@ -13,17 +12,18 @@ class ObjectStore(DataSource): """Read pandas dataframes and series from ray object store.""" @staticmethod - def is_data_type(data: Any, - filetype: Optional[RayFileType] = None) -> bool: + def is_data_type(data: Any, filetype: Optional[RayFileType] = None) -> bool: if isinstance(data, Sequence): return all(isinstance(d, ObjectRef) for d in data) return isinstance(data, ObjectRef) @staticmethod - def load_data(data: Sequence[ObjectRef], - ignore: Optional[Sequence[str]] = None, - indices: Optional[Sequence[int]] = None, - **kwargs) -> pd.DataFrame: + def load_data( + data: Sequence[ObjectRef], + ignore: Optional[Sequence[str]] = None, + indices: Optional[Sequence[int]] = None, + **kwargs + ) -> pd.DataFrame: if indices is not None: data = [data[i] for i in indices] diff --git a/xgboost_ray/data_sources/pandas.py b/xgboost_ray/data_sources/pandas.py index a8b101bc..28ba1364 100644 --- a/xgboost_ray/data_sources/pandas.py +++ b/xgboost_ray/data_sources/pandas.py @@ -9,15 +9,16 @@ class Pandas(DataSource): """Read from pandas dataframes and series.""" @staticmethod - def is_data_type(data: Any, - filetype: Optional[RayFileType] = None) -> bool: + def is_data_type(data: Any, filetype: Optional[RayFileType] = None) -> bool: return isinstance(data, (pd.DataFrame, pd.Series)) @staticmethod - def load_data(data: Any, - ignore: Optional[Sequence[str]] = None, - indices: Optional[Sequence[int]] = None, - **kwargs) -> pd.DataFrame: + def load_data( + data: Any, + ignore: 
Optional[Sequence[str]] = None, + indices: Optional[Sequence[int]] = None, + **kwargs + ) -> pd.DataFrame: local_df = data if ignore: diff --git a/xgboost_ray/data_sources/parquet.py b/xgboost_ray/data_sources/parquet.py index ff3e80a2..2f61e727 100644 --- a/xgboost_ray/data_sources/parquet.py +++ b/xgboost_ray/data_sources/parquet.py @@ -1,4 +1,4 @@ -from typing import Any, Optional, Sequence, Iterable, Union +from typing import Any, Iterable, Optional, Sequence, Union import pandas as pd @@ -8,12 +8,12 @@ class Parquet(DataSource): """Read one or many Parquet files.""" + supports_central_loading = True supports_distributed_loading = True @staticmethod - def is_data_type(data: Any, - filetype: Optional[RayFileType] = None) -> bool: + def is_data_type(data: Any, filetype: Optional[RayFileType] = None) -> bool: return filetype == RayFileType.PARQUET @staticmethod @@ -23,10 +23,12 @@ def get_filetype(data: Any) -> Optional[RayFileType]: return None @staticmethod - def load_data(data: Union[str, Sequence[str]], - ignore: Optional[Sequence[str]] = None, - indices: Optional[Sequence[int]] = None, - **kwargs) -> pd.DataFrame: + def load_data( + data: Union[str, Sequence[str]], + ignore: Optional[Sequence[str]] = None, + indices: Optional[Sequence[int]] = None, + **kwargs + ) -> pd.DataFrame: if isinstance(data, Iterable) and not isinstance(data, str): shards = [] diff --git a/xgboost_ray/data_sources/partitioned.py b/xgboost_ray/data_sources/partitioned.py index d290aa18..9d5b4722 100644 --- a/xgboost_ray/data_sources/partitioned.py +++ b/xgboost_ray/data_sources/partitioned.py @@ -1,17 +1,18 @@ -from typing import Any, Optional, Sequence, Dict, Tuple - from collections import defaultdict -import pandas as pd -import numpy as np +from typing import Any, Dict, Optional, Sequence, Tuple +import numpy as np +import pandas as pd from ray import ObjectRef from ray.actor import ActorHandle -from xgboost_ray.data_sources._distributed import \ - assign_partitions_to_actors, get_actor_rank_ips +from xgboost_ray.data_sources._distributed import ( + assign_partitions_to_actors, + get_actor_rank_ips, +) from xgboost_ray.data_sources.data_source import DataSource, RayFileType -from xgboost_ray.data_sources.pandas import Pandas from xgboost_ray.data_sources.numpy import Numpy +from xgboost_ray.data_sources.pandas import Pandas class Partitioned(DataSource): @@ -24,20 +25,21 @@ class Partitioned(DataSource): Also see the __partitioned__ spec: https://github.com/IntelPython/DPPY-Spec/blob/draft/partitioned/Partitioned.md """ + supports_central_loading = True supports_distributed_loading = True @staticmethod - def is_data_type(data: Any, - filetype: Optional[RayFileType] = None) -> bool: + def is_data_type(data: Any, filetype: Optional[RayFileType] = None) -> bool: return hasattr(data, "__partitioned__") @staticmethod def load_data( - data: Any, # __partitioned__ dict - ignore: Optional[Sequence[str]] = None, - indices: Optional[Sequence[ObjectRef]] = None, - **kwargs) -> pd.DataFrame: + data: Any, # __partitioned__ dict + ignore: Optional[Sequence[str]] = None, + indices: Optional[Sequence[ObjectRef]] = None, + **kwargs + ) -> pd.DataFrame: assert isinstance(data, dict), "Expected __partitioned__ dict" _get = data["get"] @@ -46,28 +48,25 @@ def load_data( tiling = data["partition_tiling"] ndims = len(tiling) # we need tuples to access partitions in the right order - pos_suffix = (0, ) * (ndims - 1) + pos_suffix = (0,) * (ndims - 1) parts = data["partitions"] # get the full data, e.g. 
all shards/partitions local_df = [ - _get(parts[(i, ) + pos_suffix]["data"]) - for i in range(tiling[0]) + _get(parts[(i,) + pos_suffix]["data"]) for i in range(tiling[0]) ] else: # here we got a list of futures for partitions local_df = _get(indices) if isinstance(local_df[0], pd.DataFrame): - return Pandas.load_data( - pd.concat(local_df, copy=False), ignore=ignore) + return Pandas.load_data(pd.concat(local_df, copy=False), ignore=ignore) else: return Numpy.load_data(np.concatenate(local_df), ignore=ignore) @staticmethod def get_actor_shards( - data: Any, # partitioned.pandas.DataFrame - actors: Sequence[ActorHandle]) -> \ - Tuple[Any, Optional[Dict[int, Any]]]: + data: Any, actors: Sequence[ActorHandle] # partitioned.pandas.DataFrame + ) -> Tuple[Any, Optional[Dict[int, Any]]]: assert hasattr(data, "__partitioned__") actor_rank_ips = get_actor_rank_ips(actors) @@ -77,17 +76,17 @@ def get_actor_shards( parts = parted["partitions"] tiling = parted["partition_tiling"] ndims = len(tiling) - if ndims < 1 or ndims > 2 or any(tiling[x] != 1 - for x in range(1, ndims)): + if ndims < 1 or ndims > 2 or any(tiling[x] != 1 for x in range(1, ndims)): raise RuntimeError( - "Only row-wise partitionings of 1d/2d structures supported.") + "Only row-wise partitionings of 1d/2d structures supported." + ) # Now build a table mapping from IP to list of partitions ip_to_parts = defaultdict(lambda: []) # we need tuples to access partitions in the right order - pos_suffix = (0, ) * (ndims - 1) + pos_suffix = (0,) * (ndims - 1) for i in range(tiling[0]): - part = parts[(i, ) + pos_suffix] # this works for 1d and 2d + part = parts[(i,) + pos_suffix] # this works for 1d and 2d ip_to_parts[part["location"][0]].append(part["data"]) # __partitioned__ is serializable, so pass it here # as the first return value diff --git a/xgboost_ray/data_sources/petastorm.py b/xgboost_ray/data_sources/petastorm.py index f821e1a1..c2ac936b 100644 --- a/xgboost_ray/data_sources/petastorm.py +++ b/xgboost_ray/data_sources/petastorm.py @@ -1,4 +1,4 @@ -from typing import Any, Optional, Sequence, Union, List +from typing import Any, List, Optional, Sequence, Union import pandas as pd @@ -6,6 +6,7 @@ try: import petastorm + PETASTORM_INSTALLED = True except ImportError: PETASTORM_INSTALLED = False @@ -19,7 +20,8 @@ def _assert_petastorm_installed(): "\nFIX THIS by installing petastorm: `pip install petastorm`. " "\nPlease also raise an issue on our GitHub: " "https://github.com/ray-project/xgboost_ray as this part of " - "the code should not have been reached.") + "the code should not have been reached." + ) class Petastorm(DataSource): @@ -31,12 +33,12 @@ class Petastorm(DataSource): This class accesses Petastorm's dataset loading interface for efficient loading of large datasets. 
""" + supports_central_loading = True supports_distributed_loading = True @staticmethod - def is_data_type(data: Any, - filetype: Optional[RayFileType] = None) -> bool: + def is_data_type(data: Any, filetype: Optional[RayFileType] = None) -> bool: return PETASTORM_INSTALLED and filetype == RayFileType.PETASTORM @staticmethod @@ -48,10 +50,12 @@ def get_filetype(data: Any) -> Optional[RayFileType]: data = [data] def _is_compatible(url: str): - return url.endswith(".parquet") and (url.startswith("s3://") - or url.startswith("gs://") - or url.startswith("hdfs://") - or url.startswith("file://")) + return url.endswith(".parquet") and ( + url.startswith("s3://") + or url.startswith("gs://") + or url.startswith("hdfs://") + or url.startswith("file://") + ) if all(_is_compatible(url) for url in data): return RayFileType.PETASTORM @@ -59,14 +63,17 @@ def _is_compatible(url: str): return None @staticmethod - def load_data(data: Union[str, Sequence[str]], - ignore: Optional[Sequence[str]] = None, - indices: Optional[Sequence[int]] = None, - **kwargs) -> pd.DataFrame: + def load_data( + data: Union[str, Sequence[str]], + ignore: Optional[Sequence[str]] = None, + indices: Optional[Sequence[int]] = None, + **kwargs + ) -> pd.DataFrame: _assert_petastorm_installed() with petastorm.make_batch_reader(data) as reader: shards = [ - pd.DataFrame(batch._asdict()) for i, batch in enumerate(reader) + pd.DataFrame(batch._asdict()) + for i, batch in enumerate(reader) if not indices or i in indices ] diff --git a/xgboost_ray/data_sources/ray_dataset.py b/xgboost_ray/data_sources/ray_dataset.py index 013677b9..9147acb2 100644 --- a/xgboost_ray/data_sources/ray_dataset.py +++ b/xgboost_ray/data_sources/ray_dataset.py @@ -1,7 +1,6 @@ -from typing import Any, Optional, Sequence, Dict, Union, Tuple +from typing import Any, Dict, Optional, Sequence, Tuple, Union import pandas as pd - import ray from ray.actor import ActorHandle @@ -10,6 +9,7 @@ try: import ray.data.dataset # noqa: F401 + RAY_DATASET_AVAILABLE = True except (ImportError, AttributeError): RAY_DATASET_AVAILABLE = False @@ -25,33 +25,36 @@ def _assert_ray_data_available(): "\nFIX THIS by upgrading Ray: `pip install -U ray`. " "\nPlease also raise an issue on our GitHub: " "https://github.com/ray-project/xgboost_ray as this part of " - "the code should not have been reached.") + "the code should not have been reached." 
+ ) class RayDataset(DataSource): """Read from distributed Ray dataset.""" + supports_central_loading = True supports_distributed_loading = True @staticmethod - def is_data_type(data: Any, - filetype: Optional[RayFileType] = None) -> bool: + def is_data_type(data: Any, filetype: Optional[RayFileType] = None) -> bool: if not RAY_DATASET_AVAILABLE: return False return isinstance(data, ray.data.dataset.Dataset) @staticmethod - def load_data(data: "ray.data.dataset.Dataset", - ignore: Optional[Sequence[str]] = None, - indices: Optional[Union[Sequence[int], Sequence[ - "ray.data.dataset.Dataset"]]] = None, - **kwargs) -> pd.DataFrame: + def load_data( + data: "ray.data.dataset.Dataset", + ignore: Optional[Sequence[str]] = None, + indices: Optional[ + Union[Sequence[int], Sequence["ray.data.dataset.Dataset"]] + ] = None, + **kwargs + ) -> pd.DataFrame: _assert_ray_data_available() if indices is not None: - if len(indices) > 0 and isinstance(indices[0], - ray.data.dataset.Dataset): + if len(indices) > 0 and isinstance(indices[0], ray.data.dataset.Dataset): # We got a list of Datasets belonging to a partition data = indices else: @@ -61,28 +64,28 @@ def load_data(data: "ray.data.dataset.Dataset", local_df = data.to_pandas(limit=DATASET_TO_PANDAS_LIMIT) else: local_df = pd.concat( - [ds.to_pandas(limit=DATASET_TO_PANDAS_LIMIT) for ds in data], - copy=False) + [ds.to_pandas(limit=DATASET_TO_PANDAS_LIMIT) for ds in data], copy=False + ) return Pandas.load_data(local_df, ignore=ignore) @staticmethod - def convert_to_series(data: Union["ray.data.dataset.Dataset", Sequence[ - "ray.data.dataset.Dataset"]]) -> pd.Series: + def convert_to_series( + data: Union["ray.data.dataset.Dataset", Sequence["ray.data.dataset.Dataset"]] + ) -> pd.Series: _assert_ray_data_available() if isinstance(data, ray.data.dataset.Dataset): data = data.to_pandas(limit=DATASET_TO_PANDAS_LIMIT) else: data = pd.concat( - [ds.to_pandas(limit=DATASET_TO_PANDAS_LIMIT) for ds in data], - copy=False) + [ds.to_pandas(limit=DATASET_TO_PANDAS_LIMIT) for ds in data], copy=False + ) return DataSource.convert_to_series(data) @staticmethod def get_actor_shards( - data: "ray.data.dataset.Dataset", - actors: Sequence[ActorHandle]) -> \ - Tuple[Any, Optional[Dict[int, Any]]]: + data: "ray.data.dataset.Dataset", actors: Sequence[ActorHandle] + ) -> Tuple[Any, Optional[Dict[int, Any]]]: _assert_ray_data_available() # We do not use our assign_partitions_to_actors as assignment of splits @@ -95,8 +98,7 @@ def get_actor_shards( ) return None, { - i: [dataset_split] - for i, dataset_split in enumerate(dataset_splits) + i: [dataset_split] for i, dataset_split in enumerate(dataset_splits) } @staticmethod diff --git a/xgboost_ray/elastic.py b/xgboost_ray/elastic.py index 27569810..0e5295e7 100644 --- a/xgboost_ray/elastic.py +++ b/xgboost_ray/elastic.py @@ -1,19 +1,29 @@ import time -from typing import Optional, Dict, List, Tuple, Callable +from typing import Callable, Dict, List, Optional, Tuple import ray -from xgboost_ray.main import RayParams, _TrainingState, \ logger, ActorHandle, _PrepareActorTask, _create_actor, \ RayXGBoostActorAvailable, ENV - +from xgboost_ray.main import ( + ENV, + ActorHandle, + RayParams, + RayXGBoostActorAvailable, + _create_actor, + _PrepareActorTask, + _TrainingState, + logger, +) from xgboost_ray.matrix import RayDMatrix def _maybe_schedule_new_actors( - training_state: _TrainingState, num_cpus_per_actor: int, - num_gpus_per_actor: int, resources_per_actor: Optional[Dict], - ray_params: RayParams, load_data:
List[RayDMatrix]) -> bool: + training_state: _TrainingState, + num_cpus_per_actor: int, + num_gpus_per_actor: int, + resources_per_actor: Optional[Dict], + ray_params: RayParams, + load_data: List[RayDMatrix], +) -> bool: """Schedule new actors for elastic training if resources are available. Potentially starts new actors and triggers data loading.""" @@ -23,7 +33,8 @@ def _maybe_schedule_new_actors( return False missing_actor_ranks = [ - rank for rank, actor in enumerate(training_state.actors) + rank + for rank, actor in enumerate(training_state.actors) if actor is None and rank not in training_state.pending_actors ] @@ -34,8 +45,10 @@ def _maybe_schedule_new_actors( now = time.time() # Check periodically every n seconds. - if now < training_state.last_resource_check_at + \ - ENV.ELASTIC_RESTART_RESOURCE_CHECK_S: + if ( + now + < training_state.last_resource_check_at + ENV.ELASTIC_RESTART_RESOURCE_CHECK_S + ): return False training_state.last_resource_check_at = now @@ -43,8 +56,7 @@ def _maybe_schedule_new_actors( new_pending_actors: Dict[int, Tuple[ActorHandle, _PrepareActorTask]] = {} for rank in missing_actor_ranks: # Actor rank should not be already pending - if rank in training_state.pending_actors \ - or rank in new_pending_actors: + if rank in training_state.pending_actors or rank in new_pending_actors: continue # Try to schedule this actor @@ -57,23 +69,29 @@ def _maybe_schedule_new_actors( placement_group=training_state.placement_group, queue=training_state.queue, checkpoint_frequency=ray_params.checkpoint_frequency, - distributed_callbacks=ray_params.distributed_callbacks) + distributed_callbacks=ray_params.distributed_callbacks, + ) task = _PrepareActorTask( actor, queue=training_state.queue, stop_event=training_state.stop_event, - load_data=load_data) + load_data=load_data, + ) new_pending_actors[rank] = (actor, task) - logger.debug(f"Re-scheduled actor with rank {rank}. Waiting for " - f"placement and data loading before promoting it " - f"to training.") + logger.debug( + f"Re-scheduled actor with rank {rank}. Waiting for " + f"placement and data loading before promoting it " + f"to training." + ) if new_pending_actors: training_state.pending_actors.update(new_pending_actors) - logger.info(f"Re-scheduled {len(new_pending_actors)} actors for " - f"training. Once data loading finished, they will be " - f"integrated into training again.") + logger.info( + f"Re-scheduled {len(new_pending_actors)} actors for " + f"training. Once data loading finished, they will be " + f"integrated into training again." + ) return bool(new_pending_actors) @@ -102,7 +120,7 @@ def _update_scheduled_actor_states(training_state: _TrainingState): if actor_became_ready: if not training_state.pending_actors: # No other actors are pending, so let's restart right away. - training_state.restart_training_at = now - 1. + training_state.restart_training_at = now - 1.0 # If an actor became ready but other actors are pending, we wait # for n seconds before restarting, as chances are that they become @@ -111,7 +129,8 @@ def _update_scheduled_actor_states(training_state: _TrainingState): if training_state.restart_training_at is None: logger.debug( f"A RayXGBoostActor became ready for training. Waiting " - f"{grace_period} seconds before triggering training restart.") + f"{grace_period} seconds before triggering training restart." 
+ ) training_state.restart_training_at = now + grace_period if training_state.restart_training_at is not None: @@ -119,12 +138,14 @@ def _update_scheduled_actor_states(training_state: _TrainingState): training_state.restart_training_at = None raise RayXGBoostActorAvailable( "A new RayXGBoostActor became available for training. " - "Triggering restart.") + "Triggering restart." + ) -def _get_actor_alive_status(actors: List[ActorHandle], - callback: Callable[[ActorHandle], None]): - """Loop through all actors. Invoke a callback on dead actors. """ +def _get_actor_alive_status( + actors: List[ActorHandle], callback: Callable[[ActorHandle], None] +): + """Loop through all actors. Invoke a callback on dead actors.""" obj_to_rank = {} alive = 0 @@ -152,7 +173,6 @@ def _get_actor_alive_status(actors: List[ActorHandle], logger.debug(f"Actor {actors[rank]} is _not_ alive.") dead += 1 callback(actors[rank]) - logger.info(f"Actor status: {alive} alive, {dead} dead " - f"({alive+dead} total)") + logger.info(f"Actor status: {alive} alive, {dead} dead " f"({alive+dead} total)") return alive, dead diff --git a/xgboost_ray/examples/create_test_data.py b/xgboost_ray/examples/create_test_data.py index ddb7e33f..7c85ce5b 100644 --- a/xgboost_ray/examples/create_test_data.py +++ b/xgboost_ray/examples/create_test_data.py @@ -7,7 +7,8 @@ def main(): num_rows=1_000_000, num_partitions=100, num_features=8, - num_classes=2) + num_classes=2, + ) if __name__ == "__main__": diff --git a/xgboost_ray/examples/higgs.py b/xgboost_ray/examples/higgs.py index 33ea3a0a..4ca49b2c 100644 --- a/xgboost_ray/examples/higgs.py +++ b/xgboost_ray/examples/higgs.py @@ -1,14 +1,16 @@ import os import time -from xgboost_ray import train, RayDMatrix, RayParams +from xgboost_ray import RayDMatrix, RayParams, train FILENAME_CSV = "HIGGS.csv.gz" def download_higgs(target_file): - url = "https://archive.ics.uci.edu/ml/machine-learning-databases/" \ - "00280/HIGGS.csv.gz" + url = ( + "https://archive.ics.uci.edu/ml/machine-learning-databases/" + "00280/HIGGS.csv.gz" + ) try: import urllib.request @@ -16,7 +18,8 @@ def download_higgs(target_file): raise ValueError( f"Automatic downloading of the HIGGS dataset requires `urllib`." f"\nFIX THIS by running `pip install urllib` or manually " - f"downloading the dataset from {url}.") from e + f"downloading the dataset from {url}." + ) from e print(f"Downloading HIGGS dataset to {target_file}") urllib.request.urlretrieve(url, target_file) @@ -30,16 +33,14 @@ def main(): # https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz if not os.path.exists(FILENAME_CSV): - assert download_higgs(FILENAME_CSV), \ - "Downloading of HIGGS dataset failed." + assert download_higgs(FILENAME_CSV), "Downloading of HIGGS dataset failed." 
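        # A minimal optional sanity check before building the RayDMatrix
        # below (a sketch assuming the standard HIGGS layout: 29 comma-
        # separated columns, 1 label + 28 features, no header row). Reading
        # only the first chunk avoids materializing the full gzipped CSV.
        import pandas as pd

        first_chunk = next(pd.read_csv(FILENAME_CSV, header=None, chunksize=1_000))
        assert first_chunk.shape[1] == 29, "Unexpected column count in HIGGS.csv.gz"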
print("HIGGS dataset downloaded.") else: print("HIGGS dataset found locally.") colnames = ["label"] + ["feature-%02d" % i for i in range(1, 29)] - dtrain = RayDMatrix( - os.path.abspath(FILENAME_CSV), label="label", names=colnames) + dtrain = RayDMatrix(os.path.abspath(FILENAME_CSV), label="label", names=colnames) config = { "tree_method": "hist", @@ -55,17 +56,18 @@ def main(): evals_result=evals_result, ray_params=RayParams(max_actor_restarts=1, num_actors=1), num_boost_round=100, - evals=[(dtrain, "train")]) + evals=[(dtrain, "train")], + ) taken = time.time() - start print(f"TRAIN TIME TAKEN: {taken:.2f} seconds") bst.save_model("higgs.xgb") - print("Final training error: {:.4f}".format( - evals_result["train"]["error"][-1])) + print("Final training error: {:.4f}".format(evals_result["train"]["error"][-1])) if __name__ == "__main__": import ray + ray.init() start = time.time() diff --git a/xgboost_ray/examples/higgs_parquet.py b/xgboost_ray/examples/higgs_parquet.py index 3cc2cd87..faed2f7c 100644 --- a/xgboost_ray/examples/higgs_parquet.py +++ b/xgboost_ray/examples/higgs_parquet.py @@ -4,9 +4,9 @@ import pandas as pd import pyarrow as pa import pyarrow.parquet as pq - from higgs import download_higgs -from xgboost_ray import train, RayDMatrix, RayParams + +from xgboost_ray import RayDMatrix, RayParams, train FILENAME_CSV = "HIGGS.csv.gz" FILENAME_PARQUET = "HIGGS.parquet" @@ -18,7 +18,8 @@ def csv_to_parquet(in_file, out_file, chunksize=100_000, **csv_kwargs): print(f"Converting CSV {in_file} to PARQUET {out_file}") csv_stream = pd.read_csv( - in_file, sep=",", chunksize=chunksize, low_memory=False, **csv_kwargs) + in_file, sep=",", chunksize=chunksize, low_memory=False, **csv_kwargs + ) parquet_schema = None parquet_writer = None @@ -29,7 +30,8 @@ def csv_to_parquet(in_file, out_file, chunksize=100_000, **csv_kwargs): parquet_schema = pa.Table.from_pandas(df=chunk).schema # Open a Parquet file for writing parquet_writer = pq.ParquetWriter( - out_file, parquet_schema, compression="snappy") + out_file, parquet_schema, compression="snappy" + ) # Write CSV chunk to the parquet file table = pa.Table.from_pandas(chunk, schema=parquet_schema) parquet_writer.write_table(table) @@ -53,21 +55,44 @@ def main(): FILENAME_CSV, FILENAME_PARQUET, names=[ - "label", "feature-01", "feature-02", "feature-03", - "feature-04", "feature-05", "feature-06", "feature-07", - "feature-08", "feature-09", "feature-10", "feature-11", - "feature-12", "feature-13", "feature-14", "feature-15", - "feature-16", "feature-17", "feature-18", "feature-19", - "feature-20", "feature-21", "feature-22", "feature-23", - "feature-24", "feature-25", "feature-26", "feature-27", - "feature-28" - ]) + "label", + "feature-01", + "feature-02", + "feature-03", + "feature-04", + "feature-05", + "feature-06", + "feature-07", + "feature-08", + "feature-09", + "feature-10", + "feature-11", + "feature-12", + "feature-13", + "feature-14", + "feature-15", + "feature-16", + "feature-17", + "feature-18", + "feature-19", + "feature-20", + "feature-21", + "feature-22", + "feature-23", + "feature-24", + "feature-25", + "feature-26", + "feature-27", + "feature-28", + ], + ) colnames = ["label"] + ["feature-%02d" % i for i in range(1, 29)] # Here we load the Parquet file dtrain = RayDMatrix( - os.path.abspath(FILENAME_PARQUET), label="label", columns=colnames) + os.path.abspath(FILENAME_PARQUET), label="label", columns=colnames + ) config = { "tree_method": "hist", @@ -83,17 +108,18 @@ def main(): evals_result=evals_result, 
ray_params=RayParams(max_actor_restarts=1, num_actors=1), num_boost_round=100, - evals=[(dtrain, "train")]) + evals=[(dtrain, "train")], + ) taken = time.time() - start print(f"TRAIN TIME TAKEN: {taken:.2f} seconds") bst.save_model("higgs.xgb") - print("Final training error: {:.4f}".format( - evals_result["train"]["error"][-1])) + print("Final training error: {:.4f}".format(evals_result["train"]["error"][-1])) if __name__ == "__main__": import ray + ray.init() start = time.time() diff --git a/xgboost_ray/examples/readme.py b/xgboost_ray/examples/readme.py index c947d2ba..82b74687 100644 --- a/xgboost_ray/examples/readme.py +++ b/xgboost_ray/examples/readme.py @@ -2,9 +2,10 @@ def readme_simple(): - from xgboost_ray import RayDMatrix, RayParams, train from sklearn.datasets import load_breast_cancer + from xgboost_ray import RayDMatrix, RayParams, train + train_x, train_y = load_breast_cancer(return_X_y=True) train_set = RayDMatrix(train_x, train_y) @@ -18,17 +19,18 @@ def readme_simple(): evals_result=evals_result, evals=[(train_set, "train")], verbose_eval=False, - ray_params=RayParams(num_actors=2, cpus_per_actor=1)) + ray_params=RayParams(num_actors=2, cpus_per_actor=1), + ) bst.save_model("model.xgb") - print("Final training error: {:.4f}".format( - evals_result["train"]["error"][-1])) + print("Final training error: {:.4f}".format(evals_result["train"]["error"][-1])) def readme_predict(): - from xgboost_ray import RayDMatrix, RayParams, predict - from sklearn.datasets import load_breast_cancer import xgboost as xgb + from sklearn.datasets import load_breast_cancer + + from xgboost_ray import RayDMatrix, RayParams, predict data, labels = load_breast_cancer(return_X_y=True) @@ -41,14 +43,14 @@ def readme_predict(): def readme_tune(): - from xgboost_ray import RayDMatrix, RayParams, train from sklearn.datasets import load_breast_cancer + from xgboost_ray import RayDMatrix, RayParams, train + num_actors = 4 num_cpus_per_actor = 1 - ray_params = RayParams( - num_actors=num_actors, cpus_per_actor=num_cpus_per_actor) + ray_params = RayParams(num_actors=num_actors, cpus_per_actor=num_cpus_per_actor) def train_model(config): train_x, train_y = load_breast_cancer(return_X_y=True) @@ -61,7 +63,8 @@ def train_model(config): evals_result=evals_result, evals=[(train_set, "train")], verbose_eval=False, - ray_params=ray_params) + ray_params=ray_params, + ) bst.save_model("model.xgb") from ray import tune @@ -73,7 +76,7 @@ def train_model(config): "eval_metric": ["logloss", "error"], "eta": tune.loguniform(1e-4, 1e-1), "subsample": tune.uniform(0.5, 1.0), - "max_depth": tune.randint(1, 9) + "max_depth": tune.randint(1, 9), } # Make sure to use the `get_tune_resources` method to set the `resources_per_trial` @@ -83,7 +86,8 @@ def train_model(config): metric="train-error", mode="min", num_samples=4, - resources_per_trial=ray_params.get_tune_resources()) + resources_per_trial=ray_params.get_tune_resources(), + ) print("Best hyperparameters", analysis.best_config) diff --git a/xgboost_ray/examples/readme_sklearn_api.py b/xgboost_ray/examples/readme_sklearn_api.py index 3101c861..2706d917 100644 --- a/xgboost_ray/examples/readme_sklearn_api.py +++ b/xgboost_ray/examples/readme_sklearn_api.py @@ -1,17 +1,19 @@ def readme_sklearn_api(): - from xgboost_ray import RayXGBClassifier, RayParams from sklearn.datasets import load_breast_cancer from sklearn.model_selection import train_test_split + from xgboost_ray import RayParams, RayXGBClassifier + seed = 42 X, y = load_breast_cancer(return_X_y=True) X_train, 
X_test, y_train, y_test = train_test_split( - X, y, train_size=0.25, random_state=42) + X, y, train_size=0.25, random_state=42 + ) clf = RayXGBClassifier( - n_jobs=4, # In XGBoost-Ray, n_jobs sets the number of actors - random_state=seed) + n_jobs=4, random_state=seed # In XGBoost-Ray, n_jobs sets the number of actors + ) # scikit-learn API will automatically convert the data # to RayDMatrix format as needed. diff --git a/xgboost_ray/examples/simple.py b/xgboost_ray/examples/simple.py index 545e9ad6..f0cd2c88 100644 --- a/xgboost_ray/examples/simple.py +++ b/xgboost_ray/examples/simple.py @@ -1,19 +1,17 @@ import argparse +import ray from sklearn import datasets from sklearn.model_selection import train_test_split -import ray - -from xgboost_ray import RayDMatrix, train, RayParams +from xgboost_ray import RayDMatrix, RayParams, train def main(cpus_per_actor, num_actors): # Load dataset data, labels = datasets.load_breast_cancer(return_X_y=True) # Split into train and test set - train_x, test_x, train_y, test_y = train_test_split( - data, labels, test_size=0.25) + train_x, test_x, train_y, test_y = train_test_split(data, labels, test_size=0.25) train_set = RayDMatrix(train_x, train_y) test_set = RayDMatrix(test_x, test_y) @@ -37,40 +35,41 @@ def main(cpus_per_actor, num_actors): max_actor_restarts=0, gpus_per_actor=0, cpus_per_actor=cpus_per_actor, - num_actors=num_actors), + num_actors=num_actors, + ), verbose_eval=False, - num_boost_round=10) + num_boost_round=10, + ) model_path = "simple.xgb" bst.save_model(model_path) - print("Final validation error: {:.4f}".format( - evals_result["eval"]["error"][-1])) + print("Final validation error: {:.4f}".format(evals_result["eval"]["error"][-1])) if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( - "--address", - required=False, - type=str, - help="the address to use for Ray") + "--address", required=False, type=str, help="the address to use for Ray" + ) parser.add_argument( "--server-address", required=False, type=str, - help="Address of the remote server if using Ray Client.") + help="Address of the remote server if using Ray Client.", + ) parser.add_argument( "--cpus-per-actor", type=int, default=1, - help="Sets number of CPUs per xgboost training worker.") + help="Sets number of CPUs per xgboost training worker.", + ) parser.add_argument( "--num-actors", type=int, default=4, - help="Sets number of xgboost workers to use.") - parser.add_argument( - "--smoke-test", action="store_true", default=False, help="gpu") + help="Sets number of xgboost workers to use.", + ) + parser.add_argument("--smoke-test", action="store_true", default=False, help="gpu") args, _ = parser.parse_known_args() diff --git a/xgboost_ray/examples/simple_dask.py b/xgboost_ray/examples/simple_dask.py index 53d9ced2..f98b137c 100644 --- a/xgboost_ray/examples/simple_dask.py +++ b/xgboost_ray/examples/simple_dask.py @@ -2,10 +2,9 @@ import numpy as np import pandas as pd - import ray -from xgboost_ray import RayDMatrix, train, RayParams +from xgboost_ray import RayDMatrix, RayParams, train from xgboost_ray.data_sources.dask import DASK_INSTALLED @@ -18,6 +17,7 @@ def main(cpus_per_actor, num_actors): import dask import dask.dataframe as dd from ray.util.dask import ray_dask_get + dask.config.set(scheduler=ray_dask_get) # Generate dataset @@ -55,40 +55,41 @@ def main(cpus_per_actor, num_actors): max_actor_restarts=0, gpus_per_actor=0, cpus_per_actor=cpus_per_actor, - num_actors=num_actors), + num_actors=num_actors, + ), verbose_eval=False, -
num_boost_round=10) + num_boost_round=10, + ) model_path = "dask.xgb" bst.save_model(model_path) - print("Final training error: {:.4f}".format( - evals_result["train"]["error"][-1])) + print("Final training error: {:.4f}".format(evals_result["train"]["error"][-1])) if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( - "--address", - required=False, - type=str, - help="the address to use for Ray") + "--address", required=False, type=str, help="the address to use for Ray" + ) parser.add_argument( "--server-address", required=False, type=str, - help="Address of the remote server if using Ray Client.") + help="Address of the remote server if using Ray Client.", + ) parser.add_argument( "--cpus-per-actor", type=int, default=1, - help="Sets number of CPUs per xgboost training worker.") + help="Sets number of CPUs per xgboost training worker.", + ) parser.add_argument( "--num-actors", type=int, default=4, - help="Sets number of xgboost workers to use.") - parser.add_argument( - "--smoke-test", action="store_true", default=False, help="gpu") + help="Sets number of xgboost workers to use.", + ) + parser.add_argument("--smoke-test", action="store_true", default=False, help="gpu") args, _ = parser.parse_known_args() diff --git a/xgboost_ray/examples/simple_modin.py b/xgboost_ray/examples/simple_modin.py index 432c0462..5d7f0399 100644 --- a/xgboost_ray/examples/simple_modin.py +++ b/xgboost_ray/examples/simple_modin.py @@ -2,17 +2,18 @@ import numpy as np import pandas as pd - import ray -from xgboost_ray import RayDMatrix, train, RayParams +from xgboost_ray import RayDMatrix, RayParams, train from xgboost_ray.data_sources.modin import MODIN_INSTALLED def main(cpus_per_actor, num_actors): if not MODIN_INSTALLED: - print("Modin is not installed or installed in a version that is not " - "compatible with xgboost_ray (< 0.9.0).") + print( + "Modin is not installed or installed in a version that is not " + "compatible with xgboost_ray (< 0.9.0)." 
+ ) return # Import modin after initializing Ray @@ -56,40 +57,41 @@ def main(cpus_per_actor, num_actors): max_actor_restarts=0, gpus_per_actor=0, cpus_per_actor=cpus_per_actor, - num_actors=num_actors), + num_actors=num_actors, + ), verbose_eval=False, - num_boost_round=10) + num_boost_round=10, + ) model_path = "modin.xgb" bst.save_model(model_path) - print("Final training error: {:.4f}".format( - evals_result["train"]["error"][-1])) + print("Final training error: {:.4f}".format(evals_result["train"]["error"][-1])) if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( - "--address", - required=False, - type=str, - help="the address to use for Ray") + "--address", required=False, type=str, help="the address to use for Ray" + ) parser.add_argument( "--server-address", required=False, type=str, - help="Address of the remote server if using Ray Client.") + help="Address of the remote server if using Ray Client.", + ) parser.add_argument( "--cpus-per-actor", type=int, default=1, - help="Sets number of CPUs per xgboost training worker.") + help="Sets number of CPUs per xgboost training worker.", + ) parser.add_argument( "--num-actors", type=int, default=4, - help="Sets number of xgboost workers to use.") - parser.add_argument( - "--smoke-test", action="store_true", default=False, help="gpu") + help="Sets number of xgboost workers to use.", + ) + parser.add_argument("--smoke-test", action="store_true", default=False, help="gpu") args, _ = parser.parse_known_args() diff --git a/xgboost_ray/examples/simple_objectstore.py b/xgboost_ray/examples/simple_objectstore.py index 9d45b445..f60256de 100644 --- a/xgboost_ray/examples/simple_objectstore.py +++ b/xgboost_ray/examples/simple_objectstore.py @@ -2,10 +2,9 @@ import numpy as np import pandas as pd - import ray -from xgboost_ray import RayDMatrix, train, RayParams +from xgboost_ray import RayDMatrix, RayParams, train def main(cpus_per_actor, num_actors): @@ -44,40 +43,41 @@ def main(cpus_per_actor, num_actors): max_actor_restarts=0, gpus_per_actor=0, cpus_per_actor=cpus_per_actor, - num_actors=num_actors), + num_actors=num_actors, + ), verbose_eval=False, - num_boost_round=10) + num_boost_round=10, + ) model_path = "modin.xgb" bst.save_model(model_path) - print("Final training error: {:.4f}".format( - evals_result["train"]["error"][-1])) + print("Final training error: {:.4f}".format(evals_result["train"]["error"][-1])) if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( - "--address", - required=False, - type=str, - help="the address to use for Ray") + "--address", required=False, type=str, help="the address to use for Ray" + ) parser.add_argument( "--server-address", required=False, type=str, - help="Address of the remote server if using Ray Client.") + help="Address of the remote server if using Ray Client.", + ) parser.add_argument( "--cpus-per-actor", type=int, default=1, - help="Sets number of CPUs per xgboost training worker.") + help="Sets number of CPUs per xgboost training worker.", + ) parser.add_argument( "--num-actors", type=int, default=4, - help="Sets number of xgboost workers to use.") - parser.add_argument( - "--smoke-test", action="store_true", default=False, help="gpu") + help="Sets number of xgboost workers to use.", + ) + parser.add_argument("--smoke-test", action="store_true", default=False, help="gpu") args, _ = parser.parse_known_args() diff --git a/xgboost_ray/examples/simple_partitioned.py b/xgboost_ray/examples/simple_partitioned.py index 9a6bb7c7..ee6569ff 
100644 --- a/xgboost_ray/examples/simple_partitioned.py +++ b/xgboost_ray/examples/simple_partitioned.py @@ -1,13 +1,11 @@ import argparse -from sklearn import datasets -from sklearn.model_selection import train_test_split - import numpy as np - import ray +from sklearn import datasets +from sklearn.model_selection import train_test_split -from xgboost_ray import RayDMatrix, train, RayParams +from xgboost_ray import RayDMatrix, RayParams, train nc = 31 @@ -15,13 +13,13 @@ @ray.remote class AnActor: """We mimic a distributed DF by having several actors create - data which form the global DF. + data which form the global DF. """ @ray.method(num_returns=2) def genData(self, rank, nranks, nrows): """Generate global dataset and cut out local piece. - In real life each actor would of course directly create local data. + In real life each actor would of course directly create local data. """ # Load dataset data, labels = datasets.load_breast_cancer(return_X_y=True) @@ -32,12 +30,11 @@ def genData(self, rank, nranks, nrows): assert nrows <= train.shape[0] assert nc == train.shape[1] sz = nrows // nranks - return train[sz * rank:sz * (rank + 1)], ray.util.get_node_ip_address() + return train[sz * rank : sz * (rank + 1)], ray.util.get_node_ip_address() class Parted: - """Class exposing __partitioned__ - """ + """Class exposing __partitioned__""" def __init__(self, parted): self.__partitioned__ = parted @@ -46,23 +43,25 @@ def __init__(self, parted): def main(cpus_per_actor, num_actors): nr = 424 actors = [AnActor.remote() for _ in range(num_actors)] - parts = [ - actors[i].genData.remote(i, num_actors, nr) for i in range(num_actors) - ] + parts = [actors[i].genData.remote(i, num_actors, nr) for i in range(num_actors)] rowsperpart = nr // num_actors nr = rowsperpart * num_actors - parted = Parted({ - "shape": (nr, nc), - "partition_tiling": (num_actors, 1), - "get": lambda x: ray.get(x), - "partitions": {(i, 0): { - "start": (i * rowsperpart, 0), - "shape": (rowsperpart, nc), - "data": parts[i][0], - "location": [ray.get(parts[i][1])], + parted = Parted( + { + "shape": (nr, nc), + "partition_tiling": (num_actors, 1), + "get": lambda x: ray.get(x), + "partitions": { + (i, 0): { + "start": (i * rowsperpart, 0), + "shape": (rowsperpart, nc), + "data": parts[i][0], + "location": [ray.get(parts[i][1])], + } + for i in range(num_actors) + }, } - for i in range(num_actors)} - }) + ) yl = nc - 1 # Let's create DMatrix from our __partitioned__ structure @@ -86,40 +85,41 @@ def main(cpus_per_actor, num_actors): max_actor_restarts=0, gpus_per_actor=0, cpus_per_actor=cpus_per_actor, - num_actors=num_actors), + num_actors=num_actors, + ), verbose_eval=False, - num_boost_round=10) + num_boost_round=10, + ) model_path = "partitioned.xgb" bst.save_model(model_path) - print("Final training error: {:.4f}".format( - evals_result["train"]["error"][-1])) + print("Final training error: {:.4f}".format(evals_result["train"]["error"][-1])) if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( - "--address", - required=False, - type=str, - help="the address to use for Ray") + "--address", required=False, type=str, help="the address to use for Ray" + ) parser.add_argument( "--server-address", required=False, type=str, - help="Address of the remote server if using Ray Client.") + help="Address of the remote server if using Ray Client.", + ) parser.add_argument( "--cpus-per-actor", type=int, default=1, - help="Sets number of CPUs per xgboost training worker.") + help="Sets number of CPUs per xgboost 
training worker.", + ) parser.add_argument( "--num-actors", type=int, default=4, - help="Sets number of xgboost workers to use.") - parser.add_argument( - "--smoke-test", action="store_true", default=False, help="gpu") + help="Sets number of xgboost workers to use.", + ) + parser.add_argument("--smoke-test", action="store_true", default=False, help="gpu") args, _ = parser.parse_known_args() diff --git a/xgboost_ray/examples/simple_predict.py b/xgboost_ray/examples/simple_predict.py index 6989ddb1..8852e14d 100644 --- a/xgboost_ray/examples/simple_predict.py +++ b/xgboost_ray/examples/simple_predict.py @@ -1,18 +1,19 @@ import os -from sklearn import datasets - +import numpy as np import xgboost as xgb -from xgboost_ray import RayDMatrix, predict, RayParams +from sklearn import datasets -import numpy as np +from xgboost_ray import RayDMatrix, RayParams, predict def main(): if not os.path.exists("simple.xgb"): - raise ValueError("Model file not found: `simple.xgb`" - "\nFIX THIS by running `python `simple.py` first to " - "train the model.") + raise ValueError( + "Model file not found: `simple.xgb`" + "\nFIX THIS by running `python `simple.py` first to " + "train the model." + ) # Load dataset data, labels = datasets.load_breast_cancer(return_X_y=True) diff --git a/xgboost_ray/examples/simple_ray_dataset.py b/xgboost_ray/examples/simple_ray_dataset.py index 332871b2..8fee2930 100644 --- a/xgboost_ray/examples/simple_ray_dataset.py +++ b/xgboost_ray/examples/simple_ray_dataset.py @@ -2,10 +2,9 @@ import numpy as np import pandas as pd - import ray -from xgboost_ray import RayDMatrix, train, RayParams +from xgboost_ray import RayDMatrix, RayParams, train def main(cpus_per_actor, num_actors): @@ -53,40 +52,41 @@ def main(cpus_per_actor, num_actors): max_actor_restarts=0, gpus_per_actor=0, cpus_per_actor=cpus_per_actor, - num_actors=num_actors), + num_actors=num_actors, + ), verbose_eval=False, - num_boost_round=10) + num_boost_round=10, + ) model_path = "ray_datasets.xgb" bst.save_model(model_path) - print("Final training error: {:.4f}".format( - evals_result["train"]["error"][-1])) + print("Final training error: {:.4f}".format(evals_result["train"]["error"][-1])) if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( - "--address", - required=False, - type=str, - help="the address to use for Ray") + "--address", required=False, type=str, help="the address to use for Ray" + ) parser.add_argument( "--server-address", required=False, type=str, - help="Address of the remote server if using Ray Client.") + help="Address of the remote server if using Ray Client.", + ) parser.add_argument( "--cpus-per-actor", type=int, default=1, - help="Sets number of CPUs per xgboost training worker.") + help="Sets number of CPUs per xgboost training worker.", + ) parser.add_argument( "--num-actors", type=int, default=4, - help="Sets number of xgboost workers to use.") - parser.add_argument( - "--smoke-test", action="store_true", default=False, help="gpu") + help="Sets number of xgboost workers to use.", + ) + parser.add_argument("--smoke-test", action="store_true", default=False, help="gpu") args, _ = parser.parse_known_args() diff --git a/xgboost_ray/examples/simple_tune.py b/xgboost_ray/examples/simple_tune.py index 338018bd..9a3cd338 100644 --- a/xgboost_ray/examples/simple_tune.py +++ b/xgboost_ray/examples/simple_tune.py @@ -1,22 +1,20 @@ import argparse import os -import xgboost_ray -from sklearn import datasets -from sklearn.model_selection import train_test_split - import ray 
from ray import tune +from sklearn import datasets +from sklearn.model_selection import train_test_split -from xgboost_ray import train, RayDMatrix, RayParams +import xgboost_ray +from xgboost_ray import RayDMatrix, RayParams, train def train_breast_cancer(config, ray_params): # Load dataset data, labels = datasets.load_breast_cancer(return_X_y=True) # Split into train and test set - train_x, test_x, train_y, test_y = train_test_split( - data, labels, test_size=0.25) + train_x, test_x, train_y, test_y = train_test_split(data, labels, test_size=0.25) train_set = RayDMatrix(train_x, train_y) test_set = RayDMatrix(test_x, test_y) @@ -30,12 +28,12 @@ def train_breast_cancer(config, ray_params): evals_result=evals_result, ray_params=ray_params, verbose_eval=False, - num_boost_round=10) + num_boost_round=10, + ) model_path = "tuned.xgb" bst.save_model(model_path) - print("Final validation error: {:.4f}".format( - evals_result["eval"]["error"][-1])) + print("Final validation error: {:.4f}".format(evals_result["eval"]["error"][-1])) def main(cpus_per_actor, num_actors, num_samples): @@ -46,14 +44,15 @@ def main(cpus_per_actor, num_actors, num_samples): "eval_metric": ["logloss", "error"], "eta": tune.loguniform(1e-4, 1e-1), "subsample": tune.uniform(0.5, 1.0), - "max_depth": tune.randint(1, 9) + "max_depth": tune.randint(1, 9), } ray_params = RayParams( max_actor_restarts=1, gpus_per_actor=0, cpus_per_actor=cpus_per_actor, - num_actors=num_actors) + num_actors=num_actors, + ) analysis = tune.run( tune.with_parameters(train_breast_cancer, ray_params=ray_params), @@ -62,15 +61,17 @@ def main(cpus_per_actor, num_actors, num_samples): config=config, num_samples=num_samples, metric="eval-error", - mode="min") + mode="min", + ) # Load the best model checkpoint. best_bst = xgboost_ray.tune.load_model( - os.path.join(analysis.best_logdir, "tuned.xgb")) + os.path.join(analysis.best_logdir, "tuned.xgb") + ) best_bst.save_model("best_model.xgb") - accuracy = 1. - analysis.best_result["eval-error"] + accuracy = 1.0 - analysis.best_result["eval-error"] print(f"Best model parameters: {analysis.best_config}") print(f"Best model total accuracy: {accuracy:.4f}") @@ -78,30 +79,29 @@ def main(cpus_per_actor, num_actors, num_samples): if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( - "--address", - required=False, - type=str, - help="the address to use for Ray") + "--address", required=False, type=str, help="the address to use for Ray" + ) parser.add_argument( "--server-address", required=False, type=str, - help="Address of the remote server if using Ray Client.") + help="Address of the remote server if using Ray Client.", + ) parser.add_argument( "--cpus-per-actor", type=int, default=1, - help="Sets number of CPUs per XGBoost training worker.") + help="Sets number of CPUs per XGBoost training worker.", + ) parser.add_argument( "--num-actors", type=int, default=1, - help="Sets number of XGBoost workers to use.") + help="Sets number of XGBoost workers to use.", + ) parser.add_argument( - "--num-samples", - type=int, - default=4, - help="Number of samples to use for Tune.") + "--num-samples", type=int, default=4, help="Number of samples to use for Tune." 
+ ) parser.add_argument("--smoke-test", action="store_true", default=False) args, _ = parser.parse_known_args() diff --git a/xgboost_ray/examples/train_on_test_data.py b/xgboost_ray/examples/train_on_test_data.py index 74e4b319..16e731cc 100644 --- a/xgboost_ray/examples/train_on_test_data.py +++ b/xgboost_ray/examples/train_on_test_data.py @@ -3,7 +3,7 @@ import shutil import time -from xgboost_ray import train, RayDMatrix, RayParams +from xgboost_ray import RayDMatrix, RayParams, train from xgboost_ray.tests.utils import create_parquet_in_tempdir #### @@ -14,8 +14,7 @@ def main(fname, num_actors=2): - dtrain = RayDMatrix( - os.path.abspath(fname), label="labels", ignore=["partition"]) + dtrain = RayDMatrix(os.path.abspath(fname), label="labels", ignore=["partition"]) config = { "tree_method": "hist", @@ -31,13 +30,13 @@ def main(fname, num_actors=2): evals_result=evals_result, ray_params=RayParams(max_actor_restarts=1, num_actors=num_actors), num_boost_round=10, - evals=[(dtrain, "train")]) + evals=[(dtrain, "train")], + ) taken = time.time() - start print(f"TRAIN TIME TAKEN: {taken:.2f} seconds") bst.save_model("test_data.xgb") - print("Final training error: {:.4f}".format( - evals_result["train"]["error"][-1])) + print("Final training error: {:.4f}".format(evals_result["train"]["error"][-1])) if __name__ == "__main__": @@ -46,7 +45,8 @@ def main(fname, num_actors=2): "--smoke-test", action="store_true", default=False, - help="Finish quickly for testing") + help="Finish quickly for testing", + ) args = parser.parse_args() temp_dir, path = None, None @@ -56,11 +56,13 @@ def main(fname, num_actors=2): num_rows=1_000, num_features=4, num_classes=2, - num_partitions=2) + num_partitions=2, + ) else: path = os.path.join(os.path.dirname(__file__), "parted.parquet") import ray + ray.init() start = time.time() diff --git a/xgboost_ray/examples/train_with_ml_dataset.py b/xgboost_ray/examples/train_with_ml_dataset.py index ac30a72f..49a56834 100644 --- a/xgboost_ray/examples/train_with_ml_dataset.py +++ b/xgboost_ray/examples/train_with_ml_dataset.py @@ -5,7 +5,7 @@ from ray.util.data import read_parquet -from xgboost_ray import train, RayDMatrix, RayParams +from xgboost_ray import RayDMatrix, RayParams, train from xgboost_ray.tests.utils import create_parquet_in_tempdir #### @@ -34,13 +34,13 @@ def main(fname, num_actors=2): evals_result=evals_result, ray_params=RayParams(max_actor_restarts=1, num_actors=num_actors), num_boost_round=10, - evals=[(dtrain, "train")]) + evals=[(dtrain, "train")], + ) taken = time.time() - start print(f"TRAIN TIME TAKEN: {taken:.2f} seconds") bst.save_model("test_data.xgb") - print("Final training error: {:.4f}".format( - evals_result["train"]["error"][-1])) + print("Final training error: {:.4f}".format(evals_result["train"]["error"][-1])) if __name__ == "__main__": @@ -49,7 +49,8 @@ def main(fname, num_actors=2): "--smoke-test", action="store_true", default=False, - help="Finish quickly for testing") + help="Finish quickly for testing", + ) args = parser.parse_args() temp_dir, path = None, None @@ -59,11 +60,13 @@ def main(fname, num_actors=2): num_rows=1_000, num_features=4, num_classes=2, - num_partitions=2) + num_partitions=2, + ) else: path = os.path.join(os.path.dirname(__file__), "parted.parquet") import ray + ray.init() start = time.time() diff --git a/xgboost_ray/main.py b/xgboost_ray/main.py index 039a7b35..ed0daf7a 100644 --- a/xgboost_ray/main.py +++ b/xgboost_ray/main.py @@ -1,22 +1,21 @@ -import platform -from typing import Tuple, Dict, Any, List, 
Optional, Callable, Union, Sequence -from dataclasses import dataclass, field -from packaging.version import Version - import functools +import inspect import multiprocessing import os import pickle -import time +import platform import threading +import time import warnings -import inspect +from dataclasses import dataclass, field +from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union import numpy as np import pandas as pd +from packaging.version import Version +from xgboost.core import XGBoostError from xgboost_ray.xgb import xgboost as xgb -from xgboost.core import XGBoostError try: from xgboost.core import EarlyStopException @@ -29,28 +28,32 @@ class EarlyStopException(XGBoostError): # From xgboost>=1.7.0, rabit is replaced by a collective communicator try: from xgboost.collective import CommunicatorContext + rabit = None HAS_COLLECTIVE = True except ImportError: from xgboost import rabit # noqa + CommunicatorContext = None HAS_COLLECTIVE = False -from xgboost_ray.callback import DistributedCallback, \ - DistributedCallbackContainer -from xgboost_ray.compat import TrainingCallback, RabitTracker, LEGACY_CALLBACK +from xgboost_ray.callback import DistributedCallback, DistributedCallbackContainer +from xgboost_ray.compat import LEGACY_CALLBACK, RabitTracker, TrainingCallback try: import ray from ray import logger - from ray.exceptions import RayActorError, RayTaskError from ray.actor import ActorHandle + from ray.exceptions import RayActorError, RayTaskError from ray.util import get_node_ip_address, placement_group - from ray.util.annotations import PublicAPI, DeveloperAPI - from ray.util.placement_group import PlacementGroup, \ - remove_placement_group, get_current_placement_group - from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy + from ray.util.annotations import DeveloperAPI, PublicAPI + from ray.util.placement_group import ( + PlacementGroup, + get_current_placement_group, + remove_placement_group, + ) from ray.util.queue import Queue + from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy from xgboost_ray.util import Event, MultiActorTask, force_on_current_node @@ -70,14 +73,27 @@ def inner_f(*args, **kwargs): DeveloperAPI = PublicAPI RAY_INSTALLED = False -from xgboost_ray.tune import _try_add_tune_callback, _get_tune_resources, \ - is_session_enabled - -from xgboost_ray.matrix import RayDMatrix, combine_data, \ - RayDeviceQuantileDMatrix, RayDataIter, concat_dataframes, \ - LEGACY_MATRIX, QUANTILE_AVAILABLE, RayQuantileDMatrix -from xgboost_ray.session import init_session, put_queue, \ - set_session_queue, get_rabit_rank +from xgboost_ray.matrix import ( + LEGACY_MATRIX, + QUANTILE_AVAILABLE, + RayDataIter, + RayDeviceQuantileDMatrix, + RayDMatrix, + RayQuantileDMatrix, + combine_data, + concat_dataframes, +) +from xgboost_ray.session import ( + get_rabit_rank, + init_session, + put_queue, + set_session_queue, +) +from xgboost_ray.tune import ( + _get_tune_resources, + _try_add_tune_callback, + is_session_enabled, +) def _get_environ(item: str, old_val: Any): @@ -137,7 +153,8 @@ def __getattribute__(self, item): f"(version {xgboost_version}). While we try to support " f"older XGBoost versions, please note that this library is only " f"fully tested and supported for XGBoost >= 1.4. Please consider " - f"upgrading your XGBoost version (`pip install -U xgboost`).") + f"upgrading your XGBoost version (`pip install -U xgboost`)." 
+) # XGBoost Version for comparisons XGBOOST_VERSION = Version(xgboost_version) @@ -146,18 +163,21 @@ class RayXGBoostTrainingError(RuntimeError): """Raised from RayXGBoostActor.train() when the local xgb.train function did not complete.""" + pass class RayXGBoostTrainingStopped(RuntimeError): """Raised from RayXGBoostActor.train() when training was deliberately stopped.""" + pass class RayXGBoostActorAvailable(RuntimeError): """Raised from `_update_scheduled_actor_states()` when new actors become available in elastic training""" + pass @@ -165,7 +185,8 @@ def _assert_ray_support(): if not RAY_INSTALLED: raise ImportError( "Ray needs to be installed in order to use this module. " - "Try: `pip install ray`") + "Try: `pip install ray`" + ) def _maybe_print_legacy_warning(): @@ -236,8 +257,7 @@ def _start_rabit_tracker(num_workers: int): env.update(rabit_tracker.worker_envs()) rabit_tracker.start(num_workers) - logger.debug( - f"Started Rabit tracker process with PID {rabit_tracker.thread.pid}") + logger.debug(f"Started Rabit tracker process with PID {rabit_tracker.thread.pid}") return rabit_tracker.thread, env @@ -253,8 +273,8 @@ class _RabitContextBase: Rabit tracker. Args: - actor_id (str): Unique actor ID - args (dict): Arguments for Rabit initialisation. These are + actor_id: Unique actor ID + args: Arguments for Rabit initialisation. These are environment variables to configure Rabit clients. """ @@ -274,8 +294,7 @@ class _RabitContext(_RabitContextBase, CommunicatorContext): class _RabitContext(_RabitContextBase): def __init__(self, actor_id: int, args: dict): super().__init__(actor_id, args) - self._list_args = [("%s=%s" % item).encode() - for item in self.args.items()] + self._list_args = [("%s=%s" % item).encode() for item in self.args.items()] def __enter__(self): xgb.rabit.init(self._list_args) @@ -305,8 +324,10 @@ def _ray_get_cluster_cpus(): def _get_min_node_cpus(): max_node_cpus = min( - node.get("Resources", {}).get("CPU", 0.0) for node in ray.nodes() - if node.get("Alive", False)) + node.get("Resources", {}).get("CPU", 0.0) + for node in ray.nodes() + if node.get("Alive", False) + ) return max_node_cpus if max_node_cpus > 0.0 else 1.0 @@ -396,7 +417,8 @@ def _get_dmatrix(data: RayDMatrix, param: Dict) -> xgb.DMatrix: if not LEGACY_MATRIX: matrix.set_info( - label_lower_bound=ll, label_upper_bound=lu, feature_weights=fw) + label_lower_bound=ll, label_upper_bound=lu, feature_weights=fw + ) data.update_matrix_properties(matrix) return matrix @@ -408,25 +430,26 @@ class RayParams: """Parameters to configure Ray-specific behavior. Args: - num_actors (int): Number of parallel Ray actors. - cpus_per_actor (int): Number of CPUs to be used per Ray actor. - gpus_per_actor (int): Number of GPUs to be used per Ray actor. - resources_per_actor (Optional[Dict]): Dict of additional resources + num_actors: Number of parallel Ray actors. + cpus_per_actor: Number of CPUs to be used per Ray actor. + gpus_per_actor: Number of GPUs to be used per Ray actor. + resources_per_actor: Dict of additional resources required per Ray actor. - elastic_training (bool): If True, training will continue with + elastic_training: If True, training will continue with fewer actors if an actor fails. Default False. - max_failed_actors (int): If `elastic_training` is True, this + max_failed_actors: If `elastic_training` is True, this specifies the maximum number of failed actors with which we still continue training.
- max_actor_restarts (int): Number of retries when Ray actors fail. + max_actor_restarts: Number of retries when Ray actors fail. Defaults to 0 (no retries). Set to -1 for unlimited retries. - checkpoint_frequency (int): How often to save checkpoints. Defaults + checkpoint_frequency: How often to save checkpoints. Defaults to ``5`` (every 5th iteration). - verbose (bool): Whether to output Ray-specific info messages + verbose: Whether to output Ray-specific info messages during training/prediction. - placement_options (dict): Optional kwargs to pass to + placement_options: Optional kwargs to pass to ``PlacementGroupFactory`` in ``get_tune_resources()``. """ + # Actor scheduling num_actors: int = 0 cpus_per_actor: int = 0 @@ -448,14 +471,16 @@ class RayParams: def get_tune_resources(self): """Return the resources to use for xgboost_ray training with Tune.""" if self.cpus_per_actor <= 0 or self.num_actors <= 0: - raise ValueError("num_actors and cpus_per_actor both must be " - "greater than 0.") + raise ValueError( + "num_actors and cpus_per_actor both must be " "greater than 0." + ) return _get_tune_resources( num_actors=self.num_actors, cpus_per_actor=self.cpus_per_actor, gpus_per_actor=max(0, self.gpus_per_actor), resources_per_actor=self.resources_per_actor, - placement_options=self.placement_options) + placement_options=self.placement_options, + ) @dataclass @@ -464,8 +489,7 @@ class _Checkpoint: value: Optional[bytes] = None -def _validate_ray_params(ray_params: Union[None, RayParams, dict]) \ - -> RayParams: +def _validate_ray_params(ray_params: Union[None, RayParams, dict]) -> RayParams: if ray_params is None: ray_params = RayParams() elif isinstance(ray_params, dict): @@ -475,17 +499,20 @@ def _validate_ray_params(ray_params: Union[None, RayParams, dict]) \ f"`ray_params` must be a `RayParams` instance, a dict, or None, " f"but it was {type(ray_params)}." f"\nFIX THIS preferably by passing a `RayParams` instance as " - f"the `ray_params` parameter.") + f"the `ray_params` parameter." + ) if ray_params.num_actors <= 0: raise ValueError( "The `num_actors` parameter is set to 0. Please always specify " "the number of distributed actors you want to use." "\nFIX THIS by passing a `RayParams(num_actors=X)` argument " - "to your call to xgboost_ray.") + "to your call to xgboost_ray." + ) elif ray_params.num_actors < 2: warnings.warn( f"`num_actors` in `ray_params` is smaller than 2 " - f"({ray_params.num_actors}). XGBoost will NOT be distributed!") + f"({ray_params.num_actors}). XGBoost will NOT be distributed!" + ) if ray_params.verbose is None: # In Tune sessions, reduce verbosity ray_params.verbose = not is_session_enabled() @@ -505,22 +532,23 @@ class RayXGBoostActor: sends the checkpoint back to the driver. Args: - rank (int): Rank of the actor. Must be ``0 <= rank < num_actors``. - num_actors (int): Total number of actors. - queue (Queue): Ray queue to communicate with main process. - checkpoint_frequency (int): How often to store checkpoints. Defaults + rank: Rank of the actor. Must be ``0 <= rank < num_actors``. + num_actors: Total number of actors. + queue: Ray queue to communicate with main process. + checkpoint_frequency: How often to store checkpoints. Defaults to ``5``, saving checkpoints every 5 boosting rounds. 
""" def __init__( - self, - rank: int, - num_actors: int, - queue: Optional[Queue] = None, - stop_event: Optional[Event] = None, - checkpoint_frequency: int = 5, - distributed_callbacks: Optional[List[DistributedCallback]] = None): + self, + rank: int, + num_actors: int, + queue: Optional[Queue] = None, + stop_event: Optional[Event] = None, + checkpoint_frequency: int = 5, + distributed_callbacks: Optional[List[DistributedCallback]] = None, + ): self.queue = queue init_session(rank, self.queue) @@ -535,7 +563,8 @@ def __init__( self._stop_event = stop_event self._distributed_callbacks = DistributedCallbackContainer( - distributed_callbacks) + distributed_callbacks + ) self._distributed_callbacks.on_init(self) _set_omp_num_threads() @@ -565,8 +594,7 @@ def _save_checkpoint_callback(self): class _SaveInternalCheckpointCallback(TrainingCallback): def after_iteration(self, model, epoch, evals_log): - if get_rabit_rank() == 0 and \ - epoch % this.checkpoint_frequency == 0: + if get_rabit_rank() == 0 and epoch % this.checkpoint_frequency == 0: put_queue(_Checkpoint(epoch, pickle.dumps(model))) def after_training(self, model): @@ -587,8 +615,10 @@ def _stop_callback(self): class _StopCallback(TrainingCallback): def after_iteration(self, model, epoch, evals_log): try: - if this._stop_event.is_set() or \ - this._get_stop_event() is not initial_stop_event: + if ( + this._stop_event.is_set() + or this._get_stop_event() is not initial_stop_event + ): if LEGACY_CALLBACK: raise EarlyStopException(epoch) # Returning True stops training @@ -618,10 +648,16 @@ def load_data(self, data: RayDMatrix): self._distributed_callbacks.after_data_loading(self, data) - def train(self, rabit_args: List[str], return_bst: bool, - params: Dict[str, Any], dtrain: RayDMatrix, - evals: Tuple[RayDMatrix, str], *args, - **kwargs) -> Dict[str, Any]: + def train( + self, + rabit_args: List[str], + return_bst: bool, + params: Dict[str, Any], + dtrain: RayDMatrix, + evals: Tuple[RayDMatrix, str], + *args, + **kwargs, + ) -> Dict[str, Any]: self._distributed_callbacks.before_train(self) num_threads = _set_omp_num_threads() @@ -673,12 +709,14 @@ def _train(): "Training data has no label set. Please make sure " "to set the `label` argument when initializing " "`RayDMatrix()` for data you would like " - "to train on.") + "to train on." + ) local_evals = [] for deval, name in evals: - local_evals.append((_get_dmatrix( - deval, self._data[deval]), name)) + local_evals.append( + (_get_dmatrix(deval, self._data[deval]), name) + ) if LEGACY_CALLBACK: for xgb_callback in kwargs.get("callbacks", []): if isinstance(xgb_callback, TrainingCallback): @@ -690,18 +728,21 @@ def _train(): *args, evals=local_evals, evals_result=evals_result, - **kwargs) + **kwargs, + ) if LEGACY_CALLBACK: for xgb_callback in kwargs.get("callbacks", []): if isinstance(xgb_callback, TrainingCallback): xgb_callback.after_training(bst) - result_dict.update({ - "bst": bst, - "evals_result": evals_result, - "train_n": self._local_n[dtrain] - }) + result_dict.update( + { + "bst": bst, + "evals_result": evals_result, + "train_n": self._local_n[dtrain], + } + ) except EarlyStopException: # Usually this should be caught by XGBoost core. # Silent fail, will be raised as RayXGBoostTrainingStopped. 
@@ -755,8 +796,13 @@ class _RemoteRayXGBoostActor(RayXGBoostActor): class _PrepareActorTask(MultiActorTask): - def __init__(self, actor: ActorHandle, queue: Queue, stop_event: Event, - load_data: List[RayDMatrix]): + def __init__( + self, + actor: ActorHandle, + queue: Queue, + stop_event: Event, + load_data: List[RayDMatrix], + ): futures = [] futures.append(actor.set_queue.remote(queue)) futures.append(actor.set_stop_event.remote(stop_event)) @@ -766,8 +812,9 @@ def __init__(self, actor: ActorHandle, queue: Queue, stop_event: Event, super(_PrepareActorTask, self).__init__(futures) -def _autodetect_resources(ray_params: RayParams, - use_tree_method: bool = False) -> Tuple[int, int]: +def _autodetect_resources( + ray_params: RayParams, use_tree_method: bool = False +) -> Tuple[int, int]: gpus_per_actor = ray_params.gpus_per_actor cpus_per_actor = ray_params.cpus_per_actor @@ -786,20 +833,22 @@ def _autodetect_resources(ray_params: RayParams, 1, min( int(_get_min_node_cpus() or 1), - int(cluster_cpus // ray_params.num_actors))) + int(cluster_cpus // ray_params.num_actors), + ), + ) return cpus_per_actor, gpus_per_actor def _create_actor( - rank: int, - num_actors: int, - num_cpus_per_actor: int, - num_gpus_per_actor: int, - resources_per_actor: Optional[Dict] = None, - placement_group: Optional[PlacementGroup] = None, - queue: Optional[Queue] = None, - checkpoint_frequency: int = 5, - distributed_callbacks: Optional[Sequence[DistributedCallback]] = None + rank: int, + num_actors: int, + num_cpus_per_actor: int, + num_gpus_per_actor: int, + resources_per_actor: Optional[Dict] = None, + placement_group: Optional[PlacementGroup] = None, + queue: Optional[Queue] = None, + checkpoint_frequency: int = 5, + distributed_callbacks: Optional[Sequence[DistributedCallback]] = None, ) -> ActorHandle: # Send DEFAULT_PG here, which changed in Ray >= 1.5.0 # If we send `None`, this will ignore the parent placement group and @@ -811,14 +860,16 @@ def _create_actor( scheduling_strategy=PlacementGroupSchedulingStrategy( placement_group=placement_group or DEFAULT_PG, placement_group_capture_child_tasks=True, - )) + ), + ) return actor_cls.remote( rank=rank, num_actors=num_actors, queue=queue, checkpoint_frequency=checkpoint_frequency, - distributed_callbacks=distributed_callbacks) + distributed_callbacks=distributed_callbacks, + ) def _trigger_data_load(actor, dtrain, evals): @@ -828,8 +879,7 @@ def _trigger_data_load(actor, dtrain, evals): return wait_load -def _handle_queue(queue: Queue, checkpoint: _Checkpoint, - callback_returns: Dict): +def _handle_queue(queue: Queue, checkpoint: _Checkpoint, callback_returns: Dict): """Handle results obtained from workers through the remote Queue object. 
Remote actors supply these results via the @@ -852,13 +902,14 @@ def _handle_queue(queue: Queue, checkpoint: _Checkpoint, callback_returns[actor_rank].append(item) -def _shutdown(actors: List[ActorHandle], - pending_actors: Optional[Dict[int, Tuple[ - ActorHandle, _PrepareActorTask]]] = None, - queue: Optional[Queue] = None, - event: Optional[Event] = None, - placement_group: Optional[PlacementGroup] = None, - force: bool = False): +def _shutdown( + actors: List[ActorHandle], + pending_actors: Optional[Dict[int, Tuple[ActorHandle, _PrepareActorTask]]] = None, + queue: Optional[Queue] = None, + event: Optional[Event] = None, + placement_group: Optional[PlacementGroup] = None, + force: bool = False, +): alive_actors = [a for a in actors if a is not None] if pending_actors: alive_actors += [a for (a, _) in pending_actors.values()] @@ -884,16 +935,18 @@ def _shutdown(actors: List[ActorHandle], remove_placement_group(placement_group) -def _create_placement_group(cpus_per_actor, gpus_per_actor, - resources_per_actor, num_actors, strategy): +def _create_placement_group( + cpus_per_actor, gpus_per_actor, resources_per_actor, num_actors, strategy +): resources_per_bundle = {"CPU": cpus_per_actor, "GPU": gpus_per_actor} - extra_resources_per_bundle = {} if resources_per_actor is None else \ - resources_per_actor + extra_resources_per_bundle = ( + {} if resources_per_actor is None else resources_per_actor + ) # Create placement group for training worker colocation. - bundles = [{ - **resources_per_bundle, - **extra_resources_per_bundle - } for _ in range(num_actors)] + bundles = [ + {**resources_per_bundle, **extra_resources_per_bundle} + for _ in range(num_actors) + ] pg = placement_group(bundles, strategy=strategy) # Wait for placement group to get created. logger.debug("Waiting for placement group to start.") @@ -909,7 +962,8 @@ def _create_placement_group(cpus_per_actor, gpus_per_actor, f"available: {ray.available_resources()}, resources requested " f"by the placement group: {pg.bundle_specs}. " "You can change the timeout by setting the " - "RXGB_PLACEMENT_GROUP_TIMEOUT_S environment variable.") + "RXGB_PLACEMENT_GROUP_TIMEOUT_S environment variable." + ) return pg @@ -922,10 +976,12 @@ def _create_communication_processes(added_tune_callback: bool = False): # Also, if we are specifically in Tune, let's # ensure that we force Queue and # StopEvent onto same bundle as the Trainable. - placement_option.update({ - "placement_group": current_pg, - "placement_group_bundle_index": 0 if added_tune_callback else -1 - }) + placement_option.update( + { + "placement_group": current_pg, + "placement_group_bundle_index": 0 if added_tune_callback else -1, + } + ) else: # Create Queue and Event actors and make sure to colocate with # driver node. @@ -936,8 +992,7 @@ def _create_communication_processes(added_tune_callback: bool = False): return queue, stop_event -def _validate_kwargs_for_func(kwargs: Dict[str, Any], func: Callable, - func_name: str): +def _validate_kwargs_for_func(kwargs: Dict[str, Any], func: Callable, func_name: str): """Raise exception if kwargs are not valid for a given function.""" sig = inspect.signature(func) try: @@ -949,7 +1004,8 @@ def _validate_kwargs_for_func(kwargs: Dict[str, Any], func: Callable, raise TypeError( f"Got invalid keyword arguments to be passed to `{func_name}`. 
" - f"Please check these arguments: {invalid_kwargs}") from e + f"Please check these arguments: {invalid_kwargs}" + ) from e @dataclass @@ -961,7 +1017,7 @@ class _TrainingState: checkpoint: _Checkpoint additional_results: Dict - training_started_at: float = 0. + training_started_at: float = 0.0 placement_group: Optional[PlacementGroup] = None @@ -970,19 +1026,22 @@ class _TrainingState: # Last time we checked resources to schedule new actors last_resource_check_at: float = 0 pending_actors: Dict[int, Tuple[ActorHandle, _PrepareActorTask]] = field( - default_factory=dict) + default_factory=dict + ) restart_training_at: Optional[float] = None -def _train(params: Dict, - dtrain: RayDMatrix, - *args, - evals=(), - ray_params: RayParams, - cpus_per_actor: int, - gpus_per_actor: int, - _training_state: _TrainingState, - **kwargs) -> Tuple[xgb.Booster, Dict, Dict]: +def _train( + params: Dict, + dtrain: RayDMatrix, + *args, + evals=(), + ray_params: RayParams, + cpus_per_actor: int, + gpus_per_actor: int, + _training_state: _TrainingState, + **kwargs, +) -> Tuple[xgb.Booster, Dict, Dict]: """This is the local train function wrapped by :func:`train() `. This function can be thought of one invocation of a multi-actor xgboost @@ -995,8 +1054,11 @@ def _train(params: Dict, errors occur. It is called more than once if errors occurred (e.g. an actor died) and failure handling is enabled. """ - from xgboost_ray.elastic import _maybe_schedule_new_actors, \ - _update_scheduled_actor_states, _get_actor_alive_status + from xgboost_ray.elastic import ( + _get_actor_alive_status, + _maybe_schedule_new_actors, + _update_scheduled_actor_states, + ) # Do not modify original parameters params = params.copy() @@ -1006,11 +1068,13 @@ def _train(params: Dict, if "nthread" in params or "n_jobs" in params: if ("nthread" in params and params["nthread"] > cpus_per_actor) or ( - "n_jobs" in params and params["n_jobs"] > cpus_per_actor): + "n_jobs" in params and params["n_jobs"] > cpus_per_actor + ): raise ValueError( "Specified number of threads greater than number of CPUs. " "\nFIX THIS by passing a lower value for the `nthread` " - "parameter or a higher number for `cpus_per_actor`.") + "parameter or a higher number for `cpus_per_actor`." + ) else: params["nthread"] = cpus_per_actor params["n_jobs"] = cpus_per_actor @@ -1038,8 +1102,8 @@ def handle_actor_failure(actor_id): for i in list(_training_state.failed_actor_ranks): if _training_state.actors[i] is not None: raise RuntimeError( - f"Trying to create actor with rank {i}, but it already " - f"exists.") + f"Trying to create actor with rank {i}, but it already " f"exists." + ) actor = _create_actor( rank=i, num_actors=ray_params.num_actors, @@ -1049,7 +1113,8 @@ def handle_actor_failure(actor_id): placement_group=_training_state.placement_group, queue=_training_state.queue, checkpoint_frequency=ray_params.checkpoint_frequency, - distributed_callbacks=ray_params.distributed_callbacks) + distributed_callbacks=ray_params.distributed_callbacks, + ) # Set actor entry in our list _training_state.actors[i] = actor # Remove from this set so it is not created again @@ -1058,9 +1123,11 @@ def handle_actor_failure(actor_id): alive_actors = sum(1 for a in _training_state.actors if a is not None) - maybe_log(f"[RayXGBoost] Created {newly_created} new actors " - f"({alive_actors} total actors). Waiting until actors " - f"are ready for training.") + maybe_log( + f"[RayXGBoost] Created {newly_created} new actors " + f"({alive_actors} total actors). 
Waiting until actors " + f"are ready for training." + ) # For distributed datasets (e.g. Modin), this will initialize # (and fix) the assignment of data shards to actor ranks @@ -1080,7 +1147,9 @@ def handle_actor_failure(actor_id): # Maybe we got a new Event actor, so send it to all actors. stop_event=_training_state.stop_event, # Trigger data loading - load_data=load_data) for actor in _training_state.actors + load_data=load_data, + ) + for actor in _training_state.actors if actor is not None ] @@ -1092,8 +1161,10 @@ def handle_actor_failure(actor_id): while not all(ready_states): if time.time() >= last_status + ENV.STATUS_FREQUENCY_S: wait_time = time.time() - start_wait - logger.info(f"Waiting until actors are ready " - f"({wait_time:.0f} seconds passed).") + logger.info( + f"Waiting until actors are ready " + f"({wait_time:.0f} seconds passed)." + ) last_status = time.time() time.sleep(0.1) ready_states = [task.is_ready() for task in prepare_actor_tasks] @@ -1117,34 +1188,27 @@ def handle_actor_failure(actor_id): logger.error( - "Trying to load continue from checkpoint, but the checkpoint" - "indicates training already finished. Returning last" - "checkpointed model instead.") + "Trying to continue from a checkpoint, but the checkpoint " + "indicates training already finished. Returning the last " + "checkpointed model instead." + ) return kwargs["xgb_model"], {}, _training_state.additional_results # The callback_returns dict contains actor-rank indexed lists of # results obtained through the `put_queue` function, usually # sent via callbacks. - callback_returns = _training_state.additional_results.get( - "callback_returns") + callback_returns = _training_state.additional_results.get("callback_returns") if callback_returns is None: callback_returns = [list() for _ in range(len(_training_state.actors))] - _training_state.additional_results[ "callback_returns"] = callback_returns + _training_state.additional_results["callback_returns"] = callback_returns _training_state.training_started_at = time.time() # Trigger the train function - live_actors = [ actor for actor in _training_state.actors if actor is not None ] + live_actors = [actor for actor in _training_state.actors if actor is not None] training_futures = [ actor.train.remote( - rabit_args, - i == 0, # return_bst - params, - dtrain, - evals, - *args, - **kwargs) for i, actor in enumerate(live_actors) + rabit_args, i == 0, params, dtrain, evals, *args, **kwargs # return_bst + ) + for i, actor in enumerate(live_actors) ] # Failure handling loop. Here we wait until all training tasks finished. @@ -1169,29 +1233,33 @@ def handle_actor_failure(actor_id): _handle_queue( queue=_training_state.queue, checkpoint=_training_state.checkpoint, - callback_returns=callback_returns) + callback_returns=callback_returns, + ) - if ray_params.elastic_training \ and not ENV.ELASTIC_RESTART_DISABLED: + if ray_params.elastic_training and not ENV.ELASTIC_RESTART_DISABLED: _maybe_schedule_new_actors( training_state=_training_state, num_cpus_per_actor=cpus_per_actor, num_gpus_per_actor=gpus_per_actor, resources_per_actor=ray_params.resources_per_actor, ray_params=ray_params, - load_data=load_data) + load_data=load_data, + ) # This may raise RayXGBoostActorAvailable _update_scheduled_actor_states(_training_state) if time.time() >= last_status + ENV.STATUS_FREQUENCY_S: wait_time = time.time() - start_wait - logger.info(f"Training in progress " - f"({wait_time:.0f} seconds since last restart).") + logger.info( + f"Training in progress " + f"({wait_time:.0f} seconds since last restart)."
+ ) last_status = time.time() ready, not_ready = ray.wait( - not_ready, num_returns=len(not_ready), timeout=1) + not_ready, num_returns=len(not_ready), timeout=1 + ) ray.get(ready) # Get items from queue one last time @@ -1201,7 +1269,8 @@ def handle_actor_failure(actor_id): _handle_queue( queue=_training_state.queue, checkpoint=_training_state.checkpoint, - callback_returns=callback_returns) + callback_returns=callback_returns, + ) # The inner loop should catch all exceptions except Exception as exc: @@ -1232,8 +1301,7 @@ def handle_actor_failure(actor_id): evals_result = all_results[0]["evals_result"] if callback_returns: - _training_state.additional_results[ - "callback_returns"] = callback_returns + _training_state.additional_results["callback_returns"] = callback_returns total_n = sum(res["train_n"] or 0 for res in all_results) @@ -1244,17 +1312,17 @@ def handle_actor_failure(actor_id): @PublicAPI(stability="beta") def train( - params: Dict, - dtrain: RayDMatrix, - num_boost_round: int = 10, - *args, - evals: Union[List[Tuple[RayDMatrix, str]], Tuple[RayDMatrix, str]] = ( - ), - evals_result: Optional[Dict] = None, - additional_results: Optional[Dict] = None, - ray_params: Union[None, RayParams, Dict] = None, - _remote: Optional[bool] = None, - **kwargs) -> xgb.Booster: + params: Dict, + dtrain: RayDMatrix, + num_boost_round: int = 10, + *args, + evals: Union[List[Tuple[RayDMatrix, str]], Tuple[RayDMatrix, str]] = (), + evals_result: Optional[Dict] = None, + additional_results: Optional[Dict] = None, + ray_params: Union[None, RayParams, Dict] = None, + _remote: Optional[bool] = None, + **kwargs, +) -> xgb.Booster: """Distributed XGBoost training via Ray. This function will connect to a Ray cluster, create ``num_actors`` @@ -1287,16 +1355,15 @@ def train( Third, if none of the above is the case, training is aborted. Args: - params (Dict): parameter dict passed to ``xgboost.train()`` - dtrain (RayDMatrix): Data object containing the training data. - evals (Union[List[Tuple[RayDMatrix, str]], Tuple[RayDMatrix, str]]): - ``evals`` tuple passed to ``xgboost.train()``. - evals_result (Optional[Dict]): Dict to store evaluation results in. - additional_results (Optional[Dict]): Dict to store additional results. - ray_params (Union[None, RayParams, Dict]): Parameters to configure + params: parameter dict passed to ``xgboost.train()`` + dtrain: Data object containing the training data. + evals: ``evals`` tuple passed to ``xgboost.train()``. + evals_result: Dict to store evaluation results in. + additional_results: Dict to store additional results. + ray_params: Parameters to configure Ray-specific behavior. See :class:`RayParams` for a list of valid configuration parameters. - _remote (bool): Whether to run the driver process in a remote + _remote: Whether to run the driver process in a remote function. This is enabled by default in Ray client mode. **kwargs: Keyword arguments will be passed to the local `xgb.train()` calls. @@ -1306,17 +1373,18 @@ def train( os.environ.setdefault("RAY_IGNORE_UNHANDLED_ERRORS", "1") if platform.system() == "Windows": - raise RuntimeError("xgboost-ray training currently does not support " - "Windows.") + raise RuntimeError( + "xgboost-ray training currently does not support " "Windows." + ) if xgb is None: raise ImportError( "xgboost package is not installed. XGBoost-Ray WILL NOT WORK. " - "FIX THIS by running `pip install \"xgboost-ray\"`.") + 'FIX THIS by running `pip install "xgboost-ray"`.' 
+ ) if _remote is None: - _remote = _is_client_connected() and \ - not is_session_enabled() + _remote = _is_client_connected() and not is_session_enabled() if not ray.is_initialized(): ray.init() @@ -1332,7 +1400,8 @@ def _wrapped(*args, **kwargs): num_boost_round=num_boost_round, evals_result=_evals_result, additional_results=_additional_results, - **kwargs) + **kwargs, + ) return bst, _evals_result, _additional_results # Make sure that train is called on the server node. @@ -1347,7 +1416,8 @@ def _wrapped(*args, **kwargs): ray_params=ray_params, _remote=False, **kwargs, - )) + ) + ) if isinstance(evals_result, dict): evals_result.update(train_evals_result) if isinstance(additional_results, dict): @@ -1362,8 +1432,11 @@ def _wrapped(*args, **kwargs): ray_params = _validate_ray_params(ray_params) - max_actor_restarts = ray_params.max_actor_restarts \ - if ray_params.max_actor_restarts >= 0 else float("inf") + max_actor_restarts = ( + ray_params.max_actor_restarts + if ray_params.max_actor_restarts >= 0 + else float("inf") + ) _assert_ray_support() if not isinstance(dtrain, RayDMatrix): @@ -1371,16 +1444,21 @@ def _wrapped(*args, **kwargs): "The `dtrain` argument passed to `train()` is not a RayDMatrix, " "but of type {}. " "\nFIX THIS by instantiating a RayDMatrix first: " - "`dtrain = RayDMatrix(data=data, label=label)`.".format( - type(dtrain))) + "`dtrain = RayDMatrix(data=data, label=label)`.".format(type(dtrain)) + ) added_tune_callback = _try_add_tune_callback(kwargs) # Tune currently does not support elastic training. - if added_tune_callback and ray_params.elastic_training and not bool( - os.getenv("RXGB_ALLOW_ELASTIC_TUNE", "0")): - raise ValueError("Elastic Training cannot be used with Ray Tune. " - "Please disable elastic_training in RayParams in " - "order to use xgboost_ray with Tune.") + if ( + added_tune_callback + and ray_params.elastic_training + and not bool(os.getenv("RXGB_ALLOW_ELASTIC_TUNE", "0")) + ): + raise ValueError( + "Elastic Training cannot be used with Ray Tune. " + "Please disable elastic_training in RayParams in " + "order to use xgboost_ray with Tune." + ) if added_tune_callback or get_current_placement_group(): # Don't autodetect resources when used with Tune. @@ -1391,18 +1469,19 @@ def _wrapped(*args, **kwargs): ray_params=ray_params, use_tree_method="tree_method" in params and params["tree_method"] is not None - and params["tree_method"].startswith("gpu")) + and params["tree_method"].startswith("gpu"), + ) tree_method = params.get("tree_method", "auto") or "auto" # preemptively raise exceptions with bad params if tree_method == "exact": - raise ValueError( - "`exact` tree method doesn't support distributed training.") + raise ValueError("`exact` tree method doesn't support distributed training.") if params.get("updater", None) == "grow_colmaker": raise ValueError( - "`grow_colmaker` updater doesn't support distributed training.") + "`grow_colmaker` updater doesn't support distributed training." + ) if gpus_per_actor > 0 and not tree_method.startswith("gpu_"): warnings.warn( @@ -1410,31 +1489,37 @@ def _wrapped(*args, **kwargs): f"tree method is set to `{tree_method}`. Thus, GPUs will " f"currently not be used. To enable GPUs usage, please set the " f"`tree_method` to a GPU-compatible option, " - f"e.g. `gpu_hist`.") + f"e.g. `gpu_hist`." + ) if gpus_per_actor == 0 and cpus_per_actor == 0: - raise ValueError("cpus_per_actor and gpus_per_actor both cannot be " - "0. 
Are you sure your cluster has CPUs available?") + raise ValueError( + "cpus_per_actor and gpus_per_actor both cannot be " + "0. Are you sure your cluster has CPUs available?" + ) if ray_params.elastic_training and ray_params.max_failed_actors == 0: raise ValueError( "Elastic training enabled but the maximum number of failed " "actors is set to 0. This means that elastic training is " "effectively disabled. Please set `RayParams.max_failed_actors` " - "to something larger than 0 to enable elastic training.") + "to something larger than 0 to enable elastic training." + ) if ray_params.elastic_training and ray_params.max_actor_restarts == 0: raise ValueError( "Elastic training enabled but the maximum number of actor " "restarts is set to 0. This means that elastic training is " "effectively disabled. Please set `RayParams.max_actor_restarts` " - "to something larger than 0 to enable elastic training.") + "to something larger than 0 to enable elastic training." + ) if not dtrain.has_label: raise ValueError( "Training data has no label set. Please make sure to set " "the `label` argument when initializing `RayDMatrix()` " - "for data you would like to train on.") + "for data you would like to train on." + ) if not dtrain.loaded and not dtrain.distributed: dtrain.load_data(ray_params.num_actors) @@ -1444,7 +1529,8 @@ def _wrapped(*args, **kwargs): raise ValueError( "Evaluation data has no label set. Please make sure to set " "the `label` argument when initializing `RayDMatrix()` " - "for data you would like to evaluate on.") + "for data you would like to evaluate on." + ) if not deval.loaded and not deval.distributed: deval.load_data(ray_params.num_actors) @@ -1471,22 +1557,25 @@ def _wrapped(*args, **kwargs): placement_strategy = "SPREAD" if placement_strategy is not None: - pg = _create_placement_group(cpus_per_actor, gpus_per_actor, - ray_params.resources_per_actor, - ray_params.num_actors, placement_strategy) + pg = _create_placement_group( + cpus_per_actor, + gpus_per_actor, + ray_params.resources_per_actor, + ray_params.num_actors, + placement_strategy, + ) else: pg = None start_actor_ranks = set(range(ray_params.num_actors)) # Start these - total_training_time = 0. + total_training_time = 0.0 boost_rounds_left = num_boost_round last_checkpoint_value = checkpoint.value while tries <= max_actor_restarts: # Only update number of iterations if the checkpoint changed # If it didn't change, we already subtracted the iterations. 
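To make the checkpoint bookkeeping in the next hunk concrete, a worked example with illustrative numbers (the variable names mirror the patch; nothing here is new logic): if training started with `num_boost_round=10` and the last checkpoint was written at iteration 4, iterations 0 through 4 are already done, so the retry loop resumes with 5 rounds left.

# Hedged sketch of the resume arithmetic in the retry loop below;
# the values are illustrative.
num_boost_round = 10
boost_rounds_left = num_boost_round

checkpoint_iteration = 4  # 0-based index of the last checkpointed round
boost_rounds_left -= checkpoint_iteration + 1  # rounds 0..4 are done
assert boost_rounds_left == 5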
- if checkpoint.iteration >= 0 and \ - checkpoint.value != last_checkpoint_value: + if checkpoint.iteration >= 0 and checkpoint.value != last_checkpoint_value: boost_rounds_left -= checkpoint.iteration + 1 last_checkpoint_value = checkpoint.value @@ -1499,10 +1588,11 @@ def _wrapped(*args, **kwargs): stop_event=stop_event, checkpoint=checkpoint, additional_results=current_results, - training_started_at=0., + training_started_at=0.0, placement_group=pg, failed_actor_ranks=start_actor_ranks, - pending_actors=pending_actors) + pending_actors=pending_actors, + ) try: bst, train_evals_result, train_additional_results = _train( @@ -1515,56 +1605,58 @@ def _wrapped(*args, **kwargs): cpus_per_actor=cpus_per_actor, gpus_per_actor=gpus_per_actor, _training_state=training_state, - **kwargs) - if training_state.training_started_at > 0.: - total_training_time += time.time( - ) - training_state.training_started_at + **kwargs, + ) + if training_state.training_started_at > 0.0: + total_training_time += time.time() - training_state.training_started_at break except (RayActorError, RayTaskError) as exc: - if training_state.training_started_at > 0.: - total_training_time += time.time( - ) - training_state.training_started_at + if training_state.training_started_at > 0.0: + total_training_time += time.time() - training_state.training_started_at alive_actors = sum(1 for a in actors if a is not None) start_again = False if ray_params.elastic_training: - if alive_actors < ray_params.num_actors - \ - ray_params.max_failed_actors: + if alive_actors < ray_params.num_actors - ray_params.max_failed_actors: raise RuntimeError( "A Ray actor died during training and the maximum " "number of dead actors in elastic training was " - "reached. Shutting down training.") from exc + "reached. Shutting down training." + ) from exc # Do not start new actors before resuming training # (this might still restart actors during training) start_actor_ranks.clear() - if exc.__cause__ and isinstance(exc.__cause__, - RayXGBoostActorAvailable): + if exc.__cause__ and isinstance( + exc.__cause__, RayXGBoostActorAvailable + ): # New actor available, integrate into training loop logger.info( f"A new actor became available. Re-starting training " f"from latest checkpoint with new actor. " f"This will use {alive_actors} existing actors and " f"start {len(start_actor_ranks)} new actors. " - f"Sleeping for 10 seconds for cleanup.") + f"Sleeping for 10 seconds for cleanup." + ) tries -= 1 # This is deliberate so shouldn't count start_again = True elif tries + 1 <= max_actor_restarts: - if exc.__cause__ and isinstance(exc.__cause__, - RayXGBoostTrainingError): + if exc.__cause__ and isinstance( + exc.__cause__, RayXGBoostTrainingError + ): logger.warning(f"Caught exception: {exc.__cause__}") logger.warning( f"A Ray actor died during training. Trying to " f"continue training on the remaining actors. " f"This will use {alive_actors} existing actors and " f"start {len(start_actor_ranks)} new actors. " - f"Sleeping for 10 seconds for cleanup.") + f"Sleeping for 10 seconds for cleanup." + ) start_again = True elif tries + 1 <= max_actor_restarts: - if exc.__cause__ and isinstance(exc.__cause__, - RayXGBoostTrainingError): + if exc.__cause__ and isinstance(exc.__cause__, RayXGBoostTrainingError): logger.warning(f"Caught exception: {exc.__cause__}") logger.warning( f"A Ray actor died during training. Trying to restart " @@ -1572,7 +1664,8 @@ def _wrapped(*args, **kwargs): f"(restart {tries + 1} of {max_actor_restarts}). 
" f"This will use {alive_actors} existing actors and start " f"{len(start_actor_ranks)} new actors. " - f"Sleeping for 10 seconds for cleanup.") + f"Sleeping for 10 seconds for cleanup." + ) start_again = True if start_again: @@ -1598,10 +1691,13 @@ def _wrapped(*args, **kwargs): else: maybe_log = logger.debug - maybe_log("[RayXGBoost] Finished XGBoost training on training data " - "with total N={total_n:,} in {total_time_s:.2f} seconds " - "({training_time_s:.2f} pure XGBoost training time).".format( - **train_additional_results)) + maybe_log( + "[RayXGBoost] Finished XGBoost training on training data " + "with total N={total_n:,} in {total_time_s:.2f} seconds " + "({training_time_s:.2f} pure XGBoost training time).".format( + **train_additional_results + ) + ) _shutdown( actors=actors, @@ -1609,7 +1705,8 @@ def _wrapped(*args, **kwargs): queue=queue, event=stop_event, placement_group=pg, - force=False) + force=False, + ) if isinstance(evals_result, dict): evals_result.update(train_evals_result) @@ -1619,8 +1716,7 @@ def _wrapped(*args, **kwargs): return bst -def _predict(model: xgb.Booster, data: RayDMatrix, ray_params: RayParams, - **kwargs): +def _predict(model: xgb.Booster, data: RayDMatrix, ray_params: RayParams, **kwargs): _assert_ray_support() if ray_params.verbose: @@ -1638,9 +1734,11 @@ def _predict(model: xgb.Booster, data: RayDMatrix, ray_params: RayParams, num_actors=ray_params.num_actors, num_cpus_per_actor=ray_params.cpus_per_actor, num_gpus_per_actor=ray_params.gpus_per_actor - if ray_params.gpus_per_actor >= 0 else 0, + if ray_params.gpus_per_actor >= 0 + else 0, resources_per_actor=ray_params.resources_per_actor, - distributed_callbacks=ray_params.distributed_callbacks) + distributed_callbacks=ray_params.distributed_callbacks, + ) for i in range(ray_params.num_actors) ] maybe_log(f"[RayXGBoost] Created {len(actors)} remote actors.") @@ -1678,11 +1776,13 @@ def _predict(model: xgb.Booster, data: RayDMatrix, ray_params: RayParams, @PublicAPI(stability="beta") -def predict(model: xgb.Booster, - data: RayDMatrix, - ray_params: Union[None, RayParams, Dict] = None, - _remote: Optional[bool] = None, - **kwargs) -> Optional[np.ndarray]: +def predict( + model: xgb.Booster, + data: RayDMatrix, + ray_params: Union[None, RayParams, Dict] = None, + _remote: Optional[bool] = None, + **kwargs, +) -> Optional[np.ndarray]: """Distributed XGBoost predict via Ray. This function will connect to a Ray cluster, create ``num_actors`` @@ -1691,12 +1791,12 @@ def predict(model: xgb.Booster, returned. Args: - model (xgb.Booster): Booster object to call for prediction. - data (RayDMatrix): Data object containing the prediction data. - ray_params (Union[None, RayParams, Dict]): Parameters to configure + model: Booster object to call for prediction. + data: Data object containing the prediction data. + ray_params: Parameters to configure Ray-specific behavior. See :class:`RayParams` for a list of valid configuration parameters. - _remote (bool): Whether to run the driver process in a remote + _remote: Whether to run the driver process in a remote function. This is enabled by default in Ray client mode. **kwargs: Keyword arguments will be passed to the local `xgb.predict()` calls. @@ -1709,11 +1809,11 @@ def predict(model: xgb.Booster, if xgb is None: raise ImportError( "xgboost package is not installed. XGBoost-Ray WILL NOT WORK. " - "FIX THIS by running `pip install \"xgboost-ray\"`.") + 'FIX THIS by running `pip install "xgboost-ray"`.' 
+ ) if _remote is None: - _remote = _is_client_connected() and \ - not is_session_enabled() + _remote = _is_client_connected() and not is_session_enabled() if not ray.is_initialized(): ray.init() @@ -1721,14 +1821,19 @@ def predict(model: xgb.Booster, if _remote: return ray.get( ray.remote(num_cpus=0)(predict).remote( - model, data, ray_params, _remote=False, **kwargs)) + model, data, ray_params, _remote=False, **kwargs + ) + ) _maybe_print_legacy_warning() ray_params = _validate_ray_params(ray_params) - max_actor_restarts = ray_params.max_actor_restarts \ - if ray_params.max_actor_restarts >= 0 else float("inf") + max_actor_restarts = ( + ray_params.max_actor_restarts + if ray_params.max_actor_restarts >= 0 + else float("inf") + ) _assert_ray_support() if not isinstance(data, RayDMatrix): @@ -1736,7 +1841,8 @@ def predict(model: xgb.Booster, "The `data` argument passed to `train()` is not a RayDMatrix, " "but of type {}. " "\nFIX THIS by instantiating a RayDMatrix first: " - "`data = RayDMatrix(data=data)`.".format(type(data))) + "`data = RayDMatrix(data=data)`.".format(type(data)) + ) tries = 0 while tries <= max_actor_restarts: @@ -1747,12 +1853,13 @@ def predict(model: xgb.Booster, logger.warning( "A Ray actor died during prediction. Trying to restart " "prediction from scratch. " - "Sleeping for 10 seconds for cleanup.") + "Sleeping for 10 seconds for cleanup." + ) time.sleep(10) else: raise RuntimeError( "A Ray actor died during prediction and the maximum " - "number of retries ({}) is exhausted.".format( - max_actor_restarts)) + "number of retries ({}) is exhausted.".format(max_actor_restarts) + ) tries += 1 return None diff --git a/xgboost_ray/matrix.py b/xgboost_ray/matrix.py index 5bf92838..d8db5c7b 100644 --- a/xgboost_ray/matrix.py +++ b/xgboost_ray/matrix.py @@ -1,8 +1,19 @@ import glob import uuid from enum import Enum -from typing import Union, Optional, Tuple, Iterable, List, Dict, Sequence, \ - Callable, Type, TYPE_CHECKING, Set +from typing import ( + TYPE_CHECKING, + Callable, + Dict, + Iterable, + List, + Optional, + Sequence, + Set, + Tuple, + Type, + Union, +) from ray.actor import ActorHandle @@ -11,16 +22,15 @@ except ImportError: cp = None -import numpy as np -import pandas as pd - import os +import numpy as np +import pandas as pd import ray from ray import logger -from ray.util.annotations import PublicAPI, DeveloperAPI +from ray.util.annotations import DeveloperAPI, PublicAPI -from xgboost_ray.data_sources import DataSource, data_sources, RayFileType +from xgboost_ray.data_sources import DataSource, RayFileType, data_sources try: from ray.data.dataset import Dataset as RayDataset @@ -32,6 +42,7 @@ class RayDataset: try: from xgboost.core import DataIter + LEGACY_MATRIX = False except ImportError: DataIter = object @@ -39,6 +50,7 @@ class RayDataset: try: from xgboost.core import QuantileDmatrix + QUANTILE_AVAILABLE = True except ImportError: QuantileDmatrix = object @@ -55,8 +67,9 @@ def concat_dataframes(dfs: List[Optional[pd.DataFrame]]): return pd.concat(filtered, ignore_index=True, copy=False) -def ensure_sorted_by_qid(df: pd.DataFrame, qid: Data - ) -> Tuple[Union[np.array, str], pd.DataFrame]: +def ensure_sorted_by_qid( + df: pd.DataFrame, qid: Data +) -> Tuple[Union[np.array, str], pd.DataFrame]: _qid: pd.Series = None if isinstance(qid, str): _qid = df[qid] @@ -64,9 +77,11 @@ def ensure_sorted_by_qid(df: pd.DataFrame, qid: Data _qid = pd.Series(qid) elif isinstance(qid, pd.DataFrame): if len(df.shape) != 2 and df.shape[1] != 1: - raise ValueError(f"qid 
argument of type pd.DataFrame is expected" - "to contains only 1 column of data " - f"but the qid passed in is of shape {df.shape}.") + raise ValueError( + "qid argument of type pd.DataFrame is expected " + "to contain only 1 column of data, " + f"but the qid passed in is of shape {df.shape}." + ) _qid = qid.iloc[:, 0] elif isinstance(qid, pd.Series): _qid = qid @@ -76,8 +91,9 @@ def ensure_sorted_by_qid(df: pd.DataFrame, qid: Data if isinstance(qid, str): return qid, df.sort_values([qid]) else: # case when qid is not part of df - return _qid.sort_values(), \ - df.set_index(_qid).sort_index().reset_index(drop=True) + return _qid.sort_values(), df.set_index(_qid).sort_index().reset_index( + drop=True + ) @PublicAPI(stability="beta") @@ -96,6 +112,7 @@ class RayShardingMode(Enum): data source that assigns actors to specific data shards on initialization and then keeps these fixed. """ + INTERLEAVED = 1 BATCH = 2 FIXED = 3 @@ -104,19 +121,19 @@ class RayShardingMode(Enum): @DeveloperAPI class RayDataIter(DataIter): def __init__( - self, - data: List[Data], - label: List[Optional[Data]], - missing: Optional[float], - weight: List[Optional[Data]], - feature_weights: List[Optional[Data]], - qid: List[Optional[Data]], - base_margin: List[Optional[Data]], - label_lower_bound: List[Optional[Data]], - label_upper_bound: List[Optional[Data]], - feature_names: Optional[List[str]], - feature_types: Optional[List[np.dtype]], - enable_categorical: Optional[bool], + self, + data: List[Data], + label: List[Optional[Data]], + missing: Optional[float], + weight: List[Optional[Data]], + feature_weights: List[Optional[Data]], + qid: List[Optional[Data]], + base_margin: List[Optional[Data]], + label_lower_bound: List[Optional[Data]], + label_upper_bound: List[Optional[Data]], + feature_names: Optional[List[str]], + feature_types: Optional[List[np.dtype]], + enable_categorical: Optional[bool], ): super(RayDataIter, self).__init__() @@ -167,28 +184,31 @@ def next(self, input_data: Callable): label_upper_bound=self._prop(self._label_upper_bound), feature_names=self._feature_names, feature_types=self._feature_types, - enable_categorical=self._enable_categorical) + enable_categorical=self._enable_categorical, + ) self._iter += 1 return 1 class _RayDMatrixLoader: - def __init__(self, - data: Data, - label: Optional[Data] = None, - missing: Optional[float] = None, - weight: Optional[Data] = None, - feature_weights: Optional[Data] = None, - base_margin: Optional[Data] = None, - label_lower_bound: Optional[Data] = None, - label_upper_bound: Optional[Data] = None, - feature_names: Optional[List[str]] = None, - feature_types: Optional[List[np.dtype]] = None, - qid: Optional[Data] = None, - enable_categorical: Optional[bool] = None, - filetype: Optional[RayFileType] = None, - ignore: Optional[List[str]] = None, - **kwargs): + def __init__( + self, + data: Data, + label: Optional[Data] = None, + missing: Optional[float] = None, + weight: Optional[Data] = None, + feature_weights: Optional[Data] = None, + base_margin: Optional[Data] = None, + label_lower_bound: Optional[Data] = None, + label_upper_bound: Optional[Data] = None, + feature_names: Optional[List[str]] = None, + feature_types: Optional[List[np.dtype]] = None, + qid: Optional[Data] = None, + enable_categorical: Optional[bool] = None, + filetype: Optional[RayFileType] = None, + ignore: Optional[List[str]] = None, + **kwargs, + ): self.data = data self.label = label self.missing = missing @@ -230,7 +250,8 @@ def __init__(self, "but filetype could not be detected. 
" "\nFIX THIS by passing " "the `filetype` parameter to the RayDMatrix. Use the " - "`RayFileType` enum for this.") + "`RayFileType` enum for this." + ) def get_data_source(self) -> Type[DataSource]: raise NotImplementedError @@ -254,9 +275,15 @@ def assign_shards_to_actors(self, actors: Sequence[ActorHandle]) -> bool: return False def _split_dataframe( - self, local_data: pd.DataFrame, data_source: Type[DataSource] - ) -> Tuple[pd.DataFrame, Optional[pd.Series], Optional[pd.Series], - Optional[pd.Series], Optional[pd.Series], Optional[pd.Series]]: + self, local_data: pd.DataFrame, data_source: Type[DataSource] + ) -> Tuple[ + pd.DataFrame, + Optional[pd.Series], + Optional[pd.Series], + Optional[pd.Series], + Optional[pd.Series], + Optional[pd.Series], + ]: """ Split dataframe into @@ -281,7 +308,8 @@ def _split_dataframe( exclude_cols.add(exclude) feature_weights, exclude = data_source.get_column( - local_data, self.feature_weights) + local_data, self.feature_weights + ) if exclude: exclude_cols.add(exclude) @@ -289,18 +317,19 @@ def _split_dataframe( if exclude: exclude_cols.add(exclude) - base_margin, exclude = data_source.get_column(local_data, - self.base_margin) + base_margin, exclude = data_source.get_column(local_data, self.base_margin) if exclude: exclude_cols.add(exclude) label_lower_bound, exclude = data_source.get_column( - local_data, self.label_lower_bound) + local_data, self.label_lower_bound + ) if exclude: exclude_cols.add(exclude) label_upper_bound, exclude = data_source.get_column( - local_data, self.label_upper_bound) + local_data, self.label_upper_bound + ) if exclude: exclude_cols.add(exclude) @@ -308,13 +337,20 @@ def _split_dataframe( if exclude_cols: x = x[[col for col in x.columns if col not in exclude_cols]] - return x, label, weight, feature_weights, base_margin, \ - label_lower_bound, label_upper_bound, qid - - def load_data(self, - num_actors: int, - sharding: RayShardingMode, - rank: Optional[int] = None) -> Tuple[Dict, int]: + return ( + x, + label, + weight, + feature_weights, + base_margin, + label_lower_bound, + label_upper_bound, + qid, + ) + + def load_data( + self, num_actors: int, sharding: RayShardingMode, rank: Optional[int] = None + ) -> Tuple[Dict, int]: raise NotImplementedError @@ -339,7 +375,8 @@ def get_data_source(self) -> Type[DataSource]: # is not available. logger.warning( f"Checking data source {source.__name__} failed " - f"with exception: {exc}") + f"with exception: {exc}" + ) continue if not data_source: @@ -352,10 +389,15 @@ def get_data_source(self) -> Type[DataSource]: "specify the type of the source. Use the `RayFileType` " "enum for that. If using Modin, Dask, or Petastorm, " "make sure the library is installed.".format( - type(self.data), self.filetype)) - - if self.label is not None and not isinstance(self.label, str) and \ - not type(self.data) != type(self.label): # noqa: E721: + type(self.data), self.filetype + ) + ) + + if ( + self.label is not None + and not isinstance(self.label, str) + and not type(self.data) != type(self.label) # noqa: E721 + ): # noqa: E721: # Label is an object of a different type than the main data. # We have to make sure they are compatible if not data_source.is_data_type(self.label): @@ -365,16 +407,17 @@ def get_data_source(self) -> Type[DataSource]: "`RayDMatrix` - e.g. a `pandas.DataFrame` as `data` " "and `label`. The `label` can always be a string. 
Got " "{} for the main data and {} for the label.".format( - type(self.data), type(self.label))) + type(self.data), type(self.label) + ) + ) self.data_source = data_source self._cached_n = data_source.get_n(self.data) return self.data_source - def load_data(self, - num_actors: int, - sharding: RayShardingMode, - rank: Optional[int] = None) -> Tuple[Dict, int]: + def load_data( + self, num_actors: int, sharding: RayShardingMode, rank: Optional[int] = None + ) -> Tuple[Dict, int]: """ Load data into memory """ @@ -391,14 +434,17 @@ def load_data(self, raise RuntimeError( f"Trying to shard data for {num_actors} actors, but the " f"maximum number of shards (i.e. the number of data rows) " - f"is {max_num_shards}. Consider using fewer actors.") + f"is {max_num_shards}. Consider using fewer actors." + ) # We're doing central data loading here, so we don't pass any indices, # yet. Instead, we'll be selecting the rows below. local_df = data_source.load_data( - self.data, ignore=self.ignore, indices=None, **self.kwargs) + self.data, ignore=self.ignore, indices=None, **self.kwargs + ) x, y, w, fw, b, ll, lu, qid = self._split_dataframe( - local_df, data_source=data_source) + local_df, data_source=data_source + ) if isinstance(x, list): n = sum(len(a) for a in x) @@ -414,12 +460,13 @@ def load_data(self, "label": ray.put(y.iloc[indices] if y is not None else None), "weight": ray.put(w.iloc[indices] if w is not None else None), "feature_weights": ray.put(fw), - "base_margin": ray.put(b.iloc[indices] - if b is not None else None), - "label_lower_bound": ray.put(ll.iloc[indices] - if ll is not None else None), - "label_upper_bound": ray.put(lu.iloc[indices] - if lu is not None else None), + "base_margin": ray.put(b.iloc[indices] if b is not None else None), + "label_lower_bound": ray.put( + ll.iloc[indices] if ll is not None else None + ), + "label_upper_bound": ray.put( + lu.iloc[indices] if lu is not None else None + ), "qid": ray.put(qid.iloc[indices] if qid is not None else None), } refs[i] = actor_refs @@ -452,22 +499,29 @@ def get_data_source(self) -> Type[DataSource]: # Todo (krfricke): It would be good to have a more general way to # check for compatibility here. Combine with test below? - if not (isinstance(self.data, (Iterable, RayDataset)) - or hasattr(self.data, "__partitioned__")) or invalid_data: + if ( + not ( + isinstance(self.data, (Iterable, RayDataset)) + or hasattr(self.data, "__partitioned__") + ) + or invalid_data + ): raise ValueError( f"Distributed data loading only works with already " f"distributed datasets. These should be specified through a " f"list of locations (or a single string). " f"Got: {type(self.data)}." f"\nFIX THIS by passing a list of files (e.g. on S3) to the " - f"RayDMatrix.") + f"RayDMatrix." + ) if self.label is not None and not isinstance(self.label, str): raise ValueError( f"Invalid `label` value for distributed datasets: " f"{self.label}. Only strings are supported. " f"\nFIX THIS by passing a string indicating the label " - f"column of the dataset as the `label` argument.") + f"column of the dataset as the `label` argument." + ) data_source = None for source in data_sources: @@ -483,7 +537,8 @@ def get_data_source(self) -> Type[DataSource]: # is not available. logger.warning( f"Checking data source {source.__name__} failed " - f"with exception: {exc}") + f"with exception: {exc}" + ) continue if not data_source: @@ -494,7 +549,8 @@ def get_data_source(self) -> Type[DataSource]: "data types for distributed datasets are a list of " "CSV or Parquet sources. 
If using " "Modin, Dask, or Petastorm, make sure the library is " - "installed.") + "installed." + ) self.data_source = data_source self._cached_n = data_source.get_n(self.data) @@ -516,7 +572,8 @@ def assert_enough_shards_for_actors(self, num_actors: int): f"want to shard the dataset by rows, consider " f"centralized loading by passing `distributed=False` to " f"the `RayDMatrix`. Otherwise consider using fewer actors " - f"or re-partitioning your data.") + f"or re-partitioning your data." + ) def assign_shards_to_actors(self, actors: Sequence[ActorHandle]) -> bool: if not isinstance(self.label, str): @@ -537,10 +594,9 @@ def assign_shards_to_actors(self, actors: Sequence[ActorHandle]) -> bool: self.actor_shards = actor_shards return True - def load_data(self, - num_actors: int, - sharding: RayShardingMode, - rank: Optional[int] = None) -> Tuple[Dict, int]: + def load_data( + self, num_actors: int, sharding: RayShardingMode, rank: Optional[int] = None + ) -> Tuple[Dict, int]: """ Load data into memory """ @@ -550,7 +606,8 @@ def load_data(self, "driver program. " "\nFIX THIS by refraining from calling `RayDMatrix.load()` " "manually for distributed datasets. Hint: You can check if " - "`RayDMatrix.distributed` is set to True or False.") + "`RayDMatrix.distributed` is set to True or False." + ) if "OMP_NUM_THREADS" in os.environ: del os.environ["OMP_NUM_THREADS"] @@ -560,16 +617,15 @@ def load_data(self, if self.actor_shards: if rank is None: raise RuntimeError( - "Distributed loading requires a rank to be passed, " - "got None") + "Distributed loading requires a rank to be passed, " "got None" + ) rank_shards = self.actor_shards[rank] local_df = data_source.load_data( - self.data, - indices=rank_shards, - ignore=self.ignore, - **self.kwargs) + self.data, indices=rank_shards, ignore=self.ignore, **self.kwargs + ) x, y, w, fw, b, ll, lu, qid = self._split_dataframe( - local_df, data_source=data_source) + local_df, data_source=data_source + ) if isinstance(x, list): n = sum(len(a) for a in x) @@ -580,17 +636,24 @@ def load_data(self, indices = _get_sharding_indices(sharding, rank, num_actors, n) if not indices: - x, y, w, fw, b, ll, lu, qid = (None, None, None, None, None, - None, None, None) + x, y, w, fw, b, ll, lu, qid = ( + None, + None, + None, + None, + None, + None, + None, + None, + ) n = 0 else: local_df = data_source.load_data( - self.data, - ignore=self.ignore, - indices=indices, - **self.kwargs) + self.data, ignore=self.ignore, indices=indices, **self.kwargs + ) x, y, w, fw, b, ll, lu, qid = self._split_dataframe( - local_df, data_source=data_source) + local_df, data_source=data_source + ) if isinstance(x, list): n = sum(len(a) for a in x) @@ -653,7 +716,7 @@ class RayDMatrix: after initialization. If this is None, it will be set by the ``xgboost_ray.train()`` function, and it will be loaded and stored in the object store then. Defaults to None. - filetype (Optional[RayFileType]): Type of data to read. + filetype: Type of data to read. This is disregarded if a data object like a pandas dataframe is passed as the ``data`` argument. For filenames, the filetype is automaticlly detected via the file name @@ -661,14 +724,14 @@ class RayDMatrix: Passing this argument will overwrite the detected filename. If the filename cannot be determined from the ``data`` object, passing this is mandatory. Defaults to ``None`` (auto detection). - ignore (Optional[List[str]]): Exclude these columns from the + ignore: Exclude these columns from the dataframe after loading the data. 
- distributed (Optional[bool]): If True, use distributed loading + distributed: If True, use distributed loading (each worker loads a share of the dataset). If False, use central loading (the head node loads the whole dataset and distributes it). If None, auto-detect and default to distributed loading, if possible. - sharding (RayShardingMode): How to shard the data for different + sharding: How to shard the data for different workers. ``RayShardingMode.INTERLEAVED`` will divide the data per row, i.e. every i-th row will be passed to the first worker, every (i+1)th row to the second worker, etc. @@ -679,7 +742,7 @@ class RayDMatrix: loading, sharding happens on a per-file basis, and not on a per-row basis, i.e. for interleaved, every i-th *file* will be passed into the first worker, etc. - lazy (bool): If ``num_actors`` is passed, setting this to ``True`` + lazy: If ``num_actors`` is passed, setting this to ``True`` will defer data loading and storing until ``load_data()`` or ``get_data()`` is called. Defaults to ``False``. **kwargs: Keyword arguments will be passed to the data loading @@ -704,32 +767,35 @@ class RayDMatrix: """ - def __init__(self, - data: Data, - label: Optional[Data] = None, - weight: Optional[Data] = None, - feature_weights: Optional[Data] = None, - base_margin: Optional[Data] = None, - missing: Optional[float] = None, - label_lower_bound: Optional[Data] = None, - label_upper_bound: Optional[Data] = None, - feature_names: Optional[List[str]] = None, - feature_types: Optional[List[np.dtype]] = None, - qid: Optional[Data] = None, - enable_categorical: Optional[bool] = None, - num_actors: Optional[int] = None, - filetype: Optional[RayFileType] = None, - ignore: Optional[List[str]] = None, - distributed: Optional[bool] = None, - sharding: RayShardingMode = RayShardingMode.INTERLEAVED, - lazy: bool = False, - **kwargs): + def __init__( + self, + data: Data, + label: Optional[Data] = None, + weight: Optional[Data] = None, + feature_weights: Optional[Data] = None, + base_margin: Optional[Data] = None, + missing: Optional[float] = None, + label_lower_bound: Optional[Data] = None, + label_upper_bound: Optional[Data] = None, + feature_names: Optional[List[str]] = None, + feature_types: Optional[List[np.dtype]] = None, + qid: Optional[Data] = None, + enable_categorical: Optional[bool] = None, + num_actors: Optional[int] = None, + filetype: Optional[RayFileType] = None, + ignore: Optional[List[str]] = None, + distributed: Optional[bool] = None, + sharding: RayShardingMode = RayShardingMode.INTERLEAVED, + lazy: bool = False, + **kwargs, + ): if kwargs.get("group", None) is not None: raise ValueError( "`group` parameter is not supported. " "If you are using XGBoost-Ray, use `qid` parameter instead. " - "If you are using LightGBM-Ray, ranking is not yet supported.") + "If you are using LightGBM-Ray, ranking is not yet supported." + ) if qid is not None and weight is not None: raise NotImplementedError("per-group weight is not implemented.") @@ -754,7 +820,8 @@ def __init__(self, f"the specified data source of type {type(data)} cannot " f"be loaded in a distributed fashion. " f"\nFIX THIS by passing a list of sources (e.g. parquet " - f"files stored in a network location) instead.") + f"files stored in a network location) instead."
+ ) self.distributed = distributed @@ -774,7 +841,8 @@ def __init__(self, filetype=filetype, ignore=ignore, qid=qid, - **kwargs) + **kwargs, + ) else: self.loader = _CentralRayDMatrixLoader( data=data, @@ -791,7 +859,8 @@ def __init__(self, filetype=filetype, ignore=ignore, qid=qid, - **kwargs) + **kwargs, + ) self.refs: Dict[int, Dict[str, ray.ObjectRef]] = {} self.n = None @@ -814,9 +883,7 @@ def assign_shards_to_actors(self, actors: Sequence[ActorHandle]) -> bool: def assert_enough_shards_for_actors(self, num_actors: int): self.loader.assert_enough_shards_for_actors(num_actors=num_actors) - def load_data(self, - num_actors: Optional[int] = None, - rank: Optional[int] = None): + def load_data(self, num_actors: Optional[int] = None, rank: Optional[int] = None): """Load data, putting it into the Ray object store. If a rank is given, only data for this rank is loaded (for @@ -824,8 +891,7 @@ def load_data(self, """ if not self.loaded: if num_actors is not None: - if self.num_actors is not None \ - and num_actors != self.num_actors: + if self.num_actors is not None and num_actors != self.num_actors: raise ValueError( f"The `RayDMatrix` was initialized or `load_data()`" f"has been called with a different numbers of" @@ -834,21 +900,24 @@ def load_data(self, f"\nFIX THIS by not instantiating the matrix with " f"`num_actors` and making sure calls to `load_data()` " f"or `get_data()` use the same numbers of actors " - f"at each call.") + f"at each call." + ) self.num_actors = num_actors if self.num_actors is None: raise ValueError( "Trying to load data for `RayDMatrix` object, but " "`num_actors` is not set." "\nFIX THIS by passing `num_actors` on instantiation " - "of the `RayDMatrix` or when calling `load_data()`.") + "of the `RayDMatrix` or when calling `load_data()`." + ) refs, self.n = self.loader.load_data( - self.num_actors, self.sharding, rank=rank) + self.num_actors, self.sharding, rank=rank + ) self.refs.update(refs) self.loaded = True def get_data( - self, rank: int, num_actors: Optional[int] = None + self, rank: int, num_actors: Optional[int] = None ) -> Dict[str, Union[None, pd.DataFrame, List[Optional[pd.DataFrame]]]]: """Get data, i.e. return dataframe for a specific actor. @@ -884,36 +953,41 @@ def __eq__(self, other): class RayQuantileDMatrix(RayDMatrix): """Currently just a thin wrapper for type detection""" + pass class RayDeviceQuantileDMatrix(RayDMatrix): """Currently just a thin wrapper for type detection""" - def __init__(self, - data: Data, - label: Optional[Data] = None, - weight: Optional[Data] = None, - base_margin: Optional[Data] = None, - missing: Optional[float] = None, - label_lower_bound: Optional[Data] = None, - label_upper_bound: Optional[Data] = None, - feature_names: Optional[List[str]] = None, - feature_types: Optional[List[np.dtype]] = None, - qid: Optional[Data] = None, - enable_categorical: Optional[bool] = None, - *args, - **kwargs): + def __init__( + self, + data: Data, + label: Optional[Data] = None, + weight: Optional[Data] = None, + base_margin: Optional[Data] = None, + missing: Optional[float] = None, + label_lower_bound: Optional[Data] = None, + label_upper_bound: Optional[Data] = None, + feature_names: Optional[List[str]] = None, + feature_types: Optional[List[np.dtype]] = None, + qid: Optional[Data] = None, + enable_categorical: Optional[bool] = None, + *args, + **kwargs, + ): if cp is None: raise RuntimeError( "RayDeviceQuantileDMatrix requires cupy to be installed." 
"\nFIX THIS by installing cupy: `pip install cupy-cudaXYZ` " - "where XYZ is your local CUDA version.") + "where XYZ is your local CUDA version." + ) if label_lower_bound or label_upper_bound: raise RuntimeError( "RayDeviceQuantileDMatrix does not support " "`label_lower_bound` and `label_upper_bound` (just as the " - "xgboost.DeviceQuantileDMatrix). Please pass None instead.") + "xgboost.DeviceQuantileDMatrix). Please pass None instead." + ) super(RayDeviceQuantileDMatrix, self).__init__( data=data, label=label, @@ -927,13 +1001,15 @@ def __init__(self, qid=qid, enable_categorical=enable_categorical, *args, - **kwargs) + **kwargs, + ) def get_data( - self, rank: int, num_actors: Optional[int] = None + self, rank: int, num_actors: Optional[int] = None ) -> Dict[str, Union[None, pd.DataFrame, List[Optional[pd.DataFrame]]]]: data_dict = super(RayDeviceQuantileDMatrix, self).get_data( - rank=rank, num_actors=num_actors) + rank=rank, num_actors=num_actors + ) # Remove some dict keys here that are generated automatically data_dict.pop("label_lower_bound", None) data_dict.pop("label_upper_bound", None) @@ -970,14 +1046,18 @@ def _can_load_distributed(source: Data) -> bool: def _detect_distributed(source: Data) -> bool: """Returns True if we should try to use distributed data loading""" from xgboost_ray.data_sources.modin import Modin + if not _can_load_distributed(source): return False if Modin.is_data_type(source): return True if isinstance(source, RayDataset): return True - if isinstance(source, Iterable) and not isinstance(source, str) and \ - not (isinstance(source, Sequence) and isinstance(source[0], str)): + if ( + isinstance(source, Iterable) + and not isinstance(source, str) + and not (isinstance(source, Sequence) and isinstance(source[0], str)) + ): # This is an iterable but not a Sequence of strings, and not a # pandas dataframe, series, or numpy array. # Detect False per default, can be overridden by passing @@ -988,35 +1068,41 @@ def _detect_distributed(source: Data) -> bool: return True -def _get_sharding_indices(sharding: RayShardingMode, rank: int, - num_actors: int, n: int): +def _get_sharding_indices( + sharding: RayShardingMode, rank: int, num_actors: int, n: int +): """Return indices that belong to worker with rank `rank`""" if sharding == RayShardingMode.BATCH: # based on numpy.array_split # github.com/numpy/numpy/blob/v1.21.0/numpy/lib/shape_base.py n_per_actor, extras = divmod(n, num_actors) - div_points = np.array([0] + extras * [n_per_actor + 1] + - (num_actors - extras) * [n_per_actor]).cumsum() + div_points = np.array( + [0] + extras * [n_per_actor + 1] + (num_actors - extras) * [n_per_actor] + ).cumsum() indices = list(range(div_points[rank], div_points[rank + 1])) elif sharding == RayShardingMode.INTERLEAVED: indices = list(range(rank, n, num_actors)) else: - raise ValueError(f"Invalid value for `sharding` parameter: " - f"{sharding}" - f"\nFIX THIS by passing any item of the " - f"`RayShardingMode` enum, for instance " - f"`RayShardingMode.BATCH`.") + raise ValueError( + f"Invalid value for `sharding` parameter: " + f"{sharding}" + f"\nFIX THIS by passing any item of the " + f"`RayShardingMode` enum, for instance " + f"`RayShardingMode.BATCH`." 
+ ) return indices @DeveloperAPI def combine_data(sharding: RayShardingMode, data: Iterable) -> np.ndarray: if sharding not in (RayShardingMode.BATCH, RayShardingMode.INTERLEAVED): - raise ValueError(f"Invalid value for `sharding` parameter: " - f"{sharding}" - f"\nFIX THIS by passing any item of the " - f"`RayShardingMode` enum, for instance " - f"`RayShardingMode.BATCH`.") + raise ValueError( + f"Invalid value for `sharding` parameter: " + f"{sharding}" + f"\nFIX THIS by passing any item of the " + f"`RayShardingMode` enum, for instance " + f"`RayShardingMode.BATCH`." + ) # discard empty arrays that show up with BATCH data = [d for d in data if len(d)] @@ -1031,7 +1117,8 @@ def combine_data(sharding: RayShardingMode, data: Iterable) -> np.ndarray: res = np.ravel(np.column_stack([d[0:min_len] for d in data])) # Append these here res = np.concatenate( - [res] + [d[min_len:] for d in data if len(d) > min_len]) + [res] + [d[min_len:] for d in data if len(d) > min_len] + ) else: # objective="multi:softprob" returns n-dimensional arrays that # need to be handled differently @@ -1044,8 +1131,10 @@ def combine_data(sharding: RayShardingMode, data: Iterable) -> np.ndarray: class_len = data[0].shape[1] min_len_data = [d[0:min_len] for d in data] res = np.hstack(min_len_data).reshape( - len(min_len_data) * min_len, class_len) + len(min_len_data) * min_len, class_len + ) # Append these here res = np.concatenate( - [res] + [d[min_len:] for d in data if len(d) > min_len]) + [res] + [d[min_len:] for d in data if len(d) > min_len] + ) return res diff --git a/xgboost_ray/session.py b/xgboost_ray/session.py index ffa421f5..85ff77cf 100644 --- a/xgboost_ray/session.py +++ b/xgboost_ray/session.py @@ -1,5 +1,6 @@ from typing import Optional -from ray.util.annotations import PublicAPI, DeveloperAPI + +from ray.util.annotations import DeveloperAPI, PublicAPI from ray.util.queue import Queue @@ -20,7 +21,8 @@ def put_queue(self, item): raise ValueError( "Trying to put something into session queue, but queue " "was not initialized. This is probably a bug, please raise " - "an issue at https://github.com/ray-project/xgboost_ray") + "an issue at https://github.com/ray-project/xgboost_ray" + ) self._queue.put((self._rank, item)) @@ -33,7 +35,8 @@ def init_session(*args, **kwargs): if _session: raise ValueError( "Trying to initialize RayXGBoostSession twice." - "\nFIX THIS by not calling `init_session()` manually.") + "\nFIX THIS by not calling `init_session()` manually." + ) _session = RayXGBoostSession(*args, **kwargs) @@ -44,7 +47,8 @@ def get_session() -> RayXGBoostSession: raise ValueError( "Trying to access RayXGBoostSession from outside an XGBoost run." "\nFIX THIS by calling function in `session.py` like " - "`get_actor_rank()` only from within an XGBoost actor session.") + "`get_actor_rank()` only from within an XGBoost actor session." 
+ ) return _session @@ -63,6 +67,7 @@ def get_actor_rank() -> int: @PublicAPI def get_rabit_rank() -> int: import xgboost as xgb + try: # From xgboost>=1.7.0, rabit is replaced by a collective communicator return xgb.collective.get_rank() diff --git a/xgboost_ray/sklearn.py b/xgboost_ray/sklearn.py index 654e8ad1..2d5920fb 100644 --- a/xgboost_ray/sklearn.py +++ b/xgboost_ray/sklearn.py @@ -24,27 +24,30 @@ # License: # https://github.com/dmlc/xgboost/blob/c6a0bdbb5a68232cd59ea556c981c633cc0646ca/LICENSE -from typing import Callable, Tuple, Dict, Optional, Union, Any, List -from packaging.version import Version - -import numpy as np - -import warnings import functools import inspect +import warnings from inspect import _finddoc +from typing import Any, Callable, Dict, List, Optional, Tuple, Union -from ray.util.annotations import PublicAPI, DeveloperAPI +import numpy as np +from packaging.version import Version +from ray.util.annotations import DeveloperAPI, PublicAPI +from xgboost import Booster +from xgboost import __version__ as xgboost_version +from xgboost.sklearn import ( + XGBClassifier, + XGBModel, + XGBRanker, + XGBRegressor, + XGBRFClassifier, + XGBRFRegressor, + _objective_decorator, +) -from xgboost_ray.main import (RayParams, train, predict, XGBOOST_VERSION, - LEGACY_WARNING) +from xgboost_ray.main import LEGACY_WARNING, XGBOOST_VERSION, RayParams, predict, train from xgboost_ray.matrix import RayDMatrix -from xgboost import Booster, __version__ as xgboost_version -from xgboost.sklearn import (XGBModel, XGBClassifier, XGBRegressor, - XGBRFClassifier, XGBRFRegressor, XGBRanker, - _objective_decorator) - # avoiding exception in xgboost==0.9.0 try: from xgboost.sklearn import _deprecate_positional_args @@ -67,21 +70,21 @@ def inner_f(*args, **kwargs): except ImportError: # copied from the file in the top comment def _wrap_evaluation_matrices( - missing: float, - X: Any, - y: Any, - group: Optional[Any], - qid: Optional[Any], - sample_weight: Optional[Any], - base_margin: Optional[Any], - feature_weights: Optional[Any], - eval_set: Optional[List[Tuple[Any, Any]]], - sample_weight_eval_set: Optional[List[Any]], - base_margin_eval_set: Optional[List[Any]], - eval_group: Optional[List[Any]], - eval_qid: Optional[List[Any]], - create_dmatrix: Callable, - label_transform: Callable = lambda x: x, + missing: float, + X: Any, + y: Any, + group: Optional[Any], + qid: Optional[Any], + sample_weight: Optional[Any], + base_margin: Optional[Any], + feature_weights: Optional[Any], + eval_set: Optional[List[Tuple[Any, Any]]], + sample_weight_eval_set: Optional[List[Any]], + base_margin_eval_set: Optional[List[Any]], + eval_group: Optional[List[Any]], + eval_qid: Optional[List[Any]], + create_dmatrix: Callable, + label_transform: Callable = lambda x: x, ) -> Tuple[Any, Optional[List[Tuple[Any, str]]]]: """Convert array_like evaluation matrices into DMatrix. Perform validation on the way. 
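[Editor's note, not part of the patch: the `eval_set` plumbing reformatted in the hunks below is easiest to follow next to a usage sketch. The snippet is a minimal, illustrative example under assumed conditions — a local Ray cluster with at least two free CPUs and synthetic toy data — not code from this repository.]
```python
import numpy as np

from xgboost_ray import RayParams
from xgboost_ray.sklearn import RayXGBClassifier

rng = np.random.default_rng(seed=0)
X = rng.random((64, 4))
y = (X[:, 0] > 0.5).astype(int)  # toy binary labels

clf = RayXGBClassifier(n_estimators=10)
# Each (X, y) tuple passed via eval_set flows through
# _wrap_evaluation_matrices, which wraps it into a RayDMatrix through the
# create_dmatrix callback and names it validation_0, validation_1, ...
clf.fit(
    X,
    y,
    eval_set=[(X[:16], y[:16])],
    ray_params=RayParams(num_actors=2, cpus_per_actor=1),
)
```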
@@ -104,25 +107,34 @@ def validate_or_none(meta: Optional[List], name: str) -> List: return [None] * n_validation if len(meta) != n_validation: raise ValueError( - f"{name}'s length does not eqaul to `eval_set`, " + - f"expecting {n_validation}, got {len(meta)}") + f"{name}'s length does not equal that of `eval_set`, " + + f"expecting {n_validation}, got {len(meta)}" + ) return meta if eval_set is not None: sample_weight_eval_set = validate_or_none( - sample_weight_eval_set, "sample_weight_eval_set") - base_margin_eval_set = validate_or_none(base_margin_eval_set, - "base_margin_eval_set") + sample_weight_eval_set, "sample_weight_eval_set" + ) + base_margin_eval_set = validate_or_none( + base_margin_eval_set, "base_margin_eval_set" + ) eval_group = validate_or_none(eval_group, "eval_group") eval_qid = validate_or_none(eval_qid, "eval_qid") evals = [] for i, (valid_X, valid_y) in enumerate(eval_set): # Skip the duplicated entry. - if all((valid_X is X, valid_y is y, + if all( + ( + valid_X is X, + valid_y is y, sample_weight_eval_set[i] is sample_weight, base_margin_eval_set[i] is base_margin, - eval_group[i] is group, eval_qid[i] is qid)): + eval_group[i] is group, + eval_qid[i] is qid, + ) + ): evals.append(train_dmatrix) else: m = create_dmatrix( @@ -139,15 +151,19 @@ def validate_or_none(meta: Optional[List], name: str) -> List: eval_names = ["validation_{}".format(i) for i in range(nevals)] evals = list(zip(evals, eval_names)) else: - if any(meta is not None for meta in [ + if any( + meta is not None + for meta in [ sample_weight_eval_set, base_margin_eval_set, eval_group, eval_qid, - ]): + ] + ): raise ValueError( "`eval_set` is not set but one of the other evaluation " - "meta info is not None.") + "meta info is not None." + ) evals = [] return train_dmatrix, evals @@ -232,10 +248,14 @@ def _get_doc(object: Any) -> Optional[str]: def _treat_estimator_doc(doc: Optional[str]) -> Optional[str]: """Helper function to make necessary changes in estimator docstrings""" if doc: - doc = doc.replace(*_N_JOBS_DOC_REPLACE).replace( - "scikit-learn API for XGBoost", - "scikit-learn API for Ray-distributed XGBoost").replace( - ":doc:`tree method\n `", "tree method") + doc = ( + doc.replace(*_N_JOBS_DOC_REPLACE) + .replace( + "scikit-learn API for XGBoost", + "scikit-learn API for Ray-distributed XGBoost", + ) + .replace(":doc:`tree method\n `", "tree method") + ) return doc @@ -243,11 +263,14 @@ def _treat_X_doc(doc: Optional[str]) -> Optional[str]: if doc: doc = doc.replace( "Data to predict with.", - "Data to predict with. Can also be a ``RayDMatrix``.") - doc = doc.replace("Feature matrix.", - "Feature matrix. Can also be a ``RayDMatrix``.") - doc = doc.replace("Feature matrix", - "Feature matrix. Can also be a ``RayDMatrix``.") + "Data to predict with. Can also be a ``RayDMatrix``.", + ) + doc = doc.replace( + "Feature matrix.", "Feature matrix. Can also be a ``RayDMatrix``." + ) + doc = doc.replace( + "Feature matrix", "Feature matrix. Can also be a ``RayDMatrix``."
+ ) return doc @@ -263,13 +286,15 @@ def inner_f(*args, **kwargs): return inner_f -def _check_if_params_are_ray_dmatrix(X, - sample_weight, - base_margin, - eval_set, - sample_weight_eval_set, - base_margin_eval_set, - eval_qid=None): +def _check_if_params_are_ray_dmatrix( + X, + sample_weight, + base_margin, + eval_set, + sample_weight_eval_set, + base_margin_eval_set, + eval_qid=None, +): train_dmatrix = None evals = () eval_set = eval_set or () @@ -279,16 +304,20 @@ def _check_if_params_are_ray_dmatrix(X, params_to_warn_about.append("sample_weight") if base_margin is not None: params_to_warn_about.append("base_margin") - warnings.warn(f"X is a RayDMatrix, {', '.join(params_to_warn_about)}" - " will be ignored!") + warnings.warn( + f"X is a RayDMatrix, {', '.join(params_to_warn_about)}" " will be ignored!" + ) train_dmatrix = X if eval_set: - if any(not isinstance(eval_data, RayDMatrix) - or not isinstance(eval_name, str) - for eval_data, eval_name in eval_set): - raise ValueError("If X is a RayDMatrix, all elements of " - "`eval_set` must be (RayDMatrix, str) " - "tuples.") + if any( + not isinstance(eval_data, RayDMatrix) or not isinstance(eval_name, str) + for eval_data, eval_name in eval_set + ): + raise ValueError( + "If X is a RayDMatrix, all elements of " + "`eval_set` must be (RayDMatrix, str) " + "tuples." + ) params_to_warn_about = [] if sample_weight_eval_set is not None: params_to_warn_about.append("sample_weight_eval_set") @@ -299,14 +328,18 @@ def _check_if_params_are_ray_dmatrix(X, if params_to_warn_about: warnings.warn( "`eval_set` is composed of RayDMatrix tuples, " - f"{', '.join(params_to_warn_about)} will be ignored!") + f"{', '.join(params_to_warn_about)} will be ignored!" + ) evals = eval_set or () elif any( - isinstance(eval_x, RayDMatrix) or isinstance(eval_y, RayDMatrix) - for eval_x, eval_y in eval_set): - raise ValueError("If X is not a RayDMatrix, all `eval_set` " - "elements must be (array_like, array_like)" - " tuples.") + isinstance(eval_x, RayDMatrix) or isinstance(eval_y, RayDMatrix) + for eval_x, eval_y in eval_set + ): + raise ValueError( + "If X is not a RayDMatrix, all `eval_set` " + "elements must be (array_like, array_like)" + " tuples." + ) return train_dmatrix, evals @@ -315,8 +348,8 @@ class RayXGBMixin: """Mixin class to provide xgboost-ray functionality""" def _ray_set_ray_params_n_jobs( - self, ray_params: Optional[Union[RayParams, dict]], - n_jobs: Optional[int]) -> RayParams: + self, ray_params: Optional[Union[RayParams, dict]], n_jobs: Optional[int] + ) -> RayParams: """Helper function to set num_actors in ray_params if not set by the user""" if ray_params is None: @@ -324,27 +357,30 @@ def _ray_set_ray_params_n_jobs( n_jobs = 1 ray_params = RayParams(num_actors=n_jobs) elif n_jobs is not None: - warnings.warn("`ray_params` is not `None` and will override " - "the `n_jobs` attribute.") + warnings.warn( + "`ray_params` is not `None` and will override " + "the `n_jobs` attribute." 
+ ) return ray_params def _ray_predict( - self: "XGBModel", - X, - output_margin=False, - ntree_limit=None, - validate_features=True, - base_margin=None, - iteration_range=None, - ray_params: Union[None, RayParams, Dict] = None, - _remote: Optional[bool] = None, - ray_dmatrix_params: Optional[Dict] = None, + self: "XGBModel", + X, + output_margin=False, + ntree_limit=None, + validate_features=True, + base_margin=None, + iteration_range=None, + ray_params: Union[None, RayParams, Dict] = None, + _remote: Optional[bool] = None, + ray_dmatrix_params: Optional[Dict] = None, ): """Distributed predict via Ray""" compat_predict_kwargs = {} if _convert_ntree_limit is not None: iteration_range = _convert_ntree_limit( - self.get_booster(), ntree_limit, iteration_range) + self.get_booster(), ntree_limit, iteration_range + ) iteration_range = self._get_iteration_range(iteration_range) compat_predict_kwargs["iteration_range"] = iteration_range else: @@ -357,15 +393,12 @@ def _ray_predict( if not isinstance(X, RayDMatrix): test = RayDMatrix( - X, - base_margin=base_margin, - missing=self.missing, - **ray_dmatrix_params) + X, base_margin=base_margin, missing=self.missing, **ray_dmatrix_params + ) else: test = X if base_margin is not None: - warnings.warn( - "X is a RayDMatrix, base_margin will be ignored!") + warnings.warn("X is a RayDMatrix, base_margin will be ignored!") return predict( self.get_booster(), @@ -378,21 +411,25 @@ def _ray_predict( ) def _ray_get_wrap_evaluation_matrices_compat_kwargs( - self, label_transform=None) -> dict: + self, label_transform=None + ) -> dict: ret = {} wrap_evaluation_matrices_parameters = inspect.signature( - _wrap_evaluation_matrices).parameters + _wrap_evaluation_matrices + ).parameters if "label_transform" in wrap_evaluation_matrices_parameters: # XGBoost < 1.6.0 identity_func = lambda x: x # noqa ret["label_transform"] = label_transform or identity_func - if hasattr( - self, "enable_categorical" - ) and "enable_categorical" in wrap_evaluation_matrices_parameters: + if ( + hasattr(self, "enable_categorical") + and "enable_categorical" in wrap_evaluation_matrices_parameters + ): ret["enable_categorical"] = self.enable_categorical - if hasattr( - self, "feature_types" - ) and "feature_types" in wrap_evaluation_matrices_parameters: + if ( + hasattr(self, "feature_types") + and "feature_types" in wrap_evaluation_matrices_parameters + ): ret["feature_types"] = self.feature_types return ret @@ -400,10 +437,10 @@ def _ray_get_wrap_evaluation_matrices_compat_kwargs( # provided here for compatibility with legacy xgboost versions # will be overwritten by vanilla xgboost if possible def _configure_fit( - self, - booster: Optional[Union[Booster, "XGBModel", str]], - eval_metric: Optional[Union[Callable, str, List[str]]], - params: Dict[str, Any], + self, + booster: Optional[Union[Booster, "XGBModel", str]], + eval_metric: Optional[Union[Callable, str, List[str]]], + params: Dict[str, Any], ) -> Tuple[Optional[Union[Booster, str]], Dict[str, Any]]: # pylint: disable=protected-access, no-self-use if isinstance(booster, XGBModel): @@ -427,8 +464,7 @@ def _set_evaluation_result(self, evals_result) -> None: if evals_result: for val in evals_result.items(): evals_result_key = list(val[1].keys())[0] - evals_result[val[0]][evals_result_key] = val[1][ - evals_result_key] + evals_result[val[0]][evals_result_key] = val[1][evals_result_key] self.evals_result_ = evals_result @@ -438,31 +474,36 @@ class RayXGBRegressor(XGBRegressor, RayXGBMixin): @_deprecate_positional_args def fit( - 
self, - X, - y, - *, - sample_weight=None, - base_margin=None, - eval_set=None, - eval_metric=None, - early_stopping_rounds=None, - verbose=True, - xgb_model: Optional[Union[Booster, str, "XGBModel"]] = None, - sample_weight_eval_set=None, - base_margin_eval_set=None, - feature_weights=None, - callbacks=None, - ray_params: Union[None, RayParams, Dict] = None, - _remote: Optional[bool] = None, - ray_dmatrix_params: Optional[Dict] = None, + self, + X, + y, + *, + sample_weight=None, + base_margin=None, + eval_set=None, + eval_metric=None, + early_stopping_rounds=None, + verbose=True, + xgb_model: Optional[Union[Booster, str, "XGBModel"]] = None, + sample_weight_eval_set=None, + base_margin_eval_set=None, + feature_weights=None, + callbacks=None, + ray_params: Union[None, RayParams, Dict] = None, + _remote: Optional[bool] = None, + ray_dmatrix_params: Optional[Dict] = None, ): evals_result = {} ray_dmatrix_params = ray_dmatrix_params or {} train_dmatrix, evals = _check_if_params_are_ray_dmatrix( - X, sample_weight, base_margin, eval_set, sample_weight_eval_set, - base_margin_eval_set) + X, + sample_weight, + base_margin, + eval_set, + sample_weight_eval_set, + base_margin_eval_set, + ) if train_dmatrix is None: train_dmatrix, evals = _wrap_evaluation_matrices( @@ -480,11 +521,11 @@ def fit( eval_group=None, eval_qid=None, # changed in xgboost-ray: - create_dmatrix=lambda **kwargs: RayDMatrix(**{ - **kwargs, - **ray_dmatrix_params - }), - **self._ray_get_wrap_evaluation_matrices_compat_kwargs()) + create_dmatrix=lambda **kwargs: RayDMatrix( + **{**kwargs, **ray_dmatrix_params} + ), + **self._ray_get_wrap_evaluation_matrices_compat_kwargs(), + ) params = self.get_xgb_params() @@ -495,13 +536,18 @@ def fit( obj = None try: - model, feval, params = self._configure_fit(xgb_model, eval_metric, - params) + model, feval, params = self._configure_fit(xgb_model, eval_metric, params) except TypeError: # XGBoost >= 1.6.0 - (model, feval, params, early_stopping_rounds, - callbacks) = self._configure_fit(xgb_model, eval_metric, params, - early_stopping_rounds, callbacks) + ( + model, + feval, + params, + early_stopping_rounds, + callbacks, + ) = self._configure_fit( + xgb_model, eval_metric, params, early_stopping_rounds, callbacks + ) # remove those as they will be set in RayXGBoostActor params.pop("n_jobs", None) @@ -540,16 +586,16 @@ def _can_use_inplace_predict(self) -> bool: return False def predict( - self, - X, - output_margin=False, - ntree_limit=None, - validate_features=True, - base_margin=None, - iteration_range=None, - ray_params: Union[None, RayParams, Dict] = None, - _remote: Optional[bool] = None, - ray_dmatrix_params: Optional[Dict] = None, + self, + X, + output_margin=False, + ntree_limit=None, + validate_features=True, + base_margin=None, + iteration_range=None, + ray_params: Union[None, RayParams, Dict] = None, + _remote: Optional[bool] = None, + ray_dmatrix_params: Optional[Dict] = None, ): return self._ray_predict( X, @@ -560,10 +606,10 @@ def predict( iteration_range=iteration_range, ray_params=ray_params, _remote=_remote, - ray_dmatrix_params=ray_dmatrix_params) + ray_dmatrix_params=ray_dmatrix_params, + ) - predict.__doc__ = _treat_X_doc(_get_doc( - XGBRegressor.predict)) + _RAY_PARAMS_DOC + predict.__doc__ = _treat_X_doc(_get_doc(XGBRegressor.predict)) + _RAY_PARAMS_DOC def load_model(self, fname): if not hasattr(self, "_Booster"): @@ -580,25 +626,28 @@ class RayXGBRFRegressor(RayXGBRegressor): if xgboost_version == "0.90": def __init__(self, *args, **kwargs): - raise ValueError( - 
"RayXGBRFRegressor not available with xgboost<1.0.0") + raise ValueError("RayXGBRFRegressor not available with xgboost<1.0.0") + else: @_deprecate_positional_args @_xgboost_version_warn - def __init__(self, - *, - learning_rate=1, - subsample=0.8, - colsample_bynode=0.8, - reg_lambda=1e-5, - **kwargs): + def __init__( + self, + *, + learning_rate=1, + subsample=0.8, + colsample_bynode=0.8, + reg_lambda=1e-5, + **kwargs, + ): super().__init__( learning_rate=learning_rate, subsample=subsample, colsample_bynode=colsample_bynode, reg_lambda=reg_lambda, - **kwargs) + **kwargs, + ) def get_xgb_params(self): params = super().get_xgb_params() @@ -618,24 +667,24 @@ class RayXGBClassifier(XGBClassifier, RayXGBMixin): @_deprecate_positional_args def fit( - self, - X, - y, - *, - sample_weight=None, - base_margin=None, - eval_set=None, - eval_metric=None, - early_stopping_rounds=None, - verbose=True, - xgb_model=None, - sample_weight_eval_set=None, - base_margin_eval_set=None, - feature_weights=None, - callbacks=None, - ray_params: Union[None, RayParams, Dict] = None, - _remote: Optional[bool] = None, - ray_dmatrix_params: Optional[Dict] = None, + self, + X, + y, + *, + sample_weight=None, + base_margin=None, + eval_set=None, + eval_metric=None, + early_stopping_rounds=None, + verbose=True, + xgb_model=None, + sample_weight_eval_set=None, + base_margin_eval_set=None, + feature_weights=None, + callbacks=None, + ray_params: Union[None, RayParams, Dict] = None, + _remote: Optional[bool] = None, + ray_dmatrix_params: Optional[Dict] = None, ): evals_result = {} @@ -644,22 +693,30 @@ def fit( params = self.get_xgb_params() train_dmatrix, evals = _check_if_params_are_ray_dmatrix( - X, sample_weight, base_margin, eval_set, sample_weight_eval_set, - base_margin_eval_set) + X, + sample_weight, + base_margin, + eval_set, + sample_weight_eval_set, + base_margin_eval_set, + ) if train_dmatrix is not None: if not hasattr(self, "use_label_encoder"): - warnings.warn("If X is a RayDMatrix, no label encoding" - " will be performed. Ensure the labels are" - " encoded.") + warnings.warn( + "If X is a RayDMatrix, no label encoding" + " will be performed. Ensure the labels are" + " encoded." + ) elif self.use_label_encoder: raise ValueError( - "X cannot be a RayDMatrix if `use_label_encoder` " - "is set to True") + "X cannot be a RayDMatrix if `use_label_encoder` " "is set to True" + ) if "num_class" not in params: raise ValueError( "`num_class` must be set during initalization if X" - " is a RayDMatrix") + " is a RayDMatrix" + ) self.classes_ = list(range(0, params["num_class"])) self.n_classes_ = params["num_class"] if self.n_classes_ <= 2: @@ -670,8 +727,8 @@ def fit( # Simply raise an error here since there might be many # different ways of reshaping raise ValueError( - "Please reshape the input data X into 2-dimensional " - "matrix.") + "Please reshape the input data X into 2-dimensional " "matrix." 
+ ) label_transform = self._ray_fit_preprocess(y) @@ -689,13 +746,18 @@ def fit( params["num_class"] = self.n_classes_ try: - model, feval, params = self._configure_fit(xgb_model, eval_metric, - params) + model, feval, params = self._configure_fit(xgb_model, eval_metric, params) except TypeError: # XGBoost >= 1.6.0 - (model, feval, params, early_stopping_rounds, - callbacks) = self._configure_fit(xgb_model, eval_metric, params, - early_stopping_rounds, callbacks) + ( + model, + feval, + params, + early_stopping_rounds, + callbacks, + ) = self._configure_fit( + xgb_model, eval_metric, params, early_stopping_rounds, callbacks + ) if train_dmatrix is None: train_dmatrix, evals = _wrap_evaluation_matrices( @@ -713,12 +775,13 @@ def fit( eval_group=None, eval_qid=None, # changed in xgboost-ray: - create_dmatrix=lambda **kwargs: RayDMatrix(**{ - **kwargs, - **ray_dmatrix_params - }), + create_dmatrix=lambda **kwargs: RayDMatrix( + **{**kwargs, **ray_dmatrix_params} + ), **self._ray_get_wrap_evaluation_matrices_compat_kwargs( - label_transform=label_transform)) + label_transform=label_transform + ), + ) # remove those as they will be set in RayXGBoostActor params.pop("n_jobs", None) @@ -764,41 +827,47 @@ def _ray_fit_preprocess(self, y) -> Callable: use_label_encoder = getattr(self, "use_label_encoder", True) label_encoding_check_error = ( "The label must consist of integer " - "labels of form 0, 1, 2, ..., [num_class - 1].") + "labels of form 0, 1, 2, ..., [num_class - 1]." + ) label_encoder_deprecation_msg = ( "The use of label encoder in XGBClassifier is deprecated and will " "be removed in a future release. To remove this warning, do the " "following: 1) Pass option use_label_encoder=False when " "constructing XGBClassifier object; and 2) Encode your labels (y) " - "as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].") + "as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1]." 
+ ) # ray: modified this to allow for compatibility with legacy xgboost - if (_is_cudf_df and _is_cudf_df(y)) or (_is_cudf_ser - and _is_cudf_ser(y)): + if (_is_cudf_df and _is_cudf_df(y)) or (_is_cudf_ser and _is_cudf_ser(y)): import cupy as cp # pylint: disable=E0401 self.classes_ = cp.unique(y.values) self.n_classes_ = len(self.classes_) can_use_label_encoder = False expected_classes = cp.arange(self.n_classes_) - if (self.classes_.shape != expected_classes.shape - or not (self.classes_ == expected_classes).all()): + if ( + self.classes_.shape != expected_classes.shape + or not (self.classes_ == expected_classes).all() + ): raise ValueError(label_encoding_check_error) - elif (_is_cupy_array and _is_cupy_array(y)): + elif _is_cupy_array and _is_cupy_array(y): import cupy as cp # pylint: disable=E0401 self.classes_ = cp.unique(y) self.n_classes_ = len(self.classes_) can_use_label_encoder = False expected_classes = cp.arange(self.n_classes_) - if (self.classes_.shape != expected_classes.shape - or not (self.classes_ == expected_classes).all()): + if ( + self.classes_.shape != expected_classes.shape + or not (self.classes_ == expected_classes).all() + ): raise ValueError(label_encoding_check_error) else: self.classes_ = np.unique(y) self.n_classes_ = len(self.classes_) - if not use_label_encoder and (not np.array_equal( - self.classes_, np.arange(self.n_classes_))): + if not use_label_encoder and ( + not np.array_equal(self.classes_, np.arange(self.n_classes_)) + ): raise ValueError(label_encoding_check_error) if use_label_encoder: @@ -807,7 +876,8 @@ def _ray_fit_preprocess(self, y) -> Callable: "The option use_label_encoder=True is incompatible with " "inputs of type cuDF or cuPy. Please set " "use_label_encoder=False when constructing XGBClassifier " - "object. NOTE:" + label_encoder_deprecation_msg) + "object. 
NOTE:" + label_encoder_deprecation_msg + ) if hasattr(self, "use_label_encoder"): warnings.warn(label_encoder_deprecation_msg, UserWarning) self._le = XGBoostLabelEncoder().fit(y) @@ -821,16 +891,16 @@ def _can_use_inplace_predict(self) -> bool: return False def predict( - self, - X, - output_margin=False, - ntree_limit=None, - validate_features=True, - base_margin=None, - iteration_range: Optional[Tuple[int, int]] = None, - ray_params: Union[None, RayParams, Dict] = None, - _remote: Optional[bool] = None, - ray_dmatrix_params: Optional[Dict] = None, + self, + X, + output_margin=False, + ntree_limit=None, + validate_features=True, + base_margin=None, + iteration_range: Optional[Tuple[int, int]] = None, + ray_params: Union[None, RayParams, Dict] = None, + _remote: Optional[bool] = None, + ray_dmatrix_params: Optional[Dict] = None, ): class_probs = self._ray_predict( X=X, @@ -841,7 +911,8 @@ def predict( iteration_range=iteration_range, ray_params=ray_params, _remote=_remote, - ray_dmatrix_params=ray_dmatrix_params) + ray_dmatrix_params=ray_dmatrix_params, + ) if output_margin: # If output_margin is active, simply return the scores return class_probs @@ -858,19 +929,18 @@ def predict( return self._le.inverse_transform(column_indexes) return column_indexes - predict.__doc__ = _treat_X_doc(_get_doc( - XGBModel.predict)) + _RAY_PARAMS_DOC + predict.__doc__ = _treat_X_doc(_get_doc(XGBModel.predict)) + _RAY_PARAMS_DOC def predict_proba( - self, - X, - ntree_limit=None, - validate_features=False, - base_margin=None, - iteration_range: Optional[Tuple[int, int]] = None, - ray_params: Union[None, RayParams, Dict] = None, - _remote: Optional[bool] = None, - ray_dmatrix_params: Optional[Dict] = None, + self, + X, + ntree_limit=None, + validate_features=False, + base_margin=None, + iteration_range: Optional[Tuple[int, int]] = None, + ray_params: Union[None, RayParams, Dict] = None, + _remote: Optional[bool] = None, + ray_dmatrix_params: Optional[Dict] = None, ) -> np.ndarray: class_probs = self._ray_predict( @@ -882,10 +952,12 @@ def predict_proba( iteration_range=iteration_range, ray_params=ray_params, _remote=_remote, - ray_dmatrix_params=ray_dmatrix_params) + ray_dmatrix_params=ray_dmatrix_params, + ) # If model is loaded from a raw booster there's no `n_classes_` return _cls_predict_proba( - getattr(self, "n_classes_", None), class_probs, np.vstack) + getattr(self, "n_classes_", None), class_probs, np.vstack + ) def load_model(self, fname): if not hasattr(self, "_Booster"): @@ -893,7 +965,8 @@ def load_model(self, fname): return super().load_model(fname) predict_proba.__doc__ = ( - _treat_X_doc(_get_doc(XGBClassifier.predict_proba)) + _RAY_PARAMS_DOC) + _treat_X_doc(_get_doc(XGBClassifier.predict_proba)) + _RAY_PARAMS_DOC + ) RayXGBClassifier.__doc__ = _treat_estimator_doc(_get_doc(XGBClassifier)) @@ -904,48 +977,53 @@ class RayXGBRFClassifier(RayXGBClassifier): if xgboost_version == "0.90": def __init__(self, *args, **kwargs): - raise ValueError( - "RayXGBRFClassifier not available with xgboost<1.0.0") + raise ValueError("RayXGBRFClassifier not available with xgboost<1.0.0") # use_label_encoder added in xgboost commit # c8ec62103a36f1717d032b1ddff2bf9e0642508a (1.3.0) - elif "use_label_encoder" in inspect.signature( - XGBRFClassifier.__init__).parameters: + elif "use_label_encoder" in inspect.signature(XGBRFClassifier.__init__).parameters: @_deprecate_positional_args @_xgboost_version_warn - def __init__(self, - *, - learning_rate=1, - subsample=0.8, - colsample_bynode=0.8, - reg_lambda=1e-5, - 
use_label_encoder=True, - **kwargs): + def __init__( + self, + *, + learning_rate=1, + subsample=0.8, + colsample_bynode=0.8, + reg_lambda=1e-5, + use_label_encoder=True, + **kwargs, + ): super().__init__( learning_rate=learning_rate, subsample=subsample, colsample_bynode=colsample_bynode, reg_lambda=reg_lambda, use_label_encoder=use_label_encoder, - **kwargs) + **kwargs, + ) + else: @_deprecate_positional_args @_xgboost_version_warn - def __init__(self, - *, - learning_rate=1, - subsample=0.8, - colsample_bynode=0.8, - reg_lambda=1e-5, - **kwargs): + def __init__( + self, + *, + learning_rate=1, + subsample=0.8, + colsample_bynode=0.8, + reg_lambda=1e-5, + **kwargs, + ): super().__init__( learning_rate=learning_rate, subsample=subsample, colsample_bynode=colsample_bynode, reg_lambda=reg_lambda, - **kwargs) + **kwargs, + ) def get_xgb_params(self): params = super().get_xgb_params() @@ -965,28 +1043,28 @@ class RayXGBRanker(XGBRanker, RayXGBMixin): @_deprecate_positional_args def fit( - self, - X, - y, - *, - group=None, - qid=None, - sample_weight=None, - base_margin=None, - eval_set=None, - eval_group=None, - eval_qid=None, - eval_metric=None, - early_stopping_rounds=None, - verbose=False, - xgb_model: Optional[Union[Booster, str, XGBModel]] = None, - sample_weight_eval_set=None, - base_margin_eval_set=None, - feature_weights=None, - callbacks=None, - ray_params: Union[None, RayParams, Dict] = None, - _remote: Optional[bool] = None, - ray_dmatrix_params: Optional[Dict] = None, + self, + X, + y, + *, + group=None, + qid=None, + sample_weight=None, + base_margin=None, + eval_set=None, + eval_group=None, + eval_qid=None, + eval_metric=None, + early_stopping_rounds=None, + verbose=False, + xgb_model: Optional[Union[Booster, str, XGBModel]] = None, + sample_weight_eval_set=None, + base_margin_eval_set=None, + feature_weights=None, + callbacks=None, + ray_params: Union[None, RayParams, Dict] = None, + _remote: Optional[bool] = None, + ray_dmatrix_params: Optional[Dict] = None, ): if not (group is None and eval_group is None): @@ -996,8 +1074,7 @@ def fit( if eval_set is not None: if eval_qid is None: - raise ValueError("`eval_qid `is required if" - " `eval_set` is not None") + raise ValueError("`eval_qid` is required if" " `eval_set` is not None") evals_result = {} ray_dmatrix_params = ray_dmatrix_params or {} @@ -1005,8 +1082,14 @@ def fit( params = self.get_xgb_params() train_dmatrix, evals = _check_if_params_are_ray_dmatrix( - X, sample_weight, base_margin, eval_set, sample_weight_eval_set, - base_margin_eval_set, eval_qid) + X, + sample_weight, + base_margin, + eval_set, + sample_weight_eval_set, + base_margin_eval_set, + eval_qid, + ) if train_dmatrix is None: train_dmatrix, evals = _wrap_evaluation_matrices( @@ -1024,23 +1107,29 @@ def fit( eval_group=eval_group, eval_qid=eval_qid, # changed in xgboost-ray: - create_dmatrix=lambda **kwargs: RayDMatrix(**{ - **kwargs, - **ray_dmatrix_params - }), - **self._ray_get_wrap_evaluation_matrices_compat_kwargs()) + create_dmatrix=lambda **kwargs: RayDMatrix( + **{**kwargs, **ray_dmatrix_params} + ), + **self._ray_get_wrap_evaluation_matrices_compat_kwargs(), + ) try: - model, feval, params = self._configure_fit(xgb_model, eval_metric, - params) + model, feval, params = self._configure_fit(xgb_model, eval_metric, params) except TypeError: # XGBoost >= 1.6.0 - (model, feval, params, early_stopping_rounds, - callbacks) = self._configure_fit(xgb_model, eval_metric, params, - early_stopping_rounds, callbacks) + ( + model, + feval, + params,
early_stopping_rounds, + callbacks, + ) = self._configure_fit( + xgb_model, eval_metric, params, early_stopping_rounds, callbacks + ) if callable(feval): raise ValueError( - "Custom evaluation metric is not yet supported for XGBRanker.") + "Custom evaluation metric is not yet supported for XGBRanker." + ) # remove those as they will be set in RayXGBoostActor params.pop("n_jobs", None) @@ -1080,16 +1169,16 @@ def _can_use_inplace_predict(self) -> bool: return False def predict( - self, - X, - output_margin=False, - ntree_limit=None, - validate_features=True, - base_margin=None, - iteration_range=None, - ray_params: Union[None, RayParams, Dict] = None, - _remote: Optional[bool] = None, - ray_dmatrix_params: Optional[Dict] = None, + self, + X, + output_margin=False, + ntree_limit=None, + validate_features=True, + base_margin=None, + iteration_range=None, + ray_params: Union[None, RayParams, Dict] = None, + _remote: Optional[bool] = None, + ray_dmatrix_params: Optional[Dict] = None, ): return self._ray_predict( X, @@ -1100,10 +1189,10 @@ def predict( iteration_range=iteration_range, ray_params=ray_params, _remote=_remote, - ray_dmatrix_params=ray_dmatrix_params) + ray_dmatrix_params=ray_dmatrix_params, + ) - predict.__doc__ = _treat_X_doc(_get_doc( - XGBRanker.predict)) + _RAY_PARAMS_DOC + predict.__doc__ = _treat_X_doc(_get_doc(XGBRanker.predict)) + _RAY_PARAMS_DOC def load_model(self, fname): if not hasattr(self, "_Booster"): diff --git a/xgboost_ray/tests/env_info.sh b/xgboost_ray/tests/env_info.sh index aa140be1..beabb494 100755 --- a/xgboost_ray/tests/env_info.sh +++ b/xgboost_ray/tests/env_info.sh @@ -1,4 +1,5 @@ #!/bin/bash +# shellcheck disable=SC2005 echo "Test environment information" echo "----------------------------" diff --git a/xgboost_ray/tests/fault_tolerance.py b/xgboost_ray/tests/fault_tolerance.py index 63379dd2..e3af87ee 100644 --- a/xgboost_ray/tests/fault_tolerance.py +++ b/xgboost_ray/tests/fault_tolerance.py @@ -1,7 +1,7 @@ import os import time from collections import defaultdict -from typing import Dict, Tuple, Set +from typing import Dict, Set, Tuple import ray from ray.actor import ActorHandle @@ -30,8 +30,7 @@ def schedule_kill(self, rank: int, boost_round: int): """Kill an actor when reaching this global boost round""" self.scheduled_kill[boost_round].add(rank) - def delay_return(self, rank: int, start_boost_round: int, - end_boost_round: int): + def delay_return(self, rank: int, start_boost_round: int, end_boost_round: int): """Do not allow an actor to finish data loading between these rounds""" self.delayed_return[rank].add((start_boost_round, end_boost_round)) @@ -69,10 +68,7 @@ def get_logs(self): class DelayedLoadingCallback(DistributedCallback): """Used to control when actors return to training""" - def __init__(self, - ft_manager: ActorHandle, - reload_data=True, - sleep_time=0.5): + def __init__(self, ft_manager: ActorHandle, reload_data=True, sleep_time=0.5): self.ft_manager = ft_manager self.reload_data = reload_data self.sleep_time = sleep_time diff --git a/xgboost_ray/tests/release/benchmark_cpu_gpu.py b/xgboost_ray/tests/release/benchmark_cpu_gpu.py index 8178a035..807937d7 100644 --- a/xgboost_ray/tests/release/benchmark_cpu_gpu.py +++ b/xgboost_ray/tests/release/benchmark_cpu_gpu.py @@ -1,29 +1,36 @@ +import argparse import glob import os - -import argparse import shutil import time import ray -from xgboost_ray import train, RayDMatrix, RayFileType, \ - RayDeviceQuantileDMatrix, RayParams + +from xgboost_ray import ( + RayDeviceQuantileDMatrix, + 
RayDMatrix, + RayFileType, + RayParams, + train, +) from xgboost_ray.tests.utils import create_parquet_in_tempdir if "OMP_NUM_THREADS" in os.environ: del os.environ["OMP_NUM_THREADS"] -def train_ray(path, - num_workers, - num_boost_rounds, - num_files=0, - regression=False, - use_gpu=False, - smoke_test=False, - ray_params=None, - xgboost_params=None, - **kwargs): +def train_ray( + path, + num_workers, + num_boost_rounds, + num_files=0, + regression=False, + use_gpu=False, + smoke_test=False, + ray_params=None, + xgboost_params=None, + **kwargs, +): if num_files: files = sorted(glob.glob(f"{path}/**/*.parquet")) while num_files > len(files): @@ -34,6 +41,7 @@ def train_ray(path, if use_gpu: try: import cupy # noqa: F401 + use_device_matrix = True except ImportError: use_device_matrix = False @@ -44,30 +52,34 @@ def train_ray(path, num_actors=num_workers, label="labels", ignore=["partition"], - filetype=RayFileType.PARQUET) + filetype=RayFileType.PARQUET, + ) else: dtrain = RayDMatrix( path, num_actors=num_workers, label="labels", ignore=["partition"], - filetype=RayFileType.PARQUET) + filetype=RayFileType.PARQUET, + ) - config = xgboost_params or { - "tree_method": "hist" if not use_gpu else "gpu_hist" - } + config = xgboost_params or {"tree_method": "hist" if not use_gpu else "gpu_hist"} if not regression: # Classification - config.update({ - "objective": "binary:logistic", - "eval_metric": ["logloss", "error"], - }) + config.update( + { + "objective": "binary:logistic", + "eval_metric": ["logloss", "error"], + } + ) else: # Regression - config.update({ - "objective": "reg:squarederror", - "eval_metric": ["logloss", "rmse"], - }) + config.update( + { + "objective": "reg:squarederror", + "eval_metric": ["logloss", "rmse"], + } + ) start = time.time() evals_result = {} @@ -76,19 +88,21 @@ def train_ray(path, dtrain, evals_result=evals_result, num_boost_round=num_boost_rounds, - ray_params=ray_params or RayParams( + ray_params=ray_params + or RayParams( max_actor_restarts=2, num_actors=num_workers, cpus_per_actor=4 if not smoke_test else 1, - gpus_per_actor=0 if not use_gpu else 1), + gpus_per_actor=0 if not use_gpu else 1, + ), evals=[(dtrain, "train")], - **kwargs) + **kwargs, + ) taken = time.time() - start print(f"TRAIN TIME TAKEN: {taken:.2f} seconds") bst.save_model("benchmark_{}.xgb".format("cpu" if not use_gpu else "gpu")) - print("Final training error: {:.4f}".format( - evals_result["train"]["error"][-1])) + print("Final training error: {:.4f}".format(evals_result["train"]["error"][-1])) return bst, taken @@ -100,16 +114,18 @@ def train_ray(path, parser.add_argument("num_files", type=int, help="num files") parser.add_argument( - "--file", default="/data/parted.parquet", type=str, help="data file") + "--file", default="/data/parted.parquet", type=str, help="data file" + ) parser.add_argument( - "--regression", action="store_true", default=False, help="regression") + "--regression", action="store_true", default=False, help="regression" + ) - parser.add_argument( - "--gpu", action="store_true", default=False, help="gpu") + parser.add_argument("--gpu", action="store_true", default=False, help="gpu") parser.add_argument( - "--smoke-test", action="store_true", default=False, help="smoke test") + "--smoke-test", action="store_true", default=False, help="smoke test" + ) args = parser.parse_args() @@ -125,14 +141,16 @@ def train_ray(path, num_rows=args.num_workers * 500, num_features=4, num_classes=2, - num_partitions=args.num_workers * 10) + num_partitions=args.num_workers * 10, + ) use_gpu 
= False else: path = args.file if not os.path.exists(path): raise ValueError( f"Benchmarking data not found: {path}." - f"\nFIX THIS by running `python create_test_data.py` first.") + f"\nFIX THIS by running `python create_test_data.py` first." + ) init_start = time.time() if args.smoke_test: @@ -149,21 +167,31 @@ def train_ray(path, num_files=num_files, regression=args.regression, use_gpu=use_gpu, - smoke_test=args.smoke_test) + smoke_test=args.smoke_test, + ) full_taken = time.time() - full_start - print(f"TOTAL TIME TAKEN: {full_taken:.2f} seconds " - f"({init_taken:.2f} for init)") + print(f"TOTAL TIME TAKEN: {full_taken:.2f} seconds " f"({init_taken:.2f} for init)") if args.smoke_test: shutil.rmtree(temp_dir, ignore_errors=True) else: with open("res.csv", "at") as fp: - fp.writelines([ - ",".join([ - str(e) for e in [ - num_workers, num_files, - int(use_gpu), num_boost_rounds, init_taken, full_taken, - train_taken - ] - ]) + "\n" - ]) + fp.writelines( + [ + ",".join( + [ + str(e) + for e in [ + num_workers, + num_files, + int(use_gpu), + num_boost_rounds, + init_taken, + full_taken, + train_taken, + ] + ] + ) + + "\n" + ] + ) diff --git a/xgboost_ray/tests/release/benchmark_ft.py b/xgboost_ray/tests/release/benchmark_ft.py index 3d7ba9f9..1125b375 100644 --- a/xgboost_ray/tests/release/benchmark_ft.py +++ b/xgboost_ray/tests/release/benchmark_ft.py @@ -1,41 +1,52 @@ -from typing import List, Dict - import argparse import glob import os +from typing import Dict, List import numpy as np - import ray from ray import tune from ray.tune import CLIReporter -from xgboost_ray import train, RayDMatrix, RayFileType, \ - RayDeviceQuantileDMatrix, RayParams, RayShardingMode + +from xgboost_ray import ( + RayDeviceQuantileDMatrix, + RayDMatrix, + RayFileType, + RayParams, + RayShardingMode, + train, +) from xgboost_ray.callback import EnvironmentCallback from xgboost_ray.matrix import _get_sharding_indices -from xgboost_ray.tests.fault_tolerance import DelayedLoadingCallback, \ - DieCallback, FaultToleranceManager +from xgboost_ray.tests.fault_tolerance import ( + DelayedLoadingCallback, + DieCallback, + FaultToleranceManager, +) from xgboost_ray.tests.utils import create_parquet_in_tempdir if "OMP_NUM_THREADS" in os.environ: del os.environ["OMP_NUM_THREADS"] -def train_ray(train_files, - eval_files, - num_workers, - num_boost_round, - regression=False, - use_gpu=False, - ray_params=None, - xgboost_params=None, - ft_manager=None, - aws=None, - **kwargs): +def train_ray( + train_files, + eval_files, + num_workers, + num_boost_round, + regression=False, + use_gpu=False, + ray_params=None, + xgboost_params=None, + ft_manager=None, + aws=None, + **kwargs, +): use_device_matrix = False if use_gpu: try: import cupy # noqa: F401 + use_device_matrix = True except ImportError: use_device_matrix = False @@ -46,26 +57,30 @@ def train_ray(train_files, num_actors=num_workers, label="labels", ignore=["partition"], - filetype=RayFileType.PARQUET) + filetype=RayFileType.PARQUET, + ) deval = RayDeviceQuantileDMatrix( eval_files, num_actors=num_workers, label="labels", ignore=["partition"], - filetype=RayFileType.PARQUET) + filetype=RayFileType.PARQUET, + ) else: dtrain = RayDMatrix( train_files, num_actors=num_workers, label="labels", ignore=["partition"], - filetype=RayFileType.PARQUET) + filetype=RayFileType.PARQUET, + ) deval = RayDMatrix( eval_files, num_actors=num_workers, label="labels", ignore=["partition"], - filetype=RayFileType.PARQUET) + filetype=RayFileType.PARQUET, + ) config = xgboost_params or 
{"tree_method": "hist"} @@ -74,24 +89,29 @@ def train_ray(train_files, if not regression: # Classification - config.update({ - "objective": "binary:logistic", - "eval_metric": ["logloss", "error"], - }) + config.update( + { + "objective": "binary:logistic", + "eval_metric": ["logloss", "error"], + } + ) return_metric = "error" else: # Regression - config.update({ - "objective": "reg:squarederror", - "eval_metric": ["logloss", "rmse"], - }) + config.update( + { + "objective": "reg:squarederror", + "eval_metric": ["logloss", "rmse"], + } + ) return_metric = "rmse" xgboost_callbacks = [] distributed_callbacks = [] if ft_manager: delay_callback = DelayedLoadingCallback( - ft_manager, reload_data=True, sleep_time=0.1) + ft_manager, reload_data=True, sleep_time=0.1 + ) distributed_callbacks.append(delay_callback) die_callback = DieCallback(ft_manager, training_delay=0.1) @@ -118,25 +138,31 @@ def train_ray(train_files, ray_params=ray_params, evals=[(dtrain, "train"), (deval, "eval")], callbacks=xgboost_callbacks, - **kwargs) + **kwargs, + ) bst.save_model("benchmark_{}.xgb".format("cpu" if not use_gpu else "gpu")) - print("Final training error: {:.4f}".format( - evals_result["train"][return_metric][-1])) + print( + "Final training error: {:.4f}".format(evals_result["train"][return_metric][-1]) + ) results = { "train-logloss": evals_result["train"]["logloss"][-1], f"train-{return_metric}": evals_result["train"][return_metric][-1], "eval-logloss": evals_result["eval"]["logloss"][-1], f"eval-{return_metric}": evals_result["eval"][return_metric][-1], - "total_n": additional_results["total_n"] + "total_n": additional_results["total_n"], } return bst, results -def ft_setup(workers: List[int], num_rounds: int, die_round_factor: 0.25, - comeback_round_factor: 0.75): +def ft_setup( + workers: List[int], + num_rounds: int, + die_round_factor: 0.25, + comeback_round_factor: 0.75, +): """Setup fault tolerance manager, schedule kills and comebacks""" if workers is None: return None @@ -150,13 +176,14 @@ def ft_setup(workers: List[int], num_rounds: int, die_round_factor: 0.25, for worker in workers: ft_manager.schedule_kill.remote(rank=worker, boost_round=die_round) ft_manager.delay_return.remote( - rank=1, - start_boost_round=die_round - 2, - end_boost_round=comeback_round - 1) + rank=1, start_boost_round=die_round - 2, end_boost_round=comeback_round - 1 + ) - print(f"Scheduled workers {list(workers)} to die at round {die_round} " - f"and to come back at round {comeback_round} " - f"(total {num_rounds} training rounds)") + print( + f"Scheduled workers {list(workers)} to die at round {die_round} " + f"and to come back at round {comeback_round} " + f"(total {num_rounds} training rounds)" + ) return ft_manager @@ -188,9 +215,8 @@ def run_experiments(config, files, aws): if num_affected_workers: affected_workers = np.random.choice( - np.arange(1, num_workers), - size=num_affected_workers, - replace=False).tolist() + np.arange(1, num_workers), size=num_affected_workers, replace=False + ).tolist() else: affected_workers = None @@ -216,7 +242,8 @@ def run_experiments(config, files, aws): xgboost_params=xgboost_params, ft_manager=None, aws=aws, - early_stopping_rounds=10) + early_stopping_rounds=10, + ) return results @@ -230,7 +257,8 @@ def run_experiments(config, files, aws): sharding=sharding_mode, rank=rank, num_actors=num_workers, - n=len(train_files)) + n=len(train_files), + ) mask = np.ones(len(train_files), dtype=bool) mask[remove_shards] = False @@ -253,7 +281,8 @@ def run_experiments(config, files, aws): 
ray_params=ray_params, xgboost_params=xgboost_params, ft_manager=None, - aws=aws) + aws=aws, + ) return results @@ -312,7 +341,8 @@ def run_experiments(config, files, aws): ray_params=ray_params, xgboost_params=xgboost_params, ft_manager=ft_manager, - aws=aws) + aws=aws, + ) return results @@ -324,26 +354,25 @@ def run_experiments(config, files, aws): parser.add_argument("num_rounds", type=int, help="num boost rounds") parser.add_argument("num_files", type=int, help="num files") - parser.add_argument( - "--cpu", default=0, type=int, help="num cpus per worker") + parser.add_argument("--cpu", default=0, type=int, help="num cpus per worker") parser.add_argument( - "--file", default="/data/parted.parquet", type=str, help="data file") + "--file", default="/data/parted.parquet", type=str, help="data file" + ) parser.add_argument( - "--regression", action="store_true", default=False, help="regression") + "--regression", action="store_true", default=False, help="regression" + ) - parser.add_argument( - "--gpu", action="store_true", default=False, help="gpu") + parser.add_argument("--gpu", action="store_true", default=False, help="gpu") parser.add_argument( - "--calibrate", - action="store_true", - default=False, - help="calibrate boost rounds") + "--calibrate", action="store_true", default=False, help="calibrate boost rounds" + ) parser.add_argument( - "--smoke-test", action="store_true", default=False, help="smoke test") + "--smoke-test", action="store_true", default=False, help="smoke test" + ) args = parser.parse_args() @@ -361,7 +390,8 @@ def run_experiments(config, files, aws): num_rows=args.num_workers * 500, num_features=4, num_classes=2, - num_partitions=args.num_workers * 10) + num_partitions=args.num_workers * 10, + ) use_gpu = False else: path = args.file @@ -369,32 +399,35 @@ def run_experiments(config, files, aws): base, num_partitions = path.split("#", maxsplit=1) num_partitions = int(num_partitions) files = [ - f"{base}/partition={i}/part_{i}.parquet" - for i in range(num_partitions) + f"{base}/partition={i}/part_{i}.parquet" for i in range(num_partitions) ] - print(f"Using S3 dataset with base {base} and " - f"{num_partitions} partitions.") + print( + f"Using S3 dataset with base {base} and " + f"{num_partitions} partitions." + ) try: aws = { "AWS_ACCESS_KEY_ID": os.environ["AWS_ACCESS_KEY_ID"], - "AWS_SECRET_ACCESS_KEY": os.environ[ - "AWS_SECRET_ACCESS_KEY"], + "AWS_SECRET_ACCESS_KEY": os.environ["AWS_SECRET_ACCESS_KEY"], "AWS_SESSION_TOKEN": os.environ["AWS_SESSION_TOKEN"], } except KeyError as e: raise ValueError( "Trying to access AWS S3, but credentials are not set " "in the environment. Did you forget to set your " - "credentials?") from e + "credentials?" + ) from e elif not os.path.exists(path): raise ValueError( f"Benchmarking data not found: {path}." f"\nFIX THIS by running `python create_test_data.py` first." + ) else: files = sorted(glob.glob(f"{path}/**/*.parquet")) - print(f"Using local dataset with base {path} and " - f"{len(files)} partitions.") + print( + f"Using local dataset with base {path} and " f"{len(files)} partitions." 
+ ) if num_files: while num_files > len(files): @@ -416,12 +449,14 @@ def run_experiments(config, files, aws): "num_workers": num_workers, "num_boost_round": num_boost_round, "seed": 1000, - "condition": tune.grid_search([ - "fewer_workers", - "non_elastic", - "elastic_no_comeback", - "elastic_comeback", - ]), + "condition": tune.grid_search( + [ + "fewer_workers", + "non_elastic", + "elastic_no_comeback", + "elastic_comeback", + ] + ), "affected_workers": tune.grid_search([0, 1, 2, 3]), "regression": args.regression, "use_gpu": args.gpu, @@ -439,9 +474,14 @@ def run_experiments(config, files, aws): reporter = CLIReporter( parameter_columns=["condition", "affected_workers"], metric_columns=[ - metric, "eval-logloss", train_metric, "total_n", "time_total_s" + metric, + "eval-logloss", + train_metric, + "total_n", + "time_total_s", ], - print_intermediate_tables=True) + print_intermediate_tables=True, + ) analysis = tune.run( tune.with_parameters(run_experiments, files=files, aws=aws), @@ -452,7 +492,7 @@ def run_experiments(config, files, aws): reuse_actors=True, progress_reporter=reporter, log_to_file=True, - verbose=2) + verbose=2, + ) - print(f"Best config: {analysis.best_config} " - f"with result {analysis.best_result}") + print(f"Best config: {analysis.best_config} " f"with result {analysis.best_result}") diff --git a/xgboost_ray/tests/release/create_learnable_data.py b/xgboost_ray/tests/release/create_learnable_data.py index 32da79b2..939d03c7 100644 --- a/xgboost_ray/tests/release/create_learnable_data.py +++ b/xgboost_ray/tests/release/create_learnable_data.py @@ -1,8 +1,8 @@ import argparse -import numpy as np import os -import pandas as pd +import numpy as np +import pandas as pd from sklearn.datasets import make_classification, make_regression if __name__ == "__main__": @@ -12,47 +12,38 @@ parser = argparse.ArgumentParser(description="Create fake data.") parser.add_argument("filename", type=str, default="/data/parted.parquet/") parser.add_argument( - "-r", - "--num-rows", - required=False, - type=int, - default=1e8, - help="num rows") + "-r", "--num-rows", required=False, type=int, default=1e8, help="num rows" + ) parser.add_argument( "-p", "--num-partitions", required=False, type=int, default=100, - help="num partitions") + help="num partitions", + ) parser.add_argument( "-c", "--num-cols", required=False, type=int, default=4, - help="num columns (features)") + help="num columns (features)", + ) parser.add_argument( - "-C", - "--num-classes", - required=False, - type=int, - default=2, - help="num classes") + "-C", "--num-classes", required=False, type=int, default=2, help="num classes" + ) parser.add_argument( - "-s", - "--seed", - required=False, - type=int, - default=1234, - help="random seed") + "-s", "--seed", required=False, type=int, default=1234, help="random seed" + ) parser.add_argument( "-T", "--target", required=False, type=float, default=0.8, - help="target accuracy") + help="target accuracy", + ) args = parser.parse_args() @@ -93,8 +84,7 @@ rows_per_partition = np.floor(len(data) / num_partitions) - partition_arr = np.repeat( - np.arange(num_partitions), repeats=rows_per_partition) + partition_arr = np.repeat(np.arange(num_partitions), repeats=rows_per_partition) if len(partition_arr) < len(data): # If this was not evenly divided, append missing = len(data) - len(partition_arr) @@ -114,4 +104,5 @@ filename, partition_cols=["partition"], engine="pyarrow", - partition_filename_cb=lambda key: f"part_{key[0]}.parquet") + partition_filename_cb=lambda key: 
f"part_{key[0]}.parquet", + ) diff --git a/xgboost_ray/tests/release/create_test_data.py b/xgboost_ray/tests/release/create_test_data.py index 21b1c8fd..2984b50e 100644 --- a/xgboost_ray/tests/release/create_test_data.py +++ b/xgboost_ray/tests/release/create_test_data.py @@ -1,7 +1,8 @@ import argparse -import numpy as np import os +import numpy as np + from xgboost_ray.tests.utils import create_parquet if __name__ == "__main__": @@ -10,42 +11,33 @@ parser = argparse.ArgumentParser(description="Create fake data.") parser.add_argument( - "filename", type=str, default="/data/parted.parquet/", help="ray/dask") + "filename", type=str, default="/data/parted.parquet/", help="ray/dask" + ) parser.add_argument( - "-r", - "--num-rows", - required=False, - type=int, - default=1e8, - help="num rows") + "-r", "--num-rows", required=False, type=int, default=1e8, help="num rows" + ) parser.add_argument( "-p", "--num-partitions", required=False, type=int, default=100, - help="num partitions") + help="num partitions", + ) parser.add_argument( "-c", "--num-cols", required=False, type=int, default=4, - help="num columns (features)") + help="num columns (features)", + ) parser.add_argument( - "-C", - "--num-classes", - required=False, - type=int, - default=2, - help="num classes") + "-C", "--num-classes", required=False, type=int, default=2, help="num classes" + ) parser.add_argument( - "-s", - "--seed", - required=False, - type=int, - default=1234, - help="random seed") + "-s", "--seed", required=False, type=int, default=1234, help="random seed" + ) args = parser.parse_args() @@ -55,4 +47,5 @@ num_rows=int(args.num_rows), num_partitions=int(args.num_partitions), num_features=int(args.num_cols), - num_classes=int(args.num_classes)) + num_classes=int(args.num_classes), + ) diff --git a/xgboost_ray/tests/release/custom_objective_metric.py b/xgboost_ray/tests/release/custom_objective_metric.py index b337dfda..5cdcf5ff 100644 --- a/xgboost_ray/tests/release/custom_objective_metric.py +++ b/xgboost_ray/tests/release/custom_objective_metric.py @@ -10,6 +10,8 @@ def _init_ray(self): if __name__ == "__main__": - import pytest import sys + + import pytest + sys.exit(pytest.main(["-v", f"{__file__}::XGBoostDistributedAPITest"])) diff --git a/xgboost_ray/tests/release/run_e2e_gpu.sh b/xgboost_ray/tests/release/run_e2e_gpu.sh index 2e345c96..8c12b602 100755 --- a/xgboost_ray/tests/release/run_e2e_gpu.sh +++ b/xgboost_ray/tests/release/run_e2e_gpu.sh @@ -1,3 +1,5 @@ +#!/bin/bash + if [ ! -f "./.anyscale.yaml" ]; then echo "Anyscale project not initialized. Please run 'anyscale init'" exit 1 diff --git a/xgboost_ray/tests/release/setup_xgboost.sh b/xgboost_ray/tests/release/setup_xgboost.sh index e1e76083..6c0fcb6a 100755 --- a/xgboost_ray/tests/release/setup_xgboost.sh +++ b/xgboost_ray/tests/release/setup_xgboost.sh @@ -5,7 +5,7 @@ pip install pytest pip uninstall -y xgboost_ray || true # Install xgboost package -pip install -U ${XGBOOST_RAY_PACKAGE:-xgboost_ray} +pip install -U "${XGBOOST_RAY_PACKAGE:-xgboost_ray}" # Create test dataset sudo mkdir -p /data || true diff --git a/xgboost_ray/tests/release/start_cpu_cluster.sh b/xgboost_ray/tests/release/start_cpu_cluster.sh index bc2b64d2..4cd71c2b 100755 --- a/xgboost_ray/tests/release/start_cpu_cluster.sh +++ b/xgboost_ray/tests/release/start_cpu_cluster.sh @@ -1,3 +1,5 @@ +#!/bin/bash + if [ ! -f "./.anyscale.yaml" ]; then echo "Anyscale project not initialized. 
Please run 'anyscale init'" exit 1 diff --git a/xgboost_ray/tests/release/start_ft_cluster.sh b/xgboost_ray/tests/release/start_ft_cluster.sh index 38f18eed..d5337681 100755 --- a/xgboost_ray/tests/release/start_ft_cluster.sh +++ b/xgboost_ray/tests/release/start_ft_cluster.sh @@ -1,3 +1,5 @@ +#!/bin/bash + if [ ! -f "./.anyscale.yaml" ]; then echo "Anyscale project not initialized. Please run 'anyscale init'" exit 1 diff --git a/xgboost_ray/tests/release/start_gpu_cluster.sh b/xgboost_ray/tests/release/start_gpu_cluster.sh index 7b2b0907..2e6237b0 100755 --- a/xgboost_ray/tests/release/start_gpu_cluster.sh +++ b/xgboost_ray/tests/release/start_gpu_cluster.sh @@ -1,3 +1,5 @@ +#!/bin/bash + if [ ! -f "./.anyscale.yaml" ]; then echo "Anyscale project not initialized. Please run 'anyscale init'" exit 1 diff --git a/xgboost_ray/tests/release/submit_cpu_gpu_benchmark.sh b/xgboost_ray/tests/release/submit_cpu_gpu_benchmark.sh index 9c0d41b1..94c07e00 100755 --- a/xgboost_ray/tests/release/submit_cpu_gpu_benchmark.sh +++ b/xgboost_ray/tests/release/submit_cpu_gpu_benchmark.sh @@ -1,3 +1,5 @@ +#!/bin/bash + if [ ! -f "./.anyscale.yaml" ]; then echo "Anyscale project not initialized. Please run 'anyscale init'" exit 1 diff --git a/xgboost_ray/tests/release/submit_ft_benchmark.sh b/xgboost_ray/tests/release/submit_ft_benchmark.sh index be1ee7e8..9871787a 100755 --- a/xgboost_ray/tests/release/submit_ft_benchmark.sh +++ b/xgboost_ray/tests/release/submit_ft_benchmark.sh @@ -1,3 +1,5 @@ +#!/bin/bash + if [ ! -f "./.anyscale.yaml" ]; then echo "Anyscale project not initialized. Please run 'anyscale init'" exit 1 diff --git a/xgboost_ray/tests/release/tune_placement.py b/xgboost_ray/tests/release/tune_placement.py index 5e3f18a1..5026e05d 100644 --- a/xgboost_ray/tests/release/tune_placement.py +++ b/xgboost_ray/tests/release/tune_placement.py @@ -17,25 +17,22 @@ hosts actors of the same Ray Tune trial. 
""" +import argparse import json import os - -import argparse import shutil import time from collections import defaultdict -from xgboost_ray.compat import TrainingCallback - import ray - +from benchmark_cpu_gpu import train_ray from ray import tune -from ray.tune.session import get_trial_id from ray.tune.integration.docker import DockerSyncer +from ray.tune.session import get_trial_id from ray.util import get_node_ip_address -from benchmark_cpu_gpu import train_ray from xgboost_ray import RayParams +from xgboost_ray.compat import TrainingCallback from xgboost_ray.session import put_queue from xgboost_ray.tests.utils import create_parquet from xgboost_ray.tune import TuneReportCallback @@ -59,21 +56,24 @@ def after_iteration(self, model, epoch, evals_log): time.sleep(8) -def tune_test(path, - num_trials, - num_workers, - num_boost_rounds, - num_files=0, - regression=False, - use_gpu=False, - fake_data=False, - smoke_test=False): +def tune_test( + path, + num_trials, + num_workers, + num_boost_rounds, + num_files=0, + regression=False, + use_gpu=False, + fake_data=False, + smoke_test=False, +): ray_params = RayParams( elastic_training=False, max_actor_restarts=0, num_actors=num_workers, cpus_per_actor=1, - gpus_per_actor=0 if not use_gpu else 1) + gpus_per_actor=0 if not use_gpu else 1, + ) def local_train(config): temp_dir = None @@ -90,23 +90,27 @@ def local_train(config): num_rows=args.num_workers * 500, num_features=4, num_classes=2, - num_partitions=args.num_workers * 10) + num_partitions=args.num_workers * 10, + ) else: if not os.path.exists(path): raise ValueError( f"Benchmarking data not found: {path}." f"\nFIX THIS by running `python create_test_data.py` " - f"on all nodes first.") + f"on all nodes first." + ) local_path = path xgboost_params = { "tree_method": "hist" if not use_gpu else "gpu_hist", } - xgboost_params.update({ - "objective": "binary:logistic", - "eval_metric": ["logloss", "error"], - }) + xgboost_params.update( + { + "objective": "binary:logistic", + "eval_metric": ["logloss", "error"], + } + ) xgboost_params.update(config) @@ -124,8 +128,8 @@ def local_train(config): xgboost_params=xgboost_params, # kwargs additional_results=additional_results, - callbacks=[PlacementCallback(), - TuneReportCallback()]) + callbacks=[PlacementCallback(), TuneReportCallback()], + ) bst.save_model("tuned.xgb") @@ -136,9 +140,7 @@ def local_train(config): tune_trial = get_trial_id() with tune.checkpoint_dir(num_boost_rounds + 1) as checkpoint_dir: - with open( - os.path.join(checkpoint_dir, "callback_returns.json"), - "wt") as f: + with open(os.path.join(checkpoint_dir, "callback_returns.json"), "wt") as f: json.dump({tune_trial: trial_ips}, f) if temp_dir: @@ -147,7 +149,7 @@ def local_train(config): search_space = { "eta": tune.loguniform(1e-4, 1e-1), "subsample": tune.uniform(0.5, 1.0), - "max_depth": tune.randint(1, 9) + "max_depth": tune.randint(1, 9), } analysis = tune.run( @@ -155,7 +157,8 @@ def local_train(config): config=search_space, num_samples=num_trials, sync_config=tune.SyncConfig(sync_to_driver=DockerSyncer), - resources_per_trial=ray_params.get_tune_resources()) + resources_per_trial=ray_params.get_tune_resources(), + ) # In our PACK scheduling, we expect that each IP hosts only workers # for one Ray Tune trial. 
@@ -163,8 +166,8 @@ def local_train(config): for trial in analysis.trials: trial = trial with open( - os.path.join(trial.checkpoint.value, "callback_returns.json"), - "rt") as f: + os.path.join(trial.checkpoint.value, "callback_returns.json"), "rt" + ) as f: trial_to_ips = json.load(f) for tune_trial, ips in trial_to_ips.items(): for node_ip in ips: @@ -182,29 +185,30 @@ def local_train(config): if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Test Ray Tune placement " - "strategy") + parser = argparse.ArgumentParser(description="Test Ray Tune placement " "strategy") parser.add_argument("num_trials", type=int, help="num trials") - parser.add_argument( - "num_workers", type=int, help="num workers (per trial)") + parser.add_argument("num_workers", type=int, help="num workers (per trial)") parser.add_argument("num_rounds", type=int, help="num boost rounds") parser.add_argument("num_files", type=int, help="num files (per trial)") parser.add_argument( - "--file", default="/data/parted.parquet", type=str, help="data file") + "--file", default="/data/parted.parquet", type=str, help="data file" + ) parser.add_argument( - "--regression", action="store_true", default=False, help="regression") + "--regression", action="store_true", default=False, help="regression" + ) - parser.add_argument( - "--gpu", action="store_true", default=False, help="gpu") + parser.add_argument("--gpu", action="store_true", default=False, help="gpu") parser.add_argument( - "--fake-data", action="store_true", default=False, help="fake data") + "--fake-data", action="store_true", default=False, help="fake data" + ) parser.add_argument( - "--smoke-test", action="store_true", default=False, help="smoke test") + "--smoke-test", action="store_true", default=False, help="smoke test" + ) args = parser.parse_args() @@ -233,6 +237,7 @@ def local_train(config): regression=args.regression, use_gpu=use_gpu, fake_data=args.fake_data, - smoke_test=args.smoke_test) + smoke_test=args.smoke_test, + ) full_taken = time.time() - full_start print(f"TOTAL TIME TAKEN: {full_taken:.2f} seconds ") diff --git a/xgboost_ray/tests/test_client.py b/xgboost_ray/tests/test_client.py index 4f558e8f..41e287a5 100644 --- a/xgboost_ray/tests/test_client.py +++ b/xgboost_ray/tests/test_client.py @@ -1,9 +1,9 @@ import os import pytest - import ray from ray.util.client.ray_client_helpers import ray_start_client_server + from xgboost_ray.data_sources.ray_dataset import RAY_DATASET_AVAILABLE @@ -24,26 +24,29 @@ def start_client_server_5_cpus(): def test_simple_train(start_client_server_4_cpus): assert ray.util.client.ray.is_connected() from xgboost_ray.examples.simple import main + main(num_actors=4, cpus_per_actor=1) -@pytest.mark.skipif( - os.environ.get("TUNE", "0") != "1", reason="Skipping Tune tests") +@pytest.mark.skipif(os.environ.get("TUNE", "0") != "1", reason="Skipping Tune tests") def test_simple_tune(start_client_server_4_cpus): assert ray.util.client.ray.is_connected() from xgboost_ray.examples.simple_tune import main + main(cpus_per_actor=1, num_actors=1, num_samples=4) def test_simple_dask(start_client_server_5_cpus): assert ray.util.client.ray.is_connected() from xgboost_ray.examples.simple_dask import main + main(cpus_per_actor=1, num_actors=4) def test_simple_modin(start_client_server_5_cpus): assert ray.util.client.ray.is_connected() from xgboost_ray.examples.simple_modin import main + main(cpus_per_actor=1, num_actors=4) @@ -52,9 +55,10 @@ def test_client_actor_cpus(start_client_server_5_cpus): from
ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy @ray.remote - class DummyTrainActor(): + class DummyTrainActor: def test(self): import xgboost_ray + return xgboost_ray.main._ray_get_actor_cpus() actor = DummyTrainActor.options(num_cpus=2).remote() @@ -64,21 +68,25 @@ def test(self): ray.get(pg.ready()) actor2 = DummyTrainActor.options( num_cpus=2, - scheduling_strategy=PlacementGroupSchedulingStrategy( - placement_group=pg)).remote() + scheduling_strategy=PlacementGroupSchedulingStrategy(placement_group=pg), + ).remote() assert ray.get(actor2.test.remote()) == 2 @pytest.mark.skipif( not RAY_DATASET_AVAILABLE, - reason="Ray datasets are not available in this version of Ray") + reason="Ray datasets are not available in this version of Ray", +) def test_simple_ray_dataset(start_client_server_5_cpus): assert ray.util.client.ray.is_connected() from xgboost_ray.examples.simple_ray_dataset import main + main(cpus_per_actor=1, num_actors=4) if __name__ == "__main__": - import pytest # noqa: F811 import sys + + import pytest # noqa: F811 + sys.exit(pytest.main(["-v", __file__])) diff --git a/xgboost_ray/tests/test_colocation.py b/xgboost_ray/tests/test_colocation.py index 9297d32e..f72ff93b 100644 --- a/xgboost_ray/tests/test_colocation.py +++ b/xgboost_ray/tests/test_colocation.py @@ -3,13 +3,13 @@ import tempfile import unittest from unittest.mock import patch -import pytest import numpy as np - +import pytest import ray from ray.util.queue import _QueueActor -from xgboost_ray import train, RayDMatrix, RayParams + +from xgboost_ray import RayDMatrix, RayParams, train from xgboost_ray.main import _train from xgboost_ray.util import _EventActor @@ -28,12 +28,15 @@ def get_node_id(self): class TestColocation(unittest.TestCase): def setUp(self) -> None: repeat = 8 # Repeat data a couple of times for stability - self.x = np.array([ - [1, 0, 0, 0], # Feature 0 -> Label 0 - [0, 1, 0, 0], # Feature 1 -> Label 1 - [0, 0, 1, 1], # Feature 2+3 -> Label 0 - [0, 0, 1, 0], # Feature 2+!3 -> Label 1 - ] * repeat) + self.x = np.array( + [ + [1, 0, 0, 0], # Feature 0 -> Label 0 + [0, 1, 0, 0], # Feature 1 -> Label 1 + [0, 0, 1, 1], # Feature 2+3 -> Label 0 + [0, 0, 1, 0], # Feature 2+!3 -> Label 1 + ] + * repeat + ) self.y = np.array([0, 1, 0, 1] * repeat) self.params = { @@ -42,7 +45,7 @@ def setUp(self) -> None: "nthread": 1, "max_depth": 2, "objective": "binary:logistic", - "seed": 1000 + "seed": 1000, } self.kwargs = {} @@ -75,11 +78,14 @@ def test_communication_colocation(self): assert local_node in ray.state.node_ids() def _mock_train(*args, _training_state, **kwargs): - assert ray.get(_training_state.queue.actor.get_node_id.remote( - )) == ray.state.current_node_id() - assert ray.get( - _training_state.stop_event.actor.get_node_id.remote()) == \ - ray.state.current_node_id() + assert ( + ray.get(_training_state.queue.actor.get_node_id.remote()) + == ray.state.current_node_id() + ) + assert ( + ray.get(_training_state.stop_event.actor.get_node_id.remote()) + == ray.state.current_node_id() + ) return _train(*args, _training_state=_training_state, **kwargs) with patch("xgboost_ray.main._train") as mocked: @@ -88,7 +94,8 @@ def _mock_train(*args, _training_state, **kwargs): self.params, RayDMatrix(self.x, self.y), num_boost_round=2, - ray_params=RayParams(max_actor_restarts=1, num_actors=6)) + ray_params=RayParams(max_actor_restarts=1, num_actors=6), + ) def test_no_tune_spread(self): """Tests whether workers are spread when not using Tune.""" @@ -98,13 +105,11 @@ def 
test_no_tune_spread(self): cluster.wait_for_nodes() ray.init(address=cluster.address) - ray_params = RayParams( - max_actor_restarts=1, num_actors=2, cpus_per_actor=2) + ray_params = RayParams(max_actor_restarts=1, num_actors=2, cpus_per_actor=2) def _mock_train(*args, _training_state, **kwargs): try: - results = _train( - *args, _training_state=_training_state, **kwargs) + results = _train(*args, _training_state=_training_state, **kwargs) return results except Exception: raise @@ -124,7 +129,8 @@ def _mock_train(*args, _training_state, **kwargs): self.params, RayDMatrix(self.x, self.y), num_boost_round=4, - ray_params=ray_params) + ray_params=ray_params, + ) def test_tune_pack(self): """Tests whether workers are packed when using Tune.""" @@ -140,12 +146,12 @@ def test_tune_pack(self): ray.init(address=cluster.address) ray_params = RayParams( - max_actor_restarts=1, num_actors=num_actors, cpus_per_actor=1) + max_actor_restarts=1, num_actors=num_actors, cpus_per_actor=1 + ) def _mock_train(*args, _training_state, **kwargs): try: - results = _train( - *args, _training_state=_training_state, **kwargs) + results = _train(*args, _training_state=_training_state, **kwargs) return results except Exception: raise @@ -167,7 +173,8 @@ def inner_func(config): params, RayDMatrix(x, y), num_boost_round=4, - ray_params=ray_params) + ray_params=ray_params, + ) return inner_func @@ -192,10 +199,14 @@ def test_timeout(self): ray_params=RayParams( max_actor_restarts=1, num_actors=2, - resources_per_actor={"invalid": 1})) + resources_per_actor={"invalid": 1}, + ), + ) if __name__ == "__main__": - import pytest # noqa: F811 import sys + + import pytest # noqa: F811 + sys.exit(pytest.main(["-v", __file__])) diff --git a/xgboost_ray/tests/test_data_source.py b/xgboost_ray/tests/test_data_source.py index c86e16d1..4567b09b 100644 --- a/xgboost_ray/tests/test_data_source.py +++ b/xgboost_ray/tests/test_data_source.py @@ -1,18 +1,16 @@ import unittest -from typing import Sequence, List +from typing import List, Sequence from unittest.mock import patch import numpy as np import pandas as pd - import ray from ray import ObjectRef -from xgboost_ray.data_sources import Modin, Dask, Partitioned -from xgboost_ray.main import _RemoteRayXGBoostActor - -from xgboost_ray.data_sources.modin import MODIN_INSTALLED +from xgboost_ray.data_sources import Dask, Modin, Partitioned from xgboost_ray.data_sources.dask import DASK_INSTALLED +from xgboost_ray.data_sources.modin import MODIN_INSTALLED +from xgboost_ray.main import _RemoteRayXGBoostActor class _DistributedDataSourceTest: @@ -31,12 +29,10 @@ def _init_ray(self): if not ray.is_initialized(): ray.init(num_cpus=1) - def _testAssignPartitions(self, part_nodes, actor_nodes, - expected_actor_parts): + def _testAssignPartitions(self, part_nodes, actor_nodes, expected_actor_parts): raise NotImplementedError - def _testDataSourceAssignment(self, part_nodes, actor_nodes, - expected_actor_parts): + def _testDataSourceAssignment(self, part_nodes, actor_nodes, expected_actor_parts): raise NotImplementedError def testAssignEvenTrivial(self): @@ -54,10 +50,8 @@ def testAssignEvenTrivial(self): 2: [4, 5], 3: [6, 7], } - self._testAssignPartitions(part_nodes, actor_nodes, - expected_actor_parts) - self._testDataSourceAssignment(part_nodes, actor_nodes, - expected_actor_parts) + self._testAssignPartitions(part_nodes, actor_nodes, expected_actor_parts) + self._testDataSourceAssignment(part_nodes, actor_nodes, expected_actor_parts) def testAssignEvenRedistributeOne(self): """Assign actors to 
co-located partitions, non-trivial case. @@ -75,10 +69,8 @@ def testAssignEvenRedistributeOne(self): 2: [3, 4], 3: [6, 7], } - self._testAssignPartitions(part_nodes, actor_nodes, - expected_actor_parts) - self._testDataSourceAssignment(part_nodes, actor_nodes, - expected_actor_parts) + self._testAssignPartitions(part_nodes, actor_nodes, expected_actor_parts) + self._testDataSourceAssignment(part_nodes, actor_nodes, expected_actor_parts) def testAssignEvenRedistributeMost(self): """Assign actors to co-located partitions, redistribute case. @@ -94,8 +86,7 @@ def testAssignEvenRedistributeMost(self): 2: [3, 6], 3: [4, 7], } - self._testAssignPartitions(part_nodes, actor_nodes, - expected_actor_parts) + self._testAssignPartitions(part_nodes, actor_nodes, expected_actor_parts) # This part of the test never works - Modin materializes partitions # onto different nodes while unwrapping. @@ -115,10 +106,8 @@ def testAssignUnevenTrivial(self): 1: [3, 4], 2: [5, 6, 7], } - self._testAssignPartitions(part_nodes, actor_nodes, - expected_actor_parts) - self._testDataSourceAssignment(part_nodes, actor_nodes, - expected_actor_parts) + self._testAssignPartitions(part_nodes, actor_nodes, expected_actor_parts) + self._testDataSourceAssignment(part_nodes, actor_nodes, expected_actor_parts) def testAssignUnevenRedistribute(self): """Assign actors to co-located partitions, redistribute uneven case. @@ -134,10 +123,8 @@ def testAssignUnevenRedistribute(self): 1: [2, 3, 4], 2: [6, 7], } - self._testAssignPartitions(part_nodes, actor_nodes, - expected_actor_parts) - self._testDataSourceAssignment(part_nodes, actor_nodes, - expected_actor_parts) + self._testAssignPartitions(part_nodes, actor_nodes, expected_actor_parts) + self._testDataSourceAssignment(part_nodes, actor_nodes, expected_actor_parts) def testAssignUnevenRedistributeColocated(self): """Assign actors to co-located partitions, redistribute uneven case. @@ -153,10 +140,8 @@ def testAssignUnevenRedistributeColocated(self): 1: [1, 3], 2: [5, 6], } - self._testAssignPartitions(part_nodes, actor_nodes, - expected_actor_parts) - self._testDataSourceAssignment(part_nodes, actor_nodes, - expected_actor_parts) + self._testAssignPartitions(part_nodes, actor_nodes, expected_actor_parts) + self._testDataSourceAssignment(part_nodes, actor_nodes, expected_actor_parts) def testAssignUnevenRedistributeAll(self): """Assign actors to co-located partitions, redistribute uneven case. @@ -172,23 +157,18 @@ def testAssignUnevenRedistributeAll(self): 1: [1, 3], 2: [5, 6], } - self._testAssignPartitions(part_nodes, actor_nodes, - expected_actor_parts) - self._testDataSourceAssignment(part_nodes, actor_nodes, - expected_actor_parts) + self._testAssignPartitions(part_nodes, actor_nodes, expected_actor_parts) + self._testDataSourceAssignment(part_nodes, actor_nodes, expected_actor_parts) @unittest.skipIf( - not MODIN_INSTALLED, - reason="Modin is not installed in a supported version.") + not MODIN_INSTALLED, reason="Modin is not installed in a supported version." 
+) class ModinDataSourceTest(_DistributedDataSourceTest, unittest.TestCase): """This test suite validates core RayDMatrix functionality.""" - def _testAssignPartitions(self, part_nodes, actor_nodes, - expected_actor_parts): - partitions = [ - ray.put(p) for p in np.array_split(self.x, len(part_nodes)) - ] + def _testAssignPartitions(self, part_nodes, actor_nodes, expected_actor_parts): + partitions = [ray.put(p) for p in np.array_split(self.x, len(part_nodes))] # Dict from partition (obj ref) to node host part_to_node = dict(zip(partitions, [f"node{n}" for n in part_nodes])) @@ -204,7 +184,8 @@ def _testAssignPartitions(self, part_nodes, actor_nodes, actor_to_parts[actor_rank][i], partitions[part_id], msg=f"Assignment failed: Actor rank {actor_rank}, " - f"partition {i} is not partition with ID {part_id}.") + f"partition {i} is not partition with ID {part_id}.", + ) def _getActorToParts(self, actors_to_node, node_to_part): def unwrap(data, *args, **kwargs): @@ -213,23 +194,20 @@ def unwrap(data, *args, **kwargs): def actor_ranks(actors): return actors_to_node - with patch("modin.distributed.dataframe.pandas.unwrap_partitions" - ) as mock_unwrap, patch( - "xgboost_ray.data_sources.modin.get_actor_rank_ips" - ) as mock_ranks: + with patch( + "modin.distributed.dataframe.pandas.unwrap_partitions" + ) as mock_unwrap, patch( + "xgboost_ray.data_sources.modin.get_actor_rank_ips" + ) as mock_ranks: mock_unwrap.side_effect = unwrap mock_ranks.side_effect = actor_ranks - _, actor_to_parts = Modin.get_actor_shards( - data=node_to_part, actors=[]) + _, actor_to_parts = Modin.get_actor_shards(data=node_to_part, actors=[]) return actor_to_parts - def _testDataSourceAssignment(self, part_nodes, actor_nodes, - expected_actor_parts): - node_ips = [ - node["NodeManagerAddress"] for node in ray.nodes() if node["Alive"] - ] + def _testDataSourceAssignment(self, part_nodes, actor_nodes, expected_actor_parts): + node_ips = [node["NodeManagerAddress"] for node in ray.nodes() if node["Alive"]] if len(node_ips) < max(max(actor_nodes), max(part_nodes)) + 1: print("Not running on cluster, skipping rest of this test.") return @@ -244,17 +222,25 @@ def create_remote_df(arr): return ray.put(pd.DataFrame(arr)) partitions = np.array_split(self.x, len(part_nodes)) - node_dfs: Sequence[ObjectRef] = ray.get([ - create_remote_df.options(resources={ - f"node:{pip}": 0.1 - }).remote(partitions[pid]) for pid, pip in enumerate(part_node_ips) - ]) - node_ip_dfs = [(ray.put(part_node_ips[pid]), node_df) - for pid, node_df in enumerate(node_dfs)] + node_dfs: Sequence[ObjectRef] = ray.get( + [ + create_remote_df.options(resources={f"node:{pip}": 0.1}).remote( + partitions[pid] + ) + for pid, pip in enumerate(part_node_ips) + ] + ) + node_ip_dfs = [ + (ray.put(part_node_ips[pid]), node_df) + for pid, node_df in enumerate(node_dfs) + ] # Create modin dataframe from distributed partitions - from modin.distributed.dataframe.pandas import (from_partitions, - unwrap_partitions) + from modin.distributed.dataframe.pandas import ( + from_partitions, + unwrap_partitions, + ) + modin_df = from_partitions(node_ip_dfs, axis=0) # Sanity check @@ -265,23 +251,26 @@ def create_remote_df(arr): self.assertSequenceEqual( [df[0][0] for df in partitions], [df[0][0] for df in ray.get(list(df_objs))], - msg="Modin mixed up the partition order") + msg="Modin mixed up the partition order", + ) self.assertSequenceEqual( part_node_ips, ray.get(list(ip_objs)), - msg="Modin moved partitions to different IPs") + msg="Modin moved partitions to different IPs", + 
) except AssertionError as exc: print(f"Modin part of the test failed: {exc}") - print("This is a stochastic test failure. Ignoring the rest " - "of this test.") + print( + "This is a stochastic test failure. Ignoring the rest " "of this test." + ) return # Create ray actors actors = [ - _RemoteRayXGBoostActor.options(resources={ - f"node:{nip}": 0.1 - }).remote(rank=rank, num_actors=len(actor_nodes)) + _RemoteRayXGBoostActor.options(resources={f"node:{nip}": 0.1}).remote( + rank=rank, num_actors=len(actor_nodes) + ) for rank, nip in enumerate(actor_node_ips) ] @@ -296,21 +285,23 @@ def create_remote_df(arr): self.assertTrue( assigned_df.equals(part_df), msg=f"Assignment failed: Actor rank {actor_rank}, " - f"partition {i} is not partition with ID {part_id}.") + f"partition {i} is not partition with ID {part_id}.", + ) @unittest.skipIf( - not DASK_INSTALLED, reason="Dask is not installed in a supported version.") + not DASK_INSTALLED, reason="Dask is not installed in a supported version." +) class DaskDataSourceTest(_DistributedDataSourceTest, unittest.TestCase): """This test suite validates core RayDMatrix functionality.""" - def _testAssignPartitions(self, part_nodes, actor_nodes, - expected_actor_parts): + def _testAssignPartitions(self, part_nodes, actor_nodes, expected_actor_parts): partitions = list(range(len(part_nodes))) # Dict from partition (id) to node host part_to_node = dict( - zip(range(len(partitions)), [f"node{n}" for n in part_nodes])) + zip(range(len(partitions)), [f"node{n}" for n in part_nodes]) + ) node_to_part = [(n, p) for p, n in part_to_node.items()] actors_to_node = dict(enumerate(f"node{n}" for n in actor_nodes)) @@ -323,11 +314,13 @@ def _testAssignPartitions(self, part_nodes, actor_nodes, actor_to_parts[actor_rank][i], partitions[part_id], msg=f"Assignment failed: Actor rank {actor_rank}, " - f"partition {i} is not partition with ID {part_id}.") + f"partition {i} is not partition with ID {part_id}.", + ) def _getActorToParts(self, actors_to_node, node_to_part): def ip_to_parts(data, *args, **kwargs): from collections import defaultdict + ip_to_parts_dict = defaultdict(list) for node, pid in data: ip_to_parts_dict[node].append(pid) @@ -336,31 +329,28 @@ def ip_to_parts(data, *args, **kwargs): def actor_ranks(actors): return actors_to_node - with patch("xgboost_ray.data_sources.dask.get_ip_to_parts" - ) as mock_parts, patch( - "xgboost_ray.data_sources.dask.get_actor_rank_ips" - ) as mock_ranks: + with patch( + "xgboost_ray.data_sources.dask.get_ip_to_parts" + ) as mock_parts, patch( + "xgboost_ray.data_sources.dask.get_actor_rank_ips" + ) as mock_ranks: mock_parts.side_effect = ip_to_parts mock_ranks.side_effect = actor_ranks - _, actor_to_parts = Dask.get_actor_shards( - data=node_to_part, actors=[]) + _, actor_to_parts = Dask.get_actor_shards(data=node_to_part, actors=[]) return actor_to_parts - def _testDataSourceAssignment(self, part_nodes, actor_nodes, - expected_actor_parts): - self.skipTest( - "Data-locality aware scheduling using Dask is currently broken.") + def _testDataSourceAssignment(self, part_nodes, actor_nodes, expected_actor_parts): + self.skipTest("Data-locality aware scheduling using Dask is currently broken.") import dask import dask.dataframe as dd from ray.util.dask import ray_dask_get + dask.config.set(scheduler=ray_dask_get) - node_ips = [ - node["NodeManagerAddress"] for node in ray.nodes() if node["Alive"] - ] + node_ips = [node["NodeManagerAddress"] for node in ray.nodes() if node["Alive"]] if len(node_ips) < max(max(actor_nodes), 
max(part_nodes)) + 1: print("Not running on cluster, skipping rest of this test.") return @@ -375,17 +365,20 @@ def create_remote_df(arr): return dd.from_array(arr) partitions = np.array_split(self.x, len(part_nodes)) - node_dfs: List[dd.DataFrame] = ray.get([ - create_remote_df.options(resources={ - f"node:{pip}": 0.1 - }).remote(partitions[pid]) for pid, pip in enumerate(part_node_ips) - ]) + node_dfs: List[dd.DataFrame] = ray.get( + [ + create_remote_df.options(resources={f"node:{pip}": 0.1}).remote( + partitions[pid] + ) + for pid, pip in enumerate(part_node_ips) + ] + ) node_dfs_concat = dd.concat(node_dfs).persist() # Get node IPs partition_locations_df = node_dfs_concat.map_partitions( - lambda df: pd.DataFrame([ray.util.get_node_ip_address()] - )).compute() + lambda df: pd.DataFrame([ray.util.get_node_ip_address()]) + ).compute() partition_locations = [ partition_locations_df[0].iloc[i] for i in range(partition_locations_df.size) @@ -400,23 +393,26 @@ def create_remote_df(arr): self.assertSequenceEqual( [df[0][0] for df in partitions], [df[0][0] for df in dask_df.partitions.compute()], - msg="Dask mixed up the partition order") + msg="Dask mixed up the partition order", + ) self.assertSequenceEqual( part_node_ips, partition_locations, - msg="Dask moved partitions to different IPs") + msg="Dask moved partitions to different IPs", + ) except AssertionError as exc: print(f"Dask part of the test failed: {exc}") - print("This is a stochastic test failure. Ignoring the rest " - "of this test.") + print( + "This is a stochastic test failure. Ignoring the rest " "of this test." + ) return # Create ray actors actors = [ - _RemoteRayXGBoostActor.options(resources={ - f"node:{nip}": 0.1 - }).remote(rank=rank, num_actors=len(actor_nodes)) + _RemoteRayXGBoostActor.options(resources={f"node:{nip}": 0.1}).remote( + rank=rank, num_actors=len(actor_nodes) + ) for rank, nip in enumerate(actor_node_ips) ] @@ -431,7 +427,8 @@ def create_remote_df(arr): self.assertTrue( assigned_df.equals(part_df), msg=f"Assignment failed: Actor rank {actor_rank}, " - f"partition {i} is not partition with ID {part_id}.") + f"partition {i} is not partition with ID {part_id}.", + ) # Ray Datasets data source is not tested, as we do not make use of xgboost-ray @@ -440,11 +437,9 @@ def create_remote_df(arr): class PartitionedSourceTest(_DistributedDataSourceTest, unittest.TestCase): - def _testAssignPartitions(self, part_nodes, actor_nodes, - expected_actor_parts): + def _testAssignPartitions(self, part_nodes, actor_nodes, expected_actor_parts): partitions = [ - ray.put(pd.DataFrame(p)) - for p in np.array_split(self.x, len(part_nodes)) + ray.put(pd.DataFrame(p)) for p in np.array_split(self.x, len(part_nodes)) ] # Dict from partition (obj ref) to node host @@ -452,8 +447,9 @@ def _testAssignPartitions(self, part_nodes, actor_nodes, actors_to_node = dict(enumerate(f"node{n}" for n in actor_nodes)) - actor_to_parts = self._getActorToParts(actors_to_node, partitions, - part_to_node, part_nodes) + actor_to_parts = self._getActorToParts( + actors_to_node, partitions, part_to_node, part_nodes + ) for actor_rank, part_ids in expected_actor_parts.items(): for i, part_id in enumerate(part_ids): @@ -461,12 +457,12 @@ def _testAssignPartitions(self, part_nodes, actor_nodes, actor_to_parts[actor_rank][i], partitions[part_id], msg=f"Assignment failed: Actor rank {actor_rank}, " - f"partition {i} is not partition with ID {part_id}.") + f"partition {i} is not partition with ID {part_id}.", + ) def _mk_partitioned(self, part_to_node, nr, 
nc, shapes): class Parted: - """Class exposing __partitioned__ - """ + """Class exposing __partitioned__""" def __init__(self, parted): self.__partitioned__ = parted @@ -476,7 +472,7 @@ def __init__(self, parted): "shape": (nr, nc), "partition_tiling": (num_parts, 1), "get": lambda x: ray.get(x), - "partitions": {} + "partitions": {}, } startx = 0 for i, pn in enumerate(part_to_node.items()): @@ -491,31 +487,26 @@ def __init__(self, parted): return Parted(data) - def _getActorToParts(self, actors_to_node, partitions, part_to_node, - part_nodes): + def _getActorToParts(self, actors_to_node, partitions, part_to_node, part_nodes): def actor_ranks(actors): return actors_to_node - with patch("xgboost_ray.data_sources.partitioned.get_actor_rank_ips" - ) as mock_ranks: + with patch( + "xgboost_ray.data_sources.partitioned.get_actor_rank_ips" + ) as mock_ranks: mock_ranks.side_effect = actor_ranks nr, nc = self.x.shape data = self._mk_partitioned( - part_to_node, nr, nc, - {p: ray.get(p).shape - for p in partitions}) + part_to_node, nr, nc, {p: ray.get(p).shape for p in partitions} + ) - _, actor_to_parts = Partitioned.get_actor_shards( - data=data, actors=[]) + _, actor_to_parts = Partitioned.get_actor_shards(data=data, actors=[]) return actor_to_parts - def _testDataSourceAssignment(self, part_nodes, actor_nodes, - expected_actor_parts): - node_ips = [ - node["NodeManagerAddress"] for node in ray.nodes() if node["Alive"] - ] + def _testDataSourceAssignment(self, part_nodes, actor_nodes, expected_actor_parts): + node_ips = [node["NodeManagerAddress"] for node in ray.nodes() if node["Alive"]] if len(node_ips) < max(max(actor_nodes), max(part_nodes)) + 1: print("Not running on cluster, skipping rest of this test.") return @@ -533,9 +524,10 @@ def create_remote_df(arr): node_dfs, shapes = {}, {} for pid, pip in enumerate(part_node_ips): pref = ray.get( - create_remote_df.options(resources={ - f"node:{pip}": 0.1 - }).remote(partitions[pid])) + create_remote_df.options(resources={f"node:{pip}": 0.1}).remote( + partitions[pid] + ) + ) node_dfs[pref] = pip shapes[pref] = partitions[pid].shape @@ -545,9 +537,9 @@ def create_remote_df(arr): # Create ray actors actors = [ - _RemoteRayXGBoostActor.options(resources={ - f"node:{nip}": 0.1 - }).remote(rank=rank, num_actors=len(actor_nodes)) + _RemoteRayXGBoostActor.options(resources={f"node:{nip}": 0.1}).remote( + rank=rank, num_actors=len(actor_nodes) + ) for rank, nip in enumerate(actor_node_ips) ] @@ -561,10 +553,13 @@ def create_remote_df(arr): self.assertTrue( assigned_df.equals(part_df), msg=f"Assignment failed: Actor rank {actor_rank}, " - f"partition {i} is not partition with ID {part_id}.") + f"partition {i} is not partition with ID {part_id}.", + ) if __name__ == "__main__": - import pytest import sys + + import pytest + sys.exit(pytest.main(["-v", __file__])) diff --git a/xgboost_ray/tests/test_end_to_end.py b/xgboost_ray/tests/test_end_to_end.py index 7f9c5a12..2913ce46 100644 --- a/xgboost_ray/tests/test_end_to_end.py +++ b/xgboost_ray/tests/test_end_to_end.py @@ -1,20 +1,18 @@ import os import shutil import tempfile - -import numpy as np import unittest -import xgboost as xgb +import numpy as np import ray +import xgboost as xgb from ray.exceptions import RayActorError, RayTaskError from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy - from scipy.sparse import csr_matrix -from xgboost_ray import RayParams, train, RayDMatrix, predict, RayShardingMode -from xgboost_ray.main import RayXGBoostTrainingError +from xgboost_ray 
import RayDMatrix, RayParams, RayShardingMode, predict, train from xgboost_ray.callback import DistributedCallback +from xgboost_ray.main import RayXGBoostTrainingError from xgboost_ray.tests.utils import get_num_trees @@ -72,12 +70,15 @@ class XGBoostRayEndToEndTest(unittest.TestCase): def setUp(self): repeat = 8 # Repeat data a couple of times for stability - self.x = np.array([ - [1, 0, 0, 0], # Feature 0 -> Label 0 - [0, 1, 0, 0], # Feature 1 -> Label 1 - [0, 0, 1, 1], # Feature 2+3 -> Label 2 - [0, 0, 1, 0], # Feature 2+!3 -> Label 3 - ] * repeat) + self.x = np.array( + [ + [1, 0, 0, 0], # Feature 0 -> Label 0 + [0, 1, 0, 0], # Feature 1 -> Label 1 + [0, 0, 1, 1], # Feature 2+3 -> Label 2 + [0, 0, 1, 0], # Feature 2+!3 -> Label 3 + ] + * repeat + ) self.y = np.array([0, 1, 2, 3] * repeat) self.params = { @@ -85,7 +86,7 @@ def setUp(self): "nthread": 1, "max_depth": 2, "objective": "multi:softmax", - "num_class": 4 + "num_class": 4, } def tearDown(self): @@ -141,9 +142,10 @@ def test_client_actor_cpus(self): ray.init(num_cpus=5, num_gpus=0) @ray.remote - class DummyTrainActor(): + class DummyTrainActor: def test(self): import xgboost_ray + return xgboost_ray.main._ray_get_actor_cpus() actor = DummyTrainActor.options(num_cpus=2).remote() @@ -153,13 +155,11 @@ def test(self): ray.get(pg.ready()) actor2 = DummyTrainActor.options( num_cpus=2, - scheduling_strategy=PlacementGroupSchedulingStrategy( - placement_group=pg)).remote() + scheduling_strategy=PlacementGroupSchedulingStrategy(placement_group=pg), + ).remote() assert ray.get(actor2.test.remote()) == 2 - def _testJointTraining(self, - sharding=RayShardingMode.INTERLEAVED, - softprob=False): + def _testJointTraining(self, sharding=RayShardingMode.INTERLEAVED, softprob=False): """Train with Ray. 
The data will be split, but the trees should be combined together and find the true model.""" params = self.params.copy() @@ -169,7 +169,8 @@ def _testJointTraining(self, bst = train( params, RayDMatrix(self.x, self.y, sharding=sharding), - ray_params=RayParams(num_actors=2)) + ray_params=RayParams(num_actors=2), + ) x_mat = xgb.DMatrix(self.x) pred_y = bst.predict(x_mat) @@ -189,7 +190,8 @@ def _testJointTraining(self, bst = train( params, RayDMatrix(self.x[:-1], self.y[:-1], sharding=sharding), - ray_params=RayParams(num_actors=2)) + ray_params=RayParams(num_actors=2), + ) x_mat = RayDMatrix(self.x[:-1], sharding=sharding) pred_y = predict(bst, x_mat, ray_params=RayParams(num_actors=2)) @@ -201,19 +203,16 @@ def _testJointTraining(self, def testJointTrainingInterleaved(self): ray.init(num_cpus=2, num_gpus=0) self._testJointTraining(sharding=RayShardingMode.INTERLEAVED) - self._testJointTraining( - sharding=RayShardingMode.INTERLEAVED, softprob=True) + self._testJointTraining(sharding=RayShardingMode.INTERLEAVED, softprob=True) def testJointTrainingBatch(self): ray.init(num_cpus=2, num_gpus=0) self._testJointTraining(sharding=RayShardingMode.BATCH) self._testJointTraining(sharding=RayShardingMode.BATCH, softprob=True) - def testTrainPredict(self, - init=True, - remote=None, - softprob=False, - **ray_param_dict): + def testTrainPredict( + self, init=True, remote=None, softprob=False, **ray_param_dict + ): """Train with evaluation and predict""" if init: ray.init(num_cpus=2, num_gpus=0) @@ -233,7 +232,8 @@ def testTrainPredict(self, ray_params=RayParams(num_actors=2, **ray_param_dict), evals=[(dtrain, "dtrain")], evals_result=evals_result, - _remote=remote) + _remote=remote, + ) self.assertEqual(get_num_trees(bst), 38) @@ -244,7 +244,8 @@ def testTrainPredict(self, bst, x_mat, ray_params=RayParams(num_actors=2, **ray_param_dict), - _remote=remote) + _remote=remote, + ) if softprob: self.assertEqual(pred_y.shape[1], len(np.unique(self.y))) @@ -281,14 +282,16 @@ def testDistributedCallbacksTrainPredict(self, init=True, remote=False): test_callback = _make_callback(tmpdir) self.testTrainPredict( - init=init, remote=remote, distributed_callbacks=[test_callback]) + init=init, remote=remote, distributed_callbacks=[test_callback] + ) rank_0_log_file = os.path.join(tmpdir, "rank_0.log") rank_1_log_file = os.path.join(tmpdir, "rank_1.log") self.assertTrue(os.path.exists(rank_1_log_file)) rank_0_log = open(rank_0_log_file, "rt").read() self.assertEqual( - rank_0_log, "Actor 0: Init\n" + rank_0_log, + "Actor 0: Init\n" "Actor 0: Before loading\n" "Actor 0: After loading\n" "Actor 0: Before train\n" @@ -297,7 +300,8 @@ def testDistributedCallbacksTrainPredict(self, init=True, remote=False): "Actor 0: Before loading\n" "Actor 0: After loading\n" "Actor 0: Before predict\n" - "Actor 0: After predict\n") + "Actor 0: After predict\n", + ) shutil.rmtree(tmpdir) def testDistributedCallbacksTrainPredictClient(self): @@ -326,11 +330,12 @@ def testFailPrintErrors(self): { "objective": "multi:softmax", "num_class": 2, - "eval_metric": ["logloss", "error"] + "eval_metric": ["logloss", "error"], }, # This will error train_set, evals=[(train_set, "train")], - ray_params=RayParams(num_actors=1, max_actor_restarts=0)) + ray_params=RayParams(num_actors=1, max_actor_restarts=0), + ) except RuntimeError as exc: self.assertTrue(exc.__cause__) self.assertTrue(isinstance(exc.__cause__, RayActorError)) @@ -340,11 +345,12 @@ def testFailPrintErrors(self): self.assertTrue(exc.__cause__.__cause__.cause) self.assertTrue( - 
isinstance(exc.__cause__.__cause__.cause, - RayXGBoostTrainingError)) + isinstance(exc.__cause__.__cause__.cause, RayXGBoostTrainingError) + ) - self.assertIn("label and prediction size not match", - str(exc.__cause__.__cause__)) + self.assertIn( + "label and prediction size not match", str(exc.__cause__.__cause__) + ) def testKwargsValidation(self): x = np.random.uniform(0, 1, size=(100, 4)) @@ -357,22 +363,42 @@ def testKwargsValidation(self): { "objective": "multi:softmax", "num_class": 2, - "eval_metric": ["logloss", "error"] + "eval_metric": ["logloss", "error"], }, train_set, evals=[(train_set, "train")], ray_params=RayParams(num_actors=1, max_actor_restarts=0), - totally_invalid_kwarg="") + totally_invalid_kwarg="", + ) def testRanking(self): Xrow = np.array([1, 2, 6, 8, 11, 14, 16, 17]) Xcol = np.array([0, 0, 1, 1, 2, 2, 3, 3]) - X = csr_matrix( - (np.ones(shape=8), (Xrow, Xcol)), shape=(20, 4)).toarray() - y = np.array([ - 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, - 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0 - ]) + X = csr_matrix((np.ones(shape=8), (Xrow, Xcol)), shape=(20, 4)).toarray() + y = np.array( + [ + 0.0, + 1.0, + 1.0, + 0.0, + 0.0, + 0.0, + 1.0, + 0.0, + 1.0, + 0.0, + 0.0, + 1.0, + 0.0, + 0.0, + 1.0, + 0.0, + 1.0, + 1.0, + 0.0, + 0.0, + ] + ) qid = np.array([0] * 5 + [1] * 5 + [2] * 5 + [3] * 5) dtrain = RayDMatrix(X, label=y, qid=qid) @@ -381,7 +407,7 @@ def testRanking(self): "eta": 1, "objective": "rank:pairwise", "eval_metric": ["auc", "aucpr"], - "max_depth": 1 + "max_depth": 1, } evals_result = {} train( @@ -390,14 +416,16 @@ def testRanking(self): 10, evals=[(dtrain, "train")], evals_result=evals_result, - ray_params=RayParams(num_actors=2, max_actor_restarts=0)) + ray_params=RayParams(num_actors=2, max_actor_restarts=0), + ) auc_rec = evals_result["train"]["auc"] self.assertTrue(all(p <= q for p, q in zip(auc_rec, auc_rec[1:]))) auc_rec = evals_result["train"]["aucpr"] self.assertTrue((p <= q for p, q in zip(auc_rec, auc_rec[1:]))) - @unittest.skipIf(xgb.__version__ < "1.3.0", - f"not supported in xgb version {xgb.__version__}") + @unittest.skipIf( + xgb.__version__ < "1.3.0", f"not supported in xgb version {xgb.__version__}" + ) def testFeatureWeightsParam(self): """Test the feature_weights parameter for xgb version >= 1.3.0. 
Adapted from the official demo codes: @@ -411,7 +439,7 @@ def testFeatureWeightsParam(self): X = rng.randn(kRows, kCols) y = rng.randn(kRows) - fw = np.ones(shape=(kCols, )) + fw = np.ones(shape=(kCols,)) for i in range(kCols): fw[i] *= float(i) train_set = RayDMatrix(X, y, feature_weights=fw) @@ -429,8 +457,9 @@ def testFeatureWeightsParam(self): evals=[(train_set, "train")], verbose_eval=False, ray_params=RayParams( - num_actors=2, # Number of remote actors - cpus_per_actor=1)) + num_actors=2, cpus_per_actor=1 # Number of remote actors + ), + ) feature_map = bst.get_fscore() # feature zero has 0 weight @@ -439,6 +468,8 @@ def testFeatureWeightsParam(self): if __name__ == "__main__": - import pytest import sys + + import pytest + sys.exit(pytest.main(["-v", __file__])) diff --git a/xgboost_ray/tests/test_fault_tolerance.py b/xgboost_ray/tests/test_fault_tolerance.py index 0bcc9934..94bb0a54 100644 --- a/xgboost_ray/tests/test_fault_tolerance.py +++ b/xgboost_ray/tests/test_fault_tolerance.py @@ -3,20 +3,28 @@ import shutil import tempfile import time -from unittest.mock import patch, DEFAULT, MagicMock - -import numpy as np import unittest -import xgboost as xgb +from unittest.mock import DEFAULT, MagicMock, patch +import numpy as np import ray +import xgboost as xgb -from xgboost_ray import train, RayDMatrix, RayParams +from xgboost_ray import RayDMatrix, RayParams, train from xgboost_ray.main import RayXGBoostActorAvailable -from xgboost_ray.tests.fault_tolerance import FaultToleranceManager, \ - DelayedLoadingCallback, DieCallback -from xgboost_ray.tests.utils import flatten_obj, _checkpoint_callback, \ - _fail_callback, tree_obj, _kill_callback, get_num_trees +from xgboost_ray.tests.fault_tolerance import ( + DelayedLoadingCallback, + DieCallback, + FaultToleranceManager, +) +from xgboost_ray.tests.utils import ( + _checkpoint_callback, + _fail_callback, + _kill_callback, + flatten_obj, + get_num_trees, + tree_obj, +) class _FakeTask(MagicMock): @@ -37,12 +45,15 @@ def setUp(self): os.environ["RXGB_ELASTIC_RESTART_DISABLED"] = "0" repeat = 8 # Repeat data a couple of times for stability - self.x = np.array([ - [1, 0, 0, 0], # Feature 0 -> Label 0 - [0, 1, 0, 0], # Feature 1 -> Label 1 - [0, 0, 1, 1], # Feature 2+3 -> Label 2 - [0, 0, 1, 0], # Feature 2+!3 -> Label 3 - ] * repeat) + self.x = np.array( + [ + [1, 0, 0, 0], # Feature 0 -> Label 0 + [0, 1, 0, 0], # Feature 1 -> Label 1 + [0, 0, 1, 1], # Feature 2+3 -> Label 2 + [0, 0, 1, 0], # Feature 2+!3 -> Label 3 + ] + * repeat + ) self.y = np.array([0, 1, 2, 3] * repeat) self.params = { @@ -50,7 +61,7 @@ def setUp(self): "nthread": 1, "max_depth": 2, "objective": "multi:softmax", - "num_class": 4 + "num_class": 4, } self.tmpdir = str(tempfile.mkdtemp()) @@ -93,7 +104,8 @@ def keep(actors, *args, **kwargs): callbacks=[_kill_callback(self.die_lock_file)], num_boost_round=20, ray_params=RayParams(max_actor_restarts=1, num_actors=2), - additional_results=additional_results) + additional_results=additional_results, + ) self.assertEqual(20, get_num_trees(bst)) @@ -134,8 +146,10 @@ def keep(actors, *args, **kwargs): max_actor_restarts=1, num_actors=2, elastic_training=True, - max_failed_actors=1), - additional_results=additional_results) + max_failed_actors=1, + ), + additional_results=additional_results, + ) self.assertEqual(20, get_num_trees(bst)) @@ -159,11 +173,11 @@ def testTrainingContinuationElasticKilledRestarted(self): ft_manager = FaultToleranceManager.remote() ft_manager.schedule_kill.remote(rank=0, boost_round=6) - 
ft_manager.delay_return.remote( - rank=1, start_boost_round=12, end_boost_round=21) + ft_manager.delay_return.remote(rank=1, start_boost_round=12, end_boost_round=21) delay_callback = DelayedLoadingCallback( - ft_manager, reload_data=True, sleep_time=0.1) + ft_manager, reload_data=True, sleep_time=0.1 + ) die_callback = DieCallback(ft_manager, training_delay=0.25) additional_results = {} @@ -185,8 +199,10 @@ def keep(actors, *args, **kwargs): num_actors=2, elastic_training=True, max_failed_actors=1, - distributed_callbacks=[delay_callback]), - additional_results=additional_results) + distributed_callbacks=[delay_callback], + ), + additional_results=additional_results, + ) self.assertEqual(20, get_num_trees(bst)) @@ -216,18 +232,18 @@ def testTrainingContinuationElasticMultiKilled(self): self.params, RayDMatrix(self.x, self.y), callbacks=[ - _kill_callback( - self.die_lock_file, fail_iteration=6, actor_rank=0), - _kill_callback( - self.die_lock_file_2, fail_iteration=14, actor_rank=1), + _kill_callback(self.die_lock_file, fail_iteration=6, actor_rank=0), + _kill_callback(self.die_lock_file_2, fail_iteration=14, actor_rank=1), ], num_boost_round=20, ray_params=RayParams( max_actor_restarts=2, num_actors=2, elastic_training=True, - max_failed_actors=2), - additional_results=additional_results) + max_failed_actors=2, + ), + additional_results=additional_results, + ) self.assertEqual(20, get_num_trees(bst)) @@ -258,8 +274,10 @@ def keep(actors, *args, **kwargs): max_actor_restarts=1, num_actors=2, elastic_training=True, - max_failed_actors=1), - additional_results=additional_results) + max_failed_actors=1, + ), + additional_results=additional_results, + ) self.assertEqual(20, get_num_trees(bst)) @@ -285,7 +303,8 @@ def testTrainingStop(self): RayDMatrix(self.x, self.y), callbacks=[_kill_callback(self.die_lock_file)], num_boost_round=20, - ray_params=RayParams(max_actor_restarts=0, num_actors=2)) + ray_params=RayParams(max_actor_restarts=0, num_actors=2), + ) def testTrainingStopElastic(self): """This should now stop training after one actor died.""" @@ -296,11 +315,11 @@ def testTrainingStopElastic(self): ft_manager.schedule_kill.remote(rank=0, boost_round=3) ft_manager.schedule_kill.remote(rank=1, boost_round=6) - ft_manager.delay_return.remote( - rank=0, start_boost_round=4, end_boost_round=5) + ft_manager.delay_return.remote(rank=0, start_boost_round=4, end_boost_round=5) delay_callback = DelayedLoadingCallback( - ft_manager, reload_data=True, sleep_time=0.1) + ft_manager, reload_data=True, sleep_time=0.1 + ) die_callback = DieCallback(ft_manager, training_delay=0.25) with self.assertRaises(RuntimeError): @@ -314,7 +333,9 @@ def testTrainingStopElastic(self): max_failed_actors=1, max_actor_restarts=1, num_actors=2, - distributed_callbacks=[delay_callback])) + distributed_callbacks=[delay_callback], + ), + ) def testCheckpointContinuationValidity(self): """Test that checkpoints are stored and loaded correctly""" @@ -324,12 +345,11 @@ def testCheckpointContinuationValidity(self): bst_1 = train( self.params, RayDMatrix(self.x, self.y), - callbacks=[ - _checkpoint_callback(frequency=1, before_iteration_=False) - ], + callbacks=[_checkpoint_callback(frequency=1, before_iteration_=False)], num_boost_round=2, ray_params=RayParams(num_actors=2), - additional_results=res_1) + additional_results=res_1, + ) last_checkpoint_1 = res_1["callback_returns"][0][-1] last_checkpoint_other_rank_1 = res_1["callback_returns"][1][-1] @@ -347,12 +367,13 @@ def testCheckpointContinuationValidity(self): 
RayDMatrix(self.x, self.y), callbacks=[ _checkpoint_callback(frequency=1, before_iteration_=True), - _checkpoint_callback(frequency=1, before_iteration_=False) + _checkpoint_callback(frequency=1, before_iteration_=False), ], num_boost_round=4, ray_params=RayParams(num_actors=2), additional_results=res_2, - xgb_model=lc1) + xgb_model=lc1, + ) first_checkpoint_2 = res_2["callback_returns"][0][0] first_checkpoint_other_actor_2 = res_2["callback_returns"][1][0] last_checkpoint_2 = res_2["callback_returns"][0][-1] @@ -384,20 +405,23 @@ def testSameResultWithAndWithoutError(self): self.params, RayDMatrix(self.x, self.y), num_boost_round=10, - ray_params=RayParams(max_actor_restarts=0, num_actors=2)) + ray_params=RayParams(max_actor_restarts=0, num_actors=2), + ) bst_2part_1 = train( self.params, RayDMatrix(self.x, self.y), num_boost_round=5, - ray_params=RayParams(max_actor_restarts=0, num_actors=2)) + ray_params=RayParams(max_actor_restarts=0, num_actors=2), + ) bst_2part_2 = train( self.params, RayDMatrix(self.x, self.y), num_boost_round=5, ray_params=RayParams(max_actor_restarts=0, num_actors=2), - xgb_model=bst_2part_1) + xgb_model=bst_2part_1, + ) res_error = {} bst_error = train( @@ -406,8 +430,10 @@ def testSameResultWithAndWithoutError(self): callbacks=[_fail_callback(self.die_lock_file, fail_iteration=7)], num_boost_round=10, ray_params=RayParams( - max_actor_restarts=1, num_actors=2, checkpoint_frequency=5), - additional_results=res_error) + max_actor_restarts=1, num_actors=2, checkpoint_frequency=5 + ), + additional_results=res_error, + ) flat_noerror = flatten_obj({"tree": tree_obj(bst_noerror)}) flat_error = flatten_obj({"tree": tree_obj(bst_error)}) @@ -436,19 +462,24 @@ def testMaybeScheduleNewActors(self): after each call. """ + from xgboost_ray.elastic import ( + _maybe_schedule_new_actors, + _update_scheduled_actor_states, + ) from xgboost_ray.main import _TrainingState - from xgboost_ray.elastic import _update_scheduled_actor_states - from xgboost_ray.elastic import _maybe_schedule_new_actors os.environ["RXGB_ELASTIC_RESTART_GRACE_PERIOD_S"] = "30" # Three actors are dead actors = [ - MagicMock(), None, MagicMock(), - MagicMock(), None, - MagicMock(), None, - MagicMock() + MagicMock(), + None, + MagicMock(), + MagicMock(), + None, + MagicMock(), + None, + MagicMock(), ] # Mock training state @@ -480,7 +511,9 @@ def fake_create_actor(rank, *args, **kwargs): num_actors=8, elastic_training=True, max_failed_actors=1, - max_actor_restarts=2)) + max_actor_restarts=2, + ), + ) # 3 new actors should have been created self.assertEqual(len(created_actors), 3) @@ -498,7 +531,9 @@ def fake_create_actor(rank, *args, **kwargs): num_actors=8, elastic_training=True, max_failed_actors=1, - max_actor_restarts=2)) + max_actor_restarts=2, + ), + ) self.assertEqual(len(created_actors), 3) self.assertEqual(len(state.pending_actors), 3) @@ -528,8 +563,7 @@ def fake_create_actor(rank, *args, **kwargs): # Grace period is set through ENV.ELASTIC_RESTART_GRACE_PERIOD_S # Allow for some slack in test execution - self.assertGreaterEqual(state.restart_training_at, - time.time() + 22) + self.assertGreaterEqual(state.restart_training_at, time.time() + 22) # The first actor should have been promoted to full actor self.assertTrue(actors[1]) @@ -554,11 +588,11 @@ def testFaultToleranceManager(self): ft_manager = FaultToleranceManager.remote() ft_manager.schedule_kill.remote(rank=1, boost_round=16) - ft_manager.delay_return.remote( - rank=1, start_boost_round=14, end_boost_round=68) + ft_manager.delay_return.remote(rank=1,
start_boost_round=14, end_boost_round=68) delay_callback = DelayedLoadingCallback( - ft_manager, reload_data=True, sleep_time=0.1) + ft_manager, reload_data=True, sleep_time=0.1 + ) die_callback = DieCallback(ft_manager, training_delay=0.25) res_1 = {} @@ -573,8 +607,10 @@ def testFaultToleranceManager(self): elastic_training=True, max_failed_actors=1, max_actor_restarts=1, - distributed_callbacks=[delay_callback]), - additional_results=res_1) + distributed_callbacks=[delay_callback], + ), + additional_results=res_1, + ) logs = ray.get(ft_manager.get_logs.remote()) @@ -597,6 +633,8 @@ def testFaultToleranceManager(self): if __name__ == "__main__": - import pytest import sys + + import pytest + sys.exit(pytest.main(["-v", __file__])) diff --git a/xgboost_ray/tests/test_matrix.py b/xgboost_ray/tests/test_matrix.py index 014f776a..6c764492 100644 --- a/xgboost_ray/tests/test_matrix.py +++ b/xgboost_ray/tests/test_matrix.py @@ -2,12 +2,12 @@ import os import tempfile import unittest -import xgboost as xgb import numpy as np import pandas as pd - import ray +import xgboost as xgb + try: import ray.data as ray_data except (ImportError, ModuleNotFoundError): @@ -15,8 +15,7 @@ ray_data = None from xgboost_ray import RayDMatrix -from xgboost_ray.matrix import (concat_dataframes, RayShardingMode, - _get_sharding_indices) +from xgboost_ray.matrix import RayShardingMode, _get_sharding_indices, concat_dataframes class XGBoostRayDMatrixTest(unittest.TestCase): @@ -24,12 +23,15 @@ class XGBoostRayDMatrixTest(unittest.TestCase): def setUp(self): repeat = 8 # Repeat data a couple of times for stability - self.x = np.array([ - [1, 0, 0, 0], # Feature 0 -> Label 0 - [0, 1, 0, 0], # Feature 1 -> Label 1 - [0, 0, 1, 1], # Feature 2+3 -> Label 2 - [0, 0, 1, 0], # Feature 2+!3 -> Label 3 - ] * repeat) + self.x = np.array( + [ + [1, 0, 0, 0], # Feature 0 -> Label 0 + [0, 1, 0, 0], # Feature 1 -> Label 1 + [0, 0, 1, 1], # Feature 2+3 -> Label 2 + [0, 0, 1, 0], # Feature 2+!3 -> Label 3 + ] + * repeat + ) self.y = np.array([0, 1, 2, 3] * repeat) @classmethod @@ -117,6 +119,7 @@ def testFromPandasDfString(self): def testFromModinDfDf(self): from xgboost_ray.data_sources.modin import MODIN_INSTALLED + if not MODIN_INSTALLED: self.skipTest("Modin not installed.") return @@ -129,6 +132,7 @@ def testFromModinDfDf(self): def testFromModinDfSeries(self): from xgboost_ray.data_sources.modin import MODIN_INSTALLED + if not MODIN_INSTALLED: self.skipTest("Modin not installed.") return @@ -141,6 +145,7 @@ def testFromModinDfSeries(self): def testFromModinDfString(self): from xgboost_ray.data_sources.modin import MODIN_INSTALLED + if not MODIN_INSTALLED: self.skipTest("Modin not installed.") return @@ -154,6 +159,7 @@ def testFromModinDfString(self): def testFromDaskDfSeries(self): from xgboost_ray.data_sources.dask import DASK_INSTALLED + if not DASK_INSTALLED: self.skipTest("Dask not installed.") return @@ -167,12 +173,13 @@ def testFromDaskDfSeries(self): def testFromDaskDfArray(self): from xgboost_ray.data_sources.dask import DASK_INSTALLED + if not DASK_INSTALLED: self.skipTest("Dask not installed.") return - import dask.dataframe as dd import dask.array as da + import dask.dataframe as dd in_x = dd.from_array(self.x) in_y = da.from_array(self.y) @@ -181,6 +188,7 @@ def testFromDaskDfArray(self): def testFromDaskDfString(self): from xgboost_ray.data_sources.dask import DASK_INSTALLED + if not DASK_INSTALLED: self.skipTest("Dask not installed.") return @@ -207,10 +215,8 @@ def testFromPetastormParquetString(self): 
data_df["label"] = pd.Series(self.y) data_df.to_parquet(data_file) - self._testMatrixCreation( - f"file://{data_file}", "label", distributed=False) - self._testMatrixCreation( - f"file://{data_file}", "label", distributed=True) + self._testMatrixCreation(f"file://{data_file}", "label", distributed=False) + self._testMatrixCreation(f"file://{data_file}", "label", distributed=True) def testFromPetastormMultiParquetString(self): with tempfile.TemporaryDirectory() as dir: @@ -220,8 +226,8 @@ def testFromPetastormMultiParquetString(self): data_df = pd.DataFrame(self.x, columns=["a", "b", "c", "d"]) data_df["label"] = pd.Series(self.y) - df_1 = data_df[0:len(data_df) // 2] - df_2 = data_df[len(data_df) // 2:] + df_1 = data_df[0 : len(data_df) // 2] + df_2 = data_df[len(data_df) // 2 :] df_1.to_parquet(data_file_1) df_2.to_parquet(data_file_2) @@ -229,11 +235,13 @@ def testFromPetastormMultiParquetString(self): self._testMatrixCreation( [f"file://{data_file_1}", f"file://{data_file_2}"], "label", - distributed=False) + distributed=False, + ) self._testMatrixCreation( [f"file://{data_file_1}", f"file://{data_file_2}"], "label", - distributed=True) + distributed=True, + ) def testFromCSVString(self): with tempfile.TemporaryDirectory() as dir: @@ -255,16 +263,18 @@ def testFromMultiCSVString(self): data_df = pd.DataFrame(self.x, columns=["a", "b", "c", "d"]) data_df["label"] = pd.Series(self.y) - df_1 = data_df[0:len(data_df) // 2] - df_2 = data_df[len(data_df) // 2:] + df_1 = data_df[0 : len(data_df) // 2] + df_2 = data_df[len(data_df) // 2 :] df_1.to_csv(data_file_1, header=True, index=False) df_2.to_csv(data_file_2, header=True, index=False) self._testMatrixCreation( - [data_file_1, data_file_2], "label", distributed=False) + [data_file_1, data_file_2], "label", distributed=False + ) self._testMatrixCreation( - [data_file_1, data_file_2], "label", distributed=True) + [data_file_1, data_file_2], "label", distributed=True + ) def testFromParquetString(self): with tempfile.TemporaryDirectory() as dir: @@ -285,16 +295,18 @@ def testFromMultiParquetString(self): data_df = pd.DataFrame(self.x, columns=["a", "b", "c", "d"]) data_df["label"] = pd.Series(self.y) - df_1 = data_df[0:len(data_df) // 2] - df_2 = data_df[len(data_df) // 2:] + df_1 = data_df[0 : len(data_df) // 2] + df_2 = data_df[len(data_df) // 2 :] df_1.to_parquet(data_file_1) df_2.to_parquet(data_file_2) self._testMatrixCreation( - [data_file_1, data_file_2], "label", distributed=False) + [data_file_1, data_file_2], "label", distributed=False + ) self._testMatrixCreation( - [data_file_1, data_file_2], "label", distributed=True) + [data_file_1, data_file_2], "label", distributed=True + ) def testDetectDistributed(self): with tempfile.TemporaryDirectory() as dir: @@ -341,8 +353,7 @@ def testTooManyActorsCentral(self): def testBatchShardingAllActorsGetIndices(self): """Check if all actors get indices with batch mode""" for i in range(16): - self.assertTrue( - _get_sharding_indices(RayShardingMode.BATCH, i, 16, 100)) + self.assertTrue(_get_sharding_indices(RayShardingMode.BATCH, i, 16, 100)) def testLegacyParams(self): """Test if all params can be set regardless of xgb version""" @@ -359,17 +370,20 @@ def testLegacyParams(self): weight=weight, base_margin=base_margin, label_lower_bound=label_lower_bound, - label_upper_bound=label_upper_bound) + label_upper_bound=label_upper_bound, + ) self._testMatrixCreation( in_x, in_y, qid=qid, base_margin=base_margin, label_lower_bound=label_lower_bound, - label_upper_bound=label_upper_bound) + 
label_upper_bound=label_upper_bound, + ) - @unittest.skipIf(xgb.__version__ < "1.3.0", - f"not supported in xgb version {xgb.__version__}") + @unittest.skipIf( + xgb.__version__ < "1.3.0", f"not supported in xgb version {xgb.__version__}" + ) def testFeatureWeightsParam(self): """Test the feature_weights parameter for xgb version >= 1.3.0""" in_x = self.x @@ -377,8 +391,10 @@ def testFeatureWeightsParam(self): feature_weights = np.arange(len(in_y)) self._testMatrixCreation(in_x, in_y, feature_weights=feature_weights) - @unittest.skipIf("qid" not in inspect.signature(xgb.DMatrix).parameters, - f"not supported in xgb version {xgb.__version__}") + @unittest.skipIf( + "qid" not in inspect.signature(xgb.DMatrix).parameters, + f"not supported in xgb version {xgb.__version__}", + ) def testQidSortedBehaviorXGBoost(self): """Test that data with unsorted qid is sorted in RayDMatrix""" in_x = self.x @@ -386,22 +402,24 @@ def testQidSortedBehaviorXGBoost(self): unsorted_qid = np.array([1, 2] * 16) from xgboost import DMatrix + with self.assertRaises(ValueError): DMatrix(**{"data": in_x, "label": in_y, "qid": unsorted_qid}) - DMatrix(**{ - "data": in_x, - "label": in_y, - "qid": np.sort(unsorted_qid) - }) # no exception + DMatrix( + **{"data": in_x, "label": in_y, "qid": np.sort(unsorted_qid)} + ) # no exception # test RayDMatrix handles sorting automatically mat = RayDMatrix(in_x, in_y, qid=unsorted_qid) params = mat.get_data(rank=0, num_actors=1) DMatrix(**params) - @unittest.skipIf("qid" not in inspect.signature(xgb.DMatrix).parameters, - f"not supported in xgb version {xgb.__version__}") + @unittest.skipIf( + "qid" not in inspect.signature(xgb.DMatrix).parameters, + f"not supported in xgb version {xgb.__version__}", + ) def testQidSortedParquet(self): from xgboost import DMatrix + with tempfile.TemporaryDirectory() as dir: parquet_file1 = os.path.join(dir, "file1.parquet") parquet_file2 = os.path.join(dir, "file2.parquet") @@ -423,12 +441,15 @@ def testQidSortedParquet(self): [parquet_file1, parquet_file2], columns=["a", "b", "c", "d", "label", "group"], label="label", - qid="group") + qid="group", + ) params = mat.get_data(rank=0, num_actors=1) DMatrix(**params) if __name__ == "__main__": - import pytest import sys + + import pytest + sys.exit(pytest.main(["-v", __file__])) diff --git a/xgboost_ray/tests/test_sklearn.py b/xgboost_ray/tests/test_sklearn.py index c87390c5..8418a61e 100644 --- a/xgboost_ray/tests/test_sklearn.py +++ b/xgboost_ray/tests/test_sklearn.py @@ -23,29 +23,31 @@ # License: # https://github.com/dmlc/xgboost/blob/a5c852660b1056204aa2e0cbfcd5b4ecfbf31adf/LICENSE -# import collections -# import importlib.util -import numpy as np -import xgboost as xgb -import unittest -from packaging.version import Version +import json +import os +import shutil # import io # from contextlib import redirect_stdout, redirect_stderr import tempfile -import os -import shutil -import json +import unittest +# import collections +# import importlib.util +import numpy as np import ray +import xgboost as xgb +from packaging.version import Version -from xgboost_ray.sklearn import (RayXGBClassifier, RayXGBRegressor, - RayXGBRFClassifier, RayXGBRFRegressor, - RayXGBRanker) - -from xgboost_ray.main import (XGBOOST_VERSION, RayDMatrix, RayParams, train, - predict) +from xgboost_ray.main import XGBOOST_VERSION, RayDMatrix, RayParams, predict, train from xgboost_ray.matrix import RayShardingMode +from xgboost_ray.sklearn import ( + RayXGBClassifier, + RayXGBRanker, + RayXGBRegressor, + RayXGBRFClassifier, + 
RayXGBRFRegressor, +) def softmax(x): @@ -79,8 +81,10 @@ def objective(labels, predt): def get_basescore(model: xgb.XGBModel) -> float: """Get base score from an XGBoost sklearn estimator.""" base_score = float( - json.loads(model.get_booster().save_config())["learner"][ - "learner_model_param"]["base_score"]) + json.loads(model.get_booster().save_config())["learner"]["learner_model_param"][ + "base_score" + ] + ) return base_score @@ -128,10 +132,12 @@ def run_binary_classification(self, cls, ray_dmatrix_params=None): ray_dmatrix_params=ray_dmatrix_params, ) preds = xgb_model.predict( - X[test_index], ray_dmatrix_params=ray_dmatrix_params) + X[test_index], ray_dmatrix_params=ray_dmatrix_params + ) labels = y[test_index] - err = sum(1 for i in range(len(preds)) - if int(preds[i] > 0.5) != labels[i]) / float(len(preds)) + err = sum( + 1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i] + ) / float(len(preds)) assert err < 0.1 def test_binary_classification(self): @@ -139,12 +145,14 @@ def test_binary_classification(self): def test_binary_classification_dmatrix_params(self): self.run_binary_classification( - RayXGBClassifier, - ray_dmatrix_params={"sharding": RayShardingMode.BATCH}) + RayXGBClassifier, ray_dmatrix_params={"sharding": RayShardingMode.BATCH} + ) # ray: added for legacy CI test - @unittest.skipIf(XGBOOST_VERSION < Version("1.0.0"), - f"not supported in xgb version {xgb.__version__}") + @unittest.skipIf( + XGBOOST_VERSION < Version("1.0.0"), + f"not supported in xgb version {xgb.__version__}", + ) def test_binary_rf_classification(self): self.run_binary_classification(RayXGBRFClassifier) @@ -156,12 +164,13 @@ def test_multiclass_classification(self): def check_pred(preds, labels, output_margin): if output_margin: - err = sum(1 for i in range(len(preds)) - if preds[i].argmax() != labels[i]) / float( - len(preds)) + err = sum( + 1 for i in range(len(preds)) if preds[i].argmax() != labels[i] + ) / float(len(preds)) else: - err = sum(1 for i in range(len(preds)) - if preds[i] != labels[i]) / float(len(preds)) + err = sum( + 1 for i in range(len(preds)) if preds[i] != labels[i] + ) / float(len(preds)) assert err < 0.4 iris = load_iris() @@ -171,16 +180,17 @@ def check_pred(preds, labels, output_margin): for train_index, test_index in kf.split(X, y): xgb_model = RayXGBClassifier().fit(X[train_index], y[train_index]) if hasattr(xgb_model.get_booster(), "num_boosted_rounds"): - assert (xgb_model.get_booster().num_boosted_rounds() == - xgb_model.n_estimators) + assert ( + xgb_model.get_booster().num_boosted_rounds() + == xgb_model.n_estimators + ) preds = xgb_model.predict(X[test_index]) # test other params in XGBClassifier().fit - preds2 = xgb_model.predict( - X[test_index], output_margin=True, ntree_limit=3) - preds3 = xgb_model.predict( - X[test_index], output_margin=True, ntree_limit=0) + preds2 = xgb_model.predict(X[test_index], output_margin=True, ntree_limit=3) + preds3 = xgb_model.predict(X[test_index], output_margin=True, ntree_limit=0) preds4 = xgb_model.predict( - X[test_index], output_margin=False, ntree_limit=3) + X[test_index], output_margin=False, ntree_limit=3 + ) labels = y[test_index] check_pred(preds, labels, output_margin=False) @@ -196,14 +206,15 @@ def check_pred(preds, labels, output_margin): # custom objective, the default is multi:softprob # so no transformation is required. 
- cls = RayXGBClassifier( - n_estimators=4, objective=softprob_obj(3)).fit(X, y) + cls = RayXGBClassifier(n_estimators=4, objective=softprob_obj(3)).fit(X, y) proba = cls.predict_proba(X) assert proba.shape[0] == X.shape[0] assert proba.shape[1] == cls.n_classes_ - @unittest.skipIf(XGBOOST_VERSION < Version("1.4.0"), - f"not supported in xgb version {xgb.__version__}") + @unittest.skipIf( + XGBOOST_VERSION < Version("1.4.0"), + f"not supported in xgb version {xgb.__version__}", + ) def test_best_ntree_limit(self): self._init_ray() @@ -214,9 +225,8 @@ def test_best_ntree_limit(self): def train(booster, forest): rounds = 4 cls = RayXGBClassifier( - n_estimators=rounds, num_parallel_tree=forest, - booster=booster).fit( - X, y, eval_set=[(X, y)], early_stopping_rounds=3) + n_estimators=rounds, num_parallel_tree=forest, booster=booster + ).fit(X, y, eval_set=[(X, y)], early_stopping_rounds=3) if forest: assert cls.best_ntree_limit == rounds * forest @@ -236,11 +246,10 @@ def train(booster, forest): def test_stacking_regression(self): self._init_ray() - from sklearn.model_selection import train_test_split from sklearn.datasets import load_diabetes + from sklearn.ensemble import RandomForestRegressor, StackingRegressor from sklearn.linear_model import RidgeCV - from sklearn.ensemble import RandomForestRegressor - from sklearn.ensemble import StackingRegressor + from sklearn.model_selection import train_test_split X, y = load_diabetes(return_X_y=True) estimators = [ @@ -249,24 +258,22 @@ def test_stacking_regression(self): ] reg = StackingRegressor( estimators=estimators, - final_estimator=RandomForestRegressor( - n_estimators=10, random_state=42), + final_estimator=RandomForestRegressor(n_estimators=10, random_state=42), ) - X_train, X_test, y_train, y_test = train_test_split( - X, y, random_state=42) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) reg.fit(X_train, y_train).score(X_test, y_test) def test_stacking_classification(self): self._init_ray() - from sklearn.model_selection import train_test_split from sklearn.datasets import load_iris - from sklearn.svm import LinearSVC + from sklearn.ensemble import StackingClassifier from sklearn.linear_model import LogisticRegression - from sklearn.preprocessing import StandardScaler + from sklearn.model_selection import train_test_split from sklearn.pipeline import make_pipeline - from sklearn.ensemble import StackingClassifier + from sklearn.preprocessing import StandardScaler + from sklearn.svm import LinearSVC X, y = load_iris(return_X_y=True) estimators = [ @@ -277,10 +284,10 @@ def test_stacking_classification(self): ), ] clf = StackingClassifier( - estimators=estimators, final_estimator=LogisticRegression()) + estimators=estimators, final_estimator=LogisticRegression() + ) - X_train, X_test, y_train, y_test = train_test_split( - X, y, random_state=42) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) clf.fit(X_train, y_train).score(X_test, y_test) # exact tree method doesn't support distributed training @@ -308,8 +315,7 @@ def test_num_parallel_tree(self): from sklearn.datasets import fetch_california_housing - reg = RayXGBRegressor( - n_estimators=4, num_parallel_tree=4, tree_method="hist") + reg = RayXGBRegressor(n_estimators=4, num_parallel_tree=4, tree_method="hist") ds = fetch_california_housing() bst = reg.fit(X=ds["data"], y=ds["target"]) dump = bst.get_booster().get_dump(dump_format="json") @@ -323,18 +329,30 @@ def test_num_parallel_tree(self): if XGBOOST_VERSION >= 
Version("1.6.0"): config = json.loads(bst.get_booster().save_config()) - assert (int(config["learner"]["gradient_booster"][ - "gbtree_model_param"]["num_parallel_tree"]) == 4) + assert ( + int( + config["learner"]["gradient_booster"]["gbtree_model_param"][ + "num_parallel_tree" + ] + ) + == 4 + ) else: config = json.loads(bst.get_booster().save_config()) - assert (int(config["learner"]["gradient_booster"][ - "gbtree_train_param"]["num_parallel_tree"]) == 4) + assert ( + int( + config["learner"]["gradient_booster"]["gbtree_train_param"][ + "num_parallel_tree" + ] + ) + == 4 + ) def test_california_housing_regression(self): self._init_ray() - from sklearn.metrics import mean_squared_error from sklearn.datasets import fetch_california_housing + from sklearn.metrics import mean_squared_error from sklearn.model_selection import KFold ds = fetch_california_housing() @@ -346,12 +364,11 @@ def test_california_housing_regression(self): preds = xgb_model.predict(X[test_index]) # test other params in XGBRegressor().fit - preds2 = xgb_model.predict( - X[test_index], output_margin=True, ntree_limit=3) - preds3 = xgb_model.predict( - X[test_index], output_margin=True, ntree_limit=0) + preds2 = xgb_model.predict(X[test_index], output_margin=True, ntree_limit=3) + preds3 = xgb_model.predict(X[test_index], output_margin=True, ntree_limit=0) preds4 = xgb_model.predict( - X[test_index], output_margin=False, ntree_limit=3) + X[test_index], output_margin=False, ntree_limit=3 + ) labels = y[test_index] assert mean_squared_error(preds, labels) < 25 @@ -359,19 +376,21 @@ def test_california_housing_regression(self): assert mean_squared_error(preds3, labels) < 25 assert mean_squared_error(preds4, labels) < 350 - @unittest.skipIf(XGBOOST_VERSION < Version("1.0.0"), - f"not supported in xgb version {xgb.__version__}") + @unittest.skipIf( + XGBOOST_VERSION < Version("1.0.0"), + f"not supported in xgb version {xgb.__version__}", + ) def run_california_housing_rf_regression(self, tree_method): - from sklearn.metrics import mean_squared_error from sklearn.datasets import fetch_california_housing + from sklearn.metrics import mean_squared_error from sklearn.model_selection import KFold X, y = fetch_california_housing(return_X_y=True) kf = KFold(n_splits=2, shuffle=True, random_state=self.rng) for train_index, test_index in kf.split(X, y): - xgb_model = RayXGBRFRegressor( - random_state=42, tree_method=tree_method).fit( - X[train_index], y[train_index]) + xgb_model = RayXGBRFRegressor(random_state=42, tree_method=tree_method).fit( + X[train_index], y[train_index] + ) preds = xgb_model.predict(X[test_index]) labels = y[test_index] assert mean_squared_error(preds, labels) < 35 @@ -384,8 +403,8 @@ def test_california_housing_rf_regression(self): def test_parameter_tuning(self): self._init_ray() - from sklearn.model_selection import GridSearchCV from sklearn.datasets import fetch_california_housing + from sklearn.model_selection import GridSearchCV ds = fetch_california_housing() y = ds["target"] @@ -393,10 +412,7 @@ def test_parameter_tuning(self): xgb_model = RayXGBRegressor(learning_rate=0.1) clf = GridSearchCV( xgb_model, - { - "max_depth": [2, 4, 6], - "n_estimators": [50, 100, 200] - }, + {"max_depth": [2, 4, 6], "n_estimators": [50, 100, 200]}, cv=3, verbose=1, ) @@ -407,8 +423,8 @@ def test_parameter_tuning(self): def test_regression_with_custom_objective(self): self._init_ray() - from sklearn.metrics import mean_squared_error from sklearn.datasets import fetch_california_housing + from sklearn.metrics import 
mean_squared_error from sklearn.model_selection import KFold def objective_ls(y_true, y_pred): @@ -422,7 +438,8 @@ def objective_ls(y_true, y_pred): kf = KFold(n_splits=2, shuffle=True, random_state=self.rng) for train_index, test_index in kf.split(X, y): xgb_model = RayXGBRegressor(objective=objective_ls).fit( - X[train_index], y[train_index]) + X[train_index], y[train_index] + ) preds = xgb_model.predict(X[test_index]) labels = y[test_index] assert mean_squared_error(preds, labels) < 25 @@ -460,8 +477,9 @@ def logregobj(y_true, y_pred): xgb_model.fit(X[train_index], y[train_index]) preds = xgb_model.predict(X[test_index]) labels = y[test_index] - err = sum(1 for i in range(len(preds)) - if int(preds[i] > 0.5) != labels[i]) / float(len(preds)) + err = sum( + 1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i] + ) / float(len(preds)) assert err < 0.1 # Test that the custom objective function is actually used @@ -499,16 +517,17 @@ def test_sklearn_api(self): iris = load_iris() tr_d, te_d, tr_l, te_l = train_test_split( - iris.data, iris.target, train_size=120, test_size=0.2) + iris.data, iris.target, train_size=120, test_size=0.2 + ) classifier = RayXGBClassifier( - booster="gbtree", n_estimators=10, random_state=self.seed) + booster="gbtree", n_estimators=10, random_state=self.seed + ) classifier.fit(tr_d, tr_l) preds = classifier.predict(te_d) labels = te_l - err = (sum([1 for p, l in zip(preds, labels) - if p != l]) * 1.0 / len(te_l)) + err = sum([1 for p, l in zip(preds, labels) if p != l]) * 1.0 / len(te_l) assert err < 0.2 def test_sklearn_api_gblinear(self): @@ -519,20 +538,23 @@ def test_sklearn_api_gblinear(self): iris = load_iris() tr_d, te_d, tr_l, te_l = train_test_split( - iris.data, iris.target, train_size=120) + iris.data, iris.target, train_size=120 + ) classifier = RayXGBClassifier( - booster="gblinear", n_estimators=100, random_state=self.seed) + booster="gblinear", n_estimators=100, random_state=self.seed + ) classifier.fit(tr_d, tr_l) preds = classifier.predict(te_d) labels = te_l - err = (sum([1 for p, l in zip(preds, labels) - if p != l]) * 1.0 / len(te_l)) + err = sum([1 for p, l in zip(preds, labels) if p != l]) * 1.0 / len(te_l) assert err < 0.5 - @unittest.skipIf(XGBOOST_VERSION < Version("1.0.0"), - f"not supported in xgb version {xgb.__version__}") + @unittest.skipIf( + XGBOOST_VERSION < Version("1.0.0"), + f"not supported in xgb version {xgb.__version__}", + ) def test_sklearn_random_state(self): self._init_ray() @@ -546,8 +568,10 @@ def test_sklearn_random_state(self): clf = RayXGBClassifier(random_state=random_state) assert isinstance(clf.get_xgb_params()["random_state"], int) - @unittest.skipIf(XGBOOST_VERSION < Version("1.0.0"), - f"not supported in xgb version {xgb.__version__}") + @unittest.skipIf( + XGBOOST_VERSION < Version("1.0.0"), + f"not supported in xgb version {xgb.__version__}", + ) def test_sklearn_n_jobs(self): self._init_ray() @@ -557,8 +581,10 @@ def test_sklearn_n_jobs(self): clf = RayXGBClassifier(n_jobs=2) assert clf.get_xgb_params()["n_jobs"] == 2 - @unittest.skipIf(XGBOOST_VERSION < Version("1.3.0"), - f"not supported in xgb version {xgb.__version__}") + @unittest.skipIf( + XGBOOST_VERSION < Version("1.3.0"), + f"not supported in xgb version {xgb.__version__}", + ) def test_parameters_access(self): self._init_ray() @@ -596,8 +622,8 @@ def test_kwargs_error(self): def test_kwargs_grid_search(self): self._init_ray() - from sklearn.model_selection import GridSearchCV from sklearn import datasets + from sklearn.model_selection 
import GridSearchCV params = {"tree_method": "hist"} clf = RayXGBClassifier(n_estimators=1, learning_rate=1.0, **params) @@ -623,8 +649,10 @@ def test_sklearn_clone(self): clf.n_jobs = -1 clone(clf) - @unittest.skipIf(XGBOOST_VERSION < Version("1.0.0"), - f"not supported in xgb version {xgb.__version__}") + @unittest.skipIf( + XGBOOST_VERSION < Version("1.0.0"), + f"not supported in xgb version {xgb.__version__}", + ) def test_sklearn_get_default_params(self): self._init_ray() @@ -639,8 +667,10 @@ def test_sklearn_get_default_params(self): base_score = get_basescore(cls) np.testing.assert_equal(base_score, 0.5) - @unittest.skipIf(XGBOOST_VERSION < Version("1.1.0"), - f"not supported in xgb version {xgb.__version__}") + @unittest.skipIf( + XGBOOST_VERSION < Version("1.1.0"), + f"not supported in xgb version {xgb.__version__}", + ) def test_validation_weights_xgbmodel(self): self._init_ray() @@ -674,7 +704,8 @@ def test_validation_weights_xgbmodel(self): # evaluate logloss metric on test set *without* using weights evals_result_without_weights = clf.evals_result() logloss_without_weights = evals_result_without_weights["validation_0"][ - "logloss"] + "logloss" + ] # now use weights for the test set np.random.seed(0) @@ -689,13 +720,13 @@ def test_validation_weights_xgbmodel(self): verbose=False, ) evals_result_with_weights = clf.evals_result() - logloss_with_weights = evals_result_with_weights["validation_0"][ - "logloss"] + logloss_with_weights = evals_result_with_weights["validation_0"]["logloss"] # check that the logloss in the test set is actually different # when using weights than when not using them - assert all((logloss_with_weights[i] != logloss_without_weights[i] - for i in [0, 1])) + assert all( + (logloss_with_weights[i] != logloss_without_weights[i] for i in [0, 1]) + ) with self.assertRaises((ValueError, AssertionError)): # length of eval set and sample weight doesn't match. 
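Aside for reviewers: the validation-weights tests above and below this hunk assert the same invariant after the rewrap, namely that metrics computed with `sample_weight_eval_set` differ from the unweighted run. A standalone sketch of the pattern, illustrative only and not part of this patch (the `validation_0`/`logloss` keys and the `ray.init` guard mirror the tests; the digits dataset and `eval_metric` as a fit argument assume an xgboost 1.x sklearn API):

    import numpy as np
    import ray
    from sklearn.datasets import load_digits
    from xgboost_ray.sklearn import RayXGBClassifier

    if not ray.is_initialized():
        ray.init(num_cpus=4)

    X, y = load_digits(n_class=2, return_X_y=True)
    weights = np.random.RandomState(0).uniform(1, 2, size=len(y))

    clf = RayXGBClassifier(n_estimators=4)
    # The weights only reweight the eval-set metric computation; the
    # booster itself is still fit on unweighted data.
    clf.fit(
        X,
        y,
        eval_set=[(X, y)],
        sample_weight_eval_set=[weights],
        eval_metric="logloss",
        verbose=False,
    )
    logloss_weighted = clf.evals_result()["validation_0"]["logloss"]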
@@ -750,7 +781,8 @@ def test_validation_weights_xgbclassifier(self): # evaluate logloss metric on test set *without* using weights evals_result_without_weights = clf.evals_result() logloss_without_weights = evals_result_without_weights["validation_0"][ - "logloss"] + "logloss" + ] # now use weights for the test set np.random.seed(0) @@ -765,13 +797,13 @@ def test_validation_weights_xgbclassifier(self): verbose=False, ) evals_result_with_weights = clf.evals_result() - logloss_with_weights = evals_result_with_weights["validation_0"][ - "logloss"] + logloss_with_weights = evals_result_with_weights["validation_0"]["logloss"] # check that the logloss in the test set is actually different # when using weights than when not using them - assert all((logloss_with_weights[i] != logloss_without_weights[i] - for i in [0, 1])) + assert all( + (logloss_with_weights[i] != logloss_without_weights[i] for i in [0, 1]) + ) def save_load_model(self, model_path): from sklearn.datasets import load_digits @@ -783,7 +815,8 @@ def save_load_model(self, model_path): kf = KFold(n_splits=2, shuffle=True, random_state=self.rng) for train_index, test_index in kf.split(X, y): xgb_model = RayXGBClassifier(use_label_encoder=False).fit( - X[train_index], y[train_index]) + X[train_index], y[train_index] + ) xgb_model.save_model(model_path) xgb_model = RayXGBClassifier() @@ -795,24 +828,26 @@ def save_load_model(self, model_path): preds = xgb_model.predict(X[test_index]) labels = y[test_index] - err = sum(1 for i in range(len(preds)) - if int(preds[i] > 0.5) != labels[i]) / float(len(preds)) + err = sum( + 1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i] + ) / float(len(preds)) assert err < 0.1 assert xgb_model.get_booster().attr("scikit_learn") is None # test native booster preds = xgb_model.predict(X[test_index], output_margin=True) booster = xgb.Booster(model_file=model_path) - predt_1 = booster.predict( - xgb.DMatrix(X[test_index]), output_margin=True) + predt_1 = booster.predict(xgb.DMatrix(X[test_index]), output_margin=True) assert np.allclose(preds, predt_1) with self.assertRaises(TypeError): xgb_model = xgb.XGBModel() xgb_model.load_model(model_path) - @unittest.skipIf(XGBOOST_VERSION < Version("1.3.0"), - f"not supported in xgb version {xgb.__version__}") + @unittest.skipIf( + XGBOOST_VERSION < Version("1.3.0"), + f"not supported in xgb version {xgb.__version__}", + ) def test_save_load_model(self): self._init_ray() @@ -832,10 +867,7 @@ def test_save_load_model(self): y = digits["target"] X = digits["data"] booster = xgb.train( - { - "tree_method": "hist", - "objective": "binary:logistic" - }, + {"tree_method": "hist", "objective": "binary:logistic"}, dtrain=xgb.DMatrix(X, y), num_boost_round=4, ) @@ -929,13 +961,11 @@ def test_XGBClassifier_resume(self): with TemporaryDirectory() as tempdir: model1_path = os.path.join(tempdir, "test_XGBClassifier.model") - model1_booster_path = os.path.join(tempdir, - "test_XGBClassifier.booster") + model1_booster_path = os.path.join(tempdir, "test_XGBClassifier.booster") X, Y = load_breast_cancer(return_X_y=True) - model1 = RayXGBClassifier( - learning_rate=0.3, random_state=0, n_estimators=8) + model1 = RayXGBClassifier(learning_rate=0.3, random_state=0, n_estimators=8) model1.fit(X, Y) pred1 = model1.predict(X) @@ -943,8 +973,7 @@ def test_XGBClassifier_resume(self): # file name of stored xgb model model1.save_model(model1_path) - model2 = RayXGBClassifier( - learning_rate=0.3, random_state=0, n_estimators=8) + model2 = RayXGBClassifier(learning_rate=0.3, 
random_state=0, n_estimators=8) model2.fit(X, Y, xgb_model=model1_path) pred2 = model2.predict(X) @@ -955,8 +984,7 @@ def test_XGBClassifier_resume(self): # file name of 'Booster' instance Xgb model model1.get_booster().save_model(model1_booster_path) - model2 = RayXGBClassifier( - learning_rate=0.3, random_state=0, n_estimators=8) + model2 = RayXGBClassifier(learning_rate=0.3, random_state=0, n_estimators=8) model2.fit(X, Y, xgb_model=model1_booster_path) pred2 = model2.predict(X) @@ -965,8 +993,10 @@ def test_XGBClassifier_resume(self): assert np.any(pred1 != pred2) assert log_loss1 > log_loss2 - @unittest.skipIf(XGBOOST_VERSION < Version("1.0.0"), - f"not supported in xgb version {xgb.__version__}") + @unittest.skipIf( + XGBOOST_VERSION < Version("1.0.0"), + f"not supported in xgb version {xgb.__version__}", + ) def test_constraint_parameters(self): self._init_ray() @@ -978,13 +1008,19 @@ def test_constraint_parameters(self): config = json.loads(reg.get_booster().save_config()) if XGBOOST_VERSION >= Version("1.6.0"): - assert (config["learner"]["gradient_booster"]["updater"][ - "grow_histmaker"]["train_param"]["interaction_constraints"] == - "[[0, 1], [2, 3, 4]]") + assert ( + config["learner"]["gradient_booster"]["updater"]["grow_histmaker"][ + "train_param" + ]["interaction_constraints"] + == "[[0, 1], [2, 3, 4]]" + ) else: - assert (config["learner"]["gradient_booster"]["updater"]["prune"][ - "train_param"]["interaction_constraints"] == - "[[0, 1], [2, 3, 4]]") + assert ( + config["learner"]["gradient_booster"]["updater"]["prune"][ + "train_param" + ]["interaction_constraints"] + == "[[0, 1], [2, 3, 4]]" + ) # TODO check why this is not working (output is empty, probably due to Ray) # def test_parameter_validation(self): @@ -1079,8 +1115,7 @@ def test_pandas_input(self): train = df.drop(columns=["status"]) model = RayXGBClassifier() model.fit(train, target) - clf_isotonic = CalibratedClassifierCV( - model, cv="prefit", method="isotonic") + clf_isotonic = CalibratedClassifierCV(model, cv="prefit", method="isotonic") clf_isotonic.fit(train, target) try: estimator = clf_isotonic.calibrated_classifiers_[0].base_estimator @@ -1091,8 +1126,7 @@ def test_pandas_input(self): estimator, RayXGBClassifier, ) - self.assertTrue( - np.allclose(np.array(clf_isotonic.classes_), np.array([0, 1]))) + self.assertTrue(np.allclose(np.array(clf_isotonic.classes_), np.array([0, 1]))) # def run_feature_weights(self, X, y, fw, model=RayXGBRegressor): # with TemporaryDirectory() as tmpdir: @@ -1191,13 +1225,17 @@ def boost_from_prediction(self, tree_method): self.run_boost_from_prediction(tree_method) - @unittest.skipIf(XGBOOST_VERSION < Version("1.0.0"), - f"not supported in xgb version {xgb.__version__}") + @unittest.skipIf( + XGBOOST_VERSION < Version("1.0.0"), + f"not supported in xgb version {xgb.__version__}", + ) def test_boost_from_prediction_hist(self): self.run_boost_from_prediction("hist") - @unittest.skipIf(XGBOOST_VERSION < Version("1.2.0"), - f"not supported in xgb version {xgb.__version__}") + @unittest.skipIf( + XGBOOST_VERSION < Version("1.2.0"), + f"not supported in xgb version {xgb.__version__}", + ) def test_boost_from_prediction_approx(self): self.run_boost_from_prediction("approx") @@ -1207,8 +1245,10 @@ def test_boost_from_prediction_exact(self): with self.assertRaises(ValueError): self.run_boost_from_prediction("exact") - @unittest.skipIf(XGBOOST_VERSION < Version("1.4.0"), - f"not supported in xgb version {xgb.__version__}") + @unittest.skipIf( + XGBOOST_VERSION < Version("1.4.0"), + 
f"not supported in xgb version {xgb.__version__}", + ) def test_estimator_type(self): self._init_ray() @@ -1253,7 +1293,7 @@ def test_ranking(self): "max_depth": 6, "n_estimators": 4, "random_state": 1, - "n_jobs": 2 + "n_jobs": 2, } model = RayXGBRanker(**params) model.fit( @@ -1261,7 +1301,8 @@ def test_ranking(self): y_train, qid=train_qid, eval_set=[(x_valid, y_valid)], - eval_qid=[valid_qid]) + eval_qid=[valid_qid], + ) assert model.evals_result() pred = model.predict(x_test) @@ -1276,24 +1317,27 @@ def test_ranking(self): "gamma": 1.0, "min_child_weight": 0.1, "max_depth": 6, - "random_state": 1 + "random_state": 1, } xgb_model_orig = train( params_orig, train_data, num_boost_round=4, evals=[(valid_data, "validation")], - ray_params=RayParams(num_actors=2, max_actor_restarts=0)) + ray_params=RayParams(num_actors=2, max_actor_restarts=0), + ) pred_orig = predict( xgb_model_orig, test_data, - ray_params=RayParams(num_actors=2, max_actor_restarts=0)) + ray_params=RayParams(num_actors=2, max_actor_restarts=0), + ) np.testing.assert_almost_equal(pred, pred_orig) if __name__ == "__main__": - import pytest import sys + import pytest + sys.exit(pytest.main(["-v", __file__])) diff --git a/xgboost_ray/tests/test_sklearn_matrix.py b/xgboost_ray/tests/test_sklearn_matrix.py index 271dfa89..cbe37971 100644 --- a/xgboost_ray/tests/test_sklearn_matrix.py +++ b/xgboost_ray/tests/test_sklearn_matrix.py @@ -1,19 +1,17 @@ -from packaging.version import Version -import numpy as np import unittest +import numpy as np import ray import xgboost as xgb - +from packaging.version import Version from sklearn.model_selection import train_test_split -from xgboost_ray.sklearn import (RayXGBClassifier, RayXGBRegressor) -from xgboost_ray.main import RayDMatrix - -from xgboost_ray.main import XGBOOST_VERSION +from xgboost_ray.main import XGBOOST_VERSION, RayDMatrix +from xgboost_ray.sklearn import RayXGBClassifier, RayXGBRegressor -has_label_encoder = (XGBOOST_VERSION >= Version("1.0.0") - and XGBOOST_VERSION < Version("1.6.0")) +has_label_encoder = XGBOOST_VERSION >= Version("1.0.0") and XGBOOST_VERSION < Version( + "1.6.0" +) class XGBoostRaySklearnMatrixTest(unittest.TestCase): @@ -30,8 +28,9 @@ def _init_ray(self): if not ray.is_initialized(): ray.init(num_cpus=4) - @unittest.skipIf(not has_label_encoder, - f"not supported in xgb version {xgb.__version__}") + @unittest.skipIf( + not has_label_encoder, f"not supported in xgb version {xgb.__version__}" + ) def testClassifierLabelEncoder(self, n_class=2): self._init_ray() @@ -41,50 +40,51 @@ def testClassifierLabelEncoder(self, n_class=2): y = digits["target"] X = digits["data"] - X_train, X_test, y_train, y_test = train_test_split( - X, y, test_size=0.5) + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5) train_matrix = RayDMatrix(X_train, y_train) test_matrix = RayDMatrix(X_test, y_test) with self.assertRaisesRegex(Exception, "use_label_encoder"): - RayXGBClassifier( - use_label_encoder=True, **self.params).fit(train_matrix, None) + RayXGBClassifier(use_label_encoder=True, **self.params).fit( + train_matrix, None + ) with self.assertRaisesRegex(Exception, "num_class"): - RayXGBClassifier( - use_label_encoder=False, **self.params).fit( - train_matrix, None) + RayXGBClassifier(use_label_encoder=False, **self.params).fit( + train_matrix, None + ) with self.assertRaisesRegex(Exception, r"must be \(RayDMatrix, str\)"): - RayXGBClassifier( - use_label_encoder=False, **self.params).fit( - train_matrix, None, eval_set=[(X_test, y_test)]) + 
RayXGBClassifier(use_label_encoder=False, **self.params).fit( + train_matrix, None, eval_set=[(X_test, y_test)] + ) - with self.assertRaisesRegex(Exception, - r"must be \(array_like, array_like\)"): - RayXGBClassifier( - use_label_encoder=False, **self.params).fit( - X_train, y_train, eval_set=[(test_matrix, "eval")]) + with self.assertRaisesRegex(Exception, r"must be \(array_like, array_like\)"): + RayXGBClassifier(use_label_encoder=False, **self.params).fit( + X_train, y_train, eval_set=[(test_matrix, "eval")] + ) - RayXGBClassifier( - use_label_encoder=False, num_class=n_class, **self.params).fit( - train_matrix, None) + RayXGBClassifier(use_label_encoder=False, num_class=n_class, **self.params).fit( + train_matrix, None + ) clf = RayXGBClassifier( - use_label_encoder=False, num_class=n_class, **self.params).fit( - train_matrix, None, eval_set=[(test_matrix, "eval")]) + use_label_encoder=False, num_class=n_class, **self.params + ).fit(train_matrix, None, eval_set=[(test_matrix, "eval")]) clf.predict(test_matrix) clf.predict_proba(test_matrix) - @unittest.skipIf(not has_label_encoder, - f"not supported in xgb version {xgb.__version__}") + @unittest.skipIf( + not has_label_encoder, f"not supported in xgb version {xgb.__version__}" + ) def testClassifierMulticlassLabelEncoder(self): self.testClassifierLabelEncoder(n_class=3) - @unittest.skipIf(has_label_encoder, - f"not supported in xgb version {xgb.__version__}") + @unittest.skipIf( + has_label_encoder, f"not supported in xgb version {xgb.__version__}" + ) def testClassifierNoLabelEncoder(self, n_class=2): self._init_ray() @@ -94,8 +94,7 @@ def testClassifierNoLabelEncoder(self, n_class=2): y = digits["target"] X = digits["data"] - X_train, X_test, y_train, y_test = train_test_split( - X, y, test_size=0.5) + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5) train_matrix = RayDMatrix(X_train, y_train) test_matrix = RayDMatrix(X_test, y_test) @@ -105,25 +104,26 @@ def testClassifierNoLabelEncoder(self, n_class=2): with self.assertRaisesRegex(Exception, r"must be \(RayDMatrix, str\)"): RayXGBClassifier(**self.params).fit( - train_matrix, None, eval_set=[(X_test, y_test)]) + train_matrix, None, eval_set=[(X_test, y_test)] + ) - with self.assertRaisesRegex(Exception, - r"must be \(array_like, array_like\)"): + with self.assertRaisesRegex(Exception, r"must be \(array_like, array_like\)"): RayXGBClassifier(**self.params).fit( - X_train, y_train, eval_set=[(test_matrix, "eval")]) + X_train, y_train, eval_set=[(test_matrix, "eval")] + ) - RayXGBClassifier( - num_class=n_class, **self.params).fit(train_matrix, None) + RayXGBClassifier(num_class=n_class, **self.params).fit(train_matrix, None) - clf = RayXGBClassifier( - num_class=n_class, **self.params).fit( - train_matrix, None, eval_set=[(test_matrix, "eval")]) + clf = RayXGBClassifier(num_class=n_class, **self.params).fit( + train_matrix, None, eval_set=[(test_matrix, "eval")] + ) clf.predict(test_matrix) clf.predict_proba(test_matrix) - @unittest.skipIf(has_label_encoder, - f"not supported in xgb version {xgb.__version__}") + @unittest.skipIf( + has_label_encoder, f"not supported in xgb version {xgb.__version__}" + ) def testClassifierMulticlassNoLabelEncoder(self): self.testClassifierNoLabelEncoder(n_class=3) @@ -136,24 +136,25 @@ def testRegressor(self): y = ds["target"] X = ds["data"] - X_train, X_test, y_train, y_test = train_test_split( - X, y, test_size=0.5) + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5) train_matrix = RayDMatrix(X_train, 
y_train) test_matrix = RayDMatrix(X_test, y_test) with self.assertRaisesRegex(Exception, r"must be \(RayDMatrix, str\)"): RayXGBRegressor(**self.params).fit( - train_matrix, None, eval_set=[(X_test, y_test)]) + train_matrix, None, eval_set=[(X_test, y_test)] + ) - with self.assertRaisesRegex(Exception, - r"must be \(array_like, array_like\)"): + with self.assertRaisesRegex(Exception, r"must be \(array_like, array_like\)"): RayXGBRegressor(**self.params).fit( - X_train, y_train, eval_set=[(test_matrix, "eval")]) + X_train, y_train, eval_set=[(test_matrix, "eval")] + ) RayXGBRegressor(**self.params).fit(train_matrix, None) reg = RayXGBRegressor(**self.params).fit( - train_matrix, None, eval_set=[(test_matrix, "eval")]) + train_matrix, None, eval_set=[(test_matrix, "eval")] + ) reg.predict(test_matrix) diff --git a/xgboost_ray/tests/test_tune.py b/xgboost_ray/tests/test_tune.py index 09852758..05bf5082 100644 --- a/xgboost_ray/tests/test_tune.py +++ b/xgboost_ray/tests/test_tune.py @@ -5,17 +5,20 @@ from unittest.mock import patch import numpy as np - import ray from ray import tune from ray.tune import TuneError -from ray.tune.integration.xgboost import \ - TuneReportCallback as OrigTuneReportCallback, \ - TuneReportCheckpointCallback as OrigTuneReportCheckpointCallback - -from xgboost_ray import RayDMatrix, train, RayParams -from xgboost_ray.tune import TuneReportCallback,\ - TuneReportCheckpointCallback, _try_add_tune_callback +from ray.tune.integration.xgboost import TuneReportCallback as OrigTuneReportCallback +from ray.tune.integration.xgboost import ( + TuneReportCheckpointCallback as OrigTuneReportCheckpointCallback, +) + +from xgboost_ray import RayDMatrix, RayParams, train +from xgboost_ray.tune import ( + TuneReportCallback, + TuneReportCheckpointCallback, + _try_add_tune_callback, +) try: from ray.air import Checkpoint @@ -29,12 +32,15 @@ class XGBoostRayTuneTest(unittest.TestCase): def setUp(self): ray.init(num_cpus=4) repeat = 8 # Repeat data a couple of times for stability - x = np.array([ - [1, 0, 0, 0], # Feature 0 -> Label 0 - [0, 1, 0, 0], # Feature 1 -> Label 1 - [0, 0, 1, 1], # Feature 2+3 -> Label 2 - [0, 0, 1, 0], # Feature 2+!3 -> Label 3 - ] * repeat) + x = np.array( + [ + [1, 0, 0, 0], # Feature 0 -> Label 0 + [0, 1, 0, 0], # Feature 1 -> Label 1 + [0, 0, 1, 1], # Feature 2+3 -> Label 2 + [0, 0, 1, 0], # Feature 2+!3 -> Label 3 + ] + * repeat + ) y = np.array([0, 1, 2, 3] * repeat) self.params = { @@ -46,13 +52,12 @@ def setUp(self): "num_class": 4, "eval_metric": ["mlogloss", "merror"], }, - "num_boost_round": tune.choice([1, 3]) + "num_boost_round": tune.choice([1, 3]), } - def train_func(ray_params, - callbacks=None, - check_for_spread_strategy=False, - **kwargs): + def train_func( + ray_params, callbacks=None, check_for_spread_strategy=False, **kwargs + ): def _inner_train(config, checkpoint_dir): if check_for_spread_strategy: assert tune.get_trial_resources().strategy == "SPREAD" @@ -64,7 +69,8 @@ def _inner_train(config, checkpoint_dir): num_boost_round=config["num_boost_round"], evals=[(train_set, "train")], callbacks=callbacks, - **kwargs) + **kwargs + ) return _inner_train @@ -86,11 +92,13 @@ def testNumIters(self): self.train_func(ray_params), config=self.params, resources_per_trial=ray_params.get_tune_resources(), - num_samples=1) + num_samples=1, + ) self.assertSequenceEqual( list(analysis.results_df["training_iteration"]), - list(analysis.results_df["config/num_boost_round"])) + list(analysis.results_df["config/num_boost_round"]), + ) def 
testNumItersClient(self): """Test ray client mode""" @@ -106,25 +114,25 @@ def testNumItersClient(self): def testPlacementOptions(self): ray_params = RayParams( - cpus_per_actor=1, - num_actors=1, - placement_options={"strategy": "SPREAD"}) + cpus_per_actor=1, num_actors=1, placement_options={"strategy": "SPREAD"} + ) tune.run( self.train_func(ray_params, check_for_spread_strategy=True), config=self.params, resources_per_trial=ray_params.get_tune_resources(), - num_samples=1) + num_samples=1, + ) def testElasticFails(self): """Test if error is thrown when using Tune with elastic training.""" - ray_params = RayParams( - cpus_per_actor=1, num_actors=1, elastic_training=True) + ray_params = RayParams(cpus_per_actor=1, num_actors=1, elastic_training=True) with self.assertRaises(TuneError): tune.run( self.train_func(ray_params), config=self.params, resources_per_trial=ray_params.get_tune_resources(), - num_samples=1) + num_samples=1, + ) def testReplaceTuneCheckpoints(self): """Test if ray.tune.integration.xgboost callbacks are replaced""" @@ -141,9 +149,7 @@ def testReplaceTuneCheckpoints(self): self.assertSequenceEqual(replaced._metrics, ["met"]) # Report and checkpointing callback - in_cp = [ - OrigTuneReportCheckpointCallback(metrics="met", filename="test") - ] + in_cp = [OrigTuneReportCheckpointCallback(metrics="met", filename="test")] in_dict = {"callbacks": in_cp} with patch("xgboost_ray.tune.is_session_enabled") as mocked: @@ -159,15 +165,16 @@ def testEndToEndCheckpointing(self): ray_params = RayParams(cpus_per_actor=1, num_actors=2) analysis = tune.run( self.train_func( - ray_params, - callbacks=[TuneReportCheckpointCallback(frequency=1)]), + ray_params, callbacks=[TuneReportCheckpointCallback(frequency=1)] + ), config=self.params, resources_per_trial=ray_params.get_tune_resources(), num_samples=1, metric="train-mlogloss", mode="min", log_to_file=True, - local_dir=self.experiment_dir) + local_dir=self.experiment_dir, + ) if isinstance(analysis.best_checkpoint, Checkpoint): self.assertTrue(analysis.best_checkpoint) @@ -178,15 +185,16 @@ def testEndToEndCheckpointingOrigTune(self): ray_params = RayParams(cpus_per_actor=1, num_actors=2) analysis = tune.run( self.train_func( - ray_params, - callbacks=[OrigTuneReportCheckpointCallback(frequency=1)]), + ray_params, callbacks=[OrigTuneReportCheckpointCallback(frequency=1)] + ), config=self.params, resources_per_trial=ray_params.get_tune_resources(), num_samples=1, metric="train-mlogloss", mode="min", log_to_file=True, - local_dir=self.experiment_dir) + local_dir=self.experiment_dir, + ) if isinstance(analysis.best_checkpoint, Checkpoint): self.assertTrue(analysis.best_checkpoint) @@ -195,6 +203,8 @@ def testEndToEndCheckpointingOrigTune(self): if __name__ == "__main__": - import pytest import sys + + import pytest + sys.exit(pytest.main(["-v", __file__])) diff --git a/xgboost_ray/tests/test_xgboost_api.py b/xgboost_ray/tests/test_xgboost_api.py index d7cac151..b1c6169b 100644 --- a/xgboost_ray/tests/test_xgboost_api.py +++ b/xgboost_ray/tests/test_xgboost_api.py @@ -1,14 +1,12 @@ -from typing import Tuple - import unittest +from typing import Tuple import numpy as np -import xgboost as xgb -from xgboost_ray.compat import TrainingCallback - import ray +import xgboost as xgb -from xgboost_ray import RayDMatrix, train, RayParams +from xgboost_ray import RayDMatrix, RayParams, train +from xgboost_ray.compat import TrainingCallback # From XGBoost documentation: # https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_obj.html @@ -22,11 
+20,12 @@ def gradient(predt: np.ndarray, dtrain: xgb.DMatrix) -> np.ndarray: def hessian(predt: np.ndarray, dtrain: xgb.DMatrix) -> np.ndarray: y = dtrain.get_label() - return ((-np.log1p(predt) + np.log1p(y) + 1) / np.power(predt + 1, 2)) + return (-np.log1p(predt) + np.log1p(y) + 1) / np.power(predt + 1, 2) -def squared_log(predt: np.ndarray, - dtrain: xgb.DMatrix) -> Tuple[np.ndarray, np.ndarray]: +def squared_log( + predt: np.ndarray, dtrain: xgb.DMatrix +) -> Tuple[np.ndarray, np.ndarray]: predt[predt < -1] = -1 + 1e-6 grad = gradient(predt, dtrain) hess = hessian(predt, dtrain) @@ -45,12 +44,15 @@ class XGBoostAPITest(unittest.TestCase): def setUp(self): repeat = 8 # Repeat data a couple of times for stability - self.x = np.array([ - [1, 0, 0, 0], # Feature 0 -> Label 0 - [0, 1, 0, 0], # Feature 1 -> Label 1 - [0, 0, 1, 1], # Feature 2+3 -> Label 0 - [0, 0, 1, 0], # Feature 2+!3 -> Label 1 - ] * repeat) + self.x = np.array( + [ + [1, 0, 0, 0], # Feature 0 -> Label 0 + [0, 1, 0, 0], # Feature 1 -> Label 1 + [0, 0, 1, 1], # Feature 2+3 -> Label 0 + [0, 0, 1, 0], # Feature 2+!3 -> Label 1 + ] + * repeat + ) self.y = np.array([0, 1, 0, 1] * repeat) self.params = { @@ -59,7 +61,7 @@ def setUp(self): "nthread": 1, "max_depth": 2, "objective": "binary:logistic", - "seed": 1000 + "seed": 1000, } self.kwargs = {} @@ -82,15 +84,15 @@ def testCustomObjectiveFunction(self): params = self.params.copy() params.pop("objective", None) - bst_xgb = xgb.train( - params, xgb.DMatrix(self.x, self.y), obj=squared_log) + bst_xgb = xgb.train(params, xgb.DMatrix(self.x, self.y), obj=squared_log) bst_ray = train( params, RayDMatrix(self.x, self.y), ray_params=RayParams(num_actors=2), obj=squared_log, - **self.kwargs) + **self.kwargs, + ) x_mat = xgb.DMatrix(self.x) pred_y_xgb = np.round(bst_xgb.predict(x_mat)) @@ -118,7 +120,8 @@ def testCustomMetricFunction(self): obj=squared_log, feval=rmsle, evals=[(dtrain_xgb, "dtrain")], - evals_result=evals_result_xgb) + evals_result=evals_result_xgb, + ) dtrain_ray = RayDMatrix(self.x, self.y) evals_result_ray = {} @@ -130,7 +133,8 @@ def testCustomMetricFunction(self): feval=rmsle, evals=[(dtrain_ray, "dtrain")], evals_result=evals_result_ray, - **self.kwargs) + **self.kwargs, + ) x_mat = xgb.DMatrix(self.x) pred_y_xgb = np.round(bst_xgb.predict(x_mat)) @@ -143,7 +147,9 @@ def testCustomMetricFunction(self): np.allclose( evals_result_xgb["dtrain"]["PyRMSLE"], evals_result_ray["dtrain"]["PyRMSLE"], - atol=0.1)) + atol=0.1, + ) + ) def testCallbacks(self): class _Callback(TrainingCallback): @@ -160,18 +166,21 @@ def after_iteration(self, model, epoch, evals_log): ray_params=RayParams(num_actors=2), callbacks=[callback], additional_results=additional_results, - **self.kwargs) + **self.kwargs, + ) self.assertEqual(len(additional_results["callback_returns"]), 2) self.assertTrue( - all(rank == 0 - for (_, rank) in additional_results["callback_returns"][0])) + all(rank == 0 for (_, rank) in additional_results["callback_returns"][0]) + ) self.assertTrue( - all(rank == 1 - for (_, rank) in additional_results["callback_returns"][1])) + all(rank == 1 for (_, rank) in additional_results["callback_returns"][1]) + ) if __name__ == "__main__": - import pytest import sys + + import pytest + sys.exit(pytest.main(["-v", __file__])) diff --git a/xgboost_ray/tests/utils.py b/xgboost_ray/tests/utils.py index 412e860e..0b594587 100644 --- a/xgboost_ray/tests/utils.py +++ b/xgboost_ray/tests/utils.py @@ -2,11 +2,10 @@ import os import tempfile import time -from typing import Tuple, Union, 
List, Dict, Optional +from typing import Dict, List, Optional, Tuple, Union import numpy as np import pandas as pd - import xgboost as xgb from xgboost_ray.compat import TrainingCallback @@ -15,6 +14,7 @@ def get_num_trees(bst: xgb.Booster): import json + data = [json.loads(d) for d in bst.get_dump(dump_format="json")] return len(data) // 4 @@ -24,38 +24,40 @@ def create_data(num_rows: int, num_cols: int, dtype: np.dtype = np.float32): return pd.DataFrame( np.random.uniform(0.0, 10.0, size=(num_rows, num_cols)), columns=[f"feature_{i}" for i in range(num_cols)], - dtype=dtype) + dtype=dtype, + ) -def create_labels(num_rows: int, - num_classes: int = 2, - dtype: Optional[np.dtype] = None): +def create_labels( + num_rows: int, num_classes: int = 2, dtype: Optional[np.dtype] = None +): if num_classes == 0: # Create regression label dtype = dtype or np.float32 return pd.Series( - np.random.uniform(0, 1, size=num_rows), dtype=dtype, name="label") + np.random.uniform(0, 1, size=num_rows), dtype=dtype, name="label" + ) dtype = dtype or np.int32 return pd.Series( - np.random.randint(0, num_classes, size=num_rows), - dtype=dtype, - name="label") + np.random.randint(0, num_classes, size=num_rows), dtype=dtype, name="label" + ) -def create_parquet(filename: str, - num_rows: int, - num_features: int, - num_classes: int = 2, - num_partitions: int = 1): +def create_parquet( + filename: str, + num_rows: int, + num_features: int, + num_classes: int = 2, + num_partitions: int = 1, +): partition_rows = num_rows // num_partitions for partition in range(num_partitions): print(f"Creating partition {partition}") data = create_data(partition_rows, num_features) labels = create_labels(partition_rows, num_classes) - partition = pd.Series( - np.full(partition_rows, partition), dtype=np.int32) + partition = pd.Series(np.full(partition_rows, partition), dtype=np.int32) data["labels"] = labels data["partition"] = partition @@ -65,14 +67,17 @@ def create_parquet(filename: str, filename, partition_cols=["partition"], engine="pyarrow", - partition_filename_cb=lambda key: f"part_{key[0]}.parquet") + partition_filename_cb=lambda key: f"part_{key[0]}.parquet", + ) -def create_parquet_in_tempdir(filename: str, - num_rows: int, - num_features: int, - num_classes: int = 2, - num_partitions: int = 1) -> Tuple[str, str]: +def create_parquet_in_tempdir( + filename: str, + num_rows: int, + num_features: int, + num_classes: int = 2, + num_partitions: int = 1, +) -> Tuple[str, str]: temp_dir = tempfile.mkdtemp() path = os.path.join(temp_dir, filename) create_parquet( @@ -80,7 +85,8 @@ def create_parquet_in_tempdir(filename: str, num_rows=num_rows, num_features=num_features, num_classes=num_classes, - num_partitions=num_partitions) + num_partitions=num_partitions, + ) return temp_dir, path @@ -102,16 +108,14 @@ def tree_obj(bst: xgb.Booster): return [json.loads(j) for j in bst.get_dump(dump_format="json")] -def _kill_callback(die_lock_file: str, - actor_rank: int = 0, - fail_iteration: int = 6): +def _kill_callback(die_lock_file: str, actor_rank: int = 0, fail_iteration: int = 6): """Returns a callback to kill an actor process. Args: - die_lock_file (str): A file lock used to prevent race conditions + die_lock_file: A file lock used to prevent race conditions when killing the actor. - actor_rank (int): The rank of the actor to kill. - fail_iteration (int): The iteration after which the actor is killed. + actor_rank: The rank of the actor to kill. + fail_iteration: The iteration after which the actor is killed. 
""" @@ -119,9 +123,11 @@ class _KillCallback(TrainingCallback): def after_iteration(self, model, epoch, evals_log): if get_actor_rank() == actor_rank: put_queue((epoch, time.time())) - if get_actor_rank() == actor_rank and \ - epoch == fail_iteration and \ - not os.path.exists(die_lock_file): + if ( + get_actor_rank() == actor_rank + and epoch == fail_iteration + and not os.path.exists(die_lock_file) + ): # Get PID pid = os.getpid() @@ -136,16 +142,14 @@ def after_iteration(self, model, epoch, evals_log): return _KillCallback() -def _fail_callback(die_lock_file: str, - actor_rank: int = 0, - fail_iteration: int = 6): +def _fail_callback(die_lock_file: str, actor_rank: int = 0, fail_iteration: int = 6): """Returns a callback to cause an Xgboost actor to fail training. Args: - die_lock_file (str): A file lock used to prevent race conditions + die_lock_file: A file lock used to prevent race conditions when causing the actor to fail. - actor_rank (int): The rank of the actor to fail. - fail_iteration (int): The iteration after which the training for + actor_rank: The rank of the actor to fail. + fail_iteration: The iteration after which the training for the specified actor fails. """ @@ -155,14 +159,17 @@ def after_iteration(self, model, epoch, evals_log): if get_actor_rank() == actor_rank: put_queue((epoch, time.time())) - if get_actor_rank() == actor_rank and \ - epoch == fail_iteration and \ - not os.path.exists(die_lock_file): + if ( + get_actor_rank() == actor_rank + and epoch == fail_iteration + and not os.path.exists(die_lock_file) + ): with open(die_lock_file, "wt") as fp: fp.write("") time.sleep(2) import sys + print(f"Testing: Rank {get_actor_rank()} will now fail.") sys.exit(1) @@ -173,9 +180,9 @@ def _checkpoint_callback(frequency: int = 1, before_iteration_=False): """Returns a callback to checkpoint a model. Args: - frequency (int): The interval at which checkpointing occurs. If + frequency: The interval at which checkpointing occurs. If frequency is set to n, checkpointing occurs every n epochs. - before_iteration_ (bool): If True, checkpoint before the iteration + before_iteration_: If True, checkpoint before the iteration begins. Else, checkpoint after the iteration ends. """ @@ -201,17 +208,19 @@ def _sleep_callback(sleep_iteration: int = 6, sleep_seconds: int = 5): This artificially inflates training time. Args: - sleep_iteration (int): The iteration after which the actor should + sleep_iteration: The iteration after which the actor should sleep. - sleep_seconds (int): Time in seconds the actor should sleep. + sleep_seconds: Time in seconds the actor should sleep. """ class _SleepCallback(TrainingCallback): def after_iteration(self, model, epoch, evals_log): if epoch == sleep_iteration: - print(f"Testing: Rank {get_actor_rank()} will now sleep " - f"for {sleep_seconds} seconds.") + print( + f"Testing: Rank {get_actor_rank()} will now sleep " + f"for {sleep_seconds} seconds." + ) time.sleep(sleep_seconds) return _SleepCallback() diff --git a/xgboost_ray/tune.py b/xgboost_ray/tune.py index efde3773..bc80272c 100644 --- a/xgboost_ray/tune.py +++ b/xgboost_ray/tune.py @@ -1,33 +1,37 @@ # Tune imports. 
+import logging from typing import Dict, Optional import ray - -import logging - from ray.util.annotations import PublicAPI -from xgboost_ray.xgb import xgboost as xgb - -from xgboost_ray.session import put_queue, get_rabit_rank +from xgboost_ray.session import get_rabit_rank, put_queue from xgboost_ray.util import Unavailable, force_on_current_node +from xgboost_ray.xgb import xgboost as xgb try: from ray import tune from ray.tune import is_session_enabled + from ray.tune.integration.xgboost import ( + TuneReportCallback as OrigTuneReportCallback, + ) + from ray.tune.integration.xgboost import ( + TuneReportCheckpointCallback as OrigTuneReportCheckpointCallback, + ) + from ray.tune.integration.xgboost import ( + _TuneCheckpointCallback as _OrigTuneCheckpointCallback, + ) from ray.tune.utils import flatten_dict - from ray.tune.integration.xgboost import \ - TuneReportCallback as OrigTuneReportCallback, \ - _TuneCheckpointCallback as _OrigTuneCheckpointCallback, \ - TuneReportCheckpointCallback as OrigTuneReportCheckpointCallback TUNE_INSTALLED = True except ImportError: tune = None - TuneReportCallback = _TuneCheckpointCallback = \ - TuneReportCheckpointCallback = Unavailable - OrigTuneReportCallback = _OrigTuneCheckpointCallback = \ - OrigTuneReportCheckpointCallback = object + TuneReportCallback = ( + _TuneCheckpointCallback + ) = TuneReportCheckpointCallback = Unavailable + OrigTuneReportCallback = ( + _OrigTuneCheckpointCallback + ) = OrigTuneReportCheckpointCallback = object def is_session_enabled(): return False @@ -46,8 +50,11 @@ def after_iteration(self, model, epoch: int, evals_log: Dict): class _TuneCheckpointCallback(_OrigTuneCheckpointCallback): def after_iteration(self, model, epoch: int, evals_log: Dict): if get_rabit_rank() == 0: - put_queue(lambda: self._create_checkpoint( - model, epoch, self._filename, self._frequency)) + put_queue( + lambda: self._create_checkpoint( + model, epoch, self._filename, self._frequency + ) + ) class TuneReportCheckpointCallback(OrigTuneReportCheckpointCallback): _checkpoint_callback_cls = _TuneCheckpointCallback @@ -60,13 +67,14 @@ def _try_add_tune_callback(kwargs: Dict): new_callbacks = [] has_tune_callback = False - REPLACE_MSG = "Replaced `{orig}` with `{target}`. If you want to " \ - "avoid this warning, pass `{target}` as a callback " \ - "directly in your calls to `xgboost_ray.train()`." + REPLACE_MSG = ( + "Replaced `{orig}` with `{target}`. If you want to " + "avoid this warning, pass `{target}` as a callback " + "directly in your calls to `xgboost_ray.train()`." + ) for cb in callbacks: - if isinstance(cb, - (TuneReportCallback, TuneReportCheckpointCallback)): + if isinstance(cb, (TuneReportCallback, TuneReportCheckpointCallback)): has_tune_callback = True new_callbacks.append(cb) elif isinstance(cb, OrigTuneReportCallback): @@ -75,19 +83,23 @@ def _try_add_tune_callback(kwargs: Dict): logging.warning( REPLACE_MSG.format( orig="ray.tune.integration.xgboost.TuneReportCallback", - target="xgboost_ray.tune.TuneReportCallback")) + target="xgboost_ray.tune.TuneReportCallback", + ) + ) has_tune_callback = True elif isinstance(cb, OrigTuneReportCheckpointCallback): replace_cb = TuneReportCheckpointCallback( metrics=cb._report._metrics, filename=cb._checkpoint._filename, - frequency=cb._checkpoint._frequency) + frequency=cb._checkpoint._frequency, + ) new_callbacks.append(replace_cb) logging.warning( REPLACE_MSG.format( orig="ray.tune.integration.xgboost." 
"TuneReportCheckpointCallback", - target="xgboost_ray.tune.TuneReportCheckpointCallback") + target="xgboost_ray.tune.TuneReportCheckpointCallback", + ) ) has_tune_callback = True else: @@ -103,22 +115,23 @@ def _try_add_tune_callback(kwargs: Dict): return False -def _get_tune_resources(num_actors: int, cpus_per_actor: int, - gpus_per_actor: int, - resources_per_actor: Optional[Dict], - placement_options: Optional[Dict]): +def _get_tune_resources( + num_actors: int, + cpus_per_actor: int, + gpus_per_actor: int, + resources_per_actor: Optional[Dict], + placement_options: Optional[Dict], +): """Returns object to use for ``resources_per_trial`` with Ray Tune.""" if TUNE_INSTALLED: from ray.tune import PlacementGroupFactory head_bundle = {} child_bundle = {"CPU": cpus_per_actor, "GPU": gpus_per_actor} - child_bundle_extra = {} if resources_per_actor is None else \ - resources_per_actor - child_bundles = [{ - **child_bundle, - **child_bundle_extra - } for _ in range(num_actors)] + child_bundle_extra = {} if resources_per_actor is None else resources_per_actor + child_bundles = [ + {**child_bundle, **child_bundle_extra} for _ in range(num_actors) + ] bundles = [head_bundle] + child_bundles placement_options = placement_options or {} placement_options.setdefault("strategy", "PACK") @@ -127,14 +140,15 @@ def _get_tune_resources(num_actors: int, cpus_per_actor: int, # TODO remove after Ray 2.3 is out if placement_options.get("_max_cpu_fraction_per_node", None) is None: placement_options.pop("_max_cpu_fraction_per_node", None) - placement_group_factory = PlacementGroupFactory( - bundles, **placement_options) + placement_group_factory = PlacementGroupFactory(bundles, **placement_options) return placement_group_factory else: - raise RuntimeError("Tune is not installed, so `get_tune_resources` is " - "not supported. You can install Ray Tune via `pip " - "install ray[tune]`.") + raise RuntimeError( + "Tune is not installed, so `get_tune_resources` is " + "not supported. You can install Ray Tune via `pip " + "install ray[tune]`." + ) @PublicAPI(stability="beta") diff --git a/xgboost_ray/util.py b/xgboost_ray/util.py index ebf3e231..43b1f448 100644 --- a/xgboost_ray/util.py +++ b/xgboost_ray/util.py @@ -1,6 +1,5 @@ -from typing import Dict, Optional, List - import asyncio +from typing import Dict, List, Optional import ray from ray.util.annotations import DeveloperAPI @@ -56,7 +55,7 @@ class MultiActorTask: The `is_ready()` method will return True once all futures are ready. Args: - pending_futures (list): List of object references (futures) + pending_futures: List of object references (futures) that should be tracked. """