diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md
index ed1a2d304916..d4f028e33f93 100644
--- a/.github/ISSUE_TEMPLATE.md
+++ b/.github/ISSUE_TEMPLATE.md
@@ -1,4 +1,4 @@
-Thanks for participating in the XGBoost community! We use https://discuss.xgboost.ai for any general usage questions and discussions. The issue tracker is used for actionable items such as feature proposals discussion, roadmaps, and bug tracking. You are always welcomed to post on the forum first :)
+Thanks for participating in the XGBoost community! The issue tracker is used for actionable items such as feature proposals discussion, roadmaps, and bug tracking. Issues that are inactive for a period of time may get closed. We adopt this policy so that we won't lose track of actionable issues that may fall at the bottom of the pile. Feel free to reopen a new one if you feel there is an additional problem that needs attention when an old one gets closed.
diff --git a/.github/dependabot.yml b/.github/dependabot.yml
deleted file mode 100644
index 1a8098071ba3..000000000000
--- a/.github/dependabot.yml
+++ /dev/null
@@ -1,35 +0,0 @@
-# To get started with Dependabot version updates, you'll need to specify which
-# package ecosystems to update and where the package manifests are located.
-# Please see the documentation for all configuration options:
-# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
-
-version: 2
-updates:
-  - package-ecosystem: "maven"
-    directory: "/jvm-packages"
-    schedule:
-      interval: "monthly"
-  - package-ecosystem: "maven"
-    directory: "/jvm-packages/xgboost4j"
-    schedule:
-      interval: "monthly"
-  - package-ecosystem: "maven"
-    directory: "/jvm-packages/xgboost4j-gpu"
-    schedule:
-      interval: "monthly"
-  - package-ecosystem: "maven"
-    directory: "/jvm-packages/xgboost4j-example"
-    schedule:
-      interval: "monthly"
-  - package-ecosystem: "maven"
-    directory: "/jvm-packages/xgboost4j-spark"
-    schedule:
-      interval: "monthly"
-  - package-ecosystem: "maven"
-    directory: "/jvm-packages/xgboost4j-spark-gpu"
-    schedule:
-      interval: "monthly"
-  - package-ecosystem: "github-actions"
-    directory: /
-    schedule:
-      interval: "monthly"
diff --git a/.github/runs-on.yml b/.github/runs-on.yml
index d951a08e8273..e21895ee8c3b 100644
--- a/.github/runs-on.yml
+++ b/.github/runs-on.yml
@@ -34,4 +34,3 @@ runners:
     cpu: 32
     family: ["c7i-flex", "c7i", "c7a", "c5", "c5a"]
     image: windows-amd64
-
diff --git a/.github/workflows/freebsd.yml b/.github/workflows/freebsd.yml
index d3208a1294d1..26e8fa34c119 100644
--- a/.github/workflows/freebsd.yml
+++ b/.github/workflows/freebsd.yml
@@ -15,20 +15,15 @@ jobs:
     timeout-minutes: 20
     name: A job to run test in FreeBSD
     steps:
-    - uses: actions/checkout@v4
-      with:
-        submodules: 'true'
-    - name: Test in FreeBSD
-      id: test
-      uses: vmactions/freebsd-vm@v1
-      with:
-        usesh: true
-        prepare: |
-          pkg install -y cmake git ninja googletest
-
-        run: |
-          mkdir build
-          cd build
-          cmake ..
-GNinja -DGOOGLE_TEST=ON - ninja -v - ./testxgboost + - uses: actions/checkout@v4 + with: + submodules: 'true' + - name: Test in FreeBSD + id: test + uses: vmactions/freebsd-vm@v1 + with: + usesh: true + prepare: | + pkg install -y cmake git ninja googletest bash + run: | + bash ops/pipeline/test-freebsd.sh diff --git a/.github/workflows/i386.yml b/.github/workflows/i386.yml index aec7e9d31087..8b7c71a82bf8 100644 --- a/.github/workflows/i386.yml +++ b/.github/workflows/i386.yml @@ -19,25 +19,25 @@ jobs: ports: - 5000:5000 steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3.7.1 - with: - driver-opts: network=host - - name: Build and push container - uses: docker/build-push-action@v6 - with: - context: . - file: tests/ci_build/Dockerfile.i386 - push: true - tags: localhost:5000/xgboost/build-32bit:latest - cache-from: type=gha - cache-to: type=gha,mode=max - - name: Build XGBoost - run: | - docker run --rm -v $PWD:/workspace -w /workspace \ - -e CXXFLAGS='-Wno-error=overloaded-virtual -Wno-error=maybe-uninitialized -Wno-error=redundant-move' \ - localhost:5000/xgboost/build-32bit:latest \ - tests/ci_build/build_via_cmake.sh + - uses: actions/checkout@v4 + with: + submodules: 'true' + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + with: + driver-opts: network=host + - name: Build and push container + uses: docker/build-push-action@v6 + with: + context: . + file: ops/docker/dockerfile/Dockerfile.i386 + push: true + tags: localhost:5000/xgboost/build-32bit:latest + cache-from: type=gha + cache-to: type=gha,mode=max + - name: Build XGBoost + run: | + docker run --rm -v $PWD:/workspace -w /workspace \ + -e CXXFLAGS='-Wno-error=overloaded-virtual -Wno-error=maybe-uninitialized -Wno-error=redundant-move' \ + localhost:5000/xgboost/build-32bit:latest \ + bash ops/script/build_via_cmake.sh diff --git a/.github/workflows/jvm_tests.yml b/.github/workflows/jvm_tests.yml index 945f362685a4..53e695721887 100644 --- a/.github/workflows/jvm_tests.yml +++ b/.github/workflows/jvm_tests.yml @@ -1,100 +1,287 @@ -name: XGBoost-JVM-Tests +name: XGBoost CI (JVM packages) on: [push, pull_request] permissions: - contents: read # to fetch code (actions/checkout) + contents: read # to fetch code (actions/checkout) concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} cancel-in-progress: true +env: + BRANCH_NAME: >- + ${{ github.event.pull_request.number && 'PR-' }}${{ github.event.pull_request.number || github.ref_name }} + USE_DOCKER_CACHE: 1 + jobs: - test-with-jvm: - name: Test JVM on OS ${{ matrix.os }} - timeout-minutes: 30 - runs-on: ${{ matrix.os }} + build-containers: + name: Build CI containers (${{ matrix.container_id }}) + runs-on: + - runs-on + - runner=${{ matrix.runner }} + - run-id=${{ github.run_id }} + - tag=jvm-tests-build-containers-${{ matrix.container_id }} strategy: - fail-fast: false matrix: - os: [windows-latest, ubuntu-latest, macos-13] - + container_id: + - xgb-ci.manylinux2014_x86_64 + - xgb-ci.jvm + - xgb-ci.jvm_gpu_build + runner: [linux-amd64-cpu] + include: + - container_id: xgb-ci.manylinux2014_aarch64 + runner: linux-arm64-cpu steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4 + with: + submodules: 
"true" + - name: Build ${{ matrix.container_id }} + run: bash ops/docker_build.sh ${{ matrix.container_id }} - - uses: actions/setup-java@8df1039502a15bceb9433410b1a100fbe190c53b # v4.5.0 - with: - distribution: 'temurin' - java-version: '8' - - - uses: conda-incubator/setup-miniconda@d2e6a045a86077fb6cad6f5adf368e9076ddaa8d # v3.1.0 - with: - miniforge-variant: Miniforge3 - miniforge-version: latest - activate-environment: jvm_tests - environment-file: tests/ci_build/conda_env/jvm_tests.yml - use-mamba: true + build-jvm-manylinux2014: + name: >- + Build libxgboost4j.so targeting glibc 2.17 + (arch ${{ matrix.arch }}, runner ${{ matrix.runner }}) + needs: build-containers + runs-on: + - runs-on + - runner=${{ matrix.runner }} + - run-id=${{ github.run_id }} + - tag=jvm-tests-build-jvm-manylinux2014-${{ matrix.arch }} + strategy: + fail-fast: false + matrix: + include: + - arch: aarch64 + runner: linux-arm64-cpu + - arch: x86_64 + runner: linux-amd64-cpu + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh xgb-ci.manylinux2014_${{ matrix.arch }} + - run: bash ops/pipeline/build-jvm-manylinux2014.sh ${{ matrix.arch }} + - name: Upload libxgboost4j.so + run: | + libname=lib/libxgboost4j_linux_${{ matrix.arch }}_${{ github.sha }}.so + mv -v lib/libxgboost4j.so ${libname} + bash ops/pipeline/publish-artifact.sh ${libname} \ + s3://xgboost-nightly-builds/${{ env.BRANCH_NAME }}/libxgboost4j/ - - name: Cache Maven packages - uses: actions/cache@6849a6489940f00c2f30c0fb92c6274307ccb58a # v4.1.2 - with: - path: ~/.m2 - key: ${{ runner.os }}-m2-${{ hashFiles('./jvm-packages/pom.xml') }} - restore-keys: ${{ runner.os }}-m2-${{ hashFiles('./jvm-packages/pom.xml') }} + build-jvm-gpu: + name: Build libxgboost4j.so with CUDA + needs: build-containers + runs-on: + - runs-on=${{ github.run_id }} + - runner=linux-amd64-cpu + - tag=jvm-tests-build-jvm-gpu + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh xgb-ci.jvm_gpu_build + - run: bash ops/pipeline/build-jvm-gpu.sh + - name: Stash files + run: | + bash ops/pipeline/stash-artifacts.sh stash build-jvm-gpu lib/libxgboost4j.so - - name: Test XGBoost4J (Core) - run: | - cd jvm-packages - mvn test -B -pl :xgboost4j_2.12 + build-jvm-mac: + name: "Build libxgboost4j.dylib for ${{ matrix.description }}" + runs-on: ${{ matrix.runner }} + strategy: + fail-fast: false + matrix: + include: + - description: "MacOS (Apple Silicon)" + script: ops/pipeline/build-jvm-macos-apple-silicon.sh + libname: libxgboost4j_m1_${{ github.sha }}.dylib + runner: macos-14 + - description: "MacOS (Intel)" + script: ops/pipeline/build-jvm-macos-intel.sh + libname: libxgboost4j_intel_${{ github.sha }}.dylib + runner: macos-13 + steps: + - uses: actions/checkout@v4 + with: + submodules: "true" + - run: bash ${{ matrix.script }} + - name: Upload libxgboost4j.dylib + run: | + mv -v lib/libxgboost4j.dylib ${{ matrix.libname }} + bash ops/pipeline/publish-artifact.sh ${{ matrix.libname }} \ + s3://xgboost-nightly-builds/${{ env.BRANCH_NAME }}/libxgboost4j/ + env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }} - - name: 
Test XGBoost4J (Core, Spark, Examples) - run: | - rm -rfv build/ - cd jvm-packages - mvn -B test - if: matrix.os == 'ubuntu-latest' # Distributed training doesn't work on Windows + build-jvm-docs: + name: Build docs for JVM packages + needs: [build-jvm-gpu] + runs-on: + - runs-on=${{ github.run_id }} + - runner=linux-amd64-cpu + - tag=jvm-tests-build-jvm-docs + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh xgb-ci.jvm_gpu_build + - name: Unstash files + run: | + bash ops/pipeline/stash-artifacts.sh unstash build-jvm-gpu lib/libxgboost4j.so + - run: bash ops/pipeline/build-jvm-doc.sh + - name: Upload JVM doc + run: | + bash ops/pipeline/publish-artifact.sh \ + jvm-packages/${{ env.BRANCH_NAME }}.tar.bz2 \ + s3://xgboost-docs/ - - name: Extract branch name - shell: bash - run: | - echo "branch=${GITHUB_REF#refs/heads/}" >> "$GITHUB_OUTPUT" - id: extract_branch - if: | - (github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')) && - (matrix.os == 'windows-latest' || matrix.os == 'macos-13') + build-test-jvm-packages: + name: Build and test JVM packages (Linux, Scala ${{ matrix.scala_version }}) + needs: build-containers + runs-on: + - runs-on=${{ github.run_id }} + - runner=linux-amd64-cpu + - tag=jvm-tests-build-test-jvm-packages-scala${{ matrix.scala_version }} + strategy: + fail-fast: false + matrix: + scala_version: ["2.12", "2.13"] + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh xgb-ci.jvm + - name: Build and test JVM packages (Scala ${{ matrix.scala_version }}) + run: bash ops/pipeline/build-test-jvm-packages.sh + env: + SCALA_VERSION: ${{ matrix.scala_version }} + - name: Stash files + run: | + bash ops/pipeline/stash-artifacts.sh stash \ + build-test-jvm-packages lib/libxgboost4j.so + if: matrix.scala_version == '2.13' - - name: Publish artifact xgboost4j.dll to S3 - run: | - cd lib/ - Rename-Item -Path xgboost4j.dll -NewName xgboost4j_${{ github.sha }}.dll - dir - python -m awscli s3 cp xgboost4j_${{ github.sha }}.dll s3://xgboost-nightly-builds/${{ steps.extract_branch.outputs.branch }}/libxgboost4j/ --acl public-read --region us-west-2 - if: | - (github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')) && - matrix.os == 'windows-latest' - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }} + build-test-jvm-packages-other-os: + name: Build and test JVM packages (${{ matrix.os }}) + timeout-minutes: 30 + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [windows-latest, macos-13] + steps: + - uses: actions/checkout@v4 + with: + submodules: 'true' + - uses: actions/setup-java@v4 + with: + distribution: 'temurin' + java-version: '8' + - uses: dmlc/xgboost-devops/miniforge-setup@main + with: + environment-name: minimal + environment-file: ops/conda_env/minimal.yml + - name: Cache Maven packages + uses: actions/cache@v4 + with: + path: ~/.m2 + key: ${{ runner.os }}-m2-${{ hashFiles('./jvm-packages/pom.xml') }} + restore-keys: ${{ runner.os }}-m2-${{ hashFiles('./jvm-packages/pom.xml') }} + - name: Test XGBoost4J (Core) + run: | + cd 
jvm-packages + mvn test -B -pl :xgboost4j_2.12 + - name: Publish artifact xgboost4j.dll to S3 + run: | + cd lib/ + Rename-Item -Path xgboost4j.dll -NewName xgboost4j_${{ github.sha }}.dll + python -m awscli s3 cp xgboost4j_${{ github.sha }}.dll ` + s3://xgboost-nightly-builds/${{ env.BRANCH_NAME }}/libxgboost4j/ ` + --acl public-read --region us-west-2 + if: | + (github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')) && + matrix.os == 'windows-latest' + env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }} - - name: Publish artifact libxgboost4j.dylib to S3 - shell: bash -l {0} - run: | - cd lib/ - mv -v libxgboost4j.dylib libxgboost4j_${{ github.sha }}.dylib - ls - python -m awscli s3 cp libxgboost4j_${{ github.sha }}.dylib s3://xgboost-nightly-builds/${{ steps.extract_branch.outputs.branch }}/libxgboost4j/ --acl public-read --region us-west-2 - if: | - (github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')) && - matrix.os == 'macos-13' - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }} + test-jvm-packages-gpu: + name: Test JVM packages with CUDA (Scala ${{ matrix.scala_version }}) + needs: [build-jvm-gpu] + runs-on: + - runs-on=${{ github.run_id }} + - runner=linux-amd64-mgpu + - tag=jvm-tests-test-jvm-packages-gpu-scala${{ matrix.scala_version }} + strategy: + fail-fast: false + matrix: + scala_version: ["2.12", "2.13"] + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh xgb-ci.jvm_gpu_build + - name: Unstash files + run: | + bash ops/pipeline/stash-artifacts.sh unstash build-jvm-gpu lib/libxgboost4j.so + - run: bash ops/pipeline/test-jvm-gpu.sh + env: + SCALA_VERSION: ${{ matrix.scala_version }} - - name: Build and Test XGBoost4J with scala 2.13 - run: | - rm -rfv build/ - cd jvm-packages - mvn -B clean install test -Pdefault,scala-2.13 - if: matrix.os == 'ubuntu-latest' # Distributed training doesn't work on Windows + deploy-jvm-packages: + name: Deploy JVM packages to S3 (${{ matrix.variant.name }}) + needs: [build-jvm-gpu, build-test-jvm-packages, test-jvm-packages-gpu] + runs-on: + - runs-on + - runner=linux-amd64-cpu + - run-id=${{ github.run_id }} + - tag=jvm-tests-deploy-jvm-packages-${{ matrix.variant.name }}-scala${{ matrix.scala_version }} + strategy: + fail-fast: false + matrix: + variant: + - name: cpu + container_id: xgb-ci.jvm + artifact_from: build-test-jvm-packages + - name: gpu + container_id: xgb-ci.jvm_gpu_build + artifact_from: build-jvm-gpu + scala_version: ['2.12', '2.13'] + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh ${{ matrix.variant.container_id }} + - name: Unstash files + run: | + bash ops/pipeline/stash-artifacts.sh \ + unstash ${{ matrix.variant.artifact_from }} \ + lib/libxgboost4j.so + ls -lh lib/libxgboost4j.so + - name: Deploy JVM packages to S3 + run: | + bash ops/pipeline/deploy-jvm-packages.sh ${{ matrix.variant.name }} \ + ${{ matrix.variant.container_id }} ${{ matrix.scala_version }} diff --git 
a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 000000000000..2c400b073988 --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,119 @@ +name: XGBoost CI (Lint) + +on: [push, pull_request] + +permissions: + contents: read # to fetch code (actions/checkout) + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +env: + BRANCH_NAME: >- + ${{ github.event.pull_request.number && 'PR-' }}${{ github.event.pull_request.number || github.ref_name }} + +jobs: + build-containers: + name: Build CI containers + env: + CONTAINER_ID: xgb-ci.clang_tidy + runs-on: + - runs-on=${{ github.run_id }} + - runner=linux-amd64-cpu + - tag=lint-build-containers + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Build ${{ env.CONTAINER_ID }} + run: bash ops/docker_build.sh ${{ env.CONTAINER_ID }} + + clang-tidy: + name: Run clang-tidy + needs: build-containers + runs-on: + - runs-on=${{ github.run_id }} + - runner=linux-amd64-cpu + - tag=lint-clang-tidy + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh xgb-ci.clang_tidy + - run: bash ops/pipeline/run-clang-tidy.sh + + python-mypy-lint: + runs-on: ubuntu-latest + name: Type and format checks for the Python package + steps: + - uses: actions/checkout@v4 + with: + submodules: 'true' + - uses: dmlc/xgboost-devops/miniforge-setup@main + with: + environment-name: python_lint + environment-file: ops/conda_env/python_lint.yml + - name: Run mypy + shell: bash -el {0} + run: | + python ops/script/lint_python.py --format=0 --type-check=1 --pylint=0 + - name: Run formatter + shell: bash -el {0} + run: | + python ops/script/lint_python.py --format=1 --type-check=0 --pylint=0 + - name: Run pylint + shell: bash -el {0} + run: | + python ops/script/lint_python.py --format=0 --type-check=0 --pylint=1 + + cpp-lint: + runs-on: ubuntu-latest + name: Code linting for C++ + steps: + - uses: actions/checkout@v4 + with: + submodules: 'true' + - uses: actions/setup-python@v5 + with: + python-version: "3.10" + architecture: 'x64' + - name: Install Python packages + run: | + python -m pip install wheel setuptools cmakelint cpplint==1.6.1 pylint + - name: Run lint + run: | + python3 ops/script/lint_cpp.py + bash ops/script/lint_cmake.sh + + lintr: + runs-on: ubuntu-latest + name: Run R linters on Ubuntu + env: + R_REMOTES_NO_ERRORS_FROM_WARNINGS: true + steps: + - uses: actions/checkout@v4 + with: + submodules: 'true' + - uses: r-lib/actions/setup-r@v2 + with: + r-version: "release" + - name: Cache R packages + uses: actions/cache@v4 + with: + path: ${{ env.R_LIBS_USER }} + key: ${{ runner.os }}-r-release-7-${{ hashFiles('R-package/DESCRIPTION') }} + restore-keys: ${{ runner.os }}-r-release-7-${{ hashFiles('R-package/DESCRIPTION') }} + - name: Install dependencies + shell: Rscript {0} + run: | + source("./R-package/tests/helper_scripts/install_deps.R") + - name: Run lintr + run: | + MAKEFLAGS="-j$(nproc)" R CMD INSTALL R-package/ + Rscript ops/script/lint_r.R $(pwd) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index d1395c15f77e..cbed730405fa 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -1,193 +1,311 @@ 
-# This is a basic workflow to help you get started with Actions +name: XGBoost CI -name: XGBoost-CI - -# Controls when the action will run. Triggers the workflow on push or pull request -# events but only for the master branch on: [push, pull_request] permissions: - contents: read # to fetch code (actions/checkout) + contents: read # to fetch code (actions/checkout) concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} cancel-in-progress: true -# A workflow run is made up of one or more jobs that can run sequentially or in parallel +env: + BRANCH_NAME: >- + ${{ github.event.pull_request.number && 'PR-' }}${{ github.event.pull_request.number || github.ref_name }} + USE_DOCKER_CACHE: 1 + jobs: - gtest-cpu: - name: Test Google C++ test (CPU) - runs-on: ${{ matrix.os }} + build-containers: + name: Build CI containers (${{ matrix.container_id }}) + runs-on: + - runs-on + - runner=${{ matrix.runner }} + - run-id=${{ github.run_id }} + - tag=main-build-containers-${{ matrix.container_id }} strategy: - fail-fast: false matrix: - os: [macos-12] + container_id: + - xgb-ci.gpu_build_rockylinux8 + - xgb-ci.gpu_build_rockylinux8_dev_ver + - xgb-ci.gpu_build_r_rockylinux8 + - xgb-ci.gpu + - xgb-ci.cpu + - xgb-ci.manylinux_2_28_x86_64 + - xgb-ci.manylinux2014_x86_64 + runner: [linux-amd64-cpu] + include: + - container_id: xgb-ci.manylinux2014_aarch64 + runner: linux-arm64-cpu + - container_id: xgb-ci.aarch64 + runner: linux-arm64-cpu + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Build ${{ matrix.container_id }} + run: bash ops/docker_build.sh ${{ matrix.container_id }} + + build-cpu: + name: Build CPU + needs: build-containers + runs-on: + - runs-on=${{ github.run_id }} + - runner=linux-amd64-cpu + - tag=main-build-cpu + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh xgb-ci.cpu + - run: bash ops/pipeline/build-cpu.sh + - name: Stash CLI executable + run: bash ops/pipeline/stash-artifacts.sh stash build-cpu ./xgboost + + build-cpu-arm64: + name: Build CPU ARM64 + manylinux_2_28_aarch64 wheel + needs: build-containers + runs-on: + - runs-on=${{ github.run_id }} + - runner=linux-arm64-cpu + - tag=build-cpu-arm64 + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh xgb-ci.aarch64 + - run: bash ops/pipeline/build-cpu-arm64.sh + - name: Stash files + run: | + bash ops/pipeline/stash-artifacts.sh stash build-cpu-arm64 \ + ./xgboost python-package/dist/*.whl + - name: Upload Python wheel + run: | + bash ops/pipeline/publish-artifact.sh python-package/dist/*.whl \ + s3://xgboost-nightly-builds/${{ env.BRANCH_NAME }}/ + + build-cuda: + name: Build CUDA + manylinux_2_28_x86_64 wheel + needs: build-containers + runs-on: + - runs-on=${{ github.run_id }} + - runner=linux-amd64-cpu + - tag=main-build-cuda + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh 
xgb-ci.gpu_build_rockylinux8 + - name: Fetch container from cache + run: bash ops/docker_build.sh xgb-ci.manylinux_2_28_x86_64 + - run: bash ops/pipeline/build-cuda.sh + - name: Stash files + run: | + bash ops/pipeline/stash-artifacts.sh stash build-cuda \ + build/testxgboost ./xgboost python-package/dist/*.whl + - name: Upload Python wheel + run: | + for file in python-package/dist/*.whl python-package/dist/meta.json + do + bash ops/pipeline/publish-artifact.sh "${file}" \ + s3://xgboost-nightly-builds/${{ env.BRANCH_NAME }}/ + done + + build-cuda-with-rmm: + name: Build CUDA with RMM + needs: build-containers + runs-on: + - runs-on=${{ github.run_id }} + - runner=linux-amd64-cpu + - tag=main-build-cuda-with-rmm + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh xgb-ci.gpu_build_rockylinux8 + - name: Fetch container from cache + run: bash ops/docker_build.sh xgb-ci.manylinux_2_28_x86_64 + - run: | + bash ops/pipeline/build-cuda-with-rmm.sh xgb-ci.gpu_build_rockylinux8 + - name: Stash files + run: | + bash ops/pipeline/stash-artifacts.sh \ + stash build-cuda-with-rmm build/testxgboost + - name: Upload Python wheel + run: | + bash ops/pipeline/publish-artifact.sh python-package/dist/*.whl \ + s3://xgboost-nightly-builds/experimental_build_with_rmm/ + + build-cuda-with-rmm-dev: + name: Build CUDA with RMM (dev) + needs: build-containers + runs-on: + - runs-on=${{ github.run_id }} + - runner=linux-amd64-cpu + - tag=main-build-cuda-with-rmm-dev steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - name: Install system packages - run: | - brew install ninja libomp - - name: Build gtest binary - run: | - mkdir build - cd build - cmake .. 
-DGOOGLE_TEST=ON -DUSE_OPENMP=ON -DUSE_DMLC_GTEST=ON -GNinja -DBUILD_DEPRECATED_CLI=ON -DUSE_SANITIZER=ON -DENABLED_SANITIZERS=address -DCMAKE_BUILD_TYPE=RelWithDebInfo - ninja -v - - name: Run gtest binary - run: | - cd build - ./testxgboost - ctest -R TestXGBoostCLI --extra-verbose + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh xgb-ci.gpu_build_rockylinux8_dev_ver + - name: Fetch container from cache + run: bash ops/docker_build.sh xgb-ci.manylinux_2_28_x86_64 + - run: | + bash ops/pipeline/build-cuda-with-rmm.sh xgb-ci.gpu_build_rockylinux8_dev_ver - gtest-cpu-nonomp: - name: Test Google C++ unittest (CPU Non-OMP) - runs-on: ${{ matrix.os }} + build-manylinux2014: + name: Build manylinux2014_${{ matrix.arch }} wheel + needs: build-containers + runs-on: + - runs-on + - runner=${{ matrix.runner }} + - run-id=${{ github.run_id }} + - tag=main-build-manylinux2014-${{ matrix.arch }} strategy: fail-fast: false matrix: - os: [ubuntu-latest] + include: + - arch: aarch64 + runner: linux-arm64-cpu + - arch: x86_64 + runner: linux-amd64-cpu + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh xgb-ci.manylinux2014_${{ matrix.arch }} + - run: bash ops/pipeline/build-manylinux2014.sh ${{ matrix.arch }} + - name: Upload Python wheel + run: | + for wheel in python-package/dist/*.whl + do + bash ops/pipeline/publish-artifact.sh "${wheel}" \ + s3://xgboost-nightly-builds/${{ env.BRANCH_NAME }}/ + done + + build-gpu-rpkg: + name: Build GPU-enabled R package + needs: build-containers + runs-on: + - runs-on=${{ github.run_id }} + - runner=linux-amd64-cpu + - tag=main-build-gpu-rpkg steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - name: Install system packages - run: | - sudo apt-get install -y --no-install-recommends ninja-build - - name: Build and install XGBoost - shell: bash -l {0} - run: | - mkdir build - cd build - cmake .. 
-GNinja -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DUSE_OPENMP=OFF -DBUILD_DEPRECATED_CLI=ON - ninja -v - - name: Run gtest binary - run: | - cd build - ctest --extra-verbose + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh xgb-ci.gpu_build_r_rockylinux8 + - run: bash ops/pipeline/build-gpu-rpkg.sh + - name: Upload R tarball + run: | + bash ops/pipeline/publish-artifact.sh xgboost_r_gpu_linux_*.tar.gz \ + s3://xgboost-nightly-builds/${{ env.BRANCH_NAME }}/ - gtest-cpu-sycl: - name: Test Google C++ unittest (CPU SYCL) - runs-on: ${{ matrix.os }} + + test-cpp-gpu: + name: >- + Run Google Tests with GPUs + (Suite ${{ matrix.suite }}, Runner ${{ matrix.runner }}) + needs: [build-cuda, build-cuda-with-rmm] + runs-on: + - runs-on + - runner=${{ matrix.runner }} + - run-id=${{ github.run_id }} + - tag=main-test-cpp-gpu-${{ matrix.suite }} strategy: fail-fast: false matrix: - os: [ubuntu-latest] - python-version: ["3.10"] + include: + - suite: gpu + runner: linux-amd64-gpu + artifact_from: build-cuda + - suite: gpu-rmm + runner: linux-amd64-gpu + artifact_from: build-cuda-with-rmm + - suite: mgpu + runner: linux-amd64-mgpu + artifact_from: build-cuda steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - uses: conda-incubator/setup-miniconda@d2e6a045a86077fb6cad6f5adf368e9076ddaa8d # v3.1.0 - with: - miniforge-variant: Miniforge3 - miniforge-version: latest - activate-environment: linux_sycl_test - environment-file: tests/ci_build/conda_env/linux_sycl_test.yml - use-mamba: true - - name: Display Conda env - run: | - conda info - conda list - - name: Build and install XGBoost - shell: bash -l {0} - run: | - mkdir build - cd build - cmake .. 
-DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DPLUGIN_SYCL=ON -DCMAKE_CXX_COMPILER=g++ -DCMAKE_C_COMPILER=gcc -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX - make -j$(nproc) - - name: Run gtest binary for SYCL - run: | - cd build - ./testxgboost --gtest_filter=Sycl* - - name: Run gtest binary for non SYCL - run: | - cd build - ./testxgboost --gtest_filter=-Sycl* + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh xgb-ci.gpu + - name: Unstash gtest + run: | + bash ops/pipeline/stash-artifacts.sh unstash ${{ matrix.artifact_from }} \ + build/testxgboost + chmod +x build/testxgboost + - run: bash ops/pipeline/test-cpp-gpu.sh ${{ matrix.suite }} - c-api-demo: - name: Test installing XGBoost lib + building the C API demo - runs-on: ${{ matrix.os }} - defaults: - run: - shell: bash -l {0} + test-python-wheel: + name: Run Python tests (${{ matrix.description }}) + needs: [build-cuda, build-cpu-arm64] + runs-on: + - runs-on + - runner=${{ matrix.runner }} + - run-id=${{ github.run_id }} + - tag=main-test-python-wheel-${{ matrix.description }} strategy: fail-fast: false matrix: - os: ["ubuntu-latest"] - python-version: ["3.10"] - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - uses: conda-incubator/setup-miniconda@d2e6a045a86077fb6cad6f5adf368e9076ddaa8d # v3.1.0 - with: - miniforge-variant: Miniforge3 - miniforge-version: latest - activate-environment: cpp_test - environment-file: tests/ci_build/conda_env/cpp_test.yml - use-mamba: true - - name: Display Conda env - run: | - conda info - conda list - - - name: Build and install XGBoost static library - run: | - mkdir build - cd build - cmake .. -DBUILD_STATIC_LIB=ON -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -GNinja - ninja -v install - cd - - - name: Build and run C API demo with static - run: | - pushd . - cd demo/c-api/ - mkdir build - cd build - cmake .. -GNinja -DCMAKE_PREFIX_PATH=$CONDA_PREFIX - ninja -v - ctest - cd .. - rm -rf ./build - popd - - - name: Build and install XGBoost shared library - run: | - cd build - cmake .. -DBUILD_STATIC_LIB=OFF -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -GNinja -DPLUGIN_FEDERATED=ON -DGOOGLE_TEST=ON - ninja -v install - ./testxgboost - cd - - - name: Build and run C API demo with shared - run: | - pushd . - cd demo/c-api/ - mkdir build - cd build - cmake .. 
-GNinja -DCMAKE_PREFIX_PATH=$CONDA_PREFIX - ninja -v - ctest - popd - ./tests/ci_build/verify_link.sh ./demo/c-api/build/basic/api-demo - ./tests/ci_build/verify_link.sh ./demo/c-api/build/external-memory/external-memory-demo - - cpp-lint: - runs-on: ubuntu-latest - name: Code linting for C++ + include: + - description: single-gpu + container: xgb-ci.gpu + suite: gpu + runner: linux-amd64-gpu + artifact_from: build-cuda + - description: multiple-gpu + container: xgb-ci.gpu + suite: mgpu + runner: linux-amd64-mgpu + artifact_from: build-cuda + - description: cpu-amd64 + container: xgb-ci.cpu + suite: cpu + runner: linux-amd64-cpu + artifact_from: build-cuda + - description: cpu-arm64 + container: xgb-ci.aarch64 + suite: cpu-arm64 + runner: linux-arm64-cpu + artifact_from: build-cpu-arm64 steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 - with: - python-version: "3.10" - architecture: 'x64' - - name: Install Python packages - run: | - python -m pip install wheel setuptools cmakelint cpplint==1.6.1 pylint - - name: Run lint - run: | - python3 tests/ci_build/lint_cpp.py - sh ./tests/ci_build/lint_cmake.sh + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh ${{ matrix.container }} + - name: Unstash Python wheel + run: | + bash ops/pipeline/stash-artifacts.sh unstash ${{ matrix.artifact_from }} \ + python-package/dist/*.whl ./xgboost + chmod +x ./xgboost + - name: Run Python tests, ${{ matrix.description }} + run: bash ops/pipeline/test-python-wheel.sh ${{ matrix.suite }} ${{ matrix.container }} diff --git a/.github/workflows/misc.yml b/.github/workflows/misc.yml new file mode 100644 index 000000000000..67c1bf57d3a2 --- /dev/null +++ b/.github/workflows/misc.yml @@ -0,0 +1,49 @@ +name: XGBoost CI (misc) + +on: [push, pull_request] + +permissions: + contents: read # to fetch code (actions/checkout) + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +env: + BRANCH_NAME: >- + ${{ github.event.pull_request.number && 'PR-' }}${{ github.event.pull_request.number || github.ref_name }} + +jobs: + gtest-cpu-nonomp: + name: Test Google C++ unittest (CPU Non-OMP) + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + submodules: 'true' + - name: Install system packages + run: | + sudo apt-get install -y --no-install-recommends ninja-build + - name: Build and install XGBoost + run: bash ops/script/build_via_cmake.sh -DUSE_OPENMP=OFF + - name: Run gtest binary + run: | + cd build + ctest --extra-verbose + + c-api-demo: + name: Test installing XGBoost lib + building the C API demo + runs-on: ubuntu-latest + defaults: + run: + shell: bash -l {0} + steps: + - uses: actions/checkout@v4 + with: + submodules: 'true' + - uses: dmlc/xgboost-devops/miniforge-setup@main + with: + environment-name: cpp_test + environment-file: ops/conda_env/cpp_test.yml + - name: Build and run C API demo with shared + run: bash ops/pipeline/test-c-api-demo.sh diff --git a/.github/workflows/python_tests.yml b/.github/workflows/python_tests.yml index 8f0ab1c68262..dc8de819e2bb 100644 --- a/.github/workflows/python_tests.yml +++ b/.github/workflows/python_tests.yml @@ -1,4 +1,4 @@ -name: XGBoost-Python-Tests +name: 
XGBoost CI (Python tests) on: [push, pull_request] @@ -14,335 +14,51 @@ concurrency: cancel-in-progress: true jobs: - python-mypy-lint: - runs-on: ubuntu-latest - name: Type and format checks for the Python package - strategy: - matrix: - os: [ubuntu-latest] - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - uses: conda-incubator/setup-miniconda@d2e6a045a86077fb6cad6f5adf368e9076ddaa8d # v3.1.0 - with: - miniforge-variant: Miniforge3 - miniforge-version: latest - activate-environment: python_lint - environment-file: tests/ci_build/conda_env/python_lint.yml - use-mamba: true - - name: Display Conda env - run: | - conda info - conda list - - name: Run mypy - run: | - python tests/ci_build/lint_python.py --format=0 --type-check=1 --pylint=0 - - name: Run formatter - run: | - python tests/ci_build/lint_python.py --format=1 --type-check=0 --pylint=0 - - name: Run pylint - run: | - python tests/ci_build/lint_python.py --format=0 --type-check=0 --pylint=1 - - python-sdist-test-on-Linux: - # Mismatched glibcxx version between system and conda forge. - runs-on: ${{ matrix.os }} - name: Test installing XGBoost Python source package on ${{ matrix.os }} - strategy: - matrix: - os: [ubuntu-latest] - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - uses: conda-incubator/setup-miniconda@d2e6a045a86077fb6cad6f5adf368e9076ddaa8d # v3.1.0 - with: - miniforge-variant: Miniforge3 - miniforge-version: latest - activate-environment: sdist_test - environment-file: tests/ci_build/conda_env/sdist_test.yml - use-mamba: true - - name: Display Conda env - run: | - conda info - conda list - - name: Build and install XGBoost - run: | - cd python-package - python --version - python -m build --sdist - pip install -v ./dist/xgboost-*.tar.gz --config-settings use_openmp=False - cd .. - python -c 'import xgboost' - python-sdist-test: - # Use system toolchain instead of conda toolchain for macos and windows. - # MacOS has linker error if clang++ from conda-forge is used runs-on: ${{ matrix.os }} - name: Test installing XGBoost Python source package on ${{ matrix.os }} + name: Test installing Python XGBoost from the source distribution (${{ matrix.os }}) strategy: + fail-fast: false matrix: - os: [macos-13, windows-latest] - python-version: ["3.10"] + os: [macos-13, windows-latest, ubuntu-latest] steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - name: Install osx system dependencies - if: matrix.os == 'macos-13' - run: | - brew install ninja libomp - - uses: conda-incubator/setup-miniconda@d2e6a045a86077fb6cad6f5adf368e9076ddaa8d # v3.1.0 - with: - auto-update-conda: true - python-version: ${{ matrix.python-version }} - activate-environment: test - - name: Install build - run: | - conda install -c conda-forge python-build - - name: Display Conda env - run: | - conda info - conda list - - name: Build and install XGBoost - run: | - cd python-package - python --version - python -m build --sdist - pip install -v ./dist/xgboost-*.tar.gz - cd .. 
- python -c 'import xgboost' + - uses: actions/checkout@v4 + with: + submodules: 'true' + - uses: dmlc/xgboost-devops/miniforge-setup@main + with: + environment-name: sdist_test + environment-file: ops/conda_env/sdist_test.yml + - name: Install extra package for MacOS + run: | + mamba install -c conda-forge llvm-openmp + if: matrix.os == 'macos-13' + - name: Build and install XGBoost + run: bash ops/pipeline/test-python-sdist.sh python-tests-on-macos: - name: Test XGBoost Python package on ${{ matrix.config.os }} - runs-on: ${{ matrix.config.os }} - timeout-minutes: 60 - strategy: - matrix: - config: - - {os: macos-13} - - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - - uses: conda-incubator/setup-miniconda@d2e6a045a86077fb6cad6f5adf368e9076ddaa8d # v3.1.0 - with: - miniforge-variant: Miniforge3 - miniforge-version: latest - activate-environment: macos_cpu_test - environment-file: tests/ci_build/conda_env/macos_cpu_test.yml - use-mamba: true - - - name: Display Conda env - run: | - conda info - conda list - - - name: Build XGBoost on macos - run: | - brew install ninja - - mkdir build - cd build - # Set prefix, to use OpenMP library from Conda env - # See https://github.com/dmlc/xgboost/issues/7039#issuecomment-1025038228 - # to learn why we don't use libomp from Homebrew. - cmake .. -GNinja -DCMAKE_PREFIX_PATH=$CONDA_PREFIX -DBUILD_DEPRECATED_CLI=ON - ninja - - - name: Install Python package - run: | - cd python-package - python --version - pip install -v . - - - name: Test Python package - run: | - pytest -s -v -rxXs --durations=0 ./tests/python - - - name: Test Dask Interface - run: | - pytest -s -v -rxXs --durations=0 ./tests/test_distributed/test_with_dask - - python-tests-on-win: - name: Test XGBoost Python package on ${{ matrix.config.os }} - runs-on: ${{ matrix.config.os }} + name: Test XGBoost Python package on macos-13 + runs-on: macos-13 timeout-minutes: 60 - strategy: - matrix: - config: - - {os: windows-latest, python-version: '3.10'} - - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - - uses: conda-incubator/setup-miniconda@d2e6a045a86077fb6cad6f5adf368e9076ddaa8d # v3.1.0 - with: - auto-update-conda: true - python-version: ${{ matrix.config.python-version }} - activate-environment: win64_env - environment-file: tests/ci_build/conda_env/win64_cpu_test.yml - - - name: Display Conda env - run: | - conda info - conda list - - - name: Build XGBoost on Windows - run: | - mkdir build_msvc - cd build_msvc - cmake .. -G"Visual Studio 17 2022" -DCMAKE_CONFIGURATION_TYPES="Release" -A x64 -DBUILD_DEPRECATED_CLI=ON - cmake --build . --config Release --parallel $(nproc) - - - name: Install Python package - run: | - cd python-package - python --version - pip wheel -v . 
--wheel-dir dist/ - pip install ./dist/*.whl - - - name: Test Python package - run: | - pytest -s -v -rxXs --durations=0 ./tests/python - - python-tests-on-ubuntu: - name: Test XGBoost Python package on ${{ matrix.config.os }} - runs-on: ${{ matrix.config.os }} - timeout-minutes: 90 - strategy: - matrix: - config: - - {os: ubuntu-latest, python-version: "3.10"} - - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - - uses: conda-incubator/setup-miniconda@d2e6a045a86077fb6cad6f5adf368e9076ddaa8d # v3.1.0 - with: - miniforge-variant: Miniforge3 - miniforge-version: latest - activate-environment: linux_cpu_test - environment-file: tests/ci_build/conda_env/linux_cpu_test.yml - use-mamba: true - - - name: Display Conda env - run: | - conda info - conda list - - - name: Build XGBoost on Ubuntu - run: | - mkdir build - cd build - cmake .. -GNinja -DCMAKE_PREFIX_PATH=$CONDA_PREFIX -DBUILD_DEPRECATED_CLI=ON - ninja - - - name: Install Python package - run: | - cd python-package - python --version - pip install -v . - - - name: Test Python package - run: | - pytest -s -v -rxXs --durations=0 ./tests/python - - - name: Test Dask Interface - run: | - pytest -s -v -rxXs --durations=0 ./tests/test_distributed/test_with_dask - - - name: Test PySpark Interface - shell: bash -l {0} - run: | - pytest -s -v -rxXs --durations=0 ./tests/test_distributed/test_with_spark - - python-sycl-tests-on-ubuntu: - name: Test XGBoost Python package with SYCL on ${{ matrix.config.os }} - runs-on: ${{ matrix.config.os }} - timeout-minutes: 90 - strategy: - matrix: - config: - - {os: ubuntu-latest, python-version: "3.10"} - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - - uses: conda-incubator/setup-miniconda@d2e6a045a86077fb6cad6f5adf368e9076ddaa8d # v3.1.0 - with: - miniforge-variant: Miniforge3 - miniforge-version: latest - activate-environment: linux_sycl_test - environment-file: tests/ci_build/conda_env/linux_sycl_test.yml - use-mamba: true - - - name: Display Conda env - run: | - conda info - conda list - - name: Build XGBoost on Ubuntu - run: | - mkdir build - cd build - cmake .. -DPLUGIN_SYCL=ON -DCMAKE_CXX_COMPILER=g++ -DCMAKE_C_COMPILER=gcc -DCMAKE_PREFIX_PATH=$CONDA_PREFIX - make -j$(nproc) - - name: Install Python package - run: | - cd python-package - python --version - pip install -v . - - name: Test Python package - run: | - pytest -s -v -rxXs --durations=0 ./tests/python-sycl/ - + - uses: actions/checkout@v4 + with: + submodules: 'true' + - uses: dmlc/xgboost-devops/miniforge-setup@main + with: + environment-name: macos_cpu_test + environment-file: ops/conda_env/macos_cpu_test.yml + - run: bash ops/pipeline/test-python-macos.sh python-system-installation-on-ubuntu: - name: Test XGBoost Python package System Installation on ${{ matrix.os }} - runs-on: ${{ matrix.os }} - strategy: - matrix: - os: [ubuntu-latest] - + name: Test XGBoost Python package System Installation on Ubuntu + runs-on: ubuntu-latest steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + - uses: actions/checkout@v4 with: submodules: 'true' - - name: Set up Python 3.10 - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + uses: actions/setup-python@v5 with: python-version: "3.10" - - - name: Install ninja - run: | - sudo apt-get update && sudo apt-get install -y ninja-build - - - name: Build XGBoost on Ubuntu - run: | - mkdir build - cd build - cmake .. 
-GNinja - ninja - - - name: Copy lib to system lib - run: | - cp lib/* "$(python -c 'import sys; print(sys.base_prefix)')/lib" - - - name: Install XGBoost in Virtual Environment - run: | - cd python-package - pip install virtualenv - virtualenv venv - source venv/bin/activate && \ - pip install -v . --config-settings use_system_libxgboost=True && \ - python -c 'import xgboost' + - run: bash ops/pipeline/test-python-with-sysprefix.sh diff --git a/.github/workflows/python_wheels.yml b/.github/workflows/python_wheels.yml deleted file mode 100644 index 1bbdedc3f9c6..000000000000 --- a/.github/workflows/python_wheels.yml +++ /dev/null @@ -1,55 +0,0 @@ -name: XGBoost-Python-Wheels - -on: [push, pull_request] - -permissions: - contents: read # to fetch code (actions/checkout) - -defaults: - run: - shell: bash -l {0} - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true - -jobs: - python-wheels: - name: Build wheel for ${{ matrix.platform_id }} - runs-on: ${{ matrix.os }} - strategy: - matrix: - include: - - os: macos-13 - platform_id: macosx_x86_64 - - os: macos-14 - platform_id: macosx_arm64 - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - name: Set up homebrew - uses: Homebrew/actions/setup-homebrew@68fa6aeb1ccb0596d311f2b34ec74ec21ee68e54 - - name: Install libomp - run: brew install libomp - - uses: conda-incubator/setup-miniconda@d2e6a045a86077fb6cad6f5adf368e9076ddaa8d # v3.1.0 - with: - miniforge-variant: Miniforge3 - miniforge-version: latest - python-version: "3.10" - use-mamba: true - - name: Build wheels - run: bash tests/ci_build/build_python_wheels.sh ${{ matrix.platform_id }} ${{ github.sha }} - - name: Extract branch name - run: | - echo "branch=${GITHUB_REF#refs/heads/}" >> "$GITHUB_OUTPUT" - id: extract_branch - if: github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_') - - name: Upload Python wheel - if: github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_') - run: | - python -m pip install awscli - python -m awscli s3 cp wheelhouse/*.whl s3://xgboost-nightly-builds/${{ steps.extract_branch.outputs.branch }}/ --acl public-read --region us-west-2 - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }} diff --git a/.github/workflows/python_wheels_macos.yml b/.github/workflows/python_wheels_macos.yml new file mode 100644 index 000000000000..ab13dfa395cd --- /dev/null +++ b/.github/workflows/python_wheels_macos.yml @@ -0,0 +1,53 @@ +name: Build Python wheels targeting MacOS + +on: [push, pull_request] + +permissions: + contents: read # to fetch code (actions/checkout) + +defaults: + run: + shell: bash -l {0} + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +env: + BRANCH_NAME: >- + ${{ github.event.pull_request.number && 'PR-' }}${{ github.event.pull_request.number || github.ref_name }} + +jobs: + python-wheels-macos: + name: Build wheel for ${{ matrix.platform_id }} + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + include: + - os: macos-13 + platform_id: macosx_x86_64 + - os: macos-14 + platform_id: macosx_arm64 + steps: + - uses: actions/checkout@v4 + with: + submodules: 'true' + - name: Set up homebrew + uses: Homebrew/actions/setup-homebrew@13341b4d5e459a98bbe0b122b12c11bf90518cc8 + - 
name: Install libomp + run: brew install libomp + - uses: dmlc/xgboost-devops/miniforge-setup@main + with: + environment-name: minimal + environment-file: ops/conda_env/minimal.yml + - name: Build wheels + run: bash ops/pipeline/build-python-wheels-macos.sh ${{ matrix.platform_id }} ${{ github.sha }} + - name: Upload Python wheel + if: github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_') + run: | + python -m pip install awscli + python -m awscli s3 cp wheelhouse/*.whl s3://xgboost-nightly-builds/${{ env.BRANCH_NAME }}/ --acl public-read --region us-west-2 + env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }} diff --git a/.github/workflows/r_nold.yml b/.github/workflows/r_nold.yml index 4b506927e06c..da01f39f650b 100644 --- a/.github/workflows/r_nold.yml +++ b/.github/workflows/r_nold.yml @@ -22,23 +22,20 @@ jobs: container: image: rhub/debian-gcc-devel-nold steps: - - name: Install git and system packages - shell: bash - run: | - apt update && apt install libcurl4-openssl-dev libssl-dev libssh2-1-dev libgit2-dev libglpk-dev libxml2-dev libharfbuzz-dev libfribidi-dev git -y - - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - - name: Install dependencies - shell: bash -l {0} - run: | - /tmp/R-devel/bin/Rscript -e "source('./R-package/tests/helper_scripts/install_deps.R')" - - - name: Run R tests - shell: bash - run: | - cd R-package && \ - /tmp/R-devel/bin/R CMD INSTALL . && \ - /tmp/R-devel/bin/R -q -e "library(testthat); setwd('tests'); source('testthat.R')" + - name: Install git and system packages + shell: bash + run: | + apt update && apt install libcurl4-openssl-dev libssl-dev libssh2-1-dev libgit2-dev libglpk-dev libxml2-dev libharfbuzz-dev libfribidi-dev git -y + - uses: actions/checkout@v4 + with: + submodules: 'true' + - name: Install dependencies + shell: bash -l {0} + run: | + /tmp/R-devel/bin/Rscript -e "source('./R-package/tests/helper_scripts/install_deps.R')" + - name: Run R tests + shell: bash + run: | + cd R-package && \ + /tmp/R-devel/bin/R CMD INSTALL . 
&& \ + /tmp/R-devel/bin/R -q -e "library(testthat); setwd('tests'); source('testthat.R')" diff --git a/.github/workflows/r_tests.yml b/.github/workflows/r_tests.yml index c56d1f8ef943..fc0245f5752e 100644 --- a/.github/workflows/r_tests.yml +++ b/.github/workflows/r_tests.yml @@ -13,138 +13,91 @@ concurrency: cancel-in-progress: true jobs: - lintr: - runs-on: ${{ matrix.config.os }} - name: Run R linters on OS ${{ matrix.config.os }}, R ${{ matrix.config.r }}, Compiler ${{ matrix.config.compiler }}, Build ${{ matrix.config.build }} - strategy: - matrix: - config: - - {os: ubuntu-latest, r: 'release'} - env: - R_REMOTES_NO_ERRORS_FROM_WARNINGS: true - RSPM: ${{ matrix.config.rspm }} - - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - - uses: r-lib/actions/setup-r@929c772977a3a13c8733b363bf5a2f685c25dd91 # v2.9.0 - with: - r-version: ${{ matrix.config.r }} - - - name: Cache R packages - uses: actions/cache@6849a6489940f00c2f30c0fb92c6274307ccb58a # v4.1.2 - with: - path: ${{ env.R_LIBS_USER }} - key: ${{ runner.os }}-r-${{ matrix.config.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }} - restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }} - - - name: Install dependencies - shell: Rscript {0} - run: | - source("./R-package/tests/helper_scripts/install_deps.R") - - - name: Run lintr - run: | - MAKEFLAGS="-j$(nproc)" R CMD INSTALL R-package/ - Rscript tests/ci_build/lint_r.R $(pwd) - test-Rpkg: - runs-on: ${{ matrix.config.os }} - name: Test R on OS ${{ matrix.config.os }}, R ${{ matrix.config.r }}, Compiler ${{ matrix.config.compiler }}, Build ${{ matrix.config.build }} + runs-on: ${{ matrix.os }} + name: Test R on OS ${{ matrix.os }}, R ${{ matrix.r }}, Compiler ${{ matrix.compiler }}, Build ${{ matrix.build }} strategy: fail-fast: false matrix: - config: - - {os: windows-latest, r: 'release', compiler: 'mingw', build: 'autotools'} - - {os: ubuntu-latest, r: 'release', compiler: 'none', build: 'cmake'} + include: + - os: windows-latest + r: release + compiler: mingw + build: autotools + - os: ubuntu-latest + r: release + compiler: none + build: cmake env: R_REMOTES_NO_ERRORS_FROM_WARNINGS: true - RSPM: ${{ matrix.config.rspm }} - steps: - - name: Install system dependencies - run: | - sudo apt update - sudo apt install libcurl4-openssl-dev libssl-dev libssh2-1-dev libgit2-dev libglpk-dev libxml2-dev libharfbuzz-dev libfribidi-dev - if: matrix.config.os == 'ubuntu-latest' - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - - uses: r-lib/actions/setup-r@929c772977a3a13c8733b363bf5a2f685c25dd91 # v2.9.0 - with: - r-version: ${{ matrix.config.r }} - - - name: Cache R packages - uses: actions/cache@6849a6489940f00c2f30c0fb92c6274307ccb58a # v4.1.2 - with: - path: ${{ env.R_LIBS_USER }} - key: ${{ runner.os }}-r-${{ matrix.config.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }} - restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }} - - - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 - with: - python-version: "3.10" - architecture: 'x64' - - - uses: r-lib/actions/setup-tinytex@v2 - - - name: Install dependencies - shell: Rscript {0} - run: | - source("./R-package/tests/helper_scripts/install_deps.R") - - - name: Test R - run: | - python tests/ci_build/test_r_package.py --compiler='${{ matrix.config.compiler }}' --build-tool="${{ matrix.config.build }}" 
--task=check - if: matrix.config.compiler != 'none' - - - name: Test R - run: | - python tests/ci_build/test_r_package.py --build-tool="${{ matrix.config.build }}" --task=check - if: matrix.config.compiler == 'none' + - name: Install system dependencies + run: | + sudo apt update + sudo apt install libcurl4-openssl-dev libssl-dev libssh2-1-dev libgit2-dev libglpk-dev libxml2-dev libharfbuzz-dev libfribidi-dev + if: matrix.os == 'ubuntu-latest' + - uses: actions/checkout@v4 + with: + submodules: 'true' + - uses: r-lib/actions/setup-r@v2 + with: + r-version: ${{ matrix.r }} + - name: Cache R packages + uses: actions/cache@v4 + with: + path: ${{ env.R_LIBS_USER }} + key: ${{ runner.os }}-r-${{ matrix.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }} + restore-keys: ${{ runner.os }}-r-${{ matrix.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }} + - uses: actions/setup-python@v5 + with: + python-version: "3.10" + architecture: 'x64' + - uses: r-lib/actions/setup-tinytex@v2 + - name: Install dependencies + shell: Rscript {0} + run: | + source("./R-package/tests/helper_scripts/install_deps.R") + - name: Test R + run: | + python ops/script/test_r_package.py --compiler='${{ matrix.compiler }}' --build-tool="${{ matrix.build }}" --task=check + if: matrix.compiler != 'none' + - name: Test R + run: | + python ops/script/test_r_package.py --build-tool="${{ matrix.build }}" --task=check + if: matrix.compiler == 'none' test-R-on-Debian: name: Test R package on Debian runs-on: ubuntu-latest container: image: rhub/debian-gcc-release - steps: - - name: Install system dependencies - run: | - # Must run before checkout to have the latest git installed. - # No need to add pandoc, the container has it figured out. - apt update && apt install libcurl4-openssl-dev libssl-dev libssh2-1-dev libgit2-dev libglpk-dev libxml2-dev libharfbuzz-dev libfribidi-dev git -y - - - name: Trust git cloning project sources - run: | - git config --global --add safe.directory "${GITHUB_WORKSPACE}" - - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - - name: Install dependencies - shell: bash -l {0} - run: | - Rscript -e "source('./R-package/tests/helper_scripts/install_deps.R')" - - - name: Test R - shell: bash -l {0} - run: | - python3 tests/ci_build/test_r_package.py --r=/usr/bin/R --build-tool=autotools --task=check - - - uses: dorny/paths-filter@v3 - id: changes - with: - filters: | - r_package: - - 'R-package/**' - - - name: Run document check - if: steps.changes.outputs.r_package == 'true' - run: | - python3 tests/ci_build/test_r_package.py --r=/usr/bin/R --task=doc + - name: Install system dependencies + run: | + # Must run before checkout to have the latest git installed. + # No need to add pandoc, the container has it figured out. 
+ apt update && apt install libcurl4-openssl-dev libssl-dev libssh2-1-dev libgit2-dev libglpk-dev libxml2-dev libharfbuzz-dev libfribidi-dev git -y + - name: Trust git cloning project sources + run: | + git config --global --add safe.directory "${GITHUB_WORKSPACE}" + - uses: actions/checkout@v4 + with: + submodules: 'true' + - name: Install dependencies + shell: bash -l {0} + run: | + Rscript -e "source('./R-package/tests/helper_scripts/install_deps.R')" + - name: Test R + shell: bash -l {0} + run: | + python3 ops/script/test_r_package.py --r=/usr/bin/R --build-tool=autotools --task=check + - uses: dorny/paths-filter@v3 + id: changes + with: + filters: | + r_package: + - 'R-package/**' + - name: Run document check + if: steps.changes.outputs.r_package == 'true' + run: | + python3 ops/script/test_r_package.py --r=/usr/bin/R --task=doc diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml index 85a9abb57e1b..f3837391b4fe 100644 --- a/.github/workflows/scorecards.yml +++ b/.github/workflows/scorecards.yml @@ -22,7 +22,7 @@ jobs: steps: - name: "Checkout code" - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + uses: actions/checkout@v4 with: persist-credentials: false diff --git a/.github/workflows/sycl_tests.yml b/.github/workflows/sycl_tests.yml new file mode 100644 index 000000000000..22456b1b68e5 --- /dev/null +++ b/.github/workflows/sycl_tests.yml @@ -0,0 +1,48 @@ +name: XGBoost CI (oneAPI) + +on: [push, pull_request] + +permissions: + contents: read # to fetch code (actions/checkout) + +defaults: + run: + shell: bash -l {0} + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +env: + BRANCH_NAME: >- + ${{ github.event.pull_request.number && 'PR-' }}${{ github.event.pull_request.number || github.ref_name }} + +jobs: + gtest-cpu-sycl: + name: Test Google C++ unittest (CPU SYCL) + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + submodules: 'true' + - uses: dmlc/xgboost-devops/miniforge-setup@main + with: + environment-name: linux_sycl_test + environment-file: ops/conda_env/linux_sycl_test.yml + - name: Run gtest + run: bash ops/pipeline/build-test-sycl.sh gtest + + python-sycl-tests-on-ubuntu: + name: Test XGBoost Python package with SYCL + runs-on: ubuntu-latest + timeout-minutes: 90 + steps: + - uses: actions/checkout@v4 + with: + submodules: 'true' + - uses: dmlc/xgboost-devops/miniforge-setup@main + with: + environment-name: linux_sycl_test + environment-file: ops/conda_env/linux_sycl_test.yml + - name: Test Python package + run: bash ops/pipeline/build-test-sycl.sh pytest diff --git a/.github/workflows/update_rapids.yml b/.github/workflows/update_rapids.yml index 5e229db4c050..4a3e4747c3ff 100644 --- a/.github/workflows/update_rapids.yml +++ b/.github/workflows/update_rapids.yml @@ -25,20 +25,20 @@ jobs: name: Check latest RAPIDS runs-on: ubuntu-latest steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - name: Check latest RAPIDS and update conftest.sh - run: | - bash tests/buildkite/update-rapids.sh - - name: Create Pull Request - uses: peter-evans/create-pull-request@v7 - if: github.ref == 'refs/heads/master' - with: - add-paths: | - tests/buildkite - branch: create-pull-request/update-rapids - base: master - title: "[CI] Update RAPIDS to latest stable" - commit-message: "[CI] Update RAPIDS to latest stable" + - uses: actions/checkout@v4 + with: + submodules: 'true' + 
- name: Check latest RAPIDS and update conftest.sh + run: | + bash ops/script/update_rapids.sh + - name: Create Pull Request + uses: peter-evans/create-pull-request@v7 + if: github.ref == 'refs/heads/master' + with: + add-paths: | + ops/docker + branch: create-pull-request/update-rapids + base: master + title: "[CI] Update RAPIDS to latest stable" + commit-message: "[CI] Update RAPIDS to latest stable" diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml new file mode 100644 index 000000000000..f97daf761abf --- /dev/null +++ b/.github/workflows/windows.yml @@ -0,0 +1,53 @@ +name: XGBoost CI (Windows) + +on: [push, pull_request] + +permissions: + contents: read # to fetch code (actions/checkout) + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +defaults: + run: + shell: powershell + +env: + BRANCH_NAME: >- + ${{ github.event.pull_request.number && 'PR-' }}${{ github.event.pull_request.number || github.ref_name }} + +jobs: + build-win64-gpu: + name: Build XGBoost for Windows with CUDA + runs-on: + - runs-on=${{ github.run_id }} + - runner=windows-cpu + - tag=windows-build-win64-gpu + steps: + - uses: actions/checkout@v4 + with: + submodules: "true" + - run: powershell ops/pipeline/build-win64-gpu.ps1 + - name: Stash files + run: | + powershell ops/pipeline/stash-artifacts.ps1 stash build-win64-gpu ` + build/testxgboost.exe xgboost.exe ` + (Get-ChildItem python-package/dist/*.whl | Select-Object -Expand FullName) + + test-win64-gpu: + name: Test XGBoost on Windows + needs: build-win64-gpu + runs-on: + - runs-on=${{ github.run_id }} + - runner=windows-gpu + - tag=windows-test-win64-gpu + steps: + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Unstash files + run: | + powershell ops/pipeline/stash-artifacts.ps1 unstash build-win64-gpu ` + build/testxgboost.exe xgboost.exe python-package/dist/*.whl + - run: powershell ops/pipeline/test-win64-gpu.ps1 diff --git a/.gitignore b/.gitignore index 082e85e2c67f..c29dcc43d9d3 100644 --- a/.gitignore +++ b/.gitignore @@ -52,6 +52,7 @@ Debug *.bak #.Rbuildignore R-package.Rproj +R-package/build/* *.cache* .mypy_cache/ doxygen @@ -144,11 +145,13 @@ credentials.csv .bloop # python tests +*.bin demo/**/*.txt *.dmatrix .hypothesis __MACOSX/ model*.json +/tests/python/models/models/ # R tests *.htm diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE index 035f4ae45f47..8bd8caabc20f 100644 --- a/R-package/NAMESPACE +++ b/R-package/NAMESPACE @@ -10,6 +10,7 @@ S3method(getinfo,xgb.Booster) S3method(getinfo,xgb.DMatrix) S3method(length,xgb.Booster) S3method(predict,xgb.Booster) +S3method(predict,xgboost) S3method(print,xgb.Booster) S3method(print,xgb.DMatrix) S3method(print,xgb.cv.synchronous) diff --git a/R-package/R/utils.R b/R-package/R/utils.R index 78249a53f18d..008a88dcd715 100644 --- a/R-package/R/utils.R +++ b/R-package/R/utils.R @@ -423,7 +423,7 @@ NULL #' #' @description #' When it comes to serializing XGBoost models, it's possible to use R serializers such as -#' [save()] or [saveRDS()] to serialize an XGBoost R model, but XGBoost also provides +#' [save()] or [saveRDS()] to serialize an XGBoost model object, but XGBoost also provides #' its own serializers with better compatibility guarantees, which allow loading #' said models in other language bindings of XGBoost. #' @@ -451,14 +451,15 @@ NULL #' not used for prediction / importance / plotting / etc. #' These R attributes are only preserved when using R's serializers. 
#' -#' In addition to the regular `xgb.Booster` objects producted by [xgb.train()], the -#' function [xgboost()] produces a different subclass `xgboost`, which keeps other -#' additional metadata as R attributes such as class names in classification problems, -#' and which has a dedicated `predict` method that uses different defaults. XGBoost's +#' In addition to the regular `xgb.Booster` objects produced by [xgb.train()], the +#' function [xgboost()] produces objects with a different subclass `xgboost` (which +#' inherits from `xgb.Booster`), which keeps other additional metadata as R attributes +#' such as class names in classification problems, and which has a dedicated `predict` +#' method that uses different defaults and takes different argument names. XGBoost's #' own serializers can work with this `xgboost` class, but as they do not keep R #' attributes, the resulting object, when deserialized, is downcasted to the regular #' `xgb.Booster` class (i.e. it loses the metadata, and the resulting object will use -#' `predict.xgb.Booster` instead of `predict.xgboost`) - for these `xgboost` objects, +#' [predict.xgb.Booster()] instead of [predict.xgboost()]) - for these `xgboost` objects, #' `saveRDS` might thus be a better option if the extra functionalities are needed. #' #' Note that XGBoost models in R starting from version `2.1.0` and onwards, and @@ -466,8 +467,8 @@ NULL #' are incompatible with each other. Hence, models that were saved with R serializers #' like [saveRDS()] or [save()] before version `2.1.0` will not work with latter #' `xgboost` versions and vice versa. Be aware that the structure of R model objects -#' could in theory change again in the future, so XGBoost's serializers -#' should be preferred for long-term storage. +#' could in theory change again in the future, so XGBoost's serializers should be +#' preferred for long-term storage. #' #' Furthermore, note that using the package `qs` for serialization will require #' version 0.26 or higher of said package, and will have the same compatibility diff --git a/R-package/R/xgb.Booster.R b/R-package/R/xgb.Booster.R index 808289b63de3..b38cd42bcef3 100644 --- a/R-package/R/xgb.Booster.R +++ b/R-package/R/xgb.Booster.R @@ -126,6 +126,8 @@ xgb.get.handle <- function(object) { #' of the iterations (rounds) otherwise. #' #' If passing "all", will use all of the rounds regardless of whether the model had early stopping or not. +#' +#' Not applicable to `gblinear` booster. #' @param strict_shape Whether to always return an array with the same dimensions for the given prediction mode #' regardless of the model type - meaning that, for example, both a multi-class and a binary classification #' model would generate output arrays with the same number of dimensions, with the 'class' dimension having @@ -144,7 +146,13 @@ xgb.get.handle <- function(object) { #' #' If passing `TRUE`, then the result will have dimensions in reverse order - for example, rows #' will be the last dimensions instead of the first dimension. -#' @param base_margin Base margin used for boosting from existing model. +#' @param base_margin Base margin used for boosting from existing model (raw score that gets added to +#' all observations independently of the trees in the model). 
+#' +#' If supplied, should be either a vector with length equal to the number of rows in `newdata` +#' (for objectives which produce a single score per observation), or a matrix with the number of +#' rows matching the number of rows in `newdata` and the number of columns matching the number +#' of scores estimated by the model (e.g. number of classes for multi-class classification). #' #' Note that, if `newdata` is an `xgb.DMatrix` object, this argument will #' be ignored as it needs to be added to the DMatrix instead (e.g. by passing it as @@ -206,6 +214,9 @@ xgb.get.handle <- function(object) { #' For multi-class / multi-target, they will be arranged so that columns in the output will have #' the leafs from one group followed by leafs of the other group (e.g. order will be `group1:feat1`, #' `group1:feat2`, ..., `group2:feat1`, `group2:feat2`, ...). +#' +#' If there is more than one parallel tree (e.g. random forests), the parallel trees will be the +#' last grouping in the resulting order, which will still be 2D. #' \item For `predcontrib`: when not multi-class / multi-target, a matrix with dimensions #' `[nrows, nfeats+1]`. The last "+ 1" column corresponds to the baseline value. #' @@ -222,7 +233,7 @@ xgb.get.handle <- function(object) { #' For multi-class and multi-target, will be a 4D array with dimensions `[nrows, ngroups, nfeats+1, nfeats+1]` #' } #' -#' If passing `strict_shape=FALSE`, the result is always an array: +#' If passing `strict_shape=TRUE`, the result is always a matrix (if 2D) or array (if 3D or higher): #' - For normal predictions, the dimension is `[nrows, ngroups]`. #' - For `predcontrib=TRUE`, the dimension is `[nrows, ngroups, nfeats+1]`. #' - For `predinteraction=TRUE`, the dimension is `[nrows, ngroups, nfeats+1, nfeats+1]`. diff --git a/R-package/R/xgb.DMatrix.R b/R-package/R/xgb.DMatrix.R index 429cf3f0422c..280fcf52ee3e 100644 --- a/R-package/R/xgb.DMatrix.R +++ b/R-package/R/xgb.DMatrix.R @@ -9,12 +9,13 @@ #' method (`tree_method = "hist"`, which is the default algorithm), but is not usable for the #' sorted-indices method (`tree_method = "exact"`), nor for the approximate method #' (`tree_method = "approx"`). +#' #' @param data Data from which to create a DMatrix, which can then be used for fitting models or #' for getting predictions out of a fitted model. #' -#' Supported input types are as follows:\itemize{ -#' \item `matrix` objects, with types `numeric`, `integer`, or `logical`. -#' \item `data.frame` objects, with columns of types `numeric`, `integer`, `logical`, or `factor`. +#' Supported input types are as follows: +#' - `matrix` objects, with types `numeric`, `integer`, or `logical`. +#' - `data.frame` objects, with columns of types `numeric`, `integer`, `logical`, or `factor` #' #' Note that xgboost uses base-0 encoding for categorical types, hence `factor` types (which use base-1 #' encoding') will be converted inside the function call. Be aware that the encoding used for `factor` @@ -23,33 +24,14 @@ #' was constructed. #' #' Other column types are not supported. -#' \item CSR matrices, as class `dgRMatrix` from package `Matrix`. -#' \item CSC matrices, as class `dgCMatrix` from package `Matrix`. These are **not** supported for -#' 'xgb.QuantileDMatrix'. -#' \item Single-row CSR matrices, as class `dsparseVector` from package `Matrix`, which is interpreted -#' as a single row (only when making predictions from a fitted model).
-#' \item Text files in a supported format, passed as a `character` variable containing the URI path to -#' the file, with an optional format specifier. -#' -#' These are **not** supported for `xgb.QuantileDMatrix`. Supported formats are:\itemize{ -#' \item XGBoost's own binary format for DMatrices, as produced by [xgb.DMatrix.save()]. -#' \item SVMLight (a.k.a. LibSVM) format for CSR matrices. This format can be signaled by suffix -#' `?format=libsvm` at the end of the file path. It will be the default format if not -#' otherwise specified. -#' \item CSV files (comma-separated values). This format can be specified by adding suffix -#' `?format=csv` at the end ofthe file path. It will **not** be auto-deduced from file extensions. -#' } +#' - CSR matrices, as class `dgRMatrix` from package `Matrix`. +#' - CSC matrices, as class `dgCMatrix` from package `Matrix`. #' -#' Be aware that the format of the file will not be auto-deduced - for example, if a file is named 'file.csv', -#' it will not look at the extension or file contents to determine that it is a comma-separated value. -#' Instead, the format must be specified following the URI format, so the input to `data` should be passed -#' like this: `"file.csv?format=csv"` (or `"file.csv?format=csv&label_column=0"` if the first column -#' corresponds to the labels). +#' These are **not** supported by `xgb.QuantileDMatrix`. +#' - XGBoost's own binary format for DMatrices, as produced by [xgb.DMatrix.save()]. +#' - Single-row CSR matrices, as class `dsparseVector` from package `Matrix`, which is interpreted +#' as a single row (only when making predictions from a fitted model). #' -#' For more information about passing text files as input, see the articles -#' \href{https://xgboost.readthedocs.io/en/stable/tutorials/input_format.html}{Text Input Format of DMatrix} and -#' \href{https://xgboost.readthedocs.io/en/stable/python/python_intro.html#python-data-interface}{Data Interface}. -#' } #' @param label Label of the training data. For classification problems, should be passed encoded as #' integers with numeration starting at zero. #' @param weight Weight for each instance. @@ -95,15 +77,9 @@ #' @param label_lower_bound Lower bound for survival training. #' @param label_upper_bound Upper bound for survival training. #' @param feature_weights Set feature weights for column sampling. -#' @param data_split_mode When passing a URI (as R `character`) as input, this signals -#' whether to split by row or column. Allowed values are `"row"` and `"col"`. -#' -#' In distributed mode, the file is split accordingly; otherwise this is only an indicator on -#' how the file was split beforehand. Default to row. -#' -#' This is not used when `data` is not a URI. -#' @return An 'xgb.DMatrix' object. If calling 'xgb.QuantileDMatrix', it will have additional -#' subclass 'xgb.QuantileDMatrix'. +#' @param data_split_mode Not used yet. This parameter is for distributed training, which is not yet available for the R package. +#' @return An 'xgb.DMatrix' object. If calling `xgb.QuantileDMatrix`, it will have additional +#' subclass `xgb.QuantileDMatrix`. #' #' @details #' Note that DMatrix objects are not serializable through R functions such as [saveRDS()] or [save()]. 
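As an aside for readers of the rewritten `data` documentation above, here is a minimal R sketch of the two most common construction paths it now lists (a dense matrix, and a data.frame with a categorical column stored as `factor`). It is illustrative only and not part of the changeset; the column names and random data are made up.

library(xgboost)

set.seed(1)
n <- 100
y <- rnorm(n)

# Dense numeric matrix input
x_mat <- matrix(rnorm(n * 3), ncol = 3, dimnames = list(NULL, c("f1", "f2", "f3")))
dm_dense <- xgb.DMatrix(x_mat, label = y, nthread = 1)

# data.frame input; categorical columns must be passed as 'factor'
x_df <- data.frame(
  f1 = rnorm(n),
  f2 = sample(1:3, n, replace = TRUE),
  f3 = factor(sample(c("a", "b", "c"), n, replace = TRUE))
)
dm_df <- xgb.DMatrix(x_df, label = y, nthread = 1)

print(dm_dense)
print(dm_df)

Sparse `dgRMatrix` / `dgCMatrix` inputs from the `Matrix` package are passed the same way, per the list above.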
@@ -145,6 +121,9 @@ xgb.DMatrix <- function( if (!is.null(group) && !is.null(qid)) { stop("Either one of 'group' or 'qid' should be NULL") } + if (data_split_mode != "row") { + stop("'data_split_mode' is not supported yet.") + } nthread <- as.integer(NVL(nthread, -1L)) if (typeof(data) == "character") { if (length(data) > 1) { diff --git a/R-package/R/xgb.create.features.R b/R-package/R/xgb.create.features.R index f9d892caa1e5..2c4015c5f2de 100644 --- a/R-package/R/xgb.create.features.R +++ b/R-package/R/xgb.create.features.R @@ -86,7 +86,7 @@ #' @export xgb.create.features <- function(model, data, ...) { check.deprecation(...) - pred_with_leaf <- predict(model, data, predleaf = TRUE) + pred_with_leaf <- predict.xgb.Booster(model, data, predleaf = TRUE) cols <- lapply(as.data.frame(pred_with_leaf), factor) cbind(data, sparse.model.matrix(~ . -1, cols)) # nolint } diff --git a/R-package/R/xgb.plot.shap.R b/R-package/R/xgb.plot.shap.R index 443020e1ac7e..4184c6f5ea6a 100644 --- a/R-package/R/xgb.plot.shap.R +++ b/R-package/R/xgb.plot.shap.R @@ -16,7 +16,7 @@ #' @param target_class Only relevant for multiclass models. The default (`NULL`) #' averages the SHAP values over all classes. Pass a (0-based) class index #' to show only SHAP values of that class. -#' @param approxcontrib Passed to `predict()` when `shap_contrib = NULL`. +#' @param approxcontrib Passed to [predict.xgb.Booster()] when `shap_contrib = NULL`. #' @param subsample Fraction of data points randomly picked for plotting. #' The default (`NULL`) will use up to 100k data points. #' @param n_col Number of columns in a grid of plots. @@ -353,7 +353,7 @@ xgb.shap.data <- function(data, shap_contrib = NULL, features = NULL, top_n = 1, } if (is.null(shap_contrib)) { - shap_contrib <- predict( + shap_contrib <- predict.xgb.Booster( model, newdata = data, predcontrib = TRUE, diff --git a/R-package/R/xgboost.R b/R-package/R/xgboost.R index 48a81fab34d8..c22752a3f506 100644 --- a/R-package/R/xgboost.R +++ b/R-package/R/xgboost.R @@ -949,6 +949,243 @@ xgboost <- function( return(model) } +#' @title Compute predictions from XGBoost model on new data +#' @description Predict values on data based on XGBoost model. +#' @param object An XGBoost model object of class `xgboost`, as produced by function [xgboost()]. +#' +#' Note that there is also a lower-level [predict.xgb.Booster()] method for models of class +#' `xgb.Booster` as produced by [xgb.train()], which can also be used for `xgboost` class models as +#' an alternative that performs fewer validations and post-processings. +#' @param newdata Data on which to compute predictions from the model passed in `object`. Supported +#' input classes are: +#' - Data Frames (class `data.frame` from base R and subclasses like `data.table`). +#' - Matrices (class `matrix` from base R). +#' - Sparse matrices from package `Matrix`, either as class `dgRMatrix` (CSR) or `dgCMatrix` (CSC). +#' - Sparse vectors from package `Matrix`, which will be interpreted as containing a single +#' observation. +#' +#' In the case of data frames, if there are any categorical features, they should be of class +#' `factor` and should have the same levels as the `factor` columns of the data from which the model +#' was constructed. +#' +#' If there are named columns and the model was fitted to data with named columns, they will be +#' matched by name by default (see `validate_features`). +#' @param type Type of prediction to make. 
Supported options are: +#' - `"response"`: will output model predictions on the scale of the response variable (e.g. +#' probabilities of belonging to the last class in the case of binary classification). Result will +#' be either a numeric vector with length matching the number of rows in `newdata`, or a numeric matrix with +#' shape `[nrows(newdata), nscores]` (for objectives that produce more than one score per +#' observation such as multi-class classification or multi-quantile regression). +#' - `"raw"`: will output the unprocessed boosting scores (e.g. log-odds in the case of objective +#' `binary:logistic`). Same output shape and type as for `"response"`. +#' - `"class"`: will output the class with the highest predicted probability, returned as a `factor` +#' (only applicable to classification objectives) with length matching the number of rows in `newdata`. +#' - `"leaf"`: will output the terminal node indices of each observation across each tree, as an +#' integer matrix of shape `[nrows(newdata), ntrees]`, or as an integer array with an extra one or +#' two dimensions, up to `[nrows(newdata), ntrees, nscores, n_parallel_trees]` for models that +#' produce more than one score per tree and/or which have more than one parallel tree (e.g. +#' random forests). +#' +#' Only applicable to tree-based boosters (not `gblinear`). +#' - `"contrib"`: will produce per-feature contribution estimates towards the model score for a +#' given observation, based on SHAP values. The contribution values are on the scale of +#' untransformed margin (e.g., for binary classification, the values are log-odds deviations from +#' the baseline). +#' +#' Output will be a numeric matrix with shape `[nrows, nfeatures+1]`, with the intercept being the +#' last feature, or a numeric array with shape `[nrows, nscores, nfeatures+1]` if the model +#' produces more than one score per observation. +#' - `"interaction"`: similar to `"contrib"`, but computing SHAP values of contributions of +#' interaction of each pair of features. Note that this operation might be rather expensive in +#' terms of compute and memory. +#' +#' Since it quadratically depends on the number of features, it is recommended to perform +#' selection of the most important features first. +#' +#' Output will be a numeric array of shape `[nrows, nfeatures+1, nfeatures+1]`, or shape +#' `[nrows, nscores, nfeatures+1, nfeatures+1]` (for objectives that produce more than one score +#' per observation). +#' @param base_margin Base margin used for boosting from existing model (raw score that gets added to +#' all observations independently of the trees in the model). +#' +#' If supplied, should be either a vector with length equal to the number of rows in `newdata` +#' (for objectives which produce a single score per observation), or a matrix with the number of +#' rows matching the number of rows in `newdata` and the number of columns matching the number +#' of scores estimated by the model (e.g. number of classes for multi-class classification). +#' @param iteration_range Sequence of rounds/iterations from the model to use for prediction, specified by passing +#' a two-dimensional vector with the start and end numbers in the sequence (same format as R's `seq` - i.e. +#' base-1 indexing, and inclusive of both ends). +#' +#' For example, passing `c(1,20)` will predict using the first twenty iterations, while passing `c(1,1)` will +#' predict using only the first one.
+#' +#' If passing `NULL`, will either stop at the best iteration if the model used early stopping, or use all +#' of the iterations (rounds) otherwise. +#' +#' If passing "all", will use all of the rounds regardless of whether the model had early stopping or not. +#' +#' Not applicable to `gblinear` booster. +#' @param validate_features Validate that the feature names in the data match to the feature names +#' in the column, and reorder them in the data otherwise. +#' +#' If passing `FALSE`, it is assumed that the feature names and types are the same, +#' and come in the same order as in the training data. +#' +#' Be aware that this only applies to column names and not to factor levels in categorical columns. +#' +#' Note that this check might add some sizable latency to the predictions, so it's +#' recommended to disable it for performance-sensitive applications. +#' @param ... Not used. +#' @return Either a numeric vector (for 1D outputs), numeric matrix (for 2D outputs), numeric array +#' (for 3D and higher), or `factor` (for class predictions). See documentation for parameter `type` +#' for details about what the output type and shape will be. +#' @method predict xgboost +#' @export +#' @examples +#' data("ToothGrowth") +#' y <- ToothGrowth$supp +#' x <- ToothGrowth[, -2L] +#' model <- xgboost(x, y, nthreads = 1L, nrounds = 3L, max_depth = 2L) +#' pred_prob <- predict(model, x[1:5, ], type = "response") +#' pred_raw <- predict(model, x[1:5, ], type = "raw") +#' pred_class <- predict(model, x[1:5, ], type = "class") +#' +#' # Relationships between these +#' manual_probs <- 1 / (1 + exp(-pred_raw)) +#' manual_class <- ifelse(manual_probs < 0.5, levels(y)[1], levels(y)[2]) +#' +#' # They should match up to numerical precision +#' round(pred_prob, 6) == round(manual_probs, 6) +#' pred_class == manual_class +predict.xgboost <- function( + object, + newdata, + type = "response", + base_margin = NULL, + iteration_range = NULL, + validate_features = TRUE, + ... +) { + if (inherits(newdata, "xgb.DMatrix")) { + stop( + "Predictions on 'xgb.DMatrix' objects are not supported with 'xgboost' class.", + " Try 'xgb.train' or 'predict.xgb.Booster'." 
+ ) + } + + outputmargin <- FALSE + predleaf <- FALSE + predcontrib <- FALSE + predinteraction <- FALSE + pred_class <- FALSE + strict_shape <- FALSE + allowed_types <- c( + "response", + "raw", + "class", + "leaf", + "contrib", + "interaction" + ) + type <- head(type, 1L) + if (!is.character(type) || !(type %in% allowed_types)) { + stop("'type' must be one of: ", paste(allowed_types, collapse = ", ")) + } + + if (type != "response") { + switch( + type, + "raw" = { + outputmargin <- TRUE + }, "class" = { + if (is.null(attributes(object)$metadata$y_levels)) { + stop("Prediction type 'class' is only for classification objectives.") + } + pred_class <- TRUE + outputmargin <- TRUE + }, "leaf" = { + predleaf <- TRUE + strict_shape <- TRUE # required for 3D and 4D outputs + }, "contrib" = { + predcontrib <- TRUE + }, "interaction" = { + predinteraction <- TRUE + } + ) + } + out <- predict.xgb.Booster( + object, + newdata, + outputmargin = outputmargin, + predleaf = predleaf, + predcontrib = predcontrib, + predinteraction = predinteraction, + iterationrange = iteration_range, + strict_shape = strict_shape, + validate_features = validate_features, + base_margin = base_margin + ) + + if (strict_shape) { + # Should only end up here for leaf predictions + out_dims <- dim(out) + dims_remove <- integer() + if (out_dims[3L] == 1L) { + dims_remove <- c(dims_remove, -3L) + } + if (length(out_dims) >= 4L && out_dims[4L] == 1L) { + dims_remove <- c(dims_remove, -4L) + } + if (length(dims_remove)) { + new_dimnames <- dimnames(out)[dims_remove] + dim(out) <- out_dims[dims_remove] + dimnames(out) <- new_dimnames + } + } + + if (pred_class) { + + if (is.null(dim(out))) { + out <- as.integer(out >= 0) + 1L + } else { + out <- max.col(out, ties.method = "first") + } + attr_out <- attributes(out) + attr_out$class <- "factor" + attr_out$levels <- attributes(object)$metadata$y_levels + attributes(out) <- attr_out + + } else if (NCOL(out) > 1L || (strict_shape && length(dim(out)) >= 3L)) { + + names_use <- NULL + if (NROW(attributes(object)$metadata$y_levels) > 2L) { + names_use <- attributes(object)$metadata$y_levels + } else if (NROW(attributes(object)$metadata$y_names)) { + names_use <- attributes(object)$metadata$y_names + } else if (NROW(attributes(object)$params$quantile_alpha) > 1L) { + names_use <- paste0("q", attributes(object)$params$quantile_alpha) + if (anyDuplicated(names_use)) { + warning("Cannot add quantile names to output due to clashes in their character conversions") + names_use <- NULL + } + } + if (NROW(names_use)) { + dimnames_out <- dimnames(out) + dim_with_names <- if (type == "leaf") 3L else 2L + dimnames_out[[dim_with_names]] <- names_use + .Call(XGSetArrayDimNamesInplace_R, out, dimnames_out) + } + + } + + return(out) +} + +#' @title Print info from XGBoost model +#' @description Prints basic properties of an XGBoost model object. +#' @param x An XGBoost model object of class `xgboost`, as produced by function [xgboost()]. +#' @param ... Not used. +#' @return Same object `x`, after printing its info. #' @method print xgboost #' @export print.xgboost <- function(x, ...) 
{ diff --git a/R-package/man/a-compatibility-note-for-saveRDS-save.Rd b/R-package/man/a-compatibility-note-for-saveRDS-save.Rd index af90ddded197..4ce043799436 100644 --- a/R-package/man/a-compatibility-note-for-saveRDS-save.Rd +++ b/R-package/man/a-compatibility-note-for-saveRDS-save.Rd @@ -5,7 +5,7 @@ \title{Model Serialization and Compatibility} \description{ When it comes to serializing XGBoost models, it's possible to use R serializers such as -\code{\link[=save]{save()}} or \code{\link[=saveRDS]{saveRDS()}} to serialize an XGBoost R model, but XGBoost also provides +\code{\link[=save]{save()}} or \code{\link[=saveRDS]{saveRDS()}} to serialize an XGBoost model object, but XGBoost also provides its own serializers with better compatibility guarantees, which allow loading said models in other language bindings of XGBoost. @@ -35,14 +35,15 @@ the model was fit, or saving the R call that produced the model, but are otherwi not used for prediction / importance / plotting / etc. These R attributes are only preserved when using R's serializers. -In addition to the regular \code{xgb.Booster} objects producted by \code{\link[=xgb.train]{xgb.train()}}, the -function \code{\link[=xgboost]{xgboost()}} produces a different subclass \code{xgboost}, which keeps other -additional metadata as R attributes such as class names in classification problems, -and which has a dedicated \code{predict} method that uses different defaults. XGBoost's +In addition to the regular \code{xgb.Booster} objects produced by \code{\link[=xgb.train]{xgb.train()}}, the +function \code{\link[=xgboost]{xgboost()}} produces objects with a different subclass \code{xgboost} (which +inherits from \code{xgb.Booster}), which keeps other additional metadata as R attributes +such as class names in classification problems, and which has a dedicated \code{predict} +method that uses different defaults and takes different argument names. XGBoost's own serializers can work with this \code{xgboost} class, but as they do not keep R attributes, the resulting object, when deserialized, is downcasted to the regular \code{xgb.Booster} class (i.e. it loses the metadata, and the resulting object will use -\code{predict.xgb.Booster} instead of \code{predict.xgboost}) - for these \code{xgboost} objects, +\code{\link[=predict.xgb.Booster]{predict.xgb.Booster()}} instead of \code{\link[=predict.xgboost]{predict.xgboost()}}) - for these \code{xgboost} objects, \code{saveRDS} might thus be a better option if the extra functionalities are needed. Note that XGBoost models in R starting from version \verb{2.1.0} and onwards, and @@ -50,8 +51,8 @@ XGBoost models before version \verb{2.1.0}; have a very different R object struc are incompatible with each other. Hence, models that were saved with R serializers like \code{\link[=saveRDS]{saveRDS()}} or \code{\link[=save]{save()}} before version \verb{2.1.0} will not work with latter \code{xgboost} versions and vice versa. Be aware that the structure of R model objects -could in theory change again in the future, so XGBoost's serializers -should be preferred for long-term storage. +could in theory change again in the future, so XGBoost's serializers should be +preferred for long-term storage. 
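To make the trade-off described above concrete, here is a small illustrative sketch (not part of the changeset; it reuses the ToothGrowth example already used in this PR's tests) contrasting the two serialization routes for an `xgboost`-class model:

library(xgboost)
data("ToothGrowth")
y <- ToothGrowth$supp
x <- ToothGrowth[, -2L]
model <- xgboost(x, y, nthreads = 1L, nrounds = 3L, max_depth = 2L)

# R serializer: keeps the 'xgboost' subclass and its R attributes (class levels, etc.)
rds_path <- file.path(tempdir(), "model.rds")
saveRDS(model, rds_path)
class(readRDS(rds_path))    # "xgboost" "xgb.Booster"

# XGBoost's own serializer: portable across language bindings, but R attributes are
# dropped and the reloaded object is downcast to the plain 'xgb.Booster' class.
json_path <- file.path(tempdir(), "model.json")
xgb.save(model, json_path)
class(xgb.load(json_path))  # "xgb.Booster"

The downcast on xgb.load() is exactly the behaviour documented above; saveRDS() is the route to prefer when the class-level metadata and the predict.xgboost() defaults are needed.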
Furthermore, note that using the package \code{qs} for serialization will require version 0.26 or higher of said package, and will have the same compatibility diff --git a/R-package/man/predict.xgb.Booster.Rd b/R-package/man/predict.xgb.Booster.Rd index d97984e7fa48..5cdfed97f504 100644 --- a/R-package/man/predict.xgb.Booster.Rd +++ b/R-package/man/predict.xgb.Booster.Rd @@ -80,7 +80,9 @@ predict using only the first one. If passing \code{NULL}, will either stop at the best iteration if the model used early stopping, or use all of the iterations (rounds) otherwise. -If passing "all", will use all of the rounds regardless of whether the model had early stopping or not.} +If passing "all", will use all of the rounds regardless of whether the model had early stopping or not. + +Not applicable to \code{gblinear} booster.} \item{strict_shape}{Whether to always return an array with the same dimensions for the given prediction mode regardless of the model type - meaning that, for example, both a multi-class and a binary classification @@ -118,7 +120,13 @@ and come in the same order as in the training data. Note that this check might add some sizable latency to the predictions, so it's recommended to disable it for performance-sensitive applications.} -\item{base_margin}{Base margin used for boosting from existing model. +\item{base_margin}{Base margin used for boosting from existing model (raw score that gets added to +all observations independently of the trees in the model). + +If supplied, should be either a vector with length equal to the number of rows in \code{newdata} +(for objectives which produces a single score per observation), or a matrix with number of +rows matching to the number rows in \code{newdata} and number of columns matching to the number +of scores estimated by the model (e.g. number of classes for multi-class classification). Note that, if \code{newdata} is an \code{xgb.DMatrix} object, this argument will be ignored as it needs to be added to the DMatrix instead (e.g. by passing it as @@ -141,6 +149,9 @@ Note that objective variant \code{multi:softmax} defaults towards predicting mos For multi-class / multi-target, they will be arranged so that columns in the output will have the leafs from one group followed by leafs of the other group (e.g. order will be \code{group1:feat1}, \code{group1:feat2}, ..., \code{group2:feat1}, \code{group2:feat2}, ...). + +If there is more than one parallel tree (e.g. random forests), the parallel trees will be the +last grouping in the resulting order, which will still be 2D. \item For \code{predcontrib}: when not multi-class / multi-target, a matrix with dimensions \verb{[nrows, nfeats+1]}. The last "+ 1" column corresponds to the baseline value. @@ -157,7 +168,7 @@ dimension should produce practically the same result as \code{predcontrib = TRUE For multi-class and multi-target, will be a 4D array with dimensions \verb{[nrows, ngroups, nfeats+1, nfeats+1]} } -If passing \code{strict_shape=FALSE}, the result is always an array: +If passing \code{strict_shape=TRUE}, the result is always a matrix (if 2D) or array (if 3D or higher): \itemize{ \item For normal predictions, the dimension is \verb{[nrows, ngroups]}. \item For \code{predcontrib=TRUE}, the dimension is \verb{[nrows, ngroups, nfeats+1]}. 
diff --git a/R-package/man/predict.xgboost.Rd b/R-package/man/predict.xgboost.Rd new file mode 100644 index 000000000000..15e75965aaa6 --- /dev/null +++ b/R-package/man/predict.xgboost.Rd @@ -0,0 +1,138 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/xgboost.R +\name{predict.xgboost} +\alias{predict.xgboost} +\title{Compute predictions from XGBoost model on new data} +\usage{ +\method{predict}{xgboost}( + object, + newdata, + type = "response", + base_margin = NULL, + iteration_range = NULL, + validate_features = TRUE, + ... +) +} +\arguments{ +\item{object}{An XGBoost model object of class \code{xgboost}, as produced by function \code{\link[=xgboost]{xgboost()}}. + +Note that there is also a lower-level \code{\link[=predict.xgb.Booster]{predict.xgb.Booster()}} method for models of class +\code{xgb.Booster} as produced by \code{\link[=xgb.train]{xgb.train()}}, which can also be used for \code{xgboost} class models as +an alternative that performs fewer validations and post-processings.} + +\item{newdata}{Data on which to compute predictions from the model passed in \code{object}. Supported +input classes are: +\itemize{ +\item Data Frames (class \code{data.frame} from base R and subclasses like \code{data.table}). +\item Matrices (class \code{matrix} from base R). +\item Sparse matrices from package \code{Matrix}, either as class \code{dgRMatrix} (CSR) or \code{dgCMatrix} (CSC). +\item Sparse vectors from package \code{Matrix}, which will be interpreted as containing a single +observation. +} + +In the case of data frames, if there are any categorical features, they should be of class +\code{factor} and should have the same levels as the \code{factor} columns of the data from which the model +was constructed. + +If there are named columns and the model was fitted to data with named columns, they will be +matched by name by default (see \code{validate_features}).} + +\item{type}{Type of prediction to make. Supported options are: +\itemize{ +\item \code{"response"}: will output model predictions on the scale of the response variable (e.g. +probabilities of belonging to the last class in the case of binary classification). Result will +be either a numeric vector with length matching to rows in \code{newdata}, or a numeric matrix with +shape \verb{[nrows(newdata), nscores]} (for objectives that produce more than one score per +observation such as multi-class classification or multi-quantile regression). +\item \code{"raw"}: will output the unprocessed boosting scores (e.g. log-odds in the case of objective +\code{binary:logistic}). Same output shape and type as for \code{"response"}. +\item \code{"class"}: will output the class with the highest predicted probability, returned as a \code{factor} +(only applicable to classification objectives) with length matching to rows in \code{newdata}. +\item \code{"leaf"}: will output the terminal node indices of each observation across each tree, as an +integer matrix of shape \verb{[nrows(newdata), ntrees]}, or as an integer array with an extra one or +two dimensions, up to \verb{[nrows(newdata), ntrees, nscores, n_parallel_trees]} for models that +produce more than one score per tree and/or which have more than one parallel tree (e.g. +random forests). + +Only applicable to tree-based boosters (not \code{gblinear}). +\item \code{"contrib"}: will produce per-feature contribution estimates towards the model score for a +given observation, based on SHAP values. 
The contribution values are on the scale of +untransformed margin (e.g., for binary classification, the values are log-odds deviations from +the baseline). + +Output will be a numeric matrix with shape \verb{[nrows, nfeatures+1]}, with the intercept being the +last feature, or a numeric array with shape \verb{[nrows, nscores, nfeatures+1]} if the model +produces more than one score per observation. +\item \code{"interaction"}: similar to \code{"contrib"}, but computing SHAP values of contributions of +interaction of each pair of features. Note that this operation might be rather expensive in +terms of compute and memory. + +Since it quadratically depends on the number of features, it is recommended to perform +selection of the most important features first. + +Output will be a numeric array of shape \verb{[nrows, nfeatures+1, nfeatures+1]}, or shape +\verb{[nrows, nscores, nfeatures+1, nfeatures+1]} (for objectives that produce more than one score +per observation). +}} + +\item{base_margin}{Base margin used for boosting from existing model (raw score that gets added to +all observations independently of the trees in the model). + +If supplied, should be either a vector with length equal to the number of rows in \code{newdata} +(for objectives which produces a single score per observation), or a matrix with number of +rows matching to the number rows in \code{newdata} and number of columns matching to the number +of scores estimated by the model (e.g. number of classes for multi-class classification).} + +\item{iteration_range}{Sequence of rounds/iterations from the model to use for prediction, specified by passing +a two-dimensional vector with the start and end numbers in the sequence (same format as R's \code{seq} - i.e. +base-1 indexing, and inclusive of both ends). + +For example, passing \code{c(1,20)} will predict using the first twenty iterations, while passing \code{c(1,1)} will +predict using only the first one. + +If passing \code{NULL}, will either stop at the best iteration if the model used early stopping, or use all +of the iterations (rounds) otherwise. + +If passing "all", will use all of the rounds regardless of whether the model had early stopping or not. + +Not applicable to \code{gblinear} booster.} + +\item{validate_features}{Validate that the feature names in the data match to the feature names +in the column, and reorder them in the data otherwise. + +If passing \code{FALSE}, it is assumed that the feature names and types are the same, +and come in the same order as in the training data. + +Be aware that this only applies to column names and not to factor levels in categorical columns. + +Note that this check might add some sizable latency to the predictions, so it's +recommended to disable it for performance-sensitive applications.} + +\item{...}{Not used.} +} +\value{ +Either a numeric vector (for 1D outputs), numeric matrix (for 2D outputs), numeric array +(for 3D and higher), or \code{factor} (for class predictions). See documentation for parameter \code{type} +for details about what the output type and shape will be. +} +\description{ +Predict values on data based on XGBoost model. 
+} +\examples{ +data("ToothGrowth") +y <- ToothGrowth$supp +x <- ToothGrowth[, -2L] +model <- xgboost(x, y, nthreads = 1L, nrounds = 3L, max_depth = 2L) +pred_prob <- predict(model, x[1:5, ], type = "response") +pred_raw <- predict(model, x[1:5, ], type = "raw") +pred_class <- predict(model, x[1:5, ], type = "class") + +# Relationships between these +manual_probs <- 1 / (1 + exp(-pred_raw)) +manual_class <- ifelse(manual_probs < 0.5, levels(y)[1], levels(y)[2]) + +# They should match up to numerical precision +round(pred_prob, 6) == round(manual_probs, 6) +pred_class == manual_class +} diff --git a/R-package/man/print.xgboost.Rd b/R-package/man/print.xgboost.Rd new file mode 100644 index 000000000000..235f3e36bdd0 --- /dev/null +++ b/R-package/man/print.xgboost.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/xgboost.R +\name{print.xgboost} +\alias{print.xgboost} +\title{Print info from XGBoost model} +\usage{ +\method{print}{xgboost}(x, ...) +} +\arguments{ +\item{x}{An XGBoost model object of class \code{xgboost}, as produced by function \code{\link[=xgboost]{xgboost()}}.} + +\item{...}{Not used.} +} +\value{ +Same object \code{x}, after printing its info. +} +\description{ +Prints basic properties of an XGBoost model object. +} diff --git a/R-package/man/xgb.DMatrix.Rd b/R-package/man/xgb.DMatrix.Rd index 2cfa2e713038..23a24dec4226 100644 --- a/R-package/man/xgb.DMatrix.Rd +++ b/R-package/man/xgb.DMatrix.Rd @@ -45,9 +45,11 @@ xgb.QuantileDMatrix( \item{data}{Data from which to create a DMatrix, which can then be used for fitting models or for getting predictions out of a fitted model. -Supported input types are as follows:\itemize{ +Supported input types are as follows: +\itemize{ \item \code{matrix} objects, with types \code{numeric}, \code{integer}, or \code{logical}. -\item \code{data.frame} objects, with columns of types \code{numeric}, \code{integer}, \code{logical}, or \code{factor}. +\item \code{data.frame} objects, with columns of types \code{numeric}, \code{integer}, \code{logical}, or \code{factor} +} Note that xgboost uses base-0 encoding for categorical types, hence \code{factor} types (which use base-1 encoding') will be converted inside the function call. Be aware that the encoding used for \code{factor} @@ -56,32 +58,16 @@ responsibility to ensure that factor columns have the same levels as the ones fr was constructed. Other column types are not supported. +\itemize{ \item CSR matrices, as class \code{dgRMatrix} from package \code{Matrix}. -\item CSC matrices, as class \code{dgCMatrix} from package \code{Matrix}. These are \strong{not} supported for -'xgb.QuantileDMatrix'. -\item Single-row CSR matrices, as class \code{dsparseVector} from package \code{Matrix}, which is interpreted -as a single row (only when making predictions from a fitted model). -\item Text files in a supported format, passed as a \code{character} variable containing the URI path to -the file, with an optional format specifier. - -These are \strong{not} supported for \code{xgb.QuantileDMatrix}. Supported formats are:\itemize{ -\item XGBoost's own binary format for DMatrices, as produced by \code{\link[=xgb.DMatrix.save]{xgb.DMatrix.save()}}. -\item SVMLight (a.k.a. LibSVM) format for CSR matrices. This format can be signaled by suffix -\code{?format=libsvm} at the end of the file path. It will be the default format if not -otherwise specified. -\item CSV files (comma-separated values). 
This format can be specified by adding suffix -\code{?format=csv} at the end ofthe file path. It will \strong{not} be auto-deduced from file extensions. +\item CSC matrices, as class \code{dgCMatrix} from package \code{Matrix}. } -Be aware that the format of the file will not be auto-deduced - for example, if a file is named 'file.csv', -it will not look at the extension or file contents to determine that it is a comma-separated value. -Instead, the format must be specified following the URI format, so the input to \code{data} should be passed -like this: \code{"file.csv?format=csv"} (or \code{"file.csv?format=csv&label_column=0"} if the first column -corresponds to the labels). - -For more information about passing text files as input, see the articles -\href{https://xgboost.readthedocs.io/en/stable/tutorials/input_format.html}{Text Input Format of DMatrix} and -\href{https://xgboost.readthedocs.io/en/stable/python/python_intro.html#python-data-interface}{Data Interface}. +These are \strong{not} supported by \code{xgb.QuantileDMatrix}. +\itemize{ +\item XGBoost's own binary format for DMatrices, as produced by \code{\link[=xgb.DMatrix.save]{xgb.DMatrix.save()}}. +\item Single-row CSR matrices, as class \code{dsparseVector} from package \code{Matrix}, which is interpreted +as a single row (only when making predictions from a fitted model). }} \item{label}{Label of the training data. For classification problems, should be passed encoded as @@ -144,13 +130,7 @@ not be saved, so make sure that \code{factor} columns passed to \code{predict} h \item{feature_weights}{Set feature weights for column sampling.} -\item{data_split_mode}{When passing a URI (as R \code{character}) as input, this signals -whether to split by row or column. Allowed values are \code{"row"} and \code{"col"}. - -In distributed mode, the file is split accordingly; otherwise this is only an indicator on -how the file was split beforehand. Default to row. - -This is not used when \code{data} is not a URI.} +\item{data_split_mode}{Not used yet. This parameter is for distributed training, which is not yet available for the R package.} \item{ref}{The training dataset that provides quantile information, needed when creating validation/test dataset with \code{\link[=xgb.QuantileDMatrix]{xgb.QuantileDMatrix()}}. Supplying the training DMatrix @@ -163,8 +143,8 @@ applied to the validation/test data} This is only supported when constructing a QuantileDMatrix.} } \value{ -An 'xgb.DMatrix' object. If calling 'xgb.QuantileDMatrix', it will have additional -subclass 'xgb.QuantileDMatrix'. +An 'xgb.DMatrix' object. If calling \code{xgb.QuantileDMatrix}, it will have additional +subclass \code{xgb.QuantileDMatrix}. } \description{ Construct an 'xgb.DMatrix' object from a given data source, which can then be passed to functions diff --git a/R-package/man/xgb.plot.shap.Rd b/R-package/man/xgb.plot.shap.Rd index f4f51059d653..969a7d103c62 100644 --- a/R-package/man/xgb.plot.shap.Rd +++ b/R-package/man/xgb.plot.shap.Rd @@ -54,7 +54,7 @@ Only used when \code{features = NULL}.} averages the SHAP values over all classes. Pass a (0-based) class index to show only SHAP values of that class.} -\item{approxcontrib}{Passed to \code{predict()} when \code{shap_contrib = NULL}.} +\item{approxcontrib}{Passed to \code{\link[=predict.xgb.Booster]{predict.xgb.Booster()}} when \code{shap_contrib = NULL}.} \item{subsample}{Fraction of data points randomly picked for plotting. 
The default (\code{NULL}) will use up to 100k data points.} diff --git a/R-package/man/xgb.plot.shap.summary.Rd b/R-package/man/xgb.plot.shap.summary.Rd index f6df2daca758..b72c560b3769 100644 --- a/R-package/man/xgb.plot.shap.summary.Rd +++ b/R-package/man/xgb.plot.shap.summary.Rd @@ -51,7 +51,7 @@ Only used when \code{features = NULL}.} averages the SHAP values over all classes. Pass a (0-based) class index to show only SHAP values of that class.} -\item{approxcontrib}{Passed to \code{predict()} when \code{shap_contrib = NULL}.} +\item{approxcontrib}{Passed to \code{\link[=predict.xgb.Booster]{predict.xgb.Booster()}} when \code{shap_contrib = NULL}.} \item{subsample}{Fraction of data points randomly picked for plotting. The default (\code{NULL}) will use up to 100k data points.} diff --git a/R-package/tests/testthat/test_xgboost.R b/R-package/tests/testthat/test_xgboost.R index a4ac658a11b8..8f0c1e7ba9a7 100644 --- a/R-package/tests/testthat/test_xgboost.R +++ b/R-package/tests/testthat/test_xgboost.R @@ -1,5 +1,8 @@ library(survival) library(data.table) +data("iris") +data("mtcars") +data("ToothGrowth") test_that("Auto determine objective", { y_num <- seq(1, 10) @@ -621,3 +624,324 @@ test_that("Whole function works", { expect_true(any(grepl("Number of iterations: 5", txt, fixed = TRUE))) expect_true(any(grepl("Number of features: 8", txt, fixed = TRUE))) }) + +test_that("Can predict probabilities and raw scores", { + y <- ToothGrowth$supp + x <- ToothGrowth[, -2L] + model <- xgboost(x, y, nthreads = 1L, nrounds = 3L, max_depth = 2L) + pred_prob <- predict(model, x, type = "response") + pred_raw <- predict(model, x, type = "raw") + expect_true(is.vector(pred_prob)) + expect_equal(length(pred_prob), nrow(x)) + expect_true(min(pred_prob) >= 0) + expect_true(max(pred_prob) <= 1) + + expect_equal(length(pred_raw), nrow(x)) + expect_true(is.vector(pred_raw)) + expect_true(min(pred_raw) < 0) + expect_true(max(pred_raw) > 0) + + expect_equal( + pred_prob, + 1 / (1 + exp(-pred_raw)), + tolerance = 1e-6 + ) +}) + +test_that("Can predict class", { + y <- iris$Species + x <- iris[, -5L] + model <- xgboost(x, y, nthreads = 1L, nrounds = 3L, max_depth = 2L) + pred_class <- predict(model, x, type = "class") + expect_true(is.factor(pred_class)) + expect_equal(levels(pred_class), levels(y)) + + y <- ToothGrowth$supp + x <- ToothGrowth[, -2L] + model <- xgboost(x, y, nthreads = 1L, nrounds = 3L, max_depth = 2L) + pred_class <- predict(model, x, type = "class") + expect_true(is.factor(pred_class)) + expect_equal(levels(pred_class), levels(y)) + + probs <- predict(model, x, type = "response") + expect_true(all(pred_class[probs >= 0.5] == levels(y)[[2L]])) + expect_true(all(pred_class[probs < 0.5] == levels(y)[[1L]])) + + # Check that it fails for regression models + y <- mtcars$mpg + x <- mtcars[, -1L] + model <- xgboost(x, y, nthreads = 1L, nrounds = 3L, max_depth = 2L) + expect_error({ + predict(model, x, type = "class") + }) +}) + +test_that("Metadata survives serialization", { + y <- iris$Species + x <- iris[, -5L] + model_fresh <- xgboost(x, y, nthreads = 1L, nrounds = 3L, max_depth = 2L) + temp_file <- file.path(tempdir(), "xgb_model.Rds") + saveRDS(model_fresh, temp_file) + model <- readRDS(temp_file) + pred_class <- predict(model, x, type = "class") + expect_true(is.factor(pred_class)) + expect_equal(levels(pred_class), levels(y)) +}) + +test_that("Column names aren't added when not appropriate", { + pred_types <- c( + "response", + "raw", + "leaf" + ) + for (pred_type in pred_types) { + y <- 
mtcars$mpg + x <- mtcars[, -1L] + model <- xgboost( + x, + y, + nthreads = 1L, + nrounds = 3L, + max_depth = 2L, + objective = "reg:quantileerror", + quantile_alpha = 0.5 + ) + pred <- predict(model, x, type = pred_type) + if (pred_type %in% c("raw", "response")) { + expect_true(is.vector(pred)) + } else { + expect_true(length(dim(pred)) >= 2L) + expect_null(colnames(pred)) + } + + y <- ToothGrowth$supp + x <- ToothGrowth[, -2L] + model <- xgboost(x, y, nthreads = 1L, nrounds = 3L, max_depth = 2L) + pred <- predict(model, x, type = pred_type) + if (pred_type %in% c("raw", "response")) { + expect_true(is.vector(pred)) + } else { + expect_true(length(dim(pred)) >= 2L) + expect_null(colnames(pred)) + } + } +}) + +test_that("Column names from multiclass are added to non-class predictions", { + y <- iris$Species + x <- iris[, -5L] + model <- xgboost(x, y, nthreads = 1L, nrounds = 3L, max_depth = 2L) + + pred_types_with_colnames <- c( + "response", + "raw", + "contrib", + "interaction" + ) + + for (pred_type in pred_types_with_colnames) { + pred <- predict(model, x, type = pred_type) + expect_equal(nrow(pred), nrow(x)) + expect_equal(ncol(pred), 3L) + expect_equal(colnames(pred), levels(y)) + } +}) + +test_that("Column names from multitarget are added to predictions", { + y <- data.frame( + ylog = log(mtcars$mpg), + ysqrt = sqrt(mtcars$mpg) + ) + x <- mtcars[, -1L] + model <- xgboost(x, y, nthreads = 1L, nrounds = 3L, max_depth = 2L) + + pred_types_with_colnames <- c( + "response", + "raw", + "contrib", + "interaction" + ) + + for (pred_type in pred_types_with_colnames) { + pred <- predict(model, x, type = pred_type) + expect_equal(nrow(pred), nrow(x)) + expect_equal(ncol(pred), 2L) + expect_equal(colnames(pred), colnames(y)) + } +}) + +test_that("Column names from multiquantile are added to predictions", { + y <- mtcars$mpg + x <- mtcars[, -1L] + model <- xgboost( + x, + y, + nthreads = 1L, + nrounds = 3L, + max_depth = 2L, + objective = "reg:quantileerror", + quantile_alpha = c(0.25, 0.5, 0.75) + ) + + pred_types_with_colnames <- c( + "response", + "raw", + "contrib", + "interaction" + ) + + for (pred_type in pred_types_with_colnames) { + pred <- predict(model, x, type = pred_type) + expect_equal(nrow(pred), nrow(x)) + expect_equal(ncol(pred), 3L) + expect_equal(colnames(pred), c("q0.25", "q0.5", "q0.75")) + } +}) + +test_that("Leaf predictions have multiple dimensions when needed", { + # single score, multiple trees + y <- mtcars$mpg + x <- mtcars[, -1L] + model <- xgboost( + x, + y, + nthreads = 1L, + nrounds = 4L, + max_depth = 2L, + objective = "reg:quantileerror", + quantile_alpha = 0.5 + ) + pred <- predict(model, x, type = "leaf") + expect_equal(dim(pred), c(nrow(x), 4L)) + expect_equal(row.names(pred), row.names(x)) + expect_null(colnames(pred)) + + # single score, single tree + model <- xgboost( + x, + y, + nthreads = 1L, + nrounds = 1L, + max_depth = 2L, + objective = "reg:quantileerror", + quantile_alpha = 0.5 + ) + pred <- predict(model, x, type = "leaf") + expect_equal(dim(pred), c(nrow(x), 1L)) + expect_equal(row.names(pred), row.names(x)) + expect_null(colnames(pred)) + + # multiple score, multiple trees + model <- xgboost( + x, + y, + nthreads = 1L, + nrounds = 4L, + max_depth = 2L, + objective = "reg:quantileerror", + quantile_alpha = c(0.25, 0.5, 0.75) + ) + pred <- predict(model, x, type = "leaf") + expect_equal(dim(pred), c(nrow(x), 4L, 3L)) + expect_equal(row.names(pred), row.names(x)) + expect_null(colnames(pred)) + expect_equal(dimnames(pred)[[3L]], c("q0.25", "q0.5", 
"q0.75")) + + # multiple score, single tree + model <- xgboost( + x, + y, + nthreads = 1L, + nrounds = 1L, + max_depth = 2L, + objective = "reg:quantileerror", + quantile_alpha = c(0.25, 0.5, 0.75) + ) + pred <- predict(model, x, type = "leaf") + expect_equal(dim(pred), c(nrow(x), 1L, 3L)) + expect_equal(row.names(pred), row.names(x)) + expect_null(colnames(pred)) + expect_equal(dimnames(pred)[[3L]], c("q0.25", "q0.5", "q0.75")) + + # parallel trees, single tree, single score + model <- xgboost( + x, + y, + nthreads = 1L, + nrounds = 1L, + max_depth = 2L, + objective = "count:poisson", + num_parallel_tree = 2L + ) + pred <- predict(model, x, type = "leaf") + expect_equal(dim(pred), c(nrow(x), 1L, 2L)) + expect_equal(row.names(pred), row.names(x)) + expect_null(colnames(pred)) + expect_null(dimnames(pred)[[3L]]) + + # num_parallel_tree>1 + multiple scores is not supported at the moment so no test for it. +}) + +test_that("Column names from multiclass are added to leaf predictions", { + y <- iris$Species + x <- iris[, -5L] + model <- xgboost(x, y, nthreads = 1L, nrounds = 4L, max_depth = 2L) + pred <- predict(model, x, type = "leaf") + expect_equal(dim(pred), c(nrow(x), 4L, 3L)) + expect_equal(dimnames(pred)[[3L]], levels(y)) + + # Check also for a single tree + model <- xgboost(x, y, nthreads = 1L, nrounds = 1L, max_depth = 2L) + pred <- predict(model, x, type = "leaf") + expect_equal(dim(pred), c(nrow(x), 1L, 3L)) + expect_equal(dimnames(pred)[[3L]], levels(y)) +}) + +test_that("Column names from multitarget are added to leaf predictions", { + y <- data.frame( + ylog = log(mtcars$mpg), + ysqrt = sqrt(mtcars$mpg) + ) + x <- mtcars[, -1L] + model <- xgboost(x, y, nthreads = 1L, nrounds = 4L, max_depth = 2L) + pred <- predict(model, x, type = "leaf") + expect_equal(dim(pred), c(nrow(x), 4L, 2L)) + expect_equal(dimnames(pred)[[3L]], colnames(y)) + + # Check also for a single tree + model <- xgboost(x, y, nthreads = 1L, nrounds = 1L, max_depth = 2L) + pred <- predict(model, x, type = "leaf") + expect_equal(dim(pred), c(nrow(x), 1L, 2L)) + expect_equal(dimnames(pred)[[3L]], colnames(y)) +}) + +test_that("Column names from multiquantile are added to leaf predictions", { + y <- mtcars$mpg + x <- mtcars[, -1L] + model <- xgboost( + x, + y, + nthreads = 1L, + nrounds = 4L, + max_depth = 2L, + objective = "reg:quantileerror", + quantile_alpha = c(0.25, 0.5, 0.75) + ) + pred <- predict(model, x, type = "leaf") + expect_equal(dim(pred), c(nrow(x), 4L, 3L)) + expect_equal(dimnames(pred)[[3L]], c("q0.25", "q0.5", "q0.75")) + + # Check also for a single tree + model <- xgboost( + x, + y, + nthreads = 1L, + nrounds = 1L, + max_depth = 2L, + objective = "reg:quantileerror", + quantile_alpha = c(0.25, 0.5, 0.75) + ) + pred <- predict(model, x, type = "leaf") + expect_equal(dim(pred), c(nrow(x), 1L, 3L)) + expect_equal(dimnames(pred)[[3L]], c("q0.25", "q0.5", "q0.75")) +}) diff --git a/demo/dask/dask_learning_to_rank.py b/demo/dask/dask_learning_to_rank.py new file mode 100644 index 000000000000..c08450fec56e --- /dev/null +++ b/demo/dask/dask_learning_to_rank.py @@ -0,0 +1,201 @@ +""" +Learning to rank with the Dask Interface +======================================== + + .. versionadded:: 3.0.0 + +This is a demonstration of using XGBoost for learning to rank tasks using the +MSLR_10k_letor dataset. For more infomation about the dataset, please visit its +`description page `_. + +See :ref:`ltr-dist` for a general description for distributed learning to rank and +:ref:`ltr-dask` for Dask-specific features. 
+ +""" + +from __future__ import annotations + +import argparse +import os +from contextlib import contextmanager +from typing import Generator + +import dask +import numpy as np +from dask import dataframe as dd +from distributed import Client, LocalCluster, wait +from sklearn.datasets import load_svmlight_file + +from xgboost import dask as dxgb + + +def load_mslr_10k( + device: str, data_path: str, cache_path: str +) -> tuple[dd.DataFrame, dd.DataFrame, dd.DataFrame]: + """Load the MSLR10k dataset from data_path and save parquet files in the cache_path.""" + root_path = os.path.expanduser(args.data) + cache_path = os.path.expanduser(args.cache) + + # Use only the Fold1 for demo: + # Train, Valid, Test + # {S1,S2,S3}, S4, S5 + fold = 1 + + if not os.path.exists(cache_path): + os.mkdir(cache_path) + fold_path = os.path.join(root_path, f"Fold{fold}") + train_path = os.path.join(fold_path, "train.txt") + valid_path = os.path.join(fold_path, "vali.txt") + test_path = os.path.join(fold_path, "test.txt") + + X_train, y_train, qid_train = load_svmlight_file( + train_path, query_id=True, dtype=np.float32 + ) + columns = [f"f{i}" for i in range(X_train.shape[1])] + X_train = dd.from_array(X_train.toarray(), columns=columns) + y_train = y_train.astype(np.int32) + qid_train = qid_train.astype(np.int32) + + X_train["y"] = dd.from_array(y_train) + X_train["qid"] = dd.from_array(qid_train) + X_train.to_parquet(os.path.join(cache_path, "train"), engine="pyarrow") + + X_valid, y_valid, qid_valid = load_svmlight_file( + valid_path, query_id=True, dtype=np.float32 + ) + X_valid = dd.from_array(X_valid.toarray(), columns=columns) + y_valid = y_valid.astype(np.int32) + qid_valid = qid_valid.astype(np.int32) + + X_valid["y"] = dd.from_array(y_valid) + X_valid["qid"] = dd.from_array(qid_valid) + X_valid.to_parquet(os.path.join(cache_path, "valid"), engine="pyarrow") + + X_test, y_test, qid_test = load_svmlight_file( + test_path, query_id=True, dtype=np.float32 + ) + + X_test = dd.from_array(X_test.toarray(), columns=columns) + y_test = y_test.astype(np.int32) + qid_test = qid_test.astype(np.int32) + + X_test["y"] = dd.from_array(y_test) + X_test["qid"] = dd.from_array(qid_test) + X_test.to_parquet(os.path.join(cache_path, "test"), engine="pyarrow") + + df_train = dd.read_parquet( + os.path.join(cache_path, "train"), calculate_divisions=True + ) + df_valid = dd.read_parquet( + os.path.join(cache_path, "valid"), calculate_divisions=True + ) + df_test = dd.read_parquet( + os.path.join(cache_path, "test"), calculate_divisions=True + ) + + return df_train, df_valid, df_test + + +def ranking_demo(client: Client, args: argparse.Namespace) -> None: + """Learning to rank with data sorted locally.""" + df_tr, df_va, _ = load_mslr_10k(args.device, args.data, args.cache) + + X_train: dd.DataFrame = df_tr[df_tr.columns.difference(["y", "qid"])] + y_train = df_tr[["y", "qid"]] + Xy_train = dxgb.DaskQuantileDMatrix(client, X_train, y_train.y, qid=y_train.qid) + + X_valid: dd.DataFrame = df_va[df_va.columns.difference(["y", "qid"])] + y_valid = df_va[["y", "qid"]] + Xy_valid = dxgb.DaskQuantileDMatrix( + client, X_valid, y_valid.y, qid=y_valid.qid, ref=Xy_train + ) + # Upon training, you will see a performance warning about sorting data based on + # query groups. 
+ dxgb.train( + client, + {"objective": "rank:ndcg", "device": args.device}, + Xy_train, + evals=[(Xy_train, "Train"), (Xy_valid, "Valid")], + num_boost_round=100, + ) + + +def ranking_wo_split_demo(client: Client, args: argparse.Namespace) -> None: + """Learning to rank with data partitioned according to query groups.""" + df_tr, df_va, df_te = load_mslr_10k(args.device, args.data, args.cache) + + X_tr = df_tr[df_tr.columns.difference(["y", "qid"])] + X_va = df_va[df_va.columns.difference(["y", "qid"])] + + # `allow_group_split=False` makes sure data is partitioned according to the query + # groups. + ltr = dxgb.DaskXGBRanker(allow_group_split=False, device=args.device) + ltr.client = client + ltr = ltr.fit( + X_tr, + df_tr.y, + qid=df_tr.qid, + eval_set=[(X_tr, df_tr.y), (X_va, df_va.y)], + eval_qid=[df_tr.qid, df_va.qid], + verbose=True, + ) + + df_te = df_te.persist() + wait([df_te]) + + X_te = df_te[df_te.columns.difference(["y", "qid"])] + predt = ltr.predict(X_te) + y = client.compute(df_te.y) + wait([predt, y]) + + +@contextmanager +def gen_client(device: str) -> Generator[Client, None, None]: + match device: + case "cuda": + from dask_cuda import LocalCUDACluster + + with LocalCUDACluster() as cluster: + with Client(cluster) as client: + with dask.config.set( + { + "array.backend": "cupy", + "dataframe.backend": "cudf", + } + ): + yield client + case "cpu": + with LocalCluster() as cluster: + with Client(cluster) as client: + yield client + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Demonstration of learning to rank using XGBoost." + ) + parser.add_argument( + "--data", + type=str, + help="Root directory of the MSLR-WEB10K data.", + required=True, + ) + parser.add_argument( + "--cache", + type=str, + help="Directory for caching processed data.", + required=True, + ) + parser.add_argument("--device", choices=["cpu", "cuda"], default="cpu") + parser.add_argument( + "--no-split", + action="store_true", + help="Flag to indicate query groups should not be split.", + ) + args = parser.parse_args() + + with gen_client(args.device) as client: + if args.no_split: + ranking_wo_split_demo(client, args) + else: + ranking_demo(client, args) diff --git a/demo/guide-python/cross_validation.py b/demo/guide-python/cross_validation.py index 4e537108aa1a..a33a16c36f04 100644 --- a/demo/guide-python/cross_validation.py +++ b/demo/guide-python/cross_validation.py @@ -2,6 +2,7 @@ Demo for using cross validation =============================== """ + import os import numpy as np @@ -83,9 +84,12 @@ def logregobj(preds, dtrain): def evalerror(preds, dtrain): labels = dtrain.get_label() + preds = 1.0 / (1.0 + np.exp(-preds)) return "error", float(sum(labels != (preds > 0.0))) / len(labels) param = {"max_depth": 2, "eta": 1} # train with customized objective -xgb.cv(param, dtrain, num_round, nfold=5, seed=0, obj=logregobj, feval=evalerror) +xgb.cv( + param, dtrain, num_round, nfold=5, seed=0, obj=logregobj, custom_metric=evalerror +) diff --git a/demo/guide-python/learning_to_rank.py b/demo/guide-python/learning_to_rank.py index b131b31f76f6..fbc1f44baf50 100644 --- a/demo/guide-python/learning_to_rank.py +++ b/demo/guide-python/learning_to_rank.py @@ -12,8 +12,8 @@ train on relevance degree, and the second part simulates click data and enable the position debiasing training. -For an overview of learning to rank in XGBoost, please see -:doc:`Learning to Rank `. +For an overview of learning to rank in XGBoost, please see :doc:`Learning to Rank +`. 
""" from __future__ import annotations @@ -31,7 +31,7 @@ from xgboost.testing.data import RelDataCV, simulate_clicks, sort_ltr_samples -def load_mlsr_10k(data_path: str, cache_path: str) -> RelDataCV: +def load_mslr_10k(data_path: str, cache_path: str) -> RelDataCV: """Load the MSLR10k dataset from data_path and cache a pickle object in cache_path. Returns @@ -89,7 +89,7 @@ def load_mlsr_10k(data_path: str, cache_path: str) -> RelDataCV: def ranking_demo(args: argparse.Namespace) -> None: """Demonstration for learning to rank with relevance degree.""" - data = load_mlsr_10k(args.data, args.cache) + data = load_mslr_10k(args.data, args.cache) # Sort data according to query index X_train, y_train, qid_train = data.train @@ -123,7 +123,7 @@ def ranking_demo(args: argparse.Namespace) -> None: def click_data_demo(args: argparse.Namespace) -> None: """Demonstration for learning to rank with click data.""" - data = load_mlsr_10k(args.data, args.cache) + data = load_mslr_10k(args.data, args.cache) train, test = simulate_clicks(data) assert test is not None diff --git a/demo/json-model/json_parser.py b/demo/guide-python/model_parser.py similarity index 98% rename from demo/json-model/json_parser.py rename to demo/guide-python/model_parser.py index b744d9569aea..39a459613409 100644 --- a/demo/json-model/json_parser.py +++ b/demo/guide-python/model_parser.py @@ -1,4 +1,9 @@ -"""Demonstration for parsing JSON/UBJSON tree model file generated by XGBoost. +""" +Demonstration for parsing JSON/UBJSON tree model files +====================================================== + +See :doc:`/tutorials/saving_model` for details about the model serialization. + """ import argparse diff --git a/demo/json-model/README.md b/demo/json-model/README.md deleted file mode 100644 index 065d854f476a..000000000000 --- a/demo/json-model/README.md +++ /dev/null @@ -1,3 +0,0 @@ -We introduced initial support for saving XGBoost model in JSON format in 1.0.0. Note that -it's still experimental and under development, output schema is subject to change due to -bug fixes or further refactoring. For an overview, see https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html . \ No newline at end of file diff --git a/dev/prepare_jvm_release.py b/dev/prepare_jvm_release.py index 0b4594e2d2c0..c5a72724f707 100644 --- a/dev/prepare_jvm_release.py +++ b/dev/prepare_jvm_release.py @@ -203,7 +203,7 @@ def main(): ) print( "5. 
Remove the Scala 2.12 artifacts and build Scala 2.13 artifacts:\n" - " python dev/change_scala_version.py --scala-version 2.13 --purge-artifacts\n" + " python ops/script/change_scala_version.py --scala-version 2.13 --purge-artifacts\n" " GPG_TTY=$(tty) mvn deploy -Prelease -DskipTests -Dskip.native.build=true" ) print( diff --git a/dev/release-artifacts.py b/dev/release-artifacts.py index 08bb2cbfaff2..bfcc813a0ef1 100644 --- a/dev/release-artifacts.py +++ b/dev/release-artifacts.py @@ -123,7 +123,7 @@ def make_python_sdist( with DirectoryExcursion(ROOT): with open("python-package/pyproject.toml", "r") as f: orig_pyproj_lines = f.read() - with open("tests/buildkite/remove_nccl_dep.patch", "r") as f: + with open("ops/patch/remove_nccl_dep.patch", "r") as f: patch_lines = f.read() subprocess.run( ["patch", "-p0"], input=patch_lines, check=True, text=True, encoding="utf-8" @@ -234,7 +234,7 @@ def check_path() -> None: def make_src_tarball(release: str, outdir: Path) -> Tuple[str, str]: - tarball_name = f"xgboost-{release}.tar.gz" + tarball_name = f"xgboost-src-{release}.tar.gz" tarball_path = outdir / tarball_name if tarball_path.exists(): tarball_path.unlink() @@ -301,7 +301,7 @@ def release_note( * xgboost_r_gpu_linux_{release}.tar.gz: [Download]({r_gpu_linux_url}) **Source tarball** -* xgboost.tar.gz: [Download]({src_tarball})""" +* {tarball_name}: [Download]({src_tarball})""" print(end_note) with open(outdir / "end_note.md", "w") as f: f.write(end_note) diff --git a/doc/changes/v2.1.0.rst b/doc/changes/v2.1.0.rst index 4a657c3a403f..3e2297c8a89d 100644 --- a/doc/changes/v2.1.0.rst +++ b/doc/changes/v2.1.0.rst @@ -1,3 +1,13 @@ +################################# +2.1.3 Patch Release (2024 Nov 26) +################################# + +The 2.1.3 patch release makes the following bug fixes: + +- [pyspark] Support large model size (#10984). +- Fix rng for the column sampler (#10998). +- Handle `cudf.pandas` proxy objects properly (#11014). + ################################# 2.1.2 Patch Release (2024 Oct 23) ################################# diff --git a/doc/contrib/ci.rst b/doc/contrib/ci.rst index af9e6556290c..d6effa0b09d4 100644 --- a/doc/contrib/ci.rst +++ b/doc/contrib/ci.rst @@ -14,11 +14,9 @@ project. ************** GitHub Actions ************** -The configuration files are located under the directory -`.github/workflows `_. - -Most of the tests listed in the configuration files run automatically for every incoming pull -requests and every update to branches. A few tests however require manual activation: +We make the extensive use of `GitHub Actions `_ to host our +CI pipelines. Most of the tests listed in the configuration files run automatically for every +incoming pull requests and every update to branches. A few tests however require manual activation: * R tests with ``noLD`` option: Run R tests using a custom-built R with compilation flag ``--disable-long-double``. See `this page `_ for more @@ -26,18 +24,29 @@ requests and every update to branches. A few tests however require manual activa To invoke this test suite for a particular pull request, simply add a review comment ``/gha run r-nold-test``. (Ordinary comment won't work. It needs to be a review comment.) -GitHub Actions is also used to build Python wheels targeting MacOS Intel and Apple Silicon. See -`.github/workflows/python_wheels.yml -`_. The -``python_wheels`` pipeline sets up environment variables prefixed ``CIBW_*`` to indicate the target -OS and processor. 
The pipeline then invokes the script ``build_python_wheels.sh``, which in turns -calls ``cibuildwheel`` to build the wheel. The ``cibuildwheel`` is a library that sets up a -suitable Python environment for each OS and processor target. Since we don't have Apple Silicon -machine in GitHub Actions, cross-compilation is needed; ``cibuildwheel`` takes care of the complex -task of cross-compiling a Python wheel. (Note that ``cibuildwheel`` will call -``pip wheel``. Since XGBoost has a native library component, we created a customized build -backend that hooks into ``pip``. The customized backend contains the glue code to compile the native -library on the fly.) +******************************* +Self-Hosted Runners with RunsOn +******************************* + +`RunsOn `_ is a SaaS (Software as a Service) app that lets us to easily create +self-hosted runners to use with GitHub Actions pipelines. RunsOn uses +`Amazon Web Services (AWS) `_ under the hood to provision runners with +access to various amount of CPUs, memory, and NVIDIA GPUs. Thanks to this app, we are able to test +GPU-accelerated and distributed algorithms of XGBoost while using the familar interface of +GitHub Actions. + +In GitHub Actions, jobs run on Microsoft-hosted runners by default. +To opt into self-hosted runners (enabled by RunsOn), we use the following special syntax: + +.. code-block:: yaml + + runs-on: + - runs-on + - runner=runner-name + - run-id=${{ github.run_id }} + - tag=[unique tag that uniquely identifies the job in the GH Action workflow] + +where the runner is defined in ``.github/runs-on.yml``. ********************************************************* Reproduce CI testing environments using Docker containers @@ -49,116 +58,298 @@ You can reproduce the same testing environment as the CI pipelines by running Do Prerequisites ============= 1. Install Docker: https://docs.docker.com/engine/install/ubuntu/ -2. Install NVIDIA Docker runtime: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#installing-on-ubuntu-and-debian +2. Install NVIDIA Docker runtime: + https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html. The runtime lets you access NVIDIA GPUs inside a Docker container. +.. _build_run_docker_locally: + ============================================== Building and Running Docker containers locally ============================================== -For your convenience, we provide the wrapper script ``tests/ci_build/ci_build.sh``. You can use it as follows: +For your convenience, we provide three wrapper scripts: + +* ``ops/docker_build.py``: Build a Docker container +* ``ops/docker_build.sh``: Wrapper for ``ops/docker_build.py`` with a more concise interface +* ``ops/docker_run.py``: Run a command inside a Docker container + +**To build a Docker container**, invoke ``docker_build.sh`` as follows: + +.. code-block:: bash + + export BRANCH_NAME="master" # Relevant for CI, for local testing, use "master" + bash ops/docker_build.sh CONTAINER_ID + +where ``CONTAINER_ID`` identifies for the container. The wrapper script will look up the YAML file +``ops/docker/ci_container.yml``. For example, when ``CONTAINER_ID`` is set to ``xgb-ci.gpu``, +the script will use the corresponding entry from ``ci_container.yml``: + +.. code-block:: yaml + + xgb-ci.gpu: + container_def: gpu + build_args: + CUDA_VERSION_ARG: "12.4.1" + NCCL_VERSION_ARG: "2.23.4-1" + RAPIDS_VERSION_ARG: "24.10" + +The ``container_def`` entry indicates where the Dockerfile is located. 
The container +definition will be fetched from ``ops/docker/dockerfile/Dockerfile.CONTAINER_DEF`` where +``CONTAINER_DEF`` is the value of ``container_def`` entry. In this example, the Dockerfile +is ``ops/docker/dockerfile/Dockerfile.gpu``. + +The ``build_args`` entry lists all the build arguments for the Docker build. In this example, +the build arguments are: + +.. code-block:: + + --build-arg CUDA_VERSION_ARG=12.4.1 --build-arg NCCL_VERSION_ARG=2.23.4-1 \ + --build-arg RAPIDS_VERSION_ARG=24.10 + +The build arguments provide inputs to the ``ARG`` instructions in the Dockerfile. + +.. note:: Inspect the logs from the CI pipeline to find what's going on under the hood + + When invoked, ``ops/docker_build.sh`` logs the precise commands that it runs under the hood. + Using the example above: + + .. code-block:: bash + + # docker_build.sh calls docker_build.py... + python3 ops/docker_build.py --container-def gpu --container-id xgb-ci.gpu \ + --build-arg CUDA_VERSION_ARG=12.4.1 --build-arg NCCL_VERSION_ARG=2.23.4-1 \ + --build-arg RAPIDS_VERSION_ARG=24.10 + + ... + + # .. and docker_build.py in turn calls "docker build"... + docker build --build-arg CUDA_VERSION_ARG=12.4.1 \ + --build-arg NCCL_VERSION_ARG=2.23.4-1 \ + --build-arg RAPIDS_VERSION_ARG=24.10 \ + --load --progress=plain \ + --ulimit nofile=1024000:1024000 \ + -t xgb-ci.gpu \ + -f ops/docker/dockerfile/Dockerfile.gpu \ + ops/ + + The logs come in handy when debugging the container builds. In addition, you can change + the build arguments to make changes to the container. + +**To run commands within a Docker container**, invoke ``docker_run.py`` as follows: + +.. code-block:: bash + + python3 ops/docker_run.py --container-id "ID of the container" [--use-gpus] \ + -- "command to run inside the container" + +where ``--use-gpus`` should be specified to expose NVIDIA GPUs to the Docker container. + +For example: .. code-block:: bash - tests/ci_build/ci_build.sh --use-gpus --build-arg \ - ... + # Run without GPU + python3 ops/docker_run.py --container-id xgb-ci.cpu \ + -- bash ops/script/build_via_cmake.sh + + # Run with NVIDIA GPU + python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \ + -- bash ops/pipeline/test-python-wheel-impl.sh gpu + +The ``docker_run.py`` script will convert these commands to the following invocations +of ``docker run``: -where: +.. code-block:: bash + + docker run --rm --pid=host \ + -w /workspace -v /path/to/xgboost:/workspace \ + -e CI_BUILD_UID= -e CI_BUILD_USER= \ + -e CI_BUILD_GID= -e CI_BUILD_GROUP= \ + xgb-ci.cpu \ + bash ops/script/build_via_cmake.sh -* ```` is the identifier for the container. The wrapper script will use the - container definition (Dockerfile) located at ``tests/ci_build/Dockerfile.``. - For example, setting the container type to ``gpu`` will cause the script to load the Dockerfile - ``tests/ci_build/Dockerfile.gpu``. -* Specify ``--use-gpus`` to run any GPU code. This flag will grant the container access to all NVIDIA GPUs in the base machine. Omit the flag if the access to GPUs is not necessary. -* ```` is a build argument to be passed to Docker. Must be of form ``VAR=VALUE``. - Example: ``--build-arg CUDA_VERSION_ARG=11.0``. You can pass multiple ``--build-arg``. -* ```` is the command to run inside the Docker container. This can be more than one argument. - Example: ``tests/ci_build/build_via_cmake.sh -DUSE_CUDA=ON -DUSE_NCCL=ON``. 
+ docker run --rm --pid=host --gpus all \ + -w /workspace -v /path/to/xgboost:/workspace \ + -e CI_BUILD_UID= -e CI_BUILD_USER= \ + -e CI_BUILD_GID= -e CI_BUILD_GROUP= \ + xgb-ci.gpu \ + bash ops/pipeline/test-python-wheel-impl.sh gpu -Optionally, you can set the environment variable ``CI_DOCKER_EXTRA_PARAMS_INIT`` to pass extra -arguments to Docker. For example: +Optionally, you can specify ``--run-args`` to pass extra arguments to ``docker run``: .. code-block:: bash # Allocate extra space in /dev/shm to enable NCCL - export CI_DOCKER_EXTRA_PARAMS_INIT='--shm-size=4g' - # Run multi-GPU test suite - tests/ci_build/ci_build.sh gpu --use-gpus --build-arg CUDA_VERSION_ARG=11.0 \ - tests/ci_build/test_python.sh mgpu + # Also run the container with elevated privileges + python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \ + --run-args='--shm-size=4g --privileged' \ + -- bash ops/pipeline/test-python-wheel-impl.sh gpu + +which translates to + +.. code-block:: bash + + docker run --rm --pid=host --gpus all \ + -w /workspace -v /path/to/xgboost:/workspace \ + -e CI_BUILD_UID= -e CI_BUILD_USER= \ + -e CI_BUILD_GID= -e CI_BUILD_GROUP= \ + --shm-size=4g --privileged \ + xgb-ci.gpu \ + bash ops/pipeline/test-python-wheel-impl.sh gpu + +******************************************************************* +The Lay of the Land: how CI pipelines are organized in the codebase +******************************************************************* +The XGBoost project stores the configuration for its CI pipelines as part of the codebase. +The git repository therefore stores not only the change history for its source code but also +the change history for the CI pipelines. + +================= +File Organization +================= + +The CI pipelines are organized into the following directories and files: + +* ``.github/workflows/``: Definition of CI pipelines, using the GitHub Actions syntax +* ``.github/runs-on.yml``: Configuration for the RunsOn service. Specifies the spec for + the self-hosted CI runners. +* ``ops/conda_env/``: Definitions for Conda environments +* ``ops/packer/``: Packer scripts to build VM images for Amazon EC2 +* ``ops/patch/``: Patch files +* ``ops/pipeline/``: Shell scripts defining CI/CD pipelines. Most of these scripts can be run + locally (to assist with development and debugging); a few must run in the CI. +* ``ops/script/``: Various utility scripts useful for testing +* ``ops/docker/dockerfile/``: Dockerfiles to define containers +* ``ops/docker/ci_container.yml``: Defines the mapping between Dockerfiles and containers. + Also specifies the build arguments to be used with each container. See + :ref:`build_run_docker_locally` to learn how this YAML file is used in the context of + a container build. +* ``ops/docker_build.*``: Wrapper scripts to build and test CI containers. See + :ref:`build_run_docker_locally` for the detailed description. + +To inspect a given CI pipeline, inspect files in the following order: + +.. 
plot:: + :nofigs: + + from graphviz import Source + source = r""" + digraph ci_graph { + graph [fontname = "monospace"]; + node [fontname = "monospace"]; + edge [fontname = "monospace"]; + 0 [label=<.github/workflows/*.yml>, shape=box]; + 1 [label=, shape=box]; + 2 [label=, shape=box]; + 3 [label=, shape=box]; + 0 -> 1 [xlabel="Calls"]; + 1 -> 2 [xlabel="Calls,\nvia docker_run.py"]; + 2 -> 3 [xlabel="Calls"]; + 1 -> 3 [xlabel="Calls"]; + } + """ + Source(source, format='png').render('../_static/ci_graph', view=False) + Source(source, format='svg').render('../_static/ci_graph', view=False) + +.. figure:: ../_static/ci_graph.svg + :align: center + :figwidth: 80 % + +=================================== +Primitives used in the CI pipelines +=================================== + +------------------------ +Build and run containers +------------------------ + +See :ref:`build_run_docker_locally` to learn about the utility scripts for building and +using containers. + +**What's the relationship between the VM image (for Amazon EC2) and the container image?** +In ``ops/packer/`` directory, we define Packer scripts to build VM images for Amazon EC2. +The VM image contains the minimal set of drivers and system software that are needed to +run the containers. + +We update container images much more often than VM images. Whereas VM images are +updated sparingly (once in a few months), container images are updated each time a branch +or a pull request is updated. This way, developers can make changes to containers and +see the results of the changes immediately in the CI run. + +------------------------------------------ +Stash artifacts, to move them between jobs +------------------------------------------ + +This primitive is useful when one pipeline job needs to consume the output +from another job. +We use `Amazon S3 `_ to store the stashed files. + +**To stash a file**: + +.. code-block:: bash + + REMOTE_PREFIX="remote directory to place the artifact(s)" + bash ops/pipeline/stash-artifacts.sh stash "${REMOTE_PREFIX}" path/to/file + +The ``REMOTE_PREFIX`` argument, which is the second command-line argument +for ``stash-artifacts.sh``, specifies the remote directory in which the artifact(s) +should be placed. More precisely, the artifact(s) will be placed in +``s3://{RUNS_ON_S3_BUCKET_CACHE}/cache/{GITHUB_REPOSITORY}/stash/{GITHUB_RUN_ID}/{REMOTE_PREFIX}/`` +where ``RUNS_ON_S3_BUCKET_CACHE``, ``GITHUB_REPOSITORY``, and ``GITHUB_RUN_ID`` are set by +the CI. (RunsOn provisions an S3 bucket to stage cache, and its name is stored in the environment +variable ``RUNS_ON_S3_BUCKET_CACHE``.) + +You can upload multiple files, possibly with wildcard globbing: -To pass multiple extra arguments: +.. code-block:: bash + + REMOTE_PREFIX="build-cuda" + bash ops/pipeline/stash-artifacts.sh stash "${REMOTE_PREFIX}" \ + build/testxgboost python-package/dist/*.whl + +**To unstash a file**: + +.. code-block:: bash + + REMOTE_PREFIX="remote directory to place the artifact(s)" + bash ops/pipeline/stash-artifacts.sh unstash "${REMOTE_PREFIX}" path/to/file + +You can also use the wildcard globbing. The script will download the matching artifacts +from the remote directory. .. code-block:: bash - export CI_DOCKER_EXTRA_PARAMS_INIT='-e VAR1=VAL1 -e VAR2=VAL2 -e VAR3=VAL3' - -******************************************** -Update pipeline definitions for BuildKite CI -******************************************** - -`BuildKite `_ is a SaaS (Software as a Service) platform that orchestrates -cloud machines to host CI pipelines. 
The BuildKite platform allows us to define CI pipelines as a -declarative YAML file. - -The pipeline definitions are found in ``tests/buildkite/``: - -* ``tests/buildkite/pipeline-win64.yml``: This pipeline builds and tests XGBoost for the Windows platform. -* ``tests/buildkite/pipeline-mgpu.yml``: This pipeline builds and tests XGBoost with access to multiple - NVIDIA GPUs. -* ``tests/buildkite/pipeline.yml``: This pipeline builds and tests XGBoost with access to a single - NVIDIA GPU. Most tests are located here. - -**************************************** -Managing Elastic CI Stack with BuildKite -**************************************** - -BuildKite allows us to define cloud resources in -a declarative fashion. Every configuration step is now documented explicitly as code. - -**Prerequisite**: You should have some knowledge of `CloudFormation `_. -CloudFormation lets us define a stack of cloud resources (EC2 machines, Lambda functions, S3 etc) using -a single YAML file. - -**Prerequisite**: Gain access to the XGBoost project's AWS account (``admin@xgboost-ci.net``), and then -set up a credential pair in order to provision resources on AWS. See -`Creating an IAM user in your AWS account `_. - -* Option 1. Give full admin privileges to your IAM user. This is the simplest option. -* Option 2. Give limited set of permissions to your IAM user, to reduce the possibility of messing up other resources. - For this, use the script ``tests/buildkite/infrastructure/service-user/create_service_user.py``. - -===================== -Worker Image Pipeline -===================== -Building images for worker machines used to be a chore: you'd provision an EC2 machine, SSH into it, and -manually install the necessary packages. This process is not only laborious but also error-prone. You may -forget to install a package or change a system configuration. - -No more. Now we have an automated pipeline for building images for worker machines. - -* Run ``tests/buildkite/infrastructure/worker-image-pipeline/create_worker_image_pipelines.py`` in order to provision - CloudFormation stacks named ``buildkite-linux-amd64-gpu-worker`` and ``buildkite-windows-gpu-worker``. They are - pipelines that create AMIs (Amazon Machine Images) for Linux and Windows workers, respectively. -* Navigate to the CloudFormation web console to verify that the image builder pipelines have been provisioned. It may - take some time. -* Once they pipelines have been fully provisioned, run the script - ``tests/buildkite/infrastructure/worker-image-pipeline/run_pipelines.py`` to execute the pipelines. New AMIs will be - uploaded to the EC2 service. You can locate them in the EC2 console. -* Make sure to modify ``tests/buildkite/infrastructure/aws-stack-creator/metadata.py`` to use the correct AMI IDs. - (For ``linux-amd64-cpu`` and ``linux-arm64-cpu``, use the AMIs provided by BuildKite. Consult the ``AWSRegion2AMI`` - section of https://s3.amazonaws.com/buildkite-aws-stack/latest/aws-stack.yml.) - -====================== -EC2 Autoscaling Groups -====================== -In EC2, you can create auto-scaling groups, where you can dynamically adjust the number of worker instances according to -workload. When a pull request is submitted, the following steps take place: - -1. GitHub sends a signal to the registered webhook, which connects to the BuildKite server. -2. BuildKite sends a signal to a `Lambda `_ function named ``Autoscaling``. -3. The Lambda function sends a signal to the auto-scaling group. 
The group scales up and adds additional worker instances. -4. New worker instances run the test jobs. Test results are reported back to BuildKite. -5. When the test jobs complete, BuildKite sends a signal to ``Autoscaling``, which in turn requests the autoscaling group - to scale down. Idle worker instances are shut down. - -To set up the auto-scaling group, run the script ``tests/buildkite/infrastructure/aws-stack-creator/create_stack.py``. -Check the CloudFormation web console to verify successful provision of auto-scaling groups. + REMOTE_PREFIX="build-cuda" + # Download all files whose path matches the wildcard pattern python-package/dist/*.whl + bash ops/pipeline/stash-artifacts.sh unstash "${REMOTE_PREFIX}" \ + python-package/dist/*.whl + +----------------------------------------- +Custom actions in ``dmlc/xgboost-devops`` +----------------------------------------- + +XGBoost implements a few custom +`composite actions `_ +to reduce duplicated code within workflow YAML files. The custom actions are hosted in a separate repository, +`dmlc/xgboost-devops `_, to make it easy to test changes to the custom actions in +a pull request or a fork. + +In a workflow file, we'd refer to ``dmlc/xgboost-devops/{custom-action}@main``. For example: + +.. code-block:: yaml + + - uses: dmlc/xgboost-devops/miniforge-setup@main + with: + environment-name: cpp_test + environment-file: ops/conda_env/cpp_test.yml + +Each custom action consists of two components: + +* Main script (``dmlc/xgboost-devops/{custom-action}/action.yml``): dispatches to a specific version + of the implementation script (see the next item). The main script clones ``xgboost-devops`` from + a specified fork at a particular ref, allowing us to easily test changes to the custom action. +* Implementation script (``dmlc/xgboost-devops/impls/{custom-action}/action.yml``): Implements the + custom script. + +This design was inspired by Mike Sarahan's work in +`rapidsai/shared-actions `_. diff --git a/doc/contrib/coding_guide.rst b/doc/contrib/coding_guide.rst index bf18ad08cf53..60b3c4a13bd2 100644 --- a/doc/contrib/coding_guide.rst +++ b/doc/contrib/coding_guide.rst @@ -107,7 +107,7 @@ C++ interface of the R package, please make corresponding changes in ``src/init. Generating the Package and Running Tests ======================================== -The source layout of XGBoost is a bit unusual to normal R packages as XGBoost is primarily written in C++ with multiple language bindings in mind. As a result, some special cares need to be taken to generate a standard R tarball. Most of the tests are being run on CI, and as a result, the best way to see how things work is by looking at the CI configuration files (GitHub action, at the time of writing). There are helper scripts in ``tests/ci_build`` and ``R-package/tests/helper_scripts`` for running various checks including linter and making the standard tarball. +The source layout of XGBoost is a bit unusual to normal R packages as XGBoost is primarily written in C++ with multiple language bindings in mind. As a result, some special cares need to be taken to generate a standard R tarball. Most of the tests are being run on CI, and as a result, the best way to see how things work is by looking at the CI configuration files (GitHub action, at the time of writing). There are helper scripts in ``ops/script`` and ``R-package/tests/helper_scripts`` for running various checks including linter and making the standard tarball. 
********************************* Running Formatting Checks Locally @@ -127,7 +127,7 @@ To run checks for Python locally, install the checkers mentioned previously and .. code-block:: bash cd /path/to/xgboost/ - python ./tests/ci_build/lint_python.py --fix + python ./ops/script/lint_python.py --fix To run checks for R: @@ -135,21 +135,21 @@ To run checks for R: cd /path/to/xgboost/ R CMD INSTALL R-package/ - Rscript tests/ci_build/lint_r.R $(pwd) + Rscript ops/script/lint_r.R $(pwd) To run checks for cpplint locally: .. code-block:: bash cd /path/to/xgboost/ - python ./tests/ci_build/lint_cpp.py + python ./ops/script/lint_cpp.py See next section for clang-tidy. For CMake scripts: .. code-block:: bash - bash ./tests/ci_build/lint_cmake.sh + bash ./ops/script/lint_cmake.sh Lastly, the linter for jvm-packages is integrated into the maven build process. @@ -163,21 +163,21 @@ To run this check locally, run the following command from the top level source t .. code-block:: bash cd /path/to/xgboost/ - python3 tests/ci_build/tidy.py + python3 ops/script/run_clang_tidy.py Also, the script accepts two optional integer arguments, namely ``--cpp`` and ``--cuda``. By default they are both set to 1, meaning that both C++ and CUDA code will be checked. If the CUDA toolkit is not installed on your machine, you'll encounter an error. To exclude CUDA source from linting, use: .. code-block:: bash cd /path/to/xgboost/ - python3 tests/ci_build/tidy.py --cuda=0 + python3 ops/script/run_clang_tidy.py --cuda=0 Similarly, if you want to exclude C++ source from linting: .. code-block:: bash cd /path/to/xgboost/ - python3 tests/ci_build/tidy.py --cpp=0 + python3 ops/script/run_clang_tidy.py --cpp=0 ********************************** Guide for handling user input data diff --git a/doc/contrib/donate.rst b/doc/contrib/donate.rst index b6171c412c74..ba7c75a942f9 100644 --- a/doc/contrib/donate.rst +++ b/doc/contrib/donate.rst @@ -13,9 +13,9 @@ DMLC/XGBoost has grown from a research project incubated in academia to one of t A robust and efficient **continuous integration (CI)** infrastructure is one of the most critical solutions to address the above challenge. A CI service will monitor an open-source repository and run a suite of integration tests for every incoming contribution. This way, the CI ensures that every proposed change in the codebase is compatible with existing functionalities. Furthermore, XGBoost can enable more thorough tests with a powerful CI infrastructure to cover cases which are closer to the production environment. -There are several CI services available free to open source projects, such as Travis CI and AppVeyor. The XGBoost project already utilizes GitHub Actions. However, the XGBoost project has needs that these free services do not adequately address. In particular, the limited usage quota of resources such as CPU and memory leaves XGBoost developers unable to bring "too-intensive" tests. In addition, they do not offer test machines with GPUs for testing XGBoost-GPU code base which has been attracting more and more interest across many organizations. Consequently, the XGBoost project uses a cloud-hosted test farm. We use `BuildKite `_ to organize CI pipelines. +There are several CI services available free to open source projects, such as Travis CI and AppVeyor. The XGBoost project already utilizes GitHub Actions. However, the XGBoost project has needs that these free services do not adequately address. 
In particular, the limited usage quota of resources such as CPU and memory leaves XGBoost developers unable to bring "too-intensive" tests. In addition, they do not offer test machines with GPUs for testing XGBoost-GPU code base which has been attracting more and more interest across many organizations. Consequently, the XGBoost project uses a cloud-hosted test farm. We host `Amazon Web Services (AWS) `_ to host the test machines, along with `GitHub Actions `_ and `RunsOn `_ (SaaS app) to organize the CI pipelines. -The cloud-hosted test farm has recurring operating expenses. It utilizes a leading cloud provider (AWS) to accommodate variable workload. BuildKite launches worker machines on AWS on demand, to run the test suite on incoming contributions. To save cost, the worker machines are terminated when they are no longer needed. +The cloud-hosted test farm has recurring operating expenses. RunsOn launches worker machines on AWS on demand to run the test suite on incoming contributions. To save cost, the worker machines are terminated when they are no longer needed. To help defray the hosting cost, the XGBoost project seeks donations from third parties. @@ -29,9 +29,9 @@ The Project Management Committee (PMC) of the XGBoost project appointed `Open So All expenses incurred for hosting CI will be submitted to the fiscal host with receipts. Only the expenses in the following categories will be approved for reimbursement: -* Cloud expenses for the cloud test farm (https://buildkite.com/xgboost) +* Cloud expenses for the cloud test farm * Cost of domain https://xgboost-ci.net -* Monthly cost of using BuildKite +* Annual subscription for RunsOn * Hosting cost of the User Forum (https://discuss.xgboost.ai) Administration of cloud CI infrastructure diff --git a/doc/contrib/release.rst b/doc/contrib/release.rst index c0370b14ed42..4548b1ffa9a2 100644 --- a/doc/contrib/release.rst +++ b/doc/contrib/release.rst @@ -17,7 +17,7 @@ Making a Release ----------------- 1. Create an issue for the release, noting the estimated date and expected features or major fixes, pin that issue. -2. Create a release branch if this is a major release. Bump release version. There's a helper script ``tests/ci_build/change_version.py``. +2. Create a release branch if this is a major release. Bump release version. There's a helper script ``ops/script/change_version.py``. 3. Commit the change, create a PR on GitHub on release branch. Port the bumped version to default branch, optionally with the postfix ``SNAPSHOT``. 4. Create a tag on release branch, either on GitHub or locally. 5. Make a release on GitHub tag page, which might be done with previous step if the tag is created on GitHub. diff --git a/doc/contrib/unit_tests.rst b/doc/contrib/unit_tests.rst index aa58cd337020..857d7a067307 100644 --- a/doc/contrib/unit_tests.rst +++ b/doc/contrib/unit_tests.rst @@ -63,7 +63,7 @@ Run .. code-block:: bash - python ./tests/ci_build/test_r_package.py --task=check + python ./ops/script/test_r_package.py --task=check at the root of the project directory. The command builds and checks the XGBoost r-package. 
Alternatively, if you want to just run the tests, you can use the following diff --git a/doc/jvm/api.rst b/doc/jvm/api.rst index b9e7821aa6fa..3d56cb2c9aa4 100644 --- a/doc/jvm/api.rst +++ b/doc/jvm/api.rst @@ -5,4 +5,5 @@ API Docs for the JVM packages * `XGBoost4J Java API <../jvm_docs/javadocs/index.html>`_ * `XGBoost4J Scala API <../jvm_docs/scaladocs/xgboost4j/index.html>`_ * `XGBoost4J-Spark Scala API <../jvm_docs/scaladocs/xgboost4j-spark/index.html>`_ +* `XGBoost4J-Spark-GPU Scala API <../jvm_docs/scaladocs/xgboost4j-spark-gpu/index.html>`_ * `XGBoost4J-Flink Scala API <../jvm_docs/scaladocs/xgboost4j-flink/index.html>`_ diff --git a/doc/python/python_api.rst b/doc/python/python_api.rst index a8999e119ab4..5398fb5d091f 100644 --- a/doc/python/python_api.rst +++ b/doc/python/python_api.rst @@ -37,6 +37,7 @@ Core Data Structure .. autoclass:: xgboost.Booster :members: :show-inheritance: + :special-members: __getitem__ .. autoclass:: xgboost.DataIter :members: diff --git a/doc/tutorials/dask.rst b/doc/tutorials/dask.rst index 6e68d83a0083..036b1e725d47 100644 --- a/doc/tutorials/dask.rst +++ b/doc/tutorials/dask.rst @@ -355,15 +355,18 @@ Working with asyncio .. versionadded:: 1.2.0 -XGBoost's dask interface supports the new ``asyncio`` in Python and can be integrated into -asynchronous workflows. For using dask with asynchronous operations, please refer to -`this dask example `_ and document in -`distributed `_. To use XGBoost's -dask interface asynchronously, the ``client`` which is passed as an argument for training and -prediction must be operating in asynchronous mode by specifying ``asynchronous=True`` when the -``client`` is created (example below). All functions (including ``DaskDMatrix``) provided -by the functional interface will then return coroutines which can then be awaited to retrieve -their result. +XGBoost's dask interface supports the new :py:mod:`asyncio` in Python and can be +integrated into asynchronous workflows. For using dask with asynchronous operations, +please refer to `this dask example +`_ and document in `distributed +`_. To use XGBoost's Dask +interface asynchronously, the ``client`` which is passed as an argument for training and +prediction must be operating in asynchronous mode by specifying ``asynchronous=True`` when +the ``client`` is created (example below). All functions (including ``DaskDMatrix``) +provided by the functional interface will then return coroutines which can then be awaited +to retrieve their result. Please note that XGBoost is a compute-bounded application, where +parallelism is more important than concurrency. The support for `asyncio` is more about +compatibility instead of performance gain. Functional interface: @@ -526,6 +529,47 @@ See https://github.com/coiled/dask-xgboost-nyctaxi for a set of examples of usin with dask and optuna. +.. _ltr-dask: + +**************** +Learning to Rank +**************** + + .. versionadded:: 3.0.0 + + .. note:: + + Position debiasing is not yet supported. + +There are two operation modes in the Dask learning to rank for performance reasons. The +difference is whether a distributed global sort is needed. Please see :ref:`ltr-dist` for +how ranking works with distributed training in general. Below we will discuss some of the +Dask-specific features. 
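+
+As a quick orientation before the details, the two modes map onto the estimator
+interface roughly as in the sketch below (a minimal illustration mirroring the demo
+linked at the end of this section; a running ``client`` and a dask dataframe ``df``
+with ``y`` and ``qid`` columns are assumed):
+
+.. code-block:: python
+
+    from xgboost import dask as dxgb
+
+    X = df[df.columns.difference(["y", "qid"])]
+
+    # Mode 1: allow query groups to be split; only worker-local sorting is performed.
+    ranker = dxgb.DaskXGBRanker(allow_group_split=True)
+
+    # Mode 2 (default): re-partition the data so that no query group is split.
+    ranker = dxgb.DaskXGBRanker(allow_group_split=False)
+
+    ranker.client = client
+    ranker = ranker.fit(X, df.y, qid=df.qid)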
+ +First, if you use the :py:class:`~xgboost.dask.DaskQuantileDMatrix` interface or the +:py:class:`~xgboost.dask.DaskXGBRanker` with ``allow_group_split`` set to ``True``, +XGBoost will try to sort and group the samples for each worker based on the query ID. This +mode tries to skip the global sort and sort only worker-local data, and hence no +inter-worker data shuffle. Please note that even worker-local sort is costly, particularly +in terms of memory usage as there's no spilling when +:py:meth:`~pandas.DataFrame.sort_values` is used, and we need to concatenate the +data. XGBoost first checks whether the QID is already sorted before actually performing +the sorting operation. One can choose this if the query groups are relatively consecutive, +meaning most of the samples within a query group are close to each other and are likely to +be resided to the same worker. Don't use this if you have performed a random shuffle on +your data. + +If the input data is random, then there's no way we can guarantee most of data within the +same group being in the same worker. For large query groups, this might not be an +issue. But for small query groups, it's possible that each worker gets only one or two +samples from their group for all groups, which can lead to disastrous performance. In that +case, we can partition the data according to query group, which is the default behavior of +the :py:class:`~xgboost.dask.DaskXGBRanker` unless the ``allow_group_split`` is set to +``True``. This mode performs a sort and a groupby on the entire dataset in addition to an +encoding operation for the query group IDs. Along with partition fragmentation, this +option can lead to slow performance. See +:ref:`sphx_glr_python_dask-examples_dask_learning_to_rank.py` for a worked example. + .. _tracker-ip: *************** diff --git a/doc/tutorials/learning_to_rank.rst b/doc/tutorials/learning_to_rank.rst index 4d2cbad4aa47..8743a672d219 100644 --- a/doc/tutorials/learning_to_rank.rst +++ b/doc/tutorials/learning_to_rank.rst @@ -165,10 +165,26 @@ On the other hand, if you have comparatively small amount of training data: For any method chosen, you can modify ``lambdarank_num_pair_per_sample`` to control the amount of pairs generated. +.. _ltr-dist: + ******************** Distributed Training ******************** -XGBoost implements distributed learning-to-rank with integration of multiple frameworks including Dask, Spark, and PySpark. The interface is similar to the single-node counterpart. Please refer to document of the respective XGBoost interface for details. Scattering a query group onto multiple workers is theoretically sound but can affect the model accuracy. For most of the use cases, the small discrepancy is not an issue, as the amount of training data is usually large when distributed training is used. As a result, users don't need to partition the data based on query groups. As long as each data partition is correctly sorted by query IDs, XGBoost can aggregate sample gradients accordingly. + +XGBoost implements distributed learning-to-rank with integration of multiple frameworks +including :doc:`Dask `, :doc:`Spark `, and +:doc:`PySpark `. The interface is similar to the single-node +counterpart. Please refer to document of the respective XGBoost interface for details. + +.. warning:: + + Position-debiasing is not yet supported for existing distributed interfaces. + +XGBoost works with collective operations, which means data is scattered to multiple workers. 
We can divide the data partitions by query group and ensure no query group is split among workers. However, this requires a costly sort and groupby operation and might only be necessary for selected use cases. Splitting and scattering a query group to multiple workers is theoretically sound but can affect the model's accuracy. If there are only a small number of groups sitting at the boundaries of workers, the small discrepancy is not an issue, as the amount of training data is usually large when distributed training is used. + +For a longer explanation, assuming the pairwise ranking method is used, we calculate the gradient based on relevance degree by constructing pairs within a query group. If a single query group is split among workers and we use worker-local data for gradient calculation, then we are simply sampling pairs from a smaller group for each worker to calculate the gradient and the evaluation metric. The comparison between each pair doesn't change because a group is split into sub-groups, what changes is the number of total and effective pairs and normalizers like `IDCG`. One can generate more pairs from a large group than it's from two smaller subgroups. As a result, the obtained gradient is still valid from a theoretical standpoint but might not be optimal. As long as each data partitions within a worker are correctly sorted by query IDs, XGBoost can aggregate sample gradients accordingly. And both the (Py)Spark interface and the Dask interface can sort the data according to query ID, please see respected tutorials for more information. + +However, it's possible that a distributed framework shuffles the data during map reduce and splits every query group into multiple workers. In that case, the performance would be disastrous. As a result, it depends on the data and the framework for whether a sorted groupby is needed. ******************* Reproducible Result diff --git a/include/xgboost/c_api.h b/include/xgboost/c_api.h index 7e8ed2f29568..6ae1dea8d3ce 100644 --- a/include/xgboost/c_api.h +++ b/include/xgboost/c_api.h @@ -876,31 +876,48 @@ XGB_DLL int XGDMatrixGetQuantileCut(DMatrixHandle const handle, char const *conf * @defgroup Booster Booster * * @brief The `Booster` class is the gradient-boosted model for XGBoost. + * + * During training, the booster object has many caches for improved performance. In + * addition to gradient and prediction, it also includes runtime buffers like leaf + * partitions. These buffers persist with the Booster object until either XGBoosterReset() + * is called or the booster is deleted by the XGBoosterFree(). + * * @{ */ -/*! - * \brief create xgboost learner - * \param dmats matrices that are set to be cached - * \param len length of dmats - * \param out handle to the result booster - * \return 0 when success, -1 when failure happens +/** + * @brief Create a XGBoost learner (booster) + * + * @param dmats matrices that are set to be cached by the booster. + * @param len length of dmats + * @param out handle to the result booster + * + * @return 0 when success, -1 when failure happens */ XGB_DLL int XGBoosterCreate(const DMatrixHandle dmats[], bst_ulong len, BoosterHandle *out); /** * @example c-api-demo.c */ -/*! - * \brief free obj in handle - * \param handle handle to be freed - * \return 0 when success, -1 when failure happens +/** + * @brief Delete the booster. + * + * @param handle The handle to be freed. 
+ * + * @return 0 when success, -1 when failure happens */ XGB_DLL int XGBoosterFree(BoosterHandle handle); /** * @example c-api-demo.c inference.c external_memory.c */ +/** + * @brief Reset the booster object to release data caches used for training. + * + * @since 3.0.0 + */ +XGB_DLL int XGBoosterReset(BoosterHandle handle); + /*! * \brief Slice a model using boosting index. The slice m:n indicates taking all trees * that were fit during the boosting rounds m, (m+1), (m+2), ..., (n-1). diff --git a/include/xgboost/learner.h b/include/xgboost/learner.h index 939324e4a6c4..1499804c8592 100644 --- a/include/xgboost/learner.h +++ b/include/xgboost/learner.h @@ -249,6 +249,10 @@ class Learner : public Model, public Configurable, public dmlc::Serializable { std::string format) = 0; virtual XGBAPIThreadLocalEntry& GetThreadLocal() const = 0; + /** + * @brief Reset the booster object to release data caches used for training. + */ + virtual void Reset() = 0; /*! * \brief Create a new instance of learner. * \param cache_data The matrix to cache the prediction. diff --git a/jvm-packages/create_jni.py b/jvm-packages/create_jni.py index 6be7b451ce14..fbd9b4ce5672 100755 --- a/jvm-packages/create_jni.py +++ b/jvm-packages/create_jni.py @@ -32,7 +32,7 @@ def cd(path): path = normpath(path) cwd = os.getcwd() os.chdir(path) - print("cd " + path) + print("cd " + path, flush=True) try: yield path finally: @@ -41,7 +41,7 @@ def cd(path): def maybe_makedirs(path): path = normpath(path) - print("mkdir -p " + path) + print("mkdir -p " + path, flush=True) try: os.makedirs(path) except OSError as e: @@ -50,14 +50,14 @@ def maybe_makedirs(path): def run(command, **kwargs): - print(command) + print(command, flush=True) subprocess.run(command, shell=True, check=True, env=os.environ, **kwargs) def cp(source, target): source = normpath(source) target = normpath(target) - print("cp {0} {1}".format(source, target)) + print("cp {0} {1}".format(source, target), flush=True) shutil.copy(source, target) @@ -78,7 +78,7 @@ def native_build(args): subprocess.check_output("/usr/libexec/java_home").strip().decode() ) - print("building Java wrapper") + print("building Java wrapper", flush=True) with cd(".."): build_dir = "build-gpu" if cli_args.use_cuda == "ON" else "build" maybe_makedirs(build_dir) @@ -123,7 +123,7 @@ def native_build(args): run("cmake .. " + " ".join(args + [generator])) break except subprocess.CalledProcessError as e: - print(f"Failed to build with generator: {generator}", e) + print(f"Failed to build with generator: {generator}", e, flush=True) with cd(os.path.pardir): shutil.rmtree(build_dir) maybe_makedirs(build_dir) @@ -132,7 +132,7 @@ def native_build(args): run("cmake --build . 
--config Release" + maybe_parallel_build) - print("copying native library") + print("copying native library", flush=True) library_name, os_folder = { "Windows": ("xgboost4j.dll", "windows"), "Darwin": ("libxgboost4j.dylib", "macos"), @@ -153,7 +153,7 @@ def native_build(args): maybe_makedirs(output_folder) cp("../lib/" + library_name, output_folder) - print("copying train/test files") + print("copying train/test files", flush=True) # for xgboost4j maybe_makedirs("xgboost4j/src/test/resources") diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml index be46dc261285..b8a7d3337f35 100644 --- a/jvm-packages/pom.xml +++ b/jvm-packages/pom.xml @@ -116,6 +116,22 @@ + + docs + + ON + true + true + true + + + xgboost4j + xgboost4j-spark + xgboost4j-spark-gpu + xgboost4j-flink + + + release diff --git a/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/spark/GpuXGBoostPluginSuite.scala b/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/spark/GpuXGBoostPluginSuite.scala index 6559d90c7887..a5ff2ba0f589 100644 --- a/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/spark/GpuXGBoostPluginSuite.scala +++ b/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/spark/GpuXGBoostPluginSuite.scala @@ -542,6 +542,55 @@ class GpuXGBoostPluginSuite extends GpuTestSuite { } } + test("Same group must be in the same partition") { + val num_workers = 3 + withGpuSparkSession() { spark => + import spark.implicits._ + val df = spark.createDataFrame(spark.sparkContext.parallelize(Seq( + (0.1, 1, 0), + (0.1, 1, 0), + (0.1, 1, 0), + (0.1, 1, 1), + (0.1, 1, 1), + (0.1, 1, 1), + (0.1, 1, 2), + (0.1, 1, 2), + (0.1, 1, 2)), 1)).toDF("label", "f1", "group") + + // The original pattern will repartition df in a RoundRobin manner + val oriRows = df.repartition(num_workers) + .sortWithinPartitions(df.col("group")) + .select("group") + .mapPartitions { case iter => + val tmp: ArrayBuffer[Int] = ArrayBuffer.empty + while (iter.hasNext) { + val r = iter.next() + tmp.append(r.getInt(0)) + } + Iterator.single(tmp.mkString(",")) + }.collect() + assert(oriRows.length == 3) + assert(oriRows.contains("0,1,2")) + + // The fix has replaced repartition with repartitionByRange which will put the + // instances with same group into the same partition + val ranker = new XGBoostRanker().setGroupCol("group").setNumWorkers(num_workers) + val processedDf = ranker.getPlugin.get.asInstanceOf[GpuXGBoostPlugin].preprocess(ranker, df) + val rows = processedDf + .select("group") + .mapPartitions { case iter => + val tmp: ArrayBuffer[Int] = ArrayBuffer.empty + while (iter.hasNext) { + val r = iter.next() + tmp.append(r.getInt(0)) + } + Iterator.single(tmp.mkString(",")) + }.collect() + + rows.forall(Seq("0,0,0", "1,1,1", "2,2,2").contains) + } + } + test("Ranker: XGBoost-Spark should match xgboost4j") { withGpuSparkSession() { spark => import spark.implicits._ diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRanker.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRanker.scala index 14d13e34ff61..0265eac55979 100644 --- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRanker.scala +++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRanker.scala @@ -22,6 +22,7 @@ import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable, MLReadable, MLReader} import 
org.apache.spark.ml.xgboost.SparkUtils import org.apache.spark.sql.Dataset +import org.apache.spark.sql.functions.col import org.apache.spark.sql.types.{DataType, DoubleType, StructType} import ml.dmlc.xgboost4j.scala.Booster @@ -62,6 +63,22 @@ class XGBoostRanker(override val uid: String, } } + /** + * Repartition the dataset to the numWorkers if needed. + * + * @param dataset to be repartition + * @return the repartitioned dataset + */ + override private[spark] def repartitionIfNeeded(dataset: Dataset[_]) = { + val numPartitions = dataset.rdd.getNumPartitions + if (getForceRepartition || getNumWorkers != numPartitions) { + // Please note that the output of repartitionByRange is not deterministic + dataset.repartitionByRange(getNumWorkers, col(getGroupCol)) + } else { + dataset + } + } + /** * Sort partition for Ranker issue. * diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRankerSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRankerSuite.scala index 81a770bfe327..063836538931 100644 --- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRankerSuite.scala +++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRankerSuite.scala @@ -151,6 +151,54 @@ class XGBoostRankerSuite extends AnyFunSuite with PerTest with TmpFolderPerSuite }} } + test("Same group must be in the same partition") { + val spark = ss + import spark.implicits._ + val num_workers = 3 + val df = ss.createDataFrame(sc.parallelize(Seq( + (0.1, Vectors.dense(1.0, 2.0, 3.0), 0), + (0.1, Vectors.dense(0.0, 0.0, 0.0), 0), + (0.1, Vectors.dense(0.0, 3.0, 0.0), 0), + (0.1, Vectors.dense(2.0, 0.0, 4.0), 1), + (0.1, Vectors.dense(0.2, 1.2, 2.0), 1), + (0.1, Vectors.dense(0.5, 2.2, 1.7), 1), + (0.1, Vectors.dense(0.5, 2.2, 1.7), 2), + (0.1, Vectors.dense(0.5, 2.2, 1.7), 2), + (0.1, Vectors.dense(0.5, 2.2, 1.7), 2)), 1)).toDF("label", "features", "group") + + // The original pattern will repartition df in a RoundRobin manner + val oriRows = df.repartition(num_workers) + .sortWithinPartitions(df.col("group")) + .select("group") + .mapPartitions { case iter => + val tmp: ArrayBuffer[Int] = ArrayBuffer.empty + while (iter.hasNext) { + val r = iter.next() + tmp.append(r.getInt(0)) + } + Iterator.single(tmp.mkString(",")) + }.collect() + assert(oriRows.length == 3) + assert(oriRows.contains("0,1,2")) + + // The fix has replaced repartition with repartitionByRange which will put the + // instances with same group into the same partition + val ranker = new XGBoostRanker().setGroupCol("group").setNumWorkers(num_workers) + val (processedDf, _) = ranker.preprocess(df) + val rows = processedDf + .select("group") + .mapPartitions { case iter => + val tmp: ArrayBuffer[Int] = ArrayBuffer.empty + while (iter.hasNext) { + val r = iter.next() + tmp.append(r.getInt(0)) + } + Iterator.single(tmp.mkString(",")) + }.collect() + + rows.forall(Seq("0,0,0", "1,1,1", "2,2,2").contains) + } + private def runLengthEncode(input: Seq[Int]): Seq[Int] = { if (input.isEmpty) return Seq(0) diff --git a/jvm-packages/xgboost4j/src/native/xgboost4j-gpu.cu b/jvm-packages/xgboost4j/src/native/xgboost4j-gpu.cu index 524e5984803d..a9798465686f 100644 --- a/jvm-packages/xgboost4j/src/native/xgboost4j-gpu.cu +++ b/jvm-packages/xgboost4j/src/native/xgboost4j-gpu.cu @@ -97,7 +97,7 @@ void CopyInterface(std::vector> &interface_arr, Json{Boolean{false}}}; out["data"] = Array(std::move(j_data)); - out["shape"] = 
Array(std::vector{Json(Integer(interface.Shape(0)))}); + out["shape"] = Array(std::vector{Json(Integer(interface.Shape<0>()))}); if (interface.valid.Data()) { CopyColumnMask(interface, columns, kind, c, &mask, &out, stream); @@ -113,7 +113,7 @@ void CopyMetaInfo(Json *p_interface, dh::device_vector *out, cudaStream_t str CHECK_EQ(get(j_interface).size(), 1); auto object = get(get(j_interface)[0]); ArrayInterface<1> interface(object); - out->resize(interface.Shape(0)); + out->resize(interface.Shape<0>()); size_t element_size = interface.ElementSize(); size_t size = element_size * interface.n; dh::safe_cuda(cudaMemcpyAsync(RawPtr(*out), interface.data, size, diff --git a/jvm-packages/xgboost4j/src/native/xgboost4j.cpp b/jvm-packages/xgboost4j/src/native/xgboost4j.cpp index 3e5087a78f7d..01706beb6b45 100644 --- a/jvm-packages/xgboost4j/src/native/xgboost4j.cpp +++ b/jvm-packages/xgboost4j/src/native/xgboost4j.cpp @@ -1520,20 +1520,20 @@ JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixGetQuanti ArrayInterface<1> indptr{StringView{str_indptr}}; ArrayInterface<1> data{StringView{str_data}}; - CHECK_GE(indptr.Shape(0), 2); + CHECK_GE(indptr.Shape<0>(), 2); // Cut ptr - auto j_indptr_array = jenv->NewLongArray(indptr.Shape(0)); + auto j_indptr_array = jenv->NewLongArray(indptr.Shape<0>()); CHECK_EQ(indptr.type, ArrayInterfaceHandler::Type::kU8); - CHECK_LT(indptr(indptr.Shape(0) - 1), + CHECK_LT(indptr(indptr.Shape<0>() - 1), static_cast(std::numeric_limits::max())); static_assert(sizeof(jlong) == sizeof(std::uint64_t)); - jenv->SetLongArrayRegion(j_indptr_array, 0, indptr.Shape(0), + jenv->SetLongArrayRegion(j_indptr_array, 0, indptr.Shape<0>(), static_cast(indptr.data)); jenv->SetObjectArrayElement(j_indptr, 0, j_indptr_array); // Cut values - auto n_cuts = indptr(indptr.Shape(0) - 1); + auto n_cuts = indptr(indptr.Shape<0>() - 1); jfloatArray jcuts_array = jenv->NewFloatArray(n_cuts); CHECK_EQ(data.type, ArrayInterfaceHandler::Type::kF4); jenv->SetFloatArrayRegion(jcuts_array, 0, n_cuts, static_cast(data.data)); diff --git a/tests/ci_build/conda_env/aarch64_test.yml b/ops/conda_env/aarch64_test.yml similarity index 100% rename from tests/ci_build/conda_env/aarch64_test.yml rename to ops/conda_env/aarch64_test.yml diff --git a/tests/ci_build/conda_env/cpp_test.yml b/ops/conda_env/cpp_test.yml similarity index 100% rename from tests/ci_build/conda_env/cpp_test.yml rename to ops/conda_env/cpp_test.yml diff --git a/tests/ci_build/conda_env/linux_cpu_test.yml b/ops/conda_env/linux_cpu_test.yml similarity index 100% rename from tests/ci_build/conda_env/linux_cpu_test.yml rename to ops/conda_env/linux_cpu_test.yml diff --git a/tests/ci_build/conda_env/linux_sycl_test.yml b/ops/conda_env/linux_sycl_test.yml similarity index 93% rename from tests/ci_build/conda_env/linux_sycl_test.yml rename to ops/conda_env/linux_sycl_test.yml index 5b3a15f7e3b1..1761787662ee 100644 --- a/tests/ci_build/conda_env/linux_sycl_test.yml +++ b/ops/conda_env/linux_sycl_test.yml @@ -17,7 +17,8 @@ dependencies: - pytest - pytest-timeout - pytest-cov -- dask +- dask=2024.11 +- ninja - dpcpp_linux-64 - onedpl-devel - intel-openmp diff --git a/tests/ci_build/conda_env/macos_cpu_test.yml b/ops/conda_env/macos_cpu_test.yml similarity index 100% rename from tests/ci_build/conda_env/macos_cpu_test.yml rename to ops/conda_env/macos_cpu_test.yml diff --git a/tests/ci_build/conda_env/jvm_tests.yml b/ops/conda_env/minimal.yml similarity index 79% rename from tests/ci_build/conda_env/jvm_tests.yml rename to 
ops/conda_env/minimal.yml index 56e11dff27bb..efe972bd44d9 100644 --- a/tests/ci_build/conda_env/jvm_tests.yml +++ b/ops/conda_env/minimal.yml @@ -1,4 +1,4 @@ -name: jvm_tests +name: minimal channels: - conda-forge dependencies: diff --git a/tests/ci_build/conda_env/python_lint.yml b/ops/conda_env/python_lint.yml similarity index 100% rename from tests/ci_build/conda_env/python_lint.yml rename to ops/conda_env/python_lint.yml diff --git a/tests/ci_build/conda_env/sdist_test.yml b/ops/conda_env/sdist_test.yml similarity index 85% rename from tests/ci_build/conda_env/sdist_test.yml rename to ops/conda_env/sdist_test.yml index 3597b42c6132..c21cd2b701e1 100644 --- a/tests/ci_build/conda_env/sdist_test.yml +++ b/ops/conda_env/sdist_test.yml @@ -9,5 +9,3 @@ dependencies: - cmake - ninja - python-build -- c-compiler -- cxx-compiler diff --git a/tests/ci_build/conda_env/win64_test.yml b/ops/conda_env/win64_test.yml similarity index 100% rename from tests/ci_build/conda_env/win64_test.yml rename to ops/conda_env/win64_test.yml diff --git a/ops/docker/ci_container.yml b/ops/docker/ci_container.yml new file mode 100644 index 000000000000..348bf90f8a1f --- /dev/null +++ b/ops/docker/ci_container.yml @@ -0,0 +1,72 @@ +## List of CI containers with definitions and build arguments + +# Each container will be built using the definition from +# ops/docker/dockerfile/Dockerfile.CONTAINER_DEF + +rapids_versions: + stable: &rapids_version "24.10" + dev: &dev_rapids_version "24.12" + +xgb-ci.gpu_build_rockylinux8: + container_def: gpu_build_rockylinux8 + build_args: + CUDA_VERSION_ARG: "12.4.1" + NCCL_VERSION_ARG: "2.23.4-1" + RAPIDS_VERSION_ARG: *rapids_version + +xgb-ci.gpu_build_rockylinux8_dev_ver: + container_def: gpu_build_rockylinux8 + build_args: + CUDA_VERSION_ARG: "12.4.1" + NCCL_VERSION_ARG: "2.23.4-1" + RAPIDS_VERSION_ARG: *dev_rapids_version + +xgb-ci.gpu_build_r_rockylinux8: + container_def: gpu_build_r_rockylinux8 + build_args: + CUDA_VERSION_ARG: "12.4.1" + R_VERSION_ARG: "4.3.2" + +xgb-ci.gpu: + container_def: gpu + build_args: + CUDA_VERSION_ARG: "12.4.1" + NCCL_VERSION_ARG: "2.23.4-1" + RAPIDS_VERSION_ARG: *rapids_version + +xgb-ci.gpu_dev_ver: + container_def: gpu + build_args: + CUDA_VERSION_ARG: "12.4.1" + NCCL_VERSION_ARG: "2.23.4-1" + RAPIDS_VERSION_ARG: *dev_rapids_version + RAPIDSAI_CONDA_CHANNEL_ARG: "rapidsai-nightly" + +xgb-ci.clang_tidy: + container_def: clang_tidy + build_args: + CUDA_VERSION_ARG: "12.4.1" + +xgb-ci.cpu: + container_def: cpu + +xgb-ci.aarch64: + container_def: aarch64 + +xgb-ci.manylinux_2_28_x86_64: + container_def: manylinux_2_28_x86_64 + +xgb-ci.manylinux2014_x86_64: + container_def: manylinux2014_x86_64 + +xgb-ci.manylinux2014_aarch64: + container_def: manylinux2014_aarch64 + +xgb-ci.jvm: + container_def: jvm + +xgb-ci.jvm_gpu_build: + container_def: jvm_gpu_build + build_args: + CUDA_VERSION_ARG: "12.4.1" + NCCL_VERSION_ARG: "2.23.4-1" diff --git a/ops/docker/docker_cache_ecr.yml b/ops/docker/docker_cache_ecr.yml new file mode 100644 index 000000000000..e20f35fc8020 --- /dev/null +++ b/ops/docker/docker_cache_ecr.yml @@ -0,0 +1,4 @@ +## Constants for AWS ECR (Elastic Container Registry), used for the Docker cache + +DOCKER_CACHE_ECR_ID: "492475357299" +DOCKER_CACHE_ECR_REGION: "us-west-2" diff --git a/tests/ci_build/Dockerfile.aarch64 b/ops/docker/dockerfile/Dockerfile.aarch64 similarity index 97% rename from tests/ci_build/Dockerfile.aarch64 rename to ops/docker/dockerfile/Dockerfile.aarch64 index 8d6cfaca39fa..9dff2a05230b 100644 --- 
a/tests/ci_build/Dockerfile.aarch64 +++ b/ops/docker/dockerfile/Dockerfile.aarch64 @@ -32,7 +32,7 @@ RUN set -ex; \ # Default entry-point to use if running locally # It will preserve attributes of created files -COPY entrypoint.sh /scripts/ +COPY docker/entrypoint.sh /scripts/ WORKDIR /workspace ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/tests/ci_build/Dockerfile.clang_tidy b/ops/docker/dockerfile/Dockerfile.clang_tidy similarity index 96% rename from tests/ci_build/Dockerfile.clang_tidy rename to ops/docker/dockerfile/Dockerfile.clang_tidy index 2e7751a20185..de7d9bd3f254 100644 --- a/tests/ci_build/Dockerfile.clang_tidy +++ b/ops/docker/dockerfile/Dockerfile.clang_tidy @@ -1,4 +1,4 @@ -ARG CUDA_VERSION_ARG +ARG CUDA_VERSION_ARG=notset FROM nvidia/cuda:$CUDA_VERSION_ARG-devel-ubuntu22.04 ARG CUDA_VERSION_ARG @@ -44,7 +44,7 @@ RUN set -ex; \ # Default entry-point to use if running locally # It will preserve attributes of created files -COPY entrypoint.sh /scripts/ +COPY docker/entrypoint.sh /scripts/ WORKDIR /workspace ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/tests/ci_build/Dockerfile.cpu b/ops/docker/dockerfile/Dockerfile.cpu similarity index 92% rename from tests/ci_build/Dockerfile.cpu rename to ops/docker/dockerfile/Dockerfile.cpu index 22db93572207..a426ce5da30c 100644 --- a/tests/ci_build/Dockerfile.cpu +++ b/ops/docker/dockerfile/Dockerfile.cpu @@ -41,8 +41,7 @@ RUN git clone -b v1.65.4 https://github.com/grpc/grpc.git \ COPY conda_env/linux_cpu_test.yml /scripts/ RUN mamba create -n linux_cpu_test && \ mamba env update -n linux_cpu_test --file=/scripts/linux_cpu_test.yml && \ - mamba clean --all --yes && \ - conda run --no-capture-output -n linux_cpu_test pip install buildkite-test-collector + mamba clean --all --yes # Install lightweight sudo (not bound to TTY) RUN set -ex; \ @@ -52,7 +51,7 @@ RUN set -ex; \ # Default entry-point to use if running locally # It will preserve attributes of created files -COPY entrypoint.sh /scripts/ +COPY docker/entrypoint.sh /scripts/ WORKDIR /workspace ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/tests/ci_build/Dockerfile.gpu b/ops/docker/dockerfile/Dockerfile.gpu similarity index 76% rename from tests/ci_build/Dockerfile.gpu rename to ops/docker/dockerfile/Dockerfile.gpu index 501726e9ffba..96a532fc2ff1 100644 --- a/tests/ci_build/Dockerfile.gpu +++ b/ops/docker/dockerfile/Dockerfile.gpu @@ -1,8 +1,10 @@ -ARG CUDA_VERSION_ARG +ARG CUDA_VERSION_ARG=notset FROM nvidia/cuda:$CUDA_VERSION_ARG-runtime-ubuntu22.04 ARG CUDA_VERSION_ARG ARG RAPIDS_VERSION_ARG + # Should be first 4 digits (e.g. 
24.06) ARG NCCL_VERSION_ARG +ARG RAPIDSAI_CONDA_CHANNEL_ARG="rapidsai" # Environment ENV DEBIAN_FRONTEND=noninteractive @@ -24,16 +26,16 @@ ENV PATH=/opt/miniforge/bin:$PATH RUN \ export NCCL_SHORT_VER=$(echo "$NCCL_VERSION_ARG" | cut -d "-" -f 1) && \ export CUDA_SHORT_VER=$(echo "$CUDA_VERSION_ARG" | grep -o -E '[0-9]+\.[0-9]') && \ - mamba create -y -n gpu_test -c rapidsai -c conda-forge -c nvidia \ - python=3.10 cudf=$RAPIDS_VERSION_ARG* rmm=$RAPIDS_VERSION_ARG* cuda-version=$CUDA_SHORT_VER \ + mamba create -y -n gpu_test -c ${RAPIDSAI_CONDA_CHANNEL_ARG} -c conda-forge -c nvidia \ + python=3.10 "cudf=$RAPIDS_VERSION_ARG.*" "rmm=$RAPIDS_VERSION_ARG.*" cuda-version=$CUDA_SHORT_VER \ "nccl>=${NCCL_SHORT_VER}" \ - dask \ - dask-cuda=$RAPIDS_VERSION_ARG* dask-cudf=$RAPIDS_VERSION_ARG* cupy \ + "dask<=2024.10.0" \ + "distributed<=2024.10.0" \ + "dask-cuda=$RAPIDS_VERSION_ARG.*" "dask-cudf=$RAPIDS_VERSION_ARG.*" cupy \ numpy pytest pytest-timeout scipy scikit-learn pandas matplotlib wheel \ python-kubernetes urllib3 graphviz hypothesis loky \ "pyspark>=3.4.0" cloudpickle cuda-python && \ - mamba clean --all --yes && \ - conda run --no-capture-output -n gpu_test pip install buildkite-test-collector + mamba clean --all --yes ENV GOSU_VERSION=1.10 ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/ @@ -46,7 +48,7 @@ RUN set -ex; \ # Default entry-point to use if running locally # It will preserve attributes of created files -COPY entrypoint.sh /scripts/ +COPY docker/entrypoint.sh /scripts/ WORKDIR /workspace ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/tests/ci_build/Dockerfile.gpu_build_r_rockylinux8 b/ops/docker/dockerfile/Dockerfile.gpu_build_r_rockylinux8 similarity index 97% rename from tests/ci_build/Dockerfile.gpu_build_r_rockylinux8 rename to ops/docker/dockerfile/Dockerfile.gpu_build_r_rockylinux8 index 159e5d776c16..2d18b1eeb315 100644 --- a/tests/ci_build/Dockerfile.gpu_build_r_rockylinux8 +++ b/ops/docker/dockerfile/Dockerfile.gpu_build_r_rockylinux8 @@ -1,4 +1,4 @@ -ARG CUDA_VERSION_ARG +ARG CUDA_VERSION_ARG=notset FROM nvcr.io/nvidia/cuda:$CUDA_VERSION_ARG-devel-rockylinux8 ARG CUDA_VERSION_ARG ARG R_VERSION_ARG @@ -52,7 +52,7 @@ RUN set -ex; \ # Default entry-point to use if running locally # It will preserve attributes of created files -COPY entrypoint.sh /scripts/ +COPY docker/entrypoint.sh /scripts/ WORKDIR /workspace ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/tests/ci_build/Dockerfile.gpu_build_rockylinux8 b/ops/docker/dockerfile/Dockerfile.gpu_build_rockylinux8 similarity index 94% rename from tests/ci_build/Dockerfile.gpu_build_rockylinux8 rename to ops/docker/dockerfile/Dockerfile.gpu_build_rockylinux8 index 8869fb468e12..b686bfbb2b0d 100644 --- a/tests/ci_build/Dockerfile.gpu_build_rockylinux8 +++ b/ops/docker/dockerfile/Dockerfile.gpu_build_rockylinux8 @@ -1,4 +1,4 @@ -ARG CUDA_VERSION_ARG +ARG CUDA_VERSION_ARG=notset FROM nvcr.io/nvidia/cuda:$CUDA_VERSION_ARG-devel-rockylinux8 ARG CUDA_VERSION_ARG ARG NCCL_VERSION_ARG @@ -53,7 +53,7 @@ RUN git clone -b v1.65.4 https://github.com/grpc/grpc.git \ # Install RMM # Patch out -Werror # Patch CCCL 2.5.0 to apply https://github.com/NVIDIA/cccl/pull/1957 -RUN git clone -b v${RAPIDS_VERSION_ARG}.00 https://github.com/rapidsai/rmm.git --recurse-submodules --depth 1 && \ +RUN git clone -b branch-${RAPIDS_VERSION_ARG} https://github.com/rapidsai/rmm.git --recurse-submodules --depth 1 && \ pushd rmm && \ find . 
-name CMakeLists.txt -print0 | xargs -0 sed -i 's/-Werror//g' && \ mkdir build && \ @@ -76,7 +76,7 @@ RUN set -ex; \ # Default entry-point to use if running locally # It will preserve attributes of created files -COPY entrypoint.sh /scripts/ +COPY docker/entrypoint.sh /scripts/ WORKDIR /workspace ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/tests/ci_build/Dockerfile.i386 b/ops/docker/dockerfile/Dockerfile.i386 similarity index 100% rename from tests/ci_build/Dockerfile.i386 rename to ops/docker/dockerfile/Dockerfile.i386 diff --git a/tests/ci_build/Dockerfile.jvm b/ops/docker/dockerfile/Dockerfile.jvm similarity index 97% rename from tests/ci_build/Dockerfile.jvm rename to ops/docker/dockerfile/Dockerfile.jvm index c4584747f5db..9fd62e52de93 100644 --- a/tests/ci_build/Dockerfile.jvm +++ b/ops/docker/dockerfile/Dockerfile.jvm @@ -37,7 +37,7 @@ RUN set -ex; \ # Default entry-point to use if running locally # It will preserve attributes of created files -COPY entrypoint.sh /scripts/ +COPY docker/entrypoint.sh /scripts/ WORKDIR /workspace ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/tests/ci_build/Dockerfile.jvm_gpu_build b/ops/docker/dockerfile/Dockerfile.jvm_gpu_build similarity index 97% rename from tests/ci_build/Dockerfile.jvm_gpu_build rename to ops/docker/dockerfile/Dockerfile.jvm_gpu_build index edb5918b8bbc..4983493a6878 100644 --- a/tests/ci_build/Dockerfile.jvm_gpu_build +++ b/ops/docker/dockerfile/Dockerfile.jvm_gpu_build @@ -1,4 +1,4 @@ -ARG CUDA_VERSION_ARG +ARG CUDA_VERSION_ARG=notset FROM nvcr.io/nvidia/cuda:$CUDA_VERSION_ARG-devel-rockylinux8 ARG CUDA_VERSION_ARG ARG NCCL_VERSION_ARG @@ -48,7 +48,7 @@ RUN set -ex; \ # Default entry-point to use if running locally # It will preserve attributes of created files -COPY entrypoint.sh /scripts/ +COPY docker/entrypoint.sh /scripts/ WORKDIR /workspace ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/tests/ci_build/Dockerfile.manylinux2014_aarch64 b/ops/docker/dockerfile/Dockerfile.manylinux2014_aarch64 similarity index 82% rename from tests/ci_build/Dockerfile.manylinux2014_aarch64 rename to ops/docker/dockerfile/Dockerfile.manylinux2014_aarch64 index 9627e15c64a0..7800033f552d 100644 --- a/tests/ci_build/Dockerfile.manylinux2014_aarch64 +++ b/ops/docker/dockerfile/Dockerfile.manylinux2014_aarch64 @@ -1,5 +1,7 @@ FROM quay.io/pypa/manylinux2014_aarch64 +RUN yum update -y && yum install -y java-1.8.0-openjdk-devel + # Install lightweight sudo (not bound to TTY) ENV GOSU_VERSION=1.10 RUN set -ex; \ @@ -9,7 +11,7 @@ RUN set -ex; \ # Default entry-point to use if running locally # It will preserve attributes of created files -COPY entrypoint.sh /scripts/ +COPY docker/entrypoint.sh /scripts/ WORKDIR /workspace ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/tests/ci_build/Dockerfile.manylinux2014_x86_64 b/ops/docker/dockerfile/Dockerfile.manylinux2014_x86_64 similarity index 82% rename from tests/ci_build/Dockerfile.manylinux2014_x86_64 rename to ops/docker/dockerfile/Dockerfile.manylinux2014_x86_64 index 11beb116ee43..8214b598d8d4 100644 --- a/tests/ci_build/Dockerfile.manylinux2014_x86_64 +++ b/ops/docker/dockerfile/Dockerfile.manylinux2014_x86_64 @@ -1,5 +1,7 @@ FROM quay.io/pypa/manylinux2014_x86_64 +RUN yum update -y && yum install -y java-1.8.0-openjdk-devel + # Install lightweight sudo (not bound to TTY) ENV GOSU_VERSION=1.10 RUN set -ex; \ @@ -9,7 +11,7 @@ RUN set -ex; \ # Default entry-point to use if running locally # It will preserve attributes of created files -COPY entrypoint.sh /scripts/ +COPY 
docker/entrypoint.sh /scripts/ WORKDIR /workspace ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/tests/ci_build/Dockerfile.manylinux_2_28_x86_64 b/ops/docker/dockerfile/Dockerfile.manylinux_2_28_x86_64 similarity index 92% rename from tests/ci_build/Dockerfile.manylinux_2_28_x86_64 rename to ops/docker/dockerfile/Dockerfile.manylinux_2_28_x86_64 index 5e264e2f16e6..f5dac54b9b8f 100644 --- a/tests/ci_build/Dockerfile.manylinux_2_28_x86_64 +++ b/ops/docker/dockerfile/Dockerfile.manylinux_2_28_x86_64 @@ -9,7 +9,7 @@ RUN set -ex; \ # Default entry-point to use if running locally # It will preserve attributes of created files -COPY entrypoint.sh /scripts/ +COPY docker/entrypoint.sh /scripts/ WORKDIR /workspace ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/tests/ci_build/entrypoint.sh b/ops/docker/entrypoint.sh similarity index 70% rename from tests/ci_build/entrypoint.sh rename to ops/docker/entrypoint.sh index a0c5f56bb52d..40135c197c73 100755 --- a/tests/ci_build/entrypoint.sh +++ b/ops/docker/entrypoint.sh @@ -1,12 +1,10 @@ #!/usr/bin/env bash -# This script is a wrapper creating the same user inside container as the one -# running the ci_build.sh outside the container. It also set the home directory -# for the user inside container to match the same absolute path as the workspace -# outside of container. Do not run this manually. It does not make sense. It is -# intended to be called by ci_build.sh only. +# This wrapper script propagates the user information from the host +# to the container. This way, any files generated by processes running +# in the container will be accessible in the host. -set -e +set -euo pipefail COMMAND=("$@") @@ -19,7 +17,11 @@ else rm /this_is_writable_file_system fi -if [[ -n $CI_BUILD_UID ]] && [[ -n $CI_BUILD_GID ]]; then +## Assumption: the host passes correct user information via environment variables +## CI_BUILD_UID, CI_BUILD_GID, CI_BUILD_USER, CI_BUILD_GROUP + +if [[ -n ${CI_BUILD_UID:-} ]] && [[ -n ${CI_BUILD_GID:-} ]] +then groupadd -o -g "${CI_BUILD_GID}" "${CI_BUILD_GROUP}" || true useradd -o -m -g "${CI_BUILD_GID}" -u "${CI_BUILD_UID}" \ "${CI_BUILD_USER}" || true diff --git a/ops/docker/extract_build_args.jq b/ops/docker/extract_build_args.jq new file mode 100644 index 000000000000..b35240edb626 --- /dev/null +++ b/ops/docker/extract_build_args.jq @@ -0,0 +1,12 @@ +## Example input: +## xgb-ci.gpu_build_r_rockylinux8 +## Example output: +## --build-arg CUDA_VERSION_ARG=12.4.1 --build-arg R_VERSION_ARG=4.3.2 +def compute_build_args($input; $container_id): + $input | + .[$container_id] | + select(.build_args != null) | + .build_args | + to_entries | + map("--build-arg " + .key + "=" + .value) | + join(" "); diff --git a/ops/docker/extract_build_args.sh b/ops/docker/extract_build_args.sh new file mode 100755 index 000000000000..42a83047742c --- /dev/null +++ b/ops/docker/extract_build_args.sh @@ -0,0 +1,26 @@ +#!/bin/bash +## Extract container definition and build args from ops/docker/ci_container.yml, +## given the container ID. 
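For reference: the jq module above (ops/docker/extract_build_args.jq) looks up a container ID in the ci_container.yml mapping and renders its build_args entries as --build-arg flags. Below is a minimal Python sketch of the same transformation, using the sample entry copied from ci_container.yml earlier in this patch; the function name is illustrative and not part of the repository.

    def compute_build_args(containers: dict, container_id: str) -> str:
        """Render one container's build_args as '--build-arg KEY=VALUE' flags."""
        build_args = containers.get(container_id, {}).get("build_args", {})
        return " ".join(f"--build-arg {k}={v}" for k, v in build_args.items())

    containers = {
        "xgb-ci.gpu_build_r_rockylinux8": {
            "container_def": "gpu_build_r_rockylinux8",
            "build_args": {"CUDA_VERSION_ARG": "12.4.1", "R_VERSION_ARG": "4.3.2"},
        }
    }
    print(compute_build_args(containers, "xgb-ci.gpu_build_r_rockylinux8"))
    # --build-arg CUDA_VERSION_ARG=12.4.1 --build-arg R_VERSION_ARG=4.3.2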
+## +## Example input: +## xgb-ci.clang_tidy +## Example output: +## CONTAINER_DEF='clang_tidy' BUILD_ARGS='--build-arg CUDA_VERSION_ARG=12.4.1' + +if [ "$#" -ne 1 ]; then + echo "Usage: $0 [container_id]" + exit 1 +fi + +CONTAINER_ID="$1" +CONTAINER_DEF=$( + yq -o json ops/docker/ci_container.yml | + jq -r --arg container_id "${CONTAINER_ID}" '.[$container_id].container_def' +) +BUILD_ARGS=$( + yq -o json ops/docker/ci_container.yml | + jq -r --arg container_id "${CONTAINER_ID}" \ + 'include "ops/docker/extract_build_args"; + compute_build_args(.; $container_id)' +) +echo "CONTAINER_DEF='${CONTAINER_DEF}' BUILD_ARGS='${BUILD_ARGS}'" diff --git a/ops/docker_build.py b/ops/docker_build.py new file mode 100644 index 000000000000..1fed975ce223 --- /dev/null +++ b/ops/docker_build.py @@ -0,0 +1,137 @@ +""" +Wrapper script to build a Docker container with layer caching +""" + +import argparse +import itertools +import pathlib +import subprocess +import sys +from typing import Optional + +from docker_run import OPS_DIR, fancy_print_cli_args + + +def parse_build_args(raw_build_args: list[str]) -> dict[str, str]: + parsed_build_args = dict() + for arg in raw_build_args: + try: + key, value = arg.split("=", maxsplit=1) + except ValueError as e: + raise ValueError( + f"Build argument must be of form KEY=VALUE. Got: {arg}" + ) from e + parsed_build_args[key] = value + return parsed_build_args + + +def docker_build( + container_id: str, + *, + build_args: dict[str, str], + dockerfile_path: pathlib.Path, + docker_context_path: pathlib.Path, + cache_from: Optional[str], + cache_to: Optional[str], +) -> None: + ## Set up command-line arguments to be passed to `docker build` + # Build args + docker_build_cli_args = list( + itertools.chain.from_iterable( + [["--build-arg", f"{k}={v}"] for k, v in build_args.items()] + ) + ) + # When building an image using a non-default driver, we need to specify + # `--load` to load it to the image store. + # See https://docs.docker.com/build/builders/drivers/ + docker_build_cli_args.append("--load") + # Layer caching + if cache_from: + docker_build_cli_args.extend(["--cache-from", cache_from]) + if cache_to: + docker_build_cli_args.extend(["--cache-to", cache_to]) + # Remaining CLI args + docker_build_cli_args.extend( + [ + "--progress=plain", + "--ulimit", + "nofile=1024000:1024000", + "-t", + container_id, + "-f", + str(dockerfile_path), + str(docker_context_path), + ] + ) + cli_args = ["docker", "build"] + docker_build_cli_args + fancy_print_cli_args(cli_args) + subprocess.run(cli_args, check=True, encoding="utf-8") + + +def main(args: argparse.Namespace) -> None: + # Dockerfile to be used in docker build + dockerfile_path = ( + OPS_DIR / "docker" / "dockerfile" / f"Dockerfile.{args.container_def}" + ) + docker_context_path = OPS_DIR + + build_args = parse_build_args(args.build_arg) + + docker_build( + args.container_id, + build_args=build_args, + dockerfile_path=dockerfile_path, + docker_context_path=docker_context_path, + cache_from=args.cache_from, + cache_to=args.cache_to, + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Build a Docker container") + parser.add_argument( + "--container-def", + type=str, + required=True, + help=( + "String uniquely identifying the container definition. The container " + "definition will be fetched from " + "docker/dockerfile/Dockerfile.CONTAINER_DEF." 
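A quick illustration of how parse_build_args (defined above) treats the --build-arg values: each must be a KEY=VALUE string, split on the first '=', and anything else is rejected with a descriptive error. This is a sketch only, assuming the module above can be imported as docker_build; the version strings come from ci_container.yml and serve purely as examples.

    from docker_build import parse_build_args

    args = parse_build_args(["CUDA_VERSION_ARG=12.4.1", "NCCL_VERSION_ARG=2.23.4-1"])
    assert args == {"CUDA_VERSION_ARG": "12.4.1", "NCCL_VERSION_ARG": "2.23.4-1"}

    try:
        parse_build_args(["CUDA_VERSION_ARG"])  # no '=' -> rejected
    except ValueError as err:
        print(err)  # Build argument must be of form KEY=VALUE. Got: CUDA_VERSION_ARG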
+ ), + ) + parser.add_argument( + "--container-id", + type=str, + required=True, + help="String ID to assign to the newly built container", + ) + parser.add_argument( + "--build-arg", + type=str, + default=[], + action="append", + help=( + "Build-time variable(s) to be passed to `docker build`. Each variable " + "should be specified as a key-value pair in the form KEY=VALUE. " + "The variables should match the ARG instructions in the Dockerfile. " + "When passing multiple variables, specify --build-arg multiple times. " + "Example: --build-arg CUDA_VERSION_ARG=12.5 --build-arg RAPIDS_VERSION_ARG=24.10'" + ), + ) + parser.add_argument( + "--cache-from", + type=str, + help="Use an external cache source for the Docker build", + ) + parser.add_argument( + "--cache-to", + type=str, + help="Export layers from the container to an external cache destination", + ) + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(1) + + parsed_args = parser.parse_args() + main(parsed_args) diff --git a/ops/docker_build.sh b/ops/docker_build.sh new file mode 100755 index 000000000000..7d83daec9574 --- /dev/null +++ b/ops/docker_build.sh @@ -0,0 +1,149 @@ +#!/bin/bash +## Build a CI container and cache the layers in AWS ECR (Elastic Container Registry). +## This script provides a convenient wrapper for ops/docker_build.py. +## Build-time variables (--build-arg) and container defintion are fetched from +## ops/docker/ci_container.yml. +## +## Note. This script takes in some inputs via environment variables. + +USAGE_DOC=$( +cat <<-EOF +Usage: ops/docker_build.sh [container_id] + +In addition, the following environment variables should be set. + - BRANCH_NAME: Name of the current git branch or pull request (Required) + - USE_DOCKER_CACHE: If set to 1, enable caching +EOF +) + +ECR_LIFECYCLE_RULE=$( +cat <<-EOF +{ + "rules": [ + { + "rulePriority": 1, + "selection": { + "tagStatus": "any", + "countType": "sinceImagePushed", + "countUnit": "days", + "countNumber": 30 + }, + "action": { + "type": "expire" + } + } + ] +} +EOF +) + +set -euo pipefail + +for arg in "BRANCH_NAME" +do + if [[ -z "${!arg:-}" ]] + then + echo -e "Error: $arg must be set.\n\n${USAGE_DOC}" + exit 1 + fi +done + +if [[ "$#" -lt 1 ]] +then + echo "${USAGE_DOC}" + exit 2 +fi +CONTAINER_ID="$1" + +# Fetch CONTAINER_DEF and BUILD_ARGS +source <(ops/docker/extract_build_args.sh ${CONTAINER_ID} | tee /dev/stderr) 2>&1 + +if [[ "${USE_DOCKER_CACHE:-}" != "1" ]] # Any value other than 1 is considered false +then + USE_DOCKER_CACHE=0 +fi + +if [[ ${USE_DOCKER_CACHE} -eq 0 ]] +then + echo "USE_DOCKER_CACHE not set; caching disabled" +else + DOCKER_CACHE_ECR_ID=$(yq ".DOCKER_CACHE_ECR_ID" ops/docker/docker_cache_ecr.yml) + DOCKER_CACHE_ECR_REGION=$(yq ".DOCKER_CACHE_ECR_REGION" ops/docker/docker_cache_ecr.yml) + DOCKER_CACHE_REPO="${DOCKER_CACHE_ECR_ID}.dkr.ecr.${DOCKER_CACHE_ECR_REGION}.amazonaws.com" + echo "Using AWS ECR; repo URL = ${DOCKER_CACHE_REPO}" + # Login for Docker registry + echo "aws ecr get-login-password --region ${DOCKER_CACHE_ECR_REGION} |" \ + "docker login --username AWS --password-stdin ${DOCKER_CACHE_REPO}" + aws ecr get-login-password --region ${DOCKER_CACHE_ECR_REGION} \ + | docker login --username AWS --password-stdin ${DOCKER_CACHE_REPO} +fi + +# Pull pre-built container from the cache +# First try locating one for the particular branch or pull request +CACHE_FROM_CMD="" +IS_CACHED=0 +if [[ ${USE_DOCKER_CACHE} -eq 1 ]] +then + DOCKER_TAG="${BRANCH_NAME//\//-}" # Slashes are not allowed in Docker tag + 
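The tag computed above is simply BRANCH_NAME with every '/' replaced by '-' (slashes are not legal in Docker tags), and the cached image lives under the ECR registry described in ops/docker/docker_cache_ecr.yml. A hedged Python sketch of the URL the script tries to pull first; the registry constants are copied from docker_cache_ecr.yml, and the branch name is a made-up example.

    def cache_image_url(container_id: str, branch_name: str,
                        ecr_id: str = "492475357299", region: str = "us-west-2") -> str:
        """Mirror docker_build.sh: <ECR_ID>.dkr.ecr.<REGION>.amazonaws.com/<ID>:<TAG>."""
        repo = f"{ecr_id}.dkr.ecr.{region}.amazonaws.com"
        tag = branch_name.replace("/", "-")  # same substitution as ${BRANCH_NAME//\//-}
        return f"{repo}/{container_id}:{tag}"

    print(cache_image_url("xgb-ci.gpu", "release/2.1"))
    # 492475357299.dkr.ecr.us-west-2.amazonaws.com/xgb-ci.gpu:release-2.1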
DOCKER_URL="${DOCKER_CACHE_REPO}/${CONTAINER_ID}:${DOCKER_TAG}" + echo "docker pull --quiet ${DOCKER_URL}" + if time docker pull --quiet "${DOCKER_URL}" + then + echo "Found a cached container for the branch ${BRANCH_NAME}: ${DOCKER_URL}" + IS_CACHED=1 + else + # If there's no pre-built container from the cache, + # use the pre-built container from the master branch. + DOCKER_URL="${DOCKER_CACHE_REPO}/${CONTAINER_ID}:master" + echo "Could not find a cached container for the branch ${BRANCH_NAME}." \ + "Using a cached container from the master branch: ${DOCKER_URL}" + echo "docker pull --quiet ${DOCKER_URL}" + if time docker pull --quiet "${DOCKER_URL}" + then + IS_CACHED=1 + else + echo "Could not find a cached container for the master branch either." + IS_CACHED=0 + fi + fi + if [[ $IS_CACHED -eq 1 ]] + then + CACHE_FROM_CMD="--cache-from type=registry,ref=${DOCKER_URL}" + fi +fi + +# Run Docker build +set -x +python3 ops/docker_build.py \ + --container-def ${CONTAINER_DEF} \ + --container-id ${CONTAINER_ID} \ + ${BUILD_ARGS} \ + --cache-to type=inline \ + ${CACHE_FROM_CMD} +set +x + +# Now cache the new container +if [[ ${USE_DOCKER_CACHE} -eq 1 ]] +then + DOCKER_URL="${DOCKER_CACHE_REPO}/${CONTAINER_ID}:${DOCKER_TAG}" + echo "docker tag ${CONTAINER_ID} ${DOCKER_URL}" + docker tag "${CONTAINER_ID}" "${DOCKER_URL}" + + # Attempt to create Docker repository; it will fail if the repository already exists + echo "aws ecr create-repository --repository-name ${CONTAINER_ID} --region ${DOCKER_CACHE_ECR_REGION}" + if aws ecr create-repository --repository-name ${CONTAINER_ID} --region ${DOCKER_CACHE_ECR_REGION} + then + # Repository was created. Now set expiration policy + echo "aws ecr put-lifecycle-policy --repository-name ${CONTAINER_ID}" \ + "--region ${DOCKER_CACHE_ECR_REGION} --lifecycle-policy-text file:///dev/stdin" + echo "${ECR_LIFECYCLE_RULE}" | aws ecr put-lifecycle-policy --repository-name ${CONTAINER_ID} \ + --region ${DOCKER_CACHE_ECR_REGION} --lifecycle-policy-text file:///dev/stdin + fi + + echo "docker push --quiet ${DOCKER_URL}" + if ! 
time docker push --quiet "${DOCKER_URL}" + then + echo "ERROR: could not update Docker cache ${DOCKER_URL}" + exit 1 + fi +fi diff --git a/ops/docker_run.py b/ops/docker_run.py new file mode 100644 index 000000000000..7e61c5a14f39 --- /dev/null +++ b/ops/docker_run.py @@ -0,0 +1,168 @@ +""" +Wrapper script to run a command inside a Docker container +""" + +import argparse +import grp +import itertools +import os +import pathlib +import pwd +import subprocess +import sys +import textwrap + +OPS_DIR = pathlib.Path(__file__).expanduser().resolve().parent +PROJECT_ROOT_DIR = OPS_DIR.parent +LINEWIDTH = 88 +TEXT_WRAPPER = textwrap.TextWrapper( + width=LINEWIDTH, + initial_indent="", + subsequent_indent=" ", + break_long_words=False, + break_on_hyphens=False, +) + + +def parse_run_args(raw_run_args: str) -> list[str]: + return [x for x in raw_run_args.split() if x] + + +def get_user_ids() -> dict[str, str]: + uid = os.getuid() + gid = os.getgid() + return { + "CI_BUILD_UID": str(uid), + "CI_BUILD_USER": pwd.getpwuid(uid).pw_name, + "CI_BUILD_GID": str(gid), + "CI_BUILD_GROUP": grp.getgrgid(gid).gr_name, + } + + +def fancy_print_cli_args(cli_args: list[str]) -> None: + print( + "=" * LINEWIDTH + + "\n" + + " \\\n".join(TEXT_WRAPPER.wrap(" ".join(cli_args))) + + "\n" + + "=" * LINEWIDTH + + "\n", + flush=True, + ) + + +def docker_run( + container_id: str, + command_args: list[str], + *, + use_gpus: bool, + workdir: pathlib.Path, + user_ids: dict[str, str], + extra_args: list[str], +) -> None: + # Command-line arguments to be passed to `docker run` + docker_run_cli_args = ["--rm", "--pid=host"] + + if use_gpus: + docker_run_cli_args.extend(["--gpus", "all"]) + + docker_run_cli_args.extend(["-v", f"{workdir}:/workspace", "-w", "/workspace"]) + docker_run_cli_args.extend( + itertools.chain.from_iterable([["-e", f"{k}={v}"] for k, v in user_ids.items()]) + ) + docker_run_cli_args.extend(extra_args) + docker_run_cli_args.append(container_id) + docker_run_cli_args.extend(command_args) + + cli_args = ["docker", "run"] + docker_run_cli_args + fancy_print_cli_args(cli_args) + subprocess.run(cli_args, check=True, encoding="utf-8") + + +def main(args: argparse.Namespace) -> None: + run_args = parse_run_args(args.run_args) + user_ids = get_user_ids() + + if args.use_gpus: + print("Using NVIDIA GPUs for `docker run`") + if args.interactive: + print("Using interactive mode for `docker run`") + run_args.append("-it") + + docker_run( + args.container_id, + args.command_args, + use_gpus=args.use_gpus, + workdir=args.workdir, + user_ids=user_ids, + extra_args=run_args, + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + usage=( + f"{sys.argv[0]} --container-id CONTAINER_ID [--use-gpus] [--interactive] " + "[--workdir WORKDIR] [--run-args RUN_ARGS] -- COMMAND_ARG " + "[COMMAND_ARG ...]" + ), + description="Run tasks inside a Docker container", + ) + parser.add_argument( + "--container-id", + type=str, + required=True, + help="String ID of the container to run.", + ) + parser.add_argument( + "--use-gpus", + action="store_true", + help=( + "Grant the container access to NVIDIA GPUs; requires the NVIDIA " + "Container Toolkit." + ), + ) + parser.add_argument( + "--interactive", + action="store_true", + help=( + "Run the container in the interactive mode; requires an interactive shell " + "(TTY). With this flag, you can use Ctrl-C to interrupt an long-running " + "command." 
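For context on the user-id plumbing above: the CI_BUILD_* variables returned by get_user_ids() are passed to the container as -e flags and consumed by ops/docker/entrypoint.sh, which recreates the same UID/GID inside the container so that files written under /workspace stay owned by the host user. A minimal sketch of how those flags expand, assuming a hypothetical host user 'ubuntu' with UID and GID 1000.

    import itertools

    user_ids = {
        "CI_BUILD_UID": "1000", "CI_BUILD_USER": "ubuntu",
        "CI_BUILD_GID": "1000", "CI_BUILD_GROUP": "ubuntu",
    }
    env_flags = list(itertools.chain.from_iterable(
        ["-e", f"{k}={v}"] for k, v in user_ids.items()
    ))
    print(" ".join(env_flags))
    # -e CI_BUILD_UID=1000 -e CI_BUILD_USER=ubuntu -e CI_BUILD_GID=1000 -e CI_BUILD_GROUP=ubuntu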
+ ), + ) + parser.add_argument( + "--workdir", + type=lambda p: pathlib.Path(p).expanduser().resolve(), + default=PROJECT_ROOT_DIR, + help="Path to working directory; if unset, use the project's root", + ) + parser.add_argument( + "--run-args", + type=str, + default="", + help=( + "Argument(s) to be passed to `docker run`. When passing multiple " + "arguments, use single quotes to wrap them. Example: " + "--run-args '--cap-add SYS_PTRACE --shm-size=4g'" + ), + ) + parser.add_argument( + "command_args", + metavar="COMMAND_ARG", + type=str, + nargs="+", + help=( + "Argument(s) for the command to execute. NOTE. Make sure to specify " + "double-dash (--) to clearly distinguish between the command and the " + "preceding parameters. Example: --run-args '--cap-add SYS_PTRACE " + "--shm-size=4g' -- ./myprog" + ), + ) + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(1) + + parsed_args = parser.parse_args() + main(parsed_args) diff --git a/ops/packer/linux/bootstrap.sh b/ops/packer/linux/bootstrap.sh new file mode 100644 index 000000000000..57be6e14b507 --- /dev/null +++ b/ops/packer/linux/bootstrap.sh @@ -0,0 +1,52 @@ +#!/bin/bash +set -euo pipefail + +## Install Docker +# Add Docker's official GPG key: +sudo install -m 0755 -d /etc/apt/keyrings +sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc +sudo chmod a+r /etc/apt/keyrings/docker.asc +# Add the repository to Apt sources: +echo \ + "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \ + $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \ + sudo tee /etc/apt/sources.list.d/docker.list > /dev/null +sudo apt-get update +sudo apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin +# Allow users to use Docker without sudo +sudo usermod -aG docker ubuntu + +# Start Docker daemon +sudo systemctl is-active --quiet docker.service || sudo systemctl start docker.service +sudo systemctl is-enabled --quiet docker.service || sudo systemctl enable docker.service +sleep 10 # Docker daemon takes time to come up after installing +sudo docker info + +## Install NVIDIA Container Toolkit +curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \ + && curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \ + sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \ + sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list +sudo apt-get update +sudo apt-get install -y nvidia-container-toolkit +sudo nvidia-ctk runtime configure --runtime=docker +sudo systemctl restart docker + +sleep 10 +sudo docker run --rm --gpus all ubuntu nvidia-smi +sudo systemctl stop docker + +## Install AWS CLI v2 +wget -nv https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip -O awscliv2.zip +unzip -q awscliv2.zip +sudo ./aws/install +rm -rf ./aws/ ./awscliv2.zip + +## Install jq and yq +sudo apt update && sudo apt install jq +mkdir yq/ +pushd yq/ +wget -nv https://github.com/mikefarah/yq/releases/download/v4.44.3/yq_linux_amd64.tar.gz -O - | \ + tar xz && sudo mv ./yq_linux_amd64 /usr/bin/yq +popd +rm -rf yq/ diff --git a/ops/packer/linux/install_drivers.sh b/ops/packer/linux/install_drivers.sh new file mode 100644 index 000000000000..07309be836a8 --- /dev/null +++ b/ops/packer/linux/install_drivers.sh @@ -0,0 +1,14 @@ 
+#!/bin/bash +set -euo pipefail + +## Install basic tools +echo 'debconf debconf/frontend select Noninteractive' | sudo debconf-set-selections +sudo apt-get update +sudo apt-get install -y cmake git build-essential wget ca-certificates curl unzip + +## Install CUDA Toolkit 12.6 (Driver will be installed later) +wget -nv https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-keyring_1.1-1_all.deb +sudo dpkg -i cuda-keyring_1.1-1_all.deb +sudo apt-get update +sudo apt-get -y install cuda-toolkit-12-6 cuda-drivers-565 +rm cuda-keyring_1.1-1_all.deb diff --git a/ops/packer/linux/linux.pkr.hcl b/ops/packer/linux/linux.pkr.hcl new file mode 100644 index 000000000000..c6990894764a --- /dev/null +++ b/ops/packer/linux/linux.pkr.hcl @@ -0,0 +1,79 @@ +packer { + required_plugins { + amazon = { + source = "github.com/hashicorp/amazon" + version = "~> 1" + } + } +} + +locals { + ami_name_prefix = "xgboost-ci" + image_name = "RunsOn worker with Ubuntu 24.04 + CUDA driver" + region = "us-west-2" + timestamp = regex_replace(timestamp(), "[- TZ:]", "") + volume_size = 40 +} + +data "amazon-ami" "aws-ubuntu-x64" { + filters = { + name = "ubuntu/images/hvm-ssd-gp3/ubuntu-noble-24.04-amd64-server-*" + root-device-type = "ebs" + virtualization-type = "hvm" + } + most_recent = true + owners = ["amazon"] +} + +source "amazon-ebs" "runs-on-linux" { + source_ami = "${data.amazon-ami.aws-ubuntu-x64.id}" + ami_name = "${local.ami_name_prefix}-runs-on-linux-${local.timestamp}" + ami_description = "${local.image_name}" + ami_regions = ["${local.region}"] + ami_virtualization_type = "hvm" + associate_public_ip_address = true + communicator = "ssh" + instance_type = "g4dn.xlarge" + region = "${local.region}" + ssh_timeout = "10m" + ssh_username = "ubuntu" + ssh_file_transfer_method = "sftp" + user_data_file = "setup_ssh.sh" + launch_block_device_mappings { + device_name = "/dev/sda1" + volume_size = "${local.volume_size}" + volume_type = "gp3" + delete_on_termination = true + } + aws_polling { # Wait up to 1 hour until the AMI is ready + delay_seconds = 15 + max_attempts = 240 + } + snapshot_tags = { + Name = "${local.image_name}" + BuildTime = "${local.timestamp}" + } + tags = { + Name = "${local.image_name}" + BuildTime = "${local.timestamp}" + } +} + +build { + sources = ["source.amazon-ebs.runs-on-linux"] + + provisioner "shell" { + script = "install_drivers.sh" + pause_after = "30s" + } + + provisioner "shell" { + expect_disconnect = true + inline = ["echo 'Reboot VM'", "sudo reboot"] + } + + provisioner "shell" { + pause_before = "1m0s" + script = "bootstrap.sh" + } +} diff --git a/ops/packer/linux/setup_ssh.sh b/ops/packer/linux/setup_ssh.sh new file mode 100644 index 000000000000..501b4da455f5 --- /dev/null +++ b/ops/packer/linux/setup_ssh.sh @@ -0,0 +1,2 @@ +#!/bin/bash +systemctl start ssh diff --git a/ops/packer/windows/bootstrap.ps1 b/ops/packer/windows/bootstrap.ps1 new file mode 100644 index 000000000000..c67f3b73fb9a --- /dev/null +++ b/ops/packer/windows/bootstrap.ps1 @@ -0,0 +1,73 @@ +## Install packages from Chocolatey + +# jq & yq +Write-Output "Installing jq and yq..." +choco install jq --version=1.7.1 +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } +choco install yq --version=4.40.2 +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } + +# AWS CLI +Write-Output "Installing AWS CLI..." +choco install awscli --version=2.18.11 +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } + +# Git +Write-Host '>>> Installing Git...' 
+choco install git --version=2.47.0 +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } + +# CMake +Write-Host '>>> Installing CMake 3.30.5...' +choco install cmake --version 3.30.5 --installargs "ADD_CMAKE_TO_PATH=System" +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } + +# Notepad++ +Write-Host '>>> Installing Notepad++...' +choco install notepadplusplus +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } + +# Miniforge3 +Write-Host '>>> Installing Miniforge3...' +choco install miniforge3 --params="'/InstallationType:AllUsers /RegisterPython:1 /D:C:\tools\miniforge3'" +C:\tools\miniforge3\Scripts\conda.exe init --user --system +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } +. "C:\Windows\System32\WindowsPowerShell\v1.0\profile.ps1" +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } +conda config --set auto_activate_base false + +# Java 11 +Write-Host '>>> Installing Java 11...' +choco install openjdk11 +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } + +# Maven +Write-Host '>>> Installing Maven...' +choco install maven +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } + +# GraphViz +Write-Host '>>> Installing GraphViz...' +choco install graphviz +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } + +# Visual Studio 2022 Community +Write-Host '>>> Installing Visual Studio 2022 Community...' +choco install visualstudio2022community ` + --params "--wait --passive --norestart" +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } +choco install visualstudio2022-workload-nativedesktop --params ` + "--wait --passive --norestart --includeOptional" +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } + +# CUDA 12.5 +Write-Host '>>> Installing CUDA 12.5...' +choco install cuda --version=12.5.1.555 +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } + +# R 4.3 +Write-Host '>>> Installing R...' 
+choco install r.project --version=4.3.2 +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } +choco install rtools --version=4.3.5550 +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } diff --git a/ops/packer/windows/install_choco.ps1 b/ops/packer/windows/install_choco.ps1 new file mode 100644 index 000000000000..131e8129feaa --- /dev/null +++ b/ops/packer/windows/install_choco.ps1 @@ -0,0 +1,14 @@ +## Adopted from https://github.com/chorrell/packer-aws-windows-openssh/blob/20c40aa60b54469b3d85650a2e2e45e35ed83bc7/files/InstallChoco.ps1 +## Author: Christopher Horrell (https://github.com/chorrell) + +$ErrorActionPreference = "Stop" + +# Install Chocolatey +# See https://chocolatey.org/install#individual +Set-ExecutionPolicy Bypass -Scope Process -Force +[System.Net.ServicePointManager]::SecurityProtocol = [System.Net.ServicePointManager]::SecurityProtocol -bor 3072 +Invoke-Expression ((New-Object System.Net.WebClient).DownloadString("https://community.chocolatey.org/install.ps1")) + +# Globally Auto confirm every action +# See: https://docs.chocolatey.org/en-us/faqs#why-do-i-have-to-confirm-packages-now-is-there-a-way-to-remove-this +choco feature enable -n allowGlobalConfirmation diff --git a/ops/packer/windows/setup_ssh.ps1 b/ops/packer/windows/setup_ssh.ps1 new file mode 100644 index 000000000000..a7bdee898002 --- /dev/null +++ b/ops/packer/windows/setup_ssh.ps1 @@ -0,0 +1,58 @@ + +## Adopted from https://github.com/chorrell/packer-aws-windows-openssh/blob/20c40aa60b54469b3d85650a2e2e45e35ed83bc7/files/SetupSsh.ps1 +## Author: Christopher Horrell (https://github.com/chorrell) + +# Don't display progress bars +# See: https://learn.microsoft.com/en-us/powershell/module/microsoft.powershell.core/about/about_preference_variables?view=powershell-7.3#progresspreference +$ProgressPreference = "SilentlyContinue" +$ErrorActionPreference = "Stop" + +# Install OpenSSH using Add-WindowsCapability +# See: https://learn.microsoft.com/en-us/windows-server/administration/openssh/openssh_install_firstuse?tabs=powershell#install-openssh-for-windows + +Write-Host "Installing and starting ssh-agent" +Add-WindowsCapability -Online -Name OpenSSH.Client~~~~0.0.1.0 +Set-Service -Name ssh-agent -StartupType Automatic +Start-Service ssh-agent + +Write-Host "Installing and starting sshd" +Add-WindowsCapability -Online -Name OpenSSH.Server~~~~0.0.1.0 +Set-Service -Name sshd -StartupType Automatic +Start-Service sshd + +# Confirm the Firewall rule is configured. It should be created automatically by setup. Run the following to verify +if (!(Get-NetFirewallRule -Name "OpenSSH-Server-In-TCP" -ErrorAction SilentlyContinue | Select-Object Name, Enabled)) { + Write-Output "Firewall Rule 'OpenSSH-Server-In-TCP' does not exist, creating it..." + New-NetFirewallRule -Name "OpenSSH-Server-In-TCP" -DisplayName "OpenSSH Server (sshd)" -Enabled True -Direction Inbound -Protocol TCP -Action Allow -LocalPort 22 +} else { + Write-Output "Firewall rule 'OpenSSH-Server-In-TCP' has been created and exists." 
+} + +# Set default shell to Powershell +New-ItemProperty -Path "HKLM:\SOFTWARE\OpenSSH" -Name DefaultShell -Value "C:\Windows\System32\WindowsPowerShell\v1.0\powershell.exe" -PropertyType String -Force + +$keyDownloadScript = Join-Path $env:ProgramData "ssh\download-key.ps1" + +@' +# Download private key to $env:ProgramData\ssh\administrators_authorized_keys +$openSSHAuthorizedKeys = Join-Path $env:ProgramData "ssh\administrators_authorized_keys" + +$keyUrl = "http://169.254.169.254/latest/meta-data/public-keys/0/openssh-key" +Invoke-WebRequest $keyUrl -OutFile $openSSHAuthorizedKeys + +# Ensure ACL for administrators_authorized_keys is correct +# See https://learn.microsoft.com/en-us/windows-server/administration/openssh/openssh_server_configuration#authorizedkeysfile +icacls.exe $openSSHAuthorizedKeys /inheritance:r /grant "Administrators:F" /grant "SYSTEM:F" +'@ | Out-File $keyDownloadScript + +# Create Task +$taskName = "DownloadKey" +$principal = New-ScheduledTaskPrincipal -UserID "NT AUTHORITY\SYSTEM" -LogonType ServiceAccount -RunLevel Highest +$action = New-ScheduledTaskAction -Execute "Powershell.exe" -Argument "-NoProfile -File ""$keyDownloadScript""" +$trigger = New-ScheduledTaskTrigger -AtStartup +Register-ScheduledTask -Action $action -Trigger $trigger -Principal $principal -TaskName $taskName -Description $taskName + +# Fetch key via $keyDownloadScript +& Powershell.exe -ExecutionPolicy Bypass -File $keyDownloadScript + + diff --git a/ops/packer/windows/sysprep.ps1 b/ops/packer/windows/sysprep.ps1 new file mode 100644 index 000000000000..a0470309f9da --- /dev/null +++ b/ops/packer/windows/sysprep.ps1 @@ -0,0 +1,14 @@ +## Adopted from https://github.com/chorrell/packer-aws-windows-openssh/blob/20c40aa60b54469b3d85650a2e2e45e35ed83bc7/files/PrepareImage.ps1 +## Author: Christopher Horrell (https://github.com/chorrell) + +$ErrorActionPreference = "Stop" + +Write-Output "Cleaning up keys" +$openSSHAuthorizedKeys = Join-Path $env:ProgramData "ssh\administrators_authorized_keys" +Remove-Item -Recurse -Force -Path $openSSHAuthorizedKeys + +# Make sure task is enabled +Enable-ScheduledTask "DownloadKey" + +Write-Output "Running Sysprep" +& "$Env:Programfiles\Amazon\EC2Launch\ec2launch.exe" sysprep diff --git a/ops/packer/windows/windows.pkr.hcl b/ops/packer/windows/windows.pkr.hcl new file mode 100644 index 000000000000..4c14b7b75806 --- /dev/null +++ b/ops/packer/windows/windows.pkr.hcl @@ -0,0 +1,90 @@ +packer { + required_plugins { + amazon = { + source = "github.com/hashicorp/amazon" + version = "~> 1" + } + windows-update = { + version = "0.15.0" + source = "github.com/rgl/windows-update" + } + } +} + +locals { + ami_name_prefix = "xgboost-ci" + image_name = "RunsOn worker with Windows Server 2022 + ssh + CUDA driver" + region = "us-west-2" + timestamp = regex_replace(timestamp(), "[- TZ:]", "") + volume_size = 120 +} + +data "amazon-ami" "aws-windows-x64" { + filters = { + name = "Windows_Server-2022-English-Full-Base-*" + root-device-type = "ebs" + virtualization-type = "hvm" + } + most_recent = true + owners = ["amazon"] +} + +source "amazon-ebs" "runs-on-windows" { + source_ami = "${data.amazon-ami.aws-windows-x64.id}" + ami_name = "${local.ami_name_prefix}-runs-on-windows-${local.timestamp}" + ami_description = "${local.image_name}" + ami_regions = ["${local.region}"] + ami_virtualization_type = "hvm" + associate_public_ip_address = true + communicator = "ssh" + instance_type = "g4dn.xlarge" + region = "${local.region}" + ssh_timeout = "10m" + ssh_username = 
"Administrator" + ssh_file_transfer_method = "sftp" + user_data_file = "setup_ssh.ps1" + launch_block_device_mappings { + device_name = "/dev/sda1" + volume_size = "${local.volume_size}" + volume_type = "gp3" + delete_on_termination = true + } + aws_polling { # Wait up to 2.5 hours until the AMI is ready + delay_seconds = 15 + max_attempts = 600 + } + fast_launch { + enable_fast_launch = true + target_resource_count = 10 + } + snapshot_tags = { + Name = "${local.image_name}" + BuildTime = "${local.timestamp}" + } + tags = { + Name = "${local.image_name}" + BuildTime = "${local.timestamp}" + } +} + +build { + sources = ["source.amazon-ebs.runs-on-windows"] + + provisioner "windows-update" {} + + provisioner "powershell" { + script = "install_choco.ps1" + } + + provisioner "windows-restart" { + max_retries = 3 + } + + provisioner "powershell" { + script = "bootstrap.ps1" + } + + provisioner "powershell" { # Sysprep should run the last + script = "sysprep.ps1" + } +} diff --git a/tests/buildkite/cpu_only_pypkg.patch b/ops/patch/cpu_only_pypkg.patch similarity index 100% rename from tests/buildkite/cpu_only_pypkg.patch rename to ops/patch/cpu_only_pypkg.patch diff --git a/tests/buildkite/manylinux2014_warning.patch b/ops/patch/manylinux2014_warning.patch similarity index 100% rename from tests/buildkite/manylinux2014_warning.patch rename to ops/patch/manylinux2014_warning.patch diff --git a/tests/buildkite/remove_nccl_dep.patch b/ops/patch/remove_nccl_dep.patch similarity index 100% rename from tests/buildkite/remove_nccl_dep.patch rename to ops/patch/remove_nccl_dep.patch diff --git a/ops/pipeline/build-cpu-arm64.sh b/ops/pipeline/build-cpu-arm64.sh new file mode 100755 index 000000000000..ff948ca0c77a --- /dev/null +++ b/ops/pipeline/build-cpu-arm64.sh @@ -0,0 +1,52 @@ +#!/bin/bash + +set -euox pipefail + +if [[ -z "${GITHUB_SHA:-}" ]] +then + echo "Make sure to set environment variable GITHUB_SHA" + exit 1 +fi + +WHEEL_TAG=manylinux_2_28_aarch64 + +echo "--- Build CPU code targeting ARM64" + +echo "--- Build libxgboost from the source" +python3 ops/docker_run.py \ + --container-id xgb-ci.aarch64 \ + -- ops/script/build_via_cmake.sh \ + --conda-env=aarch64_test \ + -DUSE_OPENMP=ON \ + -DHIDE_CXX_SYMBOL=ON + +echo "--- Run Google Test" +python3 ops/docker_run.py \ + --container-id xgb-ci.aarch64 \ + -- bash -c "cd build && ctest --extra-verbose" + +echo "--- Build binary wheel" +python3 ops/docker_run.py \ + --container-id xgb-ci.aarch64 \ + -- bash -c \ + "cd python-package && rm -rf dist/* && pip wheel --no-deps -v . 
--wheel-dir dist/" +python3 ops/script/rename_whl.py \ + --wheel-path python-package/dist/*.whl \ + --commit-hash ${GITHUB_SHA} \ + --platform-tag ${WHEEL_TAG} + +echo "--- Audit binary wheel to ensure it's compliant with ${WHEEL_TAG} standard" +python3 ops/docker_run.py \ + --container-id xgb-ci.aarch64 \ + -- auditwheel repair --plat ${WHEEL_TAG} python-package/dist/*.whl +python3 ops/script/rename_whl.py \ + --wheel-path wheelhouse/*.whl \ + --commit-hash ${GITHUB_SHA} \ + --platform-tag ${WHEEL_TAG} +mv -v wheelhouse/*.whl python-package/dist/ + +# Make sure that libgomp.so is vendored in the wheel +python3 ops/docker_run.py \ + --container-id xgb-ci.aarch64 \ + -- bash -c \ + "unzip -l python-package/dist/*.whl | grep libgomp || exit -1" diff --git a/ops/pipeline/build-cpu.sh b/ops/pipeline/build-cpu.sh new file mode 100755 index 000000000000..dc0572f0ca4d --- /dev/null +++ b/ops/pipeline/build-cpu.sh @@ -0,0 +1,41 @@ +#!/bin/bash + +set -euox pipefail + +echo "--- Build CPU code" + +# This step is not necessary, but here we include it, to ensure that +# DMLC_CORE_USE_CMAKE flag is correctly propagated. We want to make sure that we use +# the configured header build/dmlc/build_config.h instead of +# include/dmlc/build_config_default.h. +rm -fv dmlc-core/include/dmlc/build_config_default.h + +# Sanitizer tests +echo "--- Run Google Test with sanitizer enabled" +# Work around https://github.com/google/sanitizers/issues/1614 +sudo sysctl vm.mmap_rnd_bits=28 +python3 ops/docker_run.py \ + --container-id xgb-ci.cpu \ + -- ops/script/build_via_cmake.sh \ + -DUSE_SANITIZER=ON \ + -DENABLED_SANITIZERS="address;leak;undefined" \ + -DCMAKE_BUILD_TYPE=Debug \ + -DSANITIZER_PATH=/usr/lib/x86_64-linux-gnu/ +python3 ops/docker_run.py \ + --container-id xgb-ci.cpu \ + --run-args '-e ASAN_SYMBOLIZER_PATH=/usr/bin/llvm-symbolizer + -e ASAN_OPTIONS=symbolize=1 + -e UBSAN_OPTIONS=print_stacktrace=1:log_path=ubsan_error.log + --cap-add SYS_PTRACE' \ + -- bash -c \ + "cd build && ./testxgboost --gtest_filter=-*DeathTest*" + +echo "--- Run Google Test" +python3 ops/docker_run.py \ + --container-id xgb-ci.cpu \ + -- ops/script/build_via_cmake.sh \ + -DCMAKE_PREFIX_PATH=/opt/grpc \ + -DPLUGIN_FEDERATED=ON +python3 ops/docker_run.py \ + --container-id xgb-ci.cpu \ + -- bash -c "cd build && ctest --extra-verbose" diff --git a/ops/pipeline/build-cuda-with-rmm.sh b/ops/pipeline/build-cuda-with-rmm.sh new file mode 100755 index 000000000000..479c9a1b1a28 --- /dev/null +++ b/ops/pipeline/build-cuda-with-rmm.sh @@ -0,0 +1,74 @@ +#!/bin/bash +## Build XGBoost with CUDA + RMM support + +set -euo pipefail + +if [[ -z "${GITHUB_SHA:-}" ]] +then + echo "Make sure to set environment variable GITHUB_SHA" + exit 1 +fi + +if [[ "$#" -lt 1 ]] +then + echo "Usage: $0 [container_id]" + exit 1 +fi +container_id="$1" + +source ops/pipeline/classify-git-branch.sh + +set -x + +WHEEL_TAG=manylinux_2_28_x86_64 + +echo "--- Build with CUDA with RMM" + +if [[ ($is_pull_request == 1) || ($is_release_branch == 0) ]] +then + arch_flag="-DGPU_COMPUTE_VER=75" +else + arch_flag="" +fi + +echo "--- Build libxgboost from the source" +python3 ops/docker_run.py \ + --container-id "${container_id}" \ + -- ops/script/build_via_cmake.sh \ + -DCMAKE_PREFIX_PATH="/opt/grpc;/opt/rmm;/opt/rmm/lib64/rapids/cmake" \ + -DUSE_CUDA=ON \ + -DUSE_OPENMP=ON \ + -DHIDE_CXX_SYMBOLS=ON \ + -DPLUGIN_FEDERATED=ON \ + -DPLUGIN_RMM=ON \ + -DUSE_NCCL=ON \ + -DUSE_NCCL_LIB_PATH=ON \ + -DNCCL_INCLUDE_DIR=/usr/include \ + -DUSE_DLOPEN_NCCL=ON \ + ${arch_flag} + +echo 
"--- Build binary wheel" +python3 ops/docker_run.py \ + --container-id "${container_id}" \ + -- bash -c \ + "cd python-package && rm -rf dist/* && pip wheel --no-deps -v . --wheel-dir dist/" +python3 ops/script/rename_whl.py \ + --wheel-path python-package/dist/*.whl \ + --commit-hash ${GITHUB_SHA} \ + --platform-tag ${WHEEL_TAG} + +echo "--- Audit binary wheel to ensure it's compliant with ${WHEEL_TAG} standard" +python3 ops/docker_run.py \ + --container-id xgb-ci.${WHEEL_TAG} \ + -- auditwheel repair \ + --plat ${WHEEL_TAG} python-package/dist/*.whl +python3 ops/script/rename_whl.py \ + --wheel-path wheelhouse/*.whl \ + --commit-hash ${GITHUB_SHA} \ + --platform-tag ${WHEEL_TAG} +mv -v wheelhouse/*.whl python-package/dist/ +# Make sure that libgomp.so is vendored in the wheel +python3 ops/docker_run.py \ + --container-id xgb-ci.${WHEEL_TAG} \ + -- bash -c \ + "unzip -l python-package/dist/*.whl | grep libgomp || exit -1" diff --git a/ops/pipeline/build-cuda.sh b/ops/pipeline/build-cuda.sh new file mode 100755 index 000000000000..49475c01c69e --- /dev/null +++ b/ops/pipeline/build-cuda.sh @@ -0,0 +1,85 @@ +#!/bin/bash +## Build XGBoost with CUDA + +set -euox pipefail + +if [[ -z "${GITHUB_SHA:-}" ]] +then + echo "Make sure to set environment variable GITHUB_SHA" + exit 1 +fi + +WHEEL_TAG=manylinux_2_28_x86_64 + +source ops/pipeline/classify-git-branch.sh + +echo "--- Build with CUDA" + +if [[ ($is_pull_request == 1) || ($is_release_branch == 0) ]] +then + arch_flag="-DGPU_COMPUTE_VER=75" +else + arch_flag="" +fi + +echo "--- Build libxgboost from the source" +set -x +# Work around https://github.com/NVIDIA/cccl/issues/1956 +# TODO(hcho3): Remove this once new CUDA version ships with CCCL 2.6.0+ +git clone https://github.com/NVIDIA/cccl.git -b v2.6.1 --quiet +python3 ops/docker_run.py \ + --container-id xgb-ci.gpu_build_rockylinux8 \ + -- ops/script/build_via_cmake.sh \ + -DCMAKE_PREFIX_PATH="/opt/grpc;/workspace/cccl" \ + -DUSE_CUDA=ON \ + -DUSE_OPENMP=ON \ + -DHIDE_CXX_SYMBOLS=ON \ + -DPLUGIN_FEDERATED=ON \ + -DUSE_NCCL=ON \ + -DUSE_NCCL_LIB_PATH=ON \ + -DNCCL_INCLUDE_DIR=/usr/include \ + -DUSE_DLOPEN_NCCL=ON \ + ${arch_flag} + +echo "--- Build binary wheel" +python3 ops/docker_run.py \ + --container-id xgb-ci.gpu_build_rockylinux8 \ + -- bash -c \ + "cd python-package && rm -rf dist/* && pip wheel --no-deps -v . 
--wheel-dir dist/" +python3 ops/script/rename_whl.py \ + --wheel-path python-package/dist/*.whl \ + --commit-hash ${GITHUB_SHA} \ + --platform-tag ${WHEEL_TAG} + +echo "--- Audit binary wheel to ensure it's compliant with ${WHEEL_TAG} standard" +python3 ops/docker_run.py \ + --container-id xgb-ci.manylinux_2_28_x86_64 \ + -- auditwheel repair \ + --plat ${WHEEL_TAG} python-package/dist/*.whl +python3 ops/script/rename_whl.py \ + --wheel-path wheelhouse/*.whl \ + --commit-hash ${GITHUB_SHA} \ + --platform-tag ${WHEEL_TAG} +mv -v wheelhouse/*.whl python-package/dist/ +# Make sure that libgomp.so is vendored in the wheel +python3 ops/docker_run.py \ + --container-id xgb-ci.manylinux_2_28_x86_64 \ + -- bash -c "unzip -l python-package/dist/*.whl | grep libgomp || exit -1" + +# Generate the meta info which includes xgboost version and the commit info +python3 ops/docker_run.py \ +--container-id xgb-ci.gpu_build_rockylinux8 \ +-- python ops/script/format_wheel_meta.py \ + --wheel-path python-package/dist/*.whl \ + --commit-hash ${GITHUB_SHA} \ + --platform-tag ${WHEEL_TAG} \ + --meta-path python-package/dist/ + +echo "--- Upload Python wheel" +if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] +then + aws s3 cp python-package/dist/*.whl s3://xgboost-nightly-builds/${BRANCH_NAME}/ \ + --acl public-read --no-progress + aws s3 cp python-package/dist/meta.json s3://xgboost-nightly-builds/${BRANCH_NAME}/ \ + --acl public-read --no-progress +fi diff --git a/tests/ci_build/build_r_pkg_with_cuda.sh b/ops/pipeline/build-gpu-rpkg-impl.sh similarity index 73% rename from tests/ci_build/build_r_pkg_with_cuda.sh rename to ops/pipeline/build-gpu-rpkg-impl.sh index 78a2afc1cdf7..2815b8f448f1 100755 --- a/tests/ci_build/build_r_pkg_with_cuda.sh +++ b/ops/pipeline/build-gpu-rpkg-impl.sh @@ -1,8 +1,12 @@ #!/bin/bash -set -e -set -x -if [ "$#" -ne 1 ] +## Build XGBoost R package with GPU support and package it in a tarball. +## Users will be able to install it without having CTK installed +## (only a compatible NVIDIA driver is needed). + +set -euo pipefail + +if [[ "$#" -ne 1 ]] then echo "Build the R package tarball with CUDA code. Usage: $0 [commit hash]" exit 1 @@ -10,7 +14,7 @@ fi commit_hash="$1" -python tests/ci_build/test_r_package.py --task=pack +python3 ops/script/test_r_package.py --task=pack mv xgboost/ xgboost_rpack/ mkdir build diff --git a/ops/pipeline/build-gpu-rpkg.sh b/ops/pipeline/build-gpu-rpkg.sh new file mode 100755 index 000000000000..d1384ef766a6 --- /dev/null +++ b/ops/pipeline/build-gpu-rpkg.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +set -euox pipefail + +if [[ -z "${GITHUB_SHA:-}" ]] +then + echo "Make sure to set environment variable GITHUB_SHA" + exit 1 +fi + +echo "--- Build XGBoost R package with CUDA" +python3 ops/docker_run.py \ + --container-id xgb-ci.gpu_build_r_rockylinux8 \ + -- ops/pipeline/build-gpu-rpkg-impl.sh \ + ${GITHUB_SHA} diff --git a/ops/pipeline/build-jvm-doc-impl.sh b/ops/pipeline/build-jvm-doc-impl.sh new file mode 100755 index 000000000000..4e95f284e25c --- /dev/null +++ b/ops/pipeline/build-jvm-doc-impl.sh @@ -0,0 +1,43 @@ +#!/bin/bash +## Build docs for the JVM packages and package it in a tarball +## Note: Note: this script assumes that the user has already built libxgboost4j.so +## and place it in the lib/ directory. 
+ +if [[ $# -ne 1 ]] +then + echo "Usage: $0 [branch name]" + exit 1 +fi + +set -euo pipefail + +branch_name=$1 + +# Copy in libxgboost4j.so +mkdir -p jvm-packages/xgboost4j/src/main/resources/lib/linux/x86_64/ +cp -v lib/libxgboost4j.so jvm-packages/xgboost4j/src/main/resources/lib/linux/x86_64/ + +cd jvm-packages/ +# Install JVM packages in local Maven repository +mvn --no-transfer-progress install -Pdocs +# Build Scaladocs +mvn --no-transfer-progress scala:doc -Pdocs +# Build Javadocs +mvn --no-transfer-progress javadoc:javadoc -Pdocs + +# Package JVM docs in a tarball +mkdir -p tmp/scaladocs +cp -rv xgboost4j/target/reports/apidocs/ ./tmp/javadocs/ +cp -rv xgboost4j/target/site/scaladocs/ ./tmp/scaladocs/xgboost4j/ +cp -rv xgboost4j-spark/target/site/scaladocs/ ./tmp/scaladocs/xgboost4j-spark/ +cp -rv xgboost4j-spark-gpu/target/site/scaladocs/ ./tmp/scaladocs/xgboost4j-spark-gpu/ +cp -rv xgboost4j-flink/target/site/scaladocs/ ./tmp/scaladocs/xgboost4j-flink/ + +cd tmp +tar cvjf ${branch_name}.tar.bz2 javadocs/ scaladocs/ +mv ${branch_name}.tar.bz2 .. +cd .. +rm -rfv tmp/ + +set +x +set +e diff --git a/ops/pipeline/build-jvm-doc.sh b/ops/pipeline/build-jvm-doc.sh new file mode 100755 index 000000000000..00fdac7a1353 --- /dev/null +++ b/ops/pipeline/build-jvm-doc.sh @@ -0,0 +1,24 @@ +#!/bin/bash +## Build docs for the JVM packages and package it in a tarball +## Note: this script assumes that the user has already built libxgboost4j.so +## and place it in the lib/ directory. + +set -euox pipefail + +echo "--- Build JVM packages doc" + +if [[ -z ${BRANCH_NAME:-} ]] +then + echo "Make sure to define environment variable BRANCH_NAME." + exit 1 +fi + +if [[ ! -f lib/libxgboost4j.so ]] +then + echo "Must place libxgboost4j.so in lib/ first" + exit 2 +fi + +python3 ops/docker_run.py \ + --container-id xgb-ci.jvm_gpu_build \ + -- ops/pipeline/build-jvm-doc-impl.sh ${BRANCH_NAME} diff --git a/ops/pipeline/build-jvm-gpu.sh b/ops/pipeline/build-jvm-gpu.sh new file mode 100755 index 000000000000..7656a3d2f188 --- /dev/null +++ b/ops/pipeline/build-jvm-gpu.sh @@ -0,0 +1,33 @@ +#!/bin/bash +## Build libxgboost4j.so with CUDA + +set -euo pipefail + +source ops/pipeline/classify-git-branch.sh + +echo "--- Build libxgboost4j.so with CUDA" + +if [[ ($is_pull_request == 1) || ($is_release_branch == 0) ]] +then + arch_flag="-DGPU_COMPUTE_VER=75" +else + arch_flag="" +fi + +COMMAND=$( +cat <<-EOF +cd build-gpu/ && \ +cmake .. 
-DCMAKE_PREFIX_PATH=/workspace/cccl -GNinja -DUSE_CUDA=ON -DUSE_NCCL=ON \ + -DJVM_BINDINGS=ON -DCMAKE_EXPORT_COMPILE_COMMANDS=ON ${arch_flag} && \ + ninja +EOF +) + +set -x +mkdir -p build-gpu/ +# Work around https://github.com/NVIDIA/cccl/issues/1956 +# TODO(hcho3): Remove this once new CUDA version ships with CCCL 2.6.0+ +git clone https://github.com/NVIDIA/cccl.git -b v2.6.1 --quiet --depth 1 +python3 ops/docker_run.py \ + --container-id xgb-ci.jvm_gpu_build \ + -- bash -c "${COMMAND}" diff --git a/tests/buildkite/build-jvm-macos-m1.sh b/ops/pipeline/build-jvm-macos-apple-silicon.sh old mode 100644 new mode 100755 similarity index 50% rename from tests/buildkite/build-jvm-macos-m1.sh rename to ops/pipeline/build-jvm-macos-apple-silicon.sh index 1d2e5e8703bc..cfba35d0f96a --- a/tests/buildkite/build-jvm-macos-m1.sh +++ b/ops/pipeline/build-jvm-macos-apple-silicon.sh @@ -1,8 +1,7 @@ #!/bin/bash +## Build libxgboost4j.dylib targeting MacOS (Apple Silicon) -set -euo pipefail - -source tests/buildkite/conftest.sh +set -euox pipefail # Display system info echo "--- Display system information" @@ -12,6 +11,8 @@ sysctl -n machdep.cpu.brand_string uname -m set +x +brew install ninja libomp + # Build XGBoost4J binary echo "--- Build libxgboost4j.dylib" set -x @@ -24,18 +25,3 @@ popd rm -rf build otool -L lib/libxgboost.dylib set +x - -echo "--- Upload libxgboost4j.dylib" -set -x -pushd lib -libname=libxgboost4j_m1_${BUILDKITE_COMMIT}.dylib -mv -v libxgboost4j.dylib ${libname} -buildkite-agent artifact upload ${libname} -if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] -then - aws s3 cp ${libname} \ - s3://xgboost-nightly-builds/${BRANCH_NAME}/libxgboost4j/ \ - --acl public-read --no-progress -fi -popd -set +x diff --git a/ops/pipeline/build-jvm-macos-intel.sh b/ops/pipeline/build-jvm-macos-intel.sh new file mode 100755 index 000000000000..5e73b03b7f6e --- /dev/null +++ b/ops/pipeline/build-jvm-macos-intel.sh @@ -0,0 +1,26 @@ +#!/bin/bash +## Build libxgboost4j.dylib targeting MacOS (Intel) + +set -euox pipefail + +# Display system info +echo "--- Display system information" +set -x +system_profiler SPSoftwareDataType +sysctl -n machdep.cpu.brand_string +uname -m +set +x + +brew install ninja libomp + +# Build XGBoost4J binary +echo "--- Build libxgboost4j.dylib" +set -x +mkdir build +pushd build +export JAVA_HOME=$(/usr/libexec/java_home) +cmake .. -GNinja -DJVM_BINDINGS=ON -DUSE_OPENMP=ON -DCMAKE_OSX_DEPLOYMENT_TARGET=10.15 +ninja -v +popd +rm -rf build +otool -L lib/libxgboost.dylib diff --git a/ops/pipeline/build-jvm-manylinux2014.sh b/ops/pipeline/build-jvm-manylinux2014.sh new file mode 100755 index 000000000000..e69dd3682b90 --- /dev/null +++ b/ops/pipeline/build-jvm-manylinux2014.sh @@ -0,0 +1,25 @@ +#!/bin/bash +## Build libxgboost4j.so targeting glibc 2.17 systems + +set -euox pipefail + +if [[ $# -ne 1 ]] +then + echo "Usage: $0 {x86_64,aarch64}" + exit 1 +fi + +arch=$1 + +image="xgb-ci.manylinux2014_${arch}" + +# Build XGBoost4J binary +echo "--- Build libxgboost4j.so (targeting glibc 2.17)" +set -x +mkdir build +python3 ops/docker_run.py \ + --container-id ${image} \ + -- bash -c \ + "cd build && cmake .. 
-DJVM_BINDINGS=ON -DUSE_OPENMP=ON && make -j$(nproc)" +ldd lib/libxgboost4j.so +objdump -T lib/libxgboost4j.so | grep GLIBC_ | sed 's/.*GLIBC_\([.0-9]*\).*/\1/g' | sort -Vu diff --git a/ops/pipeline/build-manylinux2014.sh b/ops/pipeline/build-manylinux2014.sh new file mode 100755 index 000000000000..a8f5af8bc3cd --- /dev/null +++ b/ops/pipeline/build-manylinux2014.sh @@ -0,0 +1,64 @@ +#!/bin/bash + +set -euox pipefail + +if [[ -z "${GITHUB_SHA:-}" ]] +then + echo "Make sure to set environment variable GITHUB_SHA" + exit 1 +fi + +if [[ "$#" -lt 1 ]] +then + echo "Usage: $0 {x86_64,aarch64}" + exit 1 +fi + +arch="$1" + +WHEEL_TAG="manylinux2014_${arch}" +image="xgb-ci.${WHEEL_TAG}" + +python_bin="/opt/python/cp310-cp310/bin/python" + +echo "--- Build binary wheel for ${WHEEL_TAG}" +# Patch to add warning about manylinux2014 variant +patch -p0 < ops/patch/remove_nccl_dep.patch +patch -p0 < ops/patch/manylinux2014_warning.patch +python3 ops/docker_run.py \ + --container-id ${image} \ + -- bash -c \ + "cd python-package && ${python_bin} -m pip wheel --no-deps -v . --wheel-dir dist/" +git checkout python-package/pyproject.toml python-package/xgboost/core.py + # discard the patch + +python3 ops/docker_run.py \ + --container-id ${image} \ + -- auditwheel repair --plat ${WHEEL_TAG} python-package/dist/*.whl +python3 ops/script/rename_whl.py \ + --wheel-path wheelhouse/*.whl \ + --commit-hash ${GITHUB_SHA} \ + --platform-tag ${WHEEL_TAG} +rm -rf python-package/dist/ +mkdir python-package/dist/ +mv -v wheelhouse/*.whl python-package/dist/ + +echo "--- Build binary wheel for ${WHEEL_TAG} (CPU only)" +# Patch to rename pkg to xgboost-cpu +patch -p0 < ops/patch/remove_nccl_dep.patch +patch -p0 < ops/patch/cpu_only_pypkg.patch +python3 ops/docker_run.py \ + --container-id ${image} \ + -- bash -c \ + "cd python-package && ${python_bin} -m pip wheel --no-deps -v . 
--wheel-dir dist/" +git checkout python-package/pyproject.toml # discard the patch + +python3 ops/docker_run.py \ + --container-id ${image} \ + -- auditwheel repair --plat ${WHEEL_TAG} python-package/dist/xgboost_cpu-*.whl +python3 ops/script/rename_whl.py \ + --wheel-path wheelhouse/xgboost_cpu-*.whl \ + --commit-hash ${GITHUB_SHA} \ + --platform-tag ${WHEEL_TAG} +rm -v python-package/dist/xgboost_cpu-*.whl +mv -v wheelhouse/xgboost_cpu-*.whl python-package/dist/ diff --git a/tests/ci_build/build_python_wheels.sh b/ops/pipeline/build-python-wheels-macos.sh old mode 100644 new mode 100755 similarity index 94% rename from tests/ci_build/build_python_wheels.sh rename to ops/pipeline/build-python-wheels-macos.sh index d9927905cf83..697514c0c3ad --- a/tests/ci_build/build_python_wheels.sh +++ b/ops/pipeline/build-python-wheels-macos.sh @@ -1,7 +1,6 @@ #!/bin/bash -set -e -set -x +set -euox pipefail if [[ $# -ne 2 ]]; then echo "Usage: $0 [platform_id] [commit ID]" @@ -31,7 +30,6 @@ if [[ "$platform_id" == macosx_* ]]; then # Set up environment variables to configure cibuildwheel export CIBW_BUILD=cp${cpython_ver}-${platform_id} export CIBW_ARCHS=${cibw_archs} - export CIBW_ENVIRONMENT=${setup_env_var} export CIBW_TEST_SKIP='*-macosx_arm64' export CIBW_BUILD_VERBOSITY=3 else @@ -44,7 +42,7 @@ export CIBW_REPAIR_WHEEL_COMMAND_MACOS="delocate-wheel --require-archs {delocate python -m pip install cibuildwheel python -m cibuildwheel python-package --output-dir wheelhouse -python tests/ci_build/rename_whl.py \ +python ops/script/rename_whl.py \ --wheel-path wheelhouse/*.whl \ --commit-hash ${commit_id} \ --platform-tag ${wheel_tag} diff --git a/ops/pipeline/build-test-jvm-packages-impl.sh b/ops/pipeline/build-test-jvm-packages-impl.sh new file mode 100755 index 000000000000..ed95ba3368ab --- /dev/null +++ b/ops/pipeline/build-test-jvm-packages-impl.sh @@ -0,0 +1,79 @@ +#!/bin/bash +## Build and test JVM packages. +## Companion script for build-test-jvm-packages.sh. +## +## Note. This script takes in all inputs via environment variables. + +INPUT_DOC=$( +cat <<-EOF +Inputs + - SCALA_VERSION: Scala version, either 2.12 or 2.13 (Required) + - USE_CUDA: Set to 1 to enable CUDA + - SKIP_NATIVE_BUILD: Set to 1 to have the JVM packages use an externally provided + libxgboost4j.so. (Usually Maven will invoke create_jni.py to + build it from scratch.) When using this option, make sure to + place libxgboost4j.so in lib/ directory. +EOF +) + +set -euo pipefail + +for arg in "SCALA_VERSION" +do + if [[ -z "${!arg:-}" ]] + then + echo -e "Error: $arg must be set.\n${INPUT_DOC}" + exit 1 + fi +done + +set -x + +# Set Scala version +if [[ "${SCALA_VERSION}" == "2.12" || "${SCALA_VERSION}" == "2.13" ]] +then + python ops/script/change_scala_version.py --scala-version ${SCALA_VERSION} --purge-artifacts +else + echo "Error: SCALA_VERSION must be either 2.12 or 2.13" + exit 2 +fi + +# If SKIP_NATIVE_BUILD is set, copy in libxgboost4j.so from lib/ +# Also copy in other files needed for testing. (Usually create_jni.py would perform this +# step, but we need to do it manually here.) 
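# Example of how this companion script is typically driven (illustrative; the
# values are hypothetical). All inputs come in via environment variables:
#
#   SCALA_VERSION=2.13 SKIP_NATIVE_BUILD=1 \
#     bash ops/pipeline/build-test-jvm-packages-impl.sh
#
# Add USE_CUDA=1 for the GPU variant (compare ops/pipeline/test-jvm-gpu.sh).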
+if [[ "${SKIP_NATIVE_BUILD:-}" == "1" ]] +then + bash ops/script/inject_jvm_lib.sh +fi + +cd jvm-packages/ + +# Ensure that XGBoost4J-Spark is compatible with multiple versions of Spark +if [[ "${USE_CUDA:-}" != "1" && "${SCALA_VERSION}" == "2.12" ]] +then + for spark_version in 3.1.3 3.2.4 3.3.4 3.4.3 + do + mvn --no-transfer-progress clean package -Dspark.version=${spark_version} \ + -pl xgboost4j,xgboost4j-spark + done +fi + +set +x +mvn_options="" +if [[ "${USE_CUDA:-}" == "1" ]] +then + mvn_options="${mvn_options} -Pgpu" +fi +if [[ "${SKIP_NATIVE_BUILD:-}" == "1" ]] +then + mvn_options="${mvn_options} -Dskip.native.build=true" +fi +set -x + +mvn --no-transfer-progress clean install ${mvn_options} + +# Integration tests +if [[ "${USE_CUDA:-}" != "1" ]] +then + mvn --no-transfer-progress test -pl xgboost4j-example +fi diff --git a/ops/pipeline/build-test-jvm-packages.sh b/ops/pipeline/build-test-jvm-packages.sh new file mode 100755 index 000000000000..d04cc3510de5 --- /dev/null +++ b/ops/pipeline/build-test-jvm-packages.sh @@ -0,0 +1,28 @@ +#!/bin/bash +## Build and test JVM packages. +## +## Note. This script takes in all inputs via environment variables. + +INPUT_DOC=$( +cat <<-EOF +Inputs + - SCALA_VERSION: Scala version, either 2.12 or 2.13 (Required) +EOF +) + +set -euo pipefail + +for arg in "SCALA_VERSION" +do + if [[ -z "${!arg:-}" ]] + then + echo -e "Error: $arg must be set.\n${INPUT_DOC}" + exit 1 + fi +done + +set -x + +python3 ops/docker_run.py --container-id xgb-ci.jvm \ + --run-args "-e SCALA_VERSION=${SCALA_VERSION}" \ + -- ops/pipeline/build-test-jvm-packages-impl.sh diff --git a/ops/pipeline/build-test-sycl.sh b/ops/pipeline/build-test-sycl.sh new file mode 100755 index 000000000000..f3b651b18cf9 --- /dev/null +++ b/ops/pipeline/build-test-sycl.sh @@ -0,0 +1,33 @@ +#!/bin/bash +## Build and test oneAPI + +set -euox pipefail + +if [[ "$#" -lt 1 ]] +then + echo "Usage: $0 {gtest,pytest}" + exit 1 +fi + +suite="$1" + +mkdir build +pushd build +cmake .. -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DPLUGIN_SYCL=ON -DCMAKE_CXX_COMPILER=g++ \ + -DCMAKE_C_COMPILER=gcc -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX \ + -DCMAKE_PREFIX_PATH=$CONDA_PREFIX -GNinja +ninja +popd + +case "$suite" in + gtest) + ./build/testxgboost + ;; + pytest) + cd python-package + python --version + pip install -v . + cd .. + pytest -s -v -rxXs --durations=0 ./tests/python-sycl/ + ;; +esac diff --git a/ops/pipeline/build-win64-gpu.ps1 b/ops/pipeline/build-win64-gpu.ps1 new file mode 100644 index 000000000000..76cc955059b8 --- /dev/null +++ b/ops/pipeline/build-win64-gpu.ps1 @@ -0,0 +1,46 @@ +$ErrorActionPreference = "Stop" + +. ops/pipeline/enforce-ci.ps1 + +Write-Host "--- Build libxgboost on Windows with CUDA" + +nvcc --version +if ( $is_release_branch -eq 0 ) { + $arch_flag = "-DGPU_COMPUTE_VER=75" +} else { + $arch_flag = "" +} + +# Work around https://github.com/NVIDIA/cccl/issues/1956 +# TODO(hcho3): Remove this once new CUDA version ships with CCCL 2.6.0+ +git clone https://github.com/NVIDIA/cccl.git -b v2.6.1 --quiet +mkdir build +cd build +cmake .. -G"Visual Studio 17 2022" -A x64 -DUSE_CUDA=ON ` + -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DBUILD_DEPRECATED_CLI=ON ` + -DCMAKE_PREFIX_PATH="$(Get-Location)/../cccl" ${arch_flag} +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } +cmake --build . 
--config Release -- /m /nodeReuse:false ` + "/consoleloggerparameters:ShowCommandLine;Verbosity=minimal" +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } + +Write-Host "--- Build binary wheel" +cd ../python-package +conda activate +pip install --user -v "pip>=23" +pip --version +pip wheel --no-deps -v . --wheel-dir dist/ +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } +python ../ops/script/rename_whl.py ` + --wheel-path (Get-ChildItem dist/*.whl | Select-Object -Expand FullName) ` + --commit-hash $Env:GITHUB_SHA ` + --platform-tag win_amd64 +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } + +Write-Host "--- Upload Python wheel" +cd .. +if ( $is_release_branch -eq 1 ) { + aws s3 cp (Get-ChildItem python-package/dist/*.whl | Select-Object -Expand FullName) ` + s3://xgboost-nightly-builds/$Env:BRANCH_NAME/ --acl public-read --no-progress + if ($LASTEXITCODE -ne 0) { throw "Last command failed" } +} diff --git a/ops/pipeline/classify-git-branch.sh b/ops/pipeline/classify-git-branch.sh new file mode 100755 index 000000000000..3d9a2348f23e --- /dev/null +++ b/ops/pipeline/classify-git-branch.sh @@ -0,0 +1,25 @@ +#!/bin/bash +## Detect whether the current git branch is a pull request or a release branch + +set -euo pipefail + +if [[ -n ${GITHUB_BASE_REF:-} ]] +then + is_pull_request=1 +else + is_pull_request=0 +fi + +if [[ ${BRANCH_NAME:-} == "master" || ${BRANCH_NAME:-} == "release_"* || ${BRANCH_NAME:-} == "federated-secure" ]] +then + is_release_branch=1 + enforce_daily_budget=0 +else + is_release_branch=0 + enforce_daily_budget=1 +fi + +if [[ -n ${DISABLE_RELEASE:-} ]] +then + is_release_branch=0 +fi diff --git a/ops/pipeline/deploy-jvm-packages-impl.sh b/ops/pipeline/deploy-jvm-packages-impl.sh new file mode 100755 index 000000000000..e9c09112a4bd --- /dev/null +++ b/ops/pipeline/deploy-jvm-packages-impl.sh @@ -0,0 +1,39 @@ +#!/bin/bash +## Deploy JVM packages to S3 bucket +## Companion script for ops/pipeline/deploy-jvm-packages.sh + +set -euox pipefail + +if [[ "$#" -lt 2 ]] +then + echo "Usage: $0 {cpu,gpu} [scala_version]" + exit 1 +fi + +variant="$1" +scala_version="$2" +maven_options="-DskipTests -Dmaven.test.skip=true -Dskip.native.build=true" + +case "$variant" in + cpu) + # CPU variant + python ops/script/change_scala_version.py --scala-version ${scala_version} --purge-artifacts + bash ops/script/inject_jvm_lib.sh + pushd jvm-packages + mvn --no-transfer-progress deploy -Pdefault,release-to-s3 ${maven_options} + popd + ;; + gpu) + # GPU variant + python ops/script/change_scala_version.py --scala-version ${scala_version} --purge-artifacts + bash ops/script/inject_jvm_lib.sh + pushd jvm-packages + mvn --no-transfer-progress install -Pgpu ${maven_options} + mvn --no-transfer-progress deploy -Pgpu,release-to-s3 -pl xgboost4j-spark-gpu ${maven_options} + popd + ;; + *) + echo "Unrecognized argument: $variant" + exit 2 + ;; +esac diff --git a/ops/pipeline/deploy-jvm-packages.sh b/ops/pipeline/deploy-jvm-packages.sh new file mode 100755 index 000000000000..e821f334b9d2 --- /dev/null +++ b/ops/pipeline/deploy-jvm-packages.sh @@ -0,0 +1,23 @@ +#!/bin/bash +## Deploy JVM packages to S3 bucket + +set -euox pipefail + +source ops/pipeline/enforce-ci.sh + +if [[ "$#" -lt 3 ]] +then + echo "Usage: $0 {cpu,gpu} [container_id] [scala_version]" + exit 1 +fi + +variant="$1" +container_id="$2" +scala_version="$3" + +if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] +then + echo "--- Deploy JVM packages to xgboost-maven-repo S3 repo" + python3 ops/docker_run.py 
--container-id "${container_id}" \ + -- ops/pipeline/deploy-jvm-packages-impl.sh "${variant}" "${scala_version}" +fi diff --git a/ops/pipeline/enforce-ci.ps1 b/ops/pipeline/enforce-ci.ps1 new file mode 100644 index 000000000000..0528472be6cb --- /dev/null +++ b/ops/pipeline/enforce-ci.ps1 @@ -0,0 +1,28 @@ +## Ensure that a script is running inside the CI. +## Usage: . ops/pipeline/enforce-ci.ps1 + +if ( -Not $Env:GITHUB_ACTION ) { + $script_name = (Split-Path -Path $PSCommandPath -Leaf) + Write-Host "$script_name is not meant to run locally; it should run inside GitHub Actions." + Write-Host "Please inspect the content of $script_name and locate the desired command manually." + exit 1 +} + +if ( -Not $Env:BRANCH_NAME ) { + Write-Host "Make sure to define environment variable BRANCH_NAME." + exit 2 +} + +if ( $Env:GITHUB_BASE_REF ) { + $is_pull_request = 1 +} else { + $is_pull_request = 0 +} + +if ( ($Env:BRANCH_NAME -eq "master") -or ($Env:BRANCH_NAME -match "release_.+") ) { + $is_release_branch = 1 + $enforce_daily_budget = 0 +} else { + $is_release_branch = 0 + $enforce_daily_budget = 1 +} diff --git a/ops/pipeline/enforce-ci.sh b/ops/pipeline/enforce-ci.sh new file mode 100755 index 000000000000..1e853a5ea266 --- /dev/null +++ b/ops/pipeline/enforce-ci.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +## Ensure that a script is running inside the CI. +## Usage: source ops/pipeline/enforce-ci.sh + +set -euo pipefail + +if [[ -z ${GITHUB_ACTION:-} ]] +then + echo "$0 is not meant to run locally; it should run inside GitHub Actions." + echo "Please inspect the content of $0 and locate the desired command manually." + exit 1 +fi + +if [[ -z ${BRANCH_NAME:-} ]] +then + echo "Make sure to define environment variable BRANCH_NAME." + exit 2 +fi + +source ops/pipeline/classify-git-branch.sh diff --git a/ops/pipeline/publish-artifact.sh b/ops/pipeline/publish-artifact.sh new file mode 100755 index 000000000000..adcb3c521d2a --- /dev/null +++ b/ops/pipeline/publish-artifact.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +## Publish artifacts in an S3 bucket +## Meant to be used inside GitHub Actions + +set -euo pipefail + +source ops/pipeline/enforce-ci.sh + +if [[ $# -ne 2 ]] +then + echo "Usage: $0 [artifact] [s3_url]" + exit 1 +fi + +artifact="$1" +s3_url="$2" + +if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] +then + echo "aws s3 cp ${artifact} ${s3_url} --acl public-read --no-progress" + aws s3 cp "${artifact}" "${s3_url}" --acl public-read --no-progress +fi diff --git a/ops/pipeline/run-clang-tidy.sh b/ops/pipeline/run-clang-tidy.sh new file mode 100755 index 000000000000..676f302009ce --- /dev/null +++ b/ops/pipeline/run-clang-tidy.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +set -euox pipefail + +echo "--- Run clang-tidy" + +python3 ops/docker_run.py \ + --container-id xgb-ci.clang_tidy \ + -- python3 ops/script/run_clang_tidy.py --cuda-archs 75 diff --git a/ops/pipeline/stash-artifacts.ps1 b/ops/pipeline/stash-artifacts.ps1 new file mode 100644 index 000000000000..9b9989bf376d --- /dev/null +++ b/ops/pipeline/stash-artifacts.ps1 @@ -0,0 +1,49 @@ +[CmdletBinding()] +Param( + [Parameter( + Mandatory=$true, + Position=0 + )][string]$command, + [Parameter( + Mandatory=$true, + Position=1 + )][string]$remote_prefix, + [Parameter( + Mandatory=$true, + Position=2, + ValueFromRemainingArguments=$true + )][string[]]$artifacts +) + +## Convenience wrapper for ops/pipeline/stash-artifacts.py +## Meant to be used inside GitHub Actions + +$ErrorActionPreference = "Stop" + +. 
ops/pipeline/enforce-ci.ps1 + +foreach ($env in "GITHUB_REPOSITORY", "GITHUB_RUN_ID", "RUNS_ON_S3_BUCKET_CACHE") { + $val = [Environment]::GetEnvironmentVariable($env) + if ($val -eq $null) { + Write-Host "Error: $env must be set." + exit 1 + } +} + +$artifact_stash_prefix = "cache/${Env:GITHUB_REPOSITORY}/stash/${Env:GITHUB_RUN_ID}" + +conda activate + +Write-Host @" +python ops/pipeline/stash-artifacts.py ` + --command "${command}" ` + --s3-bucket "${Env:RUNS_ON_S3_BUCKET_CACHE}" ` + --prefix "${artifact_stash_prefix}/${remote_prefix}" ` + -- $artifacts +"@ +python ops/pipeline/stash-artifacts.py ` + --command "${command}" ` + --s3-bucket "${Env:RUNS_ON_S3_BUCKET_CACHE}" ` + --prefix "${artifact_stash_prefix}/${remote_prefix}" ` + -- $artifacts +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } diff --git a/ops/pipeline/stash-artifacts.py b/ops/pipeline/stash-artifacts.py new file mode 100644 index 000000000000..151e187513da --- /dev/null +++ b/ops/pipeline/stash-artifacts.py @@ -0,0 +1,144 @@ +""" +Stash an artifact in an S3 bucket for later use + +Note. This script takes in all inputs via environment variables + except the path to the artifact(s). +""" + +import argparse +import os +import subprocess +from pathlib import Path +from urllib.parse import SplitResult, urlsplit, urlunsplit + + +def resolve(x: Path) -> Path: + return x.expanduser().resolve() + + +def path_equals(a: Path, b: Path) -> bool: + return resolve(a) == resolve(b) + + +def compute_s3_url(s3_bucket: str, prefix: str, artifact: Path) -> str: + filename = artifact.name + relative_path = resolve(artifact).relative_to(Path.cwd()) + if resolve(artifact.parent) == resolve(Path.cwd()): + full_prefix = prefix + else: + full_prefix = f"{prefix}/{str(relative_path.parent)}" + return f"s3://{s3_bucket}/{full_prefix}/{filename}" + + +def aws_s3_upload(src: Path, dest: str) -> None: + cli_args = ["aws", "s3", "cp", "--no-progress", str(src), dest] + print(" ".join(cli_args)) + subprocess.run( + cli_args, + check=True, + encoding="utf-8", + ) + + +def aws_s3_download(src: str, dest: Path) -> None: + cli_args = ["aws", "s3", "cp", "--no-progress", src, str(dest)] + print(" ".join(cli_args)) + subprocess.run( + cli_args, + check=True, + encoding="utf-8", + ) + + +def aws_s3_download_with_wildcard(src: str, dest: Path) -> None: + parsed_src = urlsplit(src) + src_dir = urlunsplit( + SplitResult( + scheme="s3", + netloc=parsed_src.netloc, + path=os.path.dirname(parsed_src.path), + query="", + fragment="", + ) + ) + dest_dir = dest.parent + src_glob = os.path.basename(parsed_src.path) + cli_args = [ + "aws", + "s3", + "cp", + "--recursive", + "--no-progress", + "--exclude", + "'*'", + "--include", + src_glob, + src_dir, + str(dest_dir), + ] + print(" ".join(cli_args)) + subprocess.run( + cli_args, + check=True, + encoding="utf-8", + ) + + +def upload(args: argparse.Namespace) -> None: + print(f"Stashing artifacts to prefix {args.prefix}...") + for artifact in args.artifacts: + artifact_path = Path(artifact) + s3_url = compute_s3_url(args.s3_bucket, args.prefix, artifact_path) + aws_s3_upload(artifact_path, s3_url) + + +def download(args: argparse.Namespace) -> None: + print(f"Unstashing artifacts from prefix {args.prefix}...") + for artifact in args.artifacts: + artifact_path = Path(artifact) + print(f"mkdir -p {str(artifact_path.parent)}") + artifact_path.parent.mkdir(parents=True, exist_ok=True) + s3_url = compute_s3_url(args.s3_bucket, args.prefix, artifact_path) + if "*" in artifact: + aws_s3_download_with_wildcard(s3_url, 
artifact_path) + else: + aws_s3_download(s3_url, artifact_path) + + +if __name__ == "__main__": + # Ensure that the current working directory is the project root + if not (Path.cwd() / "ops").is_dir() or not path_equals( + Path(__file__).parent.parent, Path.cwd() / "ops" + ): + x = Path(__file__).name + raise RuntimeError(f"Script {x} must be run at the project's root directory") + + parser = argparse.ArgumentParser() + parser.add_argument( + "--command", + type=str, + choices=["stash", "unstash"], + required=True, + help="Whether to stash or unstash the artifact", + ) + parser.add_argument( + "--s3-bucket", + type=str, + required=True, + help="Name of the S3 bucket to store the artifact", + ) + parser.add_argument( + "--prefix", + type=str, + required=True, + help=( + "Where the artifact would be stored. The artifact will be stored in " + "s3://[s3-bucket]/[prefix]." + ), + ) + parser.add_argument("artifacts", type=str, nargs="+", metavar="artifact") + parsed_args = parser.parse_args() + if parsed_args.command == "stash": + upload(parsed_args) + elif parsed_args.command == "unstash": + download(parsed_args) diff --git a/ops/pipeline/stash-artifacts.sh b/ops/pipeline/stash-artifacts.sh new file mode 100755 index 000000000000..98c9695c4227 --- /dev/null +++ b/ops/pipeline/stash-artifacts.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +## Convenience wrapper for ops/pipeline/stash-artifacts.py +## Meant to be used inside GitHub Actions + +set -euo pipefail + +source ops/pipeline/enforce-ci.sh + +if [[ "$#" -lt 3 ]] +then + echo "Usage: $0 {stash,unstash} [remote_prefix] [artifact] [artifact ...]" + exit 1 +fi + +command="$1" +remote_prefix="$2" +shift 2 + +for arg in "GITHUB_REPOSITORY" "GITHUB_RUN_ID" "RUNS_ON_S3_BUCKET_CACHE" +do + if [[ -z "${!arg:-}" ]] + then + echo "Error: $arg must be set." + exit 2 + fi +done + +artifact_stash_prefix="cache/${GITHUB_REPOSITORY}/stash/${GITHUB_RUN_ID}" + +set -x +python3 ops/pipeline/stash-artifacts.py \ + --command "${command}" \ + --s3-bucket "${RUNS_ON_S3_BUCKET_CACHE}" \ + --prefix "${artifact_stash_prefix}/${remote_prefix}" \ + -- "$@" diff --git a/ops/pipeline/test-c-api-demo.sh b/ops/pipeline/test-c-api-demo.sh new file mode 100755 index 000000000000..9a44c8c46fd9 --- /dev/null +++ b/ops/pipeline/test-c-api-demo.sh @@ -0,0 +1,39 @@ +#!/bin/bash +## Test C API demos + +set -euox pipefail + +# Build and install XGBoost static library (libxgboost.a) +mkdir build +pushd build +cmake .. -DBUILD_STATIC_LIB=ON -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -GNinja +ninja -v install +popd + +# Build and run C API demo with static library +pushd demo/c-api/ +mkdir build-c-api-demo +pushd build-c-api-demo +cmake .. -GNinja -DCMAKE_PREFIX_PATH=$CONDA_PREFIX +ninja -v +ctest +popd +rm -rf ./build-c-api-demo +popd + +# Build and install XGBoost shared library (libxgboost.so) +pushd build +cmake .. -DBUILD_STATIC_LIB=OFF -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -GNinja \ + -DPLUGIN_FEDERATED=ON +ninja -v install +popd + +# Build and run C API demo with shared library +mkdir demo/c-api/build-c-api-demo +pushd demo/c-api/build-c-api-demo +cmake .. 
-GNinja -DCMAKE_PREFIX_PATH=$CONDA_PREFIX +ninja -v +ctest +popd +./ops/script/verify_link.sh ./demo/c-api/build-c-api-demo/basic/api-demo +./ops/script/verify_link.sh ./demo/c-api/build-c-api-demo/external-memory/external-memory-demo diff --git a/ops/pipeline/test-cpp-gpu.sh b/ops/pipeline/test-cpp-gpu.sh new file mode 100755 index 000000000000..9a0cd4743c18 --- /dev/null +++ b/ops/pipeline/test-cpp-gpu.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +set -euox pipefail + +if [[ "$#" -lt 1 ]] +then + echo "Usage: $0 {gpu,gpu-rmm,mgpu}" + exit 1 +fi +arg=$1 + +case "${arg}" in + gpu) + echo "--- Run Google Tests, using a single GPU" + python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \ + -- nvidia-smi + python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \ + -- build/testxgboost + ;; + + gpu-rmm) + echo "--- Run Google Tests, using a single GPU, RMM enabled" + python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \ + -- nvidia-smi + python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \ + -- build/testxgboost --use-rmm-pool + ;; + + mgpu) + echo "--- Run Google Tests, using multiple GPUs" + python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \ + -- nvidia-smi + python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \ + --run-args='--shm-size=4g' \ + -- build/testxgboost --gtest_filter=*MGPU* + ;; + + *) + echo "Unrecognized arg: ${arg}" + exit 2 + ;; +esac diff --git a/ops/pipeline/test-freebsd.sh b/ops/pipeline/test-freebsd.sh new file mode 100755 index 000000000000..f9ed61e9e2b8 --- /dev/null +++ b/ops/pipeline/test-freebsd.sh @@ -0,0 +1,10 @@ +#!/bin/bash +## Run tests on FreeBSD + +set -euox pipefail + +mkdir build +cd build +cmake .. -GNinja -DGOOGLE_TEST=ON +ninja -v +./testxgboost diff --git a/ops/pipeline/test-jvm-gpu.sh b/ops/pipeline/test-jvm-gpu.sh new file mode 100755 index 000000000000..380db97c787c --- /dev/null +++ b/ops/pipeline/test-jvm-gpu.sh @@ -0,0 +1,32 @@ +#!/bin/bash +## Test JVM packages with CUDA. Note: this script assumes that +## the user has already built libxgboost4j.so with CUDA support +## and place it in the lib/ directory. + +## Note. This script takes in all inputs via environment variables. + +INPUT_DOC=$( +cat <<-EOF +Inputs + - SCALA_VERSION: Scala version, either 2.12 or 2.13 (Required) +EOF +) + +set -euo pipefail + +for arg in "SCALA_VERSION" +do + if [[ -z "${!arg:-}" ]] + then + echo -e "Error: $arg must be set.\n${INPUT_DOC}" + exit 1 + fi +done + +set -x + +python3 ops/docker_run.py --container-id xgb-ci.jvm_gpu_build --use-gpus \ + -- nvidia-smi +python3 ops/docker_run.py --container-id xgb-ci.jvm_gpu_build --use-gpus \ + --run-args "-e SCALA_VERSION=${SCALA_VERSION} -e USE_CUDA=1 -e SKIP_NATIVE_BUILD=1 --shm-size=4g --privileged" \ + -- ops/pipeline/build-test-jvm-packages-impl.sh diff --git a/ops/pipeline/test-python-macos.sh b/ops/pipeline/test-python-macos.sh new file mode 100755 index 000000000000..63b5690d1312 --- /dev/null +++ b/ops/pipeline/test-python-macos.sh @@ -0,0 +1,23 @@ +#!/bin/bash +## Test XGBoost Python wheel on MacOS + +set -euox pipefail + +brew install ninja + +mkdir build +pushd build +# Set prefix, to use OpenMP library from Conda env +# See https://github.com/dmlc/xgboost/issues/7039#issuecomment-1025038228 +# to learn why we don't use libomp from Homebrew. +cmake .. -GNinja -DCMAKE_PREFIX_PATH=$CONDA_PREFIX -DBUILD_DEPRECATED_CLI=ON +ninja +popd + +cd python-package +python --version +pip install -v . + +cd .. 
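# When iterating locally, a single module can be re-run against the same
# installed wheel (illustrative; test_basic.py is just an example target):
#
#   pytest -s -v -rxXs tests/python/test_basic.py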
+pytest -s -v -rxXs --durations=0 ./tests/python +pytest -s -v -rxXs --durations=0 ./tests/test_distributed/test_with_dask diff --git a/ops/pipeline/test-python-sdist.sh b/ops/pipeline/test-python-sdist.sh new file mode 100755 index 000000000000..d6b71597380e --- /dev/null +++ b/ops/pipeline/test-python-sdist.sh @@ -0,0 +1,11 @@ +#!/bin/bash +## Test installing Python XGBoost from source distribution + +set -euox pipefail + +cd python-package +python --version +python -m build --sdist +pip install -v ./dist/xgboost-*.tar.gz +cd .. +python -c 'import xgboost' diff --git a/ops/pipeline/test-python-wheel-impl.sh b/ops/pipeline/test-python-wheel-impl.sh new file mode 100755 index 000000000000..75bfa5fbaffb --- /dev/null +++ b/ops/pipeline/test-python-wheel-impl.sh @@ -0,0 +1,74 @@ +#!/bin/bash +## Companion script for ops/pipeline/test-python-wheel.sh + +set -eo pipefail + +if [[ "$#" -lt 1 ]] +then + echo "Usage: $0 {gpu|mgpu|cpu|cpu-arm64}" + exit 1 +fi + +suite="$1" + +# Cannot set -u before Conda env activation +case "$suite" in + gpu|mgpu) + source activate gpu_test + ;; + cpu) + source activate linux_cpu_test + ;; + cpu-arm64) + source activate aarch64_test + ;; + *) + echo "Unrecognized argument: $suite" + exit 1 + ;; +esac + +set -xu + +export PYSPARK_DRIVER_PYTHON=$(which python) +export PYSPARK_PYTHON=$(which python) +export SPARK_TESTING=1 + +pip install -v ./python-package/dist/*.whl + +case "$suite" in + gpu) + echo "-- Run Python tests, using a single GPU" + python -c 'from cupy.cuda import jitify; jitify._init_module()' + pytest -v -s -rxXs --fulltrace --durations=0 -m 'not mgpu' tests/python-gpu + ;; + mgpu) + echo "-- Run Python tests, using multiple GPUs" + python -c 'from cupy.cuda import jitify; jitify._init_module()' + pytest -v -s -rxXs --fulltrace --durations=0 -m 'mgpu' tests/python-gpu + pytest -v -s -rxXs --fulltrace --durations=0 -m 'mgpu' \ + tests/test_distributed/test_gpu_with_dask + pytest -v -s -rxXs --fulltrace --durations=0 -m 'mgpu' \ + tests/test_distributed/test_gpu_with_spark + pytest -v -s -rxXs --fulltrace --durations=0 -m 'mgpu' \ + tests/test_distributed/test_gpu_federated + ;; + cpu) + echo "-- Run Python tests (CPU)" + export RAY_OBJECT_STORE_ALLOW_SLOW_STORAGE=1 + pytest -v -s -rxXs --fulltrace --durations=0 tests/python + pytest -v -s -rxXs --fulltrace --durations=0 tests/test_distributed/test_with_dask + pytest -v -s -rxXs --fulltrace --durations=0 tests/test_distributed/test_with_spark + pytest -v -s -rxXs --fulltrace --durations=0 tests/test_distributed/test_federated + ;; + cpu-arm64) + echo "-- Run Python tests (CPU, ARM64)" + pytest -v -s -rxXs --fulltrace --durations=0 \ + tests/python/test_basic.py tests/python/test_basic_models.py \ + tests/python/test_model_compatibility.py + ;; + *) + echo "Unrecognized argument: $suite" + exit 1 + ;; +esac diff --git a/ops/pipeline/test-python-wheel.sh b/ops/pipeline/test-python-wheel.sh new file mode 100755 index 000000000000..b4dd59b7cb0e --- /dev/null +++ b/ops/pipeline/test-python-wheel.sh @@ -0,0 +1,25 @@ +#!/bin/bash +## Test XGBoost Python wheel on the Linux platform + +set -euo pipefail + +if [[ "$#" -lt 2 ]] +then + echo "Usage: $0 {gpu|mgpu|cpu|cpu-arm64} [container_id]" + exit 1 +fi + +suite="$1" +container_id="$2" + +if [[ "$suite" == "gpu" || "$suite" == "mgpu" ]] +then + gpu_option="--use-gpus" +else + gpu_option="" +fi + +set -x +python3 ops/docker_run.py --container-id "${container_id}" ${gpu_option} \ + --run-args='--shm-size=4g --privileged' \ + -- bash 
ops/pipeline/test-python-wheel-impl.sh "${suite}" diff --git a/ops/pipeline/test-python-with-sysprefix.sh b/ops/pipeline/test-python-with-sysprefix.sh new file mode 100755 index 000000000000..9ee918b112f4 --- /dev/null +++ b/ops/pipeline/test-python-with-sysprefix.sh @@ -0,0 +1,23 @@ +#!/bin/bash +## Test if Python XGBoost can be configured to use libxgboost.so from the system prefix + +set -euox pipefail + +sudo apt-get update && sudo apt-get install -y ninja-build + +mkdir build +pushd build +cmake .. -GNinja +ninja +popd + +# Copy libxgboost.so to system prefix +cp -v lib/* "$(python -c 'import sys; print(sys.base_prefix)')/lib" + +# Now configure Python XGBoost to use libxgboost.so from the system prefix +cd python-package +pip install virtualenv +virtualenv venv +source venv/bin/activate && \ + pip install -v . --config-settings use_system_libxgboost=True && \ + python -c 'import xgboost' diff --git a/ops/pipeline/test-win64-gpu.ps1 b/ops/pipeline/test-win64-gpu.ps1 new file mode 100644 index 000000000000..4af3bee2cffc --- /dev/null +++ b/ops/pipeline/test-win64-gpu.ps1 @@ -0,0 +1,26 @@ +$ErrorActionPreference = "Stop" + +Write-Host "--- Test XGBoost on Windows with CUDA" + +nvcc --version + +Write-Host "--- Run Google Tests" +build/testxgboost.exe +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } + +Write-Host "--- Set up Python env" +conda activate +$env_name = -join("win64_", (New-Guid).ToString().replace("-", "")) +mamba env create -n ${env_name} --file=ops/conda_env/win64_test.yml +conda activate ${env_name} +python -m pip install ` + (Get-ChildItem python-package/dist/*.whl | Select-Object -Expand FullName) +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } + +Write-Host "--- Run Python tests" +python -X faulthandler -m pytest -v -s -rxXs --fulltrace tests/python +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } +Write-Host "--- Run Python tests with GPU" +python -X faulthandler -m pytest -v -s -rxXs --fulltrace -m "(not slow) and (not mgpu)"` + tests/python-gpu +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } diff --git a/ops/script/build_via_cmake.sh b/ops/script/build_via_cmake.sh new file mode 100755 index 000000000000..00a571584ea4 --- /dev/null +++ b/ops/script/build_via_cmake.sh @@ -0,0 +1,54 @@ +#!/bin/bash + +set -euo pipefail + +if [[ "$#" -lt 1 ]] +then + conda_env="" +else + conda_env="$1" +fi + +if [[ "${conda_env}" == --conda-env=* ]] +then + conda_env=$(echo "${conda_env}" | sed 's/^--conda-env=//g' -) + echo "Activating Conda environment ${conda_env}" + shift 1 + cmake_args="$@" + + # Workaround for file permission error + if [[ -n ${CI_BUILD_UID:-} ]] + then + gosu root chown -R "${CI_BUILD_UID}:${CI_BUILD_GID}" /opt/miniforge/envs + fi + + # Don't activate Conda env if it's already activated + if [[ -z ${CONDA_PREFIX:-} ]] + then + source activate ${conda_env} + fi + cmake_prefix_flag="-DCMAKE_PREFIX_PATH=$CONDA_PREFIX" +else + cmake_args="$@" + cmake_prefix_flag='' +fi + +rm -rf build +mkdir build +cd build +# Disable CMAKE_COMPILE_WARNING_AS_ERROR option temporarily until +# https://github.com/dmlc/xgboost/issues/10400 is fixed +set -x +cmake .. ${cmake_args} \ + -DGOOGLE_TEST=ON \ + -DUSE_DMLC_GTEST=ON \ + -DENABLE_ALL_WARNINGS=ON \ + -DCMAKE_COMPILE_WARNING_AS_ERROR=OFF \ + -GNinja \ + ${cmake_prefix_flag} \ + -DHIDE_CXX_SYMBOLS=ON \ + -DBUILD_DEPRECATED_CLI=ON +ninja clean +time ninja -v +cd .. 
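# Illustrative usage (argument values are hypothetical): the optional first
# argument selects a Conda environment, and the remaining arguments are
# forwarded verbatim to CMake, e.g.
#
#   bash ops/script/build_via_cmake.sh --conda-env=linux_cpu_test -DUSE_OPENMP=ON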
+set +x diff --git a/dev/change_scala_version.py b/ops/script/change_scala_version.py similarity index 93% rename from dev/change_scala_version.py rename to ops/script/change_scala_version.py index c8a9b54ccf91..ed475a1f9582 100644 --- a/dev/change_scala_version.py +++ b/ops/script/change_scala_version.py @@ -4,7 +4,7 @@ import shutil -def main(args): +def main(args: argparse.Namespace) -> None: if args.scala_version == "2.12": scala_ver = "2.12" scala_patchver = "2.12.18" @@ -20,6 +20,9 @@ def main(args): if target.is_dir(): print(f"Removing {target}...") shutil.rmtree(target) + for target in pathlib.Path("jvm-packages/").glob("**/*.so"): + print(f"Removing {target}...") + target.unlink() # Update pom.xml for pom in pathlib.Path("jvm-packages/").glob("**/pom.xml"): diff --git a/tests/ci_build/change_version.py b/ops/script/change_version.py similarity index 100% rename from tests/ci_build/change_version.py rename to ops/script/change_version.py diff --git a/tests/ci_build/format_wheel_meta.py b/ops/script/format_wheel_meta.py similarity index 92% rename from tests/ci_build/format_wheel_meta.py rename to ops/script/format_wheel_meta.py index 9e7bad907687..a7def879905e 100644 --- a/tests/ci_build/format_wheel_meta.py +++ b/ops/script/format_wheel_meta.py @@ -2,18 +2,19 @@ Script to generate meta.json to store metadata for a nightly build of XGBoost Python package. """ + +import argparse import json import pathlib -from argparse import ArgumentParser -def main(args): +def main(args: argparse.Namespace) -> None: wheel_path = pathlib.Path(args.wheel_path).expanduser().resolve() if not wheel_path.exists(): raise ValueError(f"Wheel cannot be found at path {wheel_path}") if not wheel_path.is_file(): raise ValueError(f"Path {wheel_path} is not a valid file") - wheel_dir, wheel_name = wheel_path.parent, wheel_path.name + wheel_name = wheel_path.name meta_path = pathlib.Path(args.meta_path) if not meta_path.exists(): @@ -36,7 +37,7 @@ def main(args): if __name__ == "__main__": - parser = ArgumentParser( + parser = argparse.ArgumentParser( description="Format meta.json encoding the latest nightly version of the Python wheel" ) parser.add_argument( diff --git a/ops/script/inject_jvm_lib.sh b/ops/script/inject_jvm_lib.sh new file mode 100755 index 000000000000..82584aeaca92 --- /dev/null +++ b/ops/script/inject_jvm_lib.sh @@ -0,0 +1,32 @@ +#!/bin/bash +# Inject lib/libxgboost4j.so into JVM packages. +# This script is useful when the user opts to set skip.native.build=true +# option in the JVM package build. When this option is set, the JVM package +# build will not build libxgboost4j.so; instead it will expect to find the +# library in jvm-packages/xgboost4j/src/main/resources/lib/{os}/{arch}/. +# This script will ensure that libxgboost4j.so is copied to the correct +# location. + +set -euox pipefail + +echo "Using externally provided libxgboost4j.so. Locating one from lib/..." 
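# For reference (illustrative, mirroring deploy-jvm-packages-impl.sh): a JVM
# build that consumes the injected library is expected to skip the native
# build step, roughly:
#
#   cd jvm-packages && mvn --no-transfer-progress install -Dskip.native.build=true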
+mkdir -p jvm-packages/xgboost4j/src/main/resources/lib/linux/x86_64/ +cp -v lib/libxgboost4j.so jvm-packages/xgboost4j/src/main/resources/lib/linux/x86_64/ +mkdir -p jvm-packages/xgboost4j/src/test/resources +mkdir -p jvm-packages/xgboost4j-spark/src/test/resources +mkdir -p jvm-packages/xgboost4j-spark-gpu/src/test/resources + +# Generate machine.txt.* files from the CLI regression demo +# TODO(hcho3): Remove once CLI is removed +pushd demo/CLI/regression +python3 mapfeat.py +python3 mknfold.py machine.txt 1 +popd + +cp -v demo/data/agaricus.* \ + jvm-packages/xgboost4j/src/test/resources +cp -v demo/CLI/regression/machine.txt.t* demo/data/agaricus.* \ + jvm-packages/xgboost4j-spark/src/test/resources +cp -v demo/data/veterans_lung_cancer.csv \ + jvm-packages/xgboost4j-spark/src/test/resources/rank.train.csv \ + jvm-packages/xgboost4j-spark-gpu/src/test/resources diff --git a/tests/ci_build/lint_cmake.sh b/ops/script/lint_cmake.sh old mode 100644 new mode 100755 similarity index 94% rename from tests/ci_build/lint_cmake.sh rename to ops/script/lint_cmake.sh index d67ecd0844ed..55aeb20e8fb2 --- a/tests/ci_build/lint_cmake.sh +++ b/ops/script/lint_cmake.sh @@ -1,6 +1,6 @@ #!/bin/bash -set -e +set -euo pipefail cmake_files=$( find . -name CMakeLists.txt -o -path "./cmake/*.cmake" \ diff --git a/tests/ci_build/lint_cpp.py b/ops/script/lint_cpp.py similarity index 86% rename from tests/ci_build/lint_cpp.py rename to ops/script/lint_cpp.py index d4775d6b6b3e..2d00b219ceab 100644 --- a/tests/ci_build/lint_cpp.py +++ b/ops/script/lint_cpp.py @@ -2,6 +2,7 @@ import os import re import sys +from typing import TextIO import cpplint from cpplint import _cpplint_state @@ -9,7 +10,7 @@ CXX_SUFFIX = set(["cc", "c", "cpp", "h", "cu", "hpp"]) -def filepath_enumerate(paths): +def filepath_enumerate(paths: list[str]) -> list[str]: """Enumerate the file paths of all subfiles of the list of paths""" out = [] for path in paths: @@ -22,7 +23,7 @@ def filepath_enumerate(paths): return out -def get_header_guard_dmlc(filename): +def get_header_guard_dmlc(filename: str) -> str: """Get Header Guard Convention for DMLC Projects. 
For headers in include, directly use the path @@ -54,11 +55,10 @@ def get_header_guard_dmlc(filename): class Lint: - def __init__(self): + def __init__(self) -> None: self.project_name = "xgboost" - self.cpp_header_map = {} - self.cpp_src_map = {} - self.python_map = {} + self.cpp_header_map: dict[str, dict[str, int]] = {} + self.cpp_src_map: dict[str, dict[str, int]] = {} self.pylint_cats = set(["error", "warning", "convention", "refactor"]) # setup cpp lint @@ -78,7 +78,7 @@ def __init__(self): cpplint._SetCountingStyle("toplevel") cpplint._line_length = 100 - def process_cpp(self, path, suffix): + def process_cpp(self, path: str, suffix: str) -> None: """Process a cpp file.""" _cpplint_state.ResetErrorCounts() cpplint.ProcessFile(str(path), _cpplint_state.verbose_level) @@ -91,7 +91,9 @@ def process_cpp(self, path, suffix): self.cpp_src_map[str(path)] = errors @staticmethod - def _print_summary_map(strm, result_map, ftype): + def _print_summary_map( + strm: TextIO, result_map: dict[str, dict[str, int]], ftype: str + ) -> int: """Print summary of certain result map.""" if len(result_map) == 0: return 0 @@ -105,7 +107,7 @@ def _print_summary_map(strm, result_map, ftype): ) return len(result_map) - npass - def print_summary(self, strm): + def print_summary(self, strm: TextIO) -> int: """Print summary of lint.""" nerr = 0 nerr += Lint._print_summary_map(strm, self.cpp_header_map, "cpp-header") @@ -122,7 +124,7 @@ def print_summary(self, strm): cpplint.GetHeaderGuardCPPVariable = get_header_guard_dmlc -def process(fname, allow_type): +def process(fname: str, allow_type: list[str]) -> None: """Process a file.""" fname = str(fname) arr = fname.rsplit(".", 1) @@ -132,13 +134,19 @@ def process(fname, allow_type): _HELPER.process_cpp(fname, arr[-1]) -def main(): +def main() -> None: parser = argparse.ArgumentParser(description="run cpp lint") parser.add_argument( "path", nargs="*", help="Path to traverse", - default=["src", "include", os.path.join("R-package", "src"), "python-package", "plugin/sycl"], + default=[ + "src", + "include", + os.path.join("R-package", "src"), + "python-package", + "plugin/sycl", + ], ) parser.add_argument( "--exclude_path", @@ -149,7 +157,7 @@ def main(): args = parser.parse_args() excluded_paths = filepath_enumerate(args.exclude_path) - allow_type = [] + allow_type: list[str] = [] allow_type += CXX_SUFFIX for path in args.path: diff --git a/tests/ci_build/lint_python.py b/ops/script/lint_python.py similarity index 93% rename from tests/ci_build/lint_python.py rename to ops/script/lint_python.py index 76860d9d1e35..f418fbf1075f 100644 --- a/tests/ci_build/lint_python.py +++ b/ops/script/lint_python.py @@ -16,8 +16,6 @@ class LintersPaths: BLACK = ( # core "python-package/", - # CI - "tests/ci_build/tidy.py", # tests "tests/python/test_config.py", "tests/python/test_callback.py", @@ -27,6 +25,7 @@ class LintersPaths: "tests/python/test_dt.py", "tests/python/test_demos.py", "tests/python/test_eval_metrics.py", + "tests/python/test_early_stopping.py", "tests/python/test_multi_target.py", "tests/python/test_objectives.py", "tests/python/test_predict.py", @@ -49,13 +48,14 @@ class LintersPaths: # demo "demo/dask/", "demo/rmm_plugin", - "demo/json-model/json_parser.py", "demo/guide-python/continuation.py", "demo/guide-python/cat_in_the_dat.py", "demo/guide-python/callbacks.py", "demo/guide-python/categorical.py", "demo/guide-python/cat_pipeline.py", + "demo/guide-python/cross_validation.py", "demo/guide-python/feature_weights.py", + "demo/guide-python/model_parser.py", 
"demo/guide-python/sklearn_parallel.py", "demo/guide-python/sklearn_examples.py", "demo/guide-python/sklearn_evals_result.py", @@ -70,10 +70,7 @@ class LintersPaths: "demo/guide-python/update_process.py", "demo/aft_survival/aft_survival_viz_demo.py", # CI - "tests/ci_build/lint_python.py", - "tests/ci_build/test_r_package.py", - "tests/ci_build/test_utils.py", - "tests/ci_build/change_version.py", + "ops/", ) ISORT = ( @@ -83,12 +80,13 @@ class LintersPaths: "tests/test_distributed/", "tests/python/", "tests/python-gpu/", - "tests/ci_build/", # demo "demo/", # misc "dev/", "doc/", + # CI + "ops/", ) MYPY = ( @@ -111,11 +109,9 @@ class LintersPaths: "tests/test_distributed/test_with_dask/test_external_memory.py", "tests/test_distributed/test_with_spark/test_data.py", "tests/test_distributed/test_gpu_with_spark/test_data.py", - "tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py", - "tests/test_distributed/test_gpu_with_dask/test_gpu_external_memory.py", + "tests/test_distributed/test_gpu_with_dask/", # demo "demo/dask/", - "demo/json-model/json_parser.py", "demo/guide-python/external_memory.py", "demo/guide-python/distributed_extmem_basic.py", "demo/guide-python/sklearn_examples.py", @@ -125,17 +121,14 @@ class LintersPaths: "demo/guide-python/categorical.py", "demo/guide-python/cat_pipeline.py", "demo/guide-python/feature_weights.py", + "demo/guide-python/model_parser.py", "demo/guide-python/individual_trees.py", "demo/guide-python/quantile_regression.py", "demo/guide-python/multioutput_regression.py", "demo/guide-python/learning_to_rank.py", "demo/aft_survival/aft_survival_viz_demo.py", # CI - "tests/ci_build/tidy.py", - "tests/ci_build/lint_python.py", - "tests/ci_build/test_r_package.py", - "tests/ci_build/test_utils.py", - "tests/ci_build/change_version.py", + "ops/", ) diff --git a/tests/ci_build/lint_r.R b/ops/script/lint_r.R similarity index 100% rename from tests/ci_build/lint_r.R rename to ops/script/lint_r.R diff --git a/tests/ci_build/rename_whl.py b/ops/script/rename_whl.py similarity index 95% rename from tests/ci_build/rename_whl.py rename to ops/script/rename_whl.py index 500196190b3d..d4467720c738 100644 --- a/tests/ci_build/rename_whl.py +++ b/ops/script/rename_whl.py @@ -1,8 +1,8 @@ +import argparse import pathlib -from argparse import ArgumentParser -def main(args): +def main(args: argparse.Namespace) -> None: wheel_path = pathlib.Path(args.wheel_path).expanduser().resolve() if not wheel_path.exists(): raise ValueError(f"Wheel cannot be found at path {wheel_path}") @@ -43,7 +43,7 @@ def main(args): if __name__ == "__main__": - parser = ArgumentParser( + parser = argparse.ArgumentParser( description="Format a Python wheel's name using the git commit hash and platform tag" ) parser.add_argument( diff --git a/tests/ci_build/tidy.py b/ops/script/run_clang_tidy.py similarity index 97% rename from tests/ci_build/tidy.py rename to ops/script/run_clang_tidy.py index 13bbedc0b4b5..dca5d1069598 100755 --- a/tests/ci_build/tidy.py +++ b/ops/script/run_clang_tidy.py @@ -19,7 +19,9 @@ def call(args: list[str]) -> tuple[int, int, str, list[str]]: # `workspace` is a name used in the CI container. Normally we should keep the dir # as `xgboost`. 
matched = re.search( - "(workspace|xgboost)/.*(src|tests|include)/.*warning:", error_msg, re.MULTILINE + "(workspace|xgboost)/.*(ops|src|tests|include)/.*warning:", + error_msg, + re.MULTILINE, ) if matched is None: @@ -265,7 +267,7 @@ def test_tidy(args: argparse.Namespace) -> None: """ root_path = os.path.abspath(os.path.curdir) tidy_file = os.path.join(root_path, ".clang-tidy") - test_file_path = os.path.join(root_path, "tests", "ci_build", "test_tidy.cc") + test_file_path = os.path.join(root_path, "ops", "script", "test_tidy.cc") tidy_config = "--config-file=" + tidy_file if not args.tidy_version: @@ -274,8 +276,8 @@ def test_tidy(args: argparse.Namespace) -> None: tidy = "clang-tidy-" + str(args.tidy_version) cmd = [tidy, tidy_config, test_file_path] (proc_code, tidy_status, error_msg, _) = call(cmd) - assert proc_code == 0 - assert tidy_status == 1 + if proc_code != 0 or tidy_status != 1: + raise RuntimeError(error_msg) print("clang-tidy is working.") diff --git a/tests/ci_build/test_r_package.py b/ops/script/test_r_package.py similarity index 99% rename from tests/ci_build/test_r_package.py rename to ops/script/test_r_package.py index 5ca7fa69b21a..3ce886c1bc41 100644 --- a/tests/ci_build/test_r_package.py +++ b/ops/script/test_r_package.py @@ -42,7 +42,7 @@ def pkgroot(path: str) -> None: else: would_remove = output.stdout.decode("utf-8").strip().split("\n") - if would_remove and not all(f.find("tests/ci_build") != -1 for f in would_remove): + if would_remove and not all(f.find("ops") != -1 for f in would_remove): raise ValueError( "\n".join(would_remove) + "\nPlease cleanup the working git repository." ) diff --git a/tests/ci_build/test_tidy.cc b/ops/script/test_tidy.cc similarity index 100% rename from tests/ci_build/test_tidy.cc rename to ops/script/test_tidy.cc diff --git a/tests/ci_build/test_utils.py b/ops/script/test_utils.py similarity index 100% rename from tests/ci_build/test_utils.py rename to ops/script/test_utils.py diff --git a/tests/buildkite/update-rapids.sh b/ops/script/update_rapids.sh similarity index 50% rename from tests/buildkite/update-rapids.sh rename to ops/script/update_rapids.sh index f6a2675bdfa9..d7958ce70d86 100755 --- a/tests/buildkite/update-rapids.sh +++ b/ops/script/update_rapids.sh @@ -7,7 +7,10 @@ echo "LATEST_RAPIDS_VERSION = $LATEST_RAPIDS_VERSION" DEV_RAPIDS_VERSION=$(date +%Y-%m-%d -d "20${LATEST_RAPIDS_VERSION//./-}-01 + 2 month" | cut -c3-7 | tr - .) echo "DEV_RAPIDS_VERSION = $DEV_RAPIDS_VERSION" -PARENT_PATH=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P ) +OPS_PATH=$( cd "$(dirname "${BASH_SOURCE[0]}")/.." 
; pwd -P ) +CONTAINER_YAML="$OPS_PATH/docker/ci_container.yml" -sed -i "s/^RAPIDS_VERSION=[[:digit:]]\+\.[[:digit:]]\+/RAPIDS_VERSION=${LATEST_RAPIDS_VERSION}/" $PARENT_PATH/conftest.sh -sed -i "s/^DEV_RAPIDS_VERSION=[[:digit:]]\+\.[[:digit:]]\+/DEV_RAPIDS_VERSION=${DEV_RAPIDS_VERSION}/" $PARENT_PATH/conftest.sh +sed -i "s/\&rapids_version \"[[:digit:]]\+\.[[:digit:]]\+\"/\&rapids_version \"${LATEST_RAPIDS_VERSION}\"/" \ + "$CONTAINER_YAML" +sed -i "s/\&dev_rapids_version \"[[:digit:]]\+\.[[:digit:]]\+\"/\&dev_rapids_version \"${DEV_RAPIDS_VERSION}\"/" \ + "$CONTAINER_YAML" diff --git a/tests/ci_build/verify_link.sh b/ops/script/verify_link.sh similarity index 100% rename from tests/ci_build/verify_link.sh rename to ops/script/verify_link.sh diff --git a/python-package/pyproject.toml b/python-package/pyproject.toml index 0420b2672e1e..565b61eb0669 100644 --- a/python-package/pyproject.toml +++ b/python-package/pyproject.toml @@ -27,6 +27,7 @@ classifiers = [ "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", ] dependencies = [ "numpy", @@ -63,6 +64,8 @@ disable = [ "import-error", "attribute-defined-outside-init", "import-outside-toplevel", + "too-few-public-methods", + "too-many-ancestors", "too-many-nested-blocks", "unsubscriptable-object", "useless-object-inheritance" diff --git a/python-package/xgboost/compat.py b/python-package/xgboost/compat.py index 26399f0da2f8..e09ccb6f4d29 100644 --- a/python-package/xgboost/compat.py +++ b/python-package/xgboost/compat.py @@ -1,5 +1,6 @@ # pylint: disable=invalid-name,unused-import """For compatibility and optional dependencies.""" +import functools import importlib.util import logging import sys @@ -43,36 +44,48 @@ def lazy_isinstance(instance: Any, module: str, name: str) -> bool: # sklearn try: + from sklearn import __version__ as _sklearn_version from sklearn.base import BaseEstimator as XGBModelBase from sklearn.base import ClassifierMixin as XGBClassifierBase from sklearn.base import RegressorMixin as XGBRegressorBase - from sklearn.preprocessing import LabelEncoder try: - from sklearn.model_selection import KFold as XGBKFold from sklearn.model_selection import StratifiedKFold as XGBStratifiedKFold except ImportError: - from sklearn.cross_validation import KFold as XGBKFold from sklearn.cross_validation import StratifiedKFold as XGBStratifiedKFold + # sklearn.utils Tags types can be imported unconditionally once + # xgboost's minimum scikit-learn version is 1.6 or higher + try: + from sklearn.utils import Tags as _sklearn_Tags + except ImportError: + _sklearn_Tags = object + SKLEARN_INSTALLED = True except ImportError: SKLEARN_INSTALLED = False # used for compatibility without sklearn - XGBModelBase = object - XGBClassifierBase = object - XGBRegressorBase = object - LabelEncoder = object + class XGBModelBase: # type: ignore[no-redef] + """Dummy class for sklearn.base.BaseEstimator.""" + + class XGBClassifierBase: # type: ignore[no-redef] + """Dummy class for sklearn.base.ClassifierMixin.""" + + class XGBRegressorBase: # type: ignore[no-redef] + """Dummy class for sklearn.base.RegressorMixin.""" - XGBKFold = None XGBStratifiedKFold = None + _sklearn_Tags = object + _sklearn_version = object + _logger = logging.getLogger(__name__) +@functools.cache def is_cudf_available() -> bool: """Check cuDF package available or not""" if importlib.util.find_spec("cudf") is None: @@ -86,6 +99,7 @@ def is_cudf_available() -> bool: return 
False +@functools.cache def is_cupy_available() -> bool: """Check cupy package available or not""" if importlib.util.find_spec("cupy") is None: @@ -98,6 +112,7 @@ def is_cupy_available() -> bool: return False +@functools.cache def import_cupy() -> types.ModuleType: """Import cupy.""" if not is_cupy_available(): @@ -150,4 +165,4 @@ def concat(value: Sequence[_T]) -> _T: # pylint: disable=too-many-return-statem d_v = arr.device.id assert d_v == d, "Concatenating arrays on different devices." return cupy.concatenate(value, axis=0) - raise TypeError("Unknown type.") + raise TypeError(f"Unknown type: {type(value[0])}") diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index c2034652322d..07924623955d 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -430,7 +430,7 @@ def c_array( def from_array_interface(interface: dict) -> NumpyOrCupy: """Convert array interface to numpy or cupy array""" - class Array: # pylint: disable=too-few-public-methods + class Array: """Wrapper type for communicating with numpy and cupy.""" _interface: Optional[dict] = None @@ -2008,7 +2008,8 @@ def __setstate__(self, state: Dict) -> None: self.__dict__.update(state) def __getitem__(self, val: Union[Integer, tuple, slice, EllipsisType]) -> "Booster": - """Get a slice of the tree-based model. + """Get a slice of the tree-based model. Attributes like `best_iteration` and + `best_score` are removed in the resulting booster. .. versionadded:: 1.3.0 @@ -2107,6 +2108,15 @@ def copy(self) -> "Booster": """ return copy.copy(self) + def reset(self) -> "Booster": + """Reset the booster object to release data caches used for training. + + .. versionadded:: 3.0.0 + + """ + _check_call(_LIB.XGBoosterReset(self.handle)) + return self + def attr(self, key: str) -> Optional[str]: """Get attribute string from the Booster. 
@@ -3205,11 +3215,7 @@ def trees_to_dataframe(self, fmap: PathLike = "") -> DataFrame: } ) - if callable(getattr(df, "sort_values", None)): - # pylint: disable=no-member - return df.sort_values(["Tree", "Node"]).reset_index(drop=True) - # pylint: disable=no-member - return df.sort(["Tree", "Node"]).reset_index(drop=True) + return df.sort_values(["Tree", "Node"]).reset_index(drop=True) def _assign_dmatrix_features(self, data: DMatrix) -> None: if data.num_row() == 0: diff --git a/python-package/xgboost/dask/__init__.py b/python-package/xgboost/dask/__init__.py index 76fcc1a6ad92..b2fc191f1c02 100644 --- a/python-package/xgboost/dask/__init__.py +++ b/python-package/xgboost/dask/__init__.py @@ -1,7 +1,6 @@ # pylint: disable=too-many-arguments, too-many-locals # pylint: disable=missing-class-docstring, invalid-name # pylint: disable=too-many-lines -# pylint: disable=too-few-public-methods """ Dask extensions for distributed training ---------------------------------------- @@ -73,6 +72,7 @@ Tuple, TypeAlias, TypedDict, + TypeGuard, TypeVar, Union, ) @@ -118,7 +118,7 @@ ) from ..tracker import RabitTracker from ..training import train as worker_train -from .data import _create_dmatrix, _create_quantile_dmatrix +from .data import _create_dmatrix, _create_quantile_dmatrix, no_group_split from .utils import get_address_from_user, get_n_threads _DaskCollection: TypeAlias = Union[da.Array, dd.DataFrame, dd.Series] @@ -766,7 +766,6 @@ async def _train_async( num_boost_round: int, evals: Optional[Sequence[Tuple[DaskDMatrix, str]]], obj: Optional[Objective], - feval: Optional[Metric], early_stopping_rounds: Optional[int], verbose_eval: Union[int, bool], xgb_model: Optional[Booster], @@ -816,7 +815,6 @@ def do_train( # pylint: disable=too-many-positional-arguments evals_result=local_history, evals=evals if len(evals) != 0 else None, obj=obj, - feval=feval, custom_metric=custom_metric, early_stopping_rounds=early_stopping_rounds, verbose_eval=verbose_eval, @@ -870,7 +868,6 @@ def train( # pylint: disable=unused-argument *, evals: Optional[Sequence[Tuple[DaskDMatrix, str]]] = None, obj: Optional[Objective] = None, - feval: Optional[Metric] = None, early_stopping_rounds: Optional[int] = None, xgb_model: Optional[Booster] = None, verbose_eval: Union[int, bool] = True, @@ -1675,7 +1672,6 @@ async def _fit_async( num_boost_round=self.get_num_boosting_rounds(), evals=evals, obj=obj, - feval=None, custom_metric=metric, verbose_eval=verbose, early_stopping_rounds=self.early_stopping_rounds, @@ -1784,7 +1780,6 @@ async def _fit_async( num_boost_round=self.get_num_boosting_rounds(), evals=evals, obj=obj, - feval=None, custom_metric=metric, verbose_eval=verbose, early_stopping_rounds=self.early_stopping_rounds, @@ -1899,10 +1894,21 @@ def _argmax(x: Any) -> Any: """, ["estimators", "model"], + extra_parameters=""" + allow_group_split : + + .. versionadded:: 3.0.0 + + Whether a query group can be split among multiple workers. When set to `False`, + inputs must be Dask dataframes or series. If you have many small query groups, + this can significantly increase the fragmentation of the data, and the internal + DMatrix construction can take longer. + +""", end_note=""" .. note:: - For dask implementation, group is not supported, use qid instead. + For the dask implementation, group is not supported, use qid instead. 
""", ) class DaskXGBRanker(DaskScikitLearnBase, XGBRankerMixIn): @@ -1911,36 +1917,36 @@ def __init__( self, *, objective: str = "rank:pairwise", + allow_group_split: bool = False, coll_cfg: Optional[CollConfig] = None, **kwargs: Any, ) -> None: if callable(objective): raise ValueError("Custom objective function not supported by XGBRanker.") + self.allow_group_split = allow_group_split super().__init__(objective=objective, coll_cfg=coll_cfg, **kwargs) + def _wrapper_params(self) -> Set[str]: + params = super()._wrapper_params() + params.add("allow_group_split") + return params + async def _fit_async( self, X: _DataT, y: _DaskCollection, *, - group: Optional[_DaskCollection], qid: Optional[_DaskCollection], sample_weight: Optional[_DaskCollection], base_margin: Optional[_DaskCollection], eval_set: Optional[Sequence[Tuple[_DaskCollection, _DaskCollection]]], sample_weight_eval_set: Optional[Sequence[_DaskCollection]], base_margin_eval_set: Optional[Sequence[_DaskCollection]], - eval_group: Optional[Sequence[_DaskCollection]], eval_qid: Optional[Sequence[_DaskCollection]], verbose: Union[int, bool], xgb_model: Optional[Union[XGBModel, Booster]], feature_weights: Optional[_DaskCollection], ) -> "DaskXGBRanker": - msg = "Use the `qid` instead of the `group` with the dask interface." - if not (group is None and eval_group is None): - raise ValueError(msg) - if qid is None: - raise ValueError("`qid` is required for ranking.") params = self.get_xgb_params() dtrain, evals = await _async_wrap_evaluation_matrices( self.client, @@ -1975,7 +1981,6 @@ async def _fit_async( num_boost_round=self.get_num_boosting_rounds(), evals=evals, obj=None, - feval=None, custom_metric=metric, verbose_eval=verbose, early_stopping_rounds=self.early_stopping_rounds, @@ -2007,8 +2012,108 @@ def fit( base_margin_eval_set: Optional[Sequence[_DaskCollection]] = None, feature_weights: Optional[_DaskCollection] = None, ) -> "DaskXGBRanker": - args = {k: v for k, v in locals().items() if k not in ("self", "__class__")} - return self._client_sync(self._fit_async, **args) + msg = "Use the `qid` instead of the `group` with the dask interface." + if not (group is None and eval_group is None): + raise ValueError(msg) + if qid is None: + raise ValueError("`qid` is required for ranking.") + + def check_df(X: _DaskCollection) -> TypeGuard[dd.DataFrame]: + if not isinstance(X, dd.DataFrame): + raise TypeError( + "When `allow_group_split` is set to False, X is required to be" + " a dataframe." + ) + return True + + def check_ser( + qid: Optional[_DaskCollection], name: str + ) -> TypeGuard[Optional[dd.Series]]: + if not isinstance(qid, dd.Series) and qid is not None: + raise TypeError( + f"When `allow_group_split` is set to False, {name} is required to be" + " a series." 
+ ) + return True + + if not self.allow_group_split: + assert ( + check_df(X) + and check_ser(qid, "qid") + and check_ser(y, "y") + and check_ser(sample_weight, "sample_weight") + and check_ser(base_margin, "base_margin") + ) + assert qid is not None and y is not None + X_id = id(X) + X, qid, y, sample_weight, base_margin = no_group_split( + self.device, + X, + qid, + y=y, + sample_weight=sample_weight, + base_margin=base_margin, + ) + + if eval_set is not None: + new_eval_set = [] + new_eval_qid = [] + new_sample_weight_eval_set = [] + new_base_margin_eval_set = [] + assert eval_qid + for i, (Xe, ye) in enumerate(eval_set): + we = sample_weight_eval_set[i] if sample_weight_eval_set else None + be = base_margin_eval_set[i] if base_margin_eval_set else None + assert check_df(Xe) + assert eval_qid + qe = eval_qid[i] + assert ( + eval_qid + and check_ser(qe, "qid") + and check_ser(ye, "y") + and check_ser(we, "sample_weight") + and check_ser(be, "base_margin") + ) + assert qe is not None and ye is not None + if id(Xe) != X_id: + Xe, qe, ye, we, be = no_group_split( + self.device, Xe, qe, ye, we, be + ) + else: + Xe, qe, ye, we, be = X, qid, y, sample_weight, base_margin + + new_eval_set.append((Xe, ye)) + new_eval_qid.append(qe) + + if we is not None: + new_sample_weight_eval_set.append(we) + if be is not None: + new_base_margin_eval_set.append(be) + + eval_set = new_eval_set + eval_qid = new_eval_qid + sample_weight_eval_set = ( + new_sample_weight_eval_set if new_sample_weight_eval_set else None + ) + base_margin_eval_set = ( + new_base_margin_eval_set if new_base_margin_eval_set else None + ) + + return self._client_sync( + self._fit_async, + X=X, + y=y, + qid=qid, + sample_weight=sample_weight, + base_margin=base_margin, + eval_set=eval_set, + eval_qid=eval_qid, + verbose=verbose, + xgb_model=xgb_model, + sample_weight_eval_set=sample_weight_eval_set, + base_margin_eval_set=base_margin_eval_set, + feature_weights=feature_weights, + ) # FIXME(trivialfis): arguments differ due to additional parameters like group and # qid. diff --git a/python-package/xgboost/dask/data.py b/python-package/xgboost/dask/data.py index c4f0f138b298..f92f1666499f 100644 --- a/python-package/xgboost/dask/data.py +++ b/python-package/xgboost/dask/data.py @@ -3,15 +3,30 @@ import logging from collections.abc import Sequence -from typing import Any, Callable, Dict, List, Optional, TypeVar, Union - +from typing import ( + Any, + Callable, + Dict, + List, + Optional, + Tuple, + TypeVar, + Union, + cast, + overload, +) + +import dask import distributed import numpy as np +import pandas as pd from dask import dataframe as dd +from .. import collective as coll from .._typing import _T, FeatureNames -from ..compat import concat +from ..compat import concat, import_cupy from ..core import DataIter, DMatrix, QuantileDMatrix +from ..data import is_on_cuda LOGGER = logging.getLogger("[xgboost.dask]") @@ -96,6 +111,153 @@ def next(self, input_data: Callable) -> bool: return True +@overload +def _add_column(df: dd.DataFrame, col: dd.Series) -> Tuple[dd.DataFrame, str]: ... + + +@overload +def _add_column(df: dd.DataFrame, col: None) -> Tuple[dd.DataFrame, None]: ... 
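The rewritten `fit` above routes dataframe inputs through `no_group_split` (defined next in `dask/data.py`) when `allow_group_split=False`. A hedged usage sketch of that path, assuming a local Dask cluster; the toy data and cluster setup are made up for illustration, while `allow_group_split` is the parameter introduced by this patch:

    import numpy as np
    import pandas as pd
    import dask.dataframe as dd
    from distributed import Client, LocalCluster
    from xgboost import dask as dxgb

    if __name__ == "__main__":
        with LocalCluster(n_workers=2) as cluster, Client(cluster) as client:
            rng = np.random.default_rng(0)
            pdf = pd.DataFrame(rng.normal(size=(1000, 4)), columns=[f"f{i}" for i in range(4)])
            y = pd.Series(rng.integers(0, 5, size=1000))        # relevance labels
            qid = pd.Series(np.sort(rng.integers(0, 20, size=1000)))  # sorted query ids
            X = dd.from_pandas(pdf, npartitions=4)
            y = dd.from_pandas(y, npartitions=4)
            qid = dd.from_pandas(qid, npartitions=4)
            ranker = dxgb.DaskXGBRanker(allow_group_split=False, n_estimators=8)
            # Dataframe/series inputs are required on this path.
            ranker.fit(X, y, qid=qid)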
+
+
+def _add_column(
+    df: dd.DataFrame, col: Optional[dd.Series]
+) -> Tuple[dd.DataFrame, Optional[str]]:
+    if col is None:
+        return df, col
+
+    trails = 0
+    uid = f"{col.name}_{trails}"
+    while uid in df.columns:
+        trails += 1
+        uid = f"{col.name}_{trails}"
+
+    df = df.assign(**{uid: col})
+    return df, uid
+
+
+def no_group_split(  # pylint: disable=too-many-positional-arguments
+    device: str | None,
+    df: dd.DataFrame,
+    qid: dd.Series,
+    y: dd.Series,
+    sample_weight: Optional[dd.Series],
+    base_margin: Optional[dd.Series],
+) -> Tuple[
+    dd.DataFrame, dd.Series, dd.Series, Optional[dd.Series], Optional[dd.Series]
+]:
+    """A function to prevent query groups from being scattered across different
+    workers. Please see the tutorial in the documentation for the implications of
+    not having partition boundaries based on query groups.
+
+    """
+
+    df, qid_uid = _add_column(df, qid)
+    df, y_uid = _add_column(df, y)
+    df, w_uid = _add_column(df, sample_weight)
+    df, bm_uid = _add_column(df, base_margin)
+
+    # `tasks` shuffle is required as of rapids 24.12
+    shuffle = "p2p" if device is None or device == "cpu" else "tasks"
+    with dask.config.set({"dataframe.shuffle.method": shuffle}):
+        df = df.persist()
+        # Encode the QID to make it dense.
+        df[qid_uid] = df[qid_uid].astype("category").cat.as_known().cat.codes
+        # The shuffle here is costly.
+        df = df.sort_values(by=qid_uid)
+        cnt = df.groupby(qid_uid)[qid_uid].count()
+        div = cnt.index.compute().values.tolist()
+        div = sorted(div)
+        div = tuple(div + [div[-1] + 1])
+
+        df = df.set_index(
+            qid_uid,
+            drop=False,
+            divisions=div,
+        ).persist()
+
+        qid = df[qid_uid]
+        y = df[y_uid]
+        sample_weight, base_margin = (
+            cast(dd.Series, df[uid]) if uid is not None else None for uid in (w_uid, bm_uid)
+        )
+
+        uids = [uid for uid in [qid_uid, y_uid, w_uid, bm_uid] if uid is not None]
+        df = df.drop(uids, axis=1).persist()
+    return df, qid, y, sample_weight, base_margin
+
+
+def sort_data_by_qid(**kwargs: List[Any]) -> Dict[str, List[Any]]:
+    """Sort worker-local data by query ID for learning to rank tasks."""
+    data_parts = kwargs.get("data")
+    assert data_parts is not None
+    n_parts = len(data_parts)
+
+    if is_on_cuda(data_parts[0]):
+        from cudf import DataFrame
+    else:
+        from pandas import DataFrame
+
+    def get_dict(i: int) -> Dict[str, list]:
+        """Return a dictionary containing all the meta info and all partitions."""
+
+        def _get(attr: Optional[List[Any]]) -> Optional[list]:
+            if attr is not None:
+                return attr[i]
+            return None
+
+        data_opt = {name: _get(kwargs.get(name, None)) for name in meta}
+        # Filter out None values.
+        data = {k: v for k, v in data_opt.items() if v is not None}
+        return data
+
+    def map_fn(i: int) -> pd.DataFrame:
+        data = get_dict(i)
+        return DataFrame(data)
+
+    meta_parts = [map_fn(i) for i in range(n_parts)]
+    dfq = concat(meta_parts)
+    if dfq.qid.is_monotonic_increasing:
+        return kwargs
+
+    LOGGER.warning(
+        "[r%d]: Sorting data with %d partitions for ranking. "
+        "This is a costly operation and will increase the memory usage significantly. "
+        "To avoid this warning, sort the data based on qid before passing it into "
+        "XGBoost. Alternatively, you can set `allow_group_split` to False.",
+        coll.get_rank(),
+        n_parts,
+    )
+    # I tried to construct a new dask DF to perform the sort, but it's quite difficult
+    # to get the partition alignment right. Along with the still maturing shuffle
+    # implementation and GPU compatibility, a simple concat is used.
+ # + # In case it might become useful one day, I managed to get a CPU version working, + # albeit qutie slow (much slower than concatenated sort). The implementation merges + # everything into a single Dask DF and runs `DF.sort_values`, then retrieve the + # individual X,y,qid, ... from calculated partition values `client.compute([p for p + # in df.partitions])`. It was to avoid creating mismatched partitions. + dfx = concat(data_parts) + + if is_on_cuda(dfq): + cp = import_cupy() + sorted_idx = cp.argsort(dfq.qid) + else: + sorted_idx = np.argsort(dfq.qid) + dfq = dfq.iloc[sorted_idx, :] + + if hasattr(dfx, "iloc"): + dfx = dfx.iloc[sorted_idx, :] + else: + dfx = dfx[sorted_idx, :] + + kwargs.update({"data": [dfx]}) + for i, c in enumerate(dfq.columns): + assert c in kwargs + kwargs.update({c: [dfq[c]]}) + + return kwargs + + def _get_worker_parts(list_of_parts: _DataParts) -> Dict[str, List[Any]]: assert isinstance(list_of_parts, list) result: Dict[str, List[Any]] = {} @@ -115,6 +277,9 @@ def append(i: int, name: str) -> None: for k in meta: append(i, k) + qid = result.get("qid", None) + if qid is not None: + result = sort_data_by_qid(**result) return result diff --git a/python-package/xgboost/data.py b/python-package/xgboost/data.py index 29647f88a893..1085f28f8ff5 100644 --- a/python-package/xgboost/data.py +++ b/python-package/xgboost/data.py @@ -2,6 +2,7 @@ # pylint: disable=too-many-return-statements """Data dispatching for DMatrix.""" import ctypes +import functools import json import os import warnings @@ -21,7 +22,9 @@ TransformedData, c_bst_ulong, ) -from .compat import DataFrame, lazy_isinstance +from .compat import DataFrame +from .compat import Series as PdSeries +from .compat import lazy_isinstance from .core import ( _LIB, DataIter, @@ -377,23 +380,39 @@ def pandas_feature_info( else: feature_names = list(data.columns.map(str)) - # handle feature types + # handle feature types and dtype validation + new_feature_types = [] + need_sparse_extension_warn = True + for dtype in data.dtypes: + if is_pd_sparse_dtype(dtype): + new_feature_types.append(_pandas_dtype_mapper[dtype.subtype.name]) + if need_sparse_extension_warn: + warnings.warn("Sparse arrays from pandas are converted into dense.") + need_sparse_extension_warn = False + elif ( + is_pd_cat_dtype(dtype) or is_pa_ext_categorical_dtype(dtype) + ) and enable_categorical: + new_feature_types.append(CAT_T) + else: + try: + new_feature_types.append(_pandas_dtype_mapper[dtype.name]) + except KeyError: + _invalid_dataframe_dtype(data) + if feature_types is None and meta is None: - feature_types = [] - for dtype in data.dtypes: - if is_pd_sparse_dtype(dtype): - feature_types.append(_pandas_dtype_mapper[dtype.subtype.name]) - elif ( - is_pd_cat_dtype(dtype) or is_pa_ext_categorical_dtype(dtype) - ) and enable_categorical: - feature_types.append(CAT_T) - else: - feature_types.append(_pandas_dtype_mapper[dtype.name]) + feature_types = new_feature_types + return feature_names, feature_types def is_nullable_dtype(dtype: PandasDType) -> bool: """Whether dtype is a pandas nullable type.""" + + from pandas.api.extensions import ExtensionDtype + + if not isinstance(dtype, ExtensionDtype): + return False + from pandas.api.types import is_bool_dtype, is_float_dtype, is_integer_dtype is_int = is_integer_dtype(dtype) and dtype.name in pandas_nullable_mapper @@ -415,8 +434,8 @@ def is_pa_ext_categorical_dtype(dtype: Any) -> bool: ) -def is_pd_cat_dtype(dtype: PandasDType) -> bool: - """Wrapper for testing pandas category type.""" 
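The `data.py` hunks around here replace per-call version probing with lazily resolved, `functools.cache`-memoized predicates. An illustrative standalone sketch of that pattern (not part of the patch; it uses the third-party `packaging` module only for the example, whereas the patch goes through pandas' vendored `Version`):

    import functools
    from typing import Any, Callable

    @functools.cache
    def _lazy_is_categorical() -> Callable[[Any], bool]:
        # Resolve the pandas-version-dependent check once, then reuse the callable.
        import pandas as pd
        from packaging.version import Version  # assumption: packaging is installed

        if Version(pd.__version__) >= Version("2.1.0"):
            from pandas import CategoricalDtype

            return lambda dtype: isinstance(dtype, CategoricalDtype)
        from pandas.api.types import is_categorical_dtype

        return is_categorical_dtype

    def is_cat(dtype: Any) -> bool:
        return _lazy_is_categorical()(dtype)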
+@functools.cache +def _lazy_load_pd_is_cat() -> Callable[[PandasDType], bool]: import pandas as pd if hasattr(pd.util, "version") and hasattr(pd.util.version, "Version"): @@ -424,15 +443,23 @@ def is_pd_cat_dtype(dtype: PandasDType) -> bool: if Version(pd.__version__) >= Version("2.1.0"): from pandas import CategoricalDtype - return isinstance(dtype, CategoricalDtype) + def pd_is_cat_210(dtype: PandasDType) -> bool: + return isinstance(dtype, CategoricalDtype) + return pd_is_cat_210 from pandas.api.types import is_categorical_dtype # type: ignore - return is_categorical_dtype(dtype) + return is_categorical_dtype -def is_pd_sparse_dtype(dtype: PandasDType) -> bool: - """Wrapper for testing pandas sparse type.""" +def is_pd_cat_dtype(dtype: PandasDType) -> bool: + """Wrapper for testing pandas category type.""" + is_cat = _lazy_load_pd_is_cat() + return is_cat(dtype) + + +@functools.cache +def _lazy_load_pd_is_sparse() -> Callable[[PandasDType], bool]: import pandas as pd if hasattr(pd.util, "version") and hasattr(pd.util.version, "Version"): @@ -440,10 +467,20 @@ def is_pd_sparse_dtype(dtype: PandasDType) -> bool: if Version(pd.__version__) >= Version("2.1.0"): from pandas import SparseDtype - return isinstance(dtype, SparseDtype) + def pd_is_sparse_210(dtype: PandasDType) -> bool: + return isinstance(dtype, SparseDtype) + + return pd_is_sparse_210 from pandas.api.types import is_sparse # type: ignore + return is_sparse + + +def is_pd_sparse_dtype(dtype: PandasDType) -> bool: + """Wrapper for testing pandas sparse type.""" + is_sparse = _lazy_load_pd_is_sparse() + return is_sparse(dtype) @@ -474,33 +511,34 @@ def pandas_pa_type(ser: Any) -> np.ndarray: return arr +@functools.cache +def _lazy_has_npdtypes() -> bool: + return np.lib.NumpyVersion(np.__version__) > np.lib.NumpyVersion("1.25.0") + + +@functools.cache +def _lazy_load_pd_floats() -> tuple: + from pandas import Float32Dtype, Float64Dtype + + return Float32Dtype, Float64Dtype + + def pandas_transform_data(data: DataFrame) -> List[np.ndarray]: """Handle categorical dtype and extension types from pandas.""" - import pandas as pd - from pandas import Float32Dtype, Float64Dtype + Float32Dtype, Float64Dtype = _lazy_load_pd_floats() result: List[np.ndarray] = [] + np_dtypes = _lazy_has_npdtypes() - def cat_codes(ser: pd.Series) -> np.ndarray: - if is_pd_cat_dtype(ser.dtype): - return _ensure_np_dtype( - ser.cat.codes.astype(np.float32) - .replace(-1.0, np.nan) - .to_numpy(na_value=np.nan), - np.float32, - )[0] - # Not yet supported, the index is not ordered for some reason. Alternately: - # `combine_chunks().to_pandas().cat.codes`. The result is the same. - assert is_pa_ext_categorical_dtype(ser.dtype) - return ( - ser.array.__arrow_array__() - .combine_chunks() - .dictionary_encode() - .indices.astype(np.float32) + def cat_codes(ser: PdSeries) -> np.ndarray: + return _ensure_np_dtype( + ser.cat.codes.astype(np.float32) .replace(-1.0, np.nan) - ) + .to_numpy(na_value=np.nan), + np.float32, + )[0] - def nu_type(ser: pd.Series) -> np.ndarray: + def nu_type(ser: PdSeries) -> np.ndarray: # Avoid conversion when possible if isinstance(dtype, Float32Dtype): res_dtype: NumpyDType = np.float32 @@ -512,10 +550,9 @@ def nu_type(ser: pd.Series) -> np.ndarray: ser.to_numpy(dtype=res_dtype, na_value=np.nan), res_dtype )[0] - def oth_type(ser: pd.Series) -> np.ndarray: + def oth_type(ser: PdSeries) -> np.ndarray: # The dtypes module is added in 1.25. 
- npdtypes = np.lib.NumpyVersion(np.__version__) > np.lib.NumpyVersion("1.25.0") - npdtypes = npdtypes and isinstance( + npdtypes = np_dtypes and isinstance( ser.dtype, ( # pylint: disable=no-member @@ -545,7 +582,7 @@ def oth_type(ser: pd.Series) -> np.ndarray: elif is_nullable_dtype(dtype): result.append(nu_type(data[col])) elif is_pd_sparse_dtype(dtype): - arr = cast(pd.arrays.SparseArray, data[col].values) + arr = data[col].values arr = arr.to_dense() if _is_np_array_like(arr): arr, _ = _ensure_np_dtype(arr, arr.dtype) @@ -559,26 +596,6 @@ def oth_type(ser: pd.Series) -> np.ndarray: return result -def pandas_check_dtypes(data: DataFrame, enable_categorical: bool) -> None: - """Validate the input types, returns True if the dataframe is backed by arrow.""" - sparse_extension = False - - for dtype in data.dtypes: - if not ( - (dtype.name in _pandas_dtype_mapper) - or is_pd_sparse_dtype(dtype) - or (is_pd_cat_dtype(dtype) and enable_categorical) - or is_pa_ext_dtype(dtype) - ): - _invalid_dataframe_dtype(data) - - if is_pd_sparse_dtype(dtype): - sparse_extension = True - - if sparse_extension: - warnings.warn("Sparse arrays from pandas are converted into dense.") - - class PandasTransformed: """A storage class for transformed pandas DataFrame.""" @@ -604,7 +621,6 @@ def _transform_pandas_df( feature_types: Optional[FeatureTypes] = None, meta: Optional[str] = None, ) -> Tuple[PandasTransformed, Optional[FeatureNames], Optional[FeatureTypes]]: - pandas_check_dtypes(data, enable_categorical) if meta and len(data.columns) > 1 and meta not in _matrix_meta: raise ValueError(f"DataFrame for {meta} cannot have multiple columns") @@ -856,7 +872,8 @@ def _is_cudf_pandas(data: DataType) -> bool: ) -def _get_cudf_cat_predicate() -> Callable[[Any], bool]: +@functools.cache +def _lazy_load_cudf_is_cat() -> Callable[[Any], bool]: try: from cudf import CategoricalDtype @@ -879,7 +896,7 @@ def _cudf_array_interfaces(data: DataType, cat_codes: list) -> bytes: array interface is finished. """ - is_categorical_dtype = _get_cudf_cat_predicate() + is_categorical_dtype = _lazy_load_cudf_is_cat() interfaces = [] def append(interface: dict) -> None: @@ -917,7 +934,7 @@ def _transform_cudf_df( except ImportError: from pandas.api.types import is_bool_dtype - is_categorical_dtype = _get_cudf_cat_predicate() + is_categorical_dtype = _lazy_load_cudf_is_cat() # Work around https://github.com/dmlc/xgboost/issues/10181 if _is_cudf_ser(data): if is_bool_dtype(data.dtype): diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py index 25448657c8ad..b197539bfc1f 100644 --- a/python-package/xgboost/sklearn.py +++ b/python-package/xgboost/sklearn.py @@ -35,6 +35,8 @@ XGBClassifierBase, XGBModelBase, XGBRegressorBase, + _sklearn_Tags, + _sklearn_version, import_cupy, ) from .config import config_context @@ -54,7 +56,7 @@ from .training import train -class XGBRankerMixIn: # pylint: disable=too-few-public-methods +class XGBRankerMixIn: """MixIn for ranking, defines the _estimator_type usually defined in scikit-learn base classes. @@ -79,7 +81,7 @@ def _can_use_qdm(tree_method: Optional[str], device: Optional[str]) -> bool: return tree_method in ("hist", "gpu_hist", None, "auto") and not_sycl -class _SklObjWProto(Protocol): # pylint: disable=too-few-public-methods +class _SklObjWProto(Protocol): def __call__( self, y_true: ArrayLike, @@ -424,7 +426,7 @@ def task(i: int) -> float: Metric used for monitoring the training result and early stopping. 
It can be a string or list of strings as names of predefined metric in XGBoost (See - doc/parameter.rst), one of the metrics in :py:mod:`sklearn.metrics`, or any + :doc:`/parameter`), one of the metrics in :py:mod:`sklearn.metrics`, or any other user defined metric that looks like `sklearn.metrics`. If custom objective is also provided, then custom metric should implement the @@ -805,6 +807,41 @@ def _more_tags(self) -> Dict[str, bool]: tags["non_deterministic"] = True return tags + @staticmethod + def _update_sklearn_tags_from_dict( + *, + tags: _sklearn_Tags, + tags_dict: Dict[str, bool], + ) -> _sklearn_Tags: + """Update ``sklearn.utils.Tags`` inherited from ``scikit-learn`` base classes. + + ``scikit-learn`` 1.6 introduced a dataclass-based interface for estimator tags. + ref: https://github.com/scikit-learn/scikit-learn/pull/29677 + + This method handles updating that instance based on the values in ``self._more_tags()``. + """ + tags.non_deterministic = tags_dict.get("non_deterministic", False) + tags.no_validation = tags_dict["no_validation"] + tags.input_tags.allow_nan = tags_dict["allow_nan"] + return tags + + def __sklearn_tags__(self) -> _sklearn_Tags: + # XGBModelBase.__sklearn_tags__() cannot be called unconditionally, + # because that method isn't defined for scikit-learn<1.6 + if not hasattr(XGBModelBase, "__sklearn_tags__"): + err_msg = ( + "__sklearn_tags__() should not be called when using scikit-learn<1.6. " + f"Detected version: {_sklearn_version}" + ) + raise AttributeError(err_msg) + + # take whatever tags are provided by BaseEstimator, then modify + # them with XGBoost-specific values + return self._update_sklearn_tags_from_dict( + tags=super().__sklearn_tags__(), # pylint: disable=no-member + tags_dict=self._more_tags(), + ) + def __sklearn_is_fitted__(self) -> bool: return hasattr(self, "_Booster") @@ -898,13 +935,27 @@ def get_params(self, deep: bool = True) -> Dict[str, Any]: """Get parameters.""" # Based on: https://stackoverflow.com/questions/59248211 # The basic flow in `get_params` is: - # 0. Return parameters in subclass first, by using inspect. - # 1. Return parameters in `XGBModel` (the base class). + # 0. Return parameters in subclass (self.__class__) first, by using inspect. + # 1. Return parameters in all parent classes (especially `XGBModel`). # 2. Return whatever in `**kwargs`. # 3. Merge them. + # + # This needs to accommodate being called recursively in the following + # inheritance graphs (and similar for classification and ranking): + # + # XGBRFRegressor -> XGBRegressor -> XGBModel -> BaseEstimator + # XGBRegressor -> XGBModel -> BaseEstimator + # XGBModel -> BaseEstimator + # params = super().get_params(deep) cp = copy.copy(self) - cp.__class__ = cp.__class__.__bases__[0] + # If the immediate parent defines get_params(), use that. + if callable(getattr(cp.__class__.__bases__[0], "get_params", None)): + cp.__class__ = cp.__class__.__bases__[0] + # Otherwise, skip it and assume the next class will have it. + # This is here primarily for cases where the first class in MRO is a scikit-learn mixin. + else: + cp.__class__ = cp.__class__.__bases__[1] params.update(cp.__class__.get_params(cp, deep)) # if kwargs is a dict, update params accordingly if hasattr(self, "kwargs") and isinstance(self.kwargs, dict): @@ -1481,7 +1532,7 @@ def _cls_predict_proba(n_classes: int, prediction: PredtT, vstack: Callable) -> Number of boosting rounds. 
""", ) -class XGBClassifier(XGBModel, XGBClassifierBase): +class XGBClassifier(XGBClassifierBase, XGBModel): # pylint: disable=missing-docstring,invalid-name,too-many-instance-attributes @_deprecate_positional_args def __init__( @@ -1497,6 +1548,12 @@ def _more_tags(self) -> Dict[str, bool]: tags["multilabel"] = True return tags + def __sklearn_tags__(self) -> _sklearn_Tags: + tags = super().__sklearn_tags__() + tags_dict = self._more_tags() + tags.classifier_tags.multi_label = tags_dict["multilabel"] + return tags + @_deprecate_positional_args def fit( self, @@ -1769,7 +1826,7 @@ def fit( "Implementation of the scikit-learn API for XGBoost regression.", ["estimators", "model", "objective"], ) -class XGBRegressor(XGBModel, XGBRegressorBase): +class XGBRegressor(XGBRegressorBase, XGBModel): # pylint: disable=missing-docstring @_deprecate_positional_args def __init__( @@ -1783,6 +1840,13 @@ def _more_tags(self) -> Dict[str, bool]: tags["multioutput_only"] = False return tags + def __sklearn_tags__(self) -> _sklearn_Tags: + tags = super().__sklearn_tags__() + tags_dict = self._more_tags() + tags.target_tags.multi_output = tags_dict["multioutput"] + tags.target_tags.single_output = not tags_dict["multioutput_only"] + return tags + @xgboost_model_doc( "scikit-learn API for XGBoost random forest regression.", @@ -1910,7 +1974,7 @@ def _get_qid( `qid` can be a special column of input `X` instead of a separated parameter, see :py:meth:`fit` for more info.""", ) -class XGBRanker(XGBModel, XGBRankerMixIn): +class XGBRanker(XGBRankerMixIn, XGBModel): # pylint: disable=missing-docstring,too-many-arguments,invalid-name @_deprecate_positional_args def __init__(self, *, objective: str = "rank:ndcg", **kwargs: Any): diff --git a/python-package/xgboost/spark/core.py b/python-package/xgboost/spark/core.py index 166acbe1764b..689e747e8a5c 100644 --- a/python-package/xgboost/spark/core.py +++ b/python-package/xgboost/spark/core.py @@ -2,8 +2,8 @@ import base64 -# pylint: disable=fixme, too-many-ancestors, protected-access, no-member, invalid-name -# pylint: disable=too-few-public-methods, too-many-lines, too-many-branches +# pylint: disable=fixme, protected-access, no-member, invalid-name +# pylint: disable=too-many-lines, too-many-branches import json import logging import os @@ -475,10 +475,7 @@ def _validate_params(self) -> None: ) if self.getOrDefault("early_stopping_rounds") is not None: - if not ( - self.isDefined(self.validationIndicatorCol) - and self.getOrDefault(self.validationIndicatorCol) != "" - ): + if not self._col_is_defined_not_empty(self.validationIndicatorCol): raise ValueError( "If 'early_stopping_rounds' param is set, you need to set " "'validation_indicator_col' param as well." 
@@ -517,6 +514,9 @@ def _run_on_gpu(self) -> bool: or self.getOrDefault(self.getParam("tree_method")) == "gpu_hist" ) + def _col_is_defined_not_empty(self, param: "Param[str]") -> bool: + return self.isDefined(param) and self.getOrDefault(param) != "" + def _validate_and_convert_feature_col_as_float_col_list( dataset: DataFrame, features_col_names: List[str] @@ -805,16 +805,13 @@ def _prepare_input_columns_and_feature_prop( ) select_cols.append(features_array_col) - if self.isDefined(self.weightCol) and self.getOrDefault(self.weightCol) != "": + if self._col_is_defined_not_empty(self.weightCol): select_cols.append( col(self.getOrDefault(self.weightCol)).alias(alias.weight) ) has_validation_col = False - if ( - self.isDefined(self.validationIndicatorCol) - and self.getOrDefault(self.validationIndicatorCol) != "" - ): + if self._col_is_defined_not_empty(self.validationIndicatorCol): select_cols.append( col(self.getOrDefault(self.validationIndicatorCol)).alias(alias.valid) ) @@ -823,15 +820,12 @@ def _prepare_input_columns_and_feature_prop( # which will cause exception or hanging issue when creating DMatrix. has_validation_col = True - if ( - self.isDefined(self.base_margin_col) - and self.getOrDefault(self.base_margin_col) != "" - ): + if self._col_is_defined_not_empty(self.base_margin_col): select_cols.append( col(self.getOrDefault(self.base_margin_col)).alias(alias.margin) ) - if self.isDefined(self.qid_col) and self.getOrDefault(self.qid_col) != "": + if self._col_is_defined_not_empty(self.qid_col): select_cols.append(col(self.getOrDefault(self.qid_col)).alias(alias.qid)) feature_prop = FeatureProp( @@ -862,17 +856,22 @@ def _prepare_input(self, dataset: DataFrame) -> Tuple[DataFrame, FeatureProp]: ) if self._repartition_needed(dataset): - # If validationIndicatorCol defined, and if user unionise train and validation - # dataset, users must set force_repartition to true to force repartition. - # Or else some partitions might contain only train or validation dataset. - if self.getOrDefault(self.repartition_random_shuffle): - # In some cases, spark round-robin repartition might cause data skew - # use random shuffle can address it. - dataset = dataset.repartition(num_workers, rand(1)) + if self._col_is_defined_not_empty(self.qid_col): + # For ranking problem, we need to try best the put the instances with + # same group into the same partition + dataset = dataset.repartitionByRange(num_workers, alias.qid) else: - dataset = dataset.repartition(num_workers) + # If validationIndicatorCol defined, and if user unionise train and validation + # dataset, users must set force_repartition to true to force repartition. + # Or else some partitions might contain only train or validation dataset. + if self.getOrDefault(self.repartition_random_shuffle): + # In some cases, spark round-robin repartition might cause data skew + # use random shuffle can address it. 
+ dataset = dataset.repartition(num_workers, rand(1)) + else: + dataset = dataset.repartition(num_workers) - if self.isDefined(self.qid_col) and self.getOrDefault(self.qid_col) != "": + if self._col_is_defined_not_empty(self.qid_col): # XGBoost requires qid to be sorted for each partition dataset = dataset.sortWithinPartitions(alias.qid, ascending=True) @@ -1306,10 +1305,7 @@ def _get_feature_col( def _get_pred_contrib_col_name(self) -> Optional[str]: """Return the pred_contrib_col col name""" pred_contrib_col_name = None - if ( - self.isDefined(self.pred_contrib_col) - and self.getOrDefault(self.pred_contrib_col) != "" - ): + if self._col_is_defined_not_empty(self.pred_contrib_col): pred_contrib_col_name = self.getOrDefault(self.pred_contrib_col) return pred_contrib_col_name @@ -1413,10 +1409,7 @@ def _transform(self, dataset: DataFrame) -> DataFrame: xgb_sklearn_model = self._xgb_sklearn_model base_margin_col = None - if ( - self.isDefined(self.base_margin_col) - and self.getOrDefault(self.base_margin_col) != "" - ): + if self._col_is_defined_not_empty(self.base_margin_col): base_margin_col = col(self.getOrDefault(self.base_margin_col)).alias( alias.margin ) diff --git a/python-package/xgboost/spark/estimator.py b/python-package/xgboost/spark/estimator.py index f53ef72eb99e..011f7ea0b715 100644 --- a/python-package/xgboost/spark/estimator.py +++ b/python-package/xgboost/spark/estimator.py @@ -1,7 +1,6 @@ """Xgboost pyspark integration submodule for estimator API.""" -# pylint: disable=too-many-ancestors -# pylint: disable=fixme, too-many-ancestors, protected-access, no-member, invalid-name +# pylint: disable=fixme, protected-access, no-member, invalid-name # pylint: disable=unused-argument, too-many-locals import warnings diff --git a/python-package/xgboost/spark/params.py b/python-package/xgboost/spark/params.py index a177c73fe413..f173d3301286 100644 --- a/python-package/xgboost/spark/params.py +++ b/python-package/xgboost/spark/params.py @@ -2,7 +2,6 @@ from typing import Dict -# pylint: disable=too-few-public-methods from pyspark.ml.param import TypeConverters from pyspark.ml.param.shared import Param, Params diff --git a/python-package/xgboost/spark/utils.py b/python-package/xgboost/spark/utils.py index c96ec284abe3..e0d3e094a805 100644 --- a/python-package/xgboost/spark/utils.py +++ b/python-package/xgboost/spark/utils.py @@ -47,7 +47,7 @@ def _get_default_params_from_func( return filtered_params_dict -class CommunicatorContext(CCtx): # pylint: disable=too-few-public-methods +class CommunicatorContext(CCtx): """Context with PySpark specific task ID.""" def __init__(self, context: BarrierTaskContext, **args: CollArgsVals) -> None: diff --git a/python-package/xgboost/testing/__init__.py b/python-package/xgboost/testing/__init__.py index 5fbafd6ec58f..0821aee913c3 100644 --- a/python-package/xgboost/testing/__init__.py +++ b/python-package/xgboost/testing/__init__.py @@ -457,7 +457,11 @@ def make_categorical( def make_ltr( - n_samples: int, n_features: int, n_query_groups: int, max_rel: int + n_samples: int, + n_features: int, + n_query_groups: int, + max_rel: int, + sort_qid: bool = True, ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: """Make a dataset for testing LTR.""" rng = np.random.default_rng(1994) @@ -470,7 +474,8 @@ def make_ltr( w = rng.normal(0, 1.0, size=n_query_groups) w -= np.min(w) w /= np.max(w) - qid = np.sort(qid) + if sort_qid: + qid = np.sort(qid) return X, y, qid, w @@ -637,6 +642,10 @@ def non_increasing(L: Sequence[float], tolerance: float = 1e-4) 
-> bool: return all((y - x) < tolerance for x, y in zip(L, L[1:])) +def non_decreasing(L: Sequence[float], tolerance: float = 1e-4) -> bool: + return all((y - x) >= -tolerance for x, y in zip(L, L[1:])) + + def predictor_equal(lhs: xgb.DMatrix, rhs: xgb.DMatrix) -> bool: """Assert whether two DMatrices contain the same predictors.""" lcsr = lhs.get_data() @@ -653,9 +662,29 @@ def predictor_equal(lhs: xgb.DMatrix, rhs: xgb.DMatrix) -> bool: M = TypeVar("M", xgb.Booster, xgb.XGBModel) -def eval_error_metric(predt: np.ndarray, dtrain: xgb.DMatrix) -> Tuple[str, np.float64]: - """Evaluation metric for xgb.train""" +def logregobj(preds: np.ndarray, dtrain: xgb.DMatrix) -> Tuple[np.ndarray, np.ndarray]: + """Binary regression custom objective.""" + labels = dtrain.get_label() + preds = 1.0 / (1.0 + np.exp(-preds)) + grad = preds - labels + hess = preds * (1.0 - preds) + return grad, hess + + +def eval_error_metric( + predt: np.ndarray, dtrain: xgb.DMatrix, rev_link: bool +) -> Tuple[str, np.float64]: + """Evaluation metric for xgb.train. + + Parameters + ---------- + rev_link : Whether the metric needs to apply the reverse link function (activation). + + """ label = dtrain.get_label() + if rev_link: + predt = 1.0 / (1.0 + np.exp(-predt)) + assert (0.0 <= predt).all() and (predt <= 1.0).all() r = np.zeros(predt.shape) gt = predt > 0.5 if predt.size == 0: @@ -666,8 +695,15 @@ def eval_error_metric(predt: np.ndarray, dtrain: xgb.DMatrix) -> Tuple[str, np.f return "CustomErr", np.sum(r) -def eval_error_metric_skl(y_true: np.ndarray, y_score: np.ndarray) -> np.float64: +def eval_error_metric_skl( + y_true: np.ndarray, y_score: np.ndarray, rev_link: bool = False +) -> np.float64: """Evaluation metric that looks like metrics provided by sklearn.""" + + if rev_link: + y_score = 1.0 / (1.0 + np.exp(-y_score)) + assert (0.0 <= y_score).all() and (y_score <= 1.0).all() + r = np.zeros(y_score.shape) gt = y_score > 0.5 r[gt] = 1 - y_true[gt] diff --git a/python-package/xgboost/testing/dask.py b/python-package/xgboost/testing/dask.py index 541009a73c85..af0fc8bf0397 100644 --- a/python-package/xgboost/testing/dask.py +++ b/python-package/xgboost/testing/dask.py @@ -1,6 +1,6 @@ """Tests for dask shared by different test modules.""" -from typing import Any, List, Literal, cast +from typing import Any, List, Literal, Tuple, cast import numpy as np import pandas as pd @@ -175,7 +175,82 @@ def get_rabit_args(client: Client, n_workers: int) -> Any: return client.sync(_get_rabit_args, client, n_workers) -def get_client_workers(client: Any) -> List[str]: +def get_client_workers(client: Client) -> List[str]: "Get workers from a dask client." 
workers = client.scheduler_info()["workers"] return list(workers.keys()) + + +def make_ltr( # pylint: disable=too-many-locals,too-many-arguments + client: Client, + n_samples: int, + n_features: int, + *, + n_query_groups: int, + max_rel: int, + device: str, +) -> Tuple[dd.DataFrame, dd.Series, dd.Series]: + """Synthetic dataset for learning to rank.""" + workers = get_client_workers(client) + n_samples_per_worker = n_samples // len(workers) + + if device == "cpu": + from pandas import DataFrame as DF + else: + from cudf import DataFrame as DF + + def make(n: int, seed: int) -> pd.DataFrame: + rng = np.random.default_rng(seed) + X, y = make_classification( + n, n_features, n_informative=n_features, n_redundant=0, n_classes=max_rel + ) + qid = rng.integers(size=(n,), low=0, high=n_query_groups) + df = DF(X, columns=[f"f{i}" for i in range(n_features)]) + df["qid"] = qid + df["y"] = y + return df + + futures = [] + i = 0 + for k in range(0, n_samples, n_samples_per_worker): + fut = client.submit( + make, n=n_samples_per_worker, seed=k, workers=[workers[i % len(workers)]] + ) + futures.append(fut) + i += 1 + + last = n_samples - (n_samples_per_worker * len(workers)) + if last != 0: + fut = client.submit(make, n=last, seed=n_samples_per_worker * len(workers)) + futures.append(fut) + + meta = make(1, 0) + df = dd.from_delayed(futures, meta=meta) + assert isinstance(df, dd.DataFrame) + return df.drop(["qid", "y"], axis=1), df.y, df.qid + + +def check_no_group_split(client: Client, device: str) -> None: + """Test for the allow_group_split parameter.""" + X_tr, q_tr, y_tr = make_ltr( + client, 4096, 128, n_query_groups=4, max_rel=5, device=device + ) + X_va, q_va, y_va = make_ltr( + client, 1024, 128, n_query_groups=4, max_rel=5, device=device + ) + + ltr = dxgb.DaskXGBRanker(allow_group_split=False, n_estimators=32, device=device) + ltr.fit( + X_tr, + y_tr, + qid=q_tr, + eval_set=[(X_tr, y_tr), (X_va, y_va)], + eval_qid=[q_tr, q_va], + verbose=True, + ) + + assert ltr.n_features_in_ == 128 + assert X_tr.shape[1] == ltr.n_features_in_ # no change + ndcg = ltr.evals_result()["validation_0"]["ndcg@32"] + assert tm.non_decreasing(ndcg[:16], tolerance=1e-2), ndcg + np.testing.assert_allclose(ndcg[-1], 1.0, rtol=1e-2) diff --git a/python-package/xgboost/testing/data.py b/python-package/xgboost/testing/data.py index d9a4c85af326..34f55c077a85 100644 --- a/python-package/xgboost/testing/data.py +++ b/python-package/xgboost/testing/data.py @@ -566,7 +566,7 @@ def is_binary(self) -> bool: return self.max_rel == 1 -class PBM: # pylint: disable=too-few-public-methods +class PBM: """Simulate click data with position bias model. There are other models available in `ULTRA `_ like the cascading model. diff --git a/python-package/xgboost/training.py b/python-package/xgboost/training.py index bb4ebe44e1ed..29a516e81e24 100644 --- a/python-package/xgboost/training.py +++ b/python-package/xgboost/training.py @@ -3,7 +3,6 @@ """Training Library containing training routines.""" import copy import os -import warnings from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple, Union, cast import numpy as np @@ -28,26 +27,6 @@ _CVFolds = Sequence["CVPack"] -def _configure_custom_metric( - feval: Optional[Metric], custom_metric: Optional[Metric] -) -> Optional[Metric]: - if feval is not None: - link = ( - "https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_obj.html" - ) - warnings.warn( - "`feval` is deprecated, use `custom_metric` instead. 
They have " - "different behavior when custom objective is also used." - f"See {link} for details on the `custom_metric`." - ) - if feval is not None and custom_metric is not None: - raise ValueError( - "Both `feval` and `custom_metric` are supplied. Use `custom_metric` instead." - ) - eval_metric = custom_metric if custom_metric is not None else feval - return eval_metric - - @_deprecate_positional_args def train( params: Dict[str, Any], @@ -56,7 +35,6 @@ def train( *, evals: Optional[Sequence[Tuple[DMatrix, str]]] = None, obj: Optional[Objective] = None, - feval: Optional[Metric] = None, maximize: Optional[bool] = None, early_stopping_rounds: Optional[int] = None, evals_result: Optional[TrainingCallback.EvalsLog] = None, @@ -81,23 +59,27 @@ def train( obj Custom objective function. See :doc:`Custom Objective ` for details. - feval : - .. deprecated:: 1.6.0 - Use `custom_metric` instead. maximize : - Whether to maximize feval. + Whether to maximize custom_metric. + early_stopping_rounds : + Activates early stopping. Validation metric needs to improve at least once in every **early_stopping_rounds** round(s) to continue training. + Requires at least one item in **evals**. + The method returns the model from the last iteration (not the best one). Use - custom callback or model slicing if the best model is desired. - If there's more than one item in **evals**, the last entry will be used for early - stopping. + custom callback :py:class:`~xgboost.callback.EarlyStopping` or :py:meth:`model + slicing ` if the best model is desired. If there's + more than one item in **evals**, the last entry will be used for early stopping. + If there's more than one metric in the **eval_metric** parameter given in **params**, the last metric will be used for early stopping. + If early stopping occurs, the model will have two additional fields: ``bst.best_score``, ``bst.best_iteration``. + evals_result : This dictionary stores the evaluation results of all the items in watchlist. @@ -113,15 +95,22 @@ def train( verbose_eval : Requires at least one item in **evals**. + If **verbose_eval** is True then the evaluation metric on the validation set is printed at each boosting stage. - If **verbose_eval** is an integer then the evaluation metric on the validation set - is printed at every given **verbose_eval** boosting stage. The last boosting stage - / the boosting stage found by using **early_stopping_rounds** is also printed. - Example: with ``verbose_eval=4`` and at least one item in **evals**, an evaluation metric - is printed every 4 boosting stages, instead of every boosting stage. + + If **verbose_eval** is an integer then the evaluation metric on the validation + set is printed at every given **verbose_eval** boosting stage. The last boosting + stage / the boosting stage found by using **early_stopping_rounds** is also + printed. + + Example: with ``verbose_eval=4`` and at least one item in **evals**, an + evaluation metric is printed every 4 boosting stages, instead of every boosting + stage. + xgb_model : Xgb model to be loaded before training (allows training continuation). + callbacks : List of callback functions that are applied at end of each iteration. It is possible to use predefined callbacks by using @@ -145,15 +134,17 @@ def train( .. versionadded 1.6.0 Custom metric function. See :doc:`Custom Metric ` - for details. + for details. 
The metric receives transformed prediction (after applying the
+        reverse link function) when using a builtin objective, and raw output when using
+        a custom objective.
 
     Returns
     -------
     Booster : a trained booster model
+
     """
     callbacks = [] if callbacks is None else copy.copy(list(callbacks))
-    metric_fn = _configure_custom_metric(feval, custom_metric)
     evals = list(evals) if evals else []
 
     bst = Booster(params, [dtrain] + [d[0] for d in evals], model_file=xgb_model)
@@ -165,12 +156,7 @@ def train(
     if early_stopping_rounds:
         callbacks.append(EarlyStopping(rounds=early_stopping_rounds, maximize=maximize))
     cb_container = CallbackContainer(
-        callbacks,
-        metric=metric_fn,
-        # For old `feval` parameter, the behavior is unchanged. For the new
-        # `custom_metric`, it will receive proper prediction result when custom objective
-        # is not used.
-        output_margin=callable(obj) or metric_fn is feval,
+        callbacks, metric=custom_metric, output_margin=callable(obj)
     )
     bst = cb_container.before_training(bst)
@@ -187,9 +173,7 @@ def train(
     if evals_result is not None:
         evals_result.update(cb_container.history)
 
-    # Copy to serialise and unserialise booster to reset state and free
-    # training memory
-    return bst.copy()
+    return bst.reset()
 
 
 class CVPack:
@@ -425,7 +409,6 @@ def cv(
     folds: XGBStratifiedKFold = None,
     metrics: Sequence[str] = (),
     obj: Optional[Objective] = None,
-    feval: Optional[Metric] = None,
     maximize: Optional[bool] = None,
     early_stopping_rounds: Optional[int] = None,
     fpreproc: Optional[FPreProcCallable] = None,
@@ -466,11 +449,9 @@ def cv(
         Custom objective function. See :doc:`Custom Objective ` for details.
-    feval : function
-        .. deprecated:: 1.6.0
-            Use `custom_metric` instead.
     maximize : bool
-        Whether to maximize feval.
+        Whether to maximize the evaluation metric (score or error).
+
     early_stopping_rounds: int
         Activates early stopping.
Cross-Validation metric (average of validation metric computed over CV folds) needs to improve at least once in @@ -561,8 +542,6 @@ def cv( shuffle=shuffle, ) - metric_fn = _configure_custom_metric(feval, custom_metric) - # setup callbacks callbacks = [] if callbacks is None else copy.copy(list(callbacks)) @@ -572,10 +551,7 @@ def cv( if early_stopping_rounds: callbacks.append(EarlyStopping(rounds=early_stopping_rounds, maximize=maximize)) callbacks_container = CallbackContainer( - callbacks, - metric=metric_fn, - is_cv=True, - output_margin=callable(obj) or metric_fn is feval, + callbacks, metric=custom_metric, is_cv=True, output_margin=callable(obj) ) booster = _PackedBooster(cvfolds) diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 90407fcf58ac..d3e11d2f894c 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -980,6 +980,13 @@ XGB_DLL int XGBoosterFree(BoosterHandle handle) { API_END(); } +XGB_DLL int XGBoosterReset(BoosterHandle handle) { + API_BEGIN(); + CHECK_HANDLE(); + static_cast(handle)->Reset(); + API_END(); +} + XGB_DLL int XGBoosterSetParam(BoosterHandle handle, const char *name, const char *value) { @@ -1091,18 +1098,18 @@ XGB_DLL int XGBoosterTrainOneIter(BoosterHandle handle, DMatrixHandle dtrain, in ArrayInterface<2, false> i_grad{StringView{grad}}; ArrayInterface<2, false> i_hess{StringView{hess}}; StringView msg{"Mismatched shape between the gradient and hessian."}; - CHECK_EQ(i_grad.Shape(0), i_hess.Shape(0)) << msg; - CHECK_EQ(i_grad.Shape(1), i_hess.Shape(1)) << msg; + CHECK_EQ(i_grad.Shape<0>(), i_hess.Shape<0>()) << msg; + CHECK_EQ(i_grad.Shape<1>(), i_hess.Shape<1>()) << msg; linalg::Matrix gpair; auto grad_is_cuda = ArrayInterfaceHandler::IsCudaPtr(i_grad.data); auto hess_is_cuda = ArrayInterfaceHandler::IsCudaPtr(i_hess.data); - CHECK_EQ(i_grad.Shape(0), p_fmat->Info().num_row_) + CHECK_EQ(i_grad.Shape<0>(), p_fmat->Info().num_row_) << "Mismatched size between the gradient and training data."; CHECK_EQ(grad_is_cuda, hess_is_cuda) << "gradient and hessian should be on the same device."; auto *learner = static_cast(handle); auto ctx = learner->Ctx(); if (!grad_is_cuda) { - gpair.Reshape(i_grad.Shape(0), i_grad.Shape(1)); + gpair.Reshape(i_grad.Shape<0>(), i_grad.Shape<1>()); auto h_gpair = gpair.HostView(); DispatchDType(i_grad, DeviceOrd::CPU(), [&](auto &&t_grad) { DispatchDType(i_hess, DeviceOrd::CPU(), [&](auto &&t_hess) { diff --git a/src/c_api/c_api.cu b/src/c_api/c_api.cu index 47868f466473..c9ff16dea120 100644 --- a/src/c_api/c_api.cu +++ b/src/c_api/c_api.cu @@ -1,5 +1,5 @@ /** - * Copyright 2019-2023, XGBoost Contributors + * Copyright 2019-2024, XGBoost Contributors */ #include // for transform @@ -78,7 +78,7 @@ void CopyGradientFromCUDAArrays(Context const *ctx, ArrayInterface<2, false> con CHECK_EQ(grad_dev, hess_dev) << "gradient and hessian should be on the same device."; auto &gpair = *out_gpair; gpair.SetDevice(DeviceOrd::CUDA(grad_dev)); - gpair.Reshape(grad.Shape(0), grad.Shape(1)); + gpair.Reshape(grad.Shape<0>(), grad.Shape<1>()); auto d_gpair = gpair.View(DeviceOrd::CUDA(grad_dev)); auto cuctx = ctx->CUDACtx(); diff --git a/src/common/device_helpers.cu b/src/common/device_helpers.cu index 01e81b16ee0b..608a535cd8cb 100644 --- a/src/common/device_helpers.cu +++ b/src/common/device_helpers.cu @@ -7,11 +7,6 @@ namespace dh { PinnedMemory::PinnedMemory() { - // Use the `GrowOnlyPinnedMemoryImpl` as the only option for now. 
- // See https://github.com/dmlc/xgboost/issues/10933 - this->impl_.emplace(); - return; - #if defined(xgboost_IS_WIN) this->impl_.emplace(); #else diff --git a/src/data/adapter.h b/src/data/adapter.h index 0ad1e9e3864c..1467d3376886 100644 --- a/src/data/adapter.h +++ b/src/data/adapter.h @@ -1,22 +1,22 @@ /** - * Copyright 2019-2023, XGBoost Contributors + * Copyright 2019-2024, XGBoost Contributors * \file adapter.h */ #ifndef XGBOOST_DATA_ADAPTER_H_ #define XGBOOST_DATA_ADAPTER_H_ #include -#include -#include // for size_t -#include -#include -#include -#include -#include -#include // std::move -#include - -#include "../common/error_msg.h" // for MaxFeatureSize +#include // for transform, all_of +#include // for isfinite +#include // for size_t +#include // for uint8_t +#include // for back_inserter +#include // for numeric_limits +#include // for unique_ptr, make_unique +#include // for string +#include // for move +#include // for vector + #include "../common/math.h" #include "array_interface.h" #include "xgboost/base.h" @@ -256,7 +256,7 @@ class ArrayAdapterBatch : public detail::NoMetaInfo { Line(ArrayInterface<2> array_interface, size_t ridx) : array_interface_{std::move(array_interface)}, ridx_{ridx} {} - size_t Size() const { return array_interface_.Shape(1); } + size_t Size() const { return array_interface_.Shape<1>(); } COOTuple GetElement(size_t idx) const { return {ridx_, idx, array_interface_(ridx_, idx)}; @@ -269,8 +269,8 @@ class ArrayAdapterBatch : public detail::NoMetaInfo { return Line{array_interface_, idx}; } - [[nodiscard]] std::size_t NumRows() const { return array_interface_.Shape(0); } - [[nodiscard]] std::size_t NumCols() const { return array_interface_.Shape(1); } + [[nodiscard]] std::size_t NumRows() const { return array_interface_.Shape<0>(); } + [[nodiscard]] std::size_t NumCols() const { return array_interface_.Shape<1>(); } [[nodiscard]] std::size_t Size() const { return this->NumRows(); } explicit ArrayAdapterBatch(ArrayInterface<2> array_interface) @@ -290,8 +290,8 @@ class ArrayAdapter : public detail::SingleBatchDataIter { batch_ = ArrayAdapterBatch{array_interface_}; } [[nodiscard]] ArrayAdapterBatch const& Value() const override { return batch_; } - [[nodiscard]] std::size_t NumRows() const { return array_interface_.Shape(0); } - [[nodiscard]] std::size_t NumColumns() const { return array_interface_.Shape(1); } + [[nodiscard]] std::size_t NumRows() const { return array_interface_.Shape<0>(); } + [[nodiscard]] std::size_t NumColumns() const { return array_interface_.Shape<1>(); } private: ArrayAdapterBatch batch_; @@ -321,7 +321,7 @@ class CSRArrayAdapterBatch : public detail::NoMetaInfo { } [[nodiscard]] std::size_t Size() const { - return values_.Shape(0); + return values_.Shape<0>(); } }; @@ -339,7 +339,7 @@ class CSRArrayAdapterBatch : public detail::NoMetaInfo { } size_t NumRows() const { - size_t size = indptr_.Shape(0); + size_t size = indptr_.Shape<0>(); size = size == 0 ? 0 : size - 1; return size; } @@ -381,9 +381,9 @@ class CSRArrayAdapter : public detail::SingleBatchDataIter return batch_; } size_t NumRows() const { - size_t size = indptr_.Shape(0); + size_t size = indptr_.Shape<0>(); size = size == 0 ? 
0 : size - 1; - return size; + return size; } size_t NumColumns() const { return num_cols_; } @@ -479,7 +479,7 @@ class CSCArrayAdapterBatch : public detail::NoMetaInfo { values_{std::move(values)}, offset_{offset} {} - std::size_t Size() const { return values_.Shape(0); } + std::size_t Size() const { return values_.Shape<0>(); } COOTuple GetElement(std::size_t idx) const { return {TypedIndex{row_idx_}(offset_ + idx), column_idx_, values_(offset_ + idx)}; @@ -684,7 +684,7 @@ class ColumnarAdapterBatch : public detail::NoMetaInfo { : columns_{columns} {} [[nodiscard]] Line GetLine(std::size_t ridx) const { return Line{columns_, ridx}; } [[nodiscard]] std::size_t Size() const { - return columns_.empty() ? 0 : columns_.front().Shape(0); + return columns_.empty() ? 0 : columns_.front().Shape<0>(); } [[nodiscard]] std::size_t NumCols() const { return columns_.empty() ? 0 : columns_.size(); } [[nodiscard]] std::size_t NumRows() const { return this->Size(); } @@ -707,7 +707,7 @@ class ColumnarAdapter : public detail::SingleBatchDataIter bool consistent = columns_.empty() || std::all_of(columns_.cbegin(), columns_.cend(), [&](ArrayInterface<1, false> const& array) { - return array.Shape(0) == columns_[0].Shape(0); + return array.Shape<0>() == columns_[0].Shape<0>(); }); CHECK(consistent) << "Size of columns should be the same."; batch_ = ColumnarAdapterBatch{columns_}; diff --git a/src/data/array_interface.h b/src/data/array_interface.h index 93fb55dd5626..35056b74f3aa 100644 --- a/src/data/array_interface.h +++ b/src/data/array_interface.h @@ -501,8 +501,16 @@ class ArrayInterface { } } - [[nodiscard]] XGBOOST_DEVICE std::size_t Shape(size_t i) const { return shape[i]; } - [[nodiscard]] XGBOOST_DEVICE std::size_t Stride(size_t i) const { return strides[i]; } + template + [[nodiscard]] XGBOOST_DEVICE std::size_t Shape() const { + static_assert(i < D); + return shape[i]; + } + template + [[nodiscard]] XGBOOST_DEVICE std::size_t Stride() const { + static_assert(i < D); + return strides[i]; + } template XGBOOST_HOST_DEV_INLINE decltype(auto) DispatchCall(Fn func) const { diff --git a/src/data/data.cc b/src/data/data.cc index 47836bb5134b..713ad4a1a514 100644 --- a/src/data/data.cc +++ b/src/data/data.cc @@ -539,7 +539,9 @@ void MetaInfo::SetInfoFromHost(Context const* ctx, StringView key, Json arr) { } else if (key == "label") { CopyTensorInfoImpl(ctx, arr, &this->labels); if (this->num_row_ != 0 && this->labels.Shape(0) != this->num_row_) { - CHECK_EQ(this->labels.Size() % this->num_row_, 0) << "Incorrect size for labels."; + CHECK_EQ(this->labels.Size() % this->num_row_, 0) + << "Incorrect size for labels: (" << this->labels.Shape(0) << "," << this->labels.Shape(1) + << ") v.s. 
" << this->num_row_; size_t n_targets = this->labels.Size() / this->num_row_; this->labels.Reshape(this->num_row_, n_targets); } diff --git a/src/data/data.cu b/src/data/data.cu index 73717aa79700..17fc54a562a4 100644 --- a/src/data/data.cu +++ b/src/data/data.cu @@ -69,12 +69,12 @@ void CopyGroupInfoImpl(ArrayInterface<1> column, std::vector* out) auto ptr_device = SetDeviceToPtr(column.data); CHECK_EQ(ptr_device, dh::CurrentDevice()); - dh::TemporaryArray temp(column.Shape(0)); + dh::TemporaryArray temp(column.Shape<0>()); auto d_tmp = temp.data().get(); - dh::LaunchN(column.Shape(0), + dh::LaunchN(column.Shape<0>(), [=] __device__(size_t idx) { d_tmp[idx] = TypedIndex{column}(idx); }); - auto length = column.Shape(0); + auto length = column.Shape<0>(); out->resize(length + 1); out->at(0) = 0; thrust::copy(temp.data(), temp.data() + length, out->begin() + 1); @@ -93,7 +93,7 @@ void CopyQidImpl(Context const* ctx, ArrayInterface<1> array_interface, auto d = DeviceOrd::CUDA(SetDeviceToPtr(array_interface.data)); auto cuctx = ctx->CUDACtx(); dh::LaunchN(1, cuctx->Stream(), [=] __device__(size_t) { d_flag[0] = true; }); - dh::LaunchN(array_interface.Shape(0) - 1, cuctx->Stream(), [=] __device__(size_t i) { + dh::LaunchN(array_interface.Shape<0>() - 1, cuctx->Stream(), [=] __device__(size_t i) { auto typed = TypedIndex{array_interface}; if (typed(i) > typed(i + 1)) { d_flag[0] = false; @@ -104,15 +104,15 @@ void CopyQidImpl(Context const* ctx, ArrayInterface<1> array_interface, cudaMemcpyDeviceToHost)); CHECK(non_dec) << "`qid` must be sorted in increasing order along with data."; size_t bytes = 0; - dh::caching_device_vector out(array_interface.Shape(0)); - dh::caching_device_vector cnt(array_interface.Shape(0)); + dh::caching_device_vector out(array_interface.Shape<0>()); + dh::caching_device_vector cnt(array_interface.Shape<0>()); HostDeviceVector d_num_runs_out(1, 0, d); cub::DeviceRunLengthEncode::Encode(nullptr, bytes, it, out.begin(), cnt.begin(), - d_num_runs_out.DevicePointer(), array_interface.Shape(0), + d_num_runs_out.DevicePointer(), array_interface.Shape<0>(), cuctx->Stream()); dh::CachingDeviceUVector tmp(bytes); cub::DeviceRunLengthEncode::Encode(tmp.data(), bytes, it, out.begin(), cnt.begin(), - d_num_runs_out.DevicePointer(), array_interface.Shape(0), + d_num_runs_out.DevicePointer(), array_interface.Shape<0>(), cuctx->Stream()); auto h_num_runs_out = d_num_runs_out.HostSpan()[0]; diff --git a/src/data/device_adapter.cuh b/src/data/device_adapter.cuh index 9089c361ea23..cad3cffbc58a 100644 --- a/src/data/device_adapter.cuh +++ b/src/data/device_adapter.cuh @@ -16,9 +16,7 @@ #include "adapter.h" #include "array_interface.h" -namespace xgboost { -namespace data { - +namespace xgboost::data { class CudfAdapterBatch : public detail::NoMetaInfo { friend class CudfAdapter; @@ -114,7 +112,7 @@ class CudfAdapter : public detail::SingleBatchDataIter { CHECK_EQ(typestr.size(), 3) << ArrayInterfaceErrors::TypestrFormat(); std::vector> columns; auto first_column = ArrayInterface<1>(get(json_columns[0])); - num_rows_ = first_column.Shape(0); + num_rows_ = first_column.Shape<0>(); if (num_rows_ == 0) { return; } @@ -124,12 +122,12 @@ class CudfAdapter : public detail::SingleBatchDataIter { dh::safe_cuda(cudaSetDevice(device_.ordinal)); for (auto& json_col : json_columns) { auto column = ArrayInterface<1>(get(json_col)); - n_bytes_ += column.ElementSize() * column.Shape(0); + n_bytes_ += column.ElementSize() * column.Shape<0>(); columns.push_back(column); - num_rows_ = std::max(num_rows_, 
column.Shape(0)); + num_rows_ = std::max(num_rows_, column.Shape<0>()); CHECK_EQ(device_.ordinal, dh::CudaGetPointerDevice(column.data)) << "All columns should use the same device."; - CHECK_EQ(num_rows_, column.Shape(0)) + CHECK_EQ(num_rows_, column.Shape<0>()) << "All columns should have same number of rows."; } columns_ = columns; @@ -161,12 +159,13 @@ class CupyAdapterBatch : public detail::NoMetaInfo { CupyAdapterBatch() = default; explicit CupyAdapterBatch(ArrayInterface<2> array_interface) : array_interface_(std::move(array_interface)) {} + // The total number of elements. [[nodiscard]] std::size_t Size() const { - return array_interface_.Shape(0) * array_interface_.Shape(1); + return array_interface_.Shape<0>() * array_interface_.Shape<1>(); } [[nodiscard]]__device__ COOTuple GetElement(size_t idx) const { - size_t column_idx = idx % array_interface_.Shape(1); - size_t row_idx = idx / array_interface_.Shape(1); + size_t column_idx = idx % array_interface_.Shape<1>(); + size_t row_idx = idx / array_interface_.Shape<1>(); float value = array_interface_(row_idx, column_idx); return {row_idx, column_idx, value}; } @@ -175,8 +174,8 @@ class CupyAdapterBatch : public detail::NoMetaInfo { return value; } - [[nodiscard]] XGBOOST_DEVICE bst_idx_t NumRows() const { return array_interface_.Shape(0); } - [[nodiscard]] XGBOOST_DEVICE bst_idx_t NumCols() const { return array_interface_.Shape(1); } + [[nodiscard]] XGBOOST_DEVICE bst_idx_t NumRows() const { return array_interface_.Shape<0>(); } + [[nodiscard]] XGBOOST_DEVICE bst_idx_t NumCols() const { return array_interface_.Shape<1>(); } private: ArrayInterface<2> array_interface_; @@ -188,20 +187,20 @@ class CupyAdapter : public detail::SingleBatchDataIter { Json json_array_interface = Json::Load(cuda_interface_str); array_interface_ = ArrayInterface<2>(get(json_array_interface)); batch_ = CupyAdapterBatch(array_interface_); - if (array_interface_.Shape(0) == 0) { + if (array_interface_.Shape<0>() == 0) { return; } device_ = DeviceOrd::CUDA(dh::CudaGetPointerDevice(array_interface_.data)); this->n_bytes_ = - array_interface_.Shape(0) * array_interface_.Shape(1) * array_interface_.ElementSize(); + array_interface_.Shape<0>() * array_interface_.Shape<1>() * array_interface_.ElementSize(); CHECK(device_.IsCUDA()); } explicit CupyAdapter(std::string cuda_interface_str) : CupyAdapter{StringView{cuda_interface_str}} {} [[nodiscard]] const CupyAdapterBatch& Value() const override { return batch_; } - [[nodiscard]] std::size_t NumRows() const { return array_interface_.Shape(0); } - [[nodiscard]] std::size_t NumColumns() const { return array_interface_.Shape(1); } + [[nodiscard]] std::size_t NumRows() const { return array_interface_.Shape<0>(); } + [[nodiscard]] std::size_t NumColumns() const { return array_interface_.Shape<1>(); } [[nodiscard]] DeviceOrd Device() const { return device_; } [[nodiscard]] bst_idx_t SizeBytes() const { return this->n_bytes_; } @@ -279,6 +278,5 @@ bool NoInfInData(Context const* ctx, AdapterBatchT const& batch, IsValidFunctor thrust::logical_and<>{}); return valid; } -}; // namespace data -} // namespace xgboost +} // namespace xgboost::data #endif // XGBOOST_DATA_DEVICE_ADAPTER_H_ diff --git a/src/learner.cc b/src/learner.cc index e6642b0874ac..1dcd0fcfc7eb 100644 --- a/src/learner.cc +++ b/src/learner.cc @@ -860,6 +860,7 @@ class LearnerIO : public LearnerConfiguration { // Will be removed once JSON takes over. Right now we still loads some RDS files from R. 
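The `CudfAdapter`/`CupyAdapter` changes above are mechanical: the data they consume is unchanged, only the shape accessor moves to the compile-time form `Shape<0>()`/`Shape<1>()`. For orientation, the metadata these adapters parse is the array-interface protocol, a small dict of `shape`, `typestr`, `data`, and `version`. The sketch below prints NumPy's CPU-side `__array_interface__`, which has the same layout as CuPy's `__cuda_array_interface__`, so it runs without a GPU; it illustrates the protocol and is not code from this patch.

```python
# The (CUDA) array interface that CudfAdapter/CupyAdapter deserialize is a
# dict of shape/typestr/data/version. NumPy's CPU-side __array_interface__
# uses the same fields, so this illustration needs no GPU.
import json
import numpy as np

x = np.arange(12, dtype=np.float32).reshape(3, 4)
aif = x.__array_interface__
print(json.dumps({k: aif[k] for k in ("shape", "typestr", "data", "version")}))
# shape[0] / shape[1] here correspond to Shape<0>() / Shape<1>() on the C++ side.
```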
std::string const serialisation_header_ { u8"CONFIG-offset:" }; + protected: void ClearCaches() { this->prediction_container_ = PredictionContainer{}; } public: @@ -1264,6 +1265,28 @@ class LearnerImpl : public LearnerIO { return out_impl; } + void Reset() override { + this->Configure(); + this->CheckModelInitialized(); + // Global data + auto local_map = LearnerAPIThreadLocalStore::Get(); + if (local_map->find(this) != local_map->cend()) { + local_map->erase(this); + } + + // Model + std::string buf; + common::MemoryBufferStream fo(&buf); + this->Save(&fo); + + common::MemoryFixSizeBuffer fs(buf.data(), buf.size()); + this->Load(&fs); + + // Learner self cache. Prediction is cleared in the load method + CHECK(this->prediction_container_.Container().empty()); + this->gpair_ = decltype(this->gpair_){}; + } + void UpdateOneIter(int iter, std::shared_ptr train) override { monitor_.Start("UpdateOneIter"); TrainingObserver::Instance().Update(iter); diff --git a/src/objective/lambdarank_obj.cc b/src/objective/lambdarank_obj.cc index f1c7548877c9..4a47de9bd46f 100644 --- a/src/objective/lambdarank_obj.cc +++ b/src/objective/lambdarank_obj.cc @@ -1,5 +1,5 @@ /** - * Copyright (c) 2023, XGBoost contributors + * Copyright 2023-2024, XGBoost contributors */ #include "lambdarank_obj.h" @@ -23,7 +23,6 @@ #include "../common/optional_weight.h" // for MakeOptionalWeights, OptionalWeights #include "../common/ranking_utils.h" // for RankingCache, LambdaRankParam, MAPCache, NDCGC... #include "../common/threading_utils.h" // for ParallelFor, Sched -#include "../common/transform_iterator.h" // for IndexTransformIter #include "init_estimation.h" // for FitIntercept #include "xgboost/base.h" // for bst_group_t, GradientPair, kRtEps, GradientPai... #include "xgboost/context.h" // for Context diff --git a/src/objective/lambdarank_obj.cuh b/src/objective/lambdarank_obj.cuh index 2e5724f7f1fd..e1a78f905434 100644 --- a/src/objective/lambdarank_obj.cuh +++ b/src/objective/lambdarank_obj.cuh @@ -1,5 +1,5 @@ /** - * Copyright 2023 XGBoost contributors + * Copyright 2023-2024, XGBoost contributors */ #ifndef XGBOOST_OBJECTIVE_LAMBDARANK_OBJ_CUH_ #define XGBOOST_OBJECTIVE_LAMBDARANK_OBJ_CUH_ @@ -71,13 +71,13 @@ struct KernelInputs { std::int32_t iter; }; /** - * \brief Functor for generating pairs + * @brief Functor for generating pairs */ template struct MakePairsOp { KernelInputs args; /** - * \brief Make pair for the topk pair method. + * @brief Make pair for the topk pair method. */ [[nodiscard]] XGBOOST_DEVICE std::tuple WithTruncation( std::size_t idx, bst_group_t g) const { @@ -86,9 +86,6 @@ struct MakePairsOp { auto data_group_begin = static_cast(args.d_group_ptr[g]); std::size_t n_data = args.d_group_ptr[g + 1] - data_group_begin; - // obtain group segment data. 
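The new `LearnerImpl::Reset()` above drops cached state by saving the model into an in-memory buffer and immediately loading it back, after which the prediction cache and stored gradients are empty. A rough sketch of the same round trip through the public Python API, assuming a recent xgboost where `save_raw()` and `load_model()` operate on in-memory buffers:

```python
# Serialize a trained model to a buffer and reload it into a clean Booster,
# mirroring the save/load round trip that Reset() performs internally.
import numpy as np
import xgboost as xgb

rng = np.random.default_rng(0)
X = rng.normal(size=(64, 4))
y = (X[:, 0] > 0).astype(np.float32)
booster = xgb.train({"objective": "binary:logistic"},
                    xgb.DMatrix(X, label=y), num_boost_round=2)

raw = booster.save_raw(raw_format="json")  # in-memory serialization
fresh = xgb.Booster()
fresh.load_model(bytearray(raw))           # reload; caches start out empty
```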
- auto g_label = args.labels.Slice(linalg::Range(data_group_begin, data_group_begin + n_data), 0); - auto g_sorted_idx = args.d_sorted_idx.subspan(data_group_begin, n_data); std::size_t i = 0, j = 0; common::UnravelTrapeziodIdx(idx_in_thread_group, n_data, &i, &j); @@ -97,7 +94,7 @@ struct MakePairsOp { return std::make_tuple(rank_high, rank_low); } /** - * \brief Make pair for the mean pair method + * @brief Make pair for the mean pair method */ XGBOOST_DEVICE std::tuple WithSampling(std::size_t idx, bst_group_t g) const { diff --git a/src/tree/tree_model.cc b/src/tree/tree_model.cc index 040022c373a4..0639233510f7 100644 --- a/src/tree/tree_model.cc +++ b/src/tree/tree_model.cc @@ -303,9 +303,8 @@ class TextGenerator : public TreeGenerator { return result; } - std::string SplitNodeImpl( - RegTree const& tree, int32_t nid, std::string const& template_str, - std::string cond, uint32_t depth) const { + std::string SplitNodeImpl(RegTree const& tree, bst_node_t nid, std::string const& template_str, + std::string cond, uint32_t depth) const { auto split_index = tree[nid].SplitIndex(); std::string const result = SuperT::Match( template_str, @@ -345,18 +344,16 @@ class TextGenerator : public TreeGenerator { return SplitNodeImpl(tree, nid, kNodeTemplate, ToStr(cond), depth); } - std::string Categorical(RegTree const &tree, int32_t nid, - uint32_t depth) const override { + std::string Categorical(RegTree const& tree, bst_node_t nid, uint32_t depth) const override { auto cats = GetSplitCategories(tree, nid); std::string cats_str = PrintCatsAsSet(cats); static std::string const kNodeTemplate = "{tabs}{nid}:[{fname}:{cond}] yes={right},no={left},missing={missing}"; - std::string const result = - SplitNodeImpl(tree, nid, kNodeTemplate, cats_str, depth); + std::string const result = SplitNodeImpl(tree, nid, kNodeTemplate, cats_str, depth); return result; } - std::string NodeStat(RegTree const& tree, int32_t nid) const override { + std::string NodeStat(RegTree const& tree, bst_node_t nid) const override { static std::string const kStatTemplate = ",gain={loss_chg},cover={sum_hess}"; std::string const result = SuperT::Match( kStatTemplate, @@ -679,15 +676,12 @@ class GraphvizGenerator : public TreeGenerator { std::string result; if (this->with_stats_) { CHECK(!tree.IsMultiTarget()) << MTNotImplemented(); - result = SuperT::Match( - kNodeTemplate, {{"{nid}", std::to_string(nidx)}, - {"{fname}", GetFeatureName(fmap_, split_index)}, - {"{<}", has_less ? "<" : ""}, - {"{cond}", has_less ? ToStr(cond) : ""}, - {"{stat}", Match("\ncover={cover}\ngain={gain}", - {{"{cover}", std::to_string(tree.Stat(nidx).sum_hess)}, - {"{gain}", std::to_string(tree.Stat(nidx).loss_chg)}})}, - {"{params}", param_.condition_node_params}}); + result = SuperT::Match(kNodeTemplate, {{"{nid}", std::to_string(nidx)}, + {"{fname}", GetFeatureName(fmap_, split_index)}, + {"{<}", has_less ? "<" : ""}, + {"{cond}", has_less ? 
ToStr(cond) : ""}, + {"{stat}", this->NodeStat(tree, nidx)}, + {"{params}", param_.condition_node_params}}); } else { result = SuperT::Match(kNodeTemplate, {{"{nid}", std::to_string(nidx)}, {"{fname}", GetFeatureName(fmap_, split_index)}, @@ -703,9 +697,15 @@ class GraphvizGenerator : public TreeGenerator { return result; }; - std::string Categorical(RegTree const& tree, bst_node_t nidx, uint32_t) const override { + std::string NodeStat(RegTree const& tree, bst_node_t nidx) const override { + return Match("\ngain={gain}\ncover={cover}", + {{"{cover}", std::to_string(tree.Stat(nidx).sum_hess)}, + {"{gain}", std::to_string(tree.Stat(nidx).loss_chg)}}); + } + + std::string Categorical(RegTree const& tree, bst_node_t nidx, uint32_t /*depth*/) const override { static std::string const kLabelTemplate = - " {nid} [ label=\"{fname}:{cond}\" {params}]\n"; + " {nid} [ label=\"{fname}:{cond}{stat}\" {params}]\n"; auto cats = GetSplitCategories(tree, nidx); auto cats_str = PrintCatsAsSet(cats); auto split_index = tree.SplitIndex(nidx); @@ -714,6 +714,7 @@ class GraphvizGenerator : public TreeGenerator { SuperT::Match(kLabelTemplate, {{"{nid}", std::to_string(nidx)}, {"{fname}", GetFeatureName(fmap_, split_index)}, {"{cond}", cats_str}, + {"{stat}", this->NodeStat(tree, nidx)}, {"{params}", param_.condition_node_params}}); result += BuildEdge(tree, nidx, tree.LeftChild(nidx), true); diff --git a/tests/buildkite/build-containers.sh b/tests/buildkite/build-containers.sh deleted file mode 100755 index aa8f572483a3..000000000000 --- a/tests/buildkite/build-containers.sh +++ /dev/null @@ -1,47 +0,0 @@ -#!/bin/bash - -set -euo pipefail -set -x - -if [ "$#" -lt 1 ] -then - echo "Usage: $0 [container to build]" - exit 1 -fi -container=$1 - -source tests/buildkite/conftest.sh - -echo "--- Build container ${container}" - -BUILD_ARGS="" - -case "${container}" in - cpu) - ;; - - gpu|gpu_build_rockylinux8) - BUILD_ARGS="$BUILD_ARGS --build-arg CUDA_VERSION_ARG=$CUDA_VERSION" - BUILD_ARGS="$BUILD_ARGS --build-arg NCCL_VERSION_ARG=$NCCL_VERSION" - BUILD_ARGS="$BUILD_ARGS --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION" - ;; - - gpu_dev_ver) - BUILD_ARGS="$BUILD_ARGS --build-arg CUDA_VERSION_ARG=$CUDA_VERSION" - BUILD_ARGS="$BUILD_ARGS --build-arg NCCL_VERSION_ARG=$NCCL_VERSION" - BUILD_ARGS="$BUILD_ARGS --build-arg RAPIDS_VERSION_ARG=$DEV_RAPIDS_VERSION" - ;; - - jvm_gpu_build) - BUILD_ARGS="$BUILD_ARGS --build-arg CUDA_VERSION_ARG=$CUDA_VERSION" - BUILD_ARGS="$BUILD_ARGS --build-arg NCCL_VERSION_ARG=$NCCL_VERSION" - ;; - - *) - echo "Unrecognized container ID: ${container}" - exit 2 - ;; -esac - -# Run a no-op command. 
This will simply build the container and push it to the private registry -tests/ci_build/ci_build.sh ${container} ${BUILD_ARGS} bash diff --git a/tests/buildkite/build-cpu-arm64.sh b/tests/buildkite/build-cpu-arm64.sh deleted file mode 100755 index 8b3847ed58b9..000000000000 --- a/tests/buildkite/build-cpu-arm64.sh +++ /dev/null @@ -1,47 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -WHEEL_TAG=manylinux_2_28_aarch64 - -echo "--- Build CPU code targeting ARM64" - -source tests/buildkite/conftest.sh - -command_wrapper="tests/ci_build/ci_build.sh aarch64" - -echo "--- Build libxgboost from the source" -$command_wrapper tests/ci_build/build_via_cmake.sh --conda-env=aarch64_test \ - -DUSE_OPENMP=ON -DHIDE_CXX_SYMBOL=ON -echo "--- Run Google Test" -$command_wrapper bash -c "cd build && ctest --extra-verbose" - -echo "--- Build binary wheel" -$command_wrapper bash -c \ - "cd python-package && rm -rf dist/* && pip wheel --no-deps -v . --wheel-dir dist/" -$command_wrapper python tests/ci_build/rename_whl.py \ - --wheel-path python-package/dist/*.whl \ - --commit-hash ${BUILDKITE_COMMIT} \ - --platform-tag ${WHEEL_TAG} - -echo "--- Audit binary wheel to ensure it's compliant with ${WHEEL_TAG} standard" -$command_wrapper auditwheel repair --plat ${WHEEL_TAG} python-package/dist/*.whl -$command_wrapper python tests/ci_build/rename_whl.py \ - --wheel-path wheelhouse/*.whl \ - --commit-hash ${BUILDKITE_COMMIT} \ - --platform-tag ${WHEEL_TAG} -mv -v wheelhouse/*.whl python-package/dist/ -# Make sure that libgomp.so is vendored in the wheel -$command_wrapper bash -c \ - "unzip -l python-package/dist/*.whl | grep libgomp || exit -1" - -echo "--- Upload Python wheel" -buildkite-agent artifact upload "python-package/dist/*.whl" -if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] -then - aws s3 cp python-package/dist/*.whl s3://xgboost-nightly-builds/${BRANCH_NAME}/ \ - --acl public-read --no-progress -fi - -echo "--- Stash XGBoost CLI executable" -buildkite-agent artifact upload ./xgboost diff --git a/tests/buildkite/build-cpu.sh b/tests/buildkite/build-cpu.sh deleted file mode 100755 index 11679d644de1..000000000000 --- a/tests/buildkite/build-cpu.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -echo "--- Build CPU code" - -source tests/buildkite/conftest.sh - -command_wrapper="tests/ci_build/ci_build.sh cpu" - -$command_wrapper rm -fv dmlc-core/include/dmlc/build_config_default.h - # This step is not necessary, but here we include it, to ensure that - # DMLC_CORE_USE_CMAKE flag is correctly propagated. We want to make sure that we use - # the configured header build/dmlc/build_config.h instead of - # include/dmlc/build_config_default.h. 
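The deleted `build-containers.sh` above selects Docker `--build-arg` flags per container ID with a shell `case` statement. A minimal Python sketch of that same mapping, with version pins copied from the deleted `conftest.sh` further down; the helper is illustrative only and does not claim to reproduce the new `ops/` pipeline scripts:

```python
# Container-id -> docker --build-arg mapping, as expressed by the removed
# tests/buildkite/build-containers.sh case statement.
CUDA_VERSION, NCCL_VERSION = "12.4.1", "2.23.4-1"
RAPIDS_VERSION, DEV_RAPIDS_VERSION = "24.10", "24.12"

BUILD_ARGS = {
    "cpu": {},
    "gpu": {"CUDA_VERSION_ARG": CUDA_VERSION, "NCCL_VERSION_ARG": NCCL_VERSION,
            "RAPIDS_VERSION_ARG": RAPIDS_VERSION},
    "gpu_build_rockylinux8": {"CUDA_VERSION_ARG": CUDA_VERSION, "NCCL_VERSION_ARG": NCCL_VERSION,
                              "RAPIDS_VERSION_ARG": RAPIDS_VERSION},
    "gpu_dev_ver": {"CUDA_VERSION_ARG": CUDA_VERSION, "NCCL_VERSION_ARG": NCCL_VERSION,
                    "RAPIDS_VERSION_ARG": DEV_RAPIDS_VERSION},
    "jvm_gpu_build": {"CUDA_VERSION_ARG": CUDA_VERSION, "NCCL_VERSION_ARG": NCCL_VERSION},
}

def docker_build_flags(container: str) -> list[str]:
    """Flatten the per-container build arguments into docker CLI flags."""
    if container not in BUILD_ARGS:
        raise SystemExit(f"Unrecognized container ID: {container}")
    return [f"--build-arg {k}={v}" for k, v in BUILD_ARGS[container].items()]

print(docker_build_flags("jvm_gpu_build"))
```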
-echo "--- Build libxgboost from the source" -$command_wrapper tests/ci_build/build_via_cmake.sh -DCMAKE_PREFIX_PATH=/opt/grpc \ - -DPLUGIN_FEDERATED=ON -echo "--- Run Google Test" -$command_wrapper bash -c "cd build && ctest --extra-verbose" -echo "--- Stash XGBoost CLI executable" -buildkite-agent artifact upload ./xgboost - -# Sanitizer test -echo "--- Run Google Test with sanitizer enabled" -$command_wrapper tests/ci_build/build_via_cmake.sh -DUSE_SANITIZER=ON \ - -DENABLED_SANITIZERS="address;leak;undefined" -DCMAKE_BUILD_TYPE=Debug \ - -DSANITIZER_PATH=/usr/lib/x86_64-linux-gnu/ -CI_DOCKER_EXTRA_PARAMS_INIT="-e ASAN_SYMBOLIZER_PATH=/usr/bin/llvm-symbolizer "` - `"-e ASAN_OPTIONS=symbolize=1 "` - `"-e UBSAN_OPTIONS=print_stacktrace=1:log_path=ubsan_error.log "` - `"--cap-add SYS_PTRACE" \ - $command_wrapper bash -c "cd build && ctest --exclude-regex AllTestsInDMLCUnitTests "` - `"--extra-verbose" diff --git a/tests/buildkite/build-cuda-with-rmm.sh b/tests/buildkite/build-cuda-with-rmm.sh deleted file mode 100755 index 189c67cba449..000000000000 --- a/tests/buildkite/build-cuda-with-rmm.sh +++ /dev/null @@ -1,91 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -if [ "$#" -lt 1 ] -then - mode=stable - exit 1 -else - mode=$1 -fi - -WHEEL_TAG=manylinux_2_28_x86_64 - -source tests/buildkite/conftest.sh - - -case "${mode}" in - stable) - container_tag='gpu_build_rockylinux8' - rapids_version=$RAPIDS_VERSION - ;; - - dev) - container_tag='gpu_dev_ver' - rapids_version=$DEV_RAPIDS_VERSION - ;; - - *) - echo "Unrecognized mode ID: ${mode}" - exit 2 - ;; -esac - -echo "--- Build with CUDA ${CUDA_VERSION} with RMM" - -if [[ ($is_pull_request == 1) || ($is_release_branch == 0) ]] -then - arch_flag="-DGPU_COMPUTE_VER=75" -else - arch_flag="" -fi - -command_wrapper="tests/ci_build/ci_build.sh $container_tag --build-arg "` - `"CUDA_VERSION_ARG=$CUDA_VERSION --build-arg "` - `"NCCL_VERSION_ARG=$NCCL_VERSION --build-arg "` - `"RAPIDS_VERSION_ARG=$rapids_version" - -echo "--- Build libxgboost from the source" -$command_wrapper tests/ci_build/build_via_cmake.sh \ - -DCMAKE_PREFIX_PATH="/opt/grpc;/opt/rmm;/opt/rmm/lib64/rapids/cmake" \ - -DUSE_CUDA=ON \ - -DUSE_OPENMP=ON \ - -DHIDE_CXX_SYMBOLS=ON \ - -DPLUGIN_FEDERATED=ON \ - -DPLUGIN_RMM=ON \ - -DUSE_NCCL=ON \ - -DUSE_NCCL_LIB_PATH=ON \ - -DNCCL_INCLUDE_DIR=/usr/include \ - -DUSE_DLOPEN_NCCL=ON \ - ${arch_flag} -echo "--- Build binary wheel" -$command_wrapper bash -c \ - "cd python-package && rm -rf dist/* && pip wheel --no-deps -v . 
--wheel-dir dist/" -$command_wrapper python tests/ci_build/rename_whl.py \ - --wheel-path python-package/dist/*.whl \ - --commit-hash ${BUILDKITE_COMMIT} \ - --platform-tag ${WHEEL_TAG} - -echo "--- Audit binary wheel to ensure it's compliant with ${WHEEL_TAG} standard" -tests/ci_build/ci_build.sh manylinux_2_28_x86_64 auditwheel repair \ - --plat ${WHEEL_TAG} python-package/dist/*.whl -$command_wrapper python tests/ci_build/rename_whl.py \ - --wheel-path wheelhouse/*.whl \ - --commit-hash ${BUILDKITE_COMMIT} \ - --platform-tag ${WHEEL_TAG} -mv -v wheelhouse/*.whl python-package/dist/ -# Make sure that libgomp.so is vendored in the wheel -tests/ci_build/ci_build.sh manylinux_2_28_x86_64 bash -c \ - "unzip -l python-package/dist/*.whl | grep libgomp || exit -1" - -echo "--- Upload Python wheel" -buildkite-agent artifact upload python-package/dist/*.whl -if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] -then - aws s3 cp python-package/dist/*.whl s3://xgboost-nightly-builds/experimental_build_with_rmm/ \ - --acl public-read --no-progress -fi - -echo "-- Stash C++ test executable (testxgboost)" -buildkite-agent artifact upload build/testxgboost diff --git a/tests/buildkite/build-cuda.sh b/tests/buildkite/build-cuda.sh deleted file mode 100755 index 03d2cc8a6a24..000000000000 --- a/tests/buildkite/build-cuda.sh +++ /dev/null @@ -1,72 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -WHEEL_TAG=manylinux_2_28_x86_64 - -source tests/buildkite/conftest.sh - -echo "--- Build with CUDA ${CUDA_VERSION}" - -if [[ ($is_pull_request == 1) || ($is_release_branch == 0) ]] -then - arch_flag="-DGPU_COMPUTE_VER=75" -else - arch_flag="" -fi - -command_wrapper="tests/ci_build/ci_build.sh gpu_build_rockylinux8 --build-arg "` - `"CUDA_VERSION_ARG=$CUDA_VERSION --build-arg "` - `"NCCL_VERSION_ARG=$NCCL_VERSION --build-arg "` - `"RAPIDS_VERSION_ARG=$RAPIDS_VERSION" - -echo "--- Build libxgboost from the source" -$command_wrapper tests/ci_build/build_via_cmake.sh \ - -DCMAKE_PREFIX_PATH="/opt/grpc" \ - -DUSE_CUDA=ON \ - -DUSE_OPENMP=ON \ - -DHIDE_CXX_SYMBOLS=ON \ - -DPLUGIN_FEDERATED=ON \ - -DUSE_NCCL=ON \ - -DUSE_NCCL_LIB_PATH=ON \ - -DNCCL_INCLUDE_DIR=/usr/include \ - -DUSE_DLOPEN_NCCL=ON \ - ${arch_flag} -echo "--- Build binary wheel" -$command_wrapper bash -c \ - "cd python-package && rm -rf dist/* && pip wheel --no-deps -v . 
--wheel-dir dist/" -$command_wrapper python tests/ci_build/rename_whl.py \ - --wheel-path python-package/dist/*.whl \ - --commit-hash ${BUILDKITE_COMMIT} \ - --platform-tag ${WHEEL_TAG} - -echo "--- Audit binary wheel to ensure it's compliant with ${WHEEL_TAG} standard" -tests/ci_build/ci_build.sh manylinux_2_28_x86_64 auditwheel repair \ - --plat ${WHEEL_TAG} python-package/dist/*.whl -$command_wrapper python tests/ci_build/rename_whl.py \ - --wheel-path wheelhouse/*.whl \ - --commit-hash ${BUILDKITE_COMMIT} \ - --platform-tag ${WHEEL_TAG} -mv -v wheelhouse/*.whl python-package/dist/ -# Make sure that libgomp.so is vendored in the wheel -tests/ci_build/ci_build.sh manylinux_2_28_x86_64 bash -c \ - "unzip -l python-package/dist/*.whl | grep libgomp || exit -1" - -echo "--- Upload Python wheel" -buildkite-agent artifact upload python-package/dist/*.whl -if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] -then - aws s3 cp python-package/dist/*.whl s3://xgboost-nightly-builds/${BRANCH_NAME}/ \ - --acl public-read --no-progress - - # Generate the meta info which includes xgboost version and the commit info - $command_wrapper python tests/ci_build/format_wheel_meta.py \ - --wheel-path python-package/dist/*.whl \ - --commit-hash ${BUILDKITE_COMMIT} \ - --platform-tag ${WHEEL_TAG} \ - --meta-path python-package/dist/ - aws s3 cp python-package/dist/meta.json s3://xgboost-nightly-builds/${BRANCH_NAME}/ \ - --acl public-read --no-progress -fi -echo "-- Stash C++ test executable (testxgboost)" -buildkite-agent artifact upload build/testxgboost diff --git a/tests/buildkite/build-gpu-rpkg.sh b/tests/buildkite/build-gpu-rpkg.sh deleted file mode 100755 index 83bcd9eb9c7b..000000000000 --- a/tests/buildkite/build-gpu-rpkg.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -source tests/buildkite/conftest.sh - -echo "--- Build XGBoost R package with CUDA" - -tests/ci_build/ci_build.sh gpu_build_r_rockylinux8 \ - --build-arg CUDA_VERSION_ARG=${CUDA_VERSION} \ - --build-arg R_VERSION_ARG=${R_VERSION} \ - tests/ci_build/build_r_pkg_with_cuda.sh \ - ${BUILDKITE_COMMIT} - -if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] -then - echo "--- Upload R tarball" - aws s3 cp xgboost_r_gpu_linux_*.tar.gz s3://xgboost-nightly-builds/${BRANCH_NAME}/ \ - --acl public-read --no-progress -fi diff --git a/tests/buildkite/build-jvm-doc.sh b/tests/buildkite/build-jvm-doc.sh deleted file mode 100755 index d168eb8cc58d..000000000000 --- a/tests/buildkite/build-jvm-doc.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -source tests/buildkite/conftest.sh - -echo "--- Build JVM packages doc" -tests/ci_build/ci_build.sh jvm tests/ci_build/build_jvm_doc.sh ${BRANCH_NAME} -if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] -then - echo "--- Upload JVM packages doc" - aws s3 cp jvm-packages/${BRANCH_NAME}.tar.bz2 \ - s3://xgboost-docs/${BRANCH_NAME}.tar.bz2 --acl public-read --no-progress -fi diff --git a/tests/buildkite/build-jvm-linux-arm64-manylinux2014.sh b/tests/buildkite/build-jvm-linux-arm64-manylinux2014.sh deleted file mode 100644 index e7fec780b956..000000000000 --- a/tests/buildkite/build-jvm-linux-arm64-manylinux2014.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -source tests/buildkite/conftest.sh - -command_wrapper="tests/ci_build/ci_build.sh jvm_manylinux2014_aarch64" - -# Build XGBoost4J binary -echo "--- Build libxgboost4j.so (targeting glibc 2.17)" -set -x -mkdir build -$command_wrapper bash -c \ - "cd build && cmake .. 
-DJVM_BINDINGS=ON -DUSE_OPENMP=ON && make -j$(nproc)" -ldd lib/libxgboost4j.so -objdump -T lib/libxgboost4j.so | grep GLIBC_ | sed 's/.*GLIBC_\([.0-9]*\).*/\1/g' | sort -Vu - -echo "--- Upload libxgboost4j.so" -pushd lib -libname=libxgboost4j_linux_arm64_${BUILDKITE_COMMIT}.so -mv -v libxgboost4j.so ${libname} -buildkite-agent artifact upload ${libname} -if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] -then - aws s3 cp ${libname} \ - s3://xgboost-nightly-builds/${BRANCH_NAME}/libxgboost4j/ \ - --acl public-read --no-progress -fi -popd diff --git a/tests/buildkite/build-jvm-linux-x86_64-manylinux2014.sh b/tests/buildkite/build-jvm-linux-x86_64-manylinux2014.sh deleted file mode 100644 index 46a819a016d3..000000000000 --- a/tests/buildkite/build-jvm-linux-x86_64-manylinux2014.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -source tests/buildkite/conftest.sh - -command_wrapper="tests/ci_build/ci_build.sh jvm_manylinux2014_x86_64" - -# Build XGBoost4J binary -echo "--- Build libxgboost4j.so (targeting glibc 2.17)" -set -x -mkdir build -$command_wrapper bash -c \ - "cd build && cmake .. -GNinja -DJVM_BINDINGS=ON -DUSE_OPENMP=ON && ninja -v" -ldd lib/libxgboost4j.so -objdump -T lib/libxgboost4j.so | grep GLIBC_ | sed 's/.*GLIBC_\([.0-9]*\).*/\1/g' | sort -Vu - -echo "--- Upload libxgboost4j.so" -pushd lib -libname=libxgboost4j_linux_x86_64_${BUILDKITE_COMMIT}.so -mv -v libxgboost4j.so ${libname} -buildkite-agent artifact upload ${libname} -if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] -then - aws s3 cp ${libname} \ - s3://xgboost-nightly-builds/${BRANCH_NAME}/libxgboost4j/ \ - --acl public-read --no-progress -fi -popd diff --git a/tests/buildkite/build-jvm-packages-gpu.sh b/tests/buildkite/build-jvm-packages-gpu.sh deleted file mode 100755 index 76ffafbcfdd7..000000000000 --- a/tests/buildkite/build-jvm-packages-gpu.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -source tests/buildkite/conftest.sh - -echo "--- Build and test XGBoost JVM packages with CUDA" - -if [[ ($is_pull_request == 1) || ($is_release_branch == 0) ]] -then - arch_flag="-DGPU_COMPUTE_VER=75" -else - arch_flag="" -fi - -tests/ci_build/ci_build.sh jvm_gpu_build --use-gpus \ - --build-arg CUDA_VERSION_ARG=${CUDA_VERSION} \ - --build-arg NCCL_VERSION_ARG=${NCCL_VERSION} \ - tests/ci_build/build_jvm_packages.sh \ - ${SPARK_VERSION} -Duse.cuda=ON ${arch_flag} diff --git a/tests/buildkite/build-jvm-packages.sh b/tests/buildkite/build-jvm-packages.sh deleted file mode 100755 index da4d1e9d8c8a..000000000000 --- a/tests/buildkite/build-jvm-packages.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -source tests/buildkite/conftest.sh - -echo "--- Build and test XGBoost JVM packages with Scala 2.12" -tests/ci_build/ci_build.sh jvm tests/ci_build/build_jvm_packages.sh \ - ${SPARK_VERSION} - -echo "--- Build and test XGBoost JVM packages with Scala 2.13" - -tests/ci_build/ci_build.sh jvm tests/ci_build/build_jvm_packages.sh \ - ${SPARK_VERSION} "" "" "true" diff --git a/tests/buildkite/build-manylinux2014.sh b/tests/buildkite/build-manylinux2014.sh deleted file mode 100755 index 426d32b5c361..000000000000 --- a/tests/buildkite/build-manylinux2014.sh +++ /dev/null @@ -1,63 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -if [ $# -ne 1 ]; then - echo "Usage: $0 {x86_64,aarch64}" - exit 1 -fi - -arch=$1 - -source tests/buildkite/conftest.sh - -WHEEL_TAG="manylinux2014_${arch}" -command_wrapper="tests/ci_build/ci_build.sh ${WHEEL_TAG}" 
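Both deleted `build-jvm-linux-*-manylinux2014.sh` scripts above audit the freshly built `libxgboost4j.so` by piping `objdump -T` through `grep`/`sed`/`sort -Vu`, so the required GLIBC symbol versions can be checked against the glibc 2.17 target. A small Python sketch of the same audit; it assumes binutils (`objdump`) is installed and takes the library path on the command line:

```python
# List every GLIBC_x.y symbol version a shared library depends on, the same
# check the removed shell pipeline (objdump -T | grep | sed | sort -Vu) did.
import re
import subprocess
import sys

def glibc_versions(lib_path: str) -> list[str]:
    out = subprocess.run(["objdump", "-T", lib_path], check=True,
                         capture_output=True, text=True).stdout
    found = set(re.findall(r"GLIBC_([0-9.]+)", out))
    return sorted(found, key=lambda v: [int(p) for p in v.split(".") if p])

if __name__ == "__main__":
    # e.g. python check_glibc.py lib/libxgboost4j.so
    print("\n".join(glibc_versions(sys.argv[1])))
```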
-python_bin="/opt/python/cp310-cp310/bin/python" - -echo "--- Build binary wheel for ${WHEEL_TAG}" -# Patch to add warning about manylinux2014 variant -patch -p0 < tests/buildkite/remove_nccl_dep.patch -patch -p0 < tests/buildkite/manylinux2014_warning.patch -$command_wrapper bash -c \ - "cd python-package && ${python_bin} -m pip wheel --no-deps -v . --wheel-dir dist/" -git checkout python-package/pyproject.toml python-package/xgboost/core.py # discard the patch - -$command_wrapper auditwheel repair --plat ${WHEEL_TAG} python-package/dist/*.whl -$command_wrapper ${python_bin} tests/ci_build/rename_whl.py \ - --wheel-path wheelhouse/*.whl \ - --commit-hash ${BUILDKITE_COMMIT} \ - --platform-tag ${WHEEL_TAG} -rm -rf python-package/dist/ -mkdir python-package/dist/ -mv -v wheelhouse/*.whl python-package/dist/ - -echo "--- Build binary wheel for ${WHEEL_TAG} (CPU only)" -# Patch to rename pkg to xgboost-cpu -patch -p0 < tests/buildkite/remove_nccl_dep.patch -patch -p0 < tests/buildkite/cpu_only_pypkg.patch -$command_wrapper bash -c \ - "cd python-package && ${python_bin} -m pip wheel --no-deps -v . --wheel-dir dist/" -git checkout python-package/pyproject.toml # discard the patch - -$command_wrapper auditwheel repair --plat ${WHEEL_TAG} python-package/dist/xgboost_cpu-*.whl -$command_wrapper ${python_bin} tests/ci_build/rename_whl.py \ - --wheel-path wheelhouse/xgboost_cpu-*.whl \ - --commit-hash ${BUILDKITE_COMMIT} \ - --platform-tag ${WHEEL_TAG} -rm -v python-package/dist/xgboost_cpu-*.whl -mv -v wheelhouse/xgboost_cpu-*.whl python-package/dist/ - -echo "--- Upload Python wheel" -for wheel in python-package/dist/*.whl -do - buildkite-agent artifact upload "${wheel}" -done -if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] -then - for wheel in python-package/dist/*.whl - do - aws s3 cp "${wheel}" s3://xgboost-nightly-builds/${BRANCH_NAME}/ \ - --acl public-read --no-progress - done -fi diff --git a/tests/buildkite/build-win64-gpu.ps1 b/tests/buildkite/build-win64-gpu.ps1 deleted file mode 100644 index 9114d3237751..000000000000 --- a/tests/buildkite/build-win64-gpu.ps1 +++ /dev/null @@ -1,55 +0,0 @@ -$ErrorActionPreference = "Stop" - -. tests/buildkite/conftest.ps1 - -Write-Host "--- Build libxgboost on Windows with CUDA" - -nvcc --version -if ( $is_release_branch -eq 0 ) { - $arch_flag = "-DGPU_COMPUTE_VER=75" -} else { - $arch_flag = "" -} -mkdir build -cd build -cmake .. -G"Visual Studio 17 2022" -A x64 -DUSE_CUDA=ON ` - -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DBUILD_DEPRECATED_CLI=ON ${arch_flag} -if ($LASTEXITCODE -ne 0) { throw "Last command failed" } -cmake --build . --config Release -- /m /nodeReuse:false ` - "/consoleloggerparameters:ShowCommandLine;Verbosity=minimal" -if ($LASTEXITCODE -ne 0) { throw "Last command failed" } - -Write-Host "--- Build binary wheel" -cd ../python-package -conda activate -& pip install --user -v "pip>=23" -& pip --version -& pip wheel --no-deps -v . --wheel-dir dist/ -Get-ChildItem . -Filter dist/*.whl | -Foreach-Object { - & python ../tests/ci_build/rename_whl.py ` - --wheel-path $_.FullName ` - --commit-hash $Env:BUILDKITE_COMMIT ` - --platform-tag win_amd64 - if ($LASTEXITCODE -ne 0) { throw "Last command failed" } -} - -Write-Host "--- Upload Python wheel" -cd .. -Get-ChildItem . -Filter python-package/dist/*.whl | -Foreach-Object { - & buildkite-agent artifact upload python-package/dist/$_ - if ($LASTEXITCODE -ne 0) { throw "Last command failed" } -} -if ( $is_release_branch -eq 1 ) { - Get-ChildItem . 
-Filter python-package/dist/*.whl | - Foreach-Object { - & aws s3 cp python-package/dist/$_ s3://xgboost-nightly-builds/$Env:BUILDKITE_BRANCH/ ` - --acl public-read --no-progress - if ($LASTEXITCODE -ne 0) { throw "Last command failed" } - } -} - -Write-Host "--- Stash C++ test executables" -& buildkite-agent artifact upload build/testxgboost.exe -& buildkite-agent artifact upload xgboost.exe diff --git a/tests/buildkite/conftest.ps1 b/tests/buildkite/conftest.ps1 deleted file mode 100644 index bd623caf0c03..000000000000 --- a/tests/buildkite/conftest.ps1 +++ /dev/null @@ -1,13 +0,0 @@ -if ( $Env:BUILDKITE_PULL_REQUEST -and ($Env:BUILDKITE_PULL_REQUEST -ne "false") ) { - $is_pull_request = 1 -} else { - $is_pull_request = 0 -} - -if ( ($Env:BUILDKITE_BRANCH -eq "master") -or ($Env:BUILDKITE_BRANCH -match "release_.+") ) { - $is_release_branch = 1 - $enforce_daily_budget = 0 -} else { - $is_release_branch = 0 - $enforce_daily_budget = 1 -} diff --git a/tests/buildkite/conftest.sh b/tests/buildkite/conftest.sh deleted file mode 100755 index 12f4c07ac6c9..000000000000 --- a/tests/buildkite/conftest.sh +++ /dev/null @@ -1,64 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -function get_aws_secret { - if [[ $# -ne 1 ]] - then - echo "Usage: get_aws_secret [Name of secret]" - return 1 - fi - aws secretsmanager get-secret-value --secret-id $1 --output text --region us-west-2 --query SecretString -} - -function set_buildkite_env_vars_in_container { - # Pass all Buildkite-specific env vars to Docker containers. - # This is to be used with tests/ci_build/ci_build.sh - export CI_DOCKER_EXTRA_PARAMS_INIT="${CI_DOCKER_EXTRA_PARAMS_INIT:-} "` - `"--env BUILDKITE_ANALYTICS_TOKEN --env BUILDKITE_BUILD_ID --env BUILDKITE_BUILD_NUMBER "` - `"--env BUILDKITE_JOB_ID --env BUILDKITE_BRANCH --env BUILDKITE_COMMIT "` - `"--env BUILDKITE_MESSAGE --env BUILDKITE_BUILD_URL" -} - -set -x - -CUDA_VERSION=12.4.1 -NCCL_VERSION=2.23.4-1 -RAPIDS_VERSION=24.10 -DEV_RAPIDS_VERSION=24.12 -SPARK_VERSION=3.5.1 -JDK_VERSION=8 -R_VERSION=4.3.2 - -if [[ -z ${BUILDKITE:-} ]] -then - echo "$0 is not meant to run locally; it should run inside BuildKite." - echo "Please inspect the content of $0 and locate the desired command manually." 
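The deleted `conftest.ps1` above derives two flags from BuildKite environment variables: whether the build comes from a pull request, and whether it runs on a trunk/release branch, which in turn decides if the daily CI budget check applies. A minimal Python sketch of that classification (a hypothetical helper, not a file from the repository):

```python
# Mirror of the PR / release-branch classification in the removed conftest.ps1.
import os
import re

def classify(env=None):
    env = os.environ if env is None else env
    pr = env.get("BUILDKITE_PULL_REQUEST", "false")
    branch = env.get("BUILDKITE_BRANCH", "")
    is_pull_request = pr not in ("", "false")
    is_release_branch = branch == "master" or re.match(r"release_.+", branch) is not None
    return {
        "is_pull_request": is_pull_request,
        "is_release_branch": is_release_branch,
        "enforce_daily_budget": not is_release_branch,
    }

print(classify({"BUILDKITE_PULL_REQUEST": "false", "BUILDKITE_BRANCH": "release_2.1.0"}))
```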
- exit 1 -fi - -if [[ -n $BUILDKITE_PULL_REQUEST && $BUILDKITE_PULL_REQUEST != "false" ]] -then - is_pull_request=1 - BRANCH_NAME=PR-$BUILDKITE_PULL_REQUEST -else - is_pull_request=0 - BRANCH_NAME=$BUILDKITE_BRANCH -fi -export BRANCH_NAME=${BRANCH_NAME//\//-} - -if [[ $BRANCH_NAME == "master" || $BRANCH_NAME == "release_"* || $BRANCH_NAME == "federated-secure" ]] -then - is_release_branch=1 - enforce_daily_budget=0 -else - is_release_branch=0 - enforce_daily_budget=1 -fi - -if [[ -n ${DISABLE_RELEASE:-} ]] -then - is_release_branch=0 -fi - -set +x diff --git a/tests/buildkite/deploy-jvm-packages.sh b/tests/buildkite/deploy-jvm-packages.sh deleted file mode 100755 index 812a6c5cafec..000000000000 --- a/tests/buildkite/deploy-jvm-packages.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -source tests/buildkite/conftest.sh - -if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] -then - echo "--- Deploy JVM packages to xgboost-maven-repo S3 repo" - tests/ci_build/ci_build.sh jvm_gpu_build \ - --build-arg CUDA_VERSION_ARG=${CUDA_VERSION} \ - --build-arg NCCL_VERSION_ARG=${NCCL_VERSION} \ - tests/ci_build/deploy_jvm_packages.sh ${SPARK_VERSION} -fi diff --git a/tests/buildkite/enforce_daily_budget.py b/tests/buildkite/enforce_daily_budget.py deleted file mode 100644 index af1b1ce484b8..000000000000 --- a/tests/buildkite/enforce_daily_budget.py +++ /dev/null @@ -1,14 +0,0 @@ -import json -import argparse - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--response", type=str, required=True) - args = parser.parse_args() - with open(args.response, "r") as f: - payload = f.read() - response = json.loads(payload) - if response["approved"]: - print(f"Testing approved. Reason: {response['reason']}") - else: - raise RuntimeError(f"Testing rejected. Reason: {response['reason']}") diff --git a/tests/buildkite/enforce_daily_budget.sh b/tests/buildkite/enforce_daily_budget.sh deleted file mode 100755 index 8212f07c1b24..000000000000 --- a/tests/buildkite/enforce_daily_budget.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -echo "--- Enforce daily budget" - -source tests/buildkite/conftest.sh - -if [[ $enforce_daily_budget == 0 ]] -then - echo "Automatically approving all test jobs for trunk branches" -else - aws lambda invoke --function-name XGBoostCICostWatcher --invocation-type RequestResponse --region us-west-2 response.json - python3 tests/buildkite/enforce_daily_budget.py --response response.json -fi diff --git a/tests/buildkite/infrastructure/README.md b/tests/buildkite/infrastructure/README.md deleted file mode 100644 index cc3e552e70ff..000000000000 --- a/tests/buildkite/infrastructure/README.md +++ /dev/null @@ -1,106 +0,0 @@ -BuildKite CI Infrastructure -=========================== - -# Worker image builder (`worker-image-pipeline/`) - -Use EC2 Image Builder to build machine images in a deterministic fashion. -The machine images are used to initialize workers in the CI/CD pipelines. - -## Editing bootstrap scripts - -Currently, we create two pipelines for machine images: one for Linux workers and another -for Windows workers. -You can edit the bootstrap scripts to change how the worker machines are initialized. 
- -* `linux-amd64-gpu-bootstrap.yml`: Bootstrap script for Linux worker machines -* `windows-gpu-bootstrap.yml`: Bootstrap script for Windows worker machines - -## Creating and running Image Builder pipelines - -Run the following commands to create and run pipelines in EC2 Image Builder service: -```bash -python worker-image-pipeline/create_worker_image_pipelines.py --aws-region us-west-2 -python worker-image-pipeline/run_pipelines.py --aws-region us-west-2 -``` -Go to the AWS CloudFormation console and verify the existence of two CloudFormation stacks: -* `buildkite-windows-gpu-worker` -* `buildkite-linux-amd64-gpu-worker` - -Then go to the EC2 Image Builder console to check the status of the image builds. You may -want to inspect the log output should a build fails. -Once the new machine images are done building, see the next section to deploy the new -images to the worker machines. - -# Elastic CI Stack for AWS (`aws-stack-creator/`) - -Use EC2 Autoscaling groups to launch worker machines in EC2. BuildKite periodically sends -messages to the Autoscaling groups to increase or decrease the number of workers according -to the number of outstanding testing jobs. - -## Deploy an updated CI stack with new machine images - -First, edit `aws-stack-creator/metadata.py` to update the `AMI_ID` fields: -```python -AMI_ID = { - # Managed by XGBoost team - "linux-amd64-gpu": { - "us-west-2": "...", - }, - "linux-amd64-mgpu": { - "us-west-2": "...", - }, - "windows-gpu": { - "us-west-2": "...", - }, - "windows-cpu": { - "us-west-2": "...", - }, - # Managed by BuildKite - # from https://s3.amazonaws.com/buildkite-aws-stack/latest/aws-stack.yml - "linux-amd64-cpu": { - "us-west-2": "...", - }, - "pipeline-loader": { - "us-west-2": "...", - }, - "linux-arm64-cpu": { - "us-west-2": "...", - }, -} -``` -AMI IDs uniquely identify the machine images in the EC2 service. 
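Since the deployment procedure hinges on these IDs being pasted in by hand, a purely illustrative pre-flight check (a hypothetical helper, not part of the repository) could assert that every worker pool has a plausible-looking AMI for the target region before `create_stack.py` is run:

```python
# Hypothetical pre-deploy sanity check for the AMI_ID mapping edited above:
# every worker pool must have an entry for the target region and the value
# must look like an EC2 AMI ID.
import re

AMI_ID = {
    "linux-amd64-gpu": {"us-west-2": "ami-0123456789abcdef0"},  # placeholder values
    "linux-amd64-cpu": {"us-west-2": "ami-0fedcba9876543210"},
    # ... remaining pools elided
}

def check_ami_ids(mapping: dict, region: str) -> None:
    pattern = re.compile(r"^ami-[0-9a-f]{8,17}$")
    for pool, per_region in mapping.items():
        ami = per_region.get(region)
        if not ami or not pattern.match(ami):
            raise ValueError(f"{pool}: missing or malformed AMI ID for {region}")

check_ami_ids(AMI_ID, "us-west-2")
```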
-Go to the EC2 Image Builder console to find the AMI IDs for the new machine images -(see the previous section), and update the following fields: - -* `AMI_ID["linux-amd64-gpu"]["us-west-2"]`: - Use the latest output from the `buildkite-linux-amd64-gpu-worker` pipeline -* `AMI_ID["linux-amd64-mgpu"]["us-west-2"]`: - Should be identical to `AMI_ID["linux-amd64-gpu"]["us-west-2"]` -* `AMI_ID["windows-gpu"]["us-west-2"]`: - Use the latest output from the `buildkite-windows-gpu-worker` pipeline -* `AMI_ID["windows-cpu"]["us-west-2"]`: - Should be identical to `AMI_ID["windows-gpu"]["us-west-2"]` - -Next, visit https://s3.amazonaws.com/buildkite-aws-stack/latest/aws-stack.yml -to look up the AMI IDs for the following fields: - -* `AMI_ID["linux-amd64-cpu"]["us-west-2"]`: Copy and paste the AMI ID from the field - `Mappings/AWSRegion2AMI/us-west-2/linuxamd64` -* `AMI_ID["pipeline-loader"]["us-west-2"]`: - Should be identical to `AMI_ID["linux-amd64-cpu"]["us-west-2"]` -* `AMI_ID["linux-arm64-cpu"]["us-west-2"]`: Copy and paste the AMI ID from the field - `Mappings/AWSRegion2AMI/us-west-2/linuxarm64` - -Finally, run the following commands to deploy the new machine images: -``` -python aws-stack-creator/create_stack.py --aws-region us-west-2 --agent-token AGENT_TOKEN -``` -Go to the AWS CloudFormation console and verify the existence of the following -CloudFormation stacks: -* `buildkite-pipeline-loader-autoscaling-group` -* `buildkite-linux-amd64-cpu-autoscaling-group` -* `buildkite-linux-amd64-gpu-autoscaling-group` -* `buildkite-linux-amd64-mgpu-autoscaling-group` -* `buildkite-linux-arm64-cpu-autoscaling-group` -* `buildkite-windows-cpu-autoscaling-group` -* `buildkite-windows-gpu-autoscaling-group` diff --git a/tests/buildkite/infrastructure/aws-stack-creator/agent-iam-policy-template.yml b/tests/buildkite/infrastructure/aws-stack-creator/agent-iam-policy-template.yml deleted file mode 100644 index 7f15b1fbcd4f..000000000000 --- a/tests/buildkite/infrastructure/aws-stack-creator/agent-iam-policy-template.yml +++ /dev/null @@ -1,32 +0,0 @@ ---- -AWSTemplateFormatVersion: "2010-09-09" -Description: "Buildkite agent's IAM policy" - -Resources: - BuildkiteAgentManagedPolicy: - Type: AWS::IAM::ManagedPolicy - Properties: - PolicyDocument: - { - "Version": "2012-10-17", - "Statement": [ - { - "Effect": "Allow", - "Action": [ - "s3:*", - "s3-object-lambda:*" - ], - "Resource": "*" - }, - { - "Effect": "Allow", - "Action": "lambda:InvokeFunction", - "Resource": "*" - }, - { - "Effect": "Allow", - "Action": "secretsmanager:GetSecretValue", - "Resource": "*" - } - ] - } diff --git a/tests/buildkite/infrastructure/aws-stack-creator/create_stack.py b/tests/buildkite/infrastructure/aws-stack-creator/create_stack.py deleted file mode 100644 index 8f8db348a073..000000000000 --- a/tests/buildkite/infrastructure/aws-stack-creator/create_stack.py +++ /dev/null @@ -1,127 +0,0 @@ -import argparse -import copy -import os -import re -import sys - -import boto3 -import botocore -from metadata import AMI_ID, COMMON_STACK_PARAMS, STACK_PARAMS - -current_dir = os.path.dirname(__file__) -sys.path.append(os.path.join(current_dir, "..")) - -from common_blocks.utils import create_or_update_stack, wait - -TEMPLATE_URL = "https://s3.amazonaws.com/buildkite-aws-stack/latest/aws-stack.yml" - - -def get_availability_zones(*, aws_region): - client = boto3.client("ec2", region_name=aws_region) - r = client.describe_availability_zones( - Filters=[ - {"Name": "region-name", "Values": [aws_region]}, - {"Name": "zone-type", 
"Values": ["availability-zone"]}, - ] - ) - return sorted([x["ZoneName"] for x in r["AvailabilityZones"]]) - - -def get_default_vpc(*, aws_region): - ec2 = boto3.resource("ec2", region_name=aws_region) - default_vpc_id = None - for x in ec2.vpcs.filter(Filters=[{"Name": "is-default", "Values": ["true"]}]): - return x - - # Create default VPC if not exist - client = boto3.client("ec2", region_name=aws_region) - r = client.create_default_vpc() - default_vpc_id = r["Vpc"]["VpcId"] - - return ec2.Vpc(default_vpc_id) - - -def format_params(args, *, stack_id, agent_iam_policy): - default_vpc = get_default_vpc(aws_region=args.aws_region) - azs = get_availability_zones(aws_region=args.aws_region) - # For each of the first two availability zones (AZs), choose the default subnet - subnets = [ - x.id - for x in default_vpc.subnets.filter( - Filters=[ - {"Name": "default-for-az", "Values": ["true"]}, - {"Name": "availability-zone", "Values": azs[:2]}, - ] - ) - ] - assert len(subnets) == 2 - - params = copy.deepcopy(STACK_PARAMS[stack_id]) - params["ImageId"] = AMI_ID[stack_id][args.aws_region] - params["BuildkiteQueue"] = stack_id - params["CostAllocationTagValue"] = f"buildkite-{stack_id}" - params["BuildkiteAgentToken"] = args.agent_token - params["VpcId"] = default_vpc.id - params["Subnets"] = ",".join(subnets) - params["ManagedPolicyARNs"] = agent_iam_policy - params.update(COMMON_STACK_PARAMS) - return [{"ParameterKey": k, "ParameterValue": v} for k, v in params.items()] - - -def get_full_stack_id(stack_id): - return f"buildkite-{stack_id}-autoscaling-group" - - -def create_agent_iam_policy(args, *, client): - policy_stack_name = "buildkite-agent-iam-policy" - print(f"Creating stack {policy_stack_name} for agent IAM policy...") - with open( - os.path.join(current_dir, "agent-iam-policy-template.yml"), - encoding="utf-8", - ) as f: - policy_template = f.read() - promise = create_or_update_stack( - args, client=client, stack_name=policy_stack_name, template_body=policy_template - ) - wait(promise, client=client) - - cf = boto3.resource("cloudformation", region_name=args.aws_region) - policy = cf.StackResource(policy_stack_name, "BuildkiteAgentManagedPolicy") - return policy.physical_resource_id - - -def main(args): - client = boto3.client("cloudformation", region_name=args.aws_region) - - agent_iam_policy = create_agent_iam_policy(args, client=client) - - promises = [] - - for stack_id in AMI_ID: - stack_id_full = get_full_stack_id(stack_id) - print(f"Creating elastic CI stack {stack_id_full}...") - - params = format_params( - args, stack_id=stack_id, agent_iam_policy=agent_iam_policy - ) - - promise = create_or_update_stack( - args, - client=client, - stack_name=stack_id_full, - template_url=TEMPLATE_URL, - params=params, - ) - promises.append(promise) - print(f"CI stack {stack_id_full} is in progress in the background") - - for promise in promises: - wait(promise, client=client) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--aws-region", type=str, required=True) - parser.add_argument("--agent-token", type=str, required=True) - args = parser.parse_args() - main(args) diff --git a/tests/buildkite/infrastructure/aws-stack-creator/metadata.py b/tests/buildkite/infrastructure/aws-stack-creator/metadata.py deleted file mode 100644 index 5012aa738854..000000000000 --- a/tests/buildkite/infrastructure/aws-stack-creator/metadata.py +++ /dev/null @@ -1,114 +0,0 @@ -AMI_ID = { - # Managed by XGBoost team - "linux-amd64-gpu": { - "us-west-2": 
"ami-0b4079c15bbbd0faf", - }, - "linux-amd64-mgpu": { - "us-west-2": "ami-0b4079c15bbbd0faf", - }, - "windows-gpu": { - "us-west-2": "ami-0123456bcf4cdfb82", - }, - "windows-cpu": { - "us-west-2": "ami-0123456bcf4cdfb82", - }, - # Managed by BuildKite - # from https://s3.amazonaws.com/buildkite-aws-stack/latest/aws-stack.yml - "linux-amd64-cpu": { - "us-west-2": "ami-0083e0ae73c175ec6", - }, - "pipeline-loader": { - "us-west-2": "ami-0083e0ae73c175ec6", - }, - "linux-arm64-cpu": { - "us-west-2": "ami-0dbf1f9da54222f21", - }, -} - -STACK_PARAMS = { - "linux-amd64-gpu": { - "InstanceOperatingSystem": "linux", - "InstanceTypes": "g4dn.xlarge", - "AgentsPerInstance": "1", - "MinSize": "0", - "MaxSize": "8", - "OnDemandPercentage": "100", - "ScaleOutFactor": "1.0", - "ScaleInIdlePeriod": "60", # in seconds - }, - "linux-amd64-mgpu": { - "InstanceOperatingSystem": "linux", - "InstanceTypes": "g4dn.12xlarge", - "AgentsPerInstance": "1", - "MinSize": "0", - "MaxSize": "1", - "OnDemandPercentage": "100", - "ScaleOutFactor": "1.0", - "ScaleInIdlePeriod": "60", # in seconds - }, - "windows-gpu": { - "InstanceOperatingSystem": "windows", - "InstanceTypes": "g4dn.2xlarge", - "AgentsPerInstance": "1", - "MinSize": "0", - "MaxSize": "2", - "OnDemandPercentage": "100", - "ScaleOutFactor": "1.0", - "ScaleInIdlePeriod": "60", # in seconds - }, - "windows-cpu": { - "InstanceOperatingSystem": "windows", - "InstanceTypes": "c5a.2xlarge", - "AgentsPerInstance": "1", - "MinSize": "0", - "MaxSize": "2", - "OnDemandPercentage": "100", - "ScaleOutFactor": "1.0", - "ScaleInIdlePeriod": "60", # in seconds - }, - "linux-amd64-cpu": { - "InstanceOperatingSystem": "linux", - "InstanceTypes": "c5a.4xlarge", - "AgentsPerInstance": "1", - "MinSize": "0", - "MaxSize": "16", - "OnDemandPercentage": "100", - "ScaleOutFactor": "1.0", - "ScaleInIdlePeriod": "60", # in seconds - }, - "pipeline-loader": { - "InstanceOperatingSystem": "linux", - "InstanceTypes": "t3a.micro", - "AgentsPerInstance": "1", - "MinSize": "2", - "MaxSize": "2", - "OnDemandPercentage": "100", - "ScaleOutFactor": "1.0", - "ScaleInIdlePeriod": "60", # in seconds - }, - "linux-arm64-cpu": { - "InstanceOperatingSystem": "linux", - "InstanceTypes": "c6g.4xlarge", - "AgentsPerInstance": "1", - "MinSize": "0", - "MaxSize": "8", - "OnDemandPercentage": "100", - "ScaleOutFactor": "1.0", - "ScaleInIdlePeriod": "60", # in seconds - }, -} - -COMMON_STACK_PARAMS = { - "BuildkiteAgentTimestampLines": "false", - "BuildkiteWindowsAdministrator": "true", - "AssociatePublicIpAddress": "true", - "ScaleOutForWaitingJobs": "false", - "EnableCostAllocationTags": "true", - "CostAllocationTagName": "CreatedBy", - "ECRAccessPolicy": "full", - "EnableSecretsPlugin": "false", - "EnableECRPlugin": "false", - "EnableDockerLoginPlugin": "false", - "EnableDockerUserNamespaceRemap": "false", - "BuildkiteAgentExperiments": "normalised-upload-paths,resolve-commit-after-checkout", -} diff --git a/tests/buildkite/infrastructure/common_blocks/utils.py b/tests/buildkite/infrastructure/common_blocks/utils.py deleted file mode 100644 index 27a0835e8dc0..000000000000 --- a/tests/buildkite/infrastructure/common_blocks/utils.py +++ /dev/null @@ -1,97 +0,0 @@ -import re - -import boto3 -import botocore - - -def stack_exists(args, *, stack_name): - client = boto3.client("cloudformation", region_name=args.aws_region) - waiter = client.get_waiter("stack_exists") - try: - waiter.wait(StackName=stack_name, WaiterConfig={"MaxAttempts": 1}) - return True - except botocore.exceptions.WaiterError as e: - 
return False - - -def create_or_update_stack( - args, *, client, stack_name, template_url=None, template_body=None, params=None -): - kwargs = { - "StackName": stack_name, - "Capabilities": [ - "CAPABILITY_IAM", - "CAPABILITY_NAMED_IAM", - "CAPABILITY_AUTO_EXPAND", - ], - } - if template_url: - kwargs["TemplateURL"] = template_url - if template_body: - kwargs["TemplateBody"] = template_body - if params: - kwargs["Parameters"] = params - - if stack_exists(args, stack_name=stack_name): - print(f"Stack {stack_name} already exists. Updating...") - try: - response = client.update_stack(**kwargs) - return {"StackName": stack_name, "Action": "update"} - except botocore.exceptions.ClientError as e: - if e.response["Error"]["Code"] == "ValidationError" and re.search( - "No updates are to be performed", e.response["Error"]["Message"] - ): - print(f"No update was made to {stack_name}") - return {"StackName": stack_name, "Action": "noop"} - else: - raise e - else: - kwargs.update({"OnFailure": "ROLLBACK", "EnableTerminationProtection": False}) - response = client.create_stack(**kwargs) - return {"StackName": stack_name, "Action": "create"} - - -def replace_stack( - args, *, client, stack_name, template_url=None, template_body=None, params=None -): - """Delete an existing stack and create a new stack with identical name""" - - if not stack_exists(args, stack_name=stack_name): - raise ValueError(f"Stack {stack_name} does not exist") - r = client.delete_stack(StackName=stack_name) - delete_waiter = client.get_waiter("stack_delete_complete") - delete_waiter.wait(StackName=stack_name) - - kwargs = { - "StackName": stack_name, - "Capabilities": [ - "CAPABILITY_IAM", - "CAPABILITY_NAMED_IAM", - "CAPABILITY_AUTO_EXPAND", - ], - "OnFailure": "ROLLBACK", - "EnableTerminationProtection": False, - } - if template_url: - kwargs["TemplateURL"] = template_url - if template_body: - kwargs["TemplateBody"] = template_body - if params: - kwargs["Parameters"] = params - response = client.create_stack(**kwargs) - return {"StackName": stack_name, "Action": "create"} - - -def wait(promise, *, client): - stack_name = promise["StackName"] - print(f"Waiting for {stack_name}...") - if promise["Action"] == "create": - waiter = client.get_waiter("stack_create_complete") - waiter.wait(StackName=stack_name) - print(f"Finished creating stack {stack_name}") - elif promise["Action"] == "update": - waiter = client.get_waiter("stack_update_complete") - waiter.wait(StackName=stack_name) - print(f"Finished updating stack {stack_name}") - elif promise["Action"] != "noop": - raise ValueError(f"Invalid promise {promise}") diff --git a/tests/buildkite/infrastructure/requirements.txt b/tests/buildkite/infrastructure/requirements.txt deleted file mode 100644 index 3ce271ebbdd6..000000000000 --- a/tests/buildkite/infrastructure/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -boto3 -cfn_tools diff --git a/tests/buildkite/infrastructure/service-user/create_service_user.py b/tests/buildkite/infrastructure/service-user/create_service_user.py deleted file mode 100644 index ba08779bd159..000000000000 --- a/tests/buildkite/infrastructure/service-user/create_service_user.py +++ /dev/null @@ -1,44 +0,0 @@ -import argparse -import os - -import boto3 - -current_dir = os.path.dirname(__file__) - - -def main(args): - with open( - os.path.join(current_dir, "service-user-template.yml"), encoding="utf-8" - ) as f: - service_user_template = f.read() - - stack_id = "buildkite-elastic-ci-stack-service-user" - - print("Create a new IAM user with suitable 
permissions...") - client = boto3.client("cloudformation", region_name=args.aws_region) - response = client.create_stack( - StackName=stack_id, - TemplateBody=service_user_template, - Capabilities=[ - "CAPABILITY_IAM", - "CAPABILITY_NAMED_IAM", - ], - Parameters=[{"ParameterKey": "UserName", "ParameterValue": args.user_name}], - ) - waiter = client.get_waiter("stack_create_complete") - waiter.wait(StackName=stack_id) - user = boto3.resource("iam", region_name=args.aws_region).User(args.user_name) - key_pair = user.create_access_key_pair() - print("Finished creating an IAM users with suitable permissions.") - print(f"Access Key ID: {key_pair.access_key_id}") - print(f"Access Secret Access Key: {key_pair.secret_access_key}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--aws-region", type=str, required=True) - parser.add_argument( - "--user-name", type=str, default="buildkite-elastic-ci-stack-user" - ) - args = parser.parse_args() - main(args) diff --git a/tests/buildkite/infrastructure/service-user/service-user-template.yml b/tests/buildkite/infrastructure/service-user/service-user-template.yml deleted file mode 100644 index 2077cfe7b148..000000000000 --- a/tests/buildkite/infrastructure/service-user/service-user-template.yml +++ /dev/null @@ -1,349 +0,0 @@ ---- -AWSTemplateFormatVersion: "2010-09-09" -Description: "Buildkite Elastic CI Stack CloudFormation service user" - -Parameters: - UserName: - Type: String - Default: buildkite-elastic-ci-stack-user - Description: Name of user to create - -Outputs: - UserNameOutput: - Value: !Ref CloudFormationServiceUser - UserArnOutput: - Value: !GetAtt CloudFormationServiceUser.Arn - -Resources: - CloudFormationServiceUser: - Type: AWS::IAM::User - Properties: - ManagedPolicyArns: - - !Ref SubstackCrudPolicy - - !Ref CrudPolicy - - !Ref ImageBuilderPolicy - UserName: !Ref UserName - - SubstackCrudPolicy: - Type: AWS::IAM::ManagedPolicy - Properties: - PolicyDocument: - { - "Version": "2012-10-17", - "Statement": [ - { - "Effect": "Allow", - "Action": "cloudformation:*", - "Resource": "*" - }, - { - "Effect": "Allow", - "Action": [ - "serverlessrepo:GetApplication", - "serverlessrepo:GetCloudFormationTemplate", - "serverlessrepo:CreateCloudFormationTemplate" - ], - "Resource": "*" - } - ] - } - - CrudPolicy: - Type: AWS::IAM::ManagedPolicy - Properties: - PolicyDocument: - { - "Version": "2012-10-17", - "Statement": [ - { - "Effect": "Allow", - "Action": [ - "ec2:DescribeAccountAttributes", - "ec2:DescribeAvailabilityZones", - "ec2:DescribeInstances", - "ec2:DescribeInternetGateways", - "ec2:DescribeLaunchTemplateVersions", - "ec2:DescribeLaunchTemplates", - "ec2:DescribeNetworkInterfaces", - "ec2:DescribeRouteTables", - "ec2:DescribeSecurityGroups", - "ec2:DescribeSubnets", - "ec2:DescribeVpcs", - "ec2:CreateTags" - ], - "Resource": "*" - }, - { - "Effect": "Allow", - "Action": [ - "ec2:CreateInternetGateway", - "ec2:AttachInternetGateway", - "ec2:DetachInternetGateway", - "ec2:DeleteInternetGateway" - ], - "Resource": "arn:aws:ec2:*:*:internet-gateway/*" - }, - { - "Effect": "Allow", - "Action": [ - "ec2:CreateLaunchTemplate", - "ec2:CreateLaunchTemplateVersion", - "ec2:DeleteLaunchTemplate" - ], - "Resource": "arn:aws:ec2:*:*:launch-template/*" - }, - { - "Effect": "Allow", - "Action": [ - "ec2:AssociateRouteTable", - "ec2:DisassociateRouteTable", - "ec2:CreateRoute", - "ec2:CreateRouteTable", - "ec2:DeleteRoute", - "ec2:DeleteRouteTable" - ], - "Resource": "arn:aws:ec2:*:*:route-table/*" - }, - { - 
"Effect": "Allow", - "Action": [ - "ec2:AuthorizeSecurityGroupIngress", - "ec2:RevokeSecurityGroupIngress", - "ec2:CreateSecurityGroup", - "ec2:DeleteSecurityGroup" - ], - "Resource": "arn:aws:ec2:*:*:security-group/*" - }, - { - "Effect": "Allow", - "Action": "ec2:RunInstances", - "Resource": "*" - }, - { - "Effect": "Allow", - "Action": [ - "ec2:CreateSubnet", - "ec2:DeleteSubnet", - "ec2:AssociateRouteTable", - "ec2:DisassociateRouteTable" - ], - "Resource": "arn:aws:ec2:*:*:subnet/*" - }, - { - "Effect": "Allow", - "Action": [ - "ec2:CreateVpc", - "ec2:CreateSecurityGroup", - "ec2:ModifyVpcAttribute", - "ec2:AttachInternetGateway", - "ec2:DetachInternetGateway", - "ec2:CreateSubnet", - "ec2:CreateRouteTable", - "ec2:DeleteVpc" - ], - "Resource": "arn:aws:ec2:*:*:vpc/*" - }, - { - "Effect": "Allow", - "Action": [ - "ec2:CreateDefaultVpc", - "ec2:CreateDefaultSubnet" - ], - "Resource": "*" - }, - { - "Effect": "Allow", - "Action": [ - "iam:CreateInstanceProfile", - "iam:GetInstanceProfile", - "iam:AddRoleToInstanceProfile", - "iam:RemoveRoleFromInstanceProfile", - "iam:DeleteInstanceProfile" - ], - "Resource": "arn:aws:iam::*:instance-profile/*" - }, - { - "Effect": "Allow", - "Action": [ - "kms:DescribeKey", - "kms:CreateGrant", - "kms:Decrypt", - "kms:Encrypt" - ], - "Resource": "arn:aws:kms:*:*:key/*" - }, - { - "Effect": "Allow", - "Action": [ - "lambda:CreateFunction", - "lambda:GetFunction", - "lambda:GetFunctionCodeSigningConfig", - "lambda:AddPermission", - "lambda:RemovePermission", - "lambda:DeleteFunction", - "lambda:InvokeFunction", - "lambda:TagResource" - ], - "Resource": "arn:aws:lambda:*:*:function:*" - }, - { - "Effect": "Allow", - "Action": [ - "logs:CreateLogGroup", - "logs:PutRetentionPolicy", - "logs:DeleteLogGroup" - ], - "Resource": "arn:aws:logs:*:*:log-group:*" - }, - { - "Effect": "Allow", - "Action": [ - "s3:GetObject", - "s3:CreateBucket", - "s3:PutBucketAcl", - "s3:PutBucketLogging", - "s3:PutBucketTagging", - "s3:PutBucketVersioning" - ], - "Resource": "arn:aws:s3:::*" - }, - { - "Effect": "Allow", - "Action": [ - "ssm:GetParameter", - "ssm:PutParameter", - "ssm:DeleteParameter" - ], - "Resource": "arn:aws:ssm:*:*:parameter/*" - }, - { - "Effect": "Allow", - "Action": [ - "iam:ListPolicies", - "iam:ListInstanceProfiles", - "iam:ListRoles", - "iam:ListPolicyVersions", - "iam:ListRolePolicies", - "iam:ListAttachedRolePolicies", - "iam:ListInstanceProfileTags", - "iam:ListRoleTags", - "iam:ListInstanceProfilesForRole", - "iam:GetPolicyVersion", - "iam:GetPolicy", - "iam:GetInstanceProfile", - "iam:GetRole", - "iam:GetRolePolicy", - "iam:TagPolicy", - "iam:UntagPolicy", - "iam:TagInstanceProfile", - "iam:UntagInstanceProfile", - "iam:TagRole", - "iam:UntagRole", - "iam:CreateRole", - "iam:PassRole", - "iam:DeleteRole", - "iam:UpdateRoleDescription", - "iam:UpdateRole", - "iam:AddRoleToInstanceProfile", - "iam:RemoveRoleFromInstanceProfile", - "iam:CreateInstanceProfile", - "iam:DeleteInstanceProfile", - "iam:DetachRolePolicy", - "iam:SetDefaultPolicyVersion", - "iam:AttachRolePolicy", - "iam:UpdateAssumeRolePolicy", - "iam:PutRolePermissionsBoundary", - "iam:DeleteRolePermissionsBoundary", - "iam:CreatePolicy", - "iam:DeletePolicyVersion", - "iam:DeletePolicy", - "iam:PutRolePolicy", - "iam:DeleteRolePolicy" - ], - "Resource": "*" - }, - { - "Effect": "Allow", - "Action": [ - "autoscaling:DescribeLifecycleHookTypes", - "autoscaling:DescribeTerminationPolicyTypes", - "autoscaling:DescribePolicies", - "autoscaling:DescribeWarmPool", - 
"autoscaling:DescribeScalingActivities", - "autoscaling:DescribeScalingProcessTypes", - "autoscaling:DescribeScheduledActions", - "autoscaling:DescribeAutoScalingGroups", - "autoscaling:DescribeAutoScalingInstances", - "autoscaling:DescribeLifecycleHooks", - "autoscaling:SetDesiredCapacity", - "autoscaling:PutLifecycleHook", - "autoscaling:DeleteLifecycleHook", - "autoscaling:SetInstanceProtection", - "autoscaling:CreateAutoScalingGroup", - "autoscaling:EnableMetricsCollection", - "autoscaling:UpdateAutoScalingGroup", - "autoscaling:DeleteAutoScalingGroup", - "autoscaling:PutScalingPolicy", - "autoscaling:DeletePolicy", - "autoscaling:BatchPutScheduledUpdateGroupAction", - "autoscaling:PutScheduledUpdateGroupAction", - "autoscaling:DeleteScheduledAction", - "autoscaling:PutWarmPool", - "autoscaling:DeleteWarmPool", - "autoscaling:TerminateInstanceInAutoScalingGroup", - "autoscaling:AttachInstances" - ], - "Resource": "*" - }, - { - "Effect": "Allow", - "Action": [ - "events:DescribeRule", - "events:PutRule", - "events:PutTargets", - "events:RemoveTargets", - "events:DeleteRule" - ], - "Resource": "arn:aws:events:*:*:rule/*" - } - ] - } - - ImageBuilderPolicy: - Type: AWS::IAM::ManagedPolicy - Properties: - PolicyDocument: - { - "Version": "2012-10-17", - "Statement": [ - { - "Effect": "Allow", - "Action": [ - "imagebuilder:CreateComponent", - "imagebuilder:GetComponent", - "imagebuilder:DeleteComponent", - "imagebuilder:CreateImageRecipe", - "imagebuilder:GetImageRecipe", - "imagebuilder:DeleteImageRecipe", - "imagebuilder:CreateImagePipeline", - "imagebuilder:GetImagePipeline", - "imagebuilder:DeleteImagePipeline", - "imagebuilder:CreateInfrastructureConfiguration", - "imagebuilder:GetInfrastructureConfiguration", - "imagebuilder:DeleteInfrastructureConfiguration", - "imagebuilder:CreateDistributionConfiguration", - "imagebuilder:GetDistributionConfiguration", - "imagebuilder:DeleteDistributionConfiguration", - "imagebuilder:TagResource", - "imagebuilder:StartImagePipelineExecution", - "ec2:DescribeImages", - "ec2:DescribeSnapshots", - "ec2:DescribeRegions", - "ec2:DescribeVolumes", - "ec2:DescribeKeyPairs", - "ec2:DescribeInstanceTypeOfferings" - ], - "Resource": "*" - } - ] - } diff --git a/tests/buildkite/infrastructure/worker-image-pipeline/create_worker_image_pipelines.py b/tests/buildkite/infrastructure/worker-image-pipeline/create_worker_image_pipelines.py deleted file mode 100644 index 8051b991da51..000000000000 --- a/tests/buildkite/infrastructure/worker-image-pipeline/create_worker_image_pipelines.py +++ /dev/null @@ -1,85 +0,0 @@ -import argparse -import copy -import json -import os -import sys -from urllib.request import urlopen - -import boto3 -import cfn_flip -from metadata import IMAGE_PARAMS - -current_dir = os.path.dirname(__file__) -sys.path.append(os.path.join(current_dir, "..")) - -from common_blocks.utils import replace_stack, wait - -BUILDKITE_CF_TEMPLATE_URL = ( - "https://s3.amazonaws.com/buildkite-aws-stack/latest/aws-stack.yml" -) - - -def format_params(*, stack_id, aws_region, ami_mapping): - params = copy.deepcopy(IMAGE_PARAMS[stack_id]) - with open( - os.path.join(current_dir, params["BootstrapScript"]), - encoding="utf-8", - ) as f: - bootstrap_script = f.read() - params["BaseImageId"] = ami_mapping[aws_region][params["BaseImageId"]] - params["BootstrapScript"] = bootstrap_script - return [{"ParameterKey": k, "ParameterValue": v} for k, v in params.items()] - - -def get_ami_mapping(): - with urlopen(BUILDKITE_CF_TEMPLATE_URL) as response: - 
buildkite_cf_template = response.read().decode("utf-8") - cfn_obj = json.loads(cfn_flip.to_json(buildkite_cf_template)) - return cfn_obj["Mappings"]["AWSRegion2AMI"] - - -def get_full_stack_id(stack_id): - return f"buildkite-{stack_id}-worker" - - -def main(args): - with open( - os.path.join(current_dir, "ec2-image-builder-pipeline-template.yml"), - encoding="utf-8", - ) as f: - ec2_image_pipeline_template = f.read() - - ami_mapping = get_ami_mapping() - - client = boto3.client("cloudformation", region_name=args.aws_region) - promises = [] - - for stack_id in IMAGE_PARAMS: - stack_id_full = get_full_stack_id(stack_id) - print(f"Creating EC2 image builder stack {stack_id_full}...") - - params = format_params( - stack_id=stack_id, aws_region=args.aws_region, ami_mapping=ami_mapping - ) - - promise = replace_stack( - args, - client=client, - stack_name=stack_id_full, - template_body=ec2_image_pipeline_template, - params=params, - ) - promises.append(promise) - print( - f"EC2 image builder stack {stack_id_full} is in progress in the background" - ) - - for promise in promises: - wait(promise, client=client) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--aws-region", type=str, required=True) - args = parser.parse_args() - main(args) diff --git a/tests/buildkite/infrastructure/worker-image-pipeline/ec2-image-builder-pipeline-template.yml b/tests/buildkite/infrastructure/worker-image-pipeline/ec2-image-builder-pipeline-template.yml deleted file mode 100644 index 8d3bafa72f08..000000000000 --- a/tests/buildkite/infrastructure/worker-image-pipeline/ec2-image-builder-pipeline-template.yml +++ /dev/null @@ -1,108 +0,0 @@ ---- -AWSTemplateFormatVersion: "2010-09-09" -Description: "EC2 Image Builder pipelines to build workers" - -Parameters: - BaseImageId: - Type: String - Description: Base AMI to build a new image on top of. - - BootstrapScript: - Type: String - Description: Content of AMI customization script - - InstanceType: - Type: String - Description: Instance type for the Image Builder instances. - - InstanceOperatingSystem: - Type: String - Description: The operating system to run on the instance - AllowedValues: - - Linux - - Windows - Default: "Linux" - - VolumeSize: - Type: Number - Description: Size of EBS volume, in GiBs - -Conditions: - IsInstanceWindows: - !Equals [ !Ref InstanceOperatingSystem, "Windows" ] - -Resources: - # IAM role for the image builder instance - InstanceRole: - Type: AWS::IAM::Role - Properties: - AssumeRolePolicyDocument: - Version: "2012-10-17" - Statement: - - Effect: "Allow" - Principal: - Service: "ec2.amazonaws.com" - Action: "sts:AssumeRole" - ManagedPolicyArns: - - arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore - - arn:aws:iam::aws:policy/EC2InstanceProfileForImageBuilder - - arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess - - InstanceProfile: - Type: AWS::IAM::InstanceProfile - Properties: - Roles: - - !Ref InstanceRole - - # Component that runs the bootstrap script - BootstrapComponent: - Type: AWS::ImageBuilder::Component - Properties: - Name: !Join ["-", [!Ref AWS::StackName, "bootstrap-component", !Select [2, !Split ['/', !Ref AWS::StackId]]]] - Platform: !Ref InstanceOperatingSystem - Version: "1.0.0" - Description: Execute a bootstrap script. 
- Data: !Ref BootstrapScript - - Recipe: - Type: AWS::ImageBuilder::ImageRecipe - Properties: - Name: !Join ["-", [!Ref AWS::StackName, "image", !Select [2, !Split ['/', !Ref AWS::StackId]]]] - Components: - - ComponentArn: !Ref BootstrapComponent - ParentImage: !Ref BaseImageId - BlockDeviceMappings: - - DeviceName: !If [IsInstanceWindows, "/dev/sda1", "/dev/xvda"] - Ebs: - DeleteOnTermination: true - Encrypted: false - VolumeSize: !Ref VolumeSize - VolumeType: gp2 - Version: "1.0.0" - - Infrastructure: - Type: AWS::ImageBuilder::InfrastructureConfiguration - Properties: - Name: !Join ["-", [!Ref AWS::StackName, "image-pipeline-infrastructure", !Select [2, !Split ['/', !Ref AWS::StackId]]]] - InstanceProfileName: !Ref InstanceProfile - InstanceTypes: - - !Ref InstanceType - TerminateInstanceOnFailure: true - - # Copy to this region only - Distribution: - Type: AWS::ImageBuilder::DistributionConfiguration - Properties: - Name: !Join ["-", [!Ref AWS::StackName, "image-pipeline-distribution-config", !Select [2, !Split ['/', !Ref AWS::StackId]]]] - Distributions: - - Region: !Ref AWS::Region - AmiDistributionConfiguration: {} - - # Composition of the above elements - Pipeline: - Type: AWS::ImageBuilder::ImagePipeline - Properties: - Name: !Join ["-", [!Ref AWS::StackName, "image-pipeline", !Select [2, !Split ['/', !Ref AWS::StackId]]]] - DistributionConfigurationArn: !Ref Distribution - ImageRecipeArn: !Ref Recipe - InfrastructureConfigurationArn: !Ref Infrastructure diff --git a/tests/buildkite/infrastructure/worker-image-pipeline/linux-amd64-gpu-bootstrap.yml b/tests/buildkite/infrastructure/worker-image-pipeline/linux-amd64-gpu-bootstrap.yml deleted file mode 100644 index 88403911cbc6..000000000000 --- a/tests/buildkite/infrastructure/worker-image-pipeline/linux-amd64-gpu-bootstrap.yml +++ /dev/null @@ -1,24 +0,0 @@ -name: BuildKiteLinuxAMD64GPUBootstrap -description: Set up worker image for linux-amd64-gpu pipeline -schemaVersion: 1.0 - -phases: - - name: build - steps: - - name: SetupStep - action: ExecuteBash - inputs: - commands: - - | - yum groupinstall -y "Development tools" - yum install -y kernel-devel-$(uname -r) - dnf install -y kernel-modules-extra - aws s3 cp --recursive s3://ec2-linux-nvidia-drivers/latest/ . 
- chmod +x NVIDIA-Linux-x86_64*.run - ./NVIDIA-Linux-x86_64*.run --silent - - curl -s -L https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo | tee /etc/yum.repos.d/nvidia-container-toolkit.repo - yum install -y nvidia-container-toolkit - yum clean expire-cache - nvidia-ctk runtime configure --runtime=docker - systemctl restart docker diff --git a/tests/buildkite/infrastructure/worker-image-pipeline/metadata.py b/tests/buildkite/infrastructure/worker-image-pipeline/metadata.py deleted file mode 100644 index 37100209fe2e..000000000000 --- a/tests/buildkite/infrastructure/worker-image-pipeline/metadata.py +++ /dev/null @@ -1,18 +0,0 @@ -IMAGE_PARAMS = { - "linux-amd64-gpu": { - "BaseImageId": "linuxamd64", - # AMI ID is looked up from Buildkite's CloudFormation template - "BootstrapScript": "linux-amd64-gpu-bootstrap.yml", - "InstanceType": "g4dn.xlarge", - "InstanceOperatingSystem": "Linux", - "VolumeSize": "40", # in GiBs - }, - "windows-gpu": { - "BaseImageId": "windows", - # AMI ID is looked up from Buildkite's CloudFormation template - "BootstrapScript": "windows-gpu-bootstrap.yml", - "InstanceType": "g4dn.2xlarge", - "InstanceOperatingSystem": "Windows", - "VolumeSize": "120", # in GiBs - }, -} diff --git a/tests/buildkite/infrastructure/worker-image-pipeline/run_pipelines.py b/tests/buildkite/infrastructure/worker-image-pipeline/run_pipelines.py deleted file mode 100644 index 9edb8b1a7c24..000000000000 --- a/tests/buildkite/infrastructure/worker-image-pipeline/run_pipelines.py +++ /dev/null @@ -1,22 +0,0 @@ -import argparse - -import boto3 -from create_worker_image_pipelines import get_full_stack_id -from metadata import IMAGE_PARAMS - - -def main(args): - cf = boto3.resource("cloudformation", region_name=args.aws_region) - builder_client = boto3.client("imagebuilder", region_name=args.aws_region) - for stack_id in IMAGE_PARAMS: - stack_id_full = get_full_stack_id(stack_id) - pipeline_arn = cf.Stack(stack_id_full).Resource("Pipeline").physical_resource_id - print(f"Running pipeline {pipeline_arn} to generate a new AMI...") - r = builder_client.start_image_pipeline_execution(imagePipelineArn=pipeline_arn) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--aws-region", type=str, required=True) - args = parser.parse_args() - main(args) diff --git a/tests/buildkite/infrastructure/worker-image-pipeline/windows-gpu-bootstrap.yml b/tests/buildkite/infrastructure/worker-image-pipeline/windows-gpu-bootstrap.yml deleted file mode 100644 index 0348e28c8709..000000000000 --- a/tests/buildkite/infrastructure/worker-image-pipeline/windows-gpu-bootstrap.yml +++ /dev/null @@ -1,71 +0,0 @@ -name: BuildKiteWindowsGPUBootstrap -description: Set up worker image for windows-gpu pipeline -schemaVersion: 1.0 - -phases: - - name: build - steps: - - name: SetupStep - action: ExecutePowerShell - inputs: - commands: - - | - $ErrorActionPreference = "Stop" - - choco --version - choco feature enable -n=allowGlobalConfirmation - - # CMake 3.29.2 - Write-Host '>>> Installing CMake 3.29.2...' - choco install cmake --version 3.29.2 --installargs "ADD_CMAKE_TO_PATH=System" - if ($LASTEXITCODE -ne 0) { throw "Last command failed" } - - # Notepad++ - Write-Host '>>> Installing Notepad++...' - choco install notepadplusplus - if ($LASTEXITCODE -ne 0) { throw "Last command failed" } - - # Mambaforge - Write-Host '>>> Installing Mambaforge...' 
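For reference, the deleted worker-image helpers above (create_worker_image_pipelines.py, metadata.py, run_pipelines.py) boil down to a single flow: deploy the EC2 Image Builder template as a CloudFormation stack, wait for it to finish, then start the pipeline so it bakes a fresh worker AMI. The following is only a condensed boto3 sketch of that flow; the stack name, parameter values, and the direct create_stack call are illustrative (the real helpers went through a shared replace_stack/wait utility and resolved the base AMI from Buildkite's published stack template).

import boto3

REGION = "us-west-2"  # illustrative; the real scripts took --aws-region
STACK_NAME = "buildkite-linux-amd64-gpu-worker"

with open("ec2-image-builder-pipeline-template.yml", encoding="utf-8") as f:
    template_body = f.read()

cfn = boto3.client("cloudformation", region_name=REGION)
cfn.create_stack(
    StackName=STACK_NAME,
    TemplateBody=template_body,
    Capabilities=["CAPABILITY_IAM", "CAPABILITY_NAMED_IAM"],
    Parameters=[
        # Placeholder values; metadata.py supplied the real ones per pipeline.
        {"ParameterKey": "BaseImageId", "ParameterValue": "ami-xxxxxxxx"},
        {"ParameterKey": "BootstrapScript", "ParameterValue": "..."},
        {"ParameterKey": "InstanceType", "ParameterValue": "g4dn.xlarge"},
        {"ParameterKey": "InstanceOperatingSystem", "ParameterValue": "Linux"},
        {"ParameterKey": "VolumeSize", "ParameterValue": "40"},
    ],
)
cfn.get_waiter("stack_create_complete").wait(StackName=STACK_NAME)

# Look up the pipeline ARN created by the stack and kick off an image build,
# as run_pipelines.py did.
pipeline_arn = (
    boto3.resource("cloudformation", region_name=REGION)
    .Stack(STACK_NAME)
    .Resource("Pipeline")
    .physical_resource_id
)
boto3.client("imagebuilder", region_name=REGION).start_image_pipeline_execution(
    imagePipelineArn=pipeline_arn
)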
- choco install mambaforge /RegisterPython:1 /D:C:\tools\mambaforge - C:\tools\mambaforge\Scripts\conda.exe init --user --system - if ($LASTEXITCODE -ne 0) { throw "Last command failed" } - . "C:\Windows\System32\WindowsPowerShell\v1.0\profile.ps1" - if ($LASTEXITCODE -ne 0) { throw "Last command failed" } - conda config --set auto_activate_base false - - # Install Java 11 - Write-Host '>>> Installing Java 11...' - choco install openjdk11 - if ($LASTEXITCODE -ne 0) { throw "Last command failed" } - - # Install Maven - Write-Host '>>> Installing Maven...' - choco install maven - if ($LASTEXITCODE -ne 0) { throw "Last command failed" } - - # Install GraphViz - Write-Host '>>> Installing GraphViz...' - choco install graphviz - if ($LASTEXITCODE -ne 0) { throw "Last command failed" } - - # Install Visual Studio 2022 Community - Write-Host '>>> Installing Visual Studio 2022 Community...' - choco install visualstudio2022community ` - --params "--wait --passive --norestart" - if ($LASTEXITCODE -ne 0) { throw "Last command failed" } - choco install visualstudio2022-workload-nativedesktop --params ` - "--wait --passive --norestart --includeOptional" - if ($LASTEXITCODE -ne 0) { throw "Last command failed" } - - # Install CUDA 12.4 - Write-Host '>>> Installing CUDA 12.4...' - choco install cuda --version=12.4.1.551 - if ($LASTEXITCODE -ne 0) { throw "Last command failed" } - - # Install R - Write-Host '>>> Installing R...' - choco install r.project --version=4.3.2 - if ($LASTEXITCODE -ne 0) { throw "Last command failed" } - choco install rtools --version=4.3.5550 - if ($LASTEXITCODE -ne 0) { throw "Last command failed" } diff --git a/tests/buildkite/pipeline-mac-m1.yml b/tests/buildkite/pipeline-mac-m1.yml deleted file mode 100644 index 57b1b1d12010..000000000000 --- a/tests/buildkite/pipeline-mac-m1.yml +++ /dev/null @@ -1,13 +0,0 @@ -steps: - - block: ":rocket: Run this test job" - if: build.pull_request.id != null || build.branch =~ /^dependabot\// - - label: ":macos: Build libxgboost4j.dylib for MacOS M1" - command: "tests/buildkite/build-jvm-macos-m1.sh" - key: mac-m1-jvm - agents: - queue: mac-mini-m1 - - label: ":macos: Build and Test XGBoost for MacOS M1 with Clang 11" - command: "tests/buildkite/test-macos-m1-clang11.sh" - key: mac-m1-appleclang11 - agents: - queue: mac-mini-m1 diff --git a/tests/buildkite/pipeline-mgpu.yml b/tests/buildkite/pipeline-mgpu.yml deleted file mode 100644 index cbb573c3682c..000000000000 --- a/tests/buildkite/pipeline-mgpu.yml +++ /dev/null @@ -1,48 +0,0 @@ -env: - DOCKER_CACHE_ECR_ID: "492475357299" - DOCKER_CACHE_ECR_REGION: "us-west-2" - DISABLE_RELEASE: "1" - # Skip uploading artifacts to S3 bucket - # Also, don't build all CUDA archs; just build sm_75 -steps: - - label: ":moneybag: Enforce daily budget" - command: "tests/buildkite/enforce_daily_budget.sh" - key: enforce-daily-budget - agents: - queue: pipeline-loader - - wait - - block: ":rocket: Run this test job" - if: build.pull_request.id != null || build.branch =~ /^dependabot\// - #### -------- CONTAINER BUILD -------- - - label: ":docker: Build containers" - commands: - - "tests/buildkite/build-containers.sh gpu" - - "tests/buildkite/build-containers.sh gpu_build_rockylinux8" - - "tests/buildkite/build-containers.sh jvm_gpu_build" - key: build-containers - agents: - queue: linux-amd64-cpu - - wait - #### -------- BUILD -------- - - label: ":console: Build CUDA" - command: "tests/buildkite/build-cuda.sh" - key: build-cuda - agents: - queue: linux-amd64-cpu - - label: ":console: Build and test JVM 
packages with CUDA" - command: "tests/buildkite/build-jvm-packages-gpu.sh" - key: build-jvm-packages-gpu - agents: - queue: linux-amd64-mgpu - - wait - #### -------- TEST -------- - - label: ":console: Run Google Tests" - command: "tests/buildkite/test-cpp-mgpu.sh" - key: test-cpp-mgpu - agents: - queue: linux-amd64-mgpu - - label: ":console: Test Python package, 4 GPUs" - command: "tests/buildkite/test-python-gpu.sh mgpu" - key: test-python-mgpu - agents: - queue: linux-amd64-mgpu diff --git a/tests/buildkite/pipeline-nightly.yml b/tests/buildkite/pipeline-nightly.yml deleted file mode 100644 index 4d84f93a54d4..000000000000 --- a/tests/buildkite/pipeline-nightly.yml +++ /dev/null @@ -1,43 +0,0 @@ -# Nightly CI pipeline, to test against dev versions of dependencies - -env: - DOCKER_CACHE_ECR_ID: "492475357299" - DOCKER_CACHE_ECR_REGION: "us-west-2" - DISABLE_RELEASE: "1" - # Skip uploading artifacts to S3 bucket - # Also, don't build all CUDA archs; just build sm_75 - USE_DEPS_DEV_VER: "1" - # Use dev versions of RAPIDS and other dependencies -steps: - #### -------- CONTAINER BUILD -------- - - label: ":docker: Build containers" - commands: - - "tests/buildkite/build-containers.sh gpu_build_rockylinux8" - - "tests/buildkite/build-containers.sh gpu_dev_ver" - key: build-containers - agents: - queue: linux-amd64-cpu - - wait - - - label: ":console: Build CUDA" - command: "tests/buildkite/build-cuda.sh" - key: build-cuda - agents: - queue: linux-amd64-cpu - - wait - - label: ":console: Build CUDA + RMM Nightly" - command: "tests/buildkite/build-cuda-with-rmm.sh dev" - key: build-cuda-rmm-nightly - agents: - queue: linux-amd64-cpu - - wait - - label: ":console: Test Python package, single GPU" - command: "tests/buildkite/test-python-gpu.sh gpu" - key: test-python-gpu - agents: - queue: linux-amd64-gpu - - label: ":console: Test Python package, 4 GPUs" - command: "tests/buildkite/test-python-gpu.sh mgpu" - key: test-python-mgpu - agents: - queue: linux-amd64-mgpu diff --git a/tests/buildkite/pipeline-win64.yml b/tests/buildkite/pipeline-win64.yml deleted file mode 100644 index 83a61981e716..000000000000 --- a/tests/buildkite/pipeline-win64.yml +++ /dev/null @@ -1,24 +0,0 @@ -steps: - - label: ":moneybag: Enforce daily budget" - command: "tests/buildkite/enforce_daily_budget.sh" - key: enforce-daily-budget - agents: - queue: pipeline-loader - - wait - - block: ":rocket: Run this test job" - if: build.pull_request.id != null || build.branch =~ /^dependabot\// - #### -------- BUILD -------- - - label: ":windows: Build XGBoost for Windows with CUDA" - command: "tests/buildkite/build-win64-gpu.ps1" - key: build-win64-gpu - agents: - queue: windows-cpu - - - wait - - #### -------- TEST -------- - - label: ":windows: Test XGBoost on Windows" - command: "tests/buildkite/test-win64-gpu.ps1" - key: test-win64-gpu - agents: - queue: windows-gpu diff --git a/tests/buildkite/pipeline.yml b/tests/buildkite/pipeline.yml deleted file mode 100644 index 6c1df33b84dd..000000000000 --- a/tests/buildkite/pipeline.yml +++ /dev/null @@ -1,113 +0,0 @@ -env: - DOCKER_CACHE_ECR_ID: "492475357299" - DOCKER_CACHE_ECR_REGION: "us-west-2" -steps: - - label: ":moneybag: Enforce daily budget" - command: "tests/buildkite/enforce_daily_budget.sh" - key: enforce-daily-budget - agents: - queue: pipeline-loader - - wait - - block: ":rocket: Run this test job" - if: build.pull_request.id != null || build.branch =~ /^dependabot\// - #### -------- CONTAINER BUILD -------- - - label: ":docker: Build containers" - commands: - - 
"tests/buildkite/build-containers.sh cpu" - - "tests/buildkite/build-containers.sh gpu" - - "tests/buildkite/build-containers.sh gpu_build_rockylinux8" - key: build-containers - agents: - queue: linux-amd64-cpu - - wait - #### -------- BUILD -------- - - label: ":console: Run clang-tidy" - command: "tests/buildkite/run-clang-tidy.sh" - key: run-clang-tidy - agents: - queue: linux-amd64-cpu - - label: ":console: Build CPU" - command: "tests/buildkite/build-cpu.sh" - key: build-cpu - agents: - queue: linux-amd64-cpu - - label: ":console: Build CPU ARM64 + manylinux_2_28_aarch64 wheel" - command: "tests/buildkite/build-cpu-arm64.sh" - key: build-cpu-arm64 - agents: - queue: linux-arm64-cpu - - label: ":console: Build CUDA + manylinux_2_28_x86_64 wheel" - command: "tests/buildkite/build-cuda.sh" - key: build-cuda - agents: - queue: linux-amd64-cpu - - label: ":console: Build CUDA with RMM" - command: "tests/buildkite/build-cuda-with-rmm.sh stable" - key: build-cuda-with-rmm - agents: - queue: linux-amd64-cpu - - label: ":console: Build R package with CUDA" - command: "tests/buildkite/build-gpu-rpkg.sh" - key: build-gpu-rpkg - agents: - queue: linux-amd64-cpu - - label: ":console: Build JVM packages" - timeout_in_minutes: 30 - command: "tests/buildkite/build-jvm-packages.sh" - key: build-jvm-packages - agents: - queue: linux-amd64-cpu - - label: ":console: Build libxgboost4j.so for Linux ARM64 (targeting glibc 2.17)" - command: "tests/buildkite/build-jvm-linux-arm64-manylinux2014.sh" - key: build-jvm-linux-arm64-manylinux2014 - agents: - queue: linux-arm64-cpu - - label: ":console: Build libxgboost4j.so for Linux x86_64 (targeting glibc 2.17)" - command: "tests/buildkite/build-jvm-linux-x86_64-manylinux2014.sh" - key: build-jvm-linux-x86_64-manylinux2014 - agents: - queue: linux-amd64-cpu - - label: ":console: Build JVM package doc" - command: "tests/buildkite/build-jvm-doc.sh" - key: build-jvm-doc - agents: - queue: linux-amd64-cpu - - label: ":console: Build manylinux2014_x86_64 wheel" - command: "tests/buildkite/build-manylinux2014.sh x86_64" - key: build-manylinux2014-x86_64 - agents: - queue: linux-amd64-cpu - - label: ":console: Build manylinux2014_aarch64 wheel" - command: "tests/buildkite/build-manylinux2014.sh aarch64" - key: build-manylinux2014-aarch64 - agents: - queue: linux-arm64-cpu - - wait - #### -------- TEST -------- - - label: ":console: Test Python package, CPU" - command: "tests/buildkite/test-python-cpu.sh" - key: test-python-cpu - agents: - queue: linux-amd64-cpu - - label: ":console: Test Python package, CPU ARM64" - command: "tests/buildkite/test-python-cpu-arm64.sh" - key: test-python-cpu-arm64 - agents: - queue: linux-arm64-cpu - - label: ":console: Test Python package, single GPU" - command: "tests/buildkite/test-python-gpu.sh gpu" - key: test-python-gpu - agents: - queue: linux-amd64-gpu - - label: ":console: Run Google Tests" - command: "tests/buildkite/test-cpp-gpu.sh" - key: test-cpp-gpu - agents: - queue: linux-amd64-gpu - - wait - #### -------- DEPLOY JVM -------- - - label: ":console: Deploy JVM packages" - command: "tests/buildkite/deploy-jvm-packages.sh" - key: deploy-jvm-packages - agents: - queue: linux-amd64-cpu diff --git a/tests/buildkite/run-clang-tidy.sh b/tests/buildkite/run-clang-tidy.sh deleted file mode 100755 index 95ff010c20f1..000000000000 --- a/tests/buildkite/run-clang-tidy.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -echo "--- Run clang-tidy" - -source tests/buildkite/conftest.sh - -tests/ci_build/ci_build.sh 
clang_tidy \ - --build-arg CUDA_VERSION_ARG=${CUDA_VERSION} \ - python3 tests/ci_build/tidy.py --cuda-archs 75 diff --git a/tests/buildkite/test-cpp-gpu.sh b/tests/buildkite/test-cpp-gpu.sh deleted file mode 100755 index d7197db2efce..000000000000 --- a/tests/buildkite/test-cpp-gpu.sh +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -source tests/buildkite/conftest.sh - -echo "--- Run Google Tests with CUDA, using a GPU" -buildkite-agent artifact download "build/testxgboost" . --step build-cuda -chmod +x build/testxgboost -tests/ci_build/ci_build.sh gpu --use-gpus \ - --build-arg CUDA_VERSION_ARG=$CUDA_VERSION \ - --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION \ - --build-arg NCCL_VERSION_ARG=$NCCL_VERSION \ - build/testxgboost - -echo "--- Run Google Tests with CUDA, using a GPU, RMM enabled" -rm -rfv build/ -buildkite-agent artifact download "build/testxgboost" . --step build-cuda-with-rmm -chmod +x build/testxgboost -tests/ci_build/ci_build.sh gpu --use-gpus \ - --build-arg CUDA_VERSION_ARG=$CUDA_VERSION \ - --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION \ - --build-arg NCCL_VERSION_ARG=$NCCL_VERSION \ - build/testxgboost --use-rmm-pool diff --git a/tests/buildkite/test-cpp-mgpu.sh b/tests/buildkite/test-cpp-mgpu.sh deleted file mode 100755 index 65614b191d04..000000000000 --- a/tests/buildkite/test-cpp-mgpu.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -source tests/buildkite/conftest.sh - -# Allocate extra space in /dev/shm to enable NCCL -export CI_DOCKER_EXTRA_PARAMS_INIT='--shm-size=4g' - -echo "--- Run Google Tests with CUDA, using multiple GPUs" -buildkite-agent artifact download "build/testxgboost" . --step build-cuda -chmod +x build/testxgboost -tests/ci_build/ci_build.sh gpu --use-gpus \ - --build-arg CUDA_VERSION_ARG=$CUDA_VERSION \ - --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION \ - --build-arg NCCL_VERSION_ARG=$NCCL_VERSION \ - build/testxgboost --gtest_filter=*MGPU* diff --git a/tests/buildkite/test-macos-m1-clang11.sh b/tests/buildkite/test-macos-m1-clang11.sh deleted file mode 100755 index 6824cb7b14b4..000000000000 --- a/tests/buildkite/test-macos-m1-clang11.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -source tests/buildkite/conftest.sh - -# Display system info -echo "--- Display system information" -set -x -system_profiler SPSoftwareDataType -sysctl -n machdep.cpu.brand_string -uname -m -set +x - -# Ensure that XGBoost can be built with Clang 11 -echo "--- Build and Test XGBoost with MacOS M1, Clang 11" -set -x -LLVM11_PATH=$(brew --prefix llvm\@11) -mkdir build -pushd build -cmake .. -GNinja -DCMAKE_C_COMPILER=${LLVM11_PATH}/bin/clang \ - -DCMAKE_CXX_COMPILER=${LLVM11_PATH}/bin/clang++ -DGOOGLE_TEST=ON \ - -DUSE_DMLC_GTEST=ON -ninja -v -./testxgboost diff --git a/tests/buildkite/test-python-cpu-arm64.sh b/tests/buildkite/test-python-cpu-arm64.sh deleted file mode 100755 index 68a428034073..000000000000 --- a/tests/buildkite/test-python-cpu-arm64.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -source tests/buildkite/conftest.sh - -echo "--- Test Python CPU ARM64" -buildkite-agent artifact download "python-package/dist/*.whl" . --step build-cpu-arm64 -buildkite-agent artifact download "xgboost" . 
--step build-cpu-arm64 -chmod +x ./xgboost -tests/ci_build/ci_build.sh aarch64 tests/ci_build/test_python.sh cpu-arm64 diff --git a/tests/buildkite/test-python-cpu.sh b/tests/buildkite/test-python-cpu.sh deleted file mode 100755 index 6c53dc2821bc..000000000000 --- a/tests/buildkite/test-python-cpu.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -echo "--- Test CPU code in Python env" - -source tests/buildkite/conftest.sh - -mkdir -pv python-package/dist -buildkite-agent artifact download "python-package/dist/*.whl" . --step build-cuda -buildkite-agent artifact download "xgboost" . --step build-cpu -chmod +x ./xgboost - -export BUILDKITE_ANALYTICS_TOKEN=$(get_aws_secret buildkite/test_analytics/cpu) -set_buildkite_env_vars_in_container -tests/ci_build/ci_build.sh cpu tests/ci_build/test_python.sh cpu diff --git a/tests/buildkite/test-python-gpu.sh b/tests/buildkite/test-python-gpu.sh deleted file mode 100755 index d7bd729a2e01..000000000000 --- a/tests/buildkite/test-python-gpu.sh +++ /dev/null @@ -1,59 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -if [ "$#" -lt 1 ] -then - suite='' - args='' -else - suite=$1 - shift 1 - args="$@" -fi - -source tests/buildkite/conftest.sh - -echo "--- Fetch build artifacts" -buildkite-agent artifact download "python-package/dist/*.whl" . --step build-cuda -buildkite-agent artifact download "build/testxgboost" . --step build-cuda -chmod +x build/testxgboost - -# Allocate extra space in /dev/shm to enable NCCL -export CI_DOCKER_EXTRA_PARAMS_INIT='--shm-size=4g' - -if [[ -z "${USE_DEPS_DEV_VER-}" ]] -then - container_tag='gpu' - rapids_version=${RAPIDS_VERSION} -else - container_tag='gpu_dev_ver' - rapids_version=${DEV_RAPIDS_VERSION} -fi - -command_wrapper="tests/ci_build/ci_build.sh ${container_tag} --use-gpus --build-arg "` - `"CUDA_VERSION_ARG=$CUDA_VERSION --build-arg "` - `"RAPIDS_VERSION_ARG=${rapids_version} --build-arg "` - `"NCCL_VERSION_ARG=$NCCL_VERSION" - -# Run specified test suite -case "$suite" in - gpu) - export BUILDKITE_ANALYTICS_TOKEN=$(get_aws_secret buildkite/test_analytics/gpu) - set_buildkite_env_vars_in_container - echo "--- Test XGBoost Python package, single GPU" - $command_wrapper tests/ci_build/test_python.sh $suite - ;; - - mgpu) - export BUILDKITE_ANALYTICS_TOKEN=$(get_aws_secret buildkite/test_analytics/mgpu) - set_buildkite_env_vars_in_container - echo "--- Test XGBoost Python package, 4 GPUs" - $command_wrapper tests/ci_build/test_python.sh $suite - ;; - - *) - echo "Usage: $0 {gpu|mgpu} [extra args to pass to pytest]" - exit 1 - ;; -esac diff --git a/tests/buildkite/test-win64-gpu.ps1 b/tests/buildkite/test-win64-gpu.ps1 deleted file mode 100644 index 95a51b50228d..000000000000 --- a/tests/buildkite/test-win64-gpu.ps1 +++ /dev/null @@ -1,39 +0,0 @@ -$ErrorActionPreference = "Stop" - -. tests/buildkite/conftest.ps1 - -Write-Host "--- Test XGBoost on Windows with CUDA" - -New-Item python-package/dist -ItemType Directory -ea 0 -New-Item build -ItemType Directory -ea 0 -buildkite-agent artifact download "python-package/dist/*.whl" . --step build-win64-gpu -if ($LASTEXITCODE -ne 0) { throw "Last command failed" } -buildkite-agent artifact download "build/testxgboost.exe" . --step build-win64-gpu -if ($LASTEXITCODE -ne 0) { throw "Last command failed" } -buildkite-agent artifact download "xgboost.exe" . 
--step build-win64-gpu -if ($LASTEXITCODE -ne 0) { throw "Last command failed" } - -nvcc --version - -Write-Host "--- Run Google Tests" -& build/testxgboost.exe -if ($LASTEXITCODE -ne 0) { throw "Last command failed" } - -Write-Host "--- Set up Python env" -conda activate -$env_name = -join("win64_", (New-Guid).ToString().replace("-", "")) -mamba env create -n ${env_name} --file=tests/ci_build/conda_env/win64_test.yml -conda activate ${env_name} -Get-ChildItem . -Filter python-package/dist/*.whl | -Foreach-Object { - & python -m pip install python-package/dist/$_ - if ($LASTEXITCODE -ne 0) { throw "Last command failed" } -} - -Write-Host "--- Run Python tests" -python -X faulthandler -m pytest -v -s -rxXs --fulltrace tests/python -if ($LASTEXITCODE -ne 0) { throw "Last command failed" } -Write-Host "--- Run Python tests with GPU" -python -X faulthandler -m pytest -v -s -rxXs --fulltrace -m "(not slow) and (not mgpu)"` - tests/python-gpu -if ($LASTEXITCODE -ne 0) { throw "Last command failed" } diff --git a/tests/ci_build/Dockerfile.gpu_dev_ver b/tests/ci_build/Dockerfile.gpu_dev_ver deleted file mode 100644 index d23c5e83c2c7..000000000000 --- a/tests/ci_build/Dockerfile.gpu_dev_ver +++ /dev/null @@ -1,54 +0,0 @@ -# Container to test XGBoost against dev versions of dependencies - -ARG CUDA_VERSION_ARG -FROM nvidia/cuda:$CUDA_VERSION_ARG-runtime-ubuntu22.04 -ARG CUDA_VERSION_ARG -ARG RAPIDS_VERSION_ARG - # Should be first 4 digits of the dev version (e.g. 24.06) -ARG NCCL_VERSION_ARG - -# Environment -ENV DEBIAN_FRONTEND=noninteractive -SHELL ["/bin/bash", "-c"] # Use Bash as shell - -# Install all basic requirements -RUN \ - apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/3bf863cc.pub && \ - apt-get update && \ - apt-get install -y wget unzip bzip2 libgomp1 build-essential openjdk-8-jdk-headless && \ - # Python - wget -nv -O conda.sh https://github.com/conda-forge/miniforge/releases/download/24.3.0-0/Miniforge3-24.3.0-0-Linux-x86_64.sh && \ - bash conda.sh -b -p /opt/miniforge - -ENV PATH=/opt/miniforge/bin:$PATH - -# Create new Conda environment with dev versions of cuDF, Dask, and cuPy -RUN \ - export NCCL_SHORT_VER=$(echo "$NCCL_VERSION_ARG" | cut -d "-" -f 1) && \ - export CUDA_SHORT_VER=$(echo "$CUDA_VERSION_ARG" | grep -o -E '[0-9]+\.[0-9]') && \ - mamba create -y -n gpu_test -c rapidsai-nightly -c conda-forge -c nvidia \ - python=3.10 "cudf=$RAPIDS_VERSION_ARG.*" "rmm=$RAPIDS_VERSION_ARG.*" cuda-version=$CUDA_SHORT_VER \ - "nccl>=${NCCL_SHORT_VER}" \ - dask \ - "dask-cuda=$RAPIDS_VERSION_ARG.*" "dask-cudf=$RAPIDS_VERSION_ARG.*" cupy \ - numpy pytest pytest-timeout scipy scikit-learn pandas matplotlib wheel \ - python-kubernetes urllib3 graphviz hypothesis loky \ - "pyspark>=3.4.0" cloudpickle cuda-python && \ - mamba clean --all --yes && \ - conda run --no-capture-output -n gpu_test pip install buildkite-test-collector - -ENV GOSU_VERSION=1.10 -ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/ - -# Install lightweight sudo (not bound to TTY) -RUN set -ex; \ - wget -nv -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \ - chmod +x /usr/local/bin/gosu && \ - gosu nobody true - -# Default entry-point to use if running locally -# It will preserve attributes of created files -COPY entrypoint.sh /scripts/ - -WORKDIR /workspace -ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/tests/ci_build/Dockerfile.jvm_manylinux2014_aarch64 b/tests/ci_build/Dockerfile.jvm_manylinux2014_aarch64 
deleted file mode 100644 index 52baff43bb6f..000000000000 --- a/tests/ci_build/Dockerfile.jvm_manylinux2014_aarch64 +++ /dev/null @@ -1,17 +0,0 @@ -FROM quay.io/pypa/manylinux2014_aarch64 - -RUN yum update -y && yum install -y java-1.8.0-openjdk-devel - -# Install lightweight sudo (not bound to TTY) -ENV GOSU_VERSION=1.10 -RUN set -ex; \ - curl -o /usr/local/bin/gosu -L "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-arm64" && \ - chmod +x /usr/local/bin/gosu && \ - gosu nobody true - -# Default entry-point to use if running locally -# It will preserve attributes of created files -COPY entrypoint.sh /scripts/ - -WORKDIR /workspace -ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/tests/ci_build/Dockerfile.jvm_manylinux2014_x86_64 b/tests/ci_build/Dockerfile.jvm_manylinux2014_x86_64 deleted file mode 100644 index 578b85618776..000000000000 --- a/tests/ci_build/Dockerfile.jvm_manylinux2014_x86_64 +++ /dev/null @@ -1,17 +0,0 @@ -FROM quay.io/pypa/manylinux2014_x86_64 - -RUN yum update -y && yum install -y java-1.8.0-openjdk-devel ninja-build - -# Install lightweight sudo (not bound to TTY) -ENV GOSU_VERSION=1.10 -RUN set -ex; \ - curl -o /usr/local/bin/gosu -L "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \ - chmod +x /usr/local/bin/gosu && \ - gosu nobody true - -# Default entry-point to use if running locally -# It will preserve attributes of created files -COPY entrypoint.sh /scripts/ - -WORKDIR /workspace -ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/tests/ci_build/build_jvm_doc.sh b/tests/ci_build/build_jvm_doc.sh deleted file mode 100755 index 01a91dd629b5..000000000000 --- a/tests/ci_build/build_jvm_doc.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -if [ $# -ne 1 ]; then - echo "Usage: $0 [branch name]" - exit 1 -fi - -set -e -set -x - -rm -rf build/ -cd jvm-packages - -branch_name=$1 - -# Install JVM packages in local Maven repository -mvn --no-transfer-progress install -DskipTests -# Build Scaladocs -mvn --no-transfer-progress scala:doc -DskipTests -# Build Javadocs -mvn --no-transfer-progress javadoc:javadoc -DskipTests - -# Package JVM docs in a tarball -mkdir -p tmp/scaladocs -cp -rv xgboost4j/target/reports/apidocs/ ./tmp/javadocs/ -cp -rv xgboost4j/target/site/scaladocs/ ./tmp/scaladocs/xgboost4j/ -cp -rv xgboost4j-spark/target/site/scaladocs/ ./tmp/scaladocs/xgboost4j-spark/ -cp -rv xgboost4j-flink/target/site/scaladocs/ ./tmp/scaladocs/xgboost4j-flink/ - -cd tmp -tar cvjf ${branch_name}.tar.bz2 javadocs/ scaladocs/ -mv ${branch_name}.tar.bz2 .. -cd .. -rm -rfv tmp/ - -set +x -set +e diff --git a/tests/ci_build/build_jvm_packages.sh b/tests/ci_build/build_jvm_packages.sh deleted file mode 100755 index 99681f5ca43c..000000000000 --- a/tests/ci_build/build_jvm_packages.sh +++ /dev/null @@ -1,48 +0,0 @@ -#!/bin/bash - -set -e -set -x - -spark_version=$1 -use_cuda=$2 -gpu_arch=$3 -use_scala213=$4 - -gpu_options="" -if [ "x$use_cuda" == "x-Duse.cuda=ON" ]; then - gpu_options="$use_cuda -Pgpu" -fi - -rm -rf build/ -cd jvm-packages - -if [ "x$gpu_arch" != "x" ]; then - export GPU_ARCH_FLAG=$gpu_arch -fi - -# Purge artifacts and set correct Scala version -pushd .. 
-if [ "x$use_scala213" != "x" ]; then - python dev/change_scala_version.py --scala-version 2.13 --purge-artifacts -else - python dev/change_scala_version.py --scala-version 2.12 --purge-artifacts -fi -popd - -# Build and test XGBoost4j-spark against different spark versions only for CPU and scala=2.12 -if [ "x$gpu_options" == "x" ] && [ "x$use_scala213" == "x" ]; then - mvn --no-transfer-progress clean package -Dspark.version=3.1.3 -pl xgboost4j,xgboost4j-spark - mvn --no-transfer-progress clean package -Dspark.version=3.2.4 -pl xgboost4j,xgboost4j-spark - mvn --no-transfer-progress clean package -Dspark.version=3.3.4 -pl xgboost4j,xgboost4j-spark - mvn --no-transfer-progress clean package -Dspark.version=3.4.3 -pl xgboost4j,xgboost4j-spark -fi - -mvn --no-transfer-progress clean install -Dspark.version=${spark_version} $gpu_options - -# Integration tests -if [ "x$use_cuda" == "x" ]; then - mvn --no-transfer-progress test -pl xgboost4j-example -fi - -set +x -set +e diff --git a/tests/ci_build/build_via_cmake.sh b/tests/ci_build/build_via_cmake.sh deleted file mode 100755 index 3238c41e1bcb..000000000000 --- a/tests/ci_build/build_via_cmake.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/usr/bin/env bash -set -e - -if [[ "$1" == --conda-env=* ]] -then - conda_env=$(echo "$1" | sed 's/^--conda-env=//g' -) - echo "Activating Conda environment ${conda_env}" - shift 1 - cmake_args="$@" - - # Workaround for file permission error - if [[ -n $CI_BUILD_UID ]] - then - gosu root chown -R "${CI_BUILD_UID}:${CI_BUILD_GID}" /opt/miniforge/envs - fi - - source activate ${conda_env} - cmake_prefix_flag="-DCMAKE_PREFIX_PATH=$CONDA_PREFIX" -else - cmake_args="$@" - cmake_prefix_flag='' -fi - -rm -rf build -mkdir build -cd build -# Disable CMAKE_COMPILE_WARNING_AS_ERROR option temporarily until -# https://github.com/dmlc/xgboost/issues/10400 is fixed -cmake .. ${cmake_args} -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DCMAKE_VERBOSE_MAKEFILE=ON -DENABLE_ALL_WARNINGS=ON -DCMAKE_COMPILE_WARNING_AS_ERROR=OFF -GNinja ${cmake_prefix_flag} -DHIDE_CXX_SYMBOLS=ON -DBUILD_DEPRECATED_CLI=ON -ninja clean -time ninja -v -cd .. diff --git a/tests/ci_build/ci_build.sh b/tests/ci_build/ci_build.sh deleted file mode 100755 index a2f2d6063160..000000000000 --- a/tests/ci_build/ci_build.sh +++ /dev/null @@ -1,248 +0,0 @@ -#!/usr/bin/env bash -# -# Execute command within a docker container -# -# Usage: ci_build.sh [--use-gpus] -# [--dockerfile ] [-it] -# [--build-arg ] -# -# CONTAINER_TYPE: Type of the docker container used the run the build: e.g., -# (cpu | gpu) -# -# --use-gpus: Whether to grant the container access to NVIDIA GPUs. -# -# DOCKERFILE_PATH: (Optional) Path to the Dockerfile used for docker build. If -# this optional value is not supplied (via the --dockerfile -# flag), will use Dockerfile.CONTAINER_TYPE in default -# -# BUILD_ARG: (Optional) an argument to be passed to docker build -# -# COMMAND: Command to be executed in the docker container -# -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" - -# Get the command line arguments. 
-CONTAINER_TYPE=$( echo "$1" | tr '[:upper:]' '[:lower:]' ) -shift 1 - -# Dockerfile to be used in docker build -DOCKERFILE_PATH="${SCRIPT_DIR}/Dockerfile.${CONTAINER_TYPE}" -DOCKER_CONTEXT_PATH="${SCRIPT_DIR}" - -GPU_FLAG='' -if [[ "$1" == "--use-gpus" ]]; then - echo "Using NVIDIA GPUs" - GPU_FLAG='--gpus all' - shift 1 -fi - -if [[ "$1" == "--dockerfile" ]]; then - DOCKERFILE_PATH="$2" - DOCKER_CONTEXT_PATH=$(dirname "${DOCKERFILE_PATH}") - echo "Using custom Dockerfile path: ${DOCKERFILE_PATH}" - echo "Using custom docker build context path: ${DOCKER_CONTEXT_PATH}" - shift 2 -fi - -if [[ -n "${CI_DOCKER_EXTRA_PARAMS_INIT}" ]] -then - IFS=' ' read -r -a CI_DOCKER_EXTRA_PARAMS <<< "${CI_DOCKER_EXTRA_PARAMS_INIT}" -fi - -if [[ "$1" == "-it" ]]; then - CI_DOCKER_EXTRA_PARAMS+=('-it') - shift 1 -fi - -while [[ "$1" == "--build-arg" ]]; do - CI_DOCKER_BUILD_ARG+=" $1" - CI_DOCKER_BUILD_ARG+=" $2" - shift 2 -done - -if [[ ! -f "${DOCKERFILE_PATH}" ]]; then - echo "Invalid Dockerfile path: \"${DOCKERFILE_PATH}\"" - exit 1 -fi - -COMMAND=("$@") - -# Validate command line arguments. -if [ "$#" -lt 1 ] || [ ! -e "${SCRIPT_DIR}/Dockerfile.${CONTAINER_TYPE}" ]; then - supported_container_types=$( ls -1 ${SCRIPT_DIR}/Dockerfile.* | \ - sed -n 's/.*Dockerfile\.\([^\/]*\)/\1/p' | tr '\n' ' ' ) - echo "Usage: $(basename $0) CONTAINER_TYPE COMMAND" - echo " CONTAINER_TYPE can be one of [${supported_container_types}]" - echo " COMMAND is a command (with arguments) to run inside" - echo " the container." - exit 1 -fi - -# Helper function to traverse directories up until given file is found. -function upsearch () { - test / == "$PWD" && return || \ - test -e "$1" && echo "$PWD" && return || \ - cd .. && upsearch "$1" -} - -# Set up WORKSPACE. Jenkins will set them for you or we pick -# reasonable defaults if you run it outside of Jenkins. -WORKSPACE="${WORKSPACE:-${SCRIPT_DIR}/../../}" - -# Determine the docker image name -DOCKER_IMG_NAME="xgb-ci.${CONTAINER_TYPE}" - -# Append cuda version if available -CUDA_VERSION=$(echo "${CI_DOCKER_BUILD_ARG}" | grep -o -E 'CUDA_VERSION_ARG=[0-9]+\.[0-9]+' | grep -o -E '[0-9]+\.[0-9]+') -# Append jdk version if available -JDK_VERSION=$(echo "${CI_DOCKER_BUILD_ARG}" | grep -o -E 'JDK_VERSION=[0-9]+' | grep -o -E '[0-9]+') -# Append cmake version if available -CMAKE_VERSION=$(echo "${CI_DOCKER_BUILD_ARG}" | grep -o -E 'CMAKE_VERSION=[0-9]+\.[0-9]+' | grep -o -E '[0-9]+\.[0-9]+') -# Append R version if available -USE_R35=$(echo "${CI_DOCKER_BUILD_ARG}" | grep -o -E 'USE_R35=[0-9]+' | grep -o -E '[0-9]+$') -if [[ ${USE_R35} == "1" ]]; then - USE_R35="_r35" -elif [[ ${USE_R35} == "0" ]]; then - USE_R35="_no_r35" -fi -DOCKER_IMG_NAME=$DOCKER_IMG_NAME$CUDA_VERSION$JDK_VERSION$CMAKE_VERSION$USE_R35 - -# Under Jenkins matrix build, the build tag may contain characters such as -# commas (,) and equal signs (=), which are not valid inside docker image names. -DOCKER_IMG_NAME=$(echo "${DOCKER_IMG_NAME}" | sed -e 's/=/_/g' -e 's/,/-/g') - -# Convert to all lower-case, as per requirement of Docker image names -DOCKER_IMG_NAME=$(echo "${DOCKER_IMG_NAME}" | tr '[:upper:]' '[:lower:]') - -# Bash on Ubuntu on Windows -UBUNTU_ON_WINDOWS=$([ -e /proc/version ] && grep -l Microsoft /proc/version || echo "") -# MSYS, Git Bash, etc. -MSYS=$([ -e /proc/version ] && grep -l MINGW /proc/version || echo "") - -if [[ -z "$UBUNTU_ON_WINDOWS" ]] && [[ -z "$MSYS" ]] && [[ ! 
"$OSTYPE" == "darwin"* ]]; then - USER_IDS="-e CI_BUILD_UID=$( id -u ) -e CI_BUILD_GID=$( id -g ) -e CI_BUILD_USER=$( id -un ) -e CI_BUILD_GROUP=$( id -gn ) -e CI_BUILD_HOME=${WORKSPACE}" -fi - -# Print arguments. -cat <=1.4.1 -- pandas -- matplotlib -- dask -- distributed -- python-graphviz -- pytest -- jsonschema -- hypothesis -- python-graphviz -- pip -- py-ubjson -- loky -- pyarrow diff --git a/tests/ci_build/deploy_jvm_packages.sh b/tests/ci_build/deploy_jvm_packages.sh deleted file mode 100755 index 2cb108c8bc6f..000000000000 --- a/tests/ci_build/deploy_jvm_packages.sh +++ /dev/null @@ -1,41 +0,0 @@ -#!/bin/bash - -set -e -set -x - -if [ $# -ne 1 ]; then - echo "Usage: $0 [spark version]" - exit 1 -fi - -spark_version=$1 - -cd jvm-packages -rm -rf $(find . -name target) -rm -rf ../build/ - -## Deploy JVM packages to xgboost-maven-repo - -# Scala 2.12, CPU variant -mvn --no-transfer-progress deploy -Pdefault,release-to-s3 -Dspark.version=${spark_version} -DskipTests -Dmaven.test.skip=true -mvn clean -mvn clean -Pdefault,release-to-s3 - -# Scala 2.12, GPU variant -mvn --no-transfer-progress install -Pgpu -Dspark.version=${spark_version} -DskipTests -Dmaven.test.skip=true -mvn --no-transfer-progress deploy -Pgpu,release-to-s3 -pl xgboost4j-spark-gpu -Dspark.version=${spark_version} -DskipTests -Dmaven.test.skip=true - -# Scala 2.13, CPU variant -pushd .. -python dev/change_scala_version.py --scala-version 2.13 --purge-artifacts -popd -mvn --no-transfer-progress deploy -Pdefault,release-to-s3 -Dspark.version=${spark_version} -DskipTests -Dmaven.test.skip=true -mvn clean -mvn clean -Pdefault,release-to-s3 - -# Scala 2.13, GPU variant -mvn --no-transfer-progress install -Pgpu -Dspark.version=${spark_version} -DskipTests -Dmaven.test.skip=true -mvn --no-transfer-progress deploy -Pgpu,release-to-s3 -pl xgboost4j-spark-gpu -Dspark.version=${spark_version} -DskipTests -Dmaven.test.skip=true - -set +x -set +e diff --git a/tests/ci_build/jenkins_tools.Groovy b/tests/ci_build/jenkins_tools.Groovy deleted file mode 100644 index 1bc2574c6ac0..000000000000 --- a/tests/ci_build/jenkins_tools.Groovy +++ /dev/null @@ -1,38 +0,0 @@ -#!/usr/bin/groovy -// -*- mode: groovy -*- - -/* Utility functions for Jenkins */ - -// Command to run command inside a docker container -dockerRun = 'tests/ci_build/ci_build.sh' - - -/** - * Creates cmake and make builds - */ -def buildFactory(buildName, conf, restricted, build_func) { - def os = conf["os"] - def device = conf["withGpu"] ? (conf["multiGpu"] ? "mgpu" : "gpu") : "cpu" - def restricted_flag = restricted ? "restricted" : "unrestricted" - def nodeReq = "${os} && ${device} && ${restricted_flag}" - def dockerTarget = conf["withGpu"] ? "gpu" : "cpu" - [ ("${buildName}") : { build_func("${buildName}", conf, nodeReq, dockerTarget) } - ] -} - -def cmakeOptions(conf) { - return ([ - conf["withGpu"] ? '-DUSE_CUDA=ON' : '-DUSE_CUDA=OFF', - conf["withNccl"] ? '-DUSE_NCCL=ON' : '-DUSE_NCCL=OFF', - conf["withOmp"] ? '-DOPEN_MP:BOOL=ON' : ''] - ).join(" ") -} - -def getBuildName(conf) { - def gpuLabel = conf['withGpu'] ? ( (conf['multiGpu'] ? "_mgpu" : "") + "_cuda" + conf['cudaVersion'] + (conf['withNccl'] ? "_nccl" : "_nonccl")) : "_cpu" - def ompLabel = conf['withOmp'] ? 
"_omp" : "" - def pyLabel = "_py${conf['pythonVersion']}" - return "${conf['os']}${gpuLabel}${ompLabel}${pyLabel}" -} - -return this diff --git a/tests/ci_build/test_python.sh b/tests/ci_build/test_python.sh deleted file mode 100755 index a1a023046e5b..000000000000 --- a/tests/ci_build/test_python.sh +++ /dev/null @@ -1,111 +0,0 @@ -#!/bin/bash -set -e - -if [ "$#" -lt 1 ] -then - suite='' - args='' -else - suite=$1 - shift 1 - args="$@" -fi - -# Install XGBoost Python package -function install_xgboost { - wheel_found=0 - pip install --upgrade pip --user - for file in python-package/dist/*.whl - do - if [ -e "${file}" ] - then - pip install --user "${file}" - wheel_found=1 - break # need just one - fi - done - if [ "$wheel_found" -eq 0 ] - then - pushd . - cd python-package - pip install --user -v . - popd - fi -} - -function setup_pyspark_envs { - export PYSPARK_DRIVER_PYTHON=`which python` - export PYSPARK_PYTHON=`which python` - export SPARK_TESTING=1 -} - -function unset_pyspark_envs { - unset PYSPARK_DRIVER_PYTHON - unset PYSPARK_PYTHON - unset SPARK_TESTING -} - -function uninstall_xgboost { - pip uninstall -y xgboost -} - -# Run specified test suite -case "$suite" in - gpu) - source activate gpu_test - set -x - install_xgboost - setup_pyspark_envs - python -c 'from cupy.cuda import jitify; jitify._init_module()' - pytest -v -s -rxXs --fulltrace --durations=0 -m "not mgpu" ${args} tests/python-gpu - unset_pyspark_envs - uninstall_xgboost - set +x - ;; - - mgpu) - source activate gpu_test - set -x - install_xgboost - setup_pyspark_envs - python -c 'from cupy.cuda import jitify; jitify._init_module()' - pytest -v -s -rxXs --fulltrace --durations=0 -m "mgpu" ${args} tests/python-gpu - pytest -v -s -rxXs --fulltrace --durations=0 -m "mgpu" ${args} tests/test_distributed/test_gpu_with_dask - pytest -v -s -rxXs --fulltrace --durations=0 -m "mgpu" ${args} tests/test_distributed/test_gpu_with_spark - pytest -v -s -rxXs --fulltrace --durations=0 -m "mgpu" ${args} tests/test_distributed/test_gpu_federated - unset_pyspark_envs - uninstall_xgboost - set +x - ;; - - cpu) - source activate linux_cpu_test - set -x - install_xgboost - export RAY_OBJECT_STORE_ALLOW_SLOW_STORAGE=1 - setup_pyspark_envs - pytest -v -s -rxXs --fulltrace --durations=0 ${args} tests/python - pytest -v -s -rxXs --fulltrace --durations=0 ${args} tests/test_distributed/test_with_dask - pytest -v -s -rxXs --fulltrace --durations=0 ${args} tests/test_distributed/test_with_spark - pytest -v -s -rxXs --fulltrace --durations=0 ${args} tests/test_distributed/test_federated - unset_pyspark_envs - uninstall_xgboost - set +x - ;; - - cpu-arm64) - source activate aarch64_test - set -x - install_xgboost - setup_pyspark_envs - pytest -v -s -rxXs --fulltrace --durations=0 ${args} tests/python/test_basic.py tests/python/test_basic_models.py tests/python/test_model_compatibility.py - unset_pyspark_envs - uninstall_xgboost - set +x - ;; - - *) - echo "Usage: $0 {gpu|mgpu|cpu|cpu-arm64} [extra args to pass to pytest]" - exit 1 - ;; -esac diff --git a/tests/cpp/common/test_device_vector.cu b/tests/cpp/common/test_device_vector.cu index 97ee39b31a1e..6f4c34edfa9f 100644 --- a/tests/cpp/common/test_device_vector.cu +++ b/tests/cpp/common/test_device_vector.cu @@ -32,9 +32,6 @@ class TestVirtualMem : public ::testing::TestWithParam { public: void Run() { auto type = this->GetParam(); - if (type == CU_MEM_LOCATION_TYPE_HOST_NUMA) { - GTEST_SKIP_("Host numa might require special system capabilities, skipping for now."); - } 
detail::GrowOnlyVirtualMemVec vec{type}; auto prop = xgboost::cudr::MakeAllocProp(type); auto gran = xgboost::cudr::GetAllocGranularity(&prop); @@ -114,7 +111,15 @@ TEST(TestVirtualMem, Version) { xgboost::curt::DrVersion(&major, &minor); LOG(INFO) << "Latest supported CUDA version by the driver:" << major << "." << minor; PinnedMemory pinned; +#if defined(xgboost_IS_WIN) ASSERT_FALSE(pinned.IsVm()); +#else // defined(xgboost_IS_WIN) + if (major >= 12 && minor >= 5) { + ASSERT_TRUE(pinned.IsVm()); + } else { + ASSERT_FALSE(pinned.IsVm()); + } +#endif // defined(xgboost_IS_WIN) } TEST(AtomitFetch, Max) { diff --git a/tests/cpp/data/test_array_interface.cc b/tests/cpp/data/test_array_interface.cc index b692a2aa5378..f87932e77749 100644 --- a/tests/cpp/data/test_array_interface.cc +++ b/tests/cpp/data/test_array_interface.cc @@ -14,8 +14,8 @@ TEST(ArrayInterface, Initialize) { HostDeviceVector storage; auto array = RandomDataGenerator{kRows, kCols, 0}.GenerateArrayInterface(&storage); auto arr_interface = ArrayInterface<2>(StringView{array}); - ASSERT_EQ(arr_interface.Shape(0), kRows); - ASSERT_EQ(arr_interface.Shape(1), kCols); + ASSERT_EQ(arr_interface.Shape<0>(), kRows); + ASSERT_EQ(arr_interface.Shape<1>(), kCols); ASSERT_EQ(arr_interface.data, storage.ConstHostPointer()); ASSERT_EQ(arr_interface.ElementSize(), 4); ASSERT_EQ(arr_interface.type, ArrayInterfaceHandler::kF4); @@ -106,7 +106,7 @@ TEST(ArrayInterface, TrivialDim) { { ArrayInterface<1> arr_i{interface_str}; ASSERT_EQ(arr_i.n, kRows); - ASSERT_EQ(arr_i.Shape(0), kRows); + ASSERT_EQ(arr_i.Shape<0>(), kRows); } std::swap(kRows, kCols); @@ -114,7 +114,7 @@ TEST(ArrayInterface, TrivialDim) { { ArrayInterface<1> arr_i{interface_str}; ASSERT_EQ(arr_i.n, kCols); - ASSERT_EQ(arr_i.Shape(0), kCols); + ASSERT_EQ(arr_i.Shape<0>(), kCols); } } diff --git a/tests/cpp/test_learner.cu b/tests/cpp/test_learner.cu new file mode 100644 index 000000000000..2fde49ca0fdb --- /dev/null +++ b/tests/cpp/test_learner.cu @@ -0,0 +1,39 @@ +/** + * Copyright 2024, XGBoost contributors + */ +#include +#include // for DeviceSym +#include // for GlobalConfigThreadLocalStore +#include + +#include // for int32_t +#include // for unique_ptr + +#include "../../src/common/device_vector.cuh" // for GlobalMemoryLogger +#include "helpers.h" // for RandomDataGenerator + +namespace xgboost { +TEST(Learner, Reset) { + dh::GlobalMemoryLogger().Clear(); + + auto verbosity = GlobalConfigThreadLocalStore::Get()->verbosity; + ConsoleLogger::Configure({{"verbosity", "3"}}); + auto p_fmat = RandomDataGenerator{1024, 32, 0.0}.GenerateDMatrix(true); + std::unique_ptr learner{Learner::Create({p_fmat})}; + learner->SetParam("device", DeviceSym::CUDA()); + learner->Configure(); + for (std::int32_t i = 0; i < 2; ++i) { + learner->UpdateOneIter(i, p_fmat); + } + + auto cur = dh::GlobalMemoryLogger().CurrentlyAllocatedBytes(); + p_fmat.reset(); + auto after_p_fmat_reset = dh::GlobalMemoryLogger().CurrentlyAllocatedBytes(); + ASSERT_LT(after_p_fmat_reset, cur); + learner->Reset(); + auto after_learner_reset = dh::GlobalMemoryLogger().CurrentlyAllocatedBytes(); + ASSERT_LT(after_learner_reset, after_p_fmat_reset); + ASSERT_LE(after_learner_reset, 64); + ConsoleLogger::Configure({{"verbosity", std::to_string(verbosity)}}); +} +} // namespace xgboost diff --git a/tests/cpp/tree/test_tree_model.cc b/tests/cpp/tree/test_tree_model.cc index 941c425bd9b0..2491f3973f9a 100644 --- a/tests/cpp/tree/test_tree_model.cc +++ b/tests/cpp/tree/test_tree_model.cc @@ -340,6 +340,7 @@ void 
TestCategoricalTreeDump(std::string format, std::string sep) { ASSERT_NE(pos, std::string::npos); pos = str.find(cond_str, pos + 1); ASSERT_NE(pos, std::string::npos); + ASSERT_NE(str.find("gain"), std::string::npos); if (format == "json") { // Make sure it's valid JSON diff --git a/tests/python-sycl/test_sycl_simple_dask.py b/tests/python-sycl/test_sycl_simple_dask.py index 19eebebee3e5..2d302573ecd1 100644 --- a/tests/python-sycl/test_sycl_simple_dask.py +++ b/tests/python-sycl/test_sycl_simple_dask.py @@ -1,8 +1,6 @@ from xgboost import dask as dxgb from xgboost import testing as tm -from hypothesis import given, strategies, assume, settings, note - import dask.array as da import dask.distributed @@ -32,10 +30,12 @@ def test_simple(self): param["objective"] = "reg:squarederror" # X and y must be Dask dataframes or arrays - num_obs = 1e4 + num_obs = int(1e4) num_features = 20 - X = da.random.random(size=(num_obs, num_features), chunks=(1000, num_features)) - y = da.random.random(size=(num_obs, 1), chunks=(1000, 1)) + + rng = da.random.RandomState(1994) + X = rng.random_sample((num_obs, num_features), chunks=(1000, -1)) + y = X.sum(axis=1) dtrain = dxgb.DaskDMatrix(client, X, y) result = train_result(client, param, dtrain, 10) diff --git a/tests/python/test_basic_models.py b/tests/python/test_basic_models.py index 3e945546e13b..b24152e5dc9a 100644 --- a/tests/python/test_basic_models.py +++ b/tests/python/test_basic_models.py @@ -1,6 +1,7 @@ import json import os import tempfile +from typing import Optional import numpy as np import pytest @@ -17,38 +18,49 @@ class TestModels: def test_glm(self): - param = {'objective': 'binary:logistic', - 'booster': 'gblinear', 'alpha': 0.0001, 'lambda': 1, - 'nthread': 1} + param = { + "objective": "binary:logistic", + "booster": "gblinear", + "alpha": 0.0001, + "lambda": 1, + "nthread": 1, + } dtrain, dtest = tm.load_agaricus(__file__) - watchlist = [(dtest, 'eval'), (dtrain, 'train')] + watchlist = [(dtest, "eval"), (dtrain, "train")] num_round = 4 bst = xgb.train(param, dtrain, num_round, watchlist) assert isinstance(bst, xgb.core.Booster) preds = bst.predict(dtest) labels = dtest.get_label() - err = sum(1 for i in range(len(preds)) - if int(preds[i] > 0.5) != labels[i]) / float(len(preds)) + err = sum( + 1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i] + ) / float(len(preds)) assert err < 0.2 def test_dart(self): dtrain, dtest = tm.load_agaricus(__file__) - param = {'max_depth': 5, 'objective': 'binary:logistic', - 'eval_metric': 'logloss', 'booster': 'dart', 'verbosity': 1} + param = { + "max_depth": 5, + "objective": "binary:logistic", + "eval_metric": "logloss", + "booster": "dart", + "verbosity": 1, + } # specify validations set to watch performance - watchlist = [(dtest, 'eval'), (dtrain, 'train')] + watchlist = [(dtest, "eval"), (dtrain, "train")] num_round = 2 bst = xgb.train(param, dtrain, num_round, watchlist) # this is prediction preds = bst.predict(dtest, iteration_range=(0, num_round)) labels = dtest.get_label() - err = sum(1 for i in range(len(preds)) - if int(preds[i] > 0.5) != labels[i]) / float(len(preds)) + err = sum( + 1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i] + ) / float(len(preds)) # error must be smaller than 10% assert err < 0.1 with tempfile.TemporaryDirectory() as tmpdir: - dtest_path = os.path.join(tmpdir, 'dtest.dmatrix') + dtest_path = os.path.join(tmpdir, "dtest.dmatrix") model_path = os.path.join(tmpdir, "xgboost.model.dart.ubj") # save dmatrix into binary buffer 
dtest.save_binary(dtest_path) @@ -66,28 +78,30 @@ def test_dart(self): def my_logloss(preds, dtrain): labels = dtrain.get_label() - return 'logloss', np.sum( - np.log(np.where(labels, preds, 1 - preds))) + return "logloss", np.sum(np.log(np.where(labels, preds, 1 - preds))) # check whether custom evaluation metrics work - bst = xgb.train(param, dtrain, num_round, watchlist, - feval=my_logloss) + bst = xgb.train( + param, dtrain, num_round, evals=watchlist, custom_metric=my_logloss + ) preds3 = bst.predict(dtest, iteration_range=(0, num_round)) assert all(preds3 == preds) # check whether sample_type and normalize_type work num_round = 50 - param['learning_rate'] = 0.1 - param['rate_drop'] = 0.1 + param["learning_rate"] = 0.1 + param["rate_drop"] = 0.1 preds_list = [] - for p in [[p0, p1] for p0 in ['uniform', 'weighted'] - for p1 in ['tree', 'forest']]: - param['sample_type'] = p[0] - param['normalize_type'] = p[1] - bst = xgb.train(param, dtrain, num_round, watchlist) + for p in [ + [p0, p1] for p0 in ["uniform", "weighted"] for p1 in ["tree", "forest"] + ]: + param["sample_type"] = p[0] + param["normalize_type"] = p[1] + bst = xgb.train(param, dtrain, num_round, evals=watchlist) preds = bst.predict(dtest, iteration_range=(0, num_round)) - err = sum(1 for i in range(len(preds)) - if int(preds[i] > 0.5) != labels[i]) / float(len(preds)) + err = sum( + 1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i] + ) / float(len(preds)) assert err < 0.1 preds_list.append(preds) @@ -143,53 +157,67 @@ def test_boost_from_existing_model(self) -> None: ) assert booster.num_boosted_rounds() == 8 - def run_custom_objective(self, tree_method=None): + def run_custom_objective(self, tree_method: Optional[str] = None): param = { - 'max_depth': 2, - 'eta': 1, - 'objective': 'reg:logistic', - "tree_method": tree_method + "max_depth": 2, + "eta": 1, + "objective": "reg:logistic", + "tree_method": tree_method, } dtrain, dtest = tm.load_agaricus(__file__) - watchlist = [(dtest, 'eval'), (dtrain, 'train')] + watchlist = [(dtest, "eval"), (dtrain, "train")] num_round = 10 - def logregobj(preds, dtrain): - labels = dtrain.get_label() - preds = 1.0 / (1.0 + np.exp(-preds)) - grad = preds - labels - hess = preds * (1.0 - preds) - return grad, hess - - def evalerror(preds, dtrain): - labels = dtrain.get_label() - preds = 1.0 / (1.0 + np.exp(-preds)) - return 'error', float(sum(labels != (preds > 0.5))) / len(labels) + def evalerror(preds: np.ndarray, dtrain: xgb.DMatrix): + return tm.eval_error_metric(preds, dtrain, rev_link=True) # test custom_objective in training - bst = xgb.train(param, dtrain, num_round, watchlist, obj=logregobj, - feval=evalerror) - assert isinstance(bst, xgb.core.Booster) + bst = xgb.train( + param, + dtrain, + num_round, + watchlist, + obj=tm.logregobj, + custom_metric=evalerror, + ) + assert isinstance(bst, xgb.Booster) preds = bst.predict(dtest) labels = dtest.get_label() - err = sum(1 for i in range(len(preds)) - if int(preds[i] > 0.5) != labels[i]) / float(len(preds)) + err = sum( + 1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i] + ) / float(len(preds)) assert err < 0.1 # test custom_objective in cross-validation - xgb.cv(param, dtrain, num_round, nfold=5, seed=0, - obj=logregobj, feval=evalerror) + xgb.cv( + param, + dtrain, + num_round, + nfold=5, + seed=0, + obj=tm.logregobj, + custom_metric=evalerror, + ) # test maximize parameter def neg_evalerror(preds, dtrain): labels = dtrain.get_label() - return 'error', float(sum(labels == (preds > 0.0))) / len(labels) - 
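The hunk above replaces the deprecated `feval=` keyword with `custom_metric=` and routes the custom objective through `obj=` (the helpers now come from `xgboost.testing`). A minimal standalone sketch of that calling convention follows; the data is synthetic and the helpers roughly inline what the test imports as `tm.logregobj` and `tm.eval_error_metric`. Note that with a custom objective the metric receives raw margins, hence the sigmoid inside `evalerror`.

```python
import numpy as np
import xgboost as xgb


def logregobj(preds: np.ndarray, dtrain: xgb.DMatrix):
    # Gradient and hessian of the logistic loss on raw margins.
    labels = dtrain.get_label()
    preds = 1.0 / (1.0 + np.exp(-preds))
    return preds - labels, preds * (1.0 - preds)


def evalerror(preds: np.ndarray, dtrain: xgb.DMatrix):
    # Custom objective => untransformed output, so apply the link here.
    labels = dtrain.get_label()
    preds = 1.0 / (1.0 + np.exp(-preds))
    return "error", float(np.sum(labels != (preds > 0.5))) / len(labels)


rng = np.random.RandomState(0)
X = rng.random_sample((512, 8))
y = (X[:, 0] > 0.5).astype(np.float64)
dtrain = xgb.DMatrix(X, label=y)

bst = xgb.train(
    {"max_depth": 2, "eta": 1.0, "objective": "reg:logistic"},
    dtrain,
    num_boost_round=10,
    evals=[(dtrain, "train")],
    obj=logregobj,              # custom objective
    custom_metric=evalerror,    # replaces the deprecated feval=
)
```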
- bst2 = xgb.train(param, dtrain, num_round, watchlist, logregobj, - neg_evalerror, maximize=True) + preds = 1.0 / (1.0 + np.exp(-preds)) + return "error", float(sum(labels == (preds > 0.0))) / len(labels) + + bst2 = xgb.train( + param, + dtrain, + num_round, + evals=watchlist, + obj=tm.logregobj, + custom_metric=neg_evalerror, + maximize=True, + ) preds2 = bst2.predict(dtest) - err2 = sum(1 for i in range(len(preds2)) - if int(preds2[i] > 0.5) != labels[i]) / float(len(preds2)) + err2 = sum( + 1 for i in range(len(preds2)) if int(preds2[i] > 0.5) != labels[i] + ) / float(len(preds2)) assert err == err2 def test_custom_objective(self): @@ -197,36 +225,54 @@ def test_custom_objective(self): def test_multi_eval_metric(self): dtrain, dtest = tm.load_agaricus(__file__) - watchlist = [(dtest, 'eval'), (dtrain, 'train')] - param = {'max_depth': 2, 'eta': 0.2, 'verbosity': 1, - 'objective': 'binary:logistic'} - param['eval_metric'] = ["auc", "logloss", 'error'] + watchlist = [(dtest, "eval"), (dtrain, "train")] + param = { + "max_depth": 2, + "eta": 0.2, + "verbosity": 1, + "objective": "binary:logistic", + } + param["eval_metric"] = ["auc", "logloss", "error"] evals_result = {} - bst = xgb.train(param, dtrain, 4, watchlist, evals_result=evals_result) + bst = xgb.train(param, dtrain, 4, evals=watchlist, evals_result=evals_result) assert isinstance(bst, xgb.core.Booster) - assert len(evals_result['eval']) == 3 - assert set(evals_result['eval'].keys()) == {'auc', 'error', 'logloss'} + assert len(evals_result["eval"]) == 3 + assert set(evals_result["eval"].keys()) == {"auc", "error", "logloss"} def test_fpreproc(self): - param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'} + param = {"max_depth": 2, "eta": 1, "objective": "binary:logistic"} num_round = 2 def fpreproc(dtrain, dtest, param): label = dtrain.get_label() ratio = float(np.sum(label == 0)) / np.sum(label == 1) - param['scale_pos_weight'] = ratio + param["scale_pos_weight"] = ratio return (dtrain, dtest, param) dtrain, _ = tm.load_agaricus(__file__) - xgb.cv(param, dtrain, num_round, nfold=5, - metrics={'auc'}, seed=0, fpreproc=fpreproc) + xgb.cv( + param, + dtrain, + num_round, + nfold=5, + metrics={"auc"}, + seed=0, + fpreproc=fpreproc, + ) def test_show_stdv(self): - param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'} + param = {"max_depth": 2, "eta": 1, "objective": "binary:logistic"} num_round = 2 dtrain, _ = tm.load_agaricus(__file__) - xgb.cv(param, dtrain, num_round, nfold=5, - metrics={'error'}, seed=0, show_stdv=False) + xgb.cv( + param, + dtrain, + num_round, + nfold=5, + metrics={"error"}, + seed=0, + show_stdv=False, + ) def test_prediction_cache(self) -> None: X, y = tm.make_sparse_regression(512, 4, 0.5, as_dense=False) @@ -273,28 +319,34 @@ def validate_model(parameters): X = np.random.random((100, 30)) y = np.random.randint(0, 4, size=(100,)) - parameters['num_class'] = 4 + parameters["num_class"] = 4 m = xgb.DMatrix(X, y) booster = xgb.train(parameters, m) - dump = booster.get_dump(dump_format='json') + dump = booster.get_dump(dump_format="json") for i in range(len(dump)): - jsonschema.validate(instance=json.loads(dump[i]), - schema=schema) + jsonschema.validate(instance=json.loads(dump[i]), schema=schema) path = os.path.dirname( - os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - doc = os.path.join(path, 'doc', 'dump.schema') - with open(doc, 'r') as fd: + os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + ) + doc = os.path.join(path, "doc", "dump.schema") + with 
open(doc, "r") as fd: schema = json.load(fd) - parameters = {'tree_method': 'hist', 'booster': 'gbtree', - 'objective': 'multi:softmax'} + parameters = { + "tree_method": "hist", + "booster": "gbtree", + "objective": "multi:softmax", + } validate_model(parameters) - parameters = {'tree_method': 'hist', 'booster': 'dart', - 'objective': 'multi:softmax'} + parameters = { + "tree_method": "hist", + "booster": "dart", + "objective": "multi:softmax", + } validate_model(parameters) def test_special_model_dump_characters(self) -> None: @@ -363,7 +415,7 @@ def run_slice( sliced_trees = end * num_parallel_tree * num_classes assert sliced_trees == len(sliced.get_dump()) - sliced = booster[: end] + sliced = booster[:end] sliced_trees = end * num_parallel_tree * num_classes assert sliced_trees == len(sliced.get_dump()) diff --git a/tests/python/test_callback.py b/tests/python/test_callback.py index d2e7cb5c4b8e..1ee31d6610c1 100644 --- a/tests/python/test_callback.py +++ b/tests/python/test_callback.py @@ -1,8 +1,10 @@ import json import os import tempfile -from typing import Union +from collections import namedtuple +from typing import Tuple, Union +import numpy as np import pytest import xgboost as xgb @@ -12,21 +14,29 @@ pytestmark = pytest.mark.skipif(**tm.no_sklearn()) -class TestCallbacks: - @classmethod - def setup_class(cls): - from sklearn.datasets import load_breast_cancer +BreastCancer = namedtuple("BreastCancer", ["full", "tr", "va"]) + + +@pytest.fixture +def breast_cancer() -> BreastCancer: + from sklearn.datasets import load_breast_cancer + + X, y = load_breast_cancer(return_X_y=True) + + split = int(X.shape[0] * 0.8) + return BreastCancer( + full=(X, y), + tr=(X[:split, ...], y[:split, ...]), + va=(X[split:, ...], y[split:, ...]), + ) + - X, y = load_breast_cancer(return_X_y=True) - cls.X = X - cls.y = y +def eval_error_metric(predt: np.ndarray, dtrain: xgb.DMatrix) -> Tuple[str, np.float64]: + # No custom objective, recieve transformed output + return tm.eval_error_metric(predt, dtrain, rev_link=False) - split = int(X.shape[0] * 0.8) - cls.X_train = X[:split, ...] - cls.y_train = y[:split, ...] - cls.X_valid = X[split:, ...] - cls.y_valid = y[split:, ...] 
+class TestCallbacks: def run_evaluation_monitor( self, D_train: xgb.DMatrix, @@ -70,9 +80,9 @@ def check_output(output: str) -> None: output = out.getvalue().strip() check_output(output) - def test_evaluation_monitor(self): - D_train = xgb.DMatrix(self.X_train, self.y_train) - D_valid = xgb.DMatrix(self.X_valid, self.y_valid) + def test_evaluation_monitor(self, breast_cancer: BreastCancer) -> None: + D_train = xgb.DMatrix(breast_cancer.tr[0], breast_cancer.tr[1]) + D_valid = xgb.DMatrix(breast_cancer.va[0], breast_cancer.va[1]) evals_result = {} rounds = 10 xgb.train( @@ -91,9 +101,9 @@ def test_evaluation_monitor(self): self.run_evaluation_monitor(D_train, D_valid, rounds, 4) self.run_evaluation_monitor(D_train, D_valid, rounds, rounds + 1) - def test_early_stopping(self): - D_train = xgb.DMatrix(self.X_train, self.y_train) - D_valid = xgb.DMatrix(self.X_valid, self.y_valid) + def test_early_stopping(self, breast_cancer: BreastCancer) -> None: + D_train = xgb.DMatrix(breast_cancer.tr[0], breast_cancer.tr[1]) + D_valid = xgb.DMatrix(breast_cancer.va[0], breast_cancer.va[1]) evals_result = {} rounds = 30 early_stopping_rounds = 5 @@ -109,9 +119,9 @@ def test_early_stopping(self): dump = booster.get_dump(dump_format="json") assert len(dump) - booster.best_iteration == early_stopping_rounds + 1 - def test_early_stopping_custom_eval(self): - D_train = xgb.DMatrix(self.X_train, self.y_train) - D_valid = xgb.DMatrix(self.X_valid, self.y_valid) + def test_early_stopping_custom_eval(self, breast_cancer: BreastCancer) -> None: + D_train = xgb.DMatrix(breast_cancer.tr[0], breast_cancer.tr[1]) + D_valid = xgb.DMatrix(breast_cancer.va[0], breast_cancer.va[1]) early_stopping_rounds = 5 booster = xgb.train( { @@ -121,7 +131,7 @@ def test_early_stopping_custom_eval(self): }, D_train, evals=[(D_train, "Train"), (D_valid, "Valid")], - feval=tm.eval_error_metric, + custom_metric=eval_error_metric, num_boost_round=1000, early_stopping_rounds=early_stopping_rounds, verbose_eval=False, @@ -129,9 +139,9 @@ def test_early_stopping_custom_eval(self): dump = booster.get_dump(dump_format="json") assert len(dump) - booster.best_iteration == early_stopping_rounds + 1 - def test_early_stopping_customize(self): - D_train = xgb.DMatrix(self.X_train, self.y_train) - D_valid = xgb.DMatrix(self.X_valid, self.y_valid) + def test_early_stopping_customize(self, breast_cancer: BreastCancer) -> None: + D_train = xgb.DMatrix(breast_cancer.tr[0], breast_cancer.tr[1]) + D_valid = xgb.DMatrix(breast_cancer.va[0], breast_cancer.va[1]) early_stopping_rounds = 5 early_stop = xgb.callback.EarlyStopping( rounds=early_stopping_rounds, metric_name="CustomErr", data_name="Train" @@ -145,7 +155,7 @@ def test_early_stopping_customize(self): }, D_train, evals=[(D_train, "Train"), (D_valid, "Valid")], - feval=tm.eval_error_metric, + custom_metric=eval_error_metric, num_boost_round=1000, callbacks=[early_stop], verbose_eval=False, @@ -170,7 +180,8 @@ def test_early_stopping_customize(self): }, D_train, evals=[(D_train, "Train"), (D_valid, "Valid")], - feval=tm.eval_error_metric, + # No custom objective, transformed output + custom_metric=eval_error_metric, num_boost_round=rounds, callbacks=[early_stop], verbose_eval=False, @@ -179,10 +190,8 @@ def test_early_stopping_customize(self): assert booster.best_iteration == 0 assert booster.num_boosted_rounds() == 1 - def test_early_stopping_skl(self): - from sklearn.datasets import load_breast_cancer - - X, y = load_breast_cancer(return_X_y=True) + def test_early_stopping_skl(self, breast_cancer: 
BreastCancer) -> None: + X, y = breast_cancer.full early_stopping_rounds = 5 cls = xgb.XGBClassifier( early_stopping_rounds=early_stopping_rounds, eval_metric="error" @@ -192,10 +201,8 @@ def test_early_stopping_skl(self): dump = booster.get_dump(dump_format="json") assert len(dump) - booster.best_iteration == early_stopping_rounds + 1 - def test_early_stopping_custom_eval_skl(self): - from sklearn.datasets import load_breast_cancer - - X, y = load_breast_cancer(return_X_y=True) + def test_early_stopping_custom_eval_skl(self, breast_cancer: BreastCancer) -> None: + X, y = breast_cancer.full early_stopping_rounds = 5 early_stop = xgb.callback.EarlyStopping(rounds=early_stopping_rounds) cls = xgb.XGBClassifier( @@ -206,10 +213,8 @@ def test_early_stopping_custom_eval_skl(self): dump = booster.get_dump(dump_format="json") assert len(dump) - booster.best_iteration == early_stopping_rounds + 1 - def test_early_stopping_save_best_model(self): - from sklearn.datasets import load_breast_cancer - - X, y = load_breast_cancer(return_X_y=True) + def test_early_stopping_save_best_model(self, breast_cancer: BreastCancer) -> None: + X, y = breast_cancer.full n_estimators = 100 early_stopping_rounds = 5 early_stop = xgb.callback.EarlyStopping( @@ -248,10 +253,8 @@ def test_early_stopping_save_best_model(self): callbacks=[early_stop], ).fit(X, y, eval_set=[(X, y)]) - def test_early_stopping_continuation(self): - from sklearn.datasets import load_breast_cancer - - X, y = load_breast_cancer(return_X_y=True) + def test_early_stopping_continuation(self, breast_cancer: BreastCancer) -> None: + X, y = breast_cancer.full early_stopping_rounds = 5 early_stop = xgb.callback.EarlyStopping( @@ -283,7 +286,23 @@ def test_early_stopping_continuation(self): == booster.best_iteration + early_stopping_rounds + 1 ) - def run_eta_decay(self, tree_method): + def test_early_stopping_multiple_metrics(self): + from sklearn.datasets import make_classification + + X, y = make_classification(random_state=1994) + # AUC approaches 1.0 real quick. 
+ clf = xgb.XGBClassifier(eval_metric=["logloss", "auc"], early_stopping_rounds=2) + clf.fit(X, y, eval_set=[(X, y)]) + assert clf.best_iteration < 8 + assert clf.evals_result()["validation_0"]["auc"][-1] > 0.99 + + clf = xgb.XGBClassifier(eval_metric=["auc", "logloss"], early_stopping_rounds=2) + clf.fit(X, y, eval_set=[(X, y)]) + + assert clf.best_iteration > 50 + assert clf.evals_result()["validation_0"]["auc"][-1] > 0.99 + + def run_eta_decay(self, tree_method: str) -> None: """Test learning rate scheduler, used by both CPU and GPU tests.""" scheduler = xgb.callback.LearningRateScheduler @@ -457,10 +476,8 @@ def test_eta_decay(self, tree_method: str) -> None: def test_eta_decay_leaf_output(self, tree_method: str, objective: str) -> None: self.run_eta_decay_leaf_output(tree_method, objective) - def test_check_point(self) -> None: - from sklearn.datasets import load_breast_cancer - - X, y = load_breast_cancer(return_X_y=True) + def test_check_point(self, breast_cancer: BreastCancer) -> None: + X, y = breast_cancer.full m = xgb.DMatrix(X, y) with tempfile.TemporaryDirectory() as tmpdir: check_point = xgb.callback.TrainingCheckPoint( @@ -509,10 +526,8 @@ def test_callback_list(self) -> None: ) assert len(callbacks) == 1 - def test_attribute_error(self) -> None: - from sklearn.datasets import load_breast_cancer - - X, y = load_breast_cancer(return_X_y=True) + def test_attribute_error(self, breast_cancer: BreastCancer) -> None: + X, y = breast_cancer.full clf = xgb.XGBClassifier(n_estimators=8) clf.fit(X, y, eval_set=[(X, y)]) diff --git a/tests/python/test_demos.py b/tests/python/test_demos.py index 01634af2924d..d20e5bc384cc 100644 --- a/tests/python/test_demos.py +++ b/tests/python/test_demos.py @@ -174,7 +174,7 @@ def test_quantile_reg() -> None: @pytest.mark.skipif(**tm.no_ubjson()) def test_json_model() -> None: - script = os.path.join(DEMO_DIR, "json-model", "json_parser.py") + script = os.path.join(PYTHON_DEMO_DIR, "model_parser.py") def run_test(reg: xgboost.XGBRegressor) -> None: with tempfile.TemporaryDirectory() as tmpdir: diff --git a/tests/python/test_early_stopping.py b/tests/python/test_early_stopping.py index a275a8077b71..32afb5f75f51 100644 --- a/tests/python/test_early_stopping.py +++ b/tests/python/test_early_stopping.py @@ -1,3 +1,5 @@ +from typing import Tuple + import numpy as np import pytest @@ -14,9 +16,7 @@ def test_early_stopping_nonparallel(self): from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split - digits = load_digits(n_class=2) - X = digits["data"] - y = digits["target"] + X, y = load_digits(n_class=2, return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) clf1 = xgb.XGBClassifier( learning_rate=0.1, early_stopping_rounds=5, eval_metric="auc" @@ -47,50 +47,64 @@ def test_early_stopping_nonparallel(self): assert clf3.best_score == 1 - def evalerror(self, preds, dtrain): - from sklearn.metrics import mean_squared_error - - labels = dtrain.get_label() - preds = 1.0 / (1.0 + np.exp(-preds)) - return 'rmse', mean_squared_error(labels, preds) - @staticmethod def assert_metrics_length(cv, expected_length): for key, value in cv.items(): assert len(value) == expected_length @pytest.mark.skipif(**tm.no_sklearn()) - def test_cv_early_stopping(self): + def test_cv_early_stopping(self) -> None: from sklearn.datasets import load_digits - digits = load_digits(n_class=2) - X = digits['data'] - y = digits['target'] + X, y = load_digits(n_class=2, return_X_y=True) dm = xgb.DMatrix(X, label=y) params 
= { - 'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic', - 'eval_metric': 'error' + "max_depth": 2, + "eta": 1, + "objective": "binary:logistic", + "eval_metric": "error", } - cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, - early_stopping_rounds=10) + def evalerror(preds: np.ndarray, dtrain: xgb.DMatrix) -> Tuple[str, float]: + from sklearn.metrics import mean_squared_error + + labels = dtrain.get_label() + return "rmse", mean_squared_error(labels, preds) + + cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, early_stopping_rounds=10) self.assert_metrics_length(cv, 10) - cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, - early_stopping_rounds=5) + cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, early_stopping_rounds=5) self.assert_metrics_length(cv, 3) - cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, - early_stopping_rounds=1) + cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, early_stopping_rounds=1) self.assert_metrics_length(cv, 1) - cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, - feval=self.evalerror, early_stopping_rounds=10) + cv = xgb.cv( + params, + dm, + num_boost_round=10, + nfold=10, + custom_metric=evalerror, + early_stopping_rounds=10, + ) self.assert_metrics_length(cv, 10) - cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, - feval=self.evalerror, early_stopping_rounds=1) + cv = xgb.cv( + params, + dm, + num_boost_round=10, + nfold=10, + custom_metric=evalerror, + early_stopping_rounds=1, + ) self.assert_metrics_length(cv, 5) - cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, - feval=self.evalerror, maximize=True, - early_stopping_rounds=1) + cv = xgb.cv( + params, + dm, + num_boost_round=10, + nfold=10, + custom_metric=evalerror, + maximize=True, + early_stopping_rounds=1, + ) self.assert_metrics_length(cv, 1) @pytest.mark.skipif(**tm.no_sklearn()) @@ -100,21 +114,35 @@ def test_cv_early_stopping_with_multiple_eval_sets_and_metrics(self): X, y = load_breast_cancer(return_X_y=True) dm = xgb.DMatrix(X, label=y) - params = {'objective':'binary:logistic'} + params = {"objective": "binary:logistic"} - metrics = [['auc'], ['error'], ['logloss'], - ['logloss', 'auc'], ['logloss', 'error'], ['error', 'logloss']] + metrics = [ + ["auc"], + ["error"], + ["logloss"], + ["logloss", "auc"], + ["logloss", "error"], + ["error", "logloss"], + ] num_iteration_history = [] # If more than one metrics is given, early stopping should use the last metric for i, m in enumerate(metrics): - result = xgb.cv(params, dm, num_boost_round=1000, nfold=5, stratified=True, - metrics=m, early_stopping_rounds=20, seed=42) + result = xgb.cv( + params, + dm, + num_boost_round=1000, + nfold=5, + stratified=True, + metrics=m, + early_stopping_rounds=20, + seed=42, + ) num_iteration_history.append(len(result)) - df = result['test-{}-mean'.format(m[-1])] + df = result["test-{}-mean".format(m[-1])] # When early stopping is invoked, the last metric should be as best it can be. 
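The comment above is exactly the property the surrounding loop asserts: when `metrics` lists more than one metric, `xgb.cv` early-stops on the last entry. A hedged, self-contained sketch of that behaviour (synthetic data, illustrative round counts, pandas assumed to be installed):

```python
import numpy as np
import xgboost as xgb

rng = np.random.RandomState(42)
X = rng.random_sample((500, 10))
y = (X[:, 0] > 0.5).astype(np.float64)
dm = xgb.DMatrix(X, label=y)

result = xgb.cv(
    {"objective": "binary:logistic"},
    dm,
    num_boost_round=200,
    nfold=5,
    stratified=True,
    metrics=["logloss", "auc"],  # early stopping watches "auc", the last entry
    early_stopping_rounds=20,
    seed=42,
)
# With pandas installed, `result` is a DataFrame; its length is the number of
# boosting rounds actually performed before early stopping triggered.
print(len(result), list(result.columns))
```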
- if m[-1] == 'auc': + if m[-1] == "auc": assert np.all(df <= df.iloc[-1]) else: assert np.all(df >= df.iloc[-1]) diff --git a/tests/python/test_eval_metrics.py b/tests/python/test_eval_metrics.py index 2ee8c02cc2b5..b02f348013fb 100644 --- a/tests/python/test_eval_metrics.py +++ b/tests/python/test_eval_metrics.py @@ -92,7 +92,7 @@ def test_eval_metrics(self): 10, watchlist, early_stopping_rounds=2, - feval=self.evalerror_01, + custom_metric=self.evalerror_01, ) gbdt_02 = xgb.train( self.xgb_params_02, @@ -100,7 +100,7 @@ def test_eval_metrics(self): 10, watchlist, early_stopping_rounds=2, - feval=self.evalerror_02, + custom_metric=self.evalerror_02, ) gbdt_03 = xgb.train( self.xgb_params_03, @@ -108,7 +108,7 @@ def test_eval_metrics(self): 10, watchlist, early_stopping_rounds=2, - feval=self.evalerror_03, + custom_metric=self.evalerror_03, ) gbdt_04 = xgb.train( self.xgb_params_04, @@ -116,7 +116,7 @@ def test_eval_metrics(self): 10, watchlist, early_stopping_rounds=2, - feval=self.evalerror_04, + custom_metric=self.evalerror_04, ) assert gbdt_01.predict(dvalid)[0] == gbdt_02.predict(dvalid)[0] assert gbdt_01.predict(dvalid)[0] == gbdt_03.predict(dvalid)[0] diff --git a/tests/python/test_with_sklearn.py b/tests/python/test_with_sklearn.py index 6c4540301432..3f2b13038c34 100644 --- a/tests/python/test_with_sklearn.py +++ b/tests/python/test_with_sklearn.py @@ -2,6 +2,7 @@ import os import pickle import random +import re import tempfile import warnings from typing import Callable, Optional @@ -825,6 +826,32 @@ def get_tm(clf: xgb.XGBClassifier) -> str: assert clf.get_params()["tree_method"] is None +def test_get_params_works_as_expected(): + # XGBModel -> BaseEstimator + params = xgb.XGBModel(max_depth=2).get_params() + assert params["max_depth"] == 2 + # 'objective' defaults to None in the signature of XGBModel + assert params["objective"] is None + + # XGBRegressor -> XGBModel -> BaseEstimator + params = xgb.XGBRegressor(max_depth=3).get_params() + assert params["max_depth"] == 3 + # 'objective' defaults to 'reg:squarederror' in the signature of XGBRegressor + assert params["objective"] == "reg:squarederror" + # 'colsample_bynode' defaults to 'None' for XGBModel (which XGBRegressor inherits from), so it + # should be in get_params() output + assert params["colsample_bynode"] is None + + # XGBRFRegressor -> XGBRegressor -> XGBModel -> BaseEstimator + params = xgb.XGBRFRegressor(max_depth=4, objective="reg:tweedie").get_params() + assert params["max_depth"] == 4 + # 'objective' is a keyword argument for XGBRegressor, so it should be in get_params() output + # ... 
but values passed through kwargs should override the default from the signature of XGBRegressor + assert params["objective"] == "reg:tweedie" + # 'colsample_bynode' defaults to 0.8 for XGBRFRegressor...that should be preferred to the None from XGBRegressor + assert params["colsample_bynode"] == 0.8 + + def test_kwargs_error(): params = {'updater': 'grow_gpu_hist', 'subsample': .5, 'n_jobs': -1} with pytest.raises(TypeError): @@ -1159,7 +1186,7 @@ def test_feature_weights(tree_method): for i in range(kCols): fw[i] *= float(i) - parser_path = os.path.join(tm.demo_dir(__file__), "json-model", "json_parser.py") + parser_path = os.path.join(tm.demo_dir(__file__), "guide-python", "model_parser.py") poly_increasing = get_feature_weights( X=X, y=y, @@ -1517,7 +1544,7 @@ def test_tags() -> None: assert tags["multioutput"] is True assert tags["multioutput_only"] is False - for clf in [xgb.XGBClassifier()]: + for clf in [xgb.XGBClassifier(), xgb.XGBRFClassifier()]: tags = clf._more_tags() assert "multioutput" not in tags assert tags["multilabel"] is True @@ -1526,6 +1553,58 @@ def test_tags() -> None: assert "multioutput" not in tags +# the try-excepts in this test should be removed once xgboost's +# minimum supported scikit-learn version is at least 1.6 +def test_sklearn_tags(): + + def _assert_has_xgbmodel_tags(tags): + # values set by XGBModel.__sklearn_tags__() + assert tags.non_deterministic is False + assert tags.no_validation is True + assert tags.input_tags.allow_nan is True + + for reg in [xgb.XGBRegressor(), xgb.XGBRFRegressor()]: + try: + # if no AttributeError was thrown, we must be using scikit-learn>=1.6, + # and so the actual effects of __sklearn_tags__() should be tested + tags = reg.__sklearn_tags__() + _assert_has_xgbmodel_tags(tags) + # regressor-specific values + assert tags.estimator_type == "regressor" + assert tags.regressor_tags is not None + assert tags.classifier_tags is None + assert tags.target_tags.multi_output is True + assert tags.target_tags.single_output is True + except AttributeError as err: + # only the exact error we expected to be raised should be raised + assert bool(re.search(r"__sklearn_tags__.* should not be called", str(err))) + + for clf in [xgb.XGBClassifier(), xgb.XGBRFClassifier()]: + try: + # if no AttributeError was thrown, we must be using scikit-learn>=1.6, + # and so the actual effects of __sklearn_tags__() should be tested + tags = clf.__sklearn_tags__() + _assert_has_xgbmodel_tags(tags) + # classifier-specific values + assert tags.estimator_type == "classifier" + assert tags.regressor_tags is None + assert tags.classifier_tags is not None + assert tags.classifier_tags.multi_label is True + except AttributeError as err: + # only the exact error we expected to be raised should be raised + assert bool(re.search(r"__sklearn_tags__.* should not be called", str(err))) + + for rnk in [xgb.XGBRanker(),]: + try: + # if no AttributeError was thrown, we must be using scikit-learn>=1.6, + # and so the actual effects of __sklearn_tags__() should be tested + tags = rnk.__sklearn_tags__() + _assert_has_xgbmodel_tags(tags) + except AttributeError as err: + # only the exact error we expected to be raised should be raised + assert bool(re.search(r"__sklearn_tags__.* should not be called", str(err))) + + def test_doc_link() -> None: for est in [ xgb.XGBRegressor(), diff --git a/tests/test_distributed/test_gpu_with_dask/conftest.py b/tests/test_distributed/test_gpu_with_dask/conftest.py index 0332dd945651..a066461303d3 100644 --- 
a/tests/test_distributed/test_gpu_with_dask/conftest.py +++ b/tests/test_distributed/test_gpu_with_dask/conftest.py @@ -1,4 +1,4 @@ -from typing import Generator, Sequence +from typing import Any, Generator, Sequence import pytest @@ -6,12 +6,12 @@ @pytest.fixture(scope="session", autouse=True) -def setup_rmm_pool(request, pytestconfig: pytest.Config) -> None: +def setup_rmm_pool(request: Any, pytestconfig: pytest.Config) -> None: tm.setup_rmm_pool(request, pytestconfig) @pytest.fixture(scope="class") -def local_cuda_client(request, pytestconfig: pytest.Config) -> Generator: +def local_cuda_client(request: Any, pytestconfig: pytest.Config) -> Generator: kwargs = {} if hasattr(request, "param"): kwargs.update(request.param) diff --git a/tests/test_distributed/test_gpu_with_dask/test_gpu_demos.py b/tests/test_distributed/test_gpu_with_dask/test_gpu_demos.py index 553b8746f0d0..848321ae4613 100644 --- a/tests/test_distributed/test_gpu_with_dask/test_gpu_demos.py +++ b/tests/test_distributed/test_gpu_with_dask/test_gpu_demos.py @@ -14,14 +14,14 @@ @pytest.mark.skipif(**tm.no_cupy()) @pytest.mark.mgpu -def test_dask_training(): +def test_dask_training() -> None: script = os.path.join(tm.demo_dir(__file__), "dask", "gpu_training.py") cmd = ["python", script] subprocess.check_call(cmd) @pytest.mark.mgpu -def test_dask_sklearn_demo(): +def test_dask_sklearn_demo() -> None: script = os.path.join(tm.demo_dir(__file__), "dask", "sklearn_gpu_training.py") cmd = ["python", script] subprocess.check_call(cmd) @@ -29,7 +29,7 @@ def test_dask_sklearn_demo(): @pytest.mark.mgpu @pytest.mark.skipif(**tm.no_cupy()) -def test_forward_logging_demo(): +def test_forward_logging_demo() -> None: script = os.path.join(tm.demo_dir(__file__), "dask", "forward_logging.py") cmd = ["python", script] subprocess.check_call(cmd) diff --git a/tests/test_distributed/test_gpu_with_dask/test_gpu_ranking.py b/tests/test_distributed/test_gpu_with_dask/test_gpu_ranking.py new file mode 100644 index 000000000000..f8f586e39746 --- /dev/null +++ b/tests/test_distributed/test_gpu_with_dask/test_gpu_ranking.py @@ -0,0 +1,18 @@ +"""Copyright 2024, XGBoost contributors""" + +import dask +import pytest +from distributed import Client + +from xgboost.testing import dask as dtm + + +@pytest.mark.filterwarnings("error") +def test_no_group_split(local_cuda_client: Client) -> None: + with dask.config.set( + { + "array.backend": "cupy", + "dataframe.backend": "cudf", + } + ): + dtm.check_no_group_split(local_cuda_client, "cuda") diff --git a/tests/test_distributed/test_with_dask/test_ranking.py b/tests/test_distributed/test_with_dask/test_ranking.py index 0b2ea404fde1..f806d61d2592 100644 --- a/tests/test_distributed/test_with_dask/test_ranking.py +++ b/tests/test_distributed/test_with_dask/test_ranking.py @@ -11,6 +11,7 @@ from xgboost import dask as dxgb from xgboost import testing as tm +from xgboost.testing import dask as dtm @pytest.fixture(scope="module") @@ -59,7 +60,10 @@ def test_dask_ranking(client: Client) -> None: qid_test = qid_test.astype(np.uint32) rank = dxgb.DaskXGBRanker( - n_estimators=2500, eval_metric=["ndcg"], early_stopping_rounds=10 + n_estimators=2500, + eval_metric=["ndcg"], + early_stopping_rounds=10, + allow_group_split=True, ) rank.fit( x_train, @@ -71,3 +75,8 @@ def test_dask_ranking(client: Client) -> None: ) assert rank.n_features_in_ == 46 assert rank.best_score > 0.98 + + +@pytest.mark.filterwarnings("error") +def test_no_group_split(client: Client) -> None: + dtm.check_no_group_split(client, "cpu") diff --git 
a/tests/test_distributed/test_with_dask/test_with_dask.py b/tests/test_distributed/test_with_dask/test_with_dask.py index 77db640c2a78..680ed025f15b 100644 --- a/tests/test_distributed/test_with_dask/test_with_dask.py +++ b/tests/test_distributed/test_with_dask/test_with_dask.py @@ -1633,7 +1633,7 @@ def test_feature_weights(self, client: "Client") -> None: for i in range(kCols): fw[i] *= float(i) fw = da.from_array(fw) - parser = os.path.join(tm.demo_dir(__file__), "json-model", "json_parser.py") + parser = os.path.join(tm.demo_dir(__file__), "guide-python", "model_parser.py") poly_increasing = get_feature_weights( X=X, y=y, @@ -2153,6 +2153,9 @@ def test_early_stopping_custom_eval(self, client: "Client") -> None: X, y = da.from_array(X), da.from_array(y) m = dxgb.DaskDMatrix(client, X, y) + def eval_error_metric(predt: np.ndarray, dtrain: xgb.DMatrix): + return tm.eval_error_metric(predt, dtrain, rev_link=False) + valid = dxgb.DaskDMatrix(client, X, y) early_stopping_rounds = 5 booster = dxgb.train( @@ -2164,7 +2167,7 @@ def test_early_stopping_custom_eval(self, client: "Client") -> None: }, m, evals=[(m, "Train"), (valid, "Valid")], - feval=tm.eval_error_metric, + custom_metric=eval_error_metric, num_boost_round=1000, early_stopping_rounds=early_stopping_rounds, )["booster"] diff --git a/tests/test_distributed/test_with_spark/test_spark_local.py b/tests/test_distributed/test_with_spark/test_spark_local.py index 1f8374e06d11..5f0dafd9d6be 100644 --- a/tests/test_distributed/test_with_spark/test_spark_local.py +++ b/tests/test_distributed/test_with_spark/test_spark_local.py @@ -4,19 +4,11 @@ import tempfile import uuid from collections import namedtuple -from typing import Generator, Sequence +from typing import Generator, Iterable, List, Sequence import numpy as np import pytest from pyspark import SparkConf - -import xgboost as xgb -from xgboost import testing as tm -from xgboost.collective import Config -from xgboost.spark.data import pred_contribs - -pytestmark = [tm.timeout(60), pytest.mark.skipif(**tm.no_spark())] - from pyspark.ml import Pipeline, PipelineModel from pyspark.ml.evaluation import BinaryClassificationEvaluator from pyspark.ml.feature import VectorAssembler @@ -26,7 +18,10 @@ from pyspark.sql import SparkSession from pyspark.sql import functions as spark_sql_func +import xgboost as xgb from xgboost import XGBClassifier, XGBModel, XGBRegressor +from xgboost import testing as tm +from xgboost.collective import Config from xgboost.spark import ( SparkXGBClassifier, SparkXGBClassifierModel, @@ -35,11 +30,14 @@ SparkXGBRegressorModel, ) from xgboost.spark.core import _non_booster_params +from xgboost.spark.data import pred_contribs from .utils import SparkTestCase logging.getLogger("py4j").setLevel(logging.INFO) +pytestmark = [tm.timeout(60), pytest.mark.skipif(**tm.no_spark())] + def no_sparse_unwrap() -> tm.PytestSkip: try: @@ -1794,3 +1792,16 @@ def test_ranker_qid_sorted(self, ltr_data: LTRData) -> None: assert ranker.getOrDefault(ranker.objective) == "rank:ndcg" model = ranker.fit(ltr_data.df_train_1) model.transform(ltr_data.df_test).collect() + + def test_ranker_same_qid_in_same_partition(self, ltr_data: LTRData) -> None: + ranker = SparkXGBRanker(qid_col="qid", num_workers=4, force_repartition=True) + df, _ = ranker._prepare_input(ltr_data.df_train_1) + + def f(iterator: Iterable) -> List[int]: + yield list(set(iterator)) + + rows = df.select("qid").rdd.mapPartitions(f).collect() + assert len(rows) == 4 + for row in rows: + assert len(row) == 1 + assert row[0].qid 
in [6, 7, 8, 9]
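A closing aside on the new Spark test: the partition check reduces to collecting the distinct `qid` values seen by each partition via `mapPartitions`. The sketch below shows that inspection pattern outside the test harness; it assumes a local `SparkSession`, and the toy dataframe, column names, and partition count are illustrative. Plain hash repartitioning can still co-locate two query groups, which is why the test relies on `SparkXGBRanker._prepare_input` (with `force_repartition=True`) rather than a bare `repartition` call to guarantee one group per partition.

```python
from typing import Iterable, Iterator, List

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[4]").getOrCreate()

# Toy ranking frame: four query groups, eight rows each, hashed into 4 partitions.
df = spark.createDataFrame(
    [(qid, float(i)) for qid in (6, 7, 8, 9) for i in range(8)],
    schema=["qid", "feature"],
).repartition(4, "qid")


def distinct_qids(rows: Iterable) -> Iterator[List[int]]:
    # Emit one sorted list of distinct query ids per partition.
    yield sorted({r.qid for r in rows})


per_partition = df.select("qid").rdd.mapPartitions(distinct_qids).collect()
print(per_partition)

spark.stop()
```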