diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md
index ed1a2d304916..d4f028e33f93 100644
--- a/.github/ISSUE_TEMPLATE.md
+++ b/.github/ISSUE_TEMPLATE.md
@@ -1,4 +1,4 @@
-Thanks for participating in the XGBoost community! We use https://discuss.xgboost.ai for any general usage questions and discussions. The issue tracker is used for actionable items such as feature proposals discussion, roadmaps, and bug tracking. You are always welcomed to post on the forum first :)
+Thanks for participating in the XGBoost community! The issue tracker is used for actionable items such as feature proposal discussions, roadmaps, and bug tracking.
Issues that are inactive for a period of time may get closed. We adopt this policy so that we won't lose track of actionable issues that may fall at the bottom of the pile. Feel free to reopen a new one if you feel there is an additional problem that needs attention when an old one gets closed.
diff --git a/.github/dependabot.yml b/.github/dependabot.yml
deleted file mode 100644
index 1a8098071ba3..000000000000
--- a/.github/dependabot.yml
+++ /dev/null
@@ -1,35 +0,0 @@
-# To get started with Dependabot version updates, you'll need to specify which
-# package ecosystems to update and where the package manifests are located.
-# Please see the documentation for all configuration options:
-# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
-
-version: 2
-updates:
- - package-ecosystem: "maven"
- directory: "/jvm-packages"
- schedule:
- interval: "monthly"
- - package-ecosystem: "maven"
- directory: "/jvm-packages/xgboost4j"
- schedule:
- interval: "monthly"
- - package-ecosystem: "maven"
- directory: "/jvm-packages/xgboost4j-gpu"
- schedule:
- interval: "monthly"
- - package-ecosystem: "maven"
- directory: "/jvm-packages/xgboost4j-example"
- schedule:
- interval: "monthly"
- - package-ecosystem: "maven"
- directory: "/jvm-packages/xgboost4j-spark"
- schedule:
- interval: "monthly"
- - package-ecosystem: "maven"
- directory: "/jvm-packages/xgboost4j-spark-gpu"
- schedule:
- interval: "monthly"
- - package-ecosystem: "github-actions"
- directory: /
- schedule:
- interval: "monthly"
diff --git a/.github/runs-on.yml b/.github/runs-on.yml
index d951a08e8273..e21895ee8c3b 100644
--- a/.github/runs-on.yml
+++ b/.github/runs-on.yml
@@ -34,4 +34,3 @@ runners:
cpu: 32
family: ["c7i-flex", "c7i", "c7a", "c5", "c5a"]
image: windows-amd64
-
diff --git a/.github/workflows/freebsd.yml b/.github/workflows/freebsd.yml
index d3208a1294d1..26e8fa34c119 100644
--- a/.github/workflows/freebsd.yml
+++ b/.github/workflows/freebsd.yml
@@ -15,20 +15,15 @@ jobs:
timeout-minutes: 20
name: A job to run test in FreeBSD
steps:
- - uses: actions/checkout@v4
- with:
- submodules: 'true'
- - name: Test in FreeBSD
- id: test
- uses: vmactions/freebsd-vm@v1
- with:
- usesh: true
- prepare: |
- pkg install -y cmake git ninja googletest
-
- run: |
- mkdir build
- cd build
- cmake .. -GNinja -DGOOGLE_TEST=ON
- ninja -v
- ./testxgboost
+ - uses: actions/checkout@v4
+ with:
+ submodules: 'true'
+ - name: Test in FreeBSD
+ id: test
+ uses: vmactions/freebsd-vm@v1
+ with:
+ usesh: true
+ prepare: |
+ pkg install -y cmake git ninja googletest bash
+ run: |
+ bash ops/pipeline/test-freebsd.sh
diff --git a/.github/workflows/i386.yml b/.github/workflows/i386.yml
index aec7e9d31087..8b7c71a82bf8 100644
--- a/.github/workflows/i386.yml
+++ b/.github/workflows/i386.yml
@@ -19,25 +19,25 @@ jobs:
ports:
- 5000:5000
steps:
- - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
- with:
- submodules: 'true'
- - name: Set up Docker Buildx
- uses: docker/setup-buildx-action@v3.7.1
- with:
- driver-opts: network=host
- - name: Build and push container
- uses: docker/build-push-action@v6
- with:
- context: .
- file: tests/ci_build/Dockerfile.i386
- push: true
- tags: localhost:5000/xgboost/build-32bit:latest
- cache-from: type=gha
- cache-to: type=gha,mode=max
- - name: Build XGBoost
- run: |
- docker run --rm -v $PWD:/workspace -w /workspace \
- -e CXXFLAGS='-Wno-error=overloaded-virtual -Wno-error=maybe-uninitialized -Wno-error=redundant-move' \
- localhost:5000/xgboost/build-32bit:latest \
- tests/ci_build/build_via_cmake.sh
+ - uses: actions/checkout@v4
+ with:
+ submodules: 'true'
+ - name: Set up Docker Buildx
+ uses: docker/setup-buildx-action@v3
+ with:
+ driver-opts: network=host
+ - name: Build and push container
+ uses: docker/build-push-action@v6
+ with:
+ context: .
+ file: ops/docker/dockerfile/Dockerfile.i386
+ push: true
+ tags: localhost:5000/xgboost/build-32bit:latest
+ cache-from: type=gha
+ cache-to: type=gha,mode=max
+ - name: Build XGBoost
+ run: |
+ docker run --rm -v $PWD:/workspace -w /workspace \
+ -e CXXFLAGS='-Wno-error=overloaded-virtual -Wno-error=maybe-uninitialized -Wno-error=redundant-move' \
+ localhost:5000/xgboost/build-32bit:latest \
+ bash ops/script/build_via_cmake.sh
diff --git a/.github/workflows/jvm_tests.yml b/.github/workflows/jvm_tests.yml
index 945f362685a4..53e695721887 100644
--- a/.github/workflows/jvm_tests.yml
+++ b/.github/workflows/jvm_tests.yml
@@ -1,100 +1,287 @@
-name: XGBoost-JVM-Tests
+name: XGBoost CI (JVM packages)
on: [push, pull_request]
permissions:
- contents: read # to fetch code (actions/checkout)
+ contents: read # to fetch code (actions/checkout)
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true
+env:
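+  # BRANCH_NAME resolves to "PR-<number>" for pull request events and to the pushed branch name otherwise.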
+ BRANCH_NAME: >-
+ ${{ github.event.pull_request.number && 'PR-' }}${{ github.event.pull_request.number || github.ref_name }}
+ USE_DOCKER_CACHE: 1
+
jobs:
- test-with-jvm:
- name: Test JVM on OS ${{ matrix.os }}
- timeout-minutes: 30
- runs-on: ${{ matrix.os }}
+ build-containers:
+ name: Build CI containers (${{ matrix.container_id }})
+ runs-on:
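+      # The labels below select a self-hosted runner pool configured in .github/runs-on.yml.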
+ - runs-on
+ - runner=${{ matrix.runner }}
+ - run-id=${{ github.run_id }}
+ - tag=jvm-tests-build-containers-${{ matrix.container_id }}
strategy:
- fail-fast: false
matrix:
- os: [windows-latest, ubuntu-latest, macos-13]
-
+ container_id:
+ - xgb-ci.manylinux2014_x86_64
+ - xgb-ci.jvm
+ - xgb-ci.jvm_gpu_build
+ runner: [linux-amd64-cpu]
+ include:
+ - container_id: xgb-ci.manylinux2014_aarch64
+ runner: linux-arm64-cpu
steps:
- - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
- with:
- submodules: 'true'
+ # Restart Docker daemon so that it recognizes the ephemeral disks
+ - run: sudo systemctl restart docker
+ - uses: actions/checkout@v4
+ with:
+ submodules: "true"
+ - name: Build ${{ matrix.container_id }}
+ run: bash ops/docker_build.sh ${{ matrix.container_id }}
- - uses: actions/setup-java@8df1039502a15bceb9433410b1a100fbe190c53b # v4.5.0
- with:
- distribution: 'temurin'
- java-version: '8'
-
- - uses: conda-incubator/setup-miniconda@d2e6a045a86077fb6cad6f5adf368e9076ddaa8d # v3.1.0
- with:
- miniforge-variant: Miniforge3
- miniforge-version: latest
- activate-environment: jvm_tests
- environment-file: tests/ci_build/conda_env/jvm_tests.yml
- use-mamba: true
+ build-jvm-manylinux2014:
+ name: >-
+ Build libxgboost4j.so targeting glibc 2.17
+ (arch ${{ matrix.arch }}, runner ${{ matrix.runner }})
+ needs: build-containers
+ runs-on:
+ - runs-on
+ - runner=${{ matrix.runner }}
+ - run-id=${{ github.run_id }}
+ - tag=jvm-tests-build-jvm-manylinux2014-${{ matrix.arch }}
+ strategy:
+ fail-fast: false
+ matrix:
+ include:
+ - arch: aarch64
+ runner: linux-arm64-cpu
+ - arch: x86_64
+ runner: linux-amd64-cpu
+ steps:
+ # Restart Docker daemon so that it recognizes the ephemeral disks
+ - run: sudo systemctl restart docker
+ - uses: actions/checkout@v4
+ with:
+ submodules: "true"
+ - name: Fetch container from cache
+ run: bash ops/docker_build.sh xgb-ci.manylinux2014_${{ matrix.arch }}
+ - run: bash ops/pipeline/build-jvm-manylinux2014.sh ${{ matrix.arch }}
+ - name: Upload libxgboost4j.so
+ run: |
+ libname=lib/libxgboost4j_linux_${{ matrix.arch }}_${{ github.sha }}.so
+ mv -v lib/libxgboost4j.so ${libname}
+ bash ops/pipeline/publish-artifact.sh ${libname} \
+ s3://xgboost-nightly-builds/${{ env.BRANCH_NAME }}/libxgboost4j/
- - name: Cache Maven packages
- uses: actions/cache@6849a6489940f00c2f30c0fb92c6274307ccb58a # v4.1.2
- with:
- path: ~/.m2
- key: ${{ runner.os }}-m2-${{ hashFiles('./jvm-packages/pom.xml') }}
- restore-keys: ${{ runner.os }}-m2-${{ hashFiles('./jvm-packages/pom.xml') }}
+ build-jvm-gpu:
+ name: Build libxgboost4j.so with CUDA
+ needs: build-containers
+ runs-on:
+ - runs-on=${{ github.run_id }}
+ - runner=linux-amd64-cpu
+ - tag=jvm-tests-build-jvm-gpu
+ steps:
+ # Restart Docker daemon so that it recognizes the ephemeral disks
+ - run: sudo systemctl restart docker
+ - uses: actions/checkout@v4
+ with:
+ submodules: "true"
+ - name: Fetch container from cache
+ run: bash ops/docker_build.sh xgb-ci.jvm_gpu_build
+ - run: bash ops/pipeline/build-jvm-gpu.sh
+ - name: Stash files
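+        # stash-artifacts.sh presumably saves the named files so downstream jobs
+        # (build-jvm-docs, test-jvm-packages-gpu) can retrieve them via "unstash".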
+ run: |
+ bash ops/pipeline/stash-artifacts.sh stash build-jvm-gpu lib/libxgboost4j.so
- - name: Test XGBoost4J (Core)
- run: |
- cd jvm-packages
- mvn test -B -pl :xgboost4j_2.12
+ build-jvm-mac:
+ name: "Build libxgboost4j.dylib for ${{ matrix.description }}"
+ runs-on: ${{ matrix.runner }}
+ strategy:
+ fail-fast: false
+ matrix:
+ include:
+ - description: "MacOS (Apple Silicon)"
+ script: ops/pipeline/build-jvm-macos-apple-silicon.sh
+ libname: libxgboost4j_m1_${{ github.sha }}.dylib
+ runner: macos-14
+ - description: "MacOS (Intel)"
+ script: ops/pipeline/build-jvm-macos-intel.sh
+ libname: libxgboost4j_intel_${{ github.sha }}.dylib
+ runner: macos-13
+ steps:
+ - uses: actions/checkout@v4
+ with:
+ submodules: "true"
+ - run: bash ${{ matrix.script }}
+ - name: Upload libxgboost4j.dylib
+ run: |
+ mv -v lib/libxgboost4j.dylib ${{ matrix.libname }}
+ bash ops/pipeline/publish-artifact.sh ${{ matrix.libname }} \
+ s3://xgboost-nightly-builds/${{ env.BRANCH_NAME }}/libxgboost4j/
+ env:
+ AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }}
+ AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }}
- - name: Test XGBoost4J (Core, Spark, Examples)
- run: |
- rm -rfv build/
- cd jvm-packages
- mvn -B test
- if: matrix.os == 'ubuntu-latest' # Distributed training doesn't work on Windows
+ build-jvm-docs:
+ name: Build docs for JVM packages
+ needs: [build-jvm-gpu]
+ runs-on:
+ - runs-on=${{ github.run_id }}
+ - runner=linux-amd64-cpu
+ - tag=jvm-tests-build-jvm-docs
+ steps:
+ # Restart Docker daemon so that it recognizes the ephemeral disks
+ - run: sudo systemctl restart docker
+ - uses: actions/checkout@v4
+ with:
+ submodules: "true"
+ - name: Fetch container from cache
+ run: bash ops/docker_build.sh xgb-ci.jvm_gpu_build
+ - name: Unstash files
+ run: |
+ bash ops/pipeline/stash-artifacts.sh unstash build-jvm-gpu lib/libxgboost4j.so
+ - run: bash ops/pipeline/build-jvm-doc.sh
+ - name: Upload JVM doc
+ run: |
+ bash ops/pipeline/publish-artifact.sh \
+ jvm-packages/${{ env.BRANCH_NAME }}.tar.bz2 \
+ s3://xgboost-docs/
- - name: Extract branch name
- shell: bash
- run: |
- echo "branch=${GITHUB_REF#refs/heads/}" >> "$GITHUB_OUTPUT"
- id: extract_branch
- if: |
- (github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')) &&
- (matrix.os == 'windows-latest' || matrix.os == 'macos-13')
+ build-test-jvm-packages:
+ name: Build and test JVM packages (Linux, Scala ${{ matrix.scala_version }})
+ needs: build-containers
+ runs-on:
+ - runs-on=${{ github.run_id }}
+ - runner=linux-amd64-cpu
+ - tag=jvm-tests-build-test-jvm-packages-scala${{ matrix.scala_version }}
+ strategy:
+ fail-fast: false
+ matrix:
+ scala_version: ["2.12", "2.13"]
+ steps:
+ # Restart Docker daemon so that it recognizes the ephemeral disks
+ - run: sudo systemctl restart docker
+ - uses: actions/checkout@v4
+ with:
+ submodules: "true"
+ - name: Fetch container from cache
+ run: bash ops/docker_build.sh xgb-ci.jvm
+ - name: Build and test JVM packages (Scala ${{ matrix.scala_version }})
+ run: bash ops/pipeline/build-test-jvm-packages.sh
+ env:
+ SCALA_VERSION: ${{ matrix.scala_version }}
+ - name: Stash files
+ run: |
+ bash ops/pipeline/stash-artifacts.sh stash \
+ build-test-jvm-packages lib/libxgboost4j.so
+ if: matrix.scala_version == '2.13'
- - name: Publish artifact xgboost4j.dll to S3
- run: |
- cd lib/
- Rename-Item -Path xgboost4j.dll -NewName xgboost4j_${{ github.sha }}.dll
- dir
- python -m awscli s3 cp xgboost4j_${{ github.sha }}.dll s3://xgboost-nightly-builds/${{ steps.extract_branch.outputs.branch }}/libxgboost4j/ --acl public-read --region us-west-2
- if: |
- (github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')) &&
- matrix.os == 'windows-latest'
- env:
- AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }}
- AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }}
+ build-test-jvm-packages-other-os:
+ name: Build and test JVM packages (${{ matrix.os }})
+ timeout-minutes: 30
+ runs-on: ${{ matrix.os }}
+ strategy:
+ fail-fast: false
+ matrix:
+ os: [windows-latest, macos-13]
+ steps:
+ - uses: actions/checkout@v4
+ with:
+ submodules: 'true'
+ - uses: actions/setup-java@v4
+ with:
+ distribution: 'temurin'
+ java-version: '8'
+ - uses: dmlc/xgboost-devops/miniforge-setup@main
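+        # Custom composite action; presumably installs Miniforge and creates the conda
+        # environment given by environment-file below.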
+ with:
+ environment-name: minimal
+ environment-file: ops/conda_env/minimal.yml
+ - name: Cache Maven packages
+ uses: actions/cache@v4
+ with:
+ path: ~/.m2
+ key: ${{ runner.os }}-m2-${{ hashFiles('./jvm-packages/pom.xml') }}
+ restore-keys: ${{ runner.os }}-m2-${{ hashFiles('./jvm-packages/pom.xml') }}
+ - name: Test XGBoost4J (Core)
+ run: |
+ cd jvm-packages
+ mvn test -B -pl :xgboost4j_2.12
+ - name: Publish artifact xgboost4j.dll to S3
+ run: |
+ cd lib/
+ Rename-Item -Path xgboost4j.dll -NewName xgboost4j_${{ github.sha }}.dll
+ python -m awscli s3 cp xgboost4j_${{ github.sha }}.dll `
+ s3://xgboost-nightly-builds/${{ env.BRANCH_NAME }}/libxgboost4j/ `
+ --acl public-read --region us-west-2
+ if: |
+ (github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')) &&
+ matrix.os == 'windows-latest'
+ env:
+ AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }}
+ AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }}
- - name: Publish artifact libxgboost4j.dylib to S3
- shell: bash -l {0}
- run: |
- cd lib/
- mv -v libxgboost4j.dylib libxgboost4j_${{ github.sha }}.dylib
- ls
- python -m awscli s3 cp libxgboost4j_${{ github.sha }}.dylib s3://xgboost-nightly-builds/${{ steps.extract_branch.outputs.branch }}/libxgboost4j/ --acl public-read --region us-west-2
- if: |
- (github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')) &&
- matrix.os == 'macos-13'
- env:
- AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }}
- AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }}
+ test-jvm-packages-gpu:
+ name: Test JVM packages with CUDA (Scala ${{ matrix.scala_version }})
+ needs: [build-jvm-gpu]
+ runs-on:
+ - runs-on=${{ github.run_id }}
+ - runner=linux-amd64-mgpu
+ - tag=jvm-tests-test-jvm-packages-gpu-scala${{ matrix.scala_version }}
+ strategy:
+ fail-fast: false
+ matrix:
+ scala_version: ["2.12", "2.13"]
+ steps:
+ # Restart Docker daemon so that it recognizes the ephemeral disks
+ - run: sudo systemctl restart docker
+ - uses: actions/checkout@v4
+ with:
+ submodules: "true"
+ - name: Fetch container from cache
+ run: bash ops/docker_build.sh xgb-ci.jvm_gpu_build
+ - name: Unstash files
+ run: |
+ bash ops/pipeline/stash-artifacts.sh unstash build-jvm-gpu lib/libxgboost4j.so
+ - run: bash ops/pipeline/test-jvm-gpu.sh
+ env:
+ SCALA_VERSION: ${{ matrix.scala_version }}
- - name: Build and Test XGBoost4J with scala 2.13
- run: |
- rm -rfv build/
- cd jvm-packages
- mvn -B clean install test -Pdefault,scala-2.13
- if: matrix.os == 'ubuntu-latest' # Distributed training doesn't work on Windows
+ deploy-jvm-packages:
+ name: Deploy JVM packages to S3 (${{ matrix.variant.name }})
+ needs: [build-jvm-gpu, build-test-jvm-packages, test-jvm-packages-gpu]
+ runs-on:
+ - runs-on
+ - runner=linux-amd64-cpu
+ - run-id=${{ github.run_id }}
+ - tag=jvm-tests-deploy-jvm-packages-${{ matrix.variant.name }}-scala${{ matrix.scala_version }}
+ strategy:
+ fail-fast: false
+ matrix:
+ variant:
+ - name: cpu
+ container_id: xgb-ci.jvm
+ artifact_from: build-test-jvm-packages
+ - name: gpu
+ container_id: xgb-ci.jvm_gpu_build
+ artifact_from: build-jvm-gpu
+ scala_version: ['2.12', '2.13']
+ steps:
+ # Restart Docker daemon so that it recognizes the ephemeral disks
+ - run: sudo systemctl restart docker
+ - uses: actions/checkout@v4
+ with:
+ submodules: "true"
+ - name: Fetch container from cache
+ run: bash ops/docker_build.sh ${{ matrix.variant.container_id }}
+ - name: Unstash files
+ run: |
+ bash ops/pipeline/stash-artifacts.sh \
+ unstash ${{ matrix.variant.artifact_from }} \
+ lib/libxgboost4j.so
+ ls -lh lib/libxgboost4j.so
+ - name: Deploy JVM packages to S3
+ run: |
+ bash ops/pipeline/deploy-jvm-packages.sh ${{ matrix.variant.name }} \
+ ${{ matrix.variant.container_id }} ${{ matrix.scala_version }}
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
new file mode 100644
index 000000000000..2c400b073988
--- /dev/null
+++ b/.github/workflows/lint.yml
@@ -0,0 +1,119 @@
+name: XGBoost CI (Lint)
+
+on: [push, pull_request]
+
+permissions:
+ contents: read # to fetch code (actions/checkout)
+
+concurrency:
+ group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+ cancel-in-progress: true
+
+env:
+ BRANCH_NAME: >-
+ ${{ github.event.pull_request.number && 'PR-' }}${{ github.event.pull_request.number || github.ref_name }}
+
+jobs:
+ build-containers:
+ name: Build CI containers
+ env:
+ CONTAINER_ID: xgb-ci.clang_tidy
+ runs-on:
+ - runs-on=${{ github.run_id }}
+ - runner=linux-amd64-cpu
+ - tag=lint-build-containers
+ steps:
+ # Restart Docker daemon so that it recognizes the ephemeral disks
+ - run: sudo systemctl restart docker
+ - uses: actions/checkout@v4
+ with:
+ submodules: "true"
+ - name: Build ${{ env.CONTAINER_ID }}
+ run: bash ops/docker_build.sh ${{ env.CONTAINER_ID }}
+
+ clang-tidy:
+ name: Run clang-tidy
+ needs: build-containers
+ runs-on:
+ - runs-on=${{ github.run_id }}
+ - runner=linux-amd64-cpu
+ - tag=lint-clang-tidy
+ steps:
+ # Restart Docker daemon so that it recognizes the ephemeral disks
+ - run: sudo systemctl restart docker
+ - uses: actions/checkout@v4
+ with:
+ submodules: "true"
+ - name: Fetch container from cache
+ run: bash ops/docker_build.sh xgb-ci.clang_tidy
+ - run: bash ops/pipeline/run-clang-tidy.sh
+
+ python-mypy-lint:
+ runs-on: ubuntu-latest
+ name: Type and format checks for the Python package
+ steps:
+ - uses: actions/checkout@v4
+ with:
+ submodules: 'true'
+ - uses: dmlc/xgboost-devops/miniforge-setup@main
+ with:
+ environment-name: python_lint
+ environment-file: ops/conda_env/python_lint.yml
+ - name: Run mypy
+ shell: bash -el {0}
+ run: |
+ python ops/script/lint_python.py --format=0 --type-check=1 --pylint=0
+ - name: Run formatter
+ shell: bash -el {0}
+ run: |
+ python ops/script/lint_python.py --format=1 --type-check=0 --pylint=0
+ - name: Run pylint
+ shell: bash -el {0}
+ run: |
+ python ops/script/lint_python.py --format=0 --type-check=0 --pylint=1
+
+ cpp-lint:
+ runs-on: ubuntu-latest
+ name: Code linting for C++
+ steps:
+ - uses: actions/checkout@v4
+ with:
+ submodules: 'true'
+ - uses: actions/setup-python@v5
+ with:
+ python-version: "3.10"
+ architecture: 'x64'
+ - name: Install Python packages
+ run: |
+ python -m pip install wheel setuptools cmakelint cpplint==1.6.1 pylint
+ - name: Run lint
+ run: |
+ python3 ops/script/lint_cpp.py
+ bash ops/script/lint_cmake.sh
+
+ lintr:
+ runs-on: ubuntu-latest
+ name: Run R linters on Ubuntu
+ env:
+ R_REMOTES_NO_ERRORS_FROM_WARNINGS: true
+ steps:
+ - uses: actions/checkout@v4
+ with:
+ submodules: 'true'
+ - uses: r-lib/actions/setup-r@v2
+ with:
+ r-version: "release"
+ - name: Cache R packages
+ uses: actions/cache@v4
+ with:
+ path: ${{ env.R_LIBS_USER }}
+ key: ${{ runner.os }}-r-release-7-${{ hashFiles('R-package/DESCRIPTION') }}
+ restore-keys: ${{ runner.os }}-r-release-7-${{ hashFiles('R-package/DESCRIPTION') }}
+ - name: Install dependencies
+ shell: Rscript {0}
+ run: |
+ source("./R-package/tests/helper_scripts/install_deps.R")
+ - name: Run lintr
+ run: |
+ MAKEFLAGS="-j$(nproc)" R CMD INSTALL R-package/
+ Rscript ops/script/lint_r.R $(pwd)
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index d1395c15f77e..cbed730405fa 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -1,193 +1,311 @@
-# This is a basic workflow to help you get started with Actions
-name: XGBoost-CI
-
-# Controls when the action will run. Triggers the workflow on push or pull request
-# events but only for the master branch
+name: XGBoost CI
on: [push, pull_request]
permissions:
- contents: read # to fetch code (actions/checkout)
+ contents: read # to fetch code (actions/checkout)
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true
-# A workflow run is made up of one or more jobs that can run sequentially or in parallel
+env:
+ BRANCH_NAME: >-
+ ${{ github.event.pull_request.number && 'PR-' }}${{ github.event.pull_request.number || github.ref_name }}
+ USE_DOCKER_CACHE: 1
+
jobs:
- gtest-cpu:
- name: Test Google C++ test (CPU)
- runs-on: ${{ matrix.os }}
+ build-containers:
+ name: Build CI containers (${{ matrix.container_id }})
+ runs-on:
+ - runs-on
+ - runner=${{ matrix.runner }}
+ - run-id=${{ github.run_id }}
+ - tag=main-build-containers-${{ matrix.container_id }}
strategy:
- fail-fast: false
matrix:
- os: [macos-12]
+ container_id:
+ - xgb-ci.gpu_build_rockylinux8
+ - xgb-ci.gpu_build_rockylinux8_dev_ver
+ - xgb-ci.gpu_build_r_rockylinux8
+ - xgb-ci.gpu
+ - xgb-ci.cpu
+ - xgb-ci.manylinux_2_28_x86_64
+ - xgb-ci.manylinux2014_x86_64
+ runner: [linux-amd64-cpu]
+ include:
+ - container_id: xgb-ci.manylinux2014_aarch64
+ runner: linux-arm64-cpu
+ - container_id: xgb-ci.aarch64
+ runner: linux-arm64-cpu
+ steps:
+ # Restart Docker daemon so that it recognizes the ephemeral disks
+ - run: sudo systemctl restart docker
+ - uses: actions/checkout@v4
+ with:
+ submodules: "true"
+ - name: Build ${{ matrix.container_id }}
+ run: bash ops/docker_build.sh ${{ matrix.container_id }}
+
+ build-cpu:
+ name: Build CPU
+ needs: build-containers
+ runs-on:
+ - runs-on=${{ github.run_id }}
+ - runner=linux-amd64-cpu
+ - tag=main-build-cpu
+ steps:
+ # Restart Docker daemon so that it recognizes the ephemeral disks
+ - run: sudo systemctl restart docker
+ - uses: actions/checkout@v4
+ with:
+ submodules: "true"
+ - name: Fetch container from cache
+ run: bash ops/docker_build.sh xgb-ci.cpu
+ - run: bash ops/pipeline/build-cpu.sh
+ - name: Stash CLI executable
+ run: bash ops/pipeline/stash-artifacts.sh stash build-cpu ./xgboost
+
+ build-cpu-arm64:
+ name: Build CPU ARM64 + manylinux_2_28_aarch64 wheel
+ needs: build-containers
+ runs-on:
+ - runs-on=${{ github.run_id }}
+ - runner=linux-arm64-cpu
+ - tag=build-cpu-arm64
+ steps:
+ # Restart Docker daemon so that it recognizes the ephemeral disks
+ - run: sudo systemctl restart docker
+ - uses: actions/checkout@v4
+ with:
+ submodules: "true"
+ - name: Fetch container from cache
+ run: bash ops/docker_build.sh xgb-ci.aarch64
+ - run: bash ops/pipeline/build-cpu-arm64.sh
+ - name: Stash files
+ run: |
+ bash ops/pipeline/stash-artifacts.sh stash build-cpu-arm64 \
+ ./xgboost python-package/dist/*.whl
+ - name: Upload Python wheel
+ run: |
+ bash ops/pipeline/publish-artifact.sh python-package/dist/*.whl \
+ s3://xgboost-nightly-builds/${{ env.BRANCH_NAME }}/
+
+ build-cuda:
+ name: Build CUDA + manylinux_2_28_x86_64 wheel
+ needs: build-containers
+ runs-on:
+ - runs-on=${{ github.run_id }}
+ - runner=linux-amd64-cpu
+ - tag=main-build-cuda
+ steps:
+ # Restart Docker daemon so that it recognizes the ephemeral disks
+ - run: sudo systemctl restart docker
+ - uses: actions/checkout@v4
+ with:
+ submodules: "true"
+ - name: Fetch container from cache
+ run: bash ops/docker_build.sh xgb-ci.gpu_build_rockylinux8
+ - name: Fetch container from cache
+ run: bash ops/docker_build.sh xgb-ci.manylinux_2_28_x86_64
+ - run: bash ops/pipeline/build-cuda.sh
+ - name: Stash files
+ run: |
+ bash ops/pipeline/stash-artifacts.sh stash build-cuda \
+ build/testxgboost ./xgboost python-package/dist/*.whl
+ - name: Upload Python wheel
+ run: |
+ for file in python-package/dist/*.whl python-package/dist/meta.json
+ do
+ bash ops/pipeline/publish-artifact.sh "${file}" \
+ s3://xgboost-nightly-builds/${{ env.BRANCH_NAME }}/
+ done
+
+ build-cuda-with-rmm:
+ name: Build CUDA with RMM
+ needs: build-containers
+ runs-on:
+ - runs-on=${{ github.run_id }}
+ - runner=linux-amd64-cpu
+ - tag=main-build-cuda-with-rmm
+ steps:
+ # Restart Docker daemon so that it recognizes the ephemeral disks
+ - run: sudo systemctl restart docker
+ - uses: actions/checkout@v4
+ with:
+ submodules: "true"
+ - name: Fetch container from cache
+ run: bash ops/docker_build.sh xgb-ci.gpu_build_rockylinux8
+ - name: Fetch container from cache
+ run: bash ops/docker_build.sh xgb-ci.manylinux_2_28_x86_64
+ - run: |
+ bash ops/pipeline/build-cuda-with-rmm.sh xgb-ci.gpu_build_rockylinux8
+ - name: Stash files
+ run: |
+ bash ops/pipeline/stash-artifacts.sh \
+ stash build-cuda-with-rmm build/testxgboost
+ - name: Upload Python wheel
+ run: |
+ bash ops/pipeline/publish-artifact.sh python-package/dist/*.whl \
+ s3://xgboost-nightly-builds/experimental_build_with_rmm/
+
+ build-cuda-with-rmm-dev:
+ name: Build CUDA with RMM (dev)
+ needs: build-containers
+ runs-on:
+ - runs-on=${{ github.run_id }}
+ - runner=linux-amd64-cpu
+ - tag=main-build-cuda-with-rmm-dev
steps:
- - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
- with:
- submodules: 'true'
- - name: Install system packages
- run: |
- brew install ninja libomp
- - name: Build gtest binary
- run: |
- mkdir build
- cd build
- cmake .. -DGOOGLE_TEST=ON -DUSE_OPENMP=ON -DUSE_DMLC_GTEST=ON -GNinja -DBUILD_DEPRECATED_CLI=ON -DUSE_SANITIZER=ON -DENABLED_SANITIZERS=address -DCMAKE_BUILD_TYPE=RelWithDebInfo
- ninja -v
- - name: Run gtest binary
- run: |
- cd build
- ./testxgboost
- ctest -R TestXGBoostCLI --extra-verbose
+ # Restart Docker daemon so that it recognizes the ephemeral disks
+ - run: sudo systemctl restart docker
+ - uses: actions/checkout@v4
+ with:
+ submodules: "true"
+ - name: Fetch container from cache
+ run: bash ops/docker_build.sh xgb-ci.gpu_build_rockylinux8_dev_ver
+ - name: Fetch container from cache
+ run: bash ops/docker_build.sh xgb-ci.manylinux_2_28_x86_64
+ - run: |
+ bash ops/pipeline/build-cuda-with-rmm.sh xgb-ci.gpu_build_rockylinux8_dev_ver
- gtest-cpu-nonomp:
- name: Test Google C++ unittest (CPU Non-OMP)
- runs-on: ${{ matrix.os }}
+ build-manylinux2014:
+ name: Build manylinux2014_${{ matrix.arch }} wheel
+ needs: build-containers
+ runs-on:
+ - runs-on
+ - runner=${{ matrix.runner }}
+ - run-id=${{ github.run_id }}
+ - tag=main-build-manylinux2014-${{ matrix.arch }}
strategy:
fail-fast: false
matrix:
- os: [ubuntu-latest]
+ include:
+ - arch: aarch64
+ runner: linux-arm64-cpu
+ - arch: x86_64
+ runner: linux-amd64-cpu
+ steps:
+ # Restart Docker daemon so that it recognizes the ephemeral disks
+ - run: sudo systemctl restart docker
+ - uses: actions/checkout@v4
+ with:
+ submodules: "true"
+ - name: Fetch container from cache
+ run: bash ops/docker_build.sh xgb-ci.manylinux2014_${{ matrix.arch }}
+ - run: bash ops/pipeline/build-manylinux2014.sh ${{ matrix.arch }}
+ - name: Upload Python wheel
+ run: |
+ for wheel in python-package/dist/*.whl
+ do
+ bash ops/pipeline/publish-artifact.sh "${wheel}" \
+ s3://xgboost-nightly-builds/${{ env.BRANCH_NAME }}/
+ done
+
+ build-gpu-rpkg:
+ name: Build GPU-enabled R package
+ needs: build-containers
+ runs-on:
+ - runs-on=${{ github.run_id }}
+ - runner=linux-amd64-cpu
+ - tag=main-build-gpu-rpkg
steps:
- - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
- with:
- submodules: 'true'
- - name: Install system packages
- run: |
- sudo apt-get install -y --no-install-recommends ninja-build
- - name: Build and install XGBoost
- shell: bash -l {0}
- run: |
- mkdir build
- cd build
- cmake .. -GNinja -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DUSE_OPENMP=OFF -DBUILD_DEPRECATED_CLI=ON
- ninja -v
- - name: Run gtest binary
- run: |
- cd build
- ctest --extra-verbose
+ # Restart Docker daemon so that it recognizes the ephemeral disks
+ - run: sudo systemctl restart docker
+ - uses: actions/checkout@v4
+ with:
+ submodules: "true"
+ - name: Fetch container from cache
+ run: bash ops/docker_build.sh xgb-ci.gpu_build_r_rockylinux8
+ - run: bash ops/pipeline/build-gpu-rpkg.sh
+ - name: Upload R tarball
+ run: |
+ bash ops/pipeline/publish-artifact.sh xgboost_r_gpu_linux_*.tar.gz \
+ s3://xgboost-nightly-builds/${{ env.BRANCH_NAME }}/
- gtest-cpu-sycl:
- name: Test Google C++ unittest (CPU SYCL)
- runs-on: ${{ matrix.os }}
+
+ test-cpp-gpu:
+ name: >-
+ Run Google Tests with GPUs
+ (Suite ${{ matrix.suite }}, Runner ${{ matrix.runner }})
+ needs: [build-cuda, build-cuda-with-rmm]
+ runs-on:
+ - runs-on
+ - runner=${{ matrix.runner }}
+ - run-id=${{ github.run_id }}
+ - tag=main-test-cpp-gpu-${{ matrix.suite }}
strategy:
fail-fast: false
matrix:
- os: [ubuntu-latest]
- python-version: ["3.10"]
+ include:
+ - suite: gpu
+ runner: linux-amd64-gpu
+ artifact_from: build-cuda
+ - suite: gpu-rmm
+ runner: linux-amd64-gpu
+ artifact_from: build-cuda-with-rmm
+ - suite: mgpu
+ runner: linux-amd64-mgpu
+ artifact_from: build-cuda
steps:
- - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
- with:
- submodules: 'true'
- - uses: conda-incubator/setup-miniconda@d2e6a045a86077fb6cad6f5adf368e9076ddaa8d # v3.1.0
- with:
- miniforge-variant: Miniforge3
- miniforge-version: latest
- activate-environment: linux_sycl_test
- environment-file: tests/ci_build/conda_env/linux_sycl_test.yml
- use-mamba: true
- - name: Display Conda env
- run: |
- conda info
- conda list
- - name: Build and install XGBoost
- shell: bash -l {0}
- run: |
- mkdir build
- cd build
- cmake .. -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DPLUGIN_SYCL=ON -DCMAKE_CXX_COMPILER=g++ -DCMAKE_C_COMPILER=gcc -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX
- make -j$(nproc)
- - name: Run gtest binary for SYCL
- run: |
- cd build
- ./testxgboost --gtest_filter=Sycl*
- - name: Run gtest binary for non SYCL
- run: |
- cd build
- ./testxgboost --gtest_filter=-Sycl*
+ # Restart Docker daemon so that it recognizes the ephemeral disks
+ - run: sudo systemctl restart docker
+ - uses: actions/checkout@v4
+ with:
+ submodules: "true"
+ - name: Fetch container from cache
+ run: bash ops/docker_build.sh xgb-ci.gpu
+ - name: Unstash gtest
+ run: |
+ bash ops/pipeline/stash-artifacts.sh unstash ${{ matrix.artifact_from }} \
+ build/testxgboost
+ chmod +x build/testxgboost
+ - run: bash ops/pipeline/test-cpp-gpu.sh ${{ matrix.suite }}
- c-api-demo:
- name: Test installing XGBoost lib + building the C API demo
- runs-on: ${{ matrix.os }}
- defaults:
- run:
- shell: bash -l {0}
+ test-python-wheel:
+ name: Run Python tests (${{ matrix.description }})
+ needs: [build-cuda, build-cpu-arm64]
+ runs-on:
+ - runs-on
+ - runner=${{ matrix.runner }}
+ - run-id=${{ github.run_id }}
+ - tag=main-test-python-wheel-${{ matrix.description }}
strategy:
fail-fast: false
matrix:
- os: ["ubuntu-latest"]
- python-version: ["3.10"]
- steps:
- - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
- with:
- submodules: 'true'
- - uses: conda-incubator/setup-miniconda@d2e6a045a86077fb6cad6f5adf368e9076ddaa8d # v3.1.0
- with:
- miniforge-variant: Miniforge3
- miniforge-version: latest
- activate-environment: cpp_test
- environment-file: tests/ci_build/conda_env/cpp_test.yml
- use-mamba: true
- - name: Display Conda env
- run: |
- conda info
- conda list
-
- - name: Build and install XGBoost static library
- run: |
- mkdir build
- cd build
- cmake .. -DBUILD_STATIC_LIB=ON -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -GNinja
- ninja -v install
- cd -
- - name: Build and run C API demo with static
- run: |
- pushd .
- cd demo/c-api/
- mkdir build
- cd build
- cmake .. -GNinja -DCMAKE_PREFIX_PATH=$CONDA_PREFIX
- ninja -v
- ctest
- cd ..
- rm -rf ./build
- popd
-
- - name: Build and install XGBoost shared library
- run: |
- cd build
- cmake .. -DBUILD_STATIC_LIB=OFF -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -GNinja -DPLUGIN_FEDERATED=ON -DGOOGLE_TEST=ON
- ninja -v install
- ./testxgboost
- cd -
- - name: Build and run C API demo with shared
- run: |
- pushd .
- cd demo/c-api/
- mkdir build
- cd build
- cmake .. -GNinja -DCMAKE_PREFIX_PATH=$CONDA_PREFIX
- ninja -v
- ctest
- popd
- ./tests/ci_build/verify_link.sh ./demo/c-api/build/basic/api-demo
- ./tests/ci_build/verify_link.sh ./demo/c-api/build/external-memory/external-memory-demo
-
- cpp-lint:
- runs-on: ubuntu-latest
- name: Code linting for C++
+ include:
+ - description: single-gpu
+ container: xgb-ci.gpu
+ suite: gpu
+ runner: linux-amd64-gpu
+ artifact_from: build-cuda
+ - description: multiple-gpu
+ container: xgb-ci.gpu
+ suite: mgpu
+ runner: linux-amd64-mgpu
+ artifact_from: build-cuda
+ - description: cpu-amd64
+ container: xgb-ci.cpu
+ suite: cpu
+ runner: linux-amd64-cpu
+ artifact_from: build-cuda
+ - description: cpu-arm64
+ container: xgb-ci.aarch64
+ suite: cpu-arm64
+ runner: linux-arm64-cpu
+ artifact_from: build-cpu-arm64
steps:
- - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
- with:
- submodules: 'true'
- - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
- with:
- python-version: "3.10"
- architecture: 'x64'
- - name: Install Python packages
- run: |
- python -m pip install wheel setuptools cmakelint cpplint==1.6.1 pylint
- - name: Run lint
- run: |
- python3 tests/ci_build/lint_cpp.py
- sh ./tests/ci_build/lint_cmake.sh
+ # Restart Docker daemon so that it recognizes the ephemeral disks
+ - run: sudo systemctl restart docker
+ - uses: actions/checkout@v4
+ with:
+ submodules: "true"
+ - name: Fetch container from cache
+ run: bash ops/docker_build.sh ${{ matrix.container }}
+ - name: Unstash Python wheel
+ run: |
+ bash ops/pipeline/stash-artifacts.sh unstash ${{ matrix.artifact_from }} \
+ python-package/dist/*.whl ./xgboost
+ chmod +x ./xgboost
+ - name: Run Python tests, ${{ matrix.description }}
+ run: bash ops/pipeline/test-python-wheel.sh ${{ matrix.suite }} ${{ matrix.container }}
diff --git a/.github/workflows/misc.yml b/.github/workflows/misc.yml
new file mode 100644
index 000000000000..67c1bf57d3a2
--- /dev/null
+++ b/.github/workflows/misc.yml
@@ -0,0 +1,49 @@
+name: XGBoost CI (misc)
+
+on: [push, pull_request]
+
+permissions:
+ contents: read # to fetch code (actions/checkout)
+
+concurrency:
+ group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+ cancel-in-progress: true
+
+env:
+ BRANCH_NAME: >-
+ ${{ github.event.pull_request.number && 'PR-' }}${{ github.event.pull_request.number || github.ref_name }}
+
+jobs:
+ gtest-cpu-nonomp:
+ name: Test Google C++ unittest (CPU Non-OMP)
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v4
+ with:
+ submodules: 'true'
+ - name: Install system packages
+ run: |
+ sudo apt-get install -y --no-install-recommends ninja-build
+ - name: Build and install XGBoost
+ run: bash ops/script/build_via_cmake.sh -DUSE_OPENMP=OFF
+ - name: Run gtest binary
+ run: |
+ cd build
+ ctest --extra-verbose
+
+ c-api-demo:
+ name: Test installing XGBoost lib + building the C API demo
+ runs-on: ubuntu-latest
+ defaults:
+ run:
+ shell: bash -l {0}
+ steps:
+ - uses: actions/checkout@v4
+ with:
+ submodules: 'true'
+ - uses: dmlc/xgboost-devops/miniforge-setup@main
+ with:
+ environment-name: cpp_test
+ environment-file: ops/conda_env/cpp_test.yml
+ - name: Build and run C API demo with shared
+ run: bash ops/pipeline/test-c-api-demo.sh
diff --git a/.github/workflows/python_tests.yml b/.github/workflows/python_tests.yml
index 8f0ab1c68262..dc8de819e2bb 100644
--- a/.github/workflows/python_tests.yml
+++ b/.github/workflows/python_tests.yml
@@ -1,4 +1,4 @@
-name: XGBoost-Python-Tests
+name: XGBoost CI (Python tests)
on: [push, pull_request]
@@ -14,335 +14,51 @@ concurrency:
cancel-in-progress: true
jobs:
- python-mypy-lint:
- runs-on: ubuntu-latest
- name: Type and format checks for the Python package
- strategy:
- matrix:
- os: [ubuntu-latest]
- steps:
- - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
- with:
- submodules: 'true'
- - uses: conda-incubator/setup-miniconda@d2e6a045a86077fb6cad6f5adf368e9076ddaa8d # v3.1.0
- with:
- miniforge-variant: Miniforge3
- miniforge-version: latest
- activate-environment: python_lint
- environment-file: tests/ci_build/conda_env/python_lint.yml
- use-mamba: true
- - name: Display Conda env
- run: |
- conda info
- conda list
- - name: Run mypy
- run: |
- python tests/ci_build/lint_python.py --format=0 --type-check=1 --pylint=0
- - name: Run formatter
- run: |
- python tests/ci_build/lint_python.py --format=1 --type-check=0 --pylint=0
- - name: Run pylint
- run: |
- python tests/ci_build/lint_python.py --format=0 --type-check=0 --pylint=1
-
- python-sdist-test-on-Linux:
- # Mismatched glibcxx version between system and conda forge.
- runs-on: ${{ matrix.os }}
- name: Test installing XGBoost Python source package on ${{ matrix.os }}
- strategy:
- matrix:
- os: [ubuntu-latest]
- steps:
- - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
- with:
- submodules: 'true'
- - uses: conda-incubator/setup-miniconda@d2e6a045a86077fb6cad6f5adf368e9076ddaa8d # v3.1.0
- with:
- miniforge-variant: Miniforge3
- miniforge-version: latest
- activate-environment: sdist_test
- environment-file: tests/ci_build/conda_env/sdist_test.yml
- use-mamba: true
- - name: Display Conda env
- run: |
- conda info
- conda list
- - name: Build and install XGBoost
- run: |
- cd python-package
- python --version
- python -m build --sdist
- pip install -v ./dist/xgboost-*.tar.gz --config-settings use_openmp=False
- cd ..
- python -c 'import xgboost'
-
python-sdist-test:
- # Use system toolchain instead of conda toolchain for macos and windows.
- # MacOS has linker error if clang++ from conda-forge is used
runs-on: ${{ matrix.os }}
- name: Test installing XGBoost Python source package on ${{ matrix.os }}
+ name: Test installing Python XGBoost from the source distribution (${{ matrix.os }})
strategy:
+ fail-fast: false
matrix:
- os: [macos-13, windows-latest]
- python-version: ["3.10"]
+ os: [macos-13, windows-latest, ubuntu-latest]
steps:
- - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
- with:
- submodules: 'true'
- - name: Install osx system dependencies
- if: matrix.os == 'macos-13'
- run: |
- brew install ninja libomp
- - uses: conda-incubator/setup-miniconda@d2e6a045a86077fb6cad6f5adf368e9076ddaa8d # v3.1.0
- with:
- auto-update-conda: true
- python-version: ${{ matrix.python-version }}
- activate-environment: test
- - name: Install build
- run: |
- conda install -c conda-forge python-build
- - name: Display Conda env
- run: |
- conda info
- conda list
- - name: Build and install XGBoost
- run: |
- cd python-package
- python --version
- python -m build --sdist
- pip install -v ./dist/xgboost-*.tar.gz
- cd ..
- python -c 'import xgboost'
+ - uses: actions/checkout@v4
+ with:
+ submodules: 'true'
+ - uses: dmlc/xgboost-devops/miniforge-setup@main
+ with:
+ environment-name: sdist_test
+ environment-file: ops/conda_env/sdist_test.yml
+ - name: Install extra package for MacOS
+ run: |
+ mamba install -c conda-forge llvm-openmp
+ if: matrix.os == 'macos-13'
+ - name: Build and install XGBoost
+ run: bash ops/pipeline/test-python-sdist.sh
python-tests-on-macos:
- name: Test XGBoost Python package on ${{ matrix.config.os }}
- runs-on: ${{ matrix.config.os }}
- timeout-minutes: 60
- strategy:
- matrix:
- config:
- - {os: macos-13}
-
- steps:
- - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
- with:
- submodules: 'true'
-
- - uses: conda-incubator/setup-miniconda@d2e6a045a86077fb6cad6f5adf368e9076ddaa8d # v3.1.0
- with:
- miniforge-variant: Miniforge3
- miniforge-version: latest
- activate-environment: macos_cpu_test
- environment-file: tests/ci_build/conda_env/macos_cpu_test.yml
- use-mamba: true
-
- - name: Display Conda env
- run: |
- conda info
- conda list
-
- - name: Build XGBoost on macos
- run: |
- brew install ninja
-
- mkdir build
- cd build
- # Set prefix, to use OpenMP library from Conda env
- # See https://github.com/dmlc/xgboost/issues/7039#issuecomment-1025038228
- # to learn why we don't use libomp from Homebrew.
- cmake .. -GNinja -DCMAKE_PREFIX_PATH=$CONDA_PREFIX -DBUILD_DEPRECATED_CLI=ON
- ninja
-
- - name: Install Python package
- run: |
- cd python-package
- python --version
- pip install -v .
-
- - name: Test Python package
- run: |
- pytest -s -v -rxXs --durations=0 ./tests/python
-
- - name: Test Dask Interface
- run: |
- pytest -s -v -rxXs --durations=0 ./tests/test_distributed/test_with_dask
-
- python-tests-on-win:
- name: Test XGBoost Python package on ${{ matrix.config.os }}
- runs-on: ${{ matrix.config.os }}
+ name: Test XGBoost Python package on macos-13
+ runs-on: macos-13
timeout-minutes: 60
- strategy:
- matrix:
- config:
- - {os: windows-latest, python-version: '3.10'}
-
- steps:
- - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
- with:
- submodules: 'true'
-
- - uses: conda-incubator/setup-miniconda@d2e6a045a86077fb6cad6f5adf368e9076ddaa8d # v3.1.0
- with:
- auto-update-conda: true
- python-version: ${{ matrix.config.python-version }}
- activate-environment: win64_env
- environment-file: tests/ci_build/conda_env/win64_cpu_test.yml
-
- - name: Display Conda env
- run: |
- conda info
- conda list
-
- - name: Build XGBoost on Windows
- run: |
- mkdir build_msvc
- cd build_msvc
- cmake .. -G"Visual Studio 17 2022" -DCMAKE_CONFIGURATION_TYPES="Release" -A x64 -DBUILD_DEPRECATED_CLI=ON
- cmake --build . --config Release --parallel $(nproc)
-
- - name: Install Python package
- run: |
- cd python-package
- python --version
- pip wheel -v . --wheel-dir dist/
- pip install ./dist/*.whl
-
- - name: Test Python package
- run: |
- pytest -s -v -rxXs --durations=0 ./tests/python
-
- python-tests-on-ubuntu:
- name: Test XGBoost Python package on ${{ matrix.config.os }}
- runs-on: ${{ matrix.config.os }}
- timeout-minutes: 90
- strategy:
- matrix:
- config:
- - {os: ubuntu-latest, python-version: "3.10"}
-
- steps:
- - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
- with:
- submodules: 'true'
-
- - uses: conda-incubator/setup-miniconda@d2e6a045a86077fb6cad6f5adf368e9076ddaa8d # v3.1.0
- with:
- miniforge-variant: Miniforge3
- miniforge-version: latest
- activate-environment: linux_cpu_test
- environment-file: tests/ci_build/conda_env/linux_cpu_test.yml
- use-mamba: true
-
- - name: Display Conda env
- run: |
- conda info
- conda list
-
- - name: Build XGBoost on Ubuntu
- run: |
- mkdir build
- cd build
- cmake .. -GNinja -DCMAKE_PREFIX_PATH=$CONDA_PREFIX -DBUILD_DEPRECATED_CLI=ON
- ninja
-
- - name: Install Python package
- run: |
- cd python-package
- python --version
- pip install -v .
-
- - name: Test Python package
- run: |
- pytest -s -v -rxXs --durations=0 ./tests/python
-
- - name: Test Dask Interface
- run: |
- pytest -s -v -rxXs --durations=0 ./tests/test_distributed/test_with_dask
-
- - name: Test PySpark Interface
- shell: bash -l {0}
- run: |
- pytest -s -v -rxXs --durations=0 ./tests/test_distributed/test_with_spark
-
- python-sycl-tests-on-ubuntu:
- name: Test XGBoost Python package with SYCL on ${{ matrix.config.os }}
- runs-on: ${{ matrix.config.os }}
- timeout-minutes: 90
- strategy:
- matrix:
- config:
- - {os: ubuntu-latest, python-version: "3.10"}
-
steps:
- - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
- with:
- submodules: 'true'
-
- - uses: conda-incubator/setup-miniconda@d2e6a045a86077fb6cad6f5adf368e9076ddaa8d # v3.1.0
- with:
- miniforge-variant: Miniforge3
- miniforge-version: latest
- activate-environment: linux_sycl_test
- environment-file: tests/ci_build/conda_env/linux_sycl_test.yml
- use-mamba: true
-
- - name: Display Conda env
- run: |
- conda info
- conda list
- - name: Build XGBoost on Ubuntu
- run: |
- mkdir build
- cd build
- cmake .. -DPLUGIN_SYCL=ON -DCMAKE_CXX_COMPILER=g++ -DCMAKE_C_COMPILER=gcc -DCMAKE_PREFIX_PATH=$CONDA_PREFIX
- make -j$(nproc)
- - name: Install Python package
- run: |
- cd python-package
- python --version
- pip install -v .
- - name: Test Python package
- run: |
- pytest -s -v -rxXs --durations=0 ./tests/python-sycl/
-
+ - uses: actions/checkout@v4
+ with:
+ submodules: 'true'
+ - uses: dmlc/xgboost-devops/miniforge-setup@main
+ with:
+ environment-name: macos_cpu_test
+ environment-file: ops/conda_env/macos_cpu_test.yml
+ - run: bash ops/pipeline/test-python-macos.sh
python-system-installation-on-ubuntu:
- name: Test XGBoost Python package System Installation on ${{ matrix.os }}
- runs-on: ${{ matrix.os }}
- strategy:
- matrix:
- os: [ubuntu-latest]
-
+ name: Test XGBoost Python package System Installation on Ubuntu
+ runs-on: ubuntu-latest
steps:
- - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
+ - uses: actions/checkout@v4
with:
submodules: 'true'
-
- name: Set up Python 3.10
- uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
+ uses: actions/setup-python@v5
with:
python-version: "3.10"
-
- - name: Install ninja
- run: |
- sudo apt-get update && sudo apt-get install -y ninja-build
-
- - name: Build XGBoost on Ubuntu
- run: |
- mkdir build
- cd build
- cmake .. -GNinja
- ninja
-
- - name: Copy lib to system lib
- run: |
- cp lib/* "$(python -c 'import sys; print(sys.base_prefix)')/lib"
-
- - name: Install XGBoost in Virtual Environment
- run: |
- cd python-package
- pip install virtualenv
- virtualenv venv
- source venv/bin/activate && \
- pip install -v . --config-settings use_system_libxgboost=True && \
- python -c 'import xgboost'
+ - run: bash ops/pipeline/test-python-with-sysprefix.sh
diff --git a/.github/workflows/python_wheels.yml b/.github/workflows/python_wheels.yml
deleted file mode 100644
index 1bbdedc3f9c6..000000000000
--- a/.github/workflows/python_wheels.yml
+++ /dev/null
@@ -1,55 +0,0 @@
-name: XGBoost-Python-Wheels
-
-on: [push, pull_request]
-
-permissions:
- contents: read # to fetch code (actions/checkout)
-
-defaults:
- run:
- shell: bash -l {0}
-
-concurrency:
- group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
- cancel-in-progress: true
-
-jobs:
- python-wheels:
- name: Build wheel for ${{ matrix.platform_id }}
- runs-on: ${{ matrix.os }}
- strategy:
- matrix:
- include:
- - os: macos-13
- platform_id: macosx_x86_64
- - os: macos-14
- platform_id: macosx_arm64
- steps:
- - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
- with:
- submodules: 'true'
- - name: Set up homebrew
- uses: Homebrew/actions/setup-homebrew@68fa6aeb1ccb0596d311f2b34ec74ec21ee68e54
- - name: Install libomp
- run: brew install libomp
- - uses: conda-incubator/setup-miniconda@d2e6a045a86077fb6cad6f5adf368e9076ddaa8d # v3.1.0
- with:
- miniforge-variant: Miniforge3
- miniforge-version: latest
- python-version: "3.10"
- use-mamba: true
- - name: Build wheels
- run: bash tests/ci_build/build_python_wheels.sh ${{ matrix.platform_id }} ${{ github.sha }}
- - name: Extract branch name
- run: |
- echo "branch=${GITHUB_REF#refs/heads/}" >> "$GITHUB_OUTPUT"
- id: extract_branch
- if: github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')
- - name: Upload Python wheel
- if: github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')
- run: |
- python -m pip install awscli
- python -m awscli s3 cp wheelhouse/*.whl s3://xgboost-nightly-builds/${{ steps.extract_branch.outputs.branch }}/ --acl public-read --region us-west-2
- env:
- AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }}
- AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }}
diff --git a/.github/workflows/python_wheels_macos.yml b/.github/workflows/python_wheels_macos.yml
new file mode 100644
index 000000000000..ab13dfa395cd
--- /dev/null
+++ b/.github/workflows/python_wheels_macos.yml
@@ -0,0 +1,53 @@
+name: Build Python wheels targeting MacOS
+
+on: [push, pull_request]
+
+permissions:
+ contents: read # to fetch code (actions/checkout)
+
+defaults:
+ run:
+ shell: bash -l {0}
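+    # Login shell (-l) so the conda environment created by miniforge-setup is activated in run steps.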
+
+concurrency:
+ group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+ cancel-in-progress: true
+
+env:
+ BRANCH_NAME: >-
+ ${{ github.event.pull_request.number && 'PR-' }}${{ github.event.pull_request.number || github.ref_name }}
+
+jobs:
+ python-wheels-macos:
+ name: Build wheel for ${{ matrix.platform_id }}
+ runs-on: ${{ matrix.os }}
+ strategy:
+ fail-fast: false
+ matrix:
+ include:
+ - os: macos-13
+ platform_id: macosx_x86_64
+ - os: macos-14
+ platform_id: macosx_arm64
+ steps:
+ - uses: actions/checkout@v4
+ with:
+ submodules: 'true'
+ - name: Set up homebrew
+ uses: Homebrew/actions/setup-homebrew@13341b4d5e459a98bbe0b122b12c11bf90518cc8
+ - name: Install libomp
+ run: brew install libomp
+ - uses: dmlc/xgboost-devops/miniforge-setup@main
+ with:
+ environment-name: minimal
+ environment-file: ops/conda_env/minimal.yml
+ - name: Build wheels
+ run: bash ops/pipeline/build-python-wheels-macos.sh ${{ matrix.platform_id }} ${{ github.sha }}
+ - name: Upload Python wheel
+ if: github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')
+ run: |
+ python -m pip install awscli
+ python -m awscli s3 cp wheelhouse/*.whl s3://xgboost-nightly-builds/${{ env.BRANCH_NAME }}/ --acl public-read --region us-west-2
+ env:
+ AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }}
+ AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }}
diff --git a/.github/workflows/r_nold.yml b/.github/workflows/r_nold.yml
index 4b506927e06c..da01f39f650b 100644
--- a/.github/workflows/r_nold.yml
+++ b/.github/workflows/r_nold.yml
@@ -22,23 +22,20 @@ jobs:
container:
image: rhub/debian-gcc-devel-nold
steps:
- - name: Install git and system packages
- shell: bash
- run: |
- apt update && apt install libcurl4-openssl-dev libssl-dev libssh2-1-dev libgit2-dev libglpk-dev libxml2-dev libharfbuzz-dev libfribidi-dev git -y
-
- - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
- with:
- submodules: 'true'
-
- - name: Install dependencies
- shell: bash -l {0}
- run: |
- /tmp/R-devel/bin/Rscript -e "source('./R-package/tests/helper_scripts/install_deps.R')"
-
- - name: Run R tests
- shell: bash
- run: |
- cd R-package && \
- /tmp/R-devel/bin/R CMD INSTALL . && \
- /tmp/R-devel/bin/R -q -e "library(testthat); setwd('tests'); source('testthat.R')"
+ - name: Install git and system packages
+ shell: bash
+ run: |
+ apt update && apt install libcurl4-openssl-dev libssl-dev libssh2-1-dev libgit2-dev libglpk-dev libxml2-dev libharfbuzz-dev libfribidi-dev git -y
+ - uses: actions/checkout@v4
+ with:
+ submodules: 'true'
+ - name: Install dependencies
+ shell: bash -l {0}
+ run: |
+ /tmp/R-devel/bin/Rscript -e "source('./R-package/tests/helper_scripts/install_deps.R')"
+ - name: Run R tests
+ shell: bash
+ run: |
+ cd R-package && \
+ /tmp/R-devel/bin/R CMD INSTALL . && \
+ /tmp/R-devel/bin/R -q -e "library(testthat); setwd('tests'); source('testthat.R')"
diff --git a/.github/workflows/r_tests.yml b/.github/workflows/r_tests.yml
index c56d1f8ef943..fc0245f5752e 100644
--- a/.github/workflows/r_tests.yml
+++ b/.github/workflows/r_tests.yml
@@ -13,138 +13,91 @@ concurrency:
cancel-in-progress: true
jobs:
- lintr:
- runs-on: ${{ matrix.config.os }}
- name: Run R linters on OS ${{ matrix.config.os }}, R ${{ matrix.config.r }}, Compiler ${{ matrix.config.compiler }}, Build ${{ matrix.config.build }}
- strategy:
- matrix:
- config:
- - {os: ubuntu-latest, r: 'release'}
- env:
- R_REMOTES_NO_ERRORS_FROM_WARNINGS: true
- RSPM: ${{ matrix.config.rspm }}
-
- steps:
- - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
- with:
- submodules: 'true'
-
- - uses: r-lib/actions/setup-r@929c772977a3a13c8733b363bf5a2f685c25dd91 # v2.9.0
- with:
- r-version: ${{ matrix.config.r }}
-
- - name: Cache R packages
- uses: actions/cache@6849a6489940f00c2f30c0fb92c6274307ccb58a # v4.1.2
- with:
- path: ${{ env.R_LIBS_USER }}
- key: ${{ runner.os }}-r-${{ matrix.config.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }}
- restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }}
-
- - name: Install dependencies
- shell: Rscript {0}
- run: |
- source("./R-package/tests/helper_scripts/install_deps.R")
-
- - name: Run lintr
- run: |
- MAKEFLAGS="-j$(nproc)" R CMD INSTALL R-package/
- Rscript tests/ci_build/lint_r.R $(pwd)
-
test-Rpkg:
- runs-on: ${{ matrix.config.os }}
- name: Test R on OS ${{ matrix.config.os }}, R ${{ matrix.config.r }}, Compiler ${{ matrix.config.compiler }}, Build ${{ matrix.config.build }}
+ runs-on: ${{ matrix.os }}
+ name: Test R on OS ${{ matrix.os }}, R ${{ matrix.r }}, Compiler ${{ matrix.compiler }}, Build ${{ matrix.build }}
strategy:
fail-fast: false
matrix:
- config:
- - {os: windows-latest, r: 'release', compiler: 'mingw', build: 'autotools'}
- - {os: ubuntu-latest, r: 'release', compiler: 'none', build: 'cmake'}
+ include:
+ - os: windows-latest
+ r: release
+ compiler: mingw
+ build: autotools
+ - os: ubuntu-latest
+ r: release
+ compiler: none
+ build: cmake
env:
R_REMOTES_NO_ERRORS_FROM_WARNINGS: true
- RSPM: ${{ matrix.config.rspm }}
-
steps:
- - name: Install system dependencies
- run: |
- sudo apt update
- sudo apt install libcurl4-openssl-dev libssl-dev libssh2-1-dev libgit2-dev libglpk-dev libxml2-dev libharfbuzz-dev libfribidi-dev
- if: matrix.config.os == 'ubuntu-latest'
- - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
- with:
- submodules: 'true'
-
- - uses: r-lib/actions/setup-r@929c772977a3a13c8733b363bf5a2f685c25dd91 # v2.9.0
- with:
- r-version: ${{ matrix.config.r }}
-
- - name: Cache R packages
- uses: actions/cache@6849a6489940f00c2f30c0fb92c6274307ccb58a # v4.1.2
- with:
- path: ${{ env.R_LIBS_USER }}
- key: ${{ runner.os }}-r-${{ matrix.config.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }}
- restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }}
-
- - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
- with:
- python-version: "3.10"
- architecture: 'x64'
-
- - uses: r-lib/actions/setup-tinytex@v2
-
- - name: Install dependencies
- shell: Rscript {0}
- run: |
- source("./R-package/tests/helper_scripts/install_deps.R")
-
- - name: Test R
- run: |
- python tests/ci_build/test_r_package.py --compiler='${{ matrix.config.compiler }}' --build-tool="${{ matrix.config.build }}" --task=check
- if: matrix.config.compiler != 'none'
-
- - name: Test R
- run: |
- python tests/ci_build/test_r_package.py --build-tool="${{ matrix.config.build }}" --task=check
- if: matrix.config.compiler == 'none'
+ - name: Install system dependencies
+ run: |
+ sudo apt update
+ sudo apt install libcurl4-openssl-dev libssl-dev libssh2-1-dev libgit2-dev libglpk-dev libxml2-dev libharfbuzz-dev libfribidi-dev
+ if: matrix.os == 'ubuntu-latest'
+ - uses: actions/checkout@v4
+ with:
+ submodules: 'true'
+ - uses: r-lib/actions/setup-r@v2
+ with:
+ r-version: ${{ matrix.r }}
+ - name: Cache R packages
+ uses: actions/cache@v4
+ with:
+ path: ${{ env.R_LIBS_USER }}
+ key: ${{ runner.os }}-r-${{ matrix.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }}
+ restore-keys: ${{ runner.os }}-r-${{ matrix.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }}
+ - uses: actions/setup-python@v5
+ with:
+ python-version: "3.10"
+ architecture: 'x64'
+ - uses: r-lib/actions/setup-tinytex@v2
+ - name: Install dependencies
+ shell: Rscript {0}
+ run: |
+ source("./R-package/tests/helper_scripts/install_deps.R")
+ - name: Test R
+ run: |
+ python ops/script/test_r_package.py --compiler='${{ matrix.compiler }}' --build-tool="${{ matrix.build }}" --task=check
+ if: matrix.compiler != 'none'
+ - name: Test R
+ run: |
+ python ops/script/test_r_package.py --build-tool="${{ matrix.build }}" --task=check
+ if: matrix.compiler == 'none'
test-R-on-Debian:
name: Test R package on Debian
runs-on: ubuntu-latest
container:
image: rhub/debian-gcc-release
-
steps:
- - name: Install system dependencies
- run: |
- # Must run before checkout to have the latest git installed.
- # No need to add pandoc, the container has it figured out.
- apt update && apt install libcurl4-openssl-dev libssl-dev libssh2-1-dev libgit2-dev libglpk-dev libxml2-dev libharfbuzz-dev libfribidi-dev git -y
-
- - name: Trust git cloning project sources
- run: |
- git config --global --add safe.directory "${GITHUB_WORKSPACE}"
-
- - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
- with:
- submodules: 'true'
-
- - name: Install dependencies
- shell: bash -l {0}
- run: |
- Rscript -e "source('./R-package/tests/helper_scripts/install_deps.R')"
-
- - name: Test R
- shell: bash -l {0}
- run: |
- python3 tests/ci_build/test_r_package.py --r=/usr/bin/R --build-tool=autotools --task=check
-
- - uses: dorny/paths-filter@v3
- id: changes
- with:
- filters: |
- r_package:
- - 'R-package/**'
-
- - name: Run document check
- if: steps.changes.outputs.r_package == 'true'
- run: |
- python3 tests/ci_build/test_r_package.py --r=/usr/bin/R --task=doc
+ - name: Install system dependencies
+ run: |
+ # Must run before checkout to have the latest git installed.
+ # No need to add pandoc, the container has it figured out.
+ apt update && apt install libcurl4-openssl-dev libssl-dev libssh2-1-dev libgit2-dev libglpk-dev libxml2-dev libharfbuzz-dev libfribidi-dev git -y
+ - name: Trust git cloning project sources
+ run: |
+ git config --global --add safe.directory "${GITHUB_WORKSPACE}"
+ - uses: actions/checkout@v4
+ with:
+ submodules: 'true'
+ - name: Install dependencies
+ shell: bash -l {0}
+ run: |
+ Rscript -e "source('./R-package/tests/helper_scripts/install_deps.R')"
+ - name: Test R
+ shell: bash -l {0}
+ run: |
+ python3 ops/script/test_r_package.py --r=/usr/bin/R --build-tool=autotools --task=check
+ - uses: dorny/paths-filter@v3
+ id: changes
+ with:
+ filters: |
+ r_package:
+ - 'R-package/**'
+ - name: Run document check
+ if: steps.changes.outputs.r_package == 'true'
+ run: |
+ python3 ops/script/test_r_package.py --r=/usr/bin/R --task=doc
diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml
index 85a9abb57e1b..f3837391b4fe 100644
--- a/.github/workflows/scorecards.yml
+++ b/.github/workflows/scorecards.yml
@@ -22,7 +22,7 @@ jobs:
steps:
- name: "Checkout code"
- uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
+ uses: actions/checkout@v4
with:
persist-credentials: false
diff --git a/.github/workflows/sycl_tests.yml b/.github/workflows/sycl_tests.yml
new file mode 100644
index 000000000000..22456b1b68e5
--- /dev/null
+++ b/.github/workflows/sycl_tests.yml
@@ -0,0 +1,48 @@
+name: XGBoost CI (oneAPI)
+
+on: [push, pull_request]
+
+permissions:
+ contents: read # to fetch code (actions/checkout)
+
+defaults:
+ run:
+ shell: bash -l {0}
+
+concurrency:
+ group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+ cancel-in-progress: true
+
+env:
+ BRANCH_NAME: >-
+ ${{ github.event.pull_request.number && 'PR-' }}${{ github.event.pull_request.number || github.ref_name }}
+
+jobs:
+ gtest-cpu-sycl:
+ name: Test Google C++ unittest (CPU SYCL)
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v4
+ with:
+ submodules: 'true'
+ - uses: dmlc/xgboost-devops/miniforge-setup@main
+ with:
+ environment-name: linux_sycl_test
+ environment-file: ops/conda_env/linux_sycl_test.yml
+ - name: Run gtest
+ run: bash ops/pipeline/build-test-sycl.sh gtest
+
+ python-sycl-tests-on-ubuntu:
+ name: Test XGBoost Python package with SYCL
+ runs-on: ubuntu-latest
+ timeout-minutes: 90
+ steps:
+ - uses: actions/checkout@v4
+ with:
+ submodules: 'true'
+ - uses: dmlc/xgboost-devops/miniforge-setup@main
+ with:
+ environment-name: linux_sycl_test
+ environment-file: ops/conda_env/linux_sycl_test.yml
+ - name: Test Python package
+ run: bash ops/pipeline/build-test-sycl.sh pytest
diff --git a/.github/workflows/update_rapids.yml b/.github/workflows/update_rapids.yml
index 5e229db4c050..4a3e4747c3ff 100644
--- a/.github/workflows/update_rapids.yml
+++ b/.github/workflows/update_rapids.yml
@@ -25,20 +25,20 @@ jobs:
name: Check latest RAPIDS
runs-on: ubuntu-latest
steps:
- - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
- with:
- submodules: 'true'
- - name: Check latest RAPIDS and update conftest.sh
- run: |
- bash tests/buildkite/update-rapids.sh
- - name: Create Pull Request
- uses: peter-evans/create-pull-request@v7
- if: github.ref == 'refs/heads/master'
- with:
- add-paths: |
- tests/buildkite
- branch: create-pull-request/update-rapids
- base: master
- title: "[CI] Update RAPIDS to latest stable"
- commit-message: "[CI] Update RAPIDS to latest stable"
+ - uses: actions/checkout@v4
+ with:
+ submodules: 'true'
+ - name: Check latest RAPIDS and update conftest.sh
+ run: |
+ bash ops/script/update_rapids.sh
+ - name: Create Pull Request
+ uses: peter-evans/create-pull-request@v7
+ if: github.ref == 'refs/heads/master'
+ with:
+ add-paths: |
+ ops/docker
+ branch: create-pull-request/update-rapids
+ base: master
+ title: "[CI] Update RAPIDS to latest stable"
+ commit-message: "[CI] Update RAPIDS to latest stable"
diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml
new file mode 100644
index 000000000000..f97daf761abf
--- /dev/null
+++ b/.github/workflows/windows.yml
@@ -0,0 +1,53 @@
+name: XGBoost CI (Windows)
+
+on: [push, pull_request]
+
+permissions:
+ contents: read # to fetch code (actions/checkout)
+
+concurrency:
+ group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+ cancel-in-progress: true
+
+defaults:
+ run:
+ shell: powershell
+
+env:
+ BRANCH_NAME: >-
+ ${{ github.event.pull_request.number && 'PR-' }}${{ github.event.pull_request.number || github.ref_name }}
+
+jobs:
+ build-win64-gpu:
+ name: Build XGBoost for Windows with CUDA
+ runs-on:
+ - runs-on=${{ github.run_id }}
+ - runner=windows-cpu
+ - tag=windows-build-win64-gpu
+ steps:
+ - uses: actions/checkout@v4
+ with:
+ submodules: "true"
+ - run: powershell ops/pipeline/build-win64-gpu.ps1
+ - name: Stash files
+ run: |
+ powershell ops/pipeline/stash-artifacts.ps1 stash build-win64-gpu `
+ build/testxgboost.exe xgboost.exe `
+ (Get-ChildItem python-package/dist/*.whl | Select-Object -Expand FullName)
+
+ test-win64-gpu:
+ name: Test XGBoost on Windows
+ needs: build-win64-gpu
+ runs-on:
+ - runs-on=${{ github.run_id }}
+ - runner=windows-gpu
+ - tag=windows-test-win64-gpu
+ steps:
+ - uses: actions/checkout@v4
+ with:
+ submodules: "true"
+ - name: Unstash files
+ run: |
+ powershell ops/pipeline/stash-artifacts.ps1 unstash build-win64-gpu `
+ build/testxgboost.exe xgboost.exe python-package/dist/*.whl
+ - run: powershell ops/pipeline/test-win64-gpu.ps1
diff --git a/.gitignore b/.gitignore
index 082e85e2c67f..c29dcc43d9d3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -52,6 +52,7 @@ Debug
*.bak
#.Rbuildignore
R-package.Rproj
+R-package/build/*
*.cache*
.mypy_cache/
doxygen
@@ -144,11 +145,13 @@ credentials.csv
.bloop
# python tests
+*.bin
demo/**/*.txt
*.dmatrix
.hypothesis
__MACOSX/
model*.json
+/tests/python/models/models/
# R tests
*.htm
diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE
index 035f4ae45f47..8bd8caabc20f 100644
--- a/R-package/NAMESPACE
+++ b/R-package/NAMESPACE
@@ -10,6 +10,7 @@ S3method(getinfo,xgb.Booster)
S3method(getinfo,xgb.DMatrix)
S3method(length,xgb.Booster)
S3method(predict,xgb.Booster)
+S3method(predict,xgboost)
S3method(print,xgb.Booster)
S3method(print,xgb.DMatrix)
S3method(print,xgb.cv.synchronous)
diff --git a/R-package/R/utils.R b/R-package/R/utils.R
index 78249a53f18d..008a88dcd715 100644
--- a/R-package/R/utils.R
+++ b/R-package/R/utils.R
@@ -423,7 +423,7 @@ NULL
#'
#' @description
#' When it comes to serializing XGBoost models, it's possible to use R serializers such as
-#' [save()] or [saveRDS()] to serialize an XGBoost R model, but XGBoost also provides
+#' [save()] or [saveRDS()] to serialize an XGBoost model object, but XGBoost also provides
#' its own serializers with better compatibility guarantees, which allow loading
#' said models in other language bindings of XGBoost.
#'
@@ -451,14 +451,15 @@ NULL
#' not used for prediction / importance / plotting / etc.
#' These R attributes are only preserved when using R's serializers.
#'
-#' In addition to the regular `xgb.Booster` objects producted by [xgb.train()], the
-#' function [xgboost()] produces a different subclass `xgboost`, which keeps other
-#' additional metadata as R attributes such as class names in classification problems,
-#' and which has a dedicated `predict` method that uses different defaults. XGBoost's
+#' In addition to the regular `xgb.Booster` objects produced by [xgb.train()], the
+#' function [xgboost()] produces objects with a different subclass `xgboost` (which
+#' inherits from `xgb.Booster`), which keeps other additional metadata as R attributes
+#' such as class names in classification problems, and which has a dedicated `predict`
+#' method that uses different defaults and takes different argument names. XGBoost's
#' own serializers can work with this `xgboost` class, but as they do not keep R
#' attributes, the resulting object, when deserialized, is downcasted to the regular
#' `xgb.Booster` class (i.e. it loses the metadata, and the resulting object will use
-#' `predict.xgb.Booster` instead of `predict.xgboost`) - for these `xgboost` objects,
+#' [predict.xgb.Booster()] instead of [predict.xgboost()]) - for these `xgboost` objects,
#' `saveRDS` might thus be a better option if the extra functionalities are needed.
#'
#' Note that XGBoost models in R starting from version `2.1.0` and onwards, and
@@ -466,8 +467,8 @@ NULL
#' are incompatible with each other. Hence, models that were saved with R serializers
#' like [saveRDS()] or [save()] before version `2.1.0` will not work with latter
#' `xgboost` versions and vice versa. Be aware that the structure of R model objects
-#' could in theory change again in the future, so XGBoost's serializers
-#' should be preferred for long-term storage.
+#' could in theory change again in the future, so XGBoost's serializers should be
+#' preferred for long-term storage.
#'
#' Furthermore, note that using the package `qs` for serialization will require
#' version 0.26 or higher of said package, and will have the same compatibility
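
A minimal sketch of the trade-off described above, assuming a model fitted with the new xgboost() interface (file names are illustrative):

    model <- xgboost(iris[, -5], iris$Species, nthreads = 1, nrounds = 3)

    # R serializer: keeps the 'xgboost' subclass and its R attributes (e.g. class levels)
    rds_path <- file.path(tempdir(), "model.rds")
    saveRDS(model, rds_path)
    class(readRDS(rds_path))        # "xgboost" "xgb.Booster"

    # XGBoost's own serializer: portable across language bindings, but drops R attributes,
    # so the reloaded object is downcast to a plain 'xgb.Booster'
    ubj_path <- file.path(tempdir(), "model.ubj")
    xgb.save(model, ubj_path)
    class(xgb.load(ubj_path))       # "xgb.Booster"
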
diff --git a/R-package/R/xgb.Booster.R b/R-package/R/xgb.Booster.R
index 808289b63de3..b38cd42bcef3 100644
--- a/R-package/R/xgb.Booster.R
+++ b/R-package/R/xgb.Booster.R
@@ -126,6 +126,8 @@ xgb.get.handle <- function(object) {
#' of the iterations (rounds) otherwise.
#'
#' If passing "all", will use all of the rounds regardless of whether the model had early stopping or not.
+#'
+#' Not applicable to `gblinear` booster.
#' @param strict_shape Whether to always return an array with the same dimensions for the given prediction mode
#' regardless of the model type - meaning that, for example, both a multi-class and a binary classification
#' model would generate output arrays with the same number of dimensions, with the 'class' dimension having
@@ -144,7 +146,13 @@ xgb.get.handle <- function(object) {
#'
#' If passing `TRUE`, then the result will have dimensions in reverse order - for example, rows
#' will be the last dimensions instead of the first dimension.
-#' @param base_margin Base margin used for boosting from existing model.
+#' @param base_margin Base margin used for boosting from existing model (raw score that gets added to
+#' all observations independently of the trees in the model).
+#'
+#' If supplied, should be either a vector with length equal to the number of rows in `newdata`
+#' (for objectives which produce a single score per observation), or a matrix with the number of
+#' rows matching the number of rows in `newdata` and the number of columns matching the number
+#' of scores estimated by the model (e.g. number of classes for multi-class classification).
#'
#' Note that, if `newdata` is an `xgb.DMatrix` object, this argument will
#' be ignored as it needs to be added to the DMatrix instead (e.g. by passing it as
@@ -206,6 +214,9 @@ xgb.get.handle <- function(object) {
#' For multi-class / multi-target, they will be arranged so that columns in the output will have
#' the leafs from one group followed by leafs of the other group (e.g. order will be `group1:feat1`,
#' `group1:feat2`, ..., `group2:feat1`, `group2:feat2`, ...).
+#'
+#' If there is more than one parallel tree (e.g. random forests), the parallel trees will be the
+#' last grouping in the resulting order, which will still be 2D.
#' \item For `predcontrib`: when not multi-class / multi-target, a matrix with dimensions
#' `[nrows, nfeats+1]`. The last "+ 1" column corresponds to the baseline value.
#'
@@ -222,7 +233,7 @@ xgb.get.handle <- function(object) {
#' For multi-class and multi-target, will be a 4D array with dimensions `[nrows, ngroups, nfeats+1, nfeats+1]`
#' }
#'
-#' If passing `strict_shape=FALSE`, the result is always an array:
+#' If passing `strict_shape=TRUE`, the result is always a matrix (if 2D) or array (if 3D or higher):
#' - For normal predictions, the dimension is `[nrows, ngroups]`.
#' - For `predcontrib=TRUE`, the dimension is `[nrows, ngroups, nfeats+1]`.
#' - For `predinteraction=TRUE`, the dimension is `[nrows, ngroups, nfeats+1, nfeats+1]`.
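
A short sketch of the arguments documented above (data and values are illustrative only):

    x <- as.matrix(mtcars[, -1])
    dtrain <- xgb.DMatrix(x, label = mtcars$mpg, nthread = 1)
    booster <- xgb.train(params = list(objective = "reg:squarederror", nthread = 1),
                         data = dtrain, nrounds = 5)

    # Use only the first two rounds, and request the fixed-dimension output layout
    p <- predict(booster, x, iterationrange = c(1, 2), strict_shape = TRUE)
    dim(p)   # [nrows, ngroups] even for a single-output objective

    # base_margin: one raw score per row, added on top of what the trees predict
    p_shifted <- predict(booster, x, base_margin = rep(0.5, nrow(x)))
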
diff --git a/R-package/R/xgb.DMatrix.R b/R-package/R/xgb.DMatrix.R
index 429cf3f0422c..280fcf52ee3e 100644
--- a/R-package/R/xgb.DMatrix.R
+++ b/R-package/R/xgb.DMatrix.R
@@ -9,12 +9,13 @@
#' method (`tree_method = "hist"`, which is the default algorithm), but is not usable for the
#' sorted-indices method (`tree_method = "exact"`), nor for the approximate method
#' (`tree_method = "approx"`).
+#'
#' @param data Data from which to create a DMatrix, which can then be used for fitting models or
#' for getting predictions out of a fitted model.
#'
-#' Supported input types are as follows:\itemize{
-#' \item `matrix` objects, with types `numeric`, `integer`, or `logical`.
-#' \item `data.frame` objects, with columns of types `numeric`, `integer`, `logical`, or `factor`.
+#' Supported input types are as follows:
+#' - `matrix` objects, with types `numeric`, `integer`, or `logical`.
+#' - `data.frame` objects, with columns of types `numeric`, `integer`, `logical`, or `factor`.
#'
#' Note that xgboost uses base-0 encoding for categorical types, hence `factor` types (which use base-1
#' encoding') will be converted inside the function call. Be aware that the encoding used for `factor`
@@ -23,33 +24,14 @@
#' was constructed.
#'
#' Other column types are not supported.
-#' \item CSR matrices, as class `dgRMatrix` from package `Matrix`.
-#' \item CSC matrices, as class `dgCMatrix` from package `Matrix`. These are **not** supported for
-#' 'xgb.QuantileDMatrix'.
-#' \item Single-row CSR matrices, as class `dsparseVector` from package `Matrix`, which is interpreted
-#' as a single row (only when making predictions from a fitted model).
-#' \item Text files in a supported format, passed as a `character` variable containing the URI path to
-#' the file, with an optional format specifier.
-#'
-#' These are **not** supported for `xgb.QuantileDMatrix`. Supported formats are:\itemize{
-#' \item XGBoost's own binary format for DMatrices, as produced by [xgb.DMatrix.save()].
-#' \item SVMLight (a.k.a. LibSVM) format for CSR matrices. This format can be signaled by suffix
-#' `?format=libsvm` at the end of the file path. It will be the default format if not
-#' otherwise specified.
-#' \item CSV files (comma-separated values). This format can be specified by adding suffix
-#' `?format=csv` at the end ofthe file path. It will **not** be auto-deduced from file extensions.
-#' }
+#' - CSR matrices, as class `dgRMatrix` from package `Matrix`.
+#' - CSC matrices, as class `dgCMatrix` from package `Matrix`.
#'
-#' Be aware that the format of the file will not be auto-deduced - for example, if a file is named 'file.csv',
-#' it will not look at the extension or file contents to determine that it is a comma-separated value.
-#' Instead, the format must be specified following the URI format, so the input to `data` should be passed
-#' like this: `"file.csv?format=csv"` (or `"file.csv?format=csv&label_column=0"` if the first column
-#' corresponds to the labels).
+#' These are **not** supported by `xgb.QuantileDMatrix`.
+#' - XGBoost's own binary format for DMatrices, as produced by [xgb.DMatrix.save()].
+#' - Single-row CSR matrices, as class `dsparseVector` from package `Matrix`, which is interpreted
+#' as a single row (only when making predictions from a fitted model).
#'
-#' For more information about passing text files as input, see the articles
-#' \href{https://xgboost.readthedocs.io/en/stable/tutorials/input_format.html}{Text Input Format of DMatrix} and
-#' \href{https://xgboost.readthedocs.io/en/stable/python/python_intro.html#python-data-interface}{Data Interface}.
-#' }
#' @param label Label of the training data. For classification problems, should be passed encoded as
#' integers with numeration starting at zero.
#' @param weight Weight for each instance.
@@ -95,15 +77,9 @@
#' @param label_lower_bound Lower bound for survival training.
#' @param label_upper_bound Upper bound for survival training.
#' @param feature_weights Set feature weights for column sampling.
-#' @param data_split_mode When passing a URI (as R `character`) as input, this signals
-#' whether to split by row or column. Allowed values are `"row"` and `"col"`.
-#'
-#' In distributed mode, the file is split accordingly; otherwise this is only an indicator on
-#' how the file was split beforehand. Default to row.
-#'
-#' This is not used when `data` is not a URI.
-#' @return An 'xgb.DMatrix' object. If calling 'xgb.QuantileDMatrix', it will have additional
-#' subclass 'xgb.QuantileDMatrix'.
+#' @param data_split_mode Not used yet. This parameter is for distributed training, which is not yet available for the R package.
+#' @return An 'xgb.DMatrix' object. If calling `xgb.QuantileDMatrix`, it will have additional
+#' subclass `xgb.QuantileDMatrix`.
#'
#' @details
#' Note that DMatrix objects are not serializable through R functions such as [saveRDS()] or [save()].
@@ -145,6 +121,9 @@ xgb.DMatrix <- function(
if (!is.null(group) && !is.null(qid)) {
stop("Either one of 'group' or 'qid' should be NULL")
}
+ if (data_split_mode != "row") {
+ stop("'data_split_mode' is not supported yet.")
+ }
nthread <- as.integer(NVL(nthread, -1L))
if (typeof(data) == "character") {
if (length(data) > 1) {
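
A brief sketch of the in-memory input types listed above (illustrative data; the CSC input is shown only for xgb.DMatrix, since xgb.QuantileDMatrix does not accept it):

    x_dense <- as.matrix(mtcars[, -1])
    x_csc   <- Matrix::Matrix(x_dense, sparse = TRUE)  # typically a 'dgCMatrix'

    dm_dense  <- xgb.DMatrix(x_dense, label = mtcars$mpg, nthread = 1)
    dm_sparse <- xgb.DMatrix(x_csc, label = mtcars$mpg, nthread = 1)
    qdm       <- xgb.QuantileDMatrix(x_dense, label = mtcars$mpg, nthread = 1)
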
diff --git a/R-package/R/xgb.create.features.R b/R-package/R/xgb.create.features.R
index f9d892caa1e5..2c4015c5f2de 100644
--- a/R-package/R/xgb.create.features.R
+++ b/R-package/R/xgb.create.features.R
@@ -86,7 +86,7 @@
#' @export
xgb.create.features <- function(model, data, ...) {
check.deprecation(...)
- pred_with_leaf <- predict(model, data, predleaf = TRUE)
+ pred_with_leaf <- predict.xgb.Booster(model, data, predleaf = TRUE)
cols <- lapply(as.data.frame(pred_with_leaf), factor)
cbind(data, sparse.model.matrix(~ . -1, cols)) # nolint
}
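
The explicit method call matters because models produced by xgboost() dispatch predict() to predict.xgboost(), which exposes type = "leaf" rather than predleaf. A minimal sketch of the distinction (illustrative data):

    model <- xgboost(mtcars[, -1], mtcars$mpg, nthreads = 1, nrounds = 3)
    # plain predict() would go through predict.xgboost() for this object;
    # calling the booster-level method keeps the 'predleaf' interface regardless of subclass
    leafs <- predict.xgb.Booster(model, as.matrix(mtcars[, -1]), predleaf = TRUE)
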
diff --git a/R-package/R/xgb.plot.shap.R b/R-package/R/xgb.plot.shap.R
index 443020e1ac7e..4184c6f5ea6a 100644
--- a/R-package/R/xgb.plot.shap.R
+++ b/R-package/R/xgb.plot.shap.R
@@ -16,7 +16,7 @@
#' @param target_class Only relevant for multiclass models. The default (`NULL`)
#' averages the SHAP values over all classes. Pass a (0-based) class index
#' to show only SHAP values of that class.
-#' @param approxcontrib Passed to `predict()` when `shap_contrib = NULL`.
+#' @param approxcontrib Passed to [predict.xgb.Booster()] when `shap_contrib = NULL`.
#' @param subsample Fraction of data points randomly picked for plotting.
#' The default (`NULL`) will use up to 100k data points.
#' @param n_col Number of columns in a grid of plots.
@@ -353,7 +353,7 @@ xgb.shap.data <- function(data, shap_contrib = NULL, features = NULL, top_n = 1,
}
if (is.null(shap_contrib)) {
- shap_contrib <- predict(
+ shap_contrib <- predict.xgb.Booster(
model,
newdata = data,
predcontrib = TRUE,
diff --git a/R-package/R/xgboost.R b/R-package/R/xgboost.R
index 48a81fab34d8..c22752a3f506 100644
--- a/R-package/R/xgboost.R
+++ b/R-package/R/xgboost.R
@@ -949,6 +949,243 @@ xgboost <- function(
return(model)
}
+#' @title Compute predictions from XGBoost model on new data
+#' @description Predict values on data based on XGBoost model.
+#' @param object An XGBoost model object of class `xgboost`, as produced by function [xgboost()].
+#'
+#' Note that there is also a lower-level [predict.xgb.Booster()] method for models of class
+#' `xgb.Booster` as produced by [xgb.train()], which can also be used for `xgboost` class models as
+#' an alternative that performs fewer validations and less post-processing.
+#' @param newdata Data on which to compute predictions from the model passed in `object`. Supported
+#' input classes are:
+#' - Data Frames (class `data.frame` from base R and subclasses like `data.table`).
+#' - Matrices (class `matrix` from base R).
+#' - Sparse matrices from package `Matrix`, either as class `dgRMatrix` (CSR) or `dgCMatrix` (CSC).
+#' - Sparse vectors from package `Matrix`, which will be interpreted as containing a single
+#' observation.
+#'
+#' In the case of data frames, if there are any categorical features, they should be of class
+#' `factor` and should have the same levels as the `factor` columns of the data from which the model
+#' was constructed.
+#'
+#' If there are named columns and the model was fitted to data with named columns, they will be
+#' matched by name by default (see `validate_features`).
+#' @param type Type of prediction to make. Supported options are:
+#' - `"response"`: will output model predictions on the scale of the response variable (e.g.
+#' probabilities of belonging to the last class in the case of binary classification). Result will
+#' be either a numeric vector with length matching the number of rows in `newdata`, or a numeric matrix with
+#' shape `[nrows(newdata), nscores]` (for objectives that produce more than one score per
+#' observation such as multi-class classification or multi-quantile regression).
+#' - `"raw"`: will output the unprocessed boosting scores (e.g. log-odds in the case of objective
+#' `binary:logistic`). Same output shape and type as for `"response"`.
+#' - `"class"`: will output the class with the highest predicted probability, returned as a `factor`
+#' (only applicable to classification objectives) with length matching the number of rows in `newdata`.
+#' - `"leaf"`: will output the terminal node indices of each observation across each tree, as an
+#' integer matrix of shape `[nrows(newdata), ntrees]`, or as an integer array with an extra one or
+#' two dimensions, up to `[nrows(newdata), ntrees, nscores, n_parallel_trees]` for models that
+#' produce more than one score per tree and/or which have more than one parallel tree (e.g.
+#' random forests).
+#'
+#' Only applicable to tree-based boosters (not `gblinear`).
+#' - `"contrib"`: will produce per-feature contribution estimates towards the model score for a
+#' given observation, based on SHAP values. The contribution values are on the scale of
+#' untransformed margin (e.g., for binary classification, the values are log-odds deviations from
+#' the baseline).
+#'
+#' Output will be a numeric matrix with shape `[nrows, nfeatures+1]`, with the intercept being the
+#' last feature, or a numeric array with shape `[nrows, nscores, nfeatures+1]` if the model
+#' produces more than one score per observation.
+#' - `"interaction"`: similar to `"contrib"`, but computing SHAP values of contributions of
+#' interaction of each pair of features. Note that this operation might be rather expensive in
+#' terms of compute and memory.
+#'
+#' Since it quadratically depends on the number of features, it is recommended to perform
+#' selection of the most important features first.
+#'
+#' Output will be a numeric array of shape `[nrows, nfeatures+1, nfeatures+1]`, or shape
+#' `[nrows, nscores, nfeatures+1, nfeatures+1]` (for objectives that produce more than one score
+#' per observation).
+#' @param base_margin Base margin used for boosting from existing model (raw score that gets added to
+#' all observations independently of the trees in the model).
+#'
+#' If supplied, should be either a vector with length equal to the number of rows in `newdata`
+#' (for objectives which produce a single score per observation), or a matrix with the number of
+#' rows matching the number of rows in `newdata` and the number of columns matching the number
+#' of scores estimated by the model (e.g. number of classes for multi-class classification).
+#' @param iteration_range Sequence of rounds/iterations from the model to use for prediction, specified by passing
+#' a two-dimensional vector with the start and end numbers in the sequence (same format as R's `seq` - i.e.
+#' base-1 indexing, and inclusive of both ends).
+#'
+#' For example, passing `c(1,20)` will predict using the first twenty iterations, while passing `c(1,1)` will
+#' predict using only the first one.
+#'
+#' If passing `NULL`, will either stop at the best iteration if the model used early stopping, or use all
+#' of the iterations (rounds) otherwise.
+#'
+#' If passing "all", will use all of the rounds regardless of whether the model had early stopping or not.
+#'
+#' Not applicable to `gblinear` booster.
+#' @param validate_features Validate that the feature names in the data match the feature names
+#' in the model, and reorder them in the data otherwise.
+#'
+#' If passing `FALSE`, it is assumed that the feature names and types are the same,
+#' and come in the same order as in the training data.
+#'
+#' Be aware that this only applies to column names and not to factor levels in categorical columns.
+#'
+#' Note that this check might add some sizable latency to the predictions, so it's
+#' recommended to disable it for performance-sensitive applications.
+#' @param ... Not used.
+#' @return Either a numeric vector (for 1D outputs), numeric matrix (for 2D outputs), numeric array
+#' (for 3D and higher), or `factor` (for class predictions). See documentation for parameter `type`
+#' for details about what the output type and shape will be.
+#' @method predict xgboost
+#' @export
+#' @examples
+#' data("ToothGrowth")
+#' y <- ToothGrowth$supp
+#' x <- ToothGrowth[, -2L]
+#' model <- xgboost(x, y, nthreads = 1L, nrounds = 3L, max_depth = 2L)
+#' pred_prob <- predict(model, x[1:5, ], type = "response")
+#' pred_raw <- predict(model, x[1:5, ], type = "raw")
+#' pred_class <- predict(model, x[1:5, ], type = "class")
+#'
+#' # Relationships between these
+#' manual_probs <- 1 / (1 + exp(-pred_raw))
+#' manual_class <- ifelse(manual_probs < 0.5, levels(y)[1], levels(y)[2])
+#'
+#' # They should match up to numerical precision
+#' round(pred_prob, 6) == round(manual_probs, 6)
+#' pred_class == manual_class
+predict.xgboost <- function(
+ object,
+ newdata,
+ type = "response",
+ base_margin = NULL,
+ iteration_range = NULL,
+ validate_features = TRUE,
+ ...
+) {
+ if (inherits(newdata, "xgb.DMatrix")) {
+ stop(
+ "Predictions on 'xgb.DMatrix' objects are not supported with 'xgboost' class.",
+ " Try 'xgb.train' or 'predict.xgb.Booster'."
+ )
+ }
+
+ outputmargin <- FALSE
+ predleaf <- FALSE
+ predcontrib <- FALSE
+ predinteraction <- FALSE
+ pred_class <- FALSE
+ strict_shape <- FALSE
+ allowed_types <- c(
+ "response",
+ "raw",
+ "class",
+ "leaf",
+ "contrib",
+ "interaction"
+ )
+ type <- head(type, 1L)
+ if (!is.character(type) || !(type %in% allowed_types)) {
+ stop("'type' must be one of: ", paste(allowed_types, collapse = ", "))
+ }
+
+ if (type != "response") {
+ switch(
+ type,
+ "raw" = {
+ outputmargin <- TRUE
+ }, "class" = {
+ if (is.null(attributes(object)$metadata$y_levels)) {
+ stop("Prediction type 'class' is only for classification objectives.")
+ }
+ pred_class <- TRUE
+ outputmargin <- TRUE
+ }, "leaf" = {
+ predleaf <- TRUE
+ strict_shape <- TRUE # required for 3D and 4D outputs
+ }, "contrib" = {
+ predcontrib <- TRUE
+ }, "interaction" = {
+ predinteraction <- TRUE
+ }
+ )
+ }
+ out <- predict.xgb.Booster(
+ object,
+ newdata,
+ outputmargin = outputmargin,
+ predleaf = predleaf,
+ predcontrib = predcontrib,
+ predinteraction = predinteraction,
+ iterationrange = iteration_range,
+ strict_shape = strict_shape,
+ validate_features = validate_features,
+ base_margin = base_margin
+ )
+
+ if (strict_shape) {
+ # Should only end up here for leaf predictions
+ out_dims <- dim(out)
+ dims_remove <- integer()
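+    # negative indices mark singleton dimensions (3rd = score groups, 4th = parallel trees) to drop below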
+ if (out_dims[3L] == 1L) {
+ dims_remove <- c(dims_remove, -3L)
+ }
+ if (length(out_dims) >= 4L && out_dims[4L] == 1L) {
+ dims_remove <- c(dims_remove, -4L)
+ }
+ if (length(dims_remove)) {
+ new_dimnames <- dimnames(out)[dims_remove]
+ dim(out) <- out_dims[dims_remove]
+ dimnames(out) <- new_dimnames
+ }
+ }
+
+ if (pred_class) {
+
+ if (is.null(dim(out))) {
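+      # binary case: 'out' holds raw margins (outputmargin = TRUE), so scores >= 0 map to the second factor level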
+ out <- as.integer(out >= 0) + 1L
+ } else {
+ out <- max.col(out, ties.method = "first")
+ }
+ attr_out <- attributes(out)
+ attr_out$class <- "factor"
+ attr_out$levels <- attributes(object)$metadata$y_levels
+ attributes(out) <- attr_out
+
+ } else if (NCOL(out) > 1L || (strict_shape && length(dim(out)) >= 3L)) {
+
+ names_use <- NULL
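+    # dimension names: prefer multi-class levels, then multi-target column names, then quantile labels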
+ if (NROW(attributes(object)$metadata$y_levels) > 2L) {
+ names_use <- attributes(object)$metadata$y_levels
+ } else if (NROW(attributes(object)$metadata$y_names)) {
+ names_use <- attributes(object)$metadata$y_names
+ } else if (NROW(attributes(object)$params$quantile_alpha) > 1L) {
+ names_use <- paste0("q", attributes(object)$params$quantile_alpha)
+ if (anyDuplicated(names_use)) {
+ warning("Cannot add quantile names to output due to clashes in their character conversions")
+ names_use <- NULL
+ }
+ }
+ if (NROW(names_use)) {
+ dimnames_out <- dimnames(out)
+ dim_with_names <- if (type == "leaf") 3L else 2L
+ dimnames_out[[dim_with_names]] <- names_use
+ .Call(XGSetArrayDimNamesInplace_R, out, dimnames_out)
+ }
+
+ }
+
+ return(out)
+}
+
+#' @title Print info from XGBoost model
+#' @description Prints basic properties of an XGBoost model object.
+#' @param x An XGBoost model object of class `xgboost`, as produced by function [xgboost()].
+#' @param ... Not used.
+#' @return Same object `x`, after printing its info.
#' @method print xgboost
#' @export
print.xgboost <- function(x, ...) {
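
A short sketch of the prediction types not covered by the example above, matching the documented output shapes (illustrative data):

    data("iris")
    model <- xgboost(iris[, -5], iris$Species, nthreads = 1, nrounds = 3, max_depth = 2)

    contribs <- predict(model, iris[1:5, -5], type = "contrib")
    dim(contribs)   # [5, 3, 5]: rows, classes, features + baseline (last column)

    leafs <- predict(model, iris[1:5, -5], type = "leaf")
    dim(leafs)      # [5, 3, 3]: rows, boosting rounds, classes
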
diff --git a/R-package/man/a-compatibility-note-for-saveRDS-save.Rd b/R-package/man/a-compatibility-note-for-saveRDS-save.Rd
index af90ddded197..4ce043799436 100644
--- a/R-package/man/a-compatibility-note-for-saveRDS-save.Rd
+++ b/R-package/man/a-compatibility-note-for-saveRDS-save.Rd
@@ -5,7 +5,7 @@
\title{Model Serialization and Compatibility}
\description{
When it comes to serializing XGBoost models, it's possible to use R serializers such as
-\code{\link[=save]{save()}} or \code{\link[=saveRDS]{saveRDS()}} to serialize an XGBoost R model, but XGBoost also provides
+\code{\link[=save]{save()}} or \code{\link[=saveRDS]{saveRDS()}} to serialize an XGBoost model object, but XGBoost also provides
its own serializers with better compatibility guarantees, which allow loading
said models in other language bindings of XGBoost.
@@ -35,14 +35,15 @@ the model was fit, or saving the R call that produced the model, but are otherwi
not used for prediction / importance / plotting / etc.
These R attributes are only preserved when using R's serializers.
-In addition to the regular \code{xgb.Booster} objects producted by \code{\link[=xgb.train]{xgb.train()}}, the
-function \code{\link[=xgboost]{xgboost()}} produces a different subclass \code{xgboost}, which keeps other
-additional metadata as R attributes such as class names in classification problems,
-and which has a dedicated \code{predict} method that uses different defaults. XGBoost's
+In addition to the regular \code{xgb.Booster} objects produced by \code{\link[=xgb.train]{xgb.train()}}, the
+function \code{\link[=xgboost]{xgboost()}} produces objects with a different subclass \code{xgboost} (which
+inherits from \code{xgb.Booster}), which keeps other additional metadata as R attributes
+such as class names in classification problems, and which has a dedicated \code{predict}
+method that uses different defaults and takes different argument names. XGBoost's
own serializers can work with this \code{xgboost} class, but as they do not keep R
attributes, the resulting object, when deserialized, is downcasted to the regular
\code{xgb.Booster} class (i.e. it loses the metadata, and the resulting object will use
-\code{predict.xgb.Booster} instead of \code{predict.xgboost}) - for these \code{xgboost} objects,
+\code{\link[=predict.xgb.Booster]{predict.xgb.Booster()}} instead of \code{\link[=predict.xgboost]{predict.xgboost()}}) - for these \code{xgboost} objects,
\code{saveRDS} might thus be a better option if the extra functionalities are needed.
Note that XGBoost models in R starting from version \verb{2.1.0} and onwards, and
@@ -50,8 +51,8 @@ XGBoost models before version \verb{2.1.0}; have a very different R object struc
are incompatible with each other. Hence, models that were saved with R serializers
like \code{\link[=saveRDS]{saveRDS()}} or \code{\link[=save]{save()}} before version \verb{2.1.0} will not work with latter
\code{xgboost} versions and vice versa. Be aware that the structure of R model objects
-could in theory change again in the future, so XGBoost's serializers
-should be preferred for long-term storage.
+could in theory change again in the future, so XGBoost's serializers should be
+preferred for long-term storage.
Furthermore, note that using the package \code{qs} for serialization will require
version 0.26 or higher of said package, and will have the same compatibility
diff --git a/R-package/man/predict.xgb.Booster.Rd b/R-package/man/predict.xgb.Booster.Rd
index d97984e7fa48..5cdfed97f504 100644
--- a/R-package/man/predict.xgb.Booster.Rd
+++ b/R-package/man/predict.xgb.Booster.Rd
@@ -80,7 +80,9 @@ predict using only the first one.
If passing \code{NULL}, will either stop at the best iteration if the model used early stopping, or use all
of the iterations (rounds) otherwise.
-If passing "all", will use all of the rounds regardless of whether the model had early stopping or not.}
+If passing "all", will use all of the rounds regardless of whether the model had early stopping or not.
+
+Not applicable to \code{gblinear} booster.}
\item{strict_shape}{Whether to always return an array with the same dimensions for the given prediction mode
regardless of the model type - meaning that, for example, both a multi-class and a binary classification
@@ -118,7 +120,13 @@ and come in the same order as in the training data.
Note that this check might add some sizable latency to the predictions, so it's
recommended to disable it for performance-sensitive applications.}
-\item{base_margin}{Base margin used for boosting from existing model.
+\item{base_margin}{Base margin used for boosting from existing model (raw score that gets added to
+all observations independently of the trees in the model).
+
+If supplied, should be either a vector with length equal to the number of rows in \code{newdata}
+(for objectives which produce a single score per observation), or a matrix with the number of
+rows matching the number of rows in \code{newdata} and the number of columns matching the number
+of scores estimated by the model (e.g. number of classes for multi-class classification).
Note that, if \code{newdata} is an \code{xgb.DMatrix} object, this argument will
be ignored as it needs to be added to the DMatrix instead (e.g. by passing it as
@@ -141,6 +149,9 @@ Note that objective variant \code{multi:softmax} defaults towards predicting mos
For multi-class / multi-target, they will be arranged so that columns in the output will have
the leafs from one group followed by leafs of the other group (e.g. order will be \code{group1:feat1},
\code{group1:feat2}, ..., \code{group2:feat1}, \code{group2:feat2}, ...).
+
+If there is more than one parallel tree (e.g. random forests), the parallel trees will be the
+last grouping in the resulting order, which will still be 2D.
\item For \code{predcontrib}: when not multi-class / multi-target, a matrix with dimensions
\verb{[nrows, nfeats+1]}. The last "+ 1" column corresponds to the baseline value.
@@ -157,7 +168,7 @@ dimension should produce practically the same result as \code{predcontrib = TRUE
For multi-class and multi-target, will be a 4D array with dimensions \verb{[nrows, ngroups, nfeats+1, nfeats+1]}
}
-If passing \code{strict_shape=FALSE}, the result is always an array:
+If passing \code{strict_shape=TRUE}, the result is always a matrix (if 2D) or array (if 3D or higher):
\itemize{
\item For normal predictions, the dimension is \verb{[nrows, ngroups]}.
\item For \code{predcontrib=TRUE}, the dimension is \verb{[nrows, ngroups, nfeats+1]}.
diff --git a/R-package/man/predict.xgboost.Rd b/R-package/man/predict.xgboost.Rd
new file mode 100644
index 000000000000..15e75965aaa6
--- /dev/null
+++ b/R-package/man/predict.xgboost.Rd
@@ -0,0 +1,138 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/xgboost.R
+\name{predict.xgboost}
+\alias{predict.xgboost}
+\title{Compute predictions from XGBoost model on new data}
+\usage{
+\method{predict}{xgboost}(
+ object,
+ newdata,
+ type = "response",
+ base_margin = NULL,
+ iteration_range = NULL,
+ validate_features = TRUE,
+ ...
+)
+}
+\arguments{
+\item{object}{An XGBoost model object of class \code{xgboost}, as produced by function \code{\link[=xgboost]{xgboost()}}.
+
+Note that there is also a lower-level \code{\link[=predict.xgb.Booster]{predict.xgb.Booster()}} method for models of class
+\code{xgb.Booster} as produced by \code{\link[=xgb.train]{xgb.train()}}, which can also be used for \code{xgboost} class models as
+an alternative that performs fewer validations and less post-processing.}
+
+\item{newdata}{Data on which to compute predictions from the model passed in \code{object}. Supported
+input classes are:
+\itemize{
+\item Data Frames (class \code{data.frame} from base R and subclasses like \code{data.table}).
+\item Matrices (class \code{matrix} from base R).
+\item Sparse matrices from package \code{Matrix}, either as class \code{dgRMatrix} (CSR) or \code{dgCMatrix} (CSC).
+\item Sparse vectors from package \code{Matrix}, which will be interpreted as containing a single
+observation.
+}
+
+In the case of data frames, if there are any categorical features, they should be of class
+\code{factor} and should have the same levels as the \code{factor} columns of the data from which the model
+was constructed.
+
+If there are named columns and the model was fitted to data with named columns, they will be
+matched by name by default (see \code{validate_features}).}
+
+\item{type}{Type of prediction to make. Supported options are:
+\itemize{
+\item \code{"response"}: will output model predictions on the scale of the response variable (e.g.
+probabilities of belonging to the last class in the case of binary classification). Result will
+be either a numeric vector with length matching the number of rows in \code{newdata}, or a numeric matrix with
+shape \verb{[nrows(newdata), nscores]} (for objectives that produce more than one score per
+observation such as multi-class classification or multi-quantile regression).
+\item \code{"raw"}: will output the unprocessed boosting scores (e.g. log-odds in the case of objective
+\code{binary:logistic}). Same output shape and type as for \code{"response"}.
+\item \code{"class"}: will output the class with the highest predicted probability, returned as a \code{factor}
+(only applicable to classification objectives) with length matching the number of rows in \code{newdata}.
+\item \code{"leaf"}: will output the terminal node indices of each observation across each tree, as an
+integer matrix of shape \verb{[nrows(newdata), ntrees]}, or as an integer array with an extra one or
+two dimensions, up to \verb{[nrows(newdata), ntrees, nscores, n_parallel_trees]} for models that
+produce more than one score per tree and/or which have more than one parallel tree (e.g.
+random forests).
+
+Only applicable to tree-based boosters (not \code{gblinear}).
+\item \code{"contrib"}: will produce per-feature contribution estimates towards the model score for a
+given observation, based on SHAP values. The contribution values are on the scale of
+untransformed margin (e.g., for binary classification, the values are log-odds deviations from
+the baseline).
+
+Output will be a numeric matrix with shape \verb{[nrows, nfeatures+1]}, with the intercept being the
+last feature, or a numeric array with shape \verb{[nrows, nscores, nfeatures+1]} if the model
+produces more than one score per observation.
+\item \code{"interaction"}: similar to \code{"contrib"}, but computing SHAP values of contributions of
+interaction of each pair of features. Note that this operation might be rather expensive in
+terms of compute and memory.
+
+Since it quadratically depends on the number of features, it is recommended to perform
+selection of the most important features first.
+
+Output will be a numeric array of shape \verb{[nrows, nfeatures+1, nfeatures+1]}, or shape
+\verb{[nrows, nscores, nfeatures+1, nfeatures+1]} (for objectives that produce more than one score
+per observation).
+}}
+
+\item{base_margin}{Base margin used for boosting from existing model (raw score that gets added to
+all observations independently of the trees in the model).
+
+If supplied, should be either a vector with length equal to the number of rows in \code{newdata}
+(for objectives which produce a single score per observation), or a matrix with the number of
+rows matching the number of rows in \code{newdata} and the number of columns matching the number
+of scores estimated by the model (e.g. number of classes for multi-class classification).}
+
+\item{iteration_range}{Sequence of rounds/iterations from the model to use for prediction, specified by passing
+a two-dimensional vector with the start and end numbers in the sequence (same format as R's \code{seq} - i.e.
+base-1 indexing, and inclusive of both ends).
+
+For example, passing \code{c(1,20)} will predict using the first twenty iterations, while passing \code{c(1,1)} will
+predict using only the first one.
+
+If passing \code{NULL}, will either stop at the best iteration if the model used early stopping, or use all
+of the iterations (rounds) otherwise.
+
+If passing "all", will use all of the rounds regardless of whether the model had early stopping or not.
+
+Not applicable to \code{gblinear} booster.}
+
+\item{validate_features}{Validate that the feature names in the data match the feature names
+in the model, and reorder them in the data otherwise.
+
+If passing \code{FALSE}, it is assumed that the feature names and types are the same,
+and come in the same order as in the training data.
+
+Be aware that this only applies to column names and not to factor levels in categorical columns.
+
+Note that this check might add some sizable latency to the predictions, so it's
+recommended to disable it for performance-sensitive applications.}
+
+\item{...}{Not used.}
+}
+\value{
+Either a numeric vector (for 1D outputs), numeric matrix (for 2D outputs), numeric array
+(for 3D and higher), or \code{factor} (for class predictions). See documentation for parameter \code{type}
+for details about what the output type and shape will be.
+}
+\description{
+Predict values on data based on XGBoost model.
+}
+\examples{
+data("ToothGrowth")
+y <- ToothGrowth$supp
+x <- ToothGrowth[, -2L]
+model <- xgboost(x, y, nthreads = 1L, nrounds = 3L, max_depth = 2L)
+pred_prob <- predict(model, x[1:5, ], type = "response")
+pred_raw <- predict(model, x[1:5, ], type = "raw")
+pred_class <- predict(model, x[1:5, ], type = "class")
+
+# Relationships between these
+manual_probs <- 1 / (1 + exp(-pred_raw))
+manual_class <- ifelse(manual_probs < 0.5, levels(y)[1], levels(y)[2])
+
+# They should match up to numerical precision
+round(pred_prob, 6) == round(manual_probs, 6)
+pred_class == manual_class
+}
diff --git a/R-package/man/print.xgboost.Rd b/R-package/man/print.xgboost.Rd
new file mode 100644
index 000000000000..235f3e36bdd0
--- /dev/null
+++ b/R-package/man/print.xgboost.Rd
@@ -0,0 +1,19 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/xgboost.R
+\name{print.xgboost}
+\alias{print.xgboost}
+\title{Print info from XGBoost model}
+\usage{
+\method{print}{xgboost}(x, ...)
+}
+\arguments{
+\item{x}{An XGBoost model object of class \code{xgboost}, as produced by function \code{\link[=xgboost]{xgboost()}}.}
+
+\item{...}{Not used.}
+}
+\value{
+Same object \code{x}, after printing its info.
+}
+\description{
+Prints basic properties of an XGBoost model object.
+}
diff --git a/R-package/man/xgb.DMatrix.Rd b/R-package/man/xgb.DMatrix.Rd
index 2cfa2e713038..23a24dec4226 100644
--- a/R-package/man/xgb.DMatrix.Rd
+++ b/R-package/man/xgb.DMatrix.Rd
@@ -45,9 +45,11 @@ xgb.QuantileDMatrix(
\item{data}{Data from which to create a DMatrix, which can then be used for fitting models or
for getting predictions out of a fitted model.
-Supported input types are as follows:\itemize{
+Supported input types are as follows:
+\itemize{
\item \code{matrix} objects, with types \code{numeric}, \code{integer}, or \code{logical}.
-\item \code{data.frame} objects, with columns of types \code{numeric}, \code{integer}, \code{logical}, or \code{factor}.
+\item \code{data.frame} objects, with columns of types \code{numeric}, \code{integer}, \code{logical}, or \code{factor}.
+}
Note that xgboost uses base-0 encoding for categorical types, hence \code{factor} types (which use base-1
encoding') will be converted inside the function call. Be aware that the encoding used for \code{factor}
@@ -56,32 +58,16 @@ responsibility to ensure that factor columns have the same levels as the ones fr
was constructed.
Other column types are not supported.
+\itemize{
\item CSR matrices, as class \code{dgRMatrix} from package \code{Matrix}.
-\item CSC matrices, as class \code{dgCMatrix} from package \code{Matrix}. These are \strong{not} supported for
-'xgb.QuantileDMatrix'.
-\item Single-row CSR matrices, as class \code{dsparseVector} from package \code{Matrix}, which is interpreted
-as a single row (only when making predictions from a fitted model).
-\item Text files in a supported format, passed as a \code{character} variable containing the URI path to
-the file, with an optional format specifier.
-
-These are \strong{not} supported for \code{xgb.QuantileDMatrix}. Supported formats are:\itemize{
-\item XGBoost's own binary format for DMatrices, as produced by \code{\link[=xgb.DMatrix.save]{xgb.DMatrix.save()}}.
-\item SVMLight (a.k.a. LibSVM) format for CSR matrices. This format can be signaled by suffix
-\code{?format=libsvm} at the end of the file path. It will be the default format if not
-otherwise specified.
-\item CSV files (comma-separated values). This format can be specified by adding suffix
-\code{?format=csv} at the end ofthe file path. It will \strong{not} be auto-deduced from file extensions.
+\item CSC matrices, as class \code{dgCMatrix} from package \code{Matrix}.
}
-Be aware that the format of the file will not be auto-deduced - for example, if a file is named 'file.csv',
-it will not look at the extension or file contents to determine that it is a comma-separated value.
-Instead, the format must be specified following the URI format, so the input to \code{data} should be passed
-like this: \code{"file.csv?format=csv"} (or \code{"file.csv?format=csv&label_column=0"} if the first column
-corresponds to the labels).
-
-For more information about passing text files as input, see the articles
-\href{https://xgboost.readthedocs.io/en/stable/tutorials/input_format.html}{Text Input Format of DMatrix} and
-\href{https://xgboost.readthedocs.io/en/stable/python/python_intro.html#python-data-interface}{Data Interface}.
+These are \strong{not} supported by \code{xgb.QuantileDMatrix}.
+\itemize{
+\item XGBoost's own binary format for DMatrices, as produced by \code{\link[=xgb.DMatrix.save]{xgb.DMatrix.save()}}.
+\item Single-row CSR matrices, as class \code{dsparseVector} from package \code{Matrix}, which is interpreted
+as a single row (only when making predictions from a fitted model).
}}
\item{label}{Label of the training data. For classification problems, should be passed encoded as
@@ -144,13 +130,7 @@ not be saved, so make sure that \code{factor} columns passed to \code{predict} h
\item{feature_weights}{Set feature weights for column sampling.}
-\item{data_split_mode}{When passing a URI (as R \code{character}) as input, this signals
-whether to split by row or column. Allowed values are \code{"row"} and \code{"col"}.
-
-In distributed mode, the file is split accordingly; otherwise this is only an indicator on
-how the file was split beforehand. Default to row.
-
-This is not used when \code{data} is not a URI.}
+\item{data_split_mode}{Not used yet. This parameter is for distributed training, which is not yet available for the R package.}
\item{ref}{The training dataset that provides quantile information, needed when creating
validation/test dataset with \code{\link[=xgb.QuantileDMatrix]{xgb.QuantileDMatrix()}}. Supplying the training DMatrix
@@ -163,8 +143,8 @@ applied to the validation/test data}
This is only supported when constructing a QuantileDMatrix.}
}
\value{
-An 'xgb.DMatrix' object. If calling 'xgb.QuantileDMatrix', it will have additional
-subclass 'xgb.QuantileDMatrix'.
+An 'xgb.DMatrix' object. If calling \code{xgb.QuantileDMatrix}, it will have additional
+subclass \code{xgb.QuantileDMatrix}.
}
\description{
Construct an 'xgb.DMatrix' object from a given data source, which can then be passed to functions
diff --git a/R-package/man/xgb.plot.shap.Rd b/R-package/man/xgb.plot.shap.Rd
index f4f51059d653..969a7d103c62 100644
--- a/R-package/man/xgb.plot.shap.Rd
+++ b/R-package/man/xgb.plot.shap.Rd
@@ -54,7 +54,7 @@ Only used when \code{features = NULL}.}
averages the SHAP values over all classes. Pass a (0-based) class index
to show only SHAP values of that class.}
-\item{approxcontrib}{Passed to \code{predict()} when \code{shap_contrib = NULL}.}
+\item{approxcontrib}{Passed to \code{\link[=predict.xgb.Booster]{predict.xgb.Booster()}} when \code{shap_contrib = NULL}.}
\item{subsample}{Fraction of data points randomly picked for plotting.
The default (\code{NULL}) will use up to 100k data points.}
diff --git a/R-package/man/xgb.plot.shap.summary.Rd b/R-package/man/xgb.plot.shap.summary.Rd
index f6df2daca758..b72c560b3769 100644
--- a/R-package/man/xgb.plot.shap.summary.Rd
+++ b/R-package/man/xgb.plot.shap.summary.Rd
@@ -51,7 +51,7 @@ Only used when \code{features = NULL}.}
averages the SHAP values over all classes. Pass a (0-based) class index
to show only SHAP values of that class.}
-\item{approxcontrib}{Passed to \code{predict()} when \code{shap_contrib = NULL}.}
+\item{approxcontrib}{Passed to \code{\link[=predict.xgb.Booster]{predict.xgb.Booster()}} when \code{shap_contrib = NULL}.}
\item{subsample}{Fraction of data points randomly picked for plotting.
The default (\code{NULL}) will use up to 100k data points.}
diff --git a/R-package/tests/testthat/test_xgboost.R b/R-package/tests/testthat/test_xgboost.R
index a4ac658a11b8..8f0c1e7ba9a7 100644
--- a/R-package/tests/testthat/test_xgboost.R
+++ b/R-package/tests/testthat/test_xgboost.R
@@ -1,5 +1,8 @@
library(survival)
library(data.table)
+data("iris")
+data("mtcars")
+data("ToothGrowth")
test_that("Auto determine objective", {
y_num <- seq(1, 10)
@@ -621,3 +624,324 @@ test_that("Whole function works", {
expect_true(any(grepl("Number of iterations: 5", txt, fixed = TRUE)))
expect_true(any(grepl("Number of features: 8", txt, fixed = TRUE)))
})
+
+test_that("Can predict probabilities and raw scores", {
+ y <- ToothGrowth$supp
+ x <- ToothGrowth[, -2L]
+ model <- xgboost(x, y, nthreads = 1L, nrounds = 3L, max_depth = 2L)
+ pred_prob <- predict(model, x, type = "response")
+ pred_raw <- predict(model, x, type = "raw")
+ expect_true(is.vector(pred_prob))
+ expect_equal(length(pred_prob), nrow(x))
+ expect_true(min(pred_prob) >= 0)
+ expect_true(max(pred_prob) <= 1)
+
+ expect_equal(length(pred_raw), nrow(x))
+ expect_true(is.vector(pred_raw))
+ expect_true(min(pred_raw) < 0)
+ expect_true(max(pred_raw) > 0)
+
+ expect_equal(
+ pred_prob,
+ 1 / (1 + exp(-pred_raw)),
+ tolerance = 1e-6
+ )
+})
+
+test_that("Can predict class", {
+ y <- iris$Species
+ x <- iris[, -5L]
+ model <- xgboost(x, y, nthreads = 1L, nrounds = 3L, max_depth = 2L)
+ pred_class <- predict(model, x, type = "class")
+ expect_true(is.factor(pred_class))
+ expect_equal(levels(pred_class), levels(y))
+
+ y <- ToothGrowth$supp
+ x <- ToothGrowth[, -2L]
+ model <- xgboost(x, y, nthreads = 1L, nrounds = 3L, max_depth = 2L)
+ pred_class <- predict(model, x, type = "class")
+ expect_true(is.factor(pred_class))
+ expect_equal(levels(pred_class), levels(y))
+
+ probs <- predict(model, x, type = "response")
+ expect_true(all(pred_class[probs >= 0.5] == levels(y)[[2L]]))
+ expect_true(all(pred_class[probs < 0.5] == levels(y)[[1L]]))
+
+ # Check that it fails for regression models
+ y <- mtcars$mpg
+ x <- mtcars[, -1L]
+ model <- xgboost(x, y, nthreads = 1L, nrounds = 3L, max_depth = 2L)
+ expect_error({
+ predict(model, x, type = "class")
+ })
+})
+
+test_that("Metadata survives serialization", {
+ y <- iris$Species
+ x <- iris[, -5L]
+ model_fresh <- xgboost(x, y, nthreads = 1L, nrounds = 3L, max_depth = 2L)
+ temp_file <- file.path(tempdir(), "xgb_model.Rds")
+ saveRDS(model_fresh, temp_file)
+ model <- readRDS(temp_file)
+ pred_class <- predict(model, x, type = "class")
+ expect_true(is.factor(pred_class))
+ expect_equal(levels(pred_class), levels(y))
+})
+
+test_that("Column names aren't added when not appropriate", {
+ pred_types <- c(
+ "response",
+ "raw",
+ "leaf"
+ )
+ for (pred_type in pred_types) {
+ y <- mtcars$mpg
+ x <- mtcars[, -1L]
+ model <- xgboost(
+ x,
+ y,
+ nthreads = 1L,
+ nrounds = 3L,
+ max_depth = 2L,
+ objective = "reg:quantileerror",
+ quantile_alpha = 0.5
+ )
+ pred <- predict(model, x, type = pred_type)
+ if (pred_type %in% c("raw", "response")) {
+ expect_true(is.vector(pred))
+ } else {
+ expect_true(length(dim(pred)) >= 2L)
+ expect_null(colnames(pred))
+ }
+
+ y <- ToothGrowth$supp
+ x <- ToothGrowth[, -2L]
+ model <- xgboost(x, y, nthreads = 1L, nrounds = 3L, max_depth = 2L)
+ pred <- predict(model, x, type = pred_type)
+ if (pred_type %in% c("raw", "response")) {
+ expect_true(is.vector(pred))
+ } else {
+ expect_true(length(dim(pred)) >= 2L)
+ expect_null(colnames(pred))
+ }
+ }
+})
+
+test_that("Column names from multiclass are added to non-class predictions", {
+ y <- iris$Species
+ x <- iris[, -5L]
+ model <- xgboost(x, y, nthreads = 1L, nrounds = 3L, max_depth = 2L)
+
+ pred_types_with_colnames <- c(
+ "response",
+ "raw",
+ "contrib",
+ "interaction"
+ )
+
+ for (pred_type in pred_types_with_colnames) {
+ pred <- predict(model, x, type = pred_type)
+ expect_equal(nrow(pred), nrow(x))
+ expect_equal(ncol(pred), 3L)
+ expect_equal(colnames(pred), levels(y))
+ }
+})
+
+test_that("Column names from multitarget are added to predictions", {
+ y <- data.frame(
+ ylog = log(mtcars$mpg),
+ ysqrt = sqrt(mtcars$mpg)
+ )
+ x <- mtcars[, -1L]
+ model <- xgboost(x, y, nthreads = 1L, nrounds = 3L, max_depth = 2L)
+
+ pred_types_with_colnames <- c(
+ "response",
+ "raw",
+ "contrib",
+ "interaction"
+ )
+
+ for (pred_type in pred_types_with_colnames) {
+ pred <- predict(model, x, type = pred_type)
+ expect_equal(nrow(pred), nrow(x))
+ expect_equal(ncol(pred), 2L)
+ expect_equal(colnames(pred), colnames(y))
+ }
+})
+
+test_that("Column names from multiquantile are added to predictions", {
+ y <- mtcars$mpg
+ x <- mtcars[, -1L]
+ model <- xgboost(
+ x,
+ y,
+ nthreads = 1L,
+ nrounds = 3L,
+ max_depth = 2L,
+ objective = "reg:quantileerror",
+ quantile_alpha = c(0.25, 0.5, 0.75)
+ )
+
+ pred_types_with_colnames <- c(
+ "response",
+ "raw",
+ "contrib",
+ "interaction"
+ )
+
+ for (pred_type in pred_types_with_colnames) {
+ pred <- predict(model, x, type = pred_type)
+ expect_equal(nrow(pred), nrow(x))
+ expect_equal(ncol(pred), 3L)
+ expect_equal(colnames(pred), c("q0.25", "q0.5", "q0.75"))
+ }
+})
+
+test_that("Leaf predictions have multiple dimensions when needed", {
+ # single score, multiple trees
+ y <- mtcars$mpg
+ x <- mtcars[, -1L]
+ model <- xgboost(
+ x,
+ y,
+ nthreads = 1L,
+ nrounds = 4L,
+ max_depth = 2L,
+ objective = "reg:quantileerror",
+ quantile_alpha = 0.5
+ )
+ pred <- predict(model, x, type = "leaf")
+ expect_equal(dim(pred), c(nrow(x), 4L))
+ expect_equal(row.names(pred), row.names(x))
+ expect_null(colnames(pred))
+
+ # single score, single tree
+ model <- xgboost(
+ x,
+ y,
+ nthreads = 1L,
+ nrounds = 1L,
+ max_depth = 2L,
+ objective = "reg:quantileerror",
+ quantile_alpha = 0.5
+ )
+ pred <- predict(model, x, type = "leaf")
+ expect_equal(dim(pred), c(nrow(x), 1L))
+ expect_equal(row.names(pred), row.names(x))
+ expect_null(colnames(pred))
+
+ # multiple score, multiple trees
+ model <- xgboost(
+ x,
+ y,
+ nthreads = 1L,
+ nrounds = 4L,
+ max_depth = 2L,
+ objective = "reg:quantileerror",
+ quantile_alpha = c(0.25, 0.5, 0.75)
+ )
+ pred <- predict(model, x, type = "leaf")
+ expect_equal(dim(pred), c(nrow(x), 4L, 3L))
+ expect_equal(row.names(pred), row.names(x))
+ expect_null(colnames(pred))
+ expect_equal(dimnames(pred)[[3L]], c("q0.25", "q0.5", "q0.75"))
+
+ # multiple score, single tree
+ model <- xgboost(
+ x,
+ y,
+ nthreads = 1L,
+ nrounds = 1L,
+ max_depth = 2L,
+ objective = "reg:quantileerror",
+ quantile_alpha = c(0.25, 0.5, 0.75)
+ )
+ pred <- predict(model, x, type = "leaf")
+ expect_equal(dim(pred), c(nrow(x), 1L, 3L))
+ expect_equal(row.names(pred), row.names(x))
+ expect_null(colnames(pred))
+ expect_equal(dimnames(pred)[[3L]], c("q0.25", "q0.5", "q0.75"))
+
+ # parallel trees, single tree, single score
+ model <- xgboost(
+ x,
+ y,
+ nthreads = 1L,
+ nrounds = 1L,
+ max_depth = 2L,
+ objective = "count:poisson",
+ num_parallel_tree = 2L
+ )
+ pred <- predict(model, x, type = "leaf")
+ expect_equal(dim(pred), c(nrow(x), 1L, 2L))
+ expect_equal(row.names(pred), row.names(x))
+ expect_null(colnames(pred))
+ expect_null(dimnames(pred)[[3L]])
+
+ # num_parallel_tree>1 + multiple scores is not supported at the moment so no test for it.
+})
+
+test_that("Column names from multiclass are added to leaf predictions", {
+ y <- iris$Species
+ x <- iris[, -5L]
+ model <- xgboost(x, y, nthreads = 1L, nrounds = 4L, max_depth = 2L)
+ pred <- predict(model, x, type = "leaf")
+ expect_equal(dim(pred), c(nrow(x), 4L, 3L))
+ expect_equal(dimnames(pred)[[3L]], levels(y))
+
+ # Check also for a single tree
+ model <- xgboost(x, y, nthreads = 1L, nrounds = 1L, max_depth = 2L)
+ pred <- predict(model, x, type = "leaf")
+ expect_equal(dim(pred), c(nrow(x), 1L, 3L))
+ expect_equal(dimnames(pred)[[3L]], levels(y))
+})
+
+test_that("Column names from multitarget are added to leaf predictions", {
+ y <- data.frame(
+ ylog = log(mtcars$mpg),
+ ysqrt = sqrt(mtcars$mpg)
+ )
+ x <- mtcars[, -1L]
+ model <- xgboost(x, y, nthreads = 1L, nrounds = 4L, max_depth = 2L)
+ pred <- predict(model, x, type = "leaf")
+ expect_equal(dim(pred), c(nrow(x), 4L, 2L))
+ expect_equal(dimnames(pred)[[3L]], colnames(y))
+
+ # Check also for a single tree
+ model <- xgboost(x, y, nthreads = 1L, nrounds = 1L, max_depth = 2L)
+ pred <- predict(model, x, type = "leaf")
+ expect_equal(dim(pred), c(nrow(x), 1L, 2L))
+ expect_equal(dimnames(pred)[[3L]], colnames(y))
+})
+
+test_that("Column names from multiquantile are added to leaf predictions", {
+ y <- mtcars$mpg
+ x <- mtcars[, -1L]
+ model <- xgboost(
+ x,
+ y,
+ nthreads = 1L,
+ nrounds = 4L,
+ max_depth = 2L,
+ objective = "reg:quantileerror",
+ quantile_alpha = c(0.25, 0.5, 0.75)
+ )
+ pred <- predict(model, x, type = "leaf")
+ expect_equal(dim(pred), c(nrow(x), 4L, 3L))
+ expect_equal(dimnames(pred)[[3L]], c("q0.25", "q0.5", "q0.75"))
+
+ # Check also for a single tree
+ model <- xgboost(
+ x,
+ y,
+ nthreads = 1L,
+ nrounds = 1L,
+ max_depth = 2L,
+ objective = "reg:quantileerror",
+ quantile_alpha = c(0.25, 0.5, 0.75)
+ )
+ pred <- predict(model, x, type = "leaf")
+ expect_equal(dim(pred), c(nrow(x), 1L, 3L))
+ expect_equal(dimnames(pred)[[3L]], c("q0.25", "q0.5", "q0.75"))
+})
diff --git a/demo/dask/dask_learning_to_rank.py b/demo/dask/dask_learning_to_rank.py
new file mode 100644
index 000000000000..c08450fec56e
--- /dev/null
+++ b/demo/dask/dask_learning_to_rank.py
@@ -0,0 +1,201 @@
+"""
+Learning to rank with the Dask Interface
+========================================
+
+ .. versionadded:: 3.0.0
+
+This is a demonstration of using XGBoost for learning to rank tasks using the
+MSLR_10k_letor dataset. For more information about the dataset, please visit its
+`description page `_.
+
+See :ref:`ltr-dist` for a general description for distributed learning to rank and
+:ref:`ltr-dask` for Dask-specific features.
+
+"""
+
+from __future__ import annotations
+
+import argparse
+import os
+from contextlib import contextmanager
+from typing import Generator
+
+import dask
+import numpy as np
+from dask import dataframe as dd
+from distributed import Client, LocalCluster, wait
+from sklearn.datasets import load_svmlight_file
+
+from xgboost import dask as dxgb
+
+
+def load_mslr_10k(
+ device: str, data_path: str, cache_path: str
+) -> tuple[dd.DataFrame, dd.DataFrame, dd.DataFrame]:
+ """Load the MSLR10k dataset from data_path and save parquet files in the cache_path."""
+    root_path = os.path.expanduser(data_path)
+    cache_path = os.path.expanduser(cache_path)
+
+ # Use only the Fold1 for demo:
+ # Train, Valid, Test
+ # {S1,S2,S3}, S4, S5
+ fold = 1
+
+ if not os.path.exists(cache_path):
+ os.mkdir(cache_path)
+ fold_path = os.path.join(root_path, f"Fold{fold}")
+ train_path = os.path.join(fold_path, "train.txt")
+ valid_path = os.path.join(fold_path, "vali.txt")
+ test_path = os.path.join(fold_path, "test.txt")
+
+ X_train, y_train, qid_train = load_svmlight_file(
+ train_path, query_id=True, dtype=np.float32
+ )
+ columns = [f"f{i}" for i in range(X_train.shape[1])]
+ X_train = dd.from_array(X_train.toarray(), columns=columns)
+ y_train = y_train.astype(np.int32)
+ qid_train = qid_train.astype(np.int32)
+
+ X_train["y"] = dd.from_array(y_train)
+ X_train["qid"] = dd.from_array(qid_train)
+ X_train.to_parquet(os.path.join(cache_path, "train"), engine="pyarrow")
+
+ X_valid, y_valid, qid_valid = load_svmlight_file(
+ valid_path, query_id=True, dtype=np.float32
+ )
+ X_valid = dd.from_array(X_valid.toarray(), columns=columns)
+ y_valid = y_valid.astype(np.int32)
+ qid_valid = qid_valid.astype(np.int32)
+
+ X_valid["y"] = dd.from_array(y_valid)
+ X_valid["qid"] = dd.from_array(qid_valid)
+ X_valid.to_parquet(os.path.join(cache_path, "valid"), engine="pyarrow")
+
+ X_test, y_test, qid_test = load_svmlight_file(
+ test_path, query_id=True, dtype=np.float32
+ )
+
+ X_test = dd.from_array(X_test.toarray(), columns=columns)
+ y_test = y_test.astype(np.int32)
+ qid_test = qid_test.astype(np.int32)
+
+ X_test["y"] = dd.from_array(y_test)
+ X_test["qid"] = dd.from_array(qid_test)
+ X_test.to_parquet(os.path.join(cache_path, "test"), engine="pyarrow")
+
+ df_train = dd.read_parquet(
+ os.path.join(cache_path, "train"), calculate_divisions=True
+ )
+ df_valid = dd.read_parquet(
+ os.path.join(cache_path, "valid"), calculate_divisions=True
+ )
+ df_test = dd.read_parquet(
+ os.path.join(cache_path, "test"), calculate_divisions=True
+ )
+
+ return df_train, df_valid, df_test
+
+
+def ranking_demo(client: Client, args: argparse.Namespace) -> None:
+ """Learning to rank with data sorted locally."""
+ df_tr, df_va, _ = load_mslr_10k(args.device, args.data, args.cache)
+
+ X_train: dd.DataFrame = df_tr[df_tr.columns.difference(["y", "qid"])]
+ y_train = df_tr[["y", "qid"]]
+ Xy_train = dxgb.DaskQuantileDMatrix(client, X_train, y_train.y, qid=y_train.qid)
+
+ X_valid: dd.DataFrame = df_va[df_va.columns.difference(["y", "qid"])]
+ y_valid = df_va[["y", "qid"]]
+ Xy_valid = dxgb.DaskQuantileDMatrix(
+ client, X_valid, y_valid.y, qid=y_valid.qid, ref=Xy_train
+ )
+ # Upon training, you will see a performance warning about sorting data based on
+ # query groups.
+ dxgb.train(
+ client,
+ {"objective": "rank:ndcg", "device": args.device},
+ Xy_train,
+ evals=[(Xy_train, "Train"), (Xy_valid, "Valid")],
+ num_boost_round=100,
+ )
+
+
+def ranking_wo_split_demo(client: Client, args: argparse.Namespace) -> None:
+ """Learning to rank with data partitioned according to query groups."""
+ df_tr, df_va, df_te = load_mslr_10k(args.device, args.data, args.cache)
+
+ X_tr = df_tr[df_tr.columns.difference(["y", "qid"])]
+ X_va = df_va[df_va.columns.difference(["y", "qid"])]
+
+ # `allow_group_split=False` makes sure data is partitioned according to the query
+ # groups.
+ ltr = dxgb.DaskXGBRanker(allow_group_split=False, device=args.device)
+ ltr.client = client
+ ltr = ltr.fit(
+ X_tr,
+ df_tr.y,
+ qid=df_tr.qid,
+ eval_set=[(X_tr, df_tr.y), (X_va, df_va.y)],
+ eval_qid=[df_tr.qid, df_va.qid],
+ verbose=True,
+ )
+
+ df_te = df_te.persist()
+ wait([df_te])
+
+ X_te = df_te[df_te.columns.difference(["y", "qid"])]
+ predt = ltr.predict(X_te)
+ y = client.compute(df_te.y)
+ wait([predt, y])
+
+
+@contextmanager
+def gen_client(device: str) -> Generator[Client, None, None]:
+ match device:
+ case "cuda":
+ from dask_cuda import LocalCUDACluster
+
+ with LocalCUDACluster() as cluster:
+ with Client(cluster) as client:
+ with dask.config.set(
+ {
+ "array.backend": "cupy",
+ "dataframe.backend": "cudf",
+ }
+ ):
+ yield client
+ case "cpu":
+ with LocalCluster() as cluster:
+ with Client(cluster) as client:
+ yield client
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(
+ description="Demonstration of learning to rank using XGBoost."
+ )
+ parser.add_argument(
+ "--data",
+ type=str,
+ help="Root directory of the MSLR-WEB10K data.",
+ required=True,
+ )
+ parser.add_argument(
+ "--cache",
+ type=str,
+ help="Directory for caching processed data.",
+ required=True,
+ )
+ parser.add_argument("--device", choices=["cpu", "cuda"], default="cpu")
+ parser.add_argument(
+ "--no-split",
+ action="store_true",
+ help="Flag to indicate query groups should not be split.",
+ )
+ args = parser.parse_args()
+
+ with gen_client(args.device) as client:
+ if args.no_split:
+ ranking_wo_split_demo(client, args)
+ else:
+ ranking_demo(client, args)
diff --git a/demo/guide-python/cross_validation.py b/demo/guide-python/cross_validation.py
index 4e537108aa1a..a33a16c36f04 100644
--- a/demo/guide-python/cross_validation.py
+++ b/demo/guide-python/cross_validation.py
@@ -2,6 +2,7 @@
Demo for using cross validation
===============================
"""
+
import os
import numpy as np
@@ -83,9 +84,12 @@ def logregobj(preds, dtrain):
def evalerror(preds, dtrain):
labels = dtrain.get_label()
+ preds = 1.0 / (1.0 + np.exp(-preds))
return "error", float(sum(labels != (preds > 0.0))) / len(labels)
param = {"max_depth": 2, "eta": 1}
# train with customized objective
-xgb.cv(param, dtrain, num_round, nfold=5, seed=0, obj=logregobj, feval=evalerror)
+xgb.cv(
+ param, dtrain, num_round, nfold=5, seed=0, obj=logregobj, custom_metric=evalerror
+)
diff --git a/demo/guide-python/learning_to_rank.py b/demo/guide-python/learning_to_rank.py
index b131b31f76f6..fbc1f44baf50 100644
--- a/demo/guide-python/learning_to_rank.py
+++ b/demo/guide-python/learning_to_rank.py
@@ -12,8 +12,8 @@
train on relevance degree, and the second part simulates click data and enable the
position debiasing training.
-For an overview of learning to rank in XGBoost, please see
-:doc:`Learning to Rank `.
+For an overview of learning to rank in XGBoost, please see :doc:`Learning to Rank
+`.
"""
from __future__ import annotations
@@ -31,7 +31,7 @@
from xgboost.testing.data import RelDataCV, simulate_clicks, sort_ltr_samples
-def load_mlsr_10k(data_path: str, cache_path: str) -> RelDataCV:
+def load_mslr_10k(data_path: str, cache_path: str) -> RelDataCV:
"""Load the MSLR10k dataset from data_path and cache a pickle object in cache_path.
Returns
@@ -89,7 +89,7 @@ def load_mlsr_10k(data_path: str, cache_path: str) -> RelDataCV:
def ranking_demo(args: argparse.Namespace) -> None:
"""Demonstration for learning to rank with relevance degree."""
- data = load_mlsr_10k(args.data, args.cache)
+ data = load_mslr_10k(args.data, args.cache)
# Sort data according to query index
X_train, y_train, qid_train = data.train
@@ -123,7 +123,7 @@ def ranking_demo(args: argparse.Namespace) -> None:
def click_data_demo(args: argparse.Namespace) -> None:
"""Demonstration for learning to rank with click data."""
- data = load_mlsr_10k(args.data, args.cache)
+ data = load_mslr_10k(args.data, args.cache)
train, test = simulate_clicks(data)
assert test is not None
diff --git a/demo/json-model/json_parser.py b/demo/guide-python/model_parser.py
similarity index 98%
rename from demo/json-model/json_parser.py
rename to demo/guide-python/model_parser.py
index b744d9569aea..39a459613409 100644
--- a/demo/json-model/json_parser.py
+++ b/demo/guide-python/model_parser.py
@@ -1,4 +1,9 @@
-"""Demonstration for parsing JSON/UBJSON tree model file generated by XGBoost.
+"""
+Demonstration for parsing JSON/UBJSON tree model files
+======================================================
+
+See :doc:`/tutorials/saving_model` for details about the model serialization.
+
"""
import argparse
diff --git a/demo/json-model/README.md b/demo/json-model/README.md
deleted file mode 100644
index 065d854f476a..000000000000
--- a/demo/json-model/README.md
+++ /dev/null
@@ -1,3 +0,0 @@
-We introduced initial support for saving XGBoost model in JSON format in 1.0.0. Note that
-it's still experimental and under development, output schema is subject to change due to
-bug fixes or further refactoring. For an overview, see https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html .
\ No newline at end of file
diff --git a/dev/prepare_jvm_release.py b/dev/prepare_jvm_release.py
index 0b4594e2d2c0..c5a72724f707 100644
--- a/dev/prepare_jvm_release.py
+++ b/dev/prepare_jvm_release.py
@@ -203,7 +203,7 @@ def main():
)
print(
"5. Remove the Scala 2.12 artifacts and build Scala 2.13 artifacts:\n"
- " python dev/change_scala_version.py --scala-version 2.13 --purge-artifacts\n"
+ " python ops/script/change_scala_version.py --scala-version 2.13 --purge-artifacts\n"
" GPG_TTY=$(tty) mvn deploy -Prelease -DskipTests -Dskip.native.build=true"
)
print(
diff --git a/dev/release-artifacts.py b/dev/release-artifacts.py
index 08bb2cbfaff2..bfcc813a0ef1 100644
--- a/dev/release-artifacts.py
+++ b/dev/release-artifacts.py
@@ -123,7 +123,7 @@ def make_python_sdist(
with DirectoryExcursion(ROOT):
with open("python-package/pyproject.toml", "r") as f:
orig_pyproj_lines = f.read()
- with open("tests/buildkite/remove_nccl_dep.patch", "r") as f:
+ with open("ops/patch/remove_nccl_dep.patch", "r") as f:
patch_lines = f.read()
subprocess.run(
["patch", "-p0"], input=patch_lines, check=True, text=True, encoding="utf-8"
@@ -234,7 +234,7 @@ def check_path() -> None:
def make_src_tarball(release: str, outdir: Path) -> Tuple[str, str]:
- tarball_name = f"xgboost-{release}.tar.gz"
+ tarball_name = f"xgboost-src-{release}.tar.gz"
tarball_path = outdir / tarball_name
if tarball_path.exists():
tarball_path.unlink()
@@ -301,7 +301,7 @@ def release_note(
* xgboost_r_gpu_linux_{release}.tar.gz: [Download]({r_gpu_linux_url})
**Source tarball**
-* xgboost.tar.gz: [Download]({src_tarball})"""
+* {tarball_name}: [Download]({src_tarball})"""
print(end_note)
with open(outdir / "end_note.md", "w") as f:
f.write(end_note)
diff --git a/doc/changes/v2.1.0.rst b/doc/changes/v2.1.0.rst
index 4a657c3a403f..3e2297c8a89d 100644
--- a/doc/changes/v2.1.0.rst
+++ b/doc/changes/v2.1.0.rst
@@ -1,3 +1,13 @@
+#################################
+2.1.3 Patch Release (2024 Nov 26)
+#################################
+
+The 2.1.3 patch release makes the following bug fixes:
+
+- [pyspark] Support large model size (#10984).
+- Fix rng for the column sampler (#10998).
+- Handle `cudf.pandas` proxy objects properly (#11014).
+
#################################
2.1.2 Patch Release (2024 Oct 23)
#################################
diff --git a/doc/contrib/ci.rst b/doc/contrib/ci.rst
index af9e6556290c..d6effa0b09d4 100644
--- a/doc/contrib/ci.rst
+++ b/doc/contrib/ci.rst
@@ -14,11 +14,9 @@ project.
**************
GitHub Actions
**************
-The configuration files are located under the directory
-`.github/workflows `_.
-
-Most of the tests listed in the configuration files run automatically for every incoming pull
-requests and every update to branches. A few tests however require manual activation:
+We make extensive use of `GitHub Actions `_ to host our
+CI pipelines. Most of the tests listed in the configuration files run automatically for every
+incoming pull request and every update to branches. A few tests, however, require manual activation:
* R tests with ``noLD`` option: Run R tests using a custom-built R with compilation flag
``--disable-long-double``. See `this page `_ for more
@@ -26,18 +24,29 @@ requests and every update to branches. A few tests however require manual activa
To invoke this test suite for a particular pull request, simply add a review comment
``/gha run r-nold-test``. (Ordinary comment won't work. It needs to be a review comment.)
-GitHub Actions is also used to build Python wheels targeting MacOS Intel and Apple Silicon. See
-`.github/workflows/python_wheels.yml
-`_. The
-``python_wheels`` pipeline sets up environment variables prefixed ``CIBW_*`` to indicate the target
-OS and processor. The pipeline then invokes the script ``build_python_wheels.sh``, which in turns
-calls ``cibuildwheel`` to build the wheel. The ``cibuildwheel`` is a library that sets up a
-suitable Python environment for each OS and processor target. Since we don't have Apple Silicon
-machine in GitHub Actions, cross-compilation is needed; ``cibuildwheel`` takes care of the complex
-task of cross-compiling a Python wheel. (Note that ``cibuildwheel`` will call
-``pip wheel``. Since XGBoost has a native library component, we created a customized build
-backend that hooks into ``pip``. The customized backend contains the glue code to compile the native
-library on the fly.)
+*******************************
+Self-Hosted Runners with RunsOn
+*******************************
+
+`RunsOn `_ is a SaaS (Software as a Service) app that lets us easily create
+self-hosted runners to use with GitHub Actions pipelines. RunsOn uses
+`Amazon Web Services (AWS) `_ under the hood to provision runners with
+access to varying amounts of CPU, memory, and NVIDIA GPUs. Thanks to this app, we are able to test
+GPU-accelerated and distributed algorithms of XGBoost while using the familiar interface of
+GitHub Actions.
+
+In GitHub Actions, jobs run on Microsoft-hosted runners by default.
+To opt into self-hosted runners (enabled by RunsOn), we use the following special syntax:
+
+.. code-block:: yaml
+
+ runs-on:
+ - runs-on
+ - runner=runner-name
+ - run-id=${{ github.run_id }}
+ - tag=[unique tag that uniquely identifies the job in the GH Action workflow]
+
+where the runner is defined in ``.github/runs-on.yml``.
*********************************************************
Reproduce CI testing environments using Docker containers
@@ -49,116 +58,298 @@ You can reproduce the same testing environment as the CI pipelines by running Do
Prerequisites
=============
1. Install Docker: https://docs.docker.com/engine/install/ubuntu/
-2. Install NVIDIA Docker runtime: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#installing-on-ubuntu-and-debian
+2. Install NVIDIA Docker runtime:
+ https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html.
The runtime lets you access NVIDIA GPUs inside a Docker container.
+.. _build_run_docker_locally:
+
==============================================
Building and Running Docker containers locally
==============================================
-For your convenience, we provide the wrapper script ``tests/ci_build/ci_build.sh``. You can use it as follows:
+For your convenience, we provide three wrapper scripts:
+
+* ``ops/docker_build.py``: Build a Docker container
+* ``ops/docker_build.sh``: Wrapper for ``ops/docker_build.py`` with a more concise interface
+* ``ops/docker_run.py``: Run a command inside a Docker container
+
+**To build a Docker container**, invoke ``docker_build.sh`` as follows:
+
+.. code-block:: bash
+
+ export BRANCH_NAME="master" # Relevant for CI, for local testing, use "master"
+ bash ops/docker_build.sh CONTAINER_ID
+
+where ``CONTAINER_ID`` identifies the container. The wrapper script will look up the YAML file
+``ops/docker/ci_container.yml``. For example, when ``CONTAINER_ID`` is set to ``xgb-ci.gpu``,
+the script will use the corresponding entry from ``ci_container.yml``:
+
+.. code-block:: yaml
+
+ xgb-ci.gpu:
+ container_def: gpu
+ build_args:
+ CUDA_VERSION_ARG: "12.4.1"
+ NCCL_VERSION_ARG: "2.23.4-1"
+ RAPIDS_VERSION_ARG: "24.10"
+
+The ``container_def`` entry indicates where the Dockerfile is located. The container
+definition will be fetched from ``ops/docker/dockerfile/Dockerfile.CONTAINER_DEF`` where
+``CONTAINER_DEF`` is the value of the ``container_def`` entry. In this example, the Dockerfile
+is ``ops/docker/dockerfile/Dockerfile.gpu``.
+
+The ``build_args`` entry lists all the build arguments for the Docker build. In this example,
+the build arguments are:
+
+.. code-block::
+
+ --build-arg CUDA_VERSION_ARG=12.4.1 --build-arg NCCL_VERSION_ARG=2.23.4-1 \
+ --build-arg RAPIDS_VERSION_ARG=24.10
+
+The build arguments provide inputs to the ``ARG`` instructions in the Dockerfile.
+
+.. note:: Inspect the logs from the CI pipeline to find what's going on under the hood
+
+ When invoked, ``ops/docker_build.sh`` logs the precise commands that it runs under the hood.
+ Using the example above:
+
+ .. code-block:: bash
+
+ # docker_build.sh calls docker_build.py...
+ python3 ops/docker_build.py --container-def gpu --container-id xgb-ci.gpu \
+ --build-arg CUDA_VERSION_ARG=12.4.1 --build-arg NCCL_VERSION_ARG=2.23.4-1 \
+ --build-arg RAPIDS_VERSION_ARG=24.10
+
+ ...
+
+ # .. and docker_build.py in turn calls "docker build"...
+ docker build --build-arg CUDA_VERSION_ARG=12.4.1 \
+ --build-arg NCCL_VERSION_ARG=2.23.4-1 \
+ --build-arg RAPIDS_VERSION_ARG=24.10 \
+ --load --progress=plain \
+ --ulimit nofile=1024000:1024000 \
+ -t xgb-ci.gpu \
+ -f ops/docker/dockerfile/Dockerfile.gpu \
+ ops/
+
+ The logs come in handy when debugging the container builds. In addition, you can change
+ the build arguments to make changes to the container.
+
+**To run commands within a Docker container**, invoke ``docker_run.py`` as follows:
+
+.. code-block:: bash
+
+ python3 ops/docker_run.py --container-id "ID of the container" [--use-gpus] \
+ -- "command to run inside the container"
+
+where ``--use-gpus`` should be specified to expose NVIDIA GPUs to the Docker container.
+
+For example:
.. code-block:: bash
- tests/ci_build/ci_build.sh --use-gpus --build-arg \
- ...
+ # Run without GPU
+ python3 ops/docker_run.py --container-id xgb-ci.cpu \
+ -- bash ops/script/build_via_cmake.sh
+
+ # Run with NVIDIA GPU
+ python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \
+ -- bash ops/pipeline/test-python-wheel-impl.sh gpu
+
+The ``docker_run.py`` script will convert these commands to the following invocations
+of ``docker run``:
-where:
+.. code-block:: bash
+
+ docker run --rm --pid=host \
+ -w /workspace -v /path/to/xgboost:/workspace \
+ -e CI_BUILD_UID= -e CI_BUILD_USER= \
+ -e CI_BUILD_GID= -e CI_BUILD_GROUP= \
+ xgb-ci.cpu \
+ bash ops/script/build_via_cmake.sh
-* ```` is the identifier for the container. The wrapper script will use the
- container definition (Dockerfile) located at ``tests/ci_build/Dockerfile.``.
- For example, setting the container type to ``gpu`` will cause the script to load the Dockerfile
- ``tests/ci_build/Dockerfile.gpu``.
-* Specify ``--use-gpus`` to run any GPU code. This flag will grant the container access to all NVIDIA GPUs in the base machine. Omit the flag if the access to GPUs is not necessary.
-* ```` is a build argument to be passed to Docker. Must be of form ``VAR=VALUE``.
- Example: ``--build-arg CUDA_VERSION_ARG=11.0``. You can pass multiple ``--build-arg``.
-* ```` is the command to run inside the Docker container. This can be more than one argument.
- Example: ``tests/ci_build/build_via_cmake.sh -DUSE_CUDA=ON -DUSE_NCCL=ON``.
+ docker run --rm --pid=host --gpus all \
+ -w /workspace -v /path/to/xgboost:/workspace \
+ -e CI_BUILD_UID= -e CI_BUILD_USER= \
+ -e CI_BUILD_GID= -e CI_BUILD_GROUP= \
+ xgb-ci.gpu \
+ bash ops/pipeline/test-python-wheel-impl.sh gpu
-Optionally, you can set the environment variable ``CI_DOCKER_EXTRA_PARAMS_INIT`` to pass extra
-arguments to Docker. For example:
+Optionally, you can specify ``--run-args`` to pass extra arguments to ``docker run``:
.. code-block:: bash
# Allocate extra space in /dev/shm to enable NCCL
- export CI_DOCKER_EXTRA_PARAMS_INIT='--shm-size=4g'
- # Run multi-GPU test suite
- tests/ci_build/ci_build.sh gpu --use-gpus --build-arg CUDA_VERSION_ARG=11.0 \
- tests/ci_build/test_python.sh mgpu
+ # Also run the container with elevated privileges
+ python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \
+ --run-args='--shm-size=4g --privileged' \
+ -- bash ops/pipeline/test-python-wheel-impl.sh gpu
+
+which translates to
+
+.. code-block:: bash
+
+ docker run --rm --pid=host --gpus all \
+ -w /workspace -v /path/to/xgboost:/workspace \
+ -e CI_BUILD_UID= -e CI_BUILD_USER= \
+ -e CI_BUILD_GID= -e CI_BUILD_GROUP= \
+ --shm-size=4g --privileged \
+ xgb-ci.gpu \
+ bash ops/pipeline/test-python-wheel-impl.sh gpu
+
+*******************************************************************
+The Lay of the Land: how CI pipelines are organized in the codebase
+*******************************************************************
+The XGBoost project stores the configuration for its CI pipelines as part of the codebase.
+The git repository therefore stores not only the change history for its source code but also
+the change history for the CI pipelines.
+
+=================
+File Organization
+=================
+
+The CI pipelines are organized into the following directories and files:
+
+* ``.github/workflows/``: Definition of CI pipelines, using the GitHub Actions syntax
+* ``.github/runs-on.yml``: Configuration for the RunsOn service. Specifies the spec for
+ the self-hosted CI runners.
+* ``ops/conda_env/``: Definitions for Conda environments
+* ``ops/packer/``: Packer scripts to build VM images for Amazon EC2
+* ``ops/patch/``: Patch files
+* ``ops/pipeline/``: Shell scripts defining CI/CD pipelines. Most of these scripts can be run
+ locally (to assist with development and debugging); a few must run in the CI.
+* ``ops/script/``: Various utility scripts useful for testing
+* ``ops/docker/dockerfile/``: Dockerfiles to define containers
+* ``ops/docker/ci_container.yml``: Defines the mapping between Dockerfiles and containers.
+ Also specifies the build arguments to be used with each container. See
+ :ref:`build_run_docker_locally` to learn how this YAML file is used in the context of
+ a container build.
+* ``ops/docker_build.*``: Wrapper scripts to build and test CI containers. See
+ :ref:`build_run_docker_locally` for the detailed description.
+
+To trace a given CI pipeline, inspect the files in the following order:
+
+.. plot::
+ :nofigs:
+
+ from graphviz import Source
+ source = r"""
+ digraph ci_graph {
+ graph [fontname = "monospace"];
+ node [fontname = "monospace"];
+ edge [fontname = "monospace"];
+ 0 [label=<.github/workflows/*.yml>, shape=box];
+      1 [label=<ops/pipeline/*.sh>, shape=box];
+      2 [label=<ops/pipeline/*-impl.sh>, shape=box];
+      3 [label=<ops/script/*.sh>, shape=box];
+ 0 -> 1 [xlabel="Calls"];
+ 1 -> 2 [xlabel="Calls,\nvia docker_run.py"];
+ 2 -> 3 [xlabel="Calls"];
+ 1 -> 3 [xlabel="Calls"];
+ }
+ """
+ Source(source, format='png').render('../_static/ci_graph', view=False)
+ Source(source, format='svg').render('../_static/ci_graph', view=False)
+
+.. figure:: ../_static/ci_graph.svg
+ :align: center
+ :figwidth: 80 %
+
+===================================
+Primitives used in the CI pipelines
+===================================
+
+------------------------
+Build and run containers
+------------------------
+
+See :ref:`build_run_docker_locally` to learn about the utility scripts for building and
+using containers.
+
+**What's the relationship between the VM image (for Amazon EC2) and the container image?**
+In ``ops/packer/`` directory, we define Packer scripts to build VM images for Amazon EC2.
+The VM image contains the minimal set of drivers and system software that are needed to
+run the containers.
+
+We update container images much more often than VM images. Whereas VM images are
+updated sparingly (once in a few months), container images are updated each time a branch
+or a pull request is updated. This way, developers can make changes to containers and
+see the results of the changes immediately in the CI run.
+
+------------------------------------------
+Stash artifacts, to move them between jobs
+------------------------------------------
+
+This primitive is useful when one pipeline job needs to consume the output
+from another job.
+We use `Amazon S3 `_ to store the stashed files.
+
+**To stash a file**:
+
+.. code-block:: bash
+
+ REMOTE_PREFIX="remote directory to place the artifact(s)"
+ bash ops/pipeline/stash-artifacts.sh stash "${REMOTE_PREFIX}" path/to/file
+
+The ``REMOTE_PREFIX`` argument, which is the second command-line argument
+for ``stash-artifacts.sh``, specifies the remote directory in which the artifact(s)
+should be placed. More precisely, the artifact(s) will be placed in
+``s3://{RUNS_ON_S3_BUCKET_CACHE}/cache/{GITHUB_REPOSITORY}/stash/{GITHUB_RUN_ID}/{REMOTE_PREFIX}/``
+where ``RUNS_ON_S3_BUCKET_CACHE``, ``GITHUB_REPOSITORY``, and ``GITHUB_RUN_ID`` are set by
+the CI. (RunsOn provisions an S3 bucket to stage cache, and its name is stored in the environment
+variable ``RUNS_ON_S3_BUCKET_CACHE``.)
+
+You can upload multiple files, possibly with wildcard globbing:
-To pass multiple extra arguments:
+.. code-block:: bash
+
+ REMOTE_PREFIX="build-cuda"
+ bash ops/pipeline/stash-artifacts.sh stash "${REMOTE_PREFIX}" \
+ build/testxgboost python-package/dist/*.whl
+
+**To unstash a file**:
+
+.. code-block:: bash
+
+ REMOTE_PREFIX="remote directory to place the artifact(s)"
+ bash ops/pipeline/stash-artifacts.sh unstash "${REMOTE_PREFIX}" path/to/file
+
+You can also use wildcard globbing. The script will download the matching artifacts
+from the remote directory.
.. code-block:: bash
- export CI_DOCKER_EXTRA_PARAMS_INIT='-e VAR1=VAL1 -e VAR2=VAL2 -e VAR3=VAL3'
-
-********************************************
-Update pipeline definitions for BuildKite CI
-********************************************
-
-`BuildKite `_ is a SaaS (Software as a Service) platform that orchestrates
-cloud machines to host CI pipelines. The BuildKite platform allows us to define CI pipelines as a
-declarative YAML file.
-
-The pipeline definitions are found in ``tests/buildkite/``:
-
-* ``tests/buildkite/pipeline-win64.yml``: This pipeline builds and tests XGBoost for the Windows platform.
-* ``tests/buildkite/pipeline-mgpu.yml``: This pipeline builds and tests XGBoost with access to multiple
- NVIDIA GPUs.
-* ``tests/buildkite/pipeline.yml``: This pipeline builds and tests XGBoost with access to a single
- NVIDIA GPU. Most tests are located here.
-
-****************************************
-Managing Elastic CI Stack with BuildKite
-****************************************
-
-BuildKite allows us to define cloud resources in
-a declarative fashion. Every configuration step is now documented explicitly as code.
-
-**Prerequisite**: You should have some knowledge of `CloudFormation `_.
-CloudFormation lets us define a stack of cloud resources (EC2 machines, Lambda functions, S3 etc) using
-a single YAML file.
-
-**Prerequisite**: Gain access to the XGBoost project's AWS account (``admin@xgboost-ci.net``), and then
-set up a credential pair in order to provision resources on AWS. See
-`Creating an IAM user in your AWS account `_.
-
-* Option 1. Give full admin privileges to your IAM user. This is the simplest option.
-* Option 2. Give limited set of permissions to your IAM user, to reduce the possibility of messing up other resources.
- For this, use the script ``tests/buildkite/infrastructure/service-user/create_service_user.py``.
-
-=====================
-Worker Image Pipeline
-=====================
-Building images for worker machines used to be a chore: you'd provision an EC2 machine, SSH into it, and
-manually install the necessary packages. This process is not only laborious but also error-prone. You may
-forget to install a package or change a system configuration.
-
-No more. Now we have an automated pipeline for building images for worker machines.
-
-* Run ``tests/buildkite/infrastructure/worker-image-pipeline/create_worker_image_pipelines.py`` in order to provision
- CloudFormation stacks named ``buildkite-linux-amd64-gpu-worker`` and ``buildkite-windows-gpu-worker``. They are
- pipelines that create AMIs (Amazon Machine Images) for Linux and Windows workers, respectively.
-* Navigate to the CloudFormation web console to verify that the image builder pipelines have been provisioned. It may
- take some time.
-* Once they pipelines have been fully provisioned, run the script
- ``tests/buildkite/infrastructure/worker-image-pipeline/run_pipelines.py`` to execute the pipelines. New AMIs will be
- uploaded to the EC2 service. You can locate them in the EC2 console.
-* Make sure to modify ``tests/buildkite/infrastructure/aws-stack-creator/metadata.py`` to use the correct AMI IDs.
- (For ``linux-amd64-cpu`` and ``linux-arm64-cpu``, use the AMIs provided by BuildKite. Consult the ``AWSRegion2AMI``
- section of https://s3.amazonaws.com/buildkite-aws-stack/latest/aws-stack.yml.)
-
-======================
-EC2 Autoscaling Groups
-======================
-In EC2, you can create auto-scaling groups, where you can dynamically adjust the number of worker instances according to
-workload. When a pull request is submitted, the following steps take place:
-
-1. GitHub sends a signal to the registered webhook, which connects to the BuildKite server.
-2. BuildKite sends a signal to a `Lambda `_ function named ``Autoscaling``.
-3. The Lambda function sends a signal to the auto-scaling group. The group scales up and adds additional worker instances.
-4. New worker instances run the test jobs. Test results are reported back to BuildKite.
-5. When the test jobs complete, BuildKite sends a signal to ``Autoscaling``, which in turn requests the autoscaling group
- to scale down. Idle worker instances are shut down.
-
-To set up the auto-scaling group, run the script ``tests/buildkite/infrastructure/aws-stack-creator/create_stack.py``.
-Check the CloudFormation web console to verify successful provision of auto-scaling groups.
+ REMOTE_PREFIX="build-cuda"
+ # Download all files whose path matches the wildcard pattern python-package/dist/*.whl
+ bash ops/pipeline/stash-artifacts.sh unstash "${REMOTE_PREFIX}" \
+ python-package/dist/*.whl
+
+-----------------------------------------
+Custom actions in ``dmlc/xgboost-devops``
+-----------------------------------------
+
+XGBoost implements a few custom
+`composite actions `_
+to reduce duplicated code within workflow YAML files. The custom actions are hosted in a separate repository,
+`dmlc/xgboost-devops `_, to make it easy to test changes to the custom actions in
+a pull request or a fork.
+
+In a workflow file, we'd refer to ``dmlc/xgboost-devops/{custom-action}@main``. For example:
+
+.. code-block:: yaml
+
+ - uses: dmlc/xgboost-devops/miniforge-setup@main
+ with:
+ environment-name: cpp_test
+ environment-file: ops/conda_env/cpp_test.yml
+
+Each custom action consists of two components:
+
+* Main script (``dmlc/xgboost-devops/{custom-action}/action.yml``): dispatches to a specific version
+ of the implementation script (see the next item). The main script clones ``xgboost-devops`` from
+ a specified fork at a particular ref, allowing us to easily test changes to the custom action.
+* Implementation script (``dmlc/xgboost-devops/impls/{custom-action}/action.yml``): Implements the
+  custom action.
+
+This design was inspired by Mike Sarahan's work in
+`rapidsai/shared-actions `_.
diff --git a/doc/contrib/coding_guide.rst b/doc/contrib/coding_guide.rst
index bf18ad08cf53..60b3c4a13bd2 100644
--- a/doc/contrib/coding_guide.rst
+++ b/doc/contrib/coding_guide.rst
@@ -107,7 +107,7 @@ C++ interface of the R package, please make corresponding changes in ``src/init.
Generating the Package and Running Tests
========================================
-The source layout of XGBoost is a bit unusual to normal R packages as XGBoost is primarily written in C++ with multiple language bindings in mind. As a result, some special cares need to be taken to generate a standard R tarball. Most of the tests are being run on CI, and as a result, the best way to see how things work is by looking at the CI configuration files (GitHub action, at the time of writing). There are helper scripts in ``tests/ci_build`` and ``R-package/tests/helper_scripts`` for running various checks including linter and making the standard tarball.
+The source layout of XGBoost is a bit unusual compared to normal R packages, as XGBoost is primarily written in C++ with multiple language bindings in mind. As a result, some special care needs to be taken to generate a standard R tarball. Most of the tests are run on CI, and as a result, the best way to see how things work is by looking at the CI configuration files (GitHub Actions, at the time of writing). There are helper scripts in ``ops/script`` and ``R-package/tests/helper_scripts`` for running various checks, including linters, and for making the standard tarball.
*********************************
Running Formatting Checks Locally
@@ -127,7 +127,7 @@ To run checks for Python locally, install the checkers mentioned previously and
.. code-block:: bash
cd /path/to/xgboost/
- python ./tests/ci_build/lint_python.py --fix
+ python ./ops/script/lint_python.py --fix
To run checks for R:
@@ -135,21 +135,21 @@ To run checks for R:
cd /path/to/xgboost/
R CMD INSTALL R-package/
- Rscript tests/ci_build/lint_r.R $(pwd)
+ Rscript ops/script/lint_r.R $(pwd)
To run checks for cpplint locally:
.. code-block:: bash
cd /path/to/xgboost/
- python ./tests/ci_build/lint_cpp.py
+ python ./ops/script/lint_cpp.py
See next section for clang-tidy. For CMake scripts:
.. code-block:: bash
- bash ./tests/ci_build/lint_cmake.sh
+ bash ./ops/script/lint_cmake.sh
Lastly, the linter for jvm-packages is integrated into the maven build process.
@@ -163,21 +163,21 @@ To run this check locally, run the following command from the top level source t
.. code-block:: bash
cd /path/to/xgboost/
- python3 tests/ci_build/tidy.py
+ python3 ops/script/run_clang_tidy.py
Also, the script accepts two optional integer arguments, namely ``--cpp`` and ``--cuda``. By default they are both set to 1, meaning that both C++ and CUDA code will be checked. If the CUDA toolkit is not installed on your machine, you'll encounter an error. To exclude CUDA source from linting, use:
.. code-block:: bash
cd /path/to/xgboost/
- python3 tests/ci_build/tidy.py --cuda=0
+ python3 ops/script/run_clang_tidy.py --cuda=0
Similarly, if you want to exclude C++ source from linting:
.. code-block:: bash
cd /path/to/xgboost/
- python3 tests/ci_build/tidy.py --cpp=0
+ python3 ops/script/run_clang_tidy.py --cpp=0
**********************************
Guide for handling user input data
diff --git a/doc/contrib/donate.rst b/doc/contrib/donate.rst
index b6171c412c74..ba7c75a942f9 100644
--- a/doc/contrib/donate.rst
+++ b/doc/contrib/donate.rst
@@ -13,9 +13,9 @@ DMLC/XGBoost has grown from a research project incubated in academia to one of t
A robust and efficient **continuous integration (CI)** infrastructure is one of the most critical solutions to address the above challenge. A CI service will monitor an open-source repository and run a suite of integration tests for every incoming contribution. This way, the CI ensures that every proposed change in the codebase is compatible with existing functionalities. Furthermore, XGBoost can enable more thorough tests with a powerful CI infrastructure to cover cases which are closer to the production environment.
-There are several CI services available free to open source projects, such as Travis CI and AppVeyor. The XGBoost project already utilizes GitHub Actions. However, the XGBoost project has needs that these free services do not adequately address. In particular, the limited usage quota of resources such as CPU and memory leaves XGBoost developers unable to bring "too-intensive" tests. In addition, they do not offer test machines with GPUs for testing XGBoost-GPU code base which has been attracting more and more interest across many organizations. Consequently, the XGBoost project uses a cloud-hosted test farm. We use `BuildKite `_ to organize CI pipelines.
+There are several CI services available free to open source projects, such as Travis CI and AppVeyor. The XGBoost project already utilizes GitHub Actions. However, the XGBoost project has needs that these free services do not adequately address. In particular, the limited usage quota of resources such as CPU and memory leaves XGBoost developers unable to run "too-intensive" tests. In addition, they do not offer test machines with GPUs for testing the XGBoost-GPU code base, which has been attracting more and more interest across many organizations. Consequently, the XGBoost project uses a cloud-hosted test farm. We use `Amazon Web Services (AWS) `_ to host the test machines, along with `GitHub Actions `_ and `RunsOn `_ (a SaaS app) to organize the CI pipelines.
-The cloud-hosted test farm has recurring operating expenses. It utilizes a leading cloud provider (AWS) to accommodate variable workload. BuildKite launches worker machines on AWS on demand, to run the test suite on incoming contributions. To save cost, the worker machines are terminated when they are no longer needed.
+The cloud-hosted test farm has recurring operating expenses. RunsOn launches worker machines on AWS on demand to run the test suite on incoming contributions. To save cost, the worker machines are terminated when they are no longer needed.
To help defray the hosting cost, the XGBoost project seeks donations from third parties.
@@ -29,9 +29,9 @@ The Project Management Committee (PMC) of the XGBoost project appointed `Open So
All expenses incurred for hosting CI will be submitted to the fiscal host with receipts. Only the expenses in the following categories will be approved for reimbursement:
-* Cloud expenses for the cloud test farm (https://buildkite.com/xgboost)
+* Cloud expenses for the cloud test farm
* Cost of domain https://xgboost-ci.net
-* Monthly cost of using BuildKite
+* Annual subscription for RunsOn
* Hosting cost of the User Forum (https://discuss.xgboost.ai)
Administration of cloud CI infrastructure
diff --git a/doc/contrib/release.rst b/doc/contrib/release.rst
index c0370b14ed42..4548b1ffa9a2 100644
--- a/doc/contrib/release.rst
+++ b/doc/contrib/release.rst
@@ -17,7 +17,7 @@ Making a Release
-----------------
1. Create an issue for the release, noting the estimated date and expected features or major fixes, pin that issue.
-2. Create a release branch if this is a major release. Bump release version. There's a helper script ``tests/ci_build/change_version.py``.
+2. Create a release branch if this is a major release. Bump release version. There's a helper script ``ops/script/change_version.py``.
3. Commit the change, create a PR on GitHub on release branch. Port the bumped version to default branch, optionally with the postfix ``SNAPSHOT``.
4. Create a tag on release branch, either on GitHub or locally.
5. Make a release on GitHub tag page, which might be done with previous step if the tag is created on GitHub.
diff --git a/doc/contrib/unit_tests.rst b/doc/contrib/unit_tests.rst
index aa58cd337020..857d7a067307 100644
--- a/doc/contrib/unit_tests.rst
+++ b/doc/contrib/unit_tests.rst
@@ -63,7 +63,7 @@ Run
.. code-block:: bash
- python ./tests/ci_build/test_r_package.py --task=check
+ python ./ops/script/test_r_package.py --task=check
at the root of the project directory. The command builds and checks the XGBoost
r-package. Alternatively, if you want to just run the tests, you can use the following
diff --git a/doc/jvm/api.rst b/doc/jvm/api.rst
index b9e7821aa6fa..3d56cb2c9aa4 100644
--- a/doc/jvm/api.rst
+++ b/doc/jvm/api.rst
@@ -5,4 +5,5 @@ API Docs for the JVM packages
* `XGBoost4J Java API <../jvm_docs/javadocs/index.html>`_
* `XGBoost4J Scala API <../jvm_docs/scaladocs/xgboost4j/index.html>`_
* `XGBoost4J-Spark Scala API <../jvm_docs/scaladocs/xgboost4j-spark/index.html>`_
+* `XGBoost4J-Spark-GPU Scala API <../jvm_docs/scaladocs/xgboost4j-spark-gpu/index.html>`_
* `XGBoost4J-Flink Scala API <../jvm_docs/scaladocs/xgboost4j-flink/index.html>`_
diff --git a/doc/python/python_api.rst b/doc/python/python_api.rst
index a8999e119ab4..5398fb5d091f 100644
--- a/doc/python/python_api.rst
+++ b/doc/python/python_api.rst
@@ -37,6 +37,7 @@ Core Data Structure
.. autoclass:: xgboost.Booster
:members:
:show-inheritance:
+ :special-members: __getitem__
.. autoclass:: xgboost.DataIter
:members:
diff --git a/doc/tutorials/dask.rst b/doc/tutorials/dask.rst
index 6e68d83a0083..036b1e725d47 100644
--- a/doc/tutorials/dask.rst
+++ b/doc/tutorials/dask.rst
@@ -355,15 +355,18 @@ Working with asyncio
.. versionadded:: 1.2.0
-XGBoost's dask interface supports the new ``asyncio`` in Python and can be integrated into
-asynchronous workflows. For using dask with asynchronous operations, please refer to
-`this dask example `_ and document in
-`distributed `_. To use XGBoost's
-dask interface asynchronously, the ``client`` which is passed as an argument for training and
-prediction must be operating in asynchronous mode by specifying ``asynchronous=True`` when the
-``client`` is created (example below). All functions (including ``DaskDMatrix``) provided
-by the functional interface will then return coroutines which can then be awaited to retrieve
-their result.
+XGBoost's dask interface supports the new :py:mod:`asyncio` in Python and can be
+integrated into asynchronous workflows. For using dask with asynchronous operations,
+please refer to `this dask example
+`_ and document in `distributed
+`_. To use XGBoost's Dask
+interface asynchronously, the ``client`` which is passed as an argument for training and
+prediction must be operating in asynchronous mode by specifying ``asynchronous=True`` when
+the ``client`` is created (example below). All functions (including ``DaskDMatrix``)
+provided by the functional interface will then return coroutines which can then be awaited
+to retrieve their result. Please note that XGBoost is a compute-bound application, where
+parallelism is more important than concurrency. The support for `asyncio` is more about
+compatibility than performance gain.
Functional interface:
@@ -526,6 +529,47 @@ See https://github.com/coiled/dask-xgboost-nyctaxi for a set of examples of usin
with dask and optuna.
+.. _ltr-dask:
+
+****************
+Learning to Rank
+****************
+
+ .. versionadded:: 3.0.0
+
+ .. note::
+
+ Position debiasing is not yet supported.
+
+There are two operation modes in the Dask learning-to-rank implementation, for performance
+reasons. The difference is whether a distributed global sort is needed. Please see
+:ref:`ltr-dist` for how ranking works with distributed training in general. Below we will
+discuss some of the Dask-specific features.
+
+First, if you use the :py:class:`~xgboost.dask.DaskQuantileDMatrix` interface or the
+:py:class:`~xgboost.dask.DaskXGBRanker` with ``allow_group_split`` set to ``True``,
+XGBoost will try to sort and group the samples for each worker based on the query ID. This
+mode tries to skip the global sort and sorts only worker-local data, and hence requires no
+inter-worker data shuffle. Please note that even a worker-local sort is costly, particularly
+in terms of memory usage, as there's no spilling when
+:py:meth:`~pandas.DataFrame.sort_values` is used and we need to concatenate the
+data. XGBoost first checks whether the QID is already sorted before actually performing
+the sorting operation. Choose this mode if the query groups are relatively consecutive,
+meaning most of the samples within a query group are close to each other and are likely to
+reside on the same worker. Don't use this if you have performed a random shuffle on
+your data.
+
+If the input data is random, then there's no way we can guarantee that most of the data
+within the same group ends up on the same worker. For large query groups, this might not be
+an issue. But for small query groups, it's possible that each worker gets only one or two
+samples from each group, which can lead to disastrous performance. In that case, we can
+partition the data according to query group, which is the default behavior of
+the :py:class:`~xgboost.dask.DaskXGBRanker` unless ``allow_group_split`` is set to
+``True``. This mode performs a sort and a groupby on the entire dataset in addition to an
+encoding operation for the query group IDs. Along with partition fragmentation, this
+option can lead to slow performance. See
+:ref:`sphx_glr_python_dask-examples_dask_learning_to_rank.py` for a worked example.
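+
+The following is a minimal sketch contrasting the two modes. It assumes an existing Dask
+``client`` and a Dask DataFrame ``df`` with feature columns plus ``y`` (label) and ``qid``
+(query ID) columns; the variable names are illustrative only:
+
+.. code-block:: python
+
+    from xgboost import dask as dxgb
+
+    X = df[df.columns.difference(["y", "qid"])]
+
+    # Default: partition the data by query group so that no group is split
+    # across workers (global sort + groupby).
+    ranker = dxgb.DaskXGBRanker(allow_group_split=False)
+    ranker.client = client
+    ranker = ranker.fit(X, df.y, qid=df.qid)
+
+    # Alternative: allow group splits and only sort worker-local data by qid,
+    # skipping the global sort. Suitable when query groups are already mostly
+    # contiguous; avoid it after a random shuffle.
+    ranker = dxgb.DaskXGBRanker(allow_group_split=True)
+    ranker.client = client
+    ranker = ranker.fit(X, df.y, qid=df.qid)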
+
.. _tracker-ip:
***************
diff --git a/doc/tutorials/learning_to_rank.rst b/doc/tutorials/learning_to_rank.rst
index 4d2cbad4aa47..8743a672d219 100644
--- a/doc/tutorials/learning_to_rank.rst
+++ b/doc/tutorials/learning_to_rank.rst
@@ -165,10 +165,26 @@ On the other hand, if you have comparatively small amount of training data:
For any method chosen, you can modify ``lambdarank_num_pair_per_sample`` to control the amount of pairs generated.
+.. _ltr-dist:
+
********************
Distributed Training
********************
-XGBoost implements distributed learning-to-rank with integration of multiple frameworks including Dask, Spark, and PySpark. The interface is similar to the single-node counterpart. Please refer to document of the respective XGBoost interface for details. Scattering a query group onto multiple workers is theoretically sound but can affect the model accuracy. For most of the use cases, the small discrepancy is not an issue, as the amount of training data is usually large when distributed training is used. As a result, users don't need to partition the data based on query groups. As long as each data partition is correctly sorted by query IDs, XGBoost can aggregate sample gradients accordingly.
+
+XGBoost implements distributed learning-to-rank with integration of multiple frameworks
+including :doc:`Dask `, :doc:`Spark `, and
+:doc:`PySpark `. The interface is similar to the single-node
+counterpart. Please refer to document of the respective XGBoost interface for details.
+
+.. warning::
+
+ Position-debiasing is not yet supported for existing distributed interfaces.
+
+XGBoost works with collective operations, which means data is scattered to multiple workers. We can divide the data partitions by query group and ensure no query group is split among workers. However, this requires a costly sort and groupby operation and might only be necessary for selected use cases. Splitting and scattering a query group to multiple workers is theoretically sound but can affect the model's accuracy. If there are only a small number of groups sitting at the boundaries of workers, the small discrepancy is not an issue, as the amount of training data is usually large when distributed training is used.
+
+For a longer explanation, assuming the pairwise ranking method is used, we calculate the gradient based on relevance degree by constructing pairs within a query group. If a single query group is split among workers and we use worker-local data for gradient calculation, then we are simply sampling pairs from a smaller group for each worker to calculate the gradient and the evaluation metric. The comparison between each pair doesn't change because a group is split into sub-groups; what changes is the number of total and effective pairs and normalizers like `IDCG`. One can generate more pairs from a large group than from two smaller subgroups. As a result, the obtained gradient is still valid from a theoretical standpoint but might not be optimal. As long as the data partitions within a worker are correctly sorted by query IDs, XGBoost can aggregate sample gradients accordingly. Both the (Py)Spark interface and the Dask interface can sort the data according to query ID; please see the respective tutorials for more information.
+
+However, it's possible that a distributed framework shuffles the data during map reduce and splits every query group across multiple workers. In that case, the performance would be disastrous. As a result, whether a sorted groupby is needed depends on the data and the framework.
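+
+As a rough, framework-agnostic illustration of the "sorted by query ID" requirement, the
+worker-local preparation can be as simple as ordering each partition by its query ID column
+before constructing the ``DMatrix``. The column names below are placeholders:
+
+.. code-block:: python
+
+    import xgboost as xgb
+
+    def make_dmatrix(part):
+        """Sort one worker-local partition by query ID, then build the DMatrix."""
+        part = part.sort_values("qid")
+        X = part.drop(columns=["y", "qid"])
+        return xgb.DMatrix(X, label=part["y"], qid=part["qid"])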
*******************
Reproducible Result
diff --git a/include/xgboost/c_api.h b/include/xgboost/c_api.h
index 7e8ed2f29568..6ae1dea8d3ce 100644
--- a/include/xgboost/c_api.h
+++ b/include/xgboost/c_api.h
@@ -876,31 +876,48 @@ XGB_DLL int XGDMatrixGetQuantileCut(DMatrixHandle const handle, char const *conf
* @defgroup Booster Booster
*
* @brief The `Booster` class is the gradient-boosted model for XGBoost.
+ *
+ * During training, the booster object has many caches for improved performance. In
+ * addition to gradients and predictions, it also includes runtime buffers like leaf
+ * partitions. These buffers persist with the Booster object until either XGBoosterReset()
+ * is called or the booster is deleted by XGBoosterFree().
+ *
* @{
*/
-/*!
- * \brief create xgboost learner
- * \param dmats matrices that are set to be cached
- * \param len length of dmats
- * \param out handle to the result booster
- * \return 0 when success, -1 when failure happens
+/**
+ * @brief Create an XGBoost learner (booster)
+ *
+ * @param dmats matrices that are set to be cached by the booster.
+ * @param len length of dmats
+ * @param out handle to the result booster
+ *
+ * @return 0 when success, -1 when failure happens
*/
XGB_DLL int XGBoosterCreate(const DMatrixHandle dmats[], bst_ulong len, BoosterHandle *out);
/**
* @example c-api-demo.c
*/
-/*!
- * \brief free obj in handle
- * \param handle handle to be freed
- * \return 0 when success, -1 when failure happens
+/**
+ * @brief Delete the booster.
+ *
+ * @param handle The handle to be freed.
+ *
+ * @return 0 when success, -1 when failure happens
*/
XGB_DLL int XGBoosterFree(BoosterHandle handle);
/**
* @example c-api-demo.c inference.c external_memory.c
*/
+/**
+ * @brief Reset the booster object to release data caches used for training.
+ *
+ * @param handle Handle to the booster object.
+ *
+ * @return 0 when success, -1 when failure happens
+ *
+ * @since 3.0.0
+ */
+XGB_DLL int XGBoosterReset(BoosterHandle handle);
+
/*!
* \brief Slice a model using boosting index. The slice m:n indicates taking all trees
* that were fit during the boosting rounds m, (m+1), (m+2), ..., (n-1).
diff --git a/include/xgboost/learner.h b/include/xgboost/learner.h
index 939324e4a6c4..1499804c8592 100644
--- a/include/xgboost/learner.h
+++ b/include/xgboost/learner.h
@@ -249,6 +249,10 @@ class Learner : public Model, public Configurable, public dmlc::Serializable {
std::string format) = 0;
virtual XGBAPIThreadLocalEntry& GetThreadLocal() const = 0;
+ /**
+ * @brief Reset the booster object to release data caches used for training.
+ */
+ virtual void Reset() = 0;
/*!
* \brief Create a new instance of learner.
* \param cache_data The matrix to cache the prediction.
diff --git a/jvm-packages/create_jni.py b/jvm-packages/create_jni.py
index 6be7b451ce14..fbd9b4ce5672 100755
--- a/jvm-packages/create_jni.py
+++ b/jvm-packages/create_jni.py
@@ -32,7 +32,7 @@ def cd(path):
path = normpath(path)
cwd = os.getcwd()
os.chdir(path)
- print("cd " + path)
+ print("cd " + path, flush=True)
try:
yield path
finally:
@@ -41,7 +41,7 @@ def cd(path):
def maybe_makedirs(path):
path = normpath(path)
- print("mkdir -p " + path)
+ print("mkdir -p " + path, flush=True)
try:
os.makedirs(path)
except OSError as e:
@@ -50,14 +50,14 @@ def maybe_makedirs(path):
def run(command, **kwargs):
- print(command)
+ print(command, flush=True)
subprocess.run(command, shell=True, check=True, env=os.environ, **kwargs)
def cp(source, target):
source = normpath(source)
target = normpath(target)
- print("cp {0} {1}".format(source, target))
+ print("cp {0} {1}".format(source, target), flush=True)
shutil.copy(source, target)
@@ -78,7 +78,7 @@ def native_build(args):
subprocess.check_output("/usr/libexec/java_home").strip().decode()
)
- print("building Java wrapper")
+ print("building Java wrapper", flush=True)
with cd(".."):
build_dir = "build-gpu" if cli_args.use_cuda == "ON" else "build"
maybe_makedirs(build_dir)
@@ -123,7 +123,7 @@ def native_build(args):
run("cmake .. " + " ".join(args + [generator]))
break
except subprocess.CalledProcessError as e:
- print(f"Failed to build with generator: {generator}", e)
+ print(f"Failed to build with generator: {generator}", e, flush=True)
with cd(os.path.pardir):
shutil.rmtree(build_dir)
maybe_makedirs(build_dir)
@@ -132,7 +132,7 @@ def native_build(args):
run("cmake --build . --config Release" + maybe_parallel_build)
- print("copying native library")
+ print("copying native library", flush=True)
library_name, os_folder = {
"Windows": ("xgboost4j.dll", "windows"),
"Darwin": ("libxgboost4j.dylib", "macos"),
@@ -153,7 +153,7 @@ def native_build(args):
maybe_makedirs(output_folder)
cp("../lib/" + library_name, output_folder)
- print("copying train/test files")
+ print("copying train/test files", flush=True)
# for xgboost4j
maybe_makedirs("xgboost4j/src/test/resources")
diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml
index be46dc261285..b8a7d3337f35 100644
--- a/jvm-packages/pom.xml
+++ b/jvm-packages/pom.xml
@@ -116,6 +116,22 @@
+
+ docs
+
+ ON
+ true
+ true
+ true
+
+
+ xgboost4j
+ xgboost4j-spark
+ xgboost4j-spark-gpu
+ xgboost4j-flink
+
+
+
release
diff --git a/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/spark/GpuXGBoostPluginSuite.scala b/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/spark/GpuXGBoostPluginSuite.scala
index 6559d90c7887..a5ff2ba0f589 100644
--- a/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/spark/GpuXGBoostPluginSuite.scala
+++ b/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/spark/GpuXGBoostPluginSuite.scala
@@ -542,6 +542,55 @@ class GpuXGBoostPluginSuite extends GpuTestSuite {
}
}
+ test("Same group must be in the same partition") {
+ val num_workers = 3
+ withGpuSparkSession() { spark =>
+ import spark.implicits._
+ val df = spark.createDataFrame(spark.sparkContext.parallelize(Seq(
+ (0.1, 1, 0),
+ (0.1, 1, 0),
+ (0.1, 1, 0),
+ (0.1, 1, 1),
+ (0.1, 1, 1),
+ (0.1, 1, 1),
+ (0.1, 1, 2),
+ (0.1, 1, 2),
+ (0.1, 1, 2)), 1)).toDF("label", "f1", "group")
+
+ // The original pattern repartitions df in a round-robin manner, so a group can be split across partitions.
+ val oriRows = df.repartition(num_workers)
+ .sortWithinPartitions(df.col("group"))
+ .select("group")
+ .mapPartitions { case iter =>
+ val tmp: ArrayBuffer[Int] = ArrayBuffer.empty
+ while (iter.hasNext) {
+ val r = iter.next()
+ tmp.append(r.getInt(0))
+ }
+ Iterator.single(tmp.mkString(","))
+ }.collect()
+ assert(oriRows.length == 3)
+ assert(oriRows.contains("0,1,2"))
+
+ // The fix replaces repartition with repartitionByRange, which puts instances
+ // with the same group into the same partition.
+ val ranker = new XGBoostRanker().setGroupCol("group").setNumWorkers(num_workers)
+ val processedDf = ranker.getPlugin.get.asInstanceOf[GpuXGBoostPlugin].preprocess(ranker, df)
+ val rows = processedDf
+ .select("group")
+ .mapPartitions { case iter =>
+ val tmp: ArrayBuffer[Int] = ArrayBuffer.empty
+ while (iter.hasNext) {
+ val r = iter.next()
+ tmp.append(r.getInt(0))
+ }
+ Iterator.single(tmp.mkString(","))
+ }.collect()
+
+ assert(rows.forall(Seq("0,0,0", "1,1,1", "2,2,2").contains))
+ }
+ }
+
test("Ranker: XGBoost-Spark should match xgboost4j") {
withGpuSparkSession() { spark =>
import spark.implicits._
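
The comments in the test above describe the core of the fix: a plain repartition distributes rows round-robin, so a query group can be split across partitions, while repartitionByRange on the group column keeps every group inside a single partition. The following standalone sketch is not part of the patch; the local session, column names, and object name are illustrative only, and it simply shows the difference outside the test harness:

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, spark_partition_id}

// Standalone sketch: contrast round-robin repartition with range partitioning
// by the group column. Everything here (session, names, data) is illustrative.
object RepartitionByGroupSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[3]").appName("sketch").getOrCreate()
    import spark.implicits._

    // Nine rows, three query groups (0, 1, 2), all starting in one partition.
    val df = Seq.tabulate(9)(i => (0.1, 1, i / 3)).toDF("label", "f1", "group").coalesce(1)

    // Round-robin repartition: rows of a single group may land in different partitions.
    df.repartition(3)
      .select(spark_partition_id().as("pid"), col("group"))
      .show()

    // Range partitioning on the group column: each group stays in one partition,
    // which is what the ranker's preprocessing relies on.
    df.repartitionByRange(3, col("group"))
      .select(spark_partition_id().as("pid"), col("group"))
      .show()

    spark.stop()
  }
}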
diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRanker.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRanker.scala
index 14d13e34ff61..0265eac55979 100644
--- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRanker.scala
+++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRanker.scala
@@ -22,6 +22,7 @@ import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable, MLReadable, MLReader}
import org.apache.spark.ml.xgboost.SparkUtils
import org.apache.spark.sql.Dataset
+import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types.{DataType, DoubleType, StructType}
import ml.dmlc.xgboost4j.scala.Booster
@@ -62,6 +63,22 @@ class XGBoostRanker(override val uid: String,
}
}
+ /**
+ * Repartition the dataset into numWorkers partitions if needed.
+ *
+ * @param dataset the dataset to be repartitioned
+ * @return the repartitioned dataset
+ */
+ override private[spark] def repartitionIfNeeded(dataset: Dataset[_]) = {
+ val numPartitions = dataset.rdd.getNumPartitions
+ if (getForceRepartition || getNumWorkers != numPartitions) {
+ // Please note that the output of repartitionByRange is not deterministic
+ dataset.repartitionByRange(getNumWorkers, col(getGroupCol))
+ } else {
+ dataset
+ }
+ }
+
/**
* Sort partition for Ranker issue.
*
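
Taken together with the per-partition sort that the comment above presumably introduces, the ranker's preprocessing amounts to: range-partition by the group column so no query group spans two workers, then sort within each partition so each group's rows are contiguous. A rough sketch of that combined shape, where the function and parameter names are illustrative rather than the actual private API:

import org.apache.spark.sql.{DataFrame, functions => F}

// Sketch only: the repartition-then-sort shape the ranker preprocessing relies on.
// `numWorkers` and `groupCol` are illustrative parameters, not the real member names.
def groupAwareRepartition(df: DataFrame, numWorkers: Int, groupCol: String): DataFrame =
  df.repartitionByRange(numWorkers, F.col(groupCol)) // one group -> one partition
    .sortWithinPartitions(F.col(groupCol))           // group rows become contiguous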
diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRankerSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRankerSuite.scala
index 81a770bfe327..063836538931 100644
--- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRankerSuite.scala
+++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRankerSuite.scala
@@ -151,6 +151,54 @@ class XGBoostRankerSuite extends AnyFunSuite with PerTest with TmpFolderPerSuite
}}
}
+ test("Same group must be in the same partition") {
+ val spark = ss
+ import spark.implicits._
+ val num_workers = 3
+ val df = ss.createDataFrame(sc.parallelize(Seq(
+ (0.1, Vectors.dense(1.0, 2.0, 3.0), 0),
+ (0.1, Vectors.dense(0.0, 0.0, 0.0), 0),
+ (0.1, Vectors.dense(0.0, 3.0, 0.0), 0),
+ (0.1, Vectors.dense(2.0, 0.0, 4.0), 1),
+ (0.1, Vectors.dense(0.2, 1.2, 2.0), 1),
+ (0.1, Vectors.dense(0.5, 2.2, 1.7), 1),
+ (0.1, Vectors.dense(0.5, 2.2, 1.7), 2),
+ (0.1, Vectors.dense(0.5, 2.2, 1.7), 2),
+ (0.1, Vectors.dense(0.5, 2.2, 1.7), 2)), 1)).toDF("label", "features", "group")
+
+ // The original pattern repartitions df in a round-robin manner, so a group can be split across partitions.
+ val oriRows = df.repartition(num_workers)
+ .sortWithinPartitions(df.col("group"))
+ .select("group")
+ .mapPartitions { case iter =>
+ val tmp: ArrayBuffer[Int] = ArrayBuffer.empty
+ while (iter.hasNext) {
+ val r = iter.next()
+ tmp.append(r.getInt(0))
+ }
+ Iterator.single(tmp.mkString(","))
+ }.collect()
+ assert(oriRows.length == 3)
+ assert(oriRows.contains("0,1,2"))
+
+ // The fix replaces repartition with repartitionByRange, which puts instances
+ // with the same group into the same partition.
+ val ranker = new XGBoostRanker().setGroupCol("group").setNumWorkers(num_workers)
+ val (processedDf, _) = ranker.preprocess(df)
+ val rows = processedDf
+ .select("group")
+ .mapPartitions { case iter =>
+ val tmp: ArrayBuffer[Int] = ArrayBuffer.empty
+ while (iter.hasNext) {
+ val r = iter.next()
+ tmp.append(r.getInt(0))
+ }
+ Iterator.single(tmp.mkString(","))
+ }.collect()
+
+ assert(rows.forall(Seq("0,0,0", "1,1,1", "2,2,2").contains))
+ }
+
private def runLengthEncode(input: Seq[Int]): Seq[Int] = {
if (input.isEmpty) return Seq(0)
diff --git a/jvm-packages/xgboost4j/src/native/xgboost4j-gpu.cu b/jvm-packages/xgboost4j/src/native/xgboost4j-gpu.cu
index 524e5984803d..a9798465686f 100644
--- a/jvm-packages/xgboost4j/src/native/xgboost4j-gpu.cu
+++ b/jvm-packages/xgboost4j/src/native/xgboost4j-gpu.cu
@@ -97,7 +97,7 @@ void CopyInterface(std::vector> &interface_arr,
Json{Boolean{false}}};
out["data"] = Array(std::move(j_data));
- out["shape"] = Array(std::vector{Json(Integer(interface.Shape(0)))});
+ out["shape"] = Array(std::vector{Json(Integer(interface.Shape<0>()))});
if (interface.valid.Data()) {
CopyColumnMask(interface, columns, kind, c, &mask, &out, stream);
@@ -113,7 +113,7 @@ void CopyMetaInfo(Json *p_interface, dh::device_vector *out, cudaStream_t str
CHECK_EQ(get(j_interface).size(), 1);
auto object = get