diff --git a/ci/test_python_other.sh b/ci/test_python_other.sh
index b0a03ba69cc..8994e0172f1 100755
--- a/ci/test_python_other.sh
+++ b/ci/test_python_other.sh
@@ -2,7 +2,7 @@
 # Copyright (c) 2022-2025, NVIDIA CORPORATION.
 
 # Support invoking test_python_cudf.sh outside the script directory
-cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../
+cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../ || exit
 
 # Common setup steps shared by Python test jobs
 source ./ci/test_python_common.sh test_python_other
@@ -14,10 +14,22 @@ EXITCODE=0
 trap "EXITCODE=1" ERR
 set +e
 
+# Total memory (MiB) of the first GPU only; nvidia-smi prints one line per GPU
+GPU_MEMORY=$(nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits | awk 'NR==1 {print $1}')
+GPU_MEMORY_GB=$(( ${GPU_MEMORY:-0} / 1024 ))
+
+# Fewer pytest workers below 24 GiB; a failed query counts as 0 (fewer workers)
+if [[ "${GPU_MEMORY_GB}" -lt 24 ]]; then
+  NUM_PROCESSES=10
+else
+  NUM_PROCESSES=20
+fi
+
+
 rapids-logger "pytest dask_cudf"
 ./ci/run_dask_cudf_pytests.sh \
   --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf.xml" \
-  --numprocesses=8 \
+  --numprocesses="${NUM_PROCESSES}" \
   --dist=worksteal \
   --cov-config=../.coveragerc \
   --cov=dask_cudf \
@@ -31,7 +43,7 @@ rapids-logger "pytest cudf_kafka"
 rapids-logger "pytest custreamz"
 ./ci/run_custreamz_pytests.sh \
   --junitxml="${RAPIDS_TESTS_DIR}/junit-custreamz.xml" \
-  --numprocesses=8 \
+  --numprocesses="${NUM_PROCESSES}" \
   --dist=worksteal \
   --cov-config=../.coveragerc \
   --cov=custreamz \
diff --git a/ci/test_wheel_cudf.sh b/ci/test_wheel_cudf.sh
index 606d157be05..98ba3245b23 100755
--- a/ci/test_wheel_cudf.sh
+++ b/ci/test_wheel_cudf.sh
@@ -27,12 +27,22 @@ RESULTS_DIR=${RAPIDS_TESTS_DIR:-"$(mktemp -d)"}
 RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${RESULTS_DIR}/test-results"}/
 mkdir -p "${RAPIDS_TESTS_DIR}"
 
+# Total memory (MiB) of the first GPU only; nvidia-smi prints one line per GPU
+GPU_MEMORY=$(nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits | awk 'NR==1 {print $1}')
+GPU_MEMORY_GB=$(( ${GPU_MEMORY:-0} / 1024 ))
+
+# Fewer pytest workers below 24 GiB; a failed query counts as 0 (fewer workers)
+if [[ "${GPU_MEMORY_GB}" -lt 24 ]]; then
+  NUM_PROCESSES=10
+else
+  NUM_PROCESSES=20
+fi
 
 rapids-logger "pytest pylibcudf"
 pushd python/pylibcudf/pylibcudf/tests
 python -m pytest \
   --cache-clear \
-  --numprocesses=8 \
+  --numprocesses="${NUM_PROCESSES}" \
   --dist=worksteal \
   .
 popd
@@ -42,7 +52,7 @@ pushd python/cudf/cudf/tests
 python -m pytest \
   --cache-clear \
   --junitxml="${RAPIDS_TESTS_DIR}/junit-cudf.xml" \
-  --numprocesses=8 \
+  --numprocesses="${NUM_PROCESSES}" \
   --dist=worksteal \
   .
 popd
diff --git a/ci/test_wheel_dask_cudf.sh b/ci/test_wheel_dask_cudf.sh
index 5e55fff78cd..f184cf22029 100755
--- a/ci/test_wheel_dask_cudf.sh
+++ b/ci/test_wheel_dask_cudf.sh
@@ -29,12 +29,24 @@ RESULTS_DIR=${RAPIDS_TESTS_DIR:-"$(mktemp -d)"}
 RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${RESULTS_DIR}/test-results"}/
 mkdir -p "${RAPIDS_TESTS_DIR}"
 
+# Total memory (MiB) of the first GPU only; nvidia-smi prints one line per GPU
+GPU_MEMORY=$(nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits | awk 'NR==1 {print $1}')
+GPU_MEMORY_GB=$(( ${GPU_MEMORY:-0} / 1024 ))
+
+# Fewer pytest workers below 24 GiB; a failed query counts as 0 (fewer workers)
+if [[ "${GPU_MEMORY_GB}" -lt 24 ]]; then
+  NUM_PROCESSES=10
+else
+  NUM_PROCESSES=20
+fi
+
+
 # Run tests in dask_cudf/tests and dask_cudf/io/tests
 rapids-logger "pytest dask_cudf"
 pushd python/dask_cudf/dask_cudf
 python -m pytest \
   --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf.xml" \
-  --numprocesses=8 \
+  --numprocesses="${NUM_PROCESSES}" \
   --dist=worksteal \
   .
 popd