From 205a2a35b833bf2337c8e8d04556d9a8290c5bbc Mon Sep 17 00:00:00 2001 From: leej3 <johnleenimh@gmail.com> Date: Tue, 2 Apr 2024 14:17:35 +0000 Subject: [PATCH] retry with pytest last failed logic greatly speeds up reruns of tests as only previously failed tests are rerun. define pytest cachedir for each pytest invocation to prevent interaction between different selections of tests. protect against exit code of 5 when a previous pytest invocation had no failed tests which results in all tests being deselected. use eval to avoid issues with the -k and -m expansions. --- .github/workflows/hvd-tests.yml | 1 + .github/workflows/pytorch-version-tests.yml | 1 + .github/workflows/tpu-tests.yml | 1 + .github/workflows/unit-tests.yml | 1 + tests/run_cpu_tests.sh | 23 +++++++++++++-- tests/run_gpu_tests.sh | 31 ++++++++++++++++++--- tests/run_tpu_tests.sh | 20 +++++++++++-- 7 files changed, 69 insertions(+), 9 deletions(-) diff --git a/.github/workflows/hvd-tests.yml b/.github/workflows/hvd-tests.yml index e15d0981aaa8..3f6ba7f24bd7 100644 --- a/.github/workflows/hvd-tests.yml +++ b/.github/workflows/hvd-tests.yml @@ -81,6 +81,7 @@ jobs: timeout_minutes: 25 shell: bash command: bash tests/run_cpu_tests.sh + new_command_on_retry: USE_LAST_FAILED=1 bash tests/run_cpu_tests.sh - name: Upload coverage to Codecov uses: codecov/codecov-action@v3 diff --git a/.github/workflows/pytorch-version-tests.yml b/.github/workflows/pytorch-version-tests.yml index 7716cb63611f..62020d0aba12 100644 --- a/.github/workflows/pytorch-version-tests.yml +++ b/.github/workflows/pytorch-version-tests.yml @@ -98,6 +98,7 @@ jobs: timeout_minutes: 25 shell: bash command: bash tests/run_cpu_tests.sh "not test_time_profilers" + new_command_on_retry: USE_LAST_FAILED=1 bash tests/run_cpu_tests.sh "not test_time_profilers" # create-issue: # runs-on: ubuntu-latest diff --git a/.github/workflows/tpu-tests.yml b/.github/workflows/tpu-tests.yml index 9fa2da7974e5..a7a52949114c 100644 --- 
a/.github/workflows/tpu-tests.yml +++ b/.github/workflows/tpu-tests.yml @@ -97,6 +97,7 @@ jobs: command: | python -c "import torch_xla; print('torch xla version:', torch_xla.__version__)" bash tests/run_tpu_tests.sh + new_command_on_retry: USE_LAST_FAILED=1 bash tests/run_tpu_tests.sh env: LD_LIBRARY_PATH: ${{ env.LD_LIBRARY_PATH }}:${{ env.Python_ROOT_DIR }}/lib XRT_DEVICE_MAP: "CPU:0;/job:localservice/replica:0/task:0/device:XLA_CPU:0" diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 9e12fd84acc0..fd91bc18d4a7 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -126,6 +126,7 @@ jobs: timeout_minutes: 25 shell: bash command: SKIP_DISTRIB_TESTS=${{ matrix.skip-distrib-tests }} bash tests/run_cpu_tests.sh + new_command_on_retry: USE_LAST_FAILED=1 SKIP_DISTRIB_TESTS=${{ matrix.skip-distrib-tests }} bash tests/run_cpu_tests.sh - name: Upload coverage to Codecov uses: codecov/codecov-action@v3 diff --git a/tests/run_cpu_tests.sh b/tests/run_cpu_tests.sh index 2297be94219d..35a9743a712e 100644 --- a/tests/run_cpu_tests.sh +++ b/tests/run_cpu_tests.sh @@ -5,12 +5,21 @@ set -xeu if [ "${SKIP_DISTRIB_TESTS:-0}" -eq "1" ]; then skip_distrib_opt=(-m "not distributed and not tpu and not multinode_distributed") else - skip_distrib_opt=(-m "") + skip_distrib_opt=() fi MATCH_TESTS_EXPRESSION=${1:-""} -CUDA_VISIBLE_DEVICES="" pytest --tx 4*popen//python=python --cov ignite --cov-report term-missing --cov-report xml -vvv tests "${skip_distrib_opt[@]}" -k "$MATCH_TESTS_EXPRESSION" +# Will catch exit code 5 when tests are deselected from previous passing run +EXIT_CODE_ALL_TESTS_DESELECTED=5 + +CACHE_DIR=.cpu-not-distrib +echo [pytest] > pytest.ini ; echo "cache_dir=${CACHE_DIR}" >> pytest.ini +PYTEST_ARGS="--tx 4*popen//python=python --cov ignite --cov-report term-missing --cov-report xml -vvv tests ${skip_distrib_opt[@]} ${MATCH_TESTS_EXPRESSION}" +if [ "${USE_LAST_FAILED:-0}" -eq "1" ] && [ -d 
"${CACHE_DIR}" ]; then + PYTEST_ARGS="--last-failed --last-failed-no-failures none ${PYTEST_ARGS}" +fi +CUDA_VISIBLE_DEVICES="" eval "pytest ${PYTEST_ARGS}" || { exit_code=$?; if [ "$exit_code" -eq 5 ]; then echo "All tests deselected"; else exit $exit_code; fi;} # https://pubs.opengroup.org/onlinepubs/009695399/utilities/xcu_chap02.html#tag_02_06_02 if [ "${SKIP_DISTRIB_TESTS:-0}" -eq "1" ]; then @@ -18,5 +27,13 @@ if [ "${SKIP_DISTRIB_TESTS:-0}" -eq "1" ]; then fi export WORLD_SIZE=2 -CUDA_VISIBLE_DEVICES="" pytest --cov ignite --cov-append --cov-report term-missing --cov-report xml --dist=each --tx $WORLD_SIZE*popen//python=python tests -m distributed -vvv -k "$MATCH_TESTS_EXPRESSION" +CACHE_DIR=.cpu-distrib +echo [pytest] > pytest.ini ; echo "cache_dir=${CACHE_DIR}" >> pytest.ini +PYTEST_ARGS="--cov ignite --cov-append --cov-report term-missing --cov-report xml --dist=each --tx ${WORLD_SIZE}*popen//python=python tests -m distributed -vvv ${MATCH_TESTS_EXPRESSION}" +if [ "${USE_LAST_FAILED:-0}" -eq "1" ] && [ -d "${CACHE_DIR}" ]; then + PYTEST_ARGS="--last-failed --last-failed-no-failures none ${PYTEST_ARGS}" +fi +CUDA_VISIBLE_DEVICES="" eval "pytest ${PYTEST_ARGS}" unset WORLD_SIZE + +rm -f pytest.ini diff --git a/tests/run_gpu_tests.sh b/tests/run_gpu_tests.sh index 3146443a531d..84d659e9533a 100644 --- a/tests/run_gpu_tests.sh +++ b/tests/run_gpu_tests.sh @@ -14,22 +14,45 @@ else cuda_pattern="cuda and $MATCH_TESTS_EXPRESSION" fi +# Will catch exit code 5 when tests are deselected from previous passing run +EXIT_CODE_ALL_TESTS_DESELECTED=5 + set -xeu -pytest --cov ignite --cov-report term-missing --cov-report xml -vvv tests/ -k "$cuda_pattern" +CACHE_DIR=.gpu-cuda +echo [pytest] > pytest.ini ; echo "cache_dir=${CACHE_DIR}" >> pytest.ini +PYTEST_ARGS="--cov ignite --cov-report term-missing --cov-report xml -vvv tests/ -k '${cuda_pattern}'" +if [ "${USE_LAST_FAILED:-0}" -eq "1" ] && [ -d "${CACHE_DIR}" ]; then + PYTEST_ARGS="--last-failed 
--last-failed-no-failures none ${PYTEST_ARGS}" +fi +CUDA_VISIBLE_DEVICES="" eval "pytest ${PYTEST_ARGS}" || { exit_code=$?; if [ "$exit_code" -eq 5 ]; then echo "All tests deselected"; else exit $exit_code; fi;} + + # https://pubs.opengroup.org/onlinepubs/009695399/utilities/xcu_chap02.html#tag_02_06_02 if [ "${SKIP_DISTRIB_TESTS:-0}" -eq "1" ]; then exit 0 fi -pytest --cov ignite --cov-append --cov-report term-missing --cov-report xml -vvv tests/ -m distributed -k "$MATCH_TESTS_EXPRESSION" - +CACHE_DIR=.gpu-distrib +echo [pytest] > pytest.ini ; echo "cache_dir=${CACHE_DIR}" >> pytest.ini +PYTEST_ARGS="--cov ignite --cov-append --cov-report term-missing --cov-report xml -vvv tests/ -m distributed -k '${MATCH_TESTS_EXPRESSION}'" +if [ "${USE_LAST_FAILED:-0}" -eq "1" ] && [ -d "${CACHE_DIR}" ]; then + PYTEST_ARGS="--last-failed --last-failed-no-failures none ${PYTEST_ARGS}" +fi +CUDA_VISIBLE_DEVICES="" eval "pytest ${PYTEST_ARGS}" || { exit_code=$?; if [ "$exit_code" -eq 5 ]; then echo "All tests deselected"; else exit $exit_code; fi;} if [ ${ngpus} -gt 1 ]; then export WORLD_SIZE=${ngpus} - pytest --cov ignite --cov-append --cov-report term-missing --cov-report xml --dist=each --tx ${WORLD_SIZE}*popen//python=python tests -m distributed -vvv -k "$MATCH_TESTS_EXPRESSION" + CACHE_DIR=.gpu-distrib-multi + echo [pytest] > pytest.ini ; echo "cache_dir=${CACHE_DIR}" >> pytest.ini + PYTEST_ARGS="--cov ignite --cov-append --cov-report term-missing --cov-report xml --dist=each --tx ${WORLD_SIZE}*popen//python=python tests -m distributed -vvv -k '${MATCH_TESTS_EXPRESSION}'" + if [ "${USE_LAST_FAILED:-0}" -eq "1" ] && [ -d "${CACHE_DIR}" ]; then + PYTEST_ARGS="--last-failed --last-failed-no-failures none ${PYTEST_ARGS}" + fi + CUDA_VISIBLE_DEVICES="" eval "pytest ${PYTEST_ARGS}" unset WORLD_SIZE fi +rm -f pytest.ini diff --git a/tests/run_tpu_tests.sh b/tests/run_tpu_tests.sh index 0877de858aed..c4aa3d86e62a 100644 --- a/tests/run_tpu_tests.sh +++ b/tests/run_tpu_tests.sh @@ 
-1,10 +1,26 @@ #!/bin/bash +# Will catch exit code 5 when tests are deselected from previous passing run +EXIT_CODE_ALL_TESTS_DESELECTED=5 set -xeu -pytest --cov ignite --cov-report term-missing --cov-report xml tests/ -vvv -m tpu +CACHE_DIR=.tpu +echo [pytest] > pytest.ini ; echo "cache_dir=${CACHE_DIR}" >> pytest.ini +PYTEST_ARGS="--cov ignite --cov-report term-missing --cov-report xml tests/ -vvv -m tpu" +if [ "${USE_LAST_FAILED:-0}" -eq "1" ] && [ -d "${CACHE_DIR}" ]; then + PYTEST_ARGS="--last-failed --last-failed-no-failures none ${PYTEST_ARGS}" +fi +CUDA_VISIBLE_DEVICES="" eval "pytest ${PYTEST_ARGS}" || { exit_code=$?; if [ "$exit_code" -eq 5 ]; then echo "All tests deselected"; else exit $exit_code; fi;} + if [ -z ${NUM_TPU_WORKERS+x} ]; then export NUM_TPU_WORKERS=1 - pytest --cov ignite --cov-append --cov-report term-missing --cov-report xml tests/ -vvv -m tpu + CACHE_DIR=.tpu-multi + echo [pytest] > pytest.ini ; echo "cache_dir=${CACHE_DIR}" >> pytest.ini + PYTEST_ARGS="--cov ignite --cov-append --cov-report term-missing --cov-report xml tests/ -vvv -m tpu" + if [ "${USE_LAST_FAILED:-0}" -eq "1" ] && [ -d "${CACHE_DIR}" ]; then + PYTEST_ARGS="--last-failed --last-failed-no-failures none ${PYTEST_ARGS}" + fi + CUDA_VISIBLE_DEVICES="" eval "pytest ${PYTEST_ARGS}" || { exit_code=$?; if [ "$exit_code" -eq 5 ]; then echo "All tests deselected"; else exit $exit_code; fi;} fi +rm -f pytest.ini