Skip to content

Commit

Permalink
retry with pytest last failed logic
Browse files Browse the repository at this point in the history
Greatly speeds up reruns of tests, as only the previously failed tests are
rerun.

Define a pytest cache_dir for each pytest invocation to prevent interaction
between different selections of tests.

Protect against an exit code of 5, which occurs when a previous pytest
invocation had no failed tests and all tests are therefore deselected.

use eval to avoid issues with the -k and -m expansions.
  • Loading branch information
leej3 committed Apr 29, 2024
1 parent a1aadfb commit 205a2a3
Show file tree
Hide file tree
Showing 7 changed files with 69 additions and 9 deletions.
1 change: 1 addition & 0 deletions .github/workflows/hvd-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ jobs:
timeout_minutes: 25
shell: bash
command: bash tests/run_cpu_tests.sh
new_command_on_retry: USE_LAST_FAILED=1 bash tests/run_cpu_tests.sh

- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/pytorch-version-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ jobs:
timeout_minutes: 25
shell: bash
command: bash tests/run_cpu_tests.sh "not test_time_profilers"
new_command_on_retry: USE_LAST_FAILED=1 bash tests/run_cpu_tests.sh "not test_time_profilers"

# create-issue:
# runs-on: ubuntu-latest
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/tpu-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ jobs:
command: |
python -c "import torch_xla; print('torch xla version:', torch_xla.__version__)"
bash tests/run_tpu_tests.sh
new_command_on_retry: USE_LAST_FAILED=1 bash tests/run_tpu_tests.sh
env:
LD_LIBRARY_PATH: ${{ env.LD_LIBRARY_PATH }}:${{ env.Python_ROOT_DIR }}/lib
XRT_DEVICE_MAP: "CPU:0;/job:localservice/replica:0/task:0/device:XLA_CPU:0"
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/unit-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,7 @@ jobs:
timeout_minutes: 25
shell: bash
command: SKIP_DISTRIB_TESTS=${{ matrix.skip-distrib-tests }} bash tests/run_cpu_tests.sh
new_command_on_retry: USE_LAST_FAILED=1 SKIP_DISTRIB_TESTS=${{ matrix.skip-distrib-tests }} bash tests/run_cpu_tests.sh

- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
Expand Down
23 changes: 20 additions & 3 deletions tests/run_cpu_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,35 @@ set -xeu
# Choose the marker filter (-m) handed to pytest for the non-distributed run.
if [ "${SKIP_DISTRIB_TESTS:-0}" -eq "1" ]; then
# Deselect tests marked distributed, tpu or multinode_distributed.
skip_distrib_opt=(-m "not distributed and not tpu and not multinode_distributed")
else
skip_distrib_opt=(-m "")
# No marker filter: this empty array overrides the `-m ""` assignment above.
skip_distrib_opt=()
fi

MATCH_TESTS_EXPRESSION=${1:-""}

CUDA_VISIBLE_DEVICES="" pytest --tx 4*popen//python=python --cov ignite --cov-report term-missing --cov-report xml -vvv tests "${skip_distrib_opt[@]}" -k "$MATCH_TESTS_EXPRESSION"
# Exit code reported by pytest when every test ends up deselected, e.g. on a
# --last-failed rerun after a previous run that had no failures.
EXIT_CODE_ALL_TESTS_DESELECTED=5

# Give this invocation its own pytest cache directory so that its last-failed
# record does not interact with the other pytest invocations.
CACHE_DIR=.cpu-not-distrib
# printf with a quoted format avoids the unquoted `[pytest]` glob that `echo`
# would expand if a one-letter file named p, y, t, e or s existed here.
printf '[pytest]\ncache_dir=%s\n' "${CACHE_DIR}" > pytest.ini

# Build the command line as an array: the -m and -k expressions contain spaces
# and must each reach pytest as a single argument. Flattening them into a
# string and eval-ing it split them into separate words and dropped -k.
pytest_args=(
  --tx '4*popen//python=python'
  --cov ignite --cov-report term-missing --cov-report xml
  -vvv tests
  # The +-guard keeps an empty array from tripping `set -u` on bash < 4.4.
  ${skip_distrib_opt[@]+"${skip_distrib_opt[@]}"}
  -k "${MATCH_TESTS_EXPRESSION}"
)
# Only rerun previous failures when asked to and when a cache from an earlier
# run of this same invocation exists.
if [ "${USE_LAST_FAILED:-0}" -eq "1" ] && [ -d "${CACHE_DIR}" ]; then
  pytest_args=(--last-failed --last-failed-no-failures none "${pytest_args[@]}")
fi

# Capture the status instead of letting `set -e` abort, so the
# "all tests deselected" code can be tolerated.
exit_code=0
CUDA_VISIBLE_DEVICES="" pytest "${pytest_args[@]}" || exit_code=$?
if [ "${exit_code}" -eq "${EXIT_CODE_ALL_TESTS_DESELECTED}" ]; then
  echo "All tests deselected"
elif [ "${exit_code}" -ne 0 ]; then
  exit "${exit_code}"
fi

# https://pubs.opengroup.org/onlinepubs/009695399/utilities/xcu_chap02.html#tag_02_06_02
if [ "${SKIP_DISTRIB_TESTS:-0}" -eq "1" ]; then
exit 0
fi

export WORLD_SIZE=2
CUDA_VISIBLE_DEVICES="" pytest --cov ignite --cov-append --cov-report term-missing --cov-report xml --dist=each --tx $WORLD_SIZE*popen//python=python tests -m distributed -vvv -k "$MATCH_TESTS_EXPRESSION"
# Separate pytest cache directory for the distributed CPU run, so its
# last-failed record is independent of the non-distributed run.
CACHE_DIR=.cpu-distrib
# printf with a quoted format avoids the unquoted `[pytest]` glob of `echo`.
printf '[pytest]\ncache_dir=%s\n' "${CACHE_DIR}" > pytest.ini

# Array form keeps the -k expression as one argument; the string/eval form
# passed MATCH_TESTS_EXPRESSION without -k, as stray positional words.
pytest_args=(
  --cov ignite --cov-append --cov-report term-missing --cov-report xml
  --dist=each --tx "${WORLD_SIZE}*popen//python=python"
  tests -m distributed -vvv
  -k "${MATCH_TESTS_EXPRESSION}"
)
# Only rerun previous failures when asked to and when this invocation has a
# cache directory from an earlier run.
if [ "${USE_LAST_FAILED:-0}" -eq "1" ] && [ -d "${CACHE_DIR}" ]; then
  pytest_args=(--last-failed --last-failed-no-failures none "${pytest_args[@]}")
fi
# Any non-zero pytest status (including 5) fails the script here via `set -e`.
CUDA_VISIBLE_DEVICES="" pytest "${pytest_args[@]}"
unset WORLD_SIZE

rm -f pytest.ini
31 changes: 27 additions & 4 deletions tests/run_gpu_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,22 +14,45 @@ else
cuda_pattern="cuda and $MATCH_TESTS_EXPRESSION"
fi

# Will catch exit code 5 when tests are deselected from previous passing run
EXIT_CODE_ALL_TESTS_DESELECTED=5

set -xeu

pytest --cov ignite --cov-report term-missing --cov-report xml -vvv tests/ -k "$cuda_pattern"
# Separate pytest cache directory for the CUDA-selected run.
CACHE_DIR=.gpu-cuda
# printf with a quoted format avoids the unquoted `[pytest]` glob of `echo`.
printf '[pytest]\ncache_dir=%s\n' "${CACHE_DIR}" > pytest.ini

# Array form passes the -k pattern as one argument without eval, so a pattern
# containing quotes cannot break the command line.
pytest_args=(
  --cov ignite --cov-report term-missing --cov-report xml
  -vvv tests/
  -k "${cuda_pattern}"
)
# Only rerun previous failures when asked to and when this invocation has a
# cache directory from an earlier run.
if [ "${USE_LAST_FAILED:-0}" -eq "1" ] && [ -d "${CACHE_DIR}" ]; then
  pytest_args=(--last-failed --last-failed-no-failures none "${pytest_args[@]}")
fi

# CUDA_VISIBLE_DEVICES is deliberately NOT cleared here: an empty value hides
# every GPU from the process, which defeats a run that selects CUDA tests.
exit_code=0
pytest "${pytest_args[@]}" || exit_code=$?
# Tolerate the "all tests deselected" status; propagate any other failure.
if [ "${exit_code}" -eq "${EXIT_CODE_ALL_TESTS_DESELECTED:-5}" ]; then
  echo "All tests deselected"
elif [ "${exit_code}" -ne 0 ]; then
  exit "${exit_code}"
fi



# https://pubs.opengroup.org/onlinepubs/009695399/utilities/xcu_chap02.html#tag_02_06_02
if [ "${SKIP_DISTRIB_TESTS:-0}" -eq "1" ]; then
exit 0
fi

pytest --cov ignite --cov-append --cov-report term-missing --cov-report xml -vvv tests/ -m distributed -k "$MATCH_TESTS_EXPRESSION"

# Separate pytest cache directory for the single-process distributed GPU run.
CACHE_DIR=.gpu-distrib
# printf with a quoted format avoids the unquoted `[pytest]` glob of `echo`.
printf '[pytest]\ncache_dir=%s\n' "${CACHE_DIR}" > pytest.ini

# Array form passes the -k expression as one argument without eval.
pytest_args=(
  --cov ignite --cov-append --cov-report term-missing --cov-report xml
  -vvv tests/ -m distributed
  -k "${MATCH_TESTS_EXPRESSION}"
)
# Only rerun previous failures when asked to and when this invocation has a
# cache directory from an earlier run.
if [ "${USE_LAST_FAILED:-0}" -eq "1" ] && [ -d "${CACHE_DIR}" ]; then
  pytest_args=(--last-failed --last-failed-no-failures none "${pytest_args[@]}")
fi

# CUDA_VISIBLE_DEVICES is deliberately NOT cleared: an empty value would hide
# the GPUs this script exists to exercise.
exit_code=0
pytest "${pytest_args[@]}" || exit_code=$?
# Tolerate the "all tests deselected" status; propagate any other failure.
if [ "${exit_code}" -eq "${EXIT_CODE_ALL_TESTS_DESELECTED:-5}" ]; then
  echo "All tests deselected"
elif [ "${exit_code}" -ne 0 ]; then
  exit "${exit_code}"
fi

if [ ${ngpus} -gt 1 ]; then

export WORLD_SIZE=${ngpus}
pytest --cov ignite --cov-append --cov-report term-missing --cov-report xml --dist=each --tx ${WORLD_SIZE}*popen//python=python tests -m distributed -vvv -k "$MATCH_TESTS_EXPRESSION"
# Separate pytest cache directory for the multi-GPU distributed run.
CACHE_DIR=.gpu-distrib-multi
# printf with a quoted format avoids the unquoted `[pytest]` glob of `echo`.
printf '[pytest]\ncache_dir=%s\n' "${CACHE_DIR}" > pytest.ini

# Array form passes the -k expression as one argument without eval and keeps
# the `*` in the --tx spec from being treated as a glob.
pytest_args=(
  --cov ignite --cov-append --cov-report term-missing --cov-report xml
  --dist=each --tx "${WORLD_SIZE}*popen//python=python"
  tests -m distributed -vvv
  -k "${MATCH_TESTS_EXPRESSION}"
)
# Only rerun previous failures when asked to and when this invocation has a
# cache directory from an earlier run.
if [ "${USE_LAST_FAILED:-0}" -eq "1" ] && [ -d "${CACHE_DIR}" ]; then
  pytest_args=(--last-failed --last-failed-no-failures none "${pytest_args[@]}")
fi
# CUDA_VISIBLE_DEVICES is deliberately NOT cleared: an empty value would hide
# the GPUs from the workers. Any non-zero status fails the script via `set -e`.
pytest "${pytest_args[@]}"
unset WORLD_SIZE

fi
rm -f pytest.ini
20 changes: 18 additions & 2 deletions tests/run_tpu_tests.sh
Original file line number Diff line number Diff line change
@@ -1,10 +1,26 @@
#!/bin/bash
# Will catch exit code 5 when tests are deselected from previous passing run
EXIT_CODE_ALL_TESTS_DESELECTED=5

set -xeu

pytest --cov ignite --cov-report term-missing --cov-report xml tests/ -vvv -m tpu
# Separate pytest cache directory for the first TPU run.
CACHE_DIR=.tpu
# printf with a quoted format avoids the unquoted `[pytest]` glob of `echo`.
printf '[pytest]\ncache_dir=%s\n' "${CACHE_DIR}" > pytest.ini

# The arguments are fixed words, so a plain array replaces string + eval.
pytest_args=(
  --cov ignite --cov-report term-missing --cov-report xml
  tests/ -vvv -m tpu
)
# Only rerun previous failures when asked to and when this invocation has a
# cache directory from an earlier run.
if [ "${USE_LAST_FAILED:-0}" -eq "1" ] && [ -d "${CACHE_DIR}" ]; then
  pytest_args=(--last-failed --last-failed-no-failures none "${pytest_args[@]}")
fi

# Capture the status instead of letting `set -e` abort, so the
# "all tests deselected" code can be tolerated.
exit_code=0
CUDA_VISIBLE_DEVICES="" pytest "${pytest_args[@]}" || exit_code=$?
if [ "${exit_code}" -eq "${EXIT_CODE_ALL_TESTS_DESELECTED:-5}" ]; then
  echo "All tests deselected"
elif [ "${exit_code}" -ne 0 ]; then
  exit "${exit_code}"
fi


if [ -z ${NUM_TPU_WORKERS+x} ]; then
export NUM_TPU_WORKERS=1
pytest --cov ignite --cov-append --cov-report term-missing --cov-report xml tests/ -vvv -m tpu
# Separate pytest cache directory for the TPU run made with NUM_TPU_WORKERS set.
CACHE_DIR=.tpu-multi
# printf with a quoted format avoids the unquoted `[pytest]` glob of `echo`.
printf '[pytest]\ncache_dir=%s\n' "${CACHE_DIR}" > pytest.ini

# The arguments are fixed words, so a plain array replaces string + eval.
pytest_args=(
  --cov ignite --cov-append --cov-report term-missing --cov-report xml
  tests/ -vvv -m tpu
)
# Only rerun previous failures when asked to and when this invocation has a
# cache directory from an earlier run.
if [ "${USE_LAST_FAILED:-0}" -eq "1" ] && [ -d "${CACHE_DIR}" ]; then
  pytest_args=(--last-failed --last-failed-no-failures none "${pytest_args[@]}")
fi

# Capture the status instead of letting `set -e` abort, so the
# "all tests deselected" code can be tolerated.
exit_code=0
CUDA_VISIBLE_DEVICES="" pytest "${pytest_args[@]}" || exit_code=$?
if [ "${exit_code}" -eq "${EXIT_CODE_ALL_TESTS_DESELECTED:-5}" ]; then
  echo "All tests deselected"
elif [ "${exit_code}" -ne 0 ]; then
  exit "${exit_code}"
fi
fi
rm -f pytest.ini

0 comments on commit 205a2a3

Please sign in to comment.