From 205a2a35b833bf2337c8e8d04556d9a8290c5bbc Mon Sep 17 00:00:00 2001 From: leej3 <johnleenimh@gmail.com> Date: Tue, 2 Apr 2024 14:17:35 +0000 Subject: [PATCH] retry with pytest last failed logic greatly speeds up reruns of tests as only previously failed tests are rerun. define pytest cachedir for each pytest invocation to prevent interaction between different selections of tests. protect against exit code of 5 when a previous pytest invocation had no failed tests which results in all tests being deselected. use eval to avoid issues with the -k and -m expansions. --- .github/workflows/hvd-tests.yml | 1 + .github/workflows/pytorch-version-tests.yml | 1 + .github/workflows/tpu-tests.yml | 1 + .github/workflows/unit-tests.yml | 1 + tests/run_cpu_tests.sh | 23 +++++++++++++-- tests/run_gpu_tests.sh | 31 ++++++++++++++++++--- tests/run_tpu_tests.sh | 20 +++++++++++-- 7 files changed, 69 insertions(+), 9 deletions(-) diff --git a/.github/workflows/hvd-tests.yml b/.github/workflows/hvd-tests.yml index e15d0981aaa8..3f6ba7f24bd7 100644 --- a/.github/workflows/hvd-tests.yml +++ b/.github/workflows/hvd-tests.yml @@ -81,6 +81,7 @@ jobs: timeout_minutes: 25 shell: bash command: bash tests/run_cpu_tests.sh + new_command_on_retry: USE_LAST_FAILED=1 bash tests/run_cpu_tests.sh - name: Upload coverage to Codecov uses: codecov/codecov-action@v3 diff --git a/.github/workflows/pytorch-version-tests.yml b/.github/workflows/pytorch-version-tests.yml index 7716cb63611f..62020d0aba12 100644 --- a/.github/workflows/pytorch-version-tests.yml +++ b/.github/workflows/pytorch-version-tests.yml @@ -98,6 +98,7 @@ jobs: timeout_minutes: 25 shell: bash command: bash tests/run_cpu_tests.sh "not test_time_profilers" + new_command_on_retry: USE_LAST_FAILED=1 bash tests/run_cpu_tests.sh "not test_time_profilers" # create-issue: # runs-on: ubuntu-latest diff --git a/.github/workflows/tpu-tests.yml b/.github/workflows/tpu-tests.yml index 9fa2da7974e5..a7a52949114c 100644 --- 
a/.github/workflows/tpu-tests.yml +++ b/.github/workflows/tpu-tests.yml @@ -97,6 +97,7 @@ jobs: command: | python -c "import torch_xla; print('torch xla version:', torch_xla.__version__)" bash tests/run_tpu_tests.sh + new_command_on_retry: USE_LAST_FAILED=1 bash tests/run_tpu_tests.sh env: LD_LIBRARY_PATH: ${{ env.LD_LIBRARY_PATH }}:${{ env.Python_ROOT_DIR }}/lib XRT_DEVICE_MAP: "CPU:0;/job:localservice/replica:0/task:0/device:XLA_CPU:0" diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 9e12fd84acc0..fd91bc18d4a7 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -126,6 +126,7 @@ jobs: timeout_minutes: 25 shell: bash command: SKIP_DISTRIB_TESTS=${{ matrix.skip-distrib-tests }} bash tests/run_cpu_tests.sh + new_command_on_retry: USE_LAST_FAILED=1 SKIP_DISTRIB_TESTS=${{ matrix.skip-distrib-tests }} bash tests/run_cpu_tests.sh - name: Upload coverage to Codecov uses: codecov/codecov-action@v3 diff --git a/tests/run_cpu_tests.sh b/tests/run_cpu_tests.sh index 2297be94219d..35a9743a712e 100644 --- a/tests/run_cpu_tests.sh +++ b/tests/run_cpu_tests.sh @@ -5,12 +5,21 @@ set -xeu if [ "${SKIP_DISTRIB_TESTS:-0}" -eq "1" ]; then skip_distrib_opt=(-m "not distributed and not tpu and not multinode_distributed") else - skip_distrib_opt=(-m "") + skip_distrib_opt=() fi MATCH_TESTS_EXPRESSION=${1:-""} -CUDA_VISIBLE_DEVICES="" pytest --tx 4*popen//python=python --cov ignite --cov-report term-missing --cov-report xml -vvv tests "${skip_distrib_opt[@]}" -k "$MATCH_TESTS_EXPRESSION" +# Will catch exit code 5 when tests are deselected from previous passing run +EXIT_CODE_ALL_TESTS_DESELECTED=5 + +CACHE_DIR=.cpu-not-distrib +echo [pytest] > pytest.ini ; echo "cache_dir=${CACHE_DIR}" >> pytest.ini +PYTEST_ARGS="--tx 4*popen//python=python --cov ignite --cov-report term-missing --cov-report xml -vvv tests ${skip_distrib_opt[@]} ${MATCH_TESTS_EXPRESSION}" +if [ "${USE_LAST_FAILED:-0}" -eq "1" ] && [ -d 
"${CACHE_DIR}" ]; then + PYTEST_ARGS="--last-failed --last-failed-no-failures none ${PYTEST_ARGS}" +fi +CUDA_VISIBLE_DEVICES="" eval "pytest ${PYTEST_ARGS}" || { exit_code=$?; if [ "$exit_code" -eq 5 ]; then echo "All tests deselected"; else exit $exit_code; fi;} # https://pubs.opengroup.org/onlinepubs/009695399/utilities/xcu_chap02.html#tag_02_06_02 if [ "${SKIP_DISTRIB_TESTS:-0}" -eq "1" ]; then @@ -18,5 +27,13 @@ if [ "${SKIP_DISTRIB_TESTS:-0}" -eq "1" ]; then fi export WORLD_SIZE=2 -CUDA_VISIBLE_DEVICES="" pytest --cov ignite --cov-append --cov-report term-missing --cov-report xml --dist=each --tx $WORLD_SIZE*popen//python=python tests -m distributed -vvv -k "$MATCH_TESTS_EXPRESSION" +CACHE_DIR=.cpu-distrib +echo [pytest] > pytest.ini ; echo "cache_dir=${CACHE_DIR}" >> pytest.ini +PYTEST_ARGS="--cov ignite --cov-append --cov-report term-missing --cov-report xml --dist=each --tx ${WORLD_SIZE}*popen//python=python tests -m distributed -vvv ${MATCH_TESTS_EXPRESSION}" +if [ "${USE_LAST_FAILED:-0}" -eq "1" ] && [ -d "${CACHE_DIR}" ]; then + PYTEST_ARGS="--last-failed --last-failed-no-failures none ${PYTEST_ARGS}" +fi +CUDA_VISIBLE_DEVICES="" eval "pytest ${PYTEST_ARGS}" unset WORLD_SIZE + +rm -f pytest.ini diff --git a/tests/run_gpu_tests.sh b/tests/run_gpu_tests.sh index 3146443a531d..84d659e9533a 100644 --- a/tests/run_gpu_tests.sh +++ b/tests/run_gpu_tests.sh @@ -14,22 +14,45 @@ else cuda_pattern="cuda and $MATCH_TESTS_EXPRESSION" fi +# Will catch exit code 5 when tests are deselected from previous passing run +EXIT_CODE_ALL_TESTS_DESELECTED=5 + set -xeu -pytest --cov ignite --cov-report term-missing --cov-report xml -vvv tests/ -k "$cuda_pattern" +CACHE_DIR=.gpu-cuda +echo [pytest] > pytest.ini ; echo "cache_dir=${CACHE_DIR}" >> pytest.ini +PYTEST_ARGS="--cov ignite --cov-report term-missing --cov-report xml -vvv tests/ -k '${cuda_pattern}'" +if [ "${USE_LAST_FAILED:-0}" -eq "1" ] && [ -d "${CACHE_DIR}" ]; then + PYTEST_ARGS="--last-failed 
--last-failed-no-failures none ${PYTEST_ARGS}" +fi +CUDA_VISIBLE_DEVICES="" eval "pytest ${PYTEST_ARGS}" || { exit_code=$?; if [ "$exit_code" -eq 5 ]; then echo "All tests deselected"; else exit $exit_code; fi;} + + # https://pubs.opengroup.org/onlinepubs/009695399/utilities/xcu_chap02.html#tag_02_06_02 if [ "${SKIP_DISTRIB_TESTS:-0}" -eq "1" ]; then exit 0 fi -pytest --cov ignite --cov-append --cov-report term-missing --cov-report xml -vvv tests/ -m distributed -k "$MATCH_TESTS_EXPRESSION" - +CACHE_DIR=.gpu-distrib +echo [pytest] > pytest.ini ; echo "cache_dir=${CACHE_DIR}" >> pytest.ini +PYTEST_ARGS="--cov ignite --cov-append --cov-report term-missing --cov-report xml -vvv tests/ -m distributed -k '${MATCH_TESTS_EXPRESSION}'" +if [ "${USE_LAST_FAILED:-0}" -eq "1" ] && [ -d "${CACHE_DIR}" ]; then + PYTEST_ARGS="--last-failed --last-failed-no-failures none ${PYTEST_ARGS}" +fi +CUDA_VISIBLE_DEVICES="" eval "pytest ${PYTEST_ARGS}" || { exit_code=$?; if [ "$exit_code" -eq 5 ]; then echo "All tests deselected"; else exit $exit_code; fi;} if [ ${ngpus} -gt 1 ]; then export WORLD_SIZE=${ngpus} - pytest --cov ignite --cov-append --cov-report term-missing --cov-report xml --dist=each --tx ${WORLD_SIZE}*popen//python=python tests -m distributed -vvv -k "$MATCH_TESTS_EXPRESSION" + CACHE_DIR=.gpu-distrib-multi + echo [pytest] > pytest.ini ; echo "cache_dir=${CACHE_DIR}" >> pytest.ini + PYTEST_ARGS="--cov ignite --cov-append --cov-report term-missing --cov-report xml --dist=each --tx ${WORLD_SIZE}*popen//python=python tests -m distributed -vvv -k '${MATCH_TESTS_EXPRESSION}'" + if [ "${USE_LAST_FAILED:-0}" -eq "1" ] && [ -d "${CACHE_DIR}" ]; then + PYTEST_ARGS="--last-failed --last-failed-no-failures none ${PYTEST_ARGS}" + fi + CUDA_VISIBLE_DEVICES="" eval "pytest ${PYTEST_ARGS}" unset WORLD_SIZE fi +rm -f pytest.ini diff --git a/tests/run_tpu_tests.sh b/tests/run_tpu_tests.sh index 0877de858aed..c4aa3d86e62a 100644 --- a/tests/run_tpu_tests.sh +++ b/tests/run_tpu_tests.sh @@ 
-1,10 +1,26 @@ #!/bin/bash +# Will catch exit code 5 when tests are deselected from previous passing run +EXIT_CODE_ALL_TESTS_DESELECTED=5 set -xeu -pytest --cov ignite --cov-report term-missing --cov-report xml tests/ -vvv -m tpu +CACHE_DIR=.tpu +echo [pytest] > pytest.ini ; echo "cache_dir=${CACHE_DIR}" >> pytest.ini +PYTEST_ARGS="--cov ignite --cov-report term-missing --cov-report xml tests/ -vvv -m tpu" +if [ "${USE_LAST_FAILED:-0}" -eq "1" ] && [ -d "${CACHE_DIR}" ]; then + PYTEST_ARGS="--last-failed --last-failed-no-failures none ${PYTEST_ARGS}" +fi +CUDA_VISIBLE_DEVICES="" eval "pytest ${PYTEST_ARGS}" || { exit_code=$?; if [ "$exit_code" -eq 5 ]; then echo "All tests deselected"; else exit $exit_code; fi;} + if [ -z ${NUM_TPU_WORKERS+x} ]; then export NUM_TPU_WORKERS=1 - pytest --cov ignite --cov-append --cov-report term-missing --cov-report xml tests/ -vvv -m tpu + CACHE_DIR=.tpu-multi + echo [pytest] > pytest.ini ; echo "cache_dir=${CACHE_DIR}" >> pytest.ini + PYTEST_ARGS="--cov ignite --cov-append --cov-report term-missing --cov-report xml tests/ -vvv -m tpu" + if [ "${USE_LAST_FAILED:-0}" -eq "1" ] && [ -d "${CACHE_DIR}" ]; then + PYTEST_ARGS="--last-failed --last-failed-no-failures none ${PYTEST_ARGS}" + fi + CUDA_VISIBLE_DEVICES="" eval "pytest ${PYTEST_ARGS}" || { exit_code=$?; if [ "$exit_code" -eq 5 ]; then echo "All tests deselected"; else exit $exit_code; fi;} fi +rm -f pytest.ini