Retry tests #3229

Merged: 13 commits, May 23, 2024
8 changes: 4 additions & 4 deletions .github/workflows/gpu-hvd-tests.yml
@@ -22,7 +22,7 @@ jobs:
gpu-hvd-tests:
strategy:
matrix:
- pytorch-channel: [pytorch, ]
+ pytorch-channel: [pytorch]
fail-fast: false
env:
DOCKER_IMAGE: "pytorch/conda-builder:cuda12.1"
@@ -128,8 +128,8 @@ jobs:
# Can't build Horovod with recent pytorch due to pytorch required C++17 standard
# and horovod is still using C++14
# HOROVOD_GPU_OPERATIONS=NCCL HOROVOD_WITH_PYTORCH=1 pip install horovod[pytorch]
- # Using a similar hack as described here:
- # https://github.com/horovod/horovod/issues/3941#issuecomment-1732505345
+ # Using a similar hack as described here:
+ # https://github.com/horovod/horovod/issues/3941#issuecomment-1732505345
git clone --recursive https://github.com/horovod/horovod.git /horovod
cd /horovod
sed -i "s/CMAKE_CXX_STANDARD 14/CMAKE_CXX_STANDARD 17/g" CMakeLists.txt
@@ -152,7 +152,7 @@ jobs:
set -xe

bash tests/run_gpu_tests.sh 2 hvd
- CUDA_VISIBLE_DEVICES="" pytest --cov ignite --cov-append --cov-report term-missing --cov-report xml -vvv tests/ -m distributed -k hvd
+ CUDA_VISIBLE_DEVICES="" pytest --cov ignite --cov-append --cov-report term-missing --cov-report xml -vvv tests/ignite -m distributed -k hvd

EOF
)
21 changes: 8 additions & 13 deletions .github/workflows/gpu-tests.yml
@@ -29,7 +29,7 @@ jobs:
REPOSITORY: ${{ github.repository }}
PR_NUMBER: ${{ github.event.pull_request.number }}
runs-on: linux.8xlarge.nvidia.gpu
- timeout-minutes: 45
+ timeout-minutes: 85

steps:
- name: Clean workspace
@@ -121,18 +121,13 @@ jobs:

- name: Run GPU Unit Tests
continue-on-error: false
- run: |
-
-   script=$(cat << EOF
-
-   set -xe
-
-   bash tests/run_gpu_tests.sh 2
-
-   EOF
-   )
-
-   docker exec -t pthd /bin/bash -c "${script}"
+ uses: nick-fields/retry@v3
+ with:
+   max_attempts: 5
+   timeout_minutes: 25
+   shell: bash
+   command: docker exec -t pthd /bin/bash -xec 'tests/run_gpu_tests.sh 2'
+   new_command_on_retry: docker exec -e USE_LAST_FAILED=1 -t pthd /bin/bash -xec 'tests/run_gpu_tests.sh 2'

- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
10 changes: 7 additions & 3 deletions .github/workflows/hvd-tests.yml
@@ -75,9 +75,13 @@ jobs:
target_dir: /tmp

- name: Run Tests
- shell: bash -l {0}
- run: |
-   bash tests/run_cpu_tests.sh
+ uses: nick-fields/retry@v3
+ with:
+   max_attempts: 5
+   timeout_minutes: 15
+   shell: bash
+   command: bash tests/run_cpu_tests.sh
+   new_command_on_retry: USE_LAST_FAILED=1 bash tests/run_cpu_tests.sh

- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
16 changes: 10 additions & 6 deletions .github/workflows/pytorch-version-tests.yml
@@ -10,15 +10,15 @@ on:
jobs:
build:
runs-on: ubuntu-latest
- timeout-minutes: 45
+ timeout-minutes: 85
strategy:
max-parallel: 5
fail-fast: false
matrix:
python-version: [3.8, 3.9, "3.10"]
pytorch-version:
[2.1.2, 2.0.1, 1.13.1, 1.12.1, 1.11.0, 1.10.0, 1.9.1, 1.8.1, 1.5.1]
- exclude:
+ exclude:
- pytorch-version: 1.5.1
python-version: 3.9
- pytorch-version: 1.5.1
@@ -78,7 +78,7 @@ jobs:
pip install -r requirements-dev.txt
python setup.py install

- # pytorch>=1.9.0,<1.11.0 is using "from setuptools import distutils; distutils.version.LooseVersion" anti-pattern
+ # pytorch>=1.9.0,<1.11.0 is using "from setuptools import distutils; distutils.version.LooseVersion" anti-pattern
# which raises the error: AttributeError: module 'distutils' has no attribute 'version' for setuptools>59
bad_pth_version=$(python -c "import torch; print('.'.join(torch.__version__.split('.')[:2]) in ['1.9', '1.10'])")
if [ "${bad_pth_version}" == "True" ]; then
@@ -92,9 +92,13 @@
target_dir: /tmp

- name: Run Tests
- shell: bash -l {0}
- run: |
-   bash tests/run_cpu_tests.sh "not test_time_profilers"
+ uses: nick-fields/retry@v3
+ with:
+   max_attempts: 5
+   timeout_minutes: 15
+   shell: bash
+   command: bash tests/run_cpu_tests.sh "not test_time_profilers"
+   new_command_on_retry: USE_LAST_FAILED=1 bash tests/run_cpu_tests.sh "not test_time_profilers"

# create-issue:
# runs-on: ubuntu-latest
20 changes: 13 additions & 7 deletions .github/workflows/tpu-tests.yml
@@ -89,13 +89,19 @@ jobs:
target_dir: /tmp

- name: Run Tests
- run: |
-   export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${Python_ROOT_DIR}/lib
-   export XRT_DEVICE_MAP="CPU:0;/job:localservice/replica:0/task:0/device:XLA_CPU:0"
-   export XRT_WORKERS="localservice:0;grpc://localhost:40934"
-
-   python -c "import torch_xla; print('torch xla version:', torch_xla.__version__)"
-   bash tests/run_tpu_tests.sh
+ uses: nick-fields/retry@v3
+ with:
+   max_attempts: 5
+   timeout_minutes: 25
+   shell: bash
+   command: |
+     python -c "import torch_xla; print('torch xla version:', torch_xla.__version__)"
+     bash tests/run_tpu_tests.sh
+   new_command_on_retry: USE_LAST_FAILED=1 bash tests/run_tpu_tests.sh
+ env:
+   LD_LIBRARY_PATH: ${{ env.LD_LIBRARY_PATH }}:${{ env.Python_ROOT_DIR }}/lib
+   XRT_DEVICE_MAP: "CPU:0;/job:localservice/replica:0/task:0/device:XLA_CPU:0"
+   XRT_WORKERS: "localservice:0;grpc://localhost:40934"

- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
15 changes: 10 additions & 5 deletions .github/workflows/unit-tests.yml
@@ -31,7 +31,7 @@ concurrency:
jobs:
cpu-tests:
runs-on: ${{ matrix.os }}
- timeout-minutes: 45
+ timeout-minutes: 85
defaults:
run:
shell: bash
@@ -40,7 +40,7 @@ jobs:
fail-fast: false
matrix:
os: [ubuntu-latest]
python-version: ["3.8", "3.9", "3.10", "3.11","3.12"]
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
pytorch-channel: [pytorch, pytorch-nightly]
include:
# includes a single build on windows
@@ -102,7 +102,7 @@

- name: Run Mypy
# https://github.com/pytorch/ignite/pull/2780
- #
+ #
if: ${{ matrix.os == 'ubuntu-latest' && matrix.pytorch-channel == 'pytorch-nightly'}}
run: |
bash ./tests/run_code_style.sh mypy
@@ -120,8 +120,13 @@
cp -R /tmp/MNIST .

- name: Run Tests
- run: |
-   SKIP_DISTRIB_TESTS=${{ matrix.skip-distrib-tests }} bash tests/run_cpu_tests.sh
+ uses: nick-fields/retry@v3
+ with:
+   max_attempts: 5
+   timeout_minutes: 15
+   shell: bash
+   command: SKIP_DISTRIB_TESTS=${{ matrix.skip-distrib-tests }} bash tests/run_cpu_tests.sh
+   new_command_on_retry: USE_LAST_FAILED=1 SKIP_DISTRIB_TESTS=${{ matrix.skip-distrib-tests }} bash tests/run_cpu_tests.sh

- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
102 changes: 102 additions & 0 deletions tests/common-test-functionality.sh
@@ -0,0 +1,102 @@
#!/bin/bash
Collaborator:
I wonder whether it would be simpler to write this script in Python for later maintenance, instead of a bash script, and how much effort that would take.
If you think this is feasible we can do that in a follow-up PR.

Collaborator (author):
I think it would be easy enough: largely just providing a simple CLI, setting environment variables, and assembling commands to run as a subprocess. A few hours probably, perhaps a few more to iron out issues and add some tests for it.

Collaborator:
OK, sounds good, let's make a Python script instead of this bash script in a follow-up PR.


# Will catch exit code 5 when tests are deselected from previous passing run
# (relevant for --last-failed-no-failures none)
last_failed_no_failures_code=5

# functions shared across test files
run_tests() {
# Set defaults
local core_args="-vvv tests/ignite"
local cache_dir=".unknown-cache"
local skip_distrib_tests=1
local match_tests_expression=""
local trap_deselected_exit_code=1
local use_last_failed=0
local use_coverage=0
local world_size=0
# Always clean up pytest.ini
trap 'rm -f pytest.ini' RETURN
# Parse arguments
while [[ $# -gt 0 ]]
do
key="$1"
case $key in
--core_args)
core_args="$2"
shift
shift
;;
--cache_dir)
cache_dir="$2"
shift
shift
;;
--skip_distrib_tests)
skip_distrib_tests="$2"
shift
shift
;;
--match_tests_expression)
match_tests_expression="$2"
shift
shift
;;
--trap_deselected_exit_code)
trap_deselected_exit_code="$2"
shift
shift
;;
--use_last_failed)
use_last_failed="$2"
shift
shift
;;
--use_coverage)
use_coverage="$2"
shift
shift
;;
--world_size)
world_size="$2"
shift
shift
;;
*)
echo "Error: Unknown argument $key"
exit 1
shift
;;
esac
done

if [ "${skip_distrib_tests}" -eq "1" ]; then
# can be overwritten by core_args
skip_distrib_opt="-m 'not distributed and not tpu and not multinode_distributed'"
else
skip_distrib_opt=""
fi


echo [pytest] > pytest.ini ; echo "cache_dir=${cache_dir}" >> pytest.ini

# Assemble options for the pytest command
pytest_args="${skip_distrib_opt} ${core_args} --treat-unrun-as-failed -k '${match_tests_expression}'"
if [ "${use_last_failed:-0}" -eq "1" ] && [ -d "${cache_dir}" ]; then
pytest_args="--last-failed --last-failed-no-failures none ${pytest_args}"
fi
if [ "${use_coverage}" -eq "1" ]; then
pytest_args="--cov ignite --cov-append --cov-report term-missing --cov-report xml ${pytest_args}"
fi
if [ ! "${world_size}" -eq "0" ]; then
export WORLD_SIZE="${world_size}"
pytest_args="--dist=each --tx ${WORLD_SIZE}*popen//python=python ${pytest_args}"
fi

# Run the command
if [ "$trap_deselected_exit_code" -eq "1" ]; then
CUDA_VISIBLE_DEVICES="" eval "pytest ${pytest_args}" || { exit_code=$?; if [ "$exit_code" -eq ${last_failed_no_failures_code} ]; then echo "All tests deselected"; else exit $exit_code; fi; }
Collaborator:
Why do we need the eval call here? Can't we make the call without eval?

Collaborator (author), @leej3, May 23, 2024:
I added an eval here because of some bugs I was running into where things like "-k ''" end up as "-k" in the final command. The horrors of using eval are somewhat mitigated by the "-x" bash flag, so bugs in the command can be spotted more quickly. I think consistently using arrays for assembling commands in bash is a better alternative, but I think using Python is the best long-term solution.

else
CUDA_VISIBLE_DEVICES="" eval "pytest ${pytest_args}"
fi
}
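As a rough, hypothetical sketch of the Python rewrite floated in the review thread above (explicitly not part of this PR, and deferred to a follow-up), the same job could be done by a small CLI that sets environment variables and builds the pytest invocation as an argument list for subprocess. The module name run_tests_cli.py and its flag names are invented for illustration; the pytest options mirror the ones assembled by run_tests() above. Passing a list to subprocess.run avoids shell quoting entirely, which also sidesteps the "-k ''" problem that motivated the eval call.

#!/usr/bin/env python
# run_tests_cli.py -- hypothetical follow-up sketch, not part of this PR.
# Mirrors run_tests() from common-test-functionality.sh, but builds the pytest
# command as an argument list so no eval or shell quoting is needed.
import argparse
import os
import subprocess
import sys

LAST_FAILED_NO_FAILURES_CODE = 5  # pytest exit code when every test is deselected


def run_tests(core_args="-vvv tests/ignite", cache_dir=".unknown-cache", skip_distrib_tests=True,
              match_tests_expression="", trap_deselected_exit_code=True, use_last_failed=False,
              use_coverage=False, world_size=0):
    env = dict(os.environ, CUDA_VISIBLE_DEVICES="")
    # "-o cache_dir=..." replaces the pytest.ini file written by the bash version.
    cmd = ["pytest", "-o", f"cache_dir={cache_dir}", "--treat-unrun-as-failed"]
    if skip_distrib_tests:
        cmd += ["-m", "not distributed and not tpu and not multinode_distributed"]
    cmd += core_args.split()
    if match_tests_expression:
        # -k is only added when an expression is given, avoiding the empty-string issue.
        cmd += ["-k", match_tests_expression]
    if use_last_failed and os.path.isdir(cache_dir):
        cmd += ["--last-failed", "--last-failed-no-failures", "none"]
    if use_coverage:
        cmd += ["--cov", "ignite", "--cov-append", "--cov-report", "term-missing", "--cov-report", "xml"]
    if world_size:
        env["WORLD_SIZE"] = str(world_size)
        cmd += ["--dist", "each", "--tx", f"{world_size}*popen//python=python"]

    result = subprocess.run(cmd, env=env)  # argument list: no shell involved
    if trap_deselected_exit_code and result.returncode == LAST_FAILED_NO_FAILURES_CODE:
        print("All tests deselected")
        return 0
    return result.returncode


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Sketch of a Python replacement for the bash test runner")
    parser.add_argument("--core-args", default="-vvv tests/ignite")
    parser.add_argument("--match-tests-expression", default="")
    parser.add_argument("--use-last-failed", action="store_true")
    parser.add_argument("--use-coverage", action="store_true")
    parser.add_argument("--world-size", type=int, default=0)
    args = parser.parse_args()
    sys.exit(run_tests(core_args=args.core_args, match_tests_expression=args.match_tests_expression,
                       use_last_failed=args.use_last_failed, use_coverage=args.use_coverage,
                       world_size=args.world_size))

On a retry, the workflows could then pass --use-last-failed instead of exporting USE_LAST_FAILED=1, but that wiring is left to the follow-up PR.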
68 changes: 68 additions & 0 deletions tests/ignite/conftest.py
@@ -1,8 +1,10 @@
import functools
import os
import shutil
import signal
import sys
import tempfile
import threading
import time
from pathlib import Path

@@ -17,6 +19,35 @@ def pytest_configure(config):
config.addinivalue_line("markers", "distributed: run distributed")
config.addinivalue_line("markers", "multinode_distributed: distributed")
config.addinivalue_line("markers", "tpu: run on tpu")
if config.option.treat_unrun_as_failed:
unrun_tracker = UnrunTracker()
config.pluginmanager.register(unrun_tracker, "unrun_tracker_plugin")


def pytest_addoption(parser):
parser.addoption(
"--treat-unrun-as-failed",
action="store_true",
help="""
If a session is interrupted treat the unrun tests as failed so that a
rerun with --last-failed runs any tests that have not passed or been
skipped.
""",
)


@pytest.fixture(scope="session", autouse=True)
def term_handler():
# This allows the pytest session to be terminated upon retries on CI. It may
# be worth using this fixture solely in that context. For a discussion on
# whether sigterm should be ignored see:
# https://github.com/pytest-dev/pytest/issues/5243
if threading.current_thread() is threading.main_thread() and hasattr(signal, "SIGTERM"):
orig = signal.signal(signal.SIGTERM, signal.getsignal(signal.SIGINT))
yield
signal.signal(signal.SIGTERM, orig)
else:
yield # Just pass through if SIGTERM isn't supported or we are not in the main thread


@pytest.fixture(
@@ -447,6 +478,37 @@ def distributed(request, local_rank, world_size):
raise RuntimeError(f"Invalid parameter value for `distributed` fixture, given {request.param}")


class UnrunTracker:
"""
Keeps track of unrun tests to improve the user experience when a test
session is interrupted. This is particularly useful on CI when rerunning
"failing" tests where the failure was due to a deadlock and many tests
weren't actually run so they didn't actually fail.
"""

def __init__(self):
self.unrun_tests = []

def pytest_collection_finish(self, session):
# At the end of the collection, add all items to the unrun_tests list
self.unrun_tests.extend(session.items)

def pytest_runtest_teardown(self, item):
if item in self.unrun_tests:
self.unrun_tests.remove(item)

def record_unrun_as_failed(self, session, exitstatus):
# Get current lastfailed entries (if any)
lastfailed = session.config.cache.get("cache/lastfailed", {})

# Add unrun tests to lastfailed
for test in self.unrun_tests:
lastfailed[test.nodeid] = True

# Update the cache with the new lastfailed
session.config.cache.set("cache/lastfailed", lastfailed)


@pytest.hookimpl
def pytest_pyfunc_call(pyfuncitem: pytest.Function) -> None:
if any(fx in pyfuncitem.fixturenames for fx in ["distributed", "multinode_distributed"]):
@@ -508,3 +570,9 @@ def xla_worker(index, fn):
assert ex_.code == 0, "Didn't successfully exit in XLA test"

pyfuncitem.obj = functools.partial(testfunc_wrapper, pyfuncitem.obj)


def pytest_sessionfinish(session, exitstatus):
if session.config.option.treat_unrun_as_failed:
unrun_tracker = session.config.pluginmanager.get_plugin("unrun_tracker_plugin")
unrun_tracker.record_unrun_as_failed(session, exitstatus)
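To illustrate how the retry pieces fit together (an explanatory sketch, not code from this PR): record_unrun_as_failed writes the node ids of interrupted tests into pytest's lastfailed cache, which pytest persists as JSON at <cache_dir>/v/cache/lastfailed. On a rerun, the workflows set USE_LAST_FAILED=1, which the run_*_tests.sh wrappers presumably forward to run_tests as --use_last_failed 1, so --last-failed --last-failed-no-failures none selects exactly those cached entries. A minimal way to inspect that cache, assuming pytest's default cache location rather than the cache_dir the scripts configure:

# Illustrative sketch only: peek at the lastfailed cache that UnrunTracker updates.
import json
from pathlib import Path

lastfailed_path = Path(".pytest_cache") / "v" / "cache" / "lastfailed"

if lastfailed_path.exists():
    # Keys are test node ids (e.g. "tests/ignite/engine/test_engine.py::test_run") mapped to True.
    # A rerun with --last-failed --last-failed-no-failures none collects exactly these tests.
    for nodeid in sorted(json.loads(lastfailed_path.read_text())):
        print(nodeid)
else:
    print("No lastfailed cache found; either nothing failed or no session has run yet.")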