Retry tests #3229

Merged: 13 commits, May 23, 2024
8 changes: 4 additions & 4 deletions .github/workflows/gpu-hvd-tests.yml
@@ -22,7 +22,7 @@ jobs:
gpu-hvd-tests:
strategy:
matrix:
- pytorch-channel: [pytorch, ]
+ pytorch-channel: [pytorch]
fail-fast: false
env:
DOCKER_IMAGE: "pytorch/conda-builder:cuda12.1"
@@ -128,8 +128,8 @@ jobs:
# Can't build Horovod with recent pytorch due to pytorch required C++17 standard
# and horovod is still using C++14
# HOROVOD_GPU_OPERATIONS=NCCL HOROVOD_WITH_PYTORCH=1 pip install horovod[pytorch]
- # Using a similar hack as described here:
- # https://github.com/horovod/horovod/issues/3941#issuecomment-1732505345
+ # Using a similar hack as described here:
+ # https://github.com/horovod/horovod/issues/3941#issuecomment-1732505345
git clone --recursive https://github.com/horovod/horovod.git /horovod
cd /horovod
sed -i "s/CMAKE_CXX_STANDARD 14/CMAKE_CXX_STANDARD 17/g" CMakeLists.txt
@@ -152,7 +152,7 @@ jobs:
set -xe

bash tests/run_gpu_tests.sh 2 hvd
- CUDA_VISIBLE_DEVICES="" pytest --cov ignite --cov-append --cov-report term-missing --cov-report xml -vvv tests/ -m distributed -k hvd
+ CUDA_VISIBLE_DEVICES="" pytest --cov ignite --cov-append --cov-report term-missing --cov-report xml -vvv tests/ignite -m distributed -k hvd

EOF
)
21 changes: 8 additions & 13 deletions .github/workflows/gpu-tests.yml
@@ -29,7 +29,7 @@ jobs:
REPOSITORY: ${{ github.repository }}
PR_NUMBER: ${{ github.event.pull_request.number }}
runs-on: linux.8xlarge.nvidia.gpu
- timeout-minutes: 45
+ timeout-minutes: 85

steps:
- name: Clean workspace
@@ -121,18 +121,13 @@ jobs:

- name: Run GPU Unit Tests
continue-on-error: false
- run: |
-
-   script=$(cat << EOF
-
-   set -xe
-
-   bash tests/run_gpu_tests.sh 2
-
-   EOF
-   )
-
-   docker exec -t pthd /bin/bash -c "${script}"
+ uses: nick-fields/retry@v3
+ with:
+   max_attempts: 5
+   timeout_minutes: 25
+   shell: bash
+   command: docker exec -t pthd /bin/bash -xec 'tests/run_gpu_tests.sh 2'
+   new_command_on_retry: docker exec -e USE_LAST_FAILED=1 -t pthd /bin/bash -xec 'tests/run_gpu_tests.sh 2'

- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
10 changes: 7 additions & 3 deletions .github/workflows/hvd-tests.yml
@@ -75,9 +75,13 @@ jobs:
target_dir: /tmp

- name: Run Tests
- shell: bash -l {0}
- run: |
-   bash tests/run_cpu_tests.sh
+ uses: nick-fields/retry@v3
+ with:
+   max_attempts: 5
+   timeout_minutes: 15
+   shell: bash
+   command: bash tests/run_cpu_tests.sh
+   new_command_on_retry: USE_LAST_FAILED=1 bash tests/run_cpu_tests.sh

- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
16 changes: 10 additions & 6 deletions .github/workflows/pytorch-version-tests.yml
@@ -10,15 +10,15 @@ on:
jobs:
build:
runs-on: ubuntu-latest
- timeout-minutes: 45
+ timeout-minutes: 85
strategy:
max-parallel: 5
fail-fast: false
matrix:
python-version: [3.8, 3.9, "3.10"]
pytorch-version:
[2.1.2, 2.0.1, 1.13.1, 1.12.1, 1.11.0, 1.10.0, 1.9.1, 1.8.1, 1.5.1]
- exclude:
+ exclude:
- pytorch-version: 1.5.1
python-version: 3.9
- pytorch-version: 1.5.1
@@ -78,7 +78,7 @@ jobs:
pip install -r requirements-dev.txt
python setup.py install

- # pytorch>=1.9.0,<1.11.0 is using "from setuptools import distutils; distutils.version.LooseVersion" anti-pattern
+ # pytorch>=1.9.0,<1.11.0 is using "from setuptools import distutils; distutils.version.LooseVersion" anti-pattern
# which raises the error: AttributeError: module 'distutils' has no attribute 'version' for setuptools>59
bad_pth_version=$(python -c "import torch; print('.'.join(torch.__version__.split('.')[:2]) in ['1.9', '1.10'])")
if [ "${bad_pth_version}" == "True" ]; then
@@ -92,9 +92,13 @@
target_dir: /tmp

- name: Run Tests
- shell: bash -l {0}
- run: |
-   bash tests/run_cpu_tests.sh "not test_time_profilers"
+ uses: nick-fields/retry@v3
+ with:
+   max_attempts: 5
+   timeout_minutes: 15
+   shell: bash
+   command: bash tests/run_cpu_tests.sh "not test_time_profilers"
+   new_command_on_retry: USE_LAST_FAILED=1 bash tests/run_cpu_tests.sh "not test_time_profilers"

# create-issue:
# runs-on: ubuntu-latest
20 changes: 13 additions & 7 deletions .github/workflows/tpu-tests.yml
@@ -89,13 +89,19 @@ jobs:
target_dir: /tmp

- name: Run Tests
- run: |
-   export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${Python_ROOT_DIR}/lib
-   export XRT_DEVICE_MAP="CPU:0;/job:localservice/replica:0/task:0/device:XLA_CPU:0"
-   export XRT_WORKERS="localservice:0;grpc://localhost:40934"
-
-   python -c "import torch_xla; print('torch xla version:', torch_xla.__version__)"
-   bash tests/run_tpu_tests.sh
+ uses: nick-fields/retry@v3
+ with:
+   max_attempts: 5
+   timeout_minutes: 25
+   shell: bash
+   command: |
+     python -c "import torch_xla; print('torch xla version:', torch_xla.__version__)"
+     bash tests/run_tpu_tests.sh
+   new_command_on_retry: USE_LAST_FAILED=1 bash tests/run_tpu_tests.sh
+ env:
+   LD_LIBRARY_PATH: ${{ env.LD_LIBRARY_PATH }}:${{ env.Python_ROOT_DIR }}/lib
+   XRT_DEVICE_MAP: "CPU:0;/job:localservice/replica:0/task:0/device:XLA_CPU:0"
+   XRT_WORKERS: "localservice:0;grpc://localhost:40934"

- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
15 changes: 10 additions & 5 deletions .github/workflows/unit-tests.yml
@@ -31,7 +31,7 @@ concurrency:
jobs:
cpu-tests:
runs-on: ${{ matrix.os }}
- timeout-minutes: 45
+ timeout-minutes: 85
defaults:
run:
shell: bash
@@ -40,7 +40,7 @@ jobs:
fail-fast: false
matrix:
os: [ubuntu-latest]
python-version: ["3.8", "3.9", "3.10", "3.11","3.12"]
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
pytorch-channel: [pytorch, pytorch-nightly]
include:
# includes a single build on windows
@@ -102,7 +102,7 @@

- name: Run Mypy
# https://github.com/pytorch/ignite/pull/2780
- #
+ #
if: ${{ matrix.os == 'ubuntu-latest' && matrix.pytorch-channel == 'pytorch-nightly'}}
run: |
bash ./tests/run_code_style.sh mypy
@@ -120,8 +120,13 @@
cp -R /tmp/MNIST .

- name: Run Tests
- run: |
-   SKIP_DISTRIB_TESTS=${{ matrix.skip-distrib-tests }} bash tests/run_cpu_tests.sh
+ uses: nick-fields/retry@v3
+ with:
+   max_attempts: 5
+   timeout_minutes: 15
+   shell: bash
+   command: SKIP_DISTRIB_TESTS=${{ matrix.skip-distrib-tests }} bash tests/run_cpu_tests.sh
+   new_command_on_retry: USE_LAST_FAILED=1 SKIP_DISTRIB_TESTS=${{ matrix.skip-distrib-tests }} bash tests/run_cpu_tests.sh

- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
102 changes: 102 additions & 0 deletions tests/common-test-functionality.sh
@@ -0,0 +1,102 @@
#!/bin/bash
Collaborator:
I wonder whether it would be simpler to write this script in Python for later maintenance, instead of a bash script, and how much effort that would take.
If you think this is feasible we can do that in a follow-up PR.

Collaborator (author):
I think it would be easy enough: largely just providing a simple CLI, setting environment variables, and assembling commands to run as a subprocess. A few hours probably, perhaps a few more to iron out issues and add some tests for it.

Collaborator:
OK, sounds good, let's make a Python script instead of this bash script in a follow-up PR.


# Will catch exit code 5 when tests are deselected from previous passing run
# (relevant for --last-failed-no-failures none)
last_failed_no_failures_code=5

# functions shared across test files
run_tests() {
# Set defaults
local core_args="-vvv tests/ignite"
local cache_dir=".unknown-cache"
local skip_distrib_tests=1
local match_tests_expression=""
local trap_deselected_exit_code=1
local use_last_failed=0
local use_coverage=0
local world_size=0
# Always clean up pytest.ini
trap 'rm -f pytest.ini' RETURN
# Parse arguments
while [[ $# -gt 0 ]]
do
key="$1"
case $key in
--core_args)
core_args="$2"
shift
shift
;;
--cache_dir)
cache_dir="$2"
shift
shift
;;
--skip_distrib_tests)
skip_distrib_tests="$2"
shift
shift
;;
--match_tests_expression)
match_tests_expression="$2"
shift
shift
;;
--trap_deselected_exit_code)
trap_deselected_exit_code="$2"
shift
shift
;;
--use_last_failed)
use_last_failed="$2"
shift
shift
;;
--use_coverage)
use_coverage="$2"
shift
shift
;;
--world_size)
world_size="$2"
shift
shift
;;
*)
echo "Error: Unknown argument $key"
exit 1
shift
;;
esac
done

if [ "${skip_distrib_tests}" -eq "1" ]; then
# can be overwritten by core_args
skip_distrib_opt="-m 'not distributed and not tpu and not multinode_distributed'"
else
skip_distrib_opt=""
fi


echo [pytest] > pytest.ini ; echo "cache_dir=${cache_dir}" >> pytest.ini

# Assemble options for the pytest command
pytest_args="${skip_distrib_opt} ${core_args} --treat-unrun-as-failed -k '${match_tests_expression}'"
if [ "${use_last_failed:-0}" -eq "1" ] && [ -d "${cache_dir}" ]; then
pytest_args="--last-failed --last-failed-no-failures none ${pytest_args}"
fi
if [ "${use_coverage}" -eq "1" ]; then
pytest_args="--cov ignite --cov-append --cov-report term-missing --cov-report xml ${pytest_args}"
fi
if [ ! "${world_size}" -eq "0" ]; then
export WORLD_SIZE="${world_size}"
pytest_args="--dist=each --tx ${WORLD_SIZE}*popen//python=python ${pytest_args}"
fi

# Run the command
if [ "$trap_deselected_exit_code" -eq "1" ]; then
CUDA_VISIBLE_DEVICES="" eval "pytest ${pytest_args}" || { exit_code=$?; if [ "$exit_code" -eq ${last_failed_no_failures_code} ]; then echo "All tests deselected"; else exit $exit_code; fi; }
Collaborator:
Why do we need the eval call here? Can't we make the call without eval?

Collaborator (author), @leej3, May 23, 2024:
I added an eval here because of some bugs I was running into where things like "-k ''" end up as "-k" in the final command. The horrors of using eval are somewhat mitigated by the "-x" bash flag, so bugs in the command can be spotted more quickly. I think consistently using arrays for assembling commands in bash is a better alternative, but I think using Python is the best long-term solution.

else
CUDA_VISIBLE_DEVICES="" eval "pytest ${pytest_args}"
fi
}
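As a rough, hypothetical sketch of the Python rewrite floated in the review thread above (explicitly not part of this PR, and deferred to a follow-up), the same job could be done by a small CLI that sets environment variables and builds the pytest invocation as an argument list for subprocess. The module name run_tests_cli.py and its flag names are invented for illustration; the pytest options mirror the ones assembled by run_tests() above. Passing a list to subprocess.run avoids shell quoting entirely, which also sidesteps the "-k ''" problem that motivated the eval call.

#!/usr/bin/env python
# run_tests_cli.py -- hypothetical follow-up sketch, not part of this PR.
# Mirrors run_tests() from common-test-functionality.sh, but builds the pytest
# command as an argument list so no eval or shell quoting is needed.
import argparse
import os
import subprocess
import sys

LAST_FAILED_NO_FAILURES_CODE = 5  # pytest exit code when every test is deselected


def run_tests(core_args="-vvv tests/ignite", cache_dir=".unknown-cache", skip_distrib_tests=True,
              match_tests_expression="", trap_deselected_exit_code=True, use_last_failed=False,
              use_coverage=False, world_size=0):
    env = dict(os.environ, CUDA_VISIBLE_DEVICES="")
    # "-o cache_dir=..." replaces the pytest.ini file written by the bash version.
    cmd = ["pytest", "-o", f"cache_dir={cache_dir}", "--treat-unrun-as-failed"]
    if skip_distrib_tests:
        cmd += ["-m", "not distributed and not tpu and not multinode_distributed"]
    cmd += core_args.split()
    if match_tests_expression:
        # -k is only added when an expression is given, avoiding the empty-string issue.
        cmd += ["-k", match_tests_expression]
    if use_last_failed and os.path.isdir(cache_dir):
        cmd += ["--last-failed", "--last-failed-no-failures", "none"]
    if use_coverage:
        cmd += ["--cov", "ignite", "--cov-append", "--cov-report", "term-missing", "--cov-report", "xml"]
    if world_size:
        env["WORLD_SIZE"] = str(world_size)
        cmd += ["--dist", "each", "--tx", f"{world_size}*popen//python=python"]

    result = subprocess.run(cmd, env=env)  # argument list: no shell involved
    if trap_deselected_exit_code and result.returncode == LAST_FAILED_NO_FAILURES_CODE:
        print("All tests deselected")
        return 0
    return result.returncode


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Sketch of a Python replacement for the bash test runner")
    parser.add_argument("--core-args", default="-vvv tests/ignite")
    parser.add_argument("--match-tests-expression", default="")
    parser.add_argument("--use-last-failed", action="store_true")
    parser.add_argument("--use-coverage", action="store_true")
    parser.add_argument("--world-size", type=int, default=0)
    args = parser.parse_args()
    sys.exit(run_tests(core_args=args.core_args, match_tests_expression=args.match_tests_expression,
                       use_last_failed=args.use_last_failed, use_coverage=args.use_coverage,
                       world_size=args.world_size))

On a retry, the workflows could then pass --use-last-failed instead of exporting USE_LAST_FAILED=1, but that wiring is left to the follow-up PR.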
68 changes: 68 additions & 0 deletions tests/ignite/conftest.py
@@ -1,8 +1,10 @@
import functools
import os
import shutil
import signal
import sys
import tempfile
import threading
import time
from pathlib import Path

@@ -17,6 +19,35 @@ def pytest_configure(config):
config.addinivalue_line("markers", "distributed: run distributed")
config.addinivalue_line("markers", "multinode_distributed: distributed")
config.addinivalue_line("markers", "tpu: run on tpu")
if config.option.treat_unrun_as_failed:
unrun_tracker = UnrunTracker()
config.pluginmanager.register(unrun_tracker, "unrun_tracker_plugin")


def pytest_addoption(parser):
parser.addoption(
"--treat-unrun-as-failed",
action="store_true",
help="""
If a session is interrupted treat the unrun tests as failed so that a
rerun with --last-failed runs any tests that have not passed or been
skipped.
""",
)


@pytest.fixture(scope="session", autouse=True)
def term_handler():
# This allows the pytest session to be terminated upon retries on CI. It may
# be worth using this fixture solely in that context. For a discussion on
# whether sigterm should be ignored see:
# https://github.com/pytest-dev/pytest/issues/5243
if threading.current_thread() is threading.main_thread() and hasattr(signal, "SIGTERM"):
orig = signal.signal(signal.SIGTERM, signal.getsignal(signal.SIGINT))
yield
signal.signal(signal.SIGTERM, orig)
else:
yield # Just pass through if SIGTERM isn't supported or we are not in the main thread


@pytest.fixture(
@@ -447,6 +478,37 @@ def distributed(request, local_rank, world_size):
raise RuntimeError(f"Invalid parameter value for `distributed` fixture, given {request.param}")


class UnrunTracker:
"""
Keeps track of unrun tests to improve the user experience when a test
session is interrupted. This is particularly useful on CI when rerunning
"failing" tests where the failure was due to a deadlock and many tests
weren't actually run so they didn't actually fail.
"""

def __init__(self):
self.unrun_tests = []

def pytest_collection_finish(self, session):
# At the end of the collection, add all items to the unrun_tests list
self.unrun_tests.extend(session.items)

def pytest_runtest_teardown(self, item):
if item in self.unrun_tests:
self.unrun_tests.remove(item)

def record_unrun_as_failed(self, session, exitstatus):
# Get current lastfailed entries (if any)
lastfailed = session.config.cache.get("cache/lastfailed", {})

# Add unrun tests to lastfailed
for test in self.unrun_tests:
lastfailed[test.nodeid] = True

# Update the cache with the new lastfailed
session.config.cache.set("cache/lastfailed", lastfailed)


@pytest.hookimpl
def pytest_pyfunc_call(pyfuncitem: pytest.Function) -> None:
if any(fx in pyfuncitem.fixturenames for fx in ["distributed", "multinode_distributed"]):
@@ -508,3 +570,9 @@ def xla_worker(index, fn):
assert ex_.code == 0, "Didn't successfully exit in XLA test"

pyfuncitem.obj = functools.partial(testfunc_wrapper, pyfuncitem.obj)


def pytest_sessionfinish(session, exitstatus):
if session.config.option.treat_unrun_as_failed:
unrun_tracker = session.config.pluginmanager.get_plugin("unrun_tracker_plugin")
unrun_tracker.record_unrun_as_failed(session, exitstatus)
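To illustrate how the retry pieces fit together (an explanatory sketch, not code from this PR): record_unrun_as_failed writes the node ids of interrupted tests into pytest's lastfailed cache, which pytest persists as JSON at <cache_dir>/v/cache/lastfailed. On a rerun, the workflows set USE_LAST_FAILED=1, which the run_*_tests.sh wrappers presumably forward to run_tests as --use_last_failed 1, so --last-failed --last-failed-no-failures none selects exactly those cached entries. A minimal way to inspect that cache, assuming pytest's default cache location rather than the cache_dir the scripts configure:

# Illustrative sketch only: peek at the lastfailed cache that UnrunTracker updates.
import json
from pathlib import Path

lastfailed_path = Path(".pytest_cache") / "v" / "cache" / "lastfailed"

if lastfailed_path.exists():
    # Keys are test node ids (e.g. "tests/ignite/engine/test_engine.py::test_run") mapped to True.
    # A rerun with --last-failed --last-failed-no-failures none collects exactly these tests.
    for nodeid in sorted(json.loads(lastfailed_path.read_text())):
        print(nodeid)
else:
    print("No lastfailed cache found; either nothing failed or no session has run yet.")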