Skip to content

Commit

Permalink
expand use of on_retry_command
Browse files (browse the repository at this point in the history)
also adds retry for gpu tests
  • Loading branch information
leej3 committed May 10, 2024
1 parent 6b5fbeb commit 426d4a3
Show file tree
Hide file tree
Showing 4 changed files with 27 additions and 14 deletions.
19 changes: 7 additions & 12 deletions .github/workflows/gpu-tests.yml
Original file line number | Diff line number | Diff line change
Expand Up @@ -121,18 +121,13 @@ jobs:
- name: Run GPU Unit Tests
continue-on-error: false
run: |
script=$(cat << EOF
set -xe
bash tests/run_gpu_tests.sh 2
EOF
)
docker exec -t pthd /bin/bash -c "${script}"
uses: nick-fields/retry@v3
with:
max_attempts: 5
timeout_minutes: 30
shell: bash
command: docker exec -t pthd /bin/bash -xec 'tests/run_gpu_tests.sh 2'
new_command_on_retry: docker exec -e USE_LAST_FAILED=1 -t pthd /bin/bash -xec 'tests/run_gpu_tests.sh 2'

- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
Expand Down
9 changes: 9 additions & 0 deletions .github/workflows/hvd-tests.yml
Original file line number | Diff line number | Diff line change
Expand Up @@ -81,6 +81,15 @@ jobs:
timeout_minutes: 25
shell: bash
command: bash tests/run_cpu_tests.sh
on_retry_command: |
echo sending kill signal until process group no longer exists...
if [ -f .ignite_testing.pid ] && [ -s .ignite_testing.pid ]; then
tests_pid=$(cat .ignite_testing.pid)
ps -p $tests_pid &> /dev/null && pkill -INT -g $tests_pid && sleep 5
ps -p $tests_pid &> /dev/null && pkill -KILL -g $tests_pid
else
echo "File .ignite_testing.pid does not exist or is empty"
fi
new_command_on_retry: USE_LAST_FAILED=1 bash tests/run_cpu_tests.sh

- name: Upload coverage to Codecov
Expand Down
9 changes: 9 additions & 0 deletions .github/workflows/tpu-tests.yml
Original file line number | Diff line number | Diff line change
Expand Up @@ -97,6 +97,15 @@ jobs:
command: |
python -c "import torch_xla; print('torch xla version:', torch_xla.__version__)"
bash tests/run_tpu_tests.sh
on_retry_command: |
echo sending kill signal until process group no longer exists...
if [ -f .ignite_testing.pid ] && [ -s .ignite_testing.pid ]; then
tests_pid=$(cat .ignite_testing.pid)
ps -p $tests_pid &> /dev/null && pkill -INT -g $tests_pid && sleep 5
ps -p $tests_pid &> /dev/null && pkill -KILL -g $tests_pid
else
echo "File .ignite_testing.pid does not exist or is empty"
fi
new_command_on_retry: USE_LAST_FAILED=1 bash tests/run_tpu_tests.sh
env:
LD_LIBRARY_PATH: ${{ env.LD_LIBRARY_PATH }}:${{ env.Python_ROOT_DIR }}/lib
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/unit-tests.yml
Original file line number | Diff line number | Diff line change
Expand Up @@ -130,8 +130,8 @@ jobs:
echo sending kill signal until process group no longer exists...
if [ -f .ignite_testing.pid ] && [ -s .ignite_testing.pid ]; then
tests_pid=$(cat .ignite_testing.pid)
ps -p $tests_pid > /dev/null && pkill -INT -g $tests_pid && sleep 5
ps -p $tests_pid > /dev/null && pkill -KILL -g $tests_pid
ps -p $tests_pid &> /dev/null && pkill -INT -g $tests_pid && sleep 5
ps -p $tests_pid &> /dev/null && pkill -KILL -g $tests_pid
else
echo "File .ignite_testing.pid does not exist or is empty"
fi
Expand Down

0 comments on commit 426d4a3

Please sign in to comment.