Skip to content

Commit c49554c

Browse files
sirutBuasaiSirut Buasai
and
Sirut Buasai
authored
Update distributed example tests in run_python_examples.sh (#1250)
* Fix distributed test * fix parallel scripts * install dill * remove dill * run 2 gpu * remove gpucount, use default * Add examples to distributed examples * refactor distributed test * fx ERRORS overwriting * run with base dir * remove distributed from run_python_examples.sh * move basedir to source * separate init --------- Co-authored-by: Sirut Buasai <[email protected]>
1 parent 911816c commit c49554c

File tree

3 files changed

+52
-77
lines changed

3 files changed

+52
-77
lines changed

Diff for: run_distributed_examples.sh

+8-34
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,8 @@
1010
# to pip install dependencies (other than pytorch), run all examples, and remove temporary/changed data files.
1111
# Expects pytorch, torchvision to be installed.
1212

13-
BASE_DIR=`pwd`"/"`dirname $0`
14-
EXAMPLES=`echo $1 | sed -e 's/ //g'`
15-
16-
# Redirect 'python' calls to 'python3'
17-
python() {
18-
command python3 "$@"
19-
}
13+
BASE_DIR="$(pwd)/$(dirname $0)"
14+
source $BASE_DIR/utils.sh
2015

2116
USE_CUDA=$(python -c "import torch; print(torch.cuda.is_available())")
2217
case $USE_CUDA in
@@ -35,33 +30,12 @@ case $USE_CUDA in
3530
;;
3631
esac
3732

38-
ERRORS=""
39-
40-
function error() {
41-
ERR=$1
42-
ERRORS="$ERRORS\n$ERR"
43-
echo $ERR
44-
}
45-
46-
function install_deps() {
47-
echo "installing requirements"
48-
cat $BASE_DIR/*/requirements.txt | \
49-
sort -u | \
50-
# testing the installed version of torch, so don't pip install it.
51-
grep -vE '^torch$' | \
52-
pip install -r /dev/stdin || \
53-
{ error "failed to install dependencies"; exit 1; }
54-
}
55-
56-
function start() {
57-
EXAMPLE=${FUNCNAME[1]}
58-
cd $BASE_DIR/$EXAMPLE
59-
echo "Running example: $EXAMPLE"
60-
}
61-
6233
function distributed() {
6334
start
64-
torchrun --standalone --nnodes=1 --nproc_per_node=4 tensor_parallelism/fsdp_tp_example.py
35+
bash tensor_parallelism/run_example.sh tensor_parallelism/tensor_parallel_example.py || error "tensor parallel example failed"
36+
bash tensor_parallelism/run_example.sh tensor_parallelism/sequence_parallel_example.py || error "sequence parallel example failed"
37+
bash tensor_parallelism/run_example.sh tensor_parallelism/fsdp_tp_example.py || error "2D parallel example failed"
38+
python ddp/main.py || error "ddp example failed"
6539
}
6640

6741
function clean() {
@@ -88,8 +62,8 @@ fi
8862
if [ "" == "$ERRORS" ]; then
8963
echo "Completed successfully with status $?"
9064
else
91-
echo "Some examples failed:"
92-
printf "$ERRORS"
65+
echo "Some distributed examples failed:"
66+
printf "$ERRORS\n"
9367
#Exit with error (0-255) in case of failure in one of the tests.
9468
exit 1
9569

Diff for: run_python_examples.sh

+6-43
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,8 @@
1010
# to pip install dependencies (other than pytorch), run all examples, and remove temporary/changed data files.
1111
# Expects pytorch, torchvision to be installed.
1212

13-
BASE_DIR=`pwd`"/"`dirname $0`
14-
EXAMPLES=`echo $1 | sed -e 's/ //g'`
15-
16-
# Redirect 'python' calls to 'python3'
17-
python() {
18-
command python3 "$@"
19-
}
13+
BASE_DIR="$(pwd)/$(dirname $0)"
14+
source $BASE_DIR/utils.sh
2015

2116
USE_CUDA=$(python -c "import torchvision, torch; print(torch.cuda.is_available())")
2217
case $USE_CUDA in
@@ -35,43 +30,11 @@ case $USE_CUDA in
3530
;;
3631
esac
3732

38-
ERRORS=""
39-
40-
function error() {
41-
ERR=$1
42-
ERRORS="$ERRORS\n$ERR"
43-
echo $ERR
44-
}
45-
46-
function install_deps() {
47-
echo "installing requirements"
48-
cat $BASE_DIR/*/requirements.txt | \
49-
sort -u | \
50-
# testing the installed version of torch, so don't pip install it.
51-
grep -vE '^torch$' | \
52-
pip install -r /dev/stdin || \
53-
{ error "failed to install dependencies"; exit 1; }
54-
}
55-
56-
function start() {
57-
EXAMPLE=${FUNCNAME[1]}
58-
cd $BASE_DIR/$EXAMPLE
59-
echo "Running example: $EXAMPLE"
60-
}
61-
6233
function dcgan() {
6334
start
6435
python main.py --dataset fake $CUDA_FLAG --mps --dry-run || error "dcgan failed"
6536
}
6637

67-
function distributed() {
68-
start
69-
python tensor_parallelism/tensor_parallel_example.py || error "tensor parallel example failed"
70-
python tensor_parallelism/sequence_parallel_example.py || error "sequence parallel example failed"
71-
python tensor_parallelism/fsdp_tp_example.py || error "2D parallel example failed"
72-
python ddp/main.py || error "ddp example failed"
73-
}
74-
7538
function fast_neural_style() {
7639
start
7740
if [ ! -d "saved_models" ]; then
@@ -223,9 +186,9 @@ function clean() {
223186
}
224187

225188
function run_all() {
226-
# cpp
189+
# cpp moved to `run_cpp_examples.sh`
227190
dcgan
228-
distributed
191+
# distributed moved to `run_distributed_examples.sh`
229192
fast_neural_style
230193
imagenet
231194
language_translation
@@ -261,8 +224,8 @@ fi
261224
if [ "" == "$ERRORS" ]; then
262225
echo "Completed successfully with status $?"
263226
else
264-
echo "Some examples failed:"
265-
printf "$ERRORS"
227+
echo "Some python examples failed:"
228+
printf "$ERRORS\n"
266229
#Exit with error (0-255) in case of failure in one of the tests.
267230
exit 1
268231

Diff for: utils.sh

+38
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
#!/usr/bin/env bash
2+
# This script contains utility functions and initializes example scripts.
3+
# Eg: run_python_examples.sh, run_distributed_examples.sh
4+
5+
BASE_DIR="$(pwd)/$(dirname $0)"
6+
EXAMPLES=$(echo $1 | sed -e 's/ //g')
7+
8+
# Redirect 'python' calls to 'python3'
9+
python() {
10+
command python3 "$@"
11+
}
12+
13+
ERRORS=${ERRORS-""}
14+
15+
function error() {
16+
ERR=$1
17+
if [ "" == "$ERRORS" ]; then
18+
ERRORS="$ERR"
19+
else
20+
ERRORS="$ERRORS\n$ERR"
21+
fi
22+
}
23+
24+
function install_deps() {
25+
echo "installing requirements"
26+
cat $BASE_DIR/*/requirements.txt | \
27+
sort -u | \
28+
# testing the installed version of torch, so don't pip install it.
29+
grep -vE '^torch$' | \
30+
pip install -r /dev/stdin || \
31+
{ error "failed to install dependencies"; exit 1; }
32+
}
33+
34+
function start() {
35+
EXAMPLE=${FUNCNAME[1]}
36+
cd $BASE_DIR/$EXAMPLE
37+
echo "Running example: $EXAMPLE"
38+
}

0 commit comments

Comments
 (0)