#!/usr/bin/env bash
#
# This script runs through the code in each of the distributed examples.
# The purpose is just as an integration test, not to actually train models in any meaningful way.
# For that reason, most of these set epochs = 1 and --dry-run.
#
# Optionally specify a comma-separated list of examples to run. Can be run as:
# * To run all examples:
# ./run_distributed_examples.sh
# * To run specific examples:
# ./run_distributed_examples.sh "distributed/tensor_parallelism,distributed/ddp"
#
# To test examples on a CUDA accelerator, run as:
# USE_CUDA=True ./run_distributed_examples.sh
#
# The script requires uv to be installed. When executed, the script will install prerequisites
# from each example's `requirements.txt`. If run within an activated virtual environment (uv venv,
# python -m venv, conda), this might reinstall some of the packages. To change the pip installation
# index or to pass additional pip install options, run as:
# PIP_INSTALL_ARGS="--pre -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html" \
# ./run_distributed_examples.sh
#
# To force the script to create a virtual environment for each example, run as:
# VIRTUAL_ENV=".venv" ./run_distributed_examples.sh
# The script will remove the environments it creates in a teardown step after each example finishes.
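#
# As an illustrative combination of the options above, CUDA can be enabled while running a
# single example:
# USE_CUDA=True ./run_distributed_examples.sh "distributed/ddp"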
BASE_DIR="$(pwd)/$(dirname "$0")"
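# utils.sh is assumed to provide the run and error helpers used below and to set the EXAMPLES
# and ERRORS variables (EXAMPLES presumably parsed from the optional first argument).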
source "$BASE_DIR/utils.sh"
USE_CUDA=${USE_CUDA:-False}
case $USE_CUDA in
  "True")
    echo "using cuda"
    CUDA=1
    CUDA_FLAG="--cuda"
    ;;
  "False")
    echo "not using cuda"
    CUDA=0
    CUDA_FLAG=""
    ;;
  *)
    echo "unrecognized USE_CUDA value: '$USE_CUDA' (expected True or False)"
    exit 1
    ;;
esac
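# CUDA and CUDA_FLAG are not referenced again in this script; they are presumably consumed by
# utils.sh or by the per-example launch scripts.
# Each function below corresponds to one example directory; run_example.sh (expected to live in
# the tensor parallelism example directory) is assumed to wrap the multi-process launch.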
function distributed_tensor_parallelism() {
  uv run bash run_example.sh tensor_parallel_example.py || error "tensor parallel example failed"
  uv run bash run_example.sh sequence_parallel_example.py || error "sequence parallel example failed"
  uv run bash run_example.sh fsdp_tp_example.py || error "2D parallel example failed"
}
function distributed_ddp() {
  uv run main.py || error "ddp example failed"
}
function run_all() {
  run distributed/tensor_parallelism
  run distributed/ddp
}
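# The run helper from utils.sh is assumed to change into the named example directory, install
# its requirements, invoke the matching function above, and record any failures in ERRORS.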
# by default, run all examples
if [ "" == "$EXAMPLES" ]; then
  run_all
else
  for i in $(echo $EXAMPLES | sed "s/,/ /g")
  do
    echo "Starting $i"
    run $i
    echo "Finished $i, status $?"
  done
fi
if [ "" == "$ERRORS" ]; then
  echo "Completed successfully"
else
  echo "Some distributed examples failed:"
  printf "%s\n" "$ERRORS"
  # Exit with error (0-255) in case of failure in one of the tests.
  exit 1
fi