#!/usr/bin/env bash
#
# This script runs through the code in each of the distributed examples.
# The purpose is just as an integration test, not to actually train models in any meaningful way.
# For that reason, most of these set epochs = 1 and --dry-run.
#
# Optionally specify a comma-separated list of examples to run. Can be run as:
# * To run all examples:
# ./run_distributed_examples.sh
# * To run specific examples:
# ./run_distributed_examples.sh "distributed/tensor_parallelism,distributed/ddp"
#
# To test examples on a CUDA accelerator, run as:
# USE_CUDA=True ./run_distributed_examples.sh
#
# The script requires uv to be installed. When executed, the script will install prerequisites
# from each example's `requirements.txt`. If run within an activated virtual environment (uv venv,
# python -m venv, conda), this might reinstall some of the packages. To change the pip installation
# index or to pass additional pip install options, run as:
# PIP_INSTALL_ARGS="--pre -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html" \
# ./run_distributed_examples.sh
#
# To force the script to create a virtual environment for each example, run as:
# VIRTUAL_ENV=".venv" ./run_distributed_examples.sh
# The script will remove the environments it creates in a teardown step after each example finishes.
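#
# As an illustrative combination of the options above, CUDA can be enabled while running a
# single example:
# USE_CUDA=True ./run_distributed_examples.sh "distributed/ddp"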
BASE_DIR="$(pwd)/$(dirname "$0")"
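# utils.sh is assumed to provide the run and error helpers used below and to set the EXAMPLES
# and ERRORS variables (EXAMPLES presumably parsed from the optional first argument).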
source "$BASE_DIR/utils.sh"
USE_CUDA=${USE_CUDA:-False}
case $USE_CUDA in
  "True")
    echo "using cuda"
    CUDA=1
    CUDA_FLAG="--cuda"
    ;;
  "False")
    echo "not using cuda"
    CUDA=0
    CUDA_FLAG=""
    ;;
  *)
    echo "unrecognized USE_CUDA value: '$USE_CUDA' (expected True or False)"
    exit 1
    ;;
esac
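# CUDA and CUDA_FLAG are not referenced again in this script; they are presumably consumed by
# utils.sh or by the per-example launch scripts.
# Each function below corresponds to one example directory; run_example.sh (expected to live in
# the tensor parallelism example directory) is assumed to wrap the multi-process launch.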
function distributed_tensor_parallelism() {
  uv run bash run_example.sh tensor_parallel_example.py || error "tensor parallel example failed"
  uv run bash run_example.sh sequence_parallel_example.py || error "sequence parallel example failed"
  uv run bash run_example.sh fsdp_tp_example.py || error "2D parallel example failed"
}
function distributed_ddp() {
  uv run main.py || error "ddp example failed"
}
function run_all() {
  run distributed/tensor_parallelism
  run distributed/ddp
}
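# The run helper from utils.sh is assumed to change into the named example directory, install
# its requirements, invoke the matching function above, and record any failures in ERRORS.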
# by default, run all examples
if [ "" == "$EXAMPLES" ]; then
  run_all
else
  for i in $(echo $EXAMPLES | sed "s/,/ /g")
  do
    echo "Starting $i"
    run $i
    echo "Finished $i, status $?"
  done
fi
if [ "" == "$ERRORS" ]; then
  echo "Completed successfully"
else
  echo "Some distributed examples failed:"
  printf "%s\n" "$ERRORS"
  # Exit with error (0-255) in case of failure in one of the tests.
  exit 1
fi