DeepHealth/run_experiments_multi_gpu.sh

#!/usr/bin/env bash
set -euo pipefail

# Print the command-line synopsis, a short description, and example
# invocations to stdout. Takes no arguments.
usage() {
  printf '%s\n' \
    'Usage:' \
    '  ./run_experiments_multi_gpu.sh --gpus 0,1,2 [--experiments experiments.txt] [--cmd "python train.py"] [--dry-run] [-- <extra train.py args>]' \
    '' \
    'Description:' \
    '  Distributes rows from experiments.txt across multiple GPUs (round-robin) and runs' \
    '  at most one job per GPU at a time.' \
    '' \
    'Examples:' \
    '  ./run_experiments_multi_gpu.sh --gpus 0,1,2' \
    '  ./run_experiments_multi_gpu.sh --gpus 0,1 --experiments experiments.txt -- --batch_size 64 --max_epochs 50' \
    '  ./run_experiments_multi_gpu.sh --gpus 3 --cmd "python train.py" -- --loss_type weibull'
}

# Defaults; each may be overridden on the command line.
experiments_file="experiments.txt" # file with one experiment per row
gpu_list=""                        # comma-separated GPU ids (required)
cmd_str="python train.py"          # base training command, split on whitespace later
dry_run=0                          # 1 = print the commands instead of running them
extra_args=()                      # everything after `--`, forwarded verbatim

#######################################
# Parse command-line options into the globals declared above.
# Globals:   experiments_file, gpu_list, cmd_str, dry_run, extra_args (written)
# Arguments: the script's argument vector
# Outputs:   error text on stderr; usage text on stdout for --help and for
#            unknown arguments
# Returns:   0 on success. Exits 0 after --help and 2 on any usage error.
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --gpus|--experiments|-f|--cmd)
        # Without this check a trailing option with no value makes `shift 2`
        # fail, and `set -e` then terminates the script without any message.
        if [[ $# -lt 2 ]]; then
          echo "Error: $1 requires a value." >&2
          exit 2
        fi
        case "$1" in
          --gpus) gpu_list="$2" ;;
          --experiments|-f) experiments_file="$2" ;;
          --cmd) cmd_str="$2" ;;
        esac
        shift 2
        ;;
      --dry-run)
        dry_run=1
        shift
        ;;
      --help|-h)
        usage
        exit 0
        ;;
      --)
        # Everything after `--` is passed through to the training command.
        shift
        extra_args=("$@")
        break
        ;;
      *)
        echo "Unknown argument: $1" >&2
        usage
        exit 2
        ;;
    esac
  done
}

parse_args "$@"

# --- Validate required inputs -------------------------------------------------
if [[ -z "$gpu_list" ]]; then
  echo "Error: --gpus is required (e.g. --gpus 0,1,2)." >&2
  exit 2
fi

if [[ ! -f "$experiments_file" ]]; then
  echo "Error: experiments file not found: $experiments_file" >&2
  exit 2
fi

# --- Split the comma-separated GPU list ---------------------------------------
IFS=',' read -r -a gpu_fields <<< "$gpu_list"
if [[ ${#gpu_fields[@]} -lt 1 ]]; then
  echo "Error: parsed 0 GPUs from --gpus '$gpu_list'" >&2
  exit 2
fi

# Trim surrounding whitespace from each id and reject empty entries, so inputs
# such as "0, 1", "0,,1" or "," cannot start a worker whose
# CUDA_VISIBLE_DEVICES is blank or padded with spaces.
gpus=()
for gpu_id in "${gpu_fields[@]}"; do
  gpu_id="${gpu_id#"${gpu_id%%[![:space:]]*}"}" # strip leading whitespace
  gpu_id="${gpu_id%"${gpu_id##*[![:space:]]}"}" # strip trailing whitespace
  if [[ -z "$gpu_id" ]]; then
    echo "Error: empty GPU id in --gpus '$gpu_list'" >&2
    exit 2
  fi
  gpus+=("$gpu_id")
done

# --- Split the command string into an argv array ------------------------------
# `read -a` splits on whitespace like an unquoted expansion would, but without
# pathname expansion, so glob characters in --cmd stay literal. With -d '' the
# whole string is consumed (including embedded newlines); read then reports
# end-of-input with a non-zero status, which is expected and ignored here.
cmd=()
read -r -d '' -a cmd <<< "$cmd_str" || true
if [[ ${#cmd[@]} -lt 2 ]]; then
  echo "Error: --cmd should look like 'python train.py'" >&2
  exit 2
fi

# Remove the scratch directory, if one was created and still exists.
# Safe to call more than once.
cleanup() {
  [[ -z "${_tmpdir}" || ! -d "${_tmpdir}" ]] && return 0
  rm -rf "${_tmpdir}"
}

# The variable is initialised empty and the trap is installed before the
# directory is created, so cleanup is well-defined even if mktemp fails.
_tmpdir=""
trap cleanup EXIT

_tmpdir="$(mktemp -d)"

# Create one empty queue file per GPU slot inside the scratch directory.
queue_files=()
for i in "${!gpus[@]}"; do
  queue_path="${_tmpdir}/queue_${i}.csv"
  queue_files+=("$queue_path")
  : > "$queue_path"
done

# Deal the experiment rows out to the queues in round-robin order. Each queued
# row is prefixed with its running index so log lines can refer to it.
exp_idx=0
gpu_count=${#gpus[@]}
while IFS= read -r row || [[ -n "$row" ]]; do
  row="${row%$'\r'}" # tolerate CRLF line endings
  case "$row" in
    '' | model_type,*)
      # Blank line, or the header row: nothing to queue.
      continue
      ;;
  esac

  printf '%s,%s\n' "$exp_idx" "$row" >> "${queue_files[exp_idx % gpu_count]}"
  exp_idx=$((exp_idx + 1))
done < "$experiments_file"

if ((exp_idx == 0)); then
  echo "No experiments found in $experiments_file" >&2
  exit 1
fi

echo "Queued $exp_idx experiments across ${#gpus[@]} GPU(s): ${gpus[*]}"

#######################################
# Run every experiment queued for one GPU, one after another.
# Meant to be invoked in a subshell (e.g. with `&`), because it exports
# CUDA_VISIBLE_DEVICES into the environment it runs in.
# Globals:   cmd, extra_args, dry_run (read)
# Arguments: $1 - GPU id to expose through CUDA_VISIBLE_DEVICES
#            $2 - queue file; each row is
#                 exp_id,model_type,loss_type,age_encoder,full_cov
# Outputs:   START/DONE (and, in dry-run mode, CMD) lines on stdout;
#            a FAILED line on stderr when an experiment exits non-zero
# Returns:   0 when every experiment succeeded; otherwise the exit status of
#            the first failing experiment (the rest of the queue is skipped)
#######################################
_run_gpu_worker() {
  local gpu=$1 qfile=$2
  local exp_id model_type loss_type age_encoder full_cov rc
  local -a run_cmd

  export CUDA_VISIBLE_DEVICES="$gpu"

  while IFS=',' read -r exp_id model_type loss_type age_encoder full_cov || [[ -n "${exp_id-}" ]]; do
    # Skip empty lines
    [[ -z "${exp_id-}" ]] && continue

    # Normalize booleans / strip whitespace
    full_cov="${full_cov//[[:space:]]/}"

    run_cmd=("${cmd[@]}"
      --model_type "$model_type"
      --loss_type "$loss_type"
      --age_encoder "$age_encoder")

    if [[ "$full_cov" == "True" || "$full_cov" == "true" || "$full_cov" == "1" ]]; then
      run_cmd+=(--full_cov)
    fi

    if [[ ${#extra_args[@]} -gt 0 ]]; then
      run_cmd+=("${extra_args[@]}")
    fi

    echo "[GPU $gpu] START exp $exp_id: model_type=$model_type loss_type=$loss_type age_encoder=$age_encoder full_cov=$full_cov"

    if [[ $dry_run -eq 1 ]]; then
      printf '[GPU %s] CMD: ' "$gpu"
      printf '%q ' "${run_cmd[@]}"
      echo
    else
      # The loop's stdin is the queue file. Give the command an empty stdin so
      # a program that reads stdin cannot consume the remaining queue rows.
      rc=0
      "${run_cmd[@]}" < /dev/null || rc=$?
      if [[ $rc -ne 0 ]]; then
        # Stop this worker at the first failure, but say which experiment it was.
        echo "[GPU $gpu] FAILED exp $exp_id (exit status $rc)" >&2
        return "$rc"
      fi
    fi

    echo "[GPU $gpu] DONE  exp $exp_id"
  done < "$qfile"
}

# Start one background worker per GPU and remember its PID.
pids=()
for i in "${!gpus[@]}"; do
  _run_gpu_worker "${gpus[$i]}" "${queue_files[$i]}" &
  pids+=("$!")
done

# Reap every worker in launch order and remember whether any exited non-zero.
fail=0
for pid in "${pids[@]}"; do
  wait "$pid" || fail=1
done

# Report the overall outcome; a single failed worker fails the whole run.
case "$fail" in
  0)
    echo "All experiments complete."
    ;;
  *)
    echo "One or more workers failed." >&2
    exit 1
    ;;
esac
Add multi-GPU experiment runner script and experiments configuration 2026-01-08 13:57:34 +08:00			`#!/usr/bin/env bash`
			`set -euo pipefail`

			`usage() {`
			`cat <<'USAGE'`
			`Usage:`
			`./run_experiments_multi_gpu.sh --gpus 0,1,2 [--experiments experiments.txt] [--cmd "python train.py"] [--dry-run] [-- <extra train.py args>]`

			`Description:`
			`Distributes rows from experiments.txt across multiple GPUs (round-robin) and runs`
			`at most one job per GPU at a time.`

			`Examples:`
			`./run_experiments_multi_gpu.sh --gpus 0,1,2`
			`./run_experiments_multi_gpu.sh --gpus 0,1 --experiments experiments.txt -- --batch_size 64 --max_epochs 50`
			`./run_experiments_multi_gpu.sh --gpus 3 --cmd "python train.py" -- --loss_type weibull`
			`USAGE`
			`}`

			`experiments_file="experiments.txt"`
			`gpu_list=""`
			`cmd_str="python train.py"`
			`dry_run=0`
			`extra_args=()`

			`while [[ $# -gt 0 ]]; do`
			`case "$1" in`
			`--gpus)`
			`gpu_list="${2-}"`
			`shift 2`
			`;;`
			`--experiments\|-f)`
			`experiments_file="${2-}"`
			`shift 2`
			`;;`
			`--cmd)`
			`cmd_str="${2-}"`
			`shift 2`
			`;;`
			`--dry-run)`
			`dry_run=1`
			`shift`
			`;;`
			`--help\|-h)`
			`usage`
			`exit 0`
			`;;`
			`--)`
			`shift`
			`extra_args=("$@")`
			`break`
			`;;`
			`*)`
			`echo "Unknown argument: $1" >&2`
			`usage`
			`exit 2`
			`;;`
			`esac`
			`done`

			`if [[ -z "$gpu_list" ]]; then`
			`echo "Error: --gpus is required (e.g. --gpus 0,1,2)." >&2`
			`exit 2`
			`fi`

			`if [[ ! -f "$experiments_file" ]]; then`
			`echo "Error: experiments file not found: $experiments_file" >&2`
			`exit 2`
			`fi`

			`IFS=',' read -r -a gpus <<< "$gpu_list"`
			`if [[ ${#gpus[@]} -lt 1 ]]; then`
			`echo "Error: parsed 0 GPUs from --gpus '$gpu_list'" >&2`
			`exit 2`
			`fi`

			`# Parse cmd string into an argv array.`
			`# shellcheck disable=SC2206`
			`cmd=($cmd_str)`
			`if [[ ${#cmd[@]} -lt 2 ]]; then`
			`echo "Error: --cmd should look like 'python train.py'" >&2`
			`exit 2`
			`fi`

			`_tmpdir=""`
			`cleanup() {`
			`if [[ -n "${_tmpdir}" && -d "${_tmpdir}" ]]; then`
			`rm -rf "${_tmpdir}"`
			`fi`
			`}`
			`trap cleanup EXIT`

			`_tmpdir="$(mktemp -d)"`

			`# Prepare per-GPU queue files.`
			`queue_files=()`
			`for i in "${!gpus[@]}"; do`
			`qfile="${_tmpdir}/queue_${i}.csv"`
			`: > "$qfile"`
			`queue_files+=("$qfile")`
			`done`

			`# Distribute experiments round-robin.`
			`exp_idx=0`
			`while IFS= read -r line \|\| [[ -n "$line" ]]; do`
			`line="${line%$'\r'}" # handle CRLF`
			`[[ -z "$line" ]] && continue`
			`# Skip header if present`
			`if [[ "$line" == model_type,* ]]; then`
			`continue`
			`fi`

			`slot=$((exp_idx % ${#gpus[@]}))`
			`# Prefix a stable experiment index for logging.`
			`printf '%s,%s\n' "$exp_idx" "$line" >> "${queue_files[$slot]}"`
			`exp_idx=$((exp_idx + 1))`
			`done < "$experiments_file"`

			`if [[ $exp_idx -eq 0 ]]; then`
			`echo "No experiments found in $experiments_file" >&2`
			`exit 1`
			`fi`

			`echo "Queued $exp_idx experiments across ${#gpus[@]} GPU(s): ${gpus[*]}"`

			`pids=()`
			`for i in "${!gpus[@]}"; do`
			`gpu="${gpus[$i]}"`
			`qfile="${queue_files[$i]}"`

			`(`
			`export CUDA_VISIBLE_DEVICES="$gpu"`

			`while IFS=',' read -r exp_id model_type loss_type age_encoder full_cov \|\| [[ -n "${exp_id-}" ]]; do`
			`# Skip empty lines`
			`[[ -z "${exp_id-}" ]] && continue`

			`# Normalize booleans / strip whitespace`
			`full_cov="${full_cov//[[:space:]]/}"`

			`run_cmd=("${cmd[@]}" \`
			`--model_type "$model_type" \`
			`--loss_type "$loss_type" \`
			`--age_encoder "$age_encoder")`

			`if [[ "$full_cov" == "True" \|\| "$full_cov" == "true" \|\| "$full_cov" == "1" ]]; then`
			`run_cmd+=(--full_cov)`
			`fi`

			`if [[ ${#extra_args[@]} -gt 0 ]]; then`
			`run_cmd+=("${extra_args[@]}")`
			`fi`

			`echo "[GPU $gpu] START exp $exp_id: model_type=$model_type loss_type=$loss_type age_encoder=$age_encoder full_cov=$full_cov"`

			`if [[ $dry_run -eq 1 ]]; then`
			`printf '[GPU %s] CMD: ' "$gpu"`
			`printf '%q ' "${run_cmd[@]}"`
			`echo`
			`else`
			`"${run_cmd[@]}"`
			`fi`

			`echo "[GPU $gpu] DONE exp $exp_id"`
			`done < "$qfile"`
			`) &`

			`pids+=("$!")`
			`done`

			`# Wait for all GPU workers.`
			`fail=0`
			`for pid in "${pids[@]}"; do`
			`if ! wait "$pid"; then`
			`fail=1`
			`fi`
			`done`

			`if [[ $fail -ne 0 ]]; then`
			`echo "One or more workers failed." >&2`
			`exit 1`
			`fi`

			`echo "All experiments complete."`