Add multi-GPU experiment runner script and experiments configuration

2026-01-08 13:57:34 +08:00
parent 01a96d37ea
commit d20d32ba22
2 changed files with 193 additions and 0 deletions
--- a/experiments.txt
+++ b/experiments.txt
@@ -0,0 +1,9 @@
+model_type,loss_type,age_encoder,full_cov
+delphi_fork,exponential,sinusoidal,False
+delphi_fork,piecewise_exponential,sinusoidal,False
+delphi_fork,exponential,mlp,False
+delphi_fork,piecewise_exponential,mlp,False
+delphi_fork,exponential,sinusoidal,True
+delphi_fork,piecewise_exponential,sinusoidal,True
+sap_delphi,exponential,sinusoidal,False
+sap_delphi,piecewise_exponential,sinusoidal,False
--- a/run_experiments_multi_gpu.sh
+++ b/run_experiments_multi_gpu.sh
@@ -0,0 +1,184 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Print the command-line help text to stdout.
+# Takes no arguments; the caller decides the exit status and may redirect the
+# output (e.g. to stderr on a usage error).
+usage() {
+  cat <<'USAGE'
+Usage:
+  ./run_experiments_multi_gpu.sh --gpus 0,1,2 [--experiments experiments.txt] [--cmd "python train.py"] [--dry-run] [-- <extra train.py args>]
+
+Description:
+  Distributes rows from experiments.txt across multiple GPUs (round-robin) and runs
+  at most one job per GPU at a time.
+
+Options:
+  --gpus LIST             Comma-separated GPU ids, one worker per id (required).
+  --experiments, -f FILE  CSV of experiments to run (default: experiments.txt).
+  --cmd "CMD ARGS"        Base training command, split on whitespace
+                          (default: "python train.py").
+  --dry-run               Print each command instead of running it.
+  --help, -h              Show this help and exit.
+  --                      Pass all remaining arguments to the training command.
+
+Examples:
+  ./run_experiments_multi_gpu.sh --gpus 0,1,2
+  ./run_experiments_multi_gpu.sh --gpus 0,1 --experiments experiments.txt -- --batch_size 64 --max_epochs 50
+  ./run_experiments_multi_gpu.sh --gpus 3 --cmd "python train.py" -- --loss_type weibull
+USAGE
+}
+
+# Defaults; each can be overridden by the options parsed below.
+experiments_file="experiments.txt"
+gpu_list=""
+cmd_str="python train.py"
+dry_run=0
+extra_args=()
+
+# Parse command-line options. Everything after a literal "--" is kept verbatim
+# in extra_args and is later appended to every training command.
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --gpus|--experiments|-f|--cmd)
+      # These options take a value. Check for it explicitly: `shift 2` with
+      # only one argument left fails, and under `set -e` that would end the
+      # script with no diagnostic at all.
+      if [[ $# -lt 2 ]]; then
+        echo "Error: $1 requires a value." >&2
+        usage >&2
+        exit 2
+      fi
+      case "$1" in
+        --gpus)
+          gpu_list="$2"
+          ;;
+        --experiments|-f)
+          experiments_file="$2"
+          ;;
+        --cmd)
+          cmd_str="$2"
+          ;;
+      esac
+      shift 2
+      ;;
+    --dry-run)
+      dry_run=1
+      shift
+      ;;
+    --help|-h)
+      usage
+      exit 0
+      ;;
+    --)
+      # Stop option parsing; the rest belongs to the training command.
+      shift
+      extra_args=("$@")
+      break
+      ;;
+    *)
+      # Usage errors are diagnostics, so the help text goes to stderr too.
+      echo "Unknown argument: $1" >&2
+      usage >&2
+      exit 2
+      ;;
+  esac
+done
+
+# --- Validate inputs; every failure here exits with status 2. ---
+
+if [[ -z "$gpu_list" ]]; then
+  echo "Error: --gpus is required (e.g. --gpus 0,1,2)." >&2
+  exit 2
+fi
+
+if [[ ! -f "$experiments_file" ]]; then
+  echo "Error: experiments file not found: $experiments_file" >&2
+  exit 2
+fi
+
+# Split the comma-separated list into one array element per GPU id.
+IFS=',' read -r -a gpus <<< "$gpu_list"
+if [[ ${#gpus[@]} -lt 1 ]]; then
+  echo "Error: parsed 0 GPUs from --gpus '$gpu_list'" >&2
+  exit 2
+fi
+
+# Tolerate whitespace around ids ("0, 1") and reject empty entries ("0,,1"):
+# each entry is exported verbatim as CUDA_VISIBLE_DEVICES for one worker, so an
+# empty entry would start a worker with an empty device list.
+for i in "${!gpus[@]}"; do
+  gpus[i]="${gpus[i]//[[:space:]]/}"
+  if [[ -z "${gpus[i]}" ]]; then
+    echo "Error: empty GPU id in --gpus '$gpu_list'" >&2
+    exit 2
+  fi
+done
+
+# Parse cmd string into an argv array. Word splitting is intentional here, but
+# pathname expansion is not: it is switched off while splitting so that glob
+# characters in --cmd stay literal instead of matching files in the cwd.
+set -f
+# shellcheck disable=SC2206
+cmd=($cmd_str)
+set +f
+if [[ ${#cmd[@]} -lt 2 ]]; then
+  echo "Error: --cmd should look like 'python train.py'" >&2
+  exit 2
+fi
+
+# Scratch directory for the per-GPU queue files. It starts out empty so that
+# the EXIT handler is a no-op if the script exits before mktemp has run.
+_tmpdir=""
+
+# EXIT handler: remove the scratch directory if one was created and still exists.
+cleanup() {
+  [[ -n "$_tmpdir" ]] || return 0
+  [[ -d "$_tmpdir" ]] || return 0
+  rm -rf "$_tmpdir"
+}
+
+# Register the handler first, then create the directory.
+trap cleanup EXIT
+_tmpdir="$(mktemp -d)"
+
+# Create one empty queue file per GPU slot; queue_files[k] belongs to gpus[k].
+queue_files=()
+for gpu_slot in "${!gpus[@]}"; do
+  queue_path="${_tmpdir}/queue_${gpu_slot}.csv"
+  : > "$queue_path"
+  queue_files+=("$queue_path")
+done
+
+# Deal the experiment rows out to the queues in round-robin order.
+# The `|| [[ -n "$row" ]]` keeps a final row that has no trailing newline.
+num_gpus=${#gpus[@]}
+exp_idx=0
+while IFS= read -r row || [[ -n "$row" ]]; do
+  # Drop a trailing carriage return so CRLF files work.
+  row="${row%$'\r'}"
+
+  # Ignore blank lines and the CSV header row.
+  case "$row" in
+    '' | model_type,*) continue ;;
+  esac
+
+  # Each queued row is prefixed with its global experiment index, which the
+  # workers use in their log lines.
+  target=$((exp_idx % num_gpus))
+  printf '%s,%s\n' "$exp_idx" "$row" >> "${queue_files[target]}"
+  exp_idx=$((exp_idx + 1))
+done < "$experiments_file"
+
+if ((exp_idx == 0)); then
+  echo "No experiments found in $experiments_file" >&2
+  exit 1
+fi
+
+echo "Queued $exp_idx experiments across ${#gpus[@]} GPU(s): ${gpus[*]}"
+
+# Launch one background worker per GPU. Each worker is a subshell that exports
+# CUDA_VISIBLE_DEVICES for its GPU id and runs the rows of its queue file one
+# after another, so at most one job runs per GPU at a time.
+pids=()
+for i in "${!gpus[@]}"; do
+  gpu="${gpus[$i]}"
+  qfile="${queue_files[$i]}"
+
+  (
+    export CUDA_VISIBLE_DEVICES="$gpu"
+
+    # Each queue row is: exp_id,model_type,loss_type,age_encoder,full_cov.
+    # The trailing `|| [[ -n ... ]]` still processes a last row without a newline.
+    while IFS=',' read -r exp_id model_type loss_type age_encoder full_cov || [[ -n "${exp_id-}" ]]; do
+      # Skip empty lines
+      [[ -z "${exp_id-}" ]] && continue
+
+      # Normalize booleans / strip whitespace
+      full_cov="${full_cov//[[:space:]]/}"
+
+      run_cmd=("${cmd[@]}" \
+        --model_type "$model_type" \
+        --loss_type "$loss_type" \
+        --age_encoder "$age_encoder")
+
+      # --full_cov is a bare flag: pass it only for a truthy value.
+      if [[ "$full_cov" == "True" || "$full_cov" == "true" || "$full_cov" == "1" ]]; then
+        run_cmd+=(--full_cov)
+      fi
+
+      # User-supplied pass-through arguments go last.
+      if [[ ${#extra_args[@]} -gt 0 ]]; then
+        run_cmd+=("${extra_args[@]}")
+      fi
+
+      echo "[GPU $gpu] START exp $exp_id: model_type=$model_type loss_type=$loss_type age_encoder=$age_encoder full_cov=$full_cov"
+
+      if [[ $dry_run -eq 1 ]]; then
+        # Print the command shell-quoted so it can be copy-pasted.
+        printf '[GPU %s] CMD: ' "$gpu"
+        printf '%q ' "${run_cmd[@]}"
+        echo
+      else
+        # The loop's stdin is the queue file. Give the job /dev/null instead,
+        # so a command that reads stdin cannot swallow the remaining rows.
+        status=0
+        "${run_cmd[@]}" < /dev/null || status=$?
+        if [[ $status -ne 0 ]]; then
+          echo "[GPU $gpu] FAILED exp $exp_id (exit $status)" >&2
+          # Stop this worker on the first failure and propagate the job's
+          # exit status; the parent detects it when it waits on this PID.
+          exit "$status"
+        fi
+      fi
+
+      echo "[GPU $gpu] DONE  exp $exp_id"
+    done < "$qfile"
+  ) &
+
+  # Remember the worker's PID so the parent can wait on it.
+  pids+=("$!")
+done
+
+# Reap every worker and remember whether any of them exited non-zero.
+any_failed=0
+for worker_pid in "${pids[@]}"; do
+  wait "$worker_pid" || any_failed=1
+done
+
+# Report the overall outcome; a single failed worker fails the whole run.
+if [[ $any_failed -eq 0 ]]; then
+  echo "All experiments complete."
+else
+  echo "One or more workers failed." >&2
+  exit 1
+fi