From d20d32ba22f43a1b0e65186f6f83a2557b67a40d Mon Sep 17 00:00:00 2001
From: Jiarui Li
Date: Thu, 8 Jan 2026 13:57:34 +0800
Subject: [PATCH] Add multi-GPU experiment runner script and experiments configuration

---
 experiments.txt              |   9 ++
 run_experiments_multi_gpu.sh | 240 +++++++++++++++++++++++++++++++++++
 2 files changed, 249 insertions(+)
 create mode 100644 experiments.txt
 create mode 100644 run_experiments_multi_gpu.sh

diff --git a/experiments.txt b/experiments.txt
new file mode 100644
index 0000000..0124557
--- /dev/null
+++ b/experiments.txt
@@ -0,0 +1,9 @@
+model_type,loss_type,age_encoder,full_cov
+delphi_fork,exponential,sinusoidal,False
+delphi_fork,piecewise_exponential,sinusoidal,False
+delphi_fork,exponential,mlp,False
+delphi_fork,piecewise_exponential,mlp,False
+delphi_fork,exponential,sinusoidal,True
+delphi_fork,piecewise_exponential,sinusoidal,True
+sap_delphi,exponential,sinusoidal,False
+sap_delphi,piecewise_exponential,sinusoidal,False
\ No newline at end of file
diff --git a/run_experiments_multi_gpu.sh b/run_experiments_multi_gpu.sh
new file mode 100644
--- /dev/null
+++ b/run_experiments_multi_gpu.sh
@@ -0,0 +1,240 @@
+#!/usr/bin/env bash
+#
+# Run the experiments listed in a CSV file across several GPUs.
+#
+# Rows are dealt round-robin into one queue per GPU. Each GPU gets one
+# background worker that runs its queue sequentially with
+# CUDA_VISIBLE_DEVICES set to that GPU, so at most one job runs per GPU.
+#
+# Exit status: 0 if every experiment succeeded, 1 if any experiment failed or
+# the experiments file has no rows, 2 on a usage error.
+set -euo pipefail
+
+usage() {
+  cat <<'USAGE'
+Usage:
+  ./run_experiments_multi_gpu.sh --gpus 0,1,2 [--experiments experiments.txt] [--cmd "python train.py"] [--dry-run] [-- EXTRA_ARGS...]
+
+Description:
+  Distributes rows from experiments.txt across multiple GPUs (round-robin) and runs
+  at most one job per GPU at a time. Arguments after "--" are appended to every job.
+  A failed job does not stop the remaining jobs; the exit status is non-zero if any
+  job failed.
+
+Examples:
+  ./run_experiments_multi_gpu.sh --gpus 0,1,2
+  ./run_experiments_multi_gpu.sh --gpus 0,1 --experiments experiments.txt -- --batch_size 64 --max_epochs 50
+  ./run_experiments_multi_gpu.sh --gpus 3 --cmd "python train.py" -- --loss_type weibull
+USAGE
+}
+
+# Print "Error: <message>" to stderr and exit with the usage-error status.
+die_usage() {
+  echo "Error: $*" >&2
+  exit 2
+}
+
+# Abort unless option $1 is followed by a value. $2 is the number of arguments
+# still unparsed, counting the option itself. Without this check a trailing
+# "--gpus" makes "shift 2" fail and set -e ends the script with no message.
+require_value() {
+  if [[ "$2" -lt 2 ]]; then
+    die_usage "$1 requires a value."
+  fi
+}
+
+experiments_file="experiments.txt"
+gpu_list=""
+cmd_str="python train.py"
+dry_run=0
+extra_args=()
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --gpus)
+      require_value "$1" "$#"
+      gpu_list="$2"
+      shift 2
+      ;;
+    --experiments|-f)
+      require_value "$1" "$#"
+      experiments_file="$2"
+      shift 2
+      ;;
+    --cmd)
+      require_value "$1" "$#"
+      cmd_str="$2"
+      shift 2
+      ;;
+    --dry-run)
+      dry_run=1
+      shift
+      ;;
+    --help|-h)
+      usage
+      exit 0
+      ;;
+    --)
+      shift
+      extra_args=("$@")
+      break
+      ;;
+    *)
+      echo "Unknown argument: $1" >&2
+      usage
+      exit 2
+      ;;
+  esac
+done
+
+if [[ -z "$gpu_list" ]]; then
+  die_usage "--gpus is required (e.g. --gpus 0,1,2)."
+fi
+
+if [[ ! -f "$experiments_file" ]]; then
+  die_usage "experiments file not found: $experiments_file"
+fi
+
+# Split the GPU list on commas, then drop whitespace and empty entries so that
+# "0, 1" or "0,,1" cannot produce a worker with an empty CUDA_VISIBLE_DEVICES.
+IFS=',' read -r -a raw_gpus <<< "$gpu_list"
+gpus=()
+# The ${arr[@]+...} form keeps an empty array from tripping set -u on bash < 4.4.
+for gpu in ${raw_gpus[@]+"${raw_gpus[@]}"}; do
+  gpu="${gpu//[[:space:]]/}"
+  if [[ -n "$gpu" ]]; then
+    gpus+=("$gpu")
+  fi
+done
+if [[ ${#gpus[@]} -lt 1 ]]; then
+  die_usage "parsed 0 GPUs from --gpus '$gpu_list'"
+fi
+
+# Split --cmd into words. read -a splits on whitespace without glob-expanding
+# the words, which an unquoted $cmd_str expansion would do.
+read -r -a cmd <<< "$cmd_str"
+if [[ ${#cmd[@]} -lt 2 ]]; then
+  die_usage "--cmd should look like 'python train.py'"
+fi
+
+_tmpdir=""
+# Remove the temporary queue directory; runs on every exit path of the script.
+cleanup() {
+  if [[ -n "${_tmpdir}" && -d "${_tmpdir}" ]]; then
+    rm -rf -- "${_tmpdir}"
+  fi
+}
+trap cleanup EXIT
+
+_tmpdir="$(mktemp -d)"
+
+# Prepare per-GPU queue files.
+queue_files=()
+for i in "${!gpus[@]}"; do
+  qfile="${_tmpdir}/queue_${i}.csv"
+  : > "$qfile"
+  queue_files+=("$qfile")
+done
+
+# Distribute experiments round-robin.
+exp_idx=0
+while IFS= read -r line || [[ -n "$line" ]]; do
+  line="${line%$'\r'}"  # handle CRLF
+  [[ -z "$line" ]] && continue
+  # Skip header if present
+  if [[ "$line" == model_type,* ]]; then
+    continue
+  fi
+
+  slot=$((exp_idx % ${#gpus[@]}))
+  # Prefix a stable experiment index for logging.
+  printf '%s,%s\n' "$exp_idx" "$line" >> "${queue_files[$slot]}"
+  exp_idx=$((exp_idx + 1))
+done < "$experiments_file"
+
+if [[ $exp_idx -eq 0 ]]; then
+  echo "No experiments found in $experiments_file" >&2
+  exit 1
+fi
+
+echo "Queued $exp_idx experiments across ${#gpus[@]} GPU(s): ${gpus[*]}"
+
+# Run every experiment queued in file $2, one after another, on GPU $1.
+# Reads the globals cmd, extra_args and dry_run.
+# Returns 0 if every job succeeded, 1 if at least one failed.
+run_queue() {
+  local gpu="$1" qfile="$2"
+  local exp_id model_type loss_type age_encoder full_cov status
+  local failed=0
+  local -a run_cmd
+
+  # The queue is read on fd 3 rather than stdin so that a job which reads its
+  # standard input cannot swallow the rows that are still queued.
+  while IFS=',' read -r -u 3 exp_id model_type loss_type age_encoder full_cov \
+      || [[ -n "${exp_id-}" ]]; do
+    # Skip empty lines
+    if [[ -z "${exp_id-}" ]]; then
+      continue
+    fi
+
+    # Normalize booleans / strip whitespace
+    full_cov="${full_cov//[[:space:]]/}"
+
+    run_cmd=("${cmd[@]}"
+      --model_type "$model_type"
+      --loss_type "$loss_type"
+      --age_encoder "$age_encoder")
+
+    if [[ "$full_cov" == "True" || "$full_cov" == "true" || "$full_cov" == "1" ]]; then
+      run_cmd+=(--full_cov)
+    fi
+
+    if [[ ${#extra_args[@]} -gt 0 ]]; then
+      run_cmd+=("${extra_args[@]}")
+    fi
+
+    echo "[GPU $gpu] START exp $exp_id: model_type=$model_type loss_type=$loss_type age_encoder=$age_encoder full_cov=$full_cov"
+
+    if [[ $dry_run -eq 1 ]]; then
+      printf '[GPU %s] CMD: ' "$gpu"
+      printf '%q ' "${run_cmd[@]}"
+      echo
+    else
+      # Record a failure instead of letting set -e end the worker, so the
+      # experiments still queued for this GPU are not skipped.
+      status=0
+      CUDA_VISIBLE_DEVICES="$gpu" "${run_cmd[@]}" 3<&- || status=$?
+      if [[ $status -ne 0 ]]; then
+        echo "[GPU $gpu] FAILED exp $exp_id (exit status $status)" >&2
+        failed=1
+        continue
+      fi
+    fi
+
+    echo "[GPU $gpu] DONE exp $exp_id"
+  done 3< "$qfile"
+
+  return "$failed"
+}
+
+# Start one background worker per GPU.
+pids=()
+for i in "${!gpus[@]}"; do
+  run_queue "${gpus[$i]}" "${queue_files[$i]}" &
+  pids+=("$!")
+done
+
+# Wait for all GPU workers.
+fail=0
+for pid in "${pids[@]}"; do
+  if ! wait "$pid"; then
+    fail=1
+  fi
+done
+
+if [[ $fail -ne 0 ]]; then
+  echo "One or more workers failed." >&2
+  exit 1
+fi
+
+echo "All experiments complete."