#!/usr/bin/env bash
#
# run_experiments_multi_gpu.sh
#
# Deals the rows of an experiments file out round-robin to the GPUs named by
# --gpus and starts one background worker per GPU. Each worker runs its rows
# sequentially with CUDA_VISIBLE_DEVICES set to its GPU, so at most one job
# runs per GPU at a time.
#
# Experiments file format (one experiment per line, comma separated):
#   model_type,loss_type,age_encoder,full_cov
# A line starting with "model_type," is treated as a header and skipped, as
# are empty lines. full_cov values True / true / 1 add the --full_cov flag.
#
# Exit status:
#   0  all experiments completed
#   1  no experiments found, or at least one worker failed
#   2  usage / configuration error
set -euo pipefail

# Print the help text to stdout.
usage() {
  cat <<'USAGE'
Usage:
  ./run_experiments_multi_gpu.sh --gpus 0,1,2 [--experiments experiments.txt] [--cmd "python train.py"] [--log-dir experiment_logs] [--dry-run] [-- EXTRA_ARGS...]

Description:
  Distributes rows from experiments.txt across multiple GPUs (round-robin) and
  runs at most one job per GPU at a time.
  Arguments after -- are appended verbatim to every training command.

Examples:
  ./run_experiments_multi_gpu.sh --gpus 0,1,2
  ./run_experiments_multi_gpu.sh --gpus 0,1 --experiments experiments.txt -- --batch_size 64 --max_epochs 50
  ./run_experiments_multi_gpu.sh --gpus 3 --cmd "python train.py" -- --loss_type discrete_time_cif
USAGE
}

# Print "Error: <message>" to stderr and exit with status 2.
die_usage() {
  printf 'Error: %s\n' "$*" >&2
  exit 2
}

# Abort with a clear message when a value-taking option is the last argument.
# Without this check `shift 2` fails and `set -e` ends the script silently.
#   $1 - option name as typed
#   $2 - number of arguments still to be parsed (the caller's $#)
require_value() {
  if [[ "$2" -lt 2 ]]; then
    die_usage "$1 requires a value."
  fi
}

experiments_file="experiments.txt"
gpu_list=""
cmd_str="python train.py"
log_dir="experiment_logs"
dry_run=0
extra_args=()

while [[ $# -gt 0 ]]; do
  case "$1" in
    --gpus)
      require_value "$1" "$#"
      gpu_list="$2"
      shift 2
      ;;
    --experiments|-f)
      require_value "$1" "$#"
      experiments_file="$2"
      shift 2
      ;;
    --cmd)
      require_value "$1" "$#"
      cmd_str="$2"
      shift 2
      ;;
    --log-dir)
      require_value "$1" "$#"
      log_dir="$2"
      shift 2
      ;;
    --dry-run)
      dry_run=1
      shift
      ;;
    --help|-h)
      usage
      exit 0
      ;;
    --)
      # Everything after -- is passed through to the training command.
      shift
      extra_args=("$@")
      break
      ;;
    *)
      echo "Unknown argument: $1" >&2
      usage >&2
      exit 2
      ;;
  esac
done

if [[ -z "$gpu_list" ]]; then
  die_usage "--gpus is required (e.g. --gpus 0,1,2)."
fi

if [[ ! -f "$experiments_file" ]]; then
  die_usage "experiments file not found: $experiments_file"
fi

# Split the GPU list on commas, then trim whitespace and drop empty entries so
# that inputs such as "0, 1" or "0,,1" do not yield blank or padded GPU ids.
IFS=',' read -r -a raw_gpus <<< "$gpu_list"
gpus=()
# The ${arr[@]+...} form keeps `set -u` happy on older bash if the array is empty.
for g in ${raw_gpus[@]+"${raw_gpus[@]}"}; do
  g="${g//[[:space:]]/}"
  if [[ -n "$g" ]]; then
    gpus+=("$g")
  fi
done
if [[ ${#gpus[@]} -lt 1 ]]; then
  die_usage "parsed 0 GPUs from --gpus '$gpu_list'"
fi

# Parse cmd string into an argv array. Word splitting is intended here;
# globbing is switched off so a '*' or '?' in --cmd is not expanded against
# the current directory.
set -f
# shellcheck disable=SC2206
cmd=($cmd_str)
set +f
if [[ ${#cmd[@]} -lt 2 ]]; then
  die_usage "--cmd should look like 'python train.py'"
fi

# Create the log directory only after the invocation has been validated.
mkdir -p "$log_dir"

_tmpdir=""

# Remove the temporary queue directory on exit (if it was created).
cleanup() {
  if [[ -n "${_tmpdir}" && -d "${_tmpdir}" ]]; then
    rm -rf -- "${_tmpdir}"
  fi
}
trap cleanup EXIT
_tmpdir="$(mktemp -d)"

# Prepare per-GPU queue files (index i in queue_files matches index i in gpus).
queue_files=()
for i in "${!gpus[@]}"; do
  qfile="${_tmpdir}/queue_${i}.csv"
  : > "$qfile"
  queue_files+=("$qfile")
done

# Distribute experiments round-robin.
exp_idx=0
# The `|| [[ -n "$line" ]]` part picks up a final line without a newline.
while IFS= read -r line || [[ -n "$line" ]]; do
  line="${line%$'\r'}" # handle CRLF
  [[ -z "$line" ]] && continue

  # Skip header if present
  if [[ "$line" == model_type,* ]]; then
    continue
  fi

  slot=$((exp_idx % ${#gpus[@]}))
  # Prefix a stable experiment index for logging.
  printf '%s,%s\n' "$exp_idx" "$line" >> "${queue_files[$slot]}"
  exp_idx=$((exp_idx + 1))
done < "$experiments_file"

if [[ $exp_idx -eq 0 ]]; then
  echo "No experiments found in $experiments_file" >&2
  exit 1
fi

echo "Queued $exp_idx experiments across ${#gpus[@]} GPU(s): ${gpus[*]}"

# Print $1 with every character outside [A-Za-z0-9._-] replaced by '_',
# making it safe to embed in a log file name.
sanitize() {
  local s="${1-}"
  s="${s//[^A-Za-z0-9._-]/_}"
  printf '%s' "$s"
}

# Run every experiment queued for one GPU, sequentially.
# Globals:   cmd, extra_args, log_dir, dry_run (read)
# Arguments: $1 - GPU id exported as CUDA_VISIBLE_DEVICES
#            $2 - queue file with lines "exp_id,model_type,loss_type,age_encoder,full_cov"
# Outputs:   progress lines on stdout, failures on stderr, one log file per
#            experiment under $log_dir
# Exits:     with the failing command's status on the first failed experiment
#            (remaining rows of this queue are not run). Intended to be called
#            inside a subshell so that exit only ends the worker.
run_worker() {
  local gpu="$1" qfile="$2"
  local exp_id model_type loss_type age_encoder full_cov
  local ts safe_model safe_loss safe_age safe_cov log_file rc
  local -a run_cmd

  export CUDA_VISIBLE_DEVICES="$gpu"

  while IFS=',' read -r exp_id model_type loss_type age_encoder full_cov \
    || [[ -n "${exp_id-}" ]]; do
    # Skip empty lines
    [[ -z "${exp_id-}" ]] && continue

    # Normalize booleans / strip whitespace
    full_cov="${full_cov//[[:space:]]/}"

    run_cmd=("${cmd[@]}"
      --model_type "$model_type"
      --loss_type "$loss_type"
      --age_encoder "$age_encoder")

    if [[ "$full_cov" == "True" || "$full_cov" == "true" || "$full_cov" == "1" ]]; then
      run_cmd+=(--full_cov)
    fi

    if [[ ${#extra_args[@]} -gt 0 ]]; then
      run_cmd+=("${extra_args[@]}")
    fi

    echo "[GPU $gpu] START exp $exp_id: model_type=$model_type loss_type=$loss_type age_encoder=$age_encoder full_cov=$full_cov"

    ts="$(date +%Y%m%d-%H%M%S)"
    safe_model="$(sanitize "$model_type")"
    safe_loss="$(sanitize "$loss_type")"
    safe_age="$(sanitize "$age_encoder")"
    safe_cov="$(sanitize "$full_cov")"
    log_file="${log_dir}/exp_${exp_id}_gpu${gpu}_${safe_model}_${safe_loss}_${safe_age}_${safe_cov}_${ts}.log"

    # Log header; this is written in dry-run mode as well.
    {
      echo "===== EXPERIMENT START ====="
      echo "timestamp: $ts"
      echo "gpu: $gpu"
      echo "exp_id: $exp_id"
      echo "model_type: $model_type"
      echo "loss_type: $loss_type"
      echo "age_encoder: $age_encoder"
      echo "full_cov: $full_cov"
      printf 'cmd:'
      printf ' %q' "${run_cmd[@]}"
      echo
      echo "============================"
    } > "$log_file"

    if [[ $dry_run -eq 1 ]]; then
      printf '[GPU %s] CMD: ' "$gpu"
      printf '%q ' "${run_cmd[@]}"
      echo
      echo "[GPU $gpu] LOG: $log_file"
    else
      # stdin comes from /dev/null: the loop's stdin is the queue file, and a
      # command that reads stdin would otherwise swallow the remaining rows.
      # `|| rc=$?` captures the status without tripping `set -e`.
      rc=0
      "${run_cmd[@]}" >> "$log_file" 2>&1 </dev/null || rc=$?
      {
        echo "============================"
        echo "exit_code: $rc"
        echo "===== EXPERIMENT END ======="
      } >> "$log_file"

      if [[ $rc -ne 0 ]]; then
        echo "[GPU $gpu] FAIL exp $exp_id (exit=$rc). Log: $log_file" >&2
        exit "$rc"
      fi
    fi

    echo "[GPU $gpu] DONE exp $exp_id (log: $log_file)"
  done < "$qfile"
}

# Start one background worker per GPU; the subshell confines the worker's
# `exit` and its CUDA_VISIBLE_DEVICES export.
pids=()
for i in "${!gpus[@]}"; do
  gpu="${gpus[$i]}"
  qfile="${queue_files[$i]}"

  (run_worker "$gpu" "$qfile") &
  pids+=("$!")
done

# Wait for all GPU workers; wait on each pid so every failure is observed.
fail=0
for pid in "${pids[@]}"; do
  if ! wait "$pid"; then
    fail=1
  fi
done

if [[ $fail -ne 0 ]]; then
  echo "One or more workers failed." >&2
  exit 1
fi

echo "All experiments complete."