Add script for multi-GPU evaluations with flexible options and logging

2026-01-17 14:56:45 +08:00
parent a90f22a865
commit 197842b1a6
1 changed files with 317 additions and 0 deletions
--- a/run_evaluations_multi_gpu.sh
+++ b/run_evaluations_multi_gpu.sh
@@ -0,0 +1,317 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# usage: print the help text (synopsis, description, options, examples) to
+# stdout. Takes no arguments. The here-doc delimiter is quoted ('USAGE'), so
+# the text below is emitted literally, with no variable or command expansion.
+usage() {
+  cat <<'USAGE'
+Usage:
+  ./run_evaluations_multi_gpu.sh --gpus 0,1,2 --tau-short 0.5 [options] [-- <extra eval args>]
+
+Description:
+  Discovers trained run directories (containing best_model.pt + train_config.json)
+  and runs BOTH evaluations on each run:
+    1) evaluate_next_event.py (requires --tau_short)
+    2) evaluate_horizon.py
+
+  Jobs are distributed round-robin across the provided GPU list and each GPU runs
+  at most one job at a time.
+
+Options:
+  --gpus              Comma-separated GPU ids (required), e.g. 0,1,2
+  --tau-short         Short-window horizon (years) for evaluate_next_event.py (required)
+  --runs-root         Root directory containing run subfolders (default: runs)
+  --pattern           Shell glob to filter run folder basenames (default: *)
+  --run-dirs-file     Text file with one run_dir per line (overrides --runs-root)
+  --horizons          Horizon grid in years (space-separated list). If omitted, uses script defaults.
+  --age-bins          Age bin boundaries in years (space-separated list). If omitted, uses script defaults.
+  --python            Python executable/command (default: python)
+  --log-dir           Directory for logs (default: eval_logs)
+  --dry-run           Print commands without executing
+  --help|-h           Show this help
+
+Extra eval args:
+  Anything after `--` is appended to BOTH evaluation commands.
+  Examples: -- --batch_size 512 --num_workers 4 --seed 0 --min_pos 20 --no_tqdm
+
+Examples:
+  ./run_evaluations_multi_gpu.sh --gpus 0,1 --tau-short 0.5
+  ./run_evaluations_multi_gpu.sh --gpus 0,1 --tau-short 0.5 --runs-root runs --pattern "delphi_*" \
+    --horizons 0.25 0.5 1 2 5 10 --age-bins 40 45 50 55 60 65 70 75 inf -- --batch_size 512 --num_workers 4
+
+USAGE
+}
+
+# Defaults for every command-line option; overwritten by the parser below.
+runs_root="runs"
+pattern="*"
+run_dirs_file=""
+gpu_list=""
+python_cmd="python"
+log_dir="eval_logs"
+dry_run=0
+
+tau_short=""
+horizons=()
+age_bins=()
+extra_args=()
+
+# require_value OPTION REMAINING
+#   Abort with exit status 2 unless at least two positional arguments remain
+#   (the option itself plus its value). REMAINING is the caller's "$#".
+#   Without this guard a trailing option such as `--gpus` makes `shift 2`
+#   fail, and `set -e` then terminates the script without any message.
+require_value() {
+  if [[ "$2" -lt 2 ]]; then
+    echo "Error: $1 requires a value." >&2
+    exit 2
+  fi
+}
+
+# Parse the command line. Single-value options consume two arguments;
+# --horizons / --age-bins consume every following argument up to the next one
+# that starts with "--"; a bare "--" ends parsing and hands the remainder to
+# both evaluation commands untouched.
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --gpus)
+      require_value "$1" "$#"
+      gpu_list="$2"
+      shift 2
+      ;;
+    --tau-short)
+      require_value "$1" "$#"
+      tau_short="$2"
+      shift 2
+      ;;
+    --runs-root)
+      require_value "$1" "$#"
+      runs_root="$2"
+      shift 2
+      ;;
+    --pattern)
+      require_value "$1" "$#"
+      pattern="$2"
+      shift 2
+      ;;
+    --run-dirs-file)
+      require_value "$1" "$#"
+      run_dirs_file="$2"
+      shift 2
+      ;;
+    --python)
+      require_value "$1" "$#"
+      python_cmd="$2"
+      shift 2
+      ;;
+    --log-dir)
+      require_value "$1" "$#"
+      log_dir="$2"
+      shift 2
+      ;;
+    --dry-run)
+      dry_run=1
+      shift
+      ;;
+    --horizons)
+      shift
+      # A repeated --horizons replaces the earlier list rather than extending it.
+      horizons=()
+      while [[ $# -gt 0 && "$1" != --* ]]; do
+        horizons+=("$1")
+        shift
+      done
+      ;;
+    --age-bins)
+      shift
+      # A repeated --age-bins replaces the earlier list rather than extending it.
+      age_bins=()
+      while [[ $# -gt 0 && "$1" != --* ]]; do
+        age_bins+=("$1")
+        shift
+      done
+      ;;
+    --help|-h)
+      usage
+      exit 0
+      ;;
+    --)
+      shift
+      extra_args=("$@")
+      break
+      ;;
+    *)
+      echo "Unknown argument: $1" >&2
+      # This is an error path, so the help text goes to stderr with the message.
+      usage >&2
+      exit 2
+      ;;
+  esac
+done
+
+# Both --gpus and --tau-short are mandatory; fail early with exit status 2.
+if [[ -z "$gpu_list" ]]; then
+  echo "Error: --gpus is required (e.g. --gpus 0,1,2)." >&2
+  exit 2
+fi
+
+if [[ -z "$tau_short" ]]; then
+  echo "Error: --tau-short is required (e.g. --tau-short 0.5)." >&2
+  exit 2
+fi
+
+mkdir -p "$log_dir"
+
+# Split the comma-separated list into the gpus array. Each entry is trimmed of
+# surrounding whitespace and must be non-empty: an input such as "0,,1" or
+# "0, 1" would otherwise produce a worker whose CUDA_VISIBLE_DEVICES is empty
+# or padded with a space. Entries are not required to be numeric.
+IFS=',' read -r -a raw_gpus <<< "$gpu_list"
+gpus=()
+# The ${arr[@]+...} form keeps `set -u` from tripping on an empty array in
+# older bash releases.
+for gpu_id in ${raw_gpus[@]+"${raw_gpus[@]}"}; do
+  gpu_id="${gpu_id#"${gpu_id%%[![:space:]]*}"}" # strip leading whitespace
+  gpu_id="${gpu_id%"${gpu_id##*[![:space:]]}"}" # strip trailing whitespace
+  if [[ -z "$gpu_id" ]]; then
+    echo "Error: empty GPU id in --gpus '$gpu_list'" >&2
+    exit 2
+  fi
+  gpus+=("$gpu_id")
+done
+if [[ ${#gpus[@]} -lt 1 ]]; then
+  echo "Error: parsed 0 GPUs from --gpus '$gpu_list'" >&2
+  exit 2
+fi
+
+# sanitize STRING
+#   Write STRING to stdout, without a trailing newline, after replacing every
+#   character outside the set [A-Za-z0-9._-] with '_'. A missing argument is
+#   treated as the empty string.
+sanitize() {
+  local raw="${1-}"
+  printf '%s' "${raw//[^A-Za-z0-9._-]/_}"
+}
+
+# Discover run directories.
+#   * With --run-dirs-file: every non-empty line of the file is taken as a run
+#     directory as-is (a trailing CR is stripped; a last line without a
+#     newline is still read). Entries are not checked for existence here.
+#   * Otherwise: every directory matching "$runs_root"/$pattern that contains
+#     both best_model.pt and train_config.json.
+# Exits 2 when the file or root directory is missing, 1 when nothing is found.
+run_dirs=()
+if [[ -n "$run_dirs_file" ]]; then
+  if [[ ! -f "$run_dirs_file" ]]; then
+    echo "Error: --run-dirs-file not found: $run_dirs_file" >&2
+    exit 2
+  fi
+  while IFS= read -r line || [[ -n "$line" ]]; do
+    line="${line%$'\r'}" # handle CRLF
+    [[ -z "$line" ]] && continue
+    run_dirs+=("$line")
+  done < "$run_dirs_file"
+else
+  if [[ ! -d "$runs_root" ]]; then
+    echo "Error: runs root not found: $runs_root" >&2
+    exit 2
+  fi
+  shopt -s nullglob
+  # $pattern has to stay unquoted so that it is glob-expanded, but it must not
+  # be word-split: with IFS emptied, a pattern containing spaces is treated as
+  # one glob instead of several. The loop body does not depend on IFS.
+  saved_ifs="$IFS"
+  IFS=''
+  for d in "$runs_root"/$pattern; do
+    [[ -d "$d" ]] || continue
+    [[ -f "$d/best_model.pt" ]] || continue
+    [[ -f "$d/train_config.json" ]] || continue
+    run_dirs+=("$d")
+  done
+  IFS="$saved_ifs"
+  shopt -u nullglob
+fi
+
+if [[ ${#run_dirs[@]} -eq 0 ]]; then
+  echo "Error: no run directories found." >&2
+  exit 1
+fi
+
+echo "Queued ${#run_dirs[@]} run(s) across ${#gpus[@]} GPU(s): ${gpus[*]}"
+
+# Scratch directory holding the per-GPU queue files. The EXIT trap is
+# installed before the directory is created, so cleanup() has to tolerate an
+# empty or not-yet-existing path.
+_tmpdir=""
+cleanup() {
+  [[ -n "${_tmpdir}" && -d "${_tmpdir}" ]] || return 0
+  rm -rf "${_tmpdir}"
+}
+trap cleanup EXIT
+
+_tmpdir="$(mktemp -d)"
+
+# Create one empty queue file per GPU slot (TSV: job_id \t run_dir).
+queue_files=()
+for ((i = 0; i < ${#gpus[@]}; i++)); do
+  qfile="${_tmpdir}/queue_${i}.tsv"
+  : > "$qfile"
+  queue_files[i]="$qfile"
+done
+
+# Deal the run directories out round-robin: job N goes to slot N mod #GPUs.
+job_id=0
+for run_dir in "${run_dirs[@]}"; do
+  target="${queue_files[$((job_id % ${#gpus[@]}))]}"
+  printf '%s\t%s\n' "$job_id" "$run_dir" >> "$target"
+  job_id=$((job_id + 1))
+done
+
+# Launch one background worker subshell per GPU. Each worker exports its GPU
+# id as CUDA_VISIBLE_DEVICES and processes its own queue file sequentially,
+# so a GPU never runs more than one job at a time. For every job it writes a
+# log file under $log_dir and runs evaluate_next_event.py followed by
+# evaluate_horizon.py (both are run even if the first one fails).
+# A worker is fail-fast: after the first job with a non-zero exit code it
+# exits 1 and the remaining jobs in that GPU's queue are not run.
+pids=()
+for i in "${!gpus[@]}"; do
+  gpu="${gpus[$i]}"
+  qfile="${queue_files[$i]}"
+
+  (
+    export CUDA_VISIBLE_DEVICES="$gpu"
+
+    while IFS=$'\t' read -r jid run_dir || [[ -n "${jid-}" ]]; do
+      [[ -z "${jid-}" ]] && continue
+      [[ -z "${run_dir-}" ]] && continue
+
+      ts="$(date +%Y%m%d-%H%M%S)"
+      safe_run="$(sanitize "$(basename "$run_dir")")"
+      log_file="${log_dir}/eval_${jid}_gpu${gpu}_${safe_run}_${ts}.log"
+
+      # Log header; this truncates/creates the log file for the job.
+      {
+        echo "===== EVALUATION START ====="
+        echo "timestamp: $ts"
+        echo "gpu: $gpu"
+        echo "job_id: $jid"
+        echo "run_dir: $run_dir"
+        echo "tau_short: $tau_short"
+        if [[ ${#horizons[@]} -gt 0 ]]; then
+          echo "horizons: ${horizons[*]}"
+        fi
+        if [[ ${#age_bins[@]} -gt 0 ]]; then
+          echo "age_bins: ${age_bins[*]}"
+        fi
+        if [[ ${#extra_args[@]} -gt 0 ]]; then
+          echo "extra_args: ${extra_args[*]}"
+        fi
+        echo "============================"
+      } > "$log_file"
+
+      # Build argv arrays. Optional lists are only appended when non-empty so
+      # that the evaluation scripts fall back to their own defaults.
+      next_cmd=("$python_cmd" evaluate_next_event.py --run_dir "$run_dir" --tau_short "$tau_short")
+      if [[ ${#age_bins[@]} -gt 0 ]]; then
+        next_cmd+=(--age_bins "${age_bins[@]}")
+      fi
+      if [[ ${#extra_args[@]} -gt 0 ]]; then
+        next_cmd+=("${extra_args[@]}")
+      fi
+
+      hor_cmd=("$python_cmd" evaluate_horizon.py --run_dir "$run_dir")
+      if [[ ${#horizons[@]} -gt 0 ]]; then
+        hor_cmd+=(--horizons "${horizons[@]}")
+      fi
+      if [[ ${#age_bins[@]} -gt 0 ]]; then
+        hor_cmd+=(--age_bins "${age_bins[@]}")
+      fi
+      if [[ ${#extra_args[@]} -gt 0 ]]; then
+        hor_cmd+=("${extra_args[@]}")
+      fi
+
+      echo "[GPU $gpu] START job $jid: $run_dir"
+
+      if [[ $dry_run -eq 1 ]]; then
+        {
+          echo "[DRY-RUN] next-event cmd:"; printf '  %q' "${next_cmd[@]}"; echo
+          echo "[DRY-RUN] horizon    cmd:"; printf '  %q' "${hor_cmd[@]}"; echo
+          echo "[DRY-RUN] log: $log_file"
+        } | tee -a "$log_file"
+        echo "[GPU $gpu] DONE  job $jid (dry-run)"
+        continue
+      fi
+
+      # errexit is suspended so that both exit codes can be captured and
+      # reported instead of aborting the worker on the first failure.
+      set +e
+      {
+        echo "--- RUN evaluate_next_event.py ---"
+        printf 'cmd:'; printf ' %q' "${next_cmd[@]}"; echo
+        "${next_cmd[@]}"
+        rc1=$?
+        echo "exit_code_next_event: $rc1"
+
+        echo "--- RUN evaluate_horizon.py ---"
+        printf 'cmd:'; printf ' %q' "${hor_cmd[@]}"; echo
+        "${hor_cmd[@]}"
+        rc2=$?
+        echo "exit_code_horizon: $rc2"
+
+        echo "===== EVALUATION END ======="
+        # stdin comes from /dev/null: this group runs inside the
+        # `while read ... done < "$qfile"` loop, and without the redirect the
+        # evaluation commands would inherit the queue file as stdin and could
+        # consume the jobs that are still waiting in it.
+      } >> "$log_file" 2>&1 < /dev/null
+
+      set -e
+
+      if [[ $rc1 -ne 0 || $rc2 -ne 0 ]]; then
+        echo "[GPU $gpu] FAIL job $jid (next=$rc1 horizon=$rc2). Log: $log_file" >&2
+        exit 1
+      fi
+
+      echo "[GPU $gpu] DONE  job $jid (log: $log_file)"
+    done < "$qfile"
+  ) &
+
+  # Remember the worker's PID so its exit status can be collected later.
+  pids+=("$!")
+done
+
+# Collect every worker's exit status; any non-zero status marks the whole
+# run as failed, but all workers are still waited for.
+fail=0
+for pid in "${pids[@]}"; do
+  wait "$pid" || fail=1
+done
+
+if (( fail != 0 )); then
+  echo "One or more workers failed." >&2
+  exit 1
+fi
+
+echo "All evaluations complete."