diff --git a/run_evaluations_multi_gpu.sh b/run_evaluations_multi_gpu.sh
new file mode 100644
index 0000000..f9136e0
--- /dev/null
+++ b/run_evaluations_multi_gpu.sh
@@ -0,0 +1,317 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+usage() {
+  cat <<'USAGE'
+Usage:
+  ./run_evaluations_multi_gpu.sh --gpus 0,1,2 --tau-short 0.5 [options] [-- <extra eval args...>]
+
+Description:
+  Discovers trained run directories (containing best_model.pt + train_config.json)
+  and runs BOTH evaluations on each run:
+    1) evaluate_next_event.py (requires --tau_short)
+    2) evaluate_horizon.py
+
+  Jobs are distributed round-robin across the provided GPU list and each GPU runs
+  at most one job at a time.
+
+Options:
+  --gpus            Comma-separated GPU ids (required), e.g. 0,1,2
+  --tau-short       Short-window horizon (years) for evaluate_next_event.py (required)
+  --runs-root       Root directory containing run subfolders (default: runs)
+  --pattern         Shell glob to filter run folder basenames (default: *)
+  --run-dirs-file   Text file with one run_dir per line (overrides --runs-root)
+  --horizons        Horizon grid in years (space-separated list). If omitted, uses script defaults.
+  --age-bins        Age bin boundaries in years (space-separated list). If omitted, uses script defaults.
+  --python          Python executable/command (default: python)
+  --log-dir         Directory for logs (default: eval_logs)
+  --dry-run         Print commands without executing
+  --help|-h         Show this help
+
+Extra eval args:
+  Anything after `--` is appended to BOTH evaluation commands.
+  Examples: -- --batch_size 512 --num_workers 4 --seed 0 --min_pos 20 --no_tqdm
+
+Examples:
+  ./run_evaluations_multi_gpu.sh --gpus 0,1 --tau-short 0.5
+  ./run_evaluations_multi_gpu.sh --gpus 0,1 --tau-short 0.5 --runs-root runs --pattern "delphi_*" \
+    --horizons 0.25 0.5 1 2 5 10 --age-bins 40 45 50 55 60 65 70 75 inf -- --batch_size 512 --num_workers 4
+
+USAGE
+}
+
+runs_root="runs"
+pattern="*"
+run_dirs_file=""
+gpu_list=""
+python_cmd="python"
+log_dir="eval_logs"
+dry_run=0
+
+tau_short=""
+horizons=()
+age_bins=()
+extra_args=()
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --gpus)
+      gpu_list="${2-}"
+      shift 2
+      ;;
+    --tau-short)
+      tau_short="${2-}"
+      shift 2
+      ;;
+    --runs-root)
+      runs_root="${2-}"
+      shift 2
+      ;;
+    --pattern)
+      pattern="${2-}"
+      shift 2
+      ;;
+    --run-dirs-file)
+      run_dirs_file="${2-}"
+      shift 2
+      ;;
+    --python)
+      python_cmd="${2-}"
+      shift 2
+      ;;
+    --log-dir)
+      log_dir="${2-}"
+      shift 2
+      ;;
+    --dry-run)
+      dry_run=1
+      shift
+      ;;
+    --horizons)
+      shift
+      horizons=()
+      while [[ $# -gt 0 && "$1" != --* ]]; do
+        horizons+=("$1")
+        shift
+      done
+      ;;
+    --age-bins)
+      shift
+      age_bins=()
+      while [[ $# -gt 0 && "$1" != --* ]]; do
+        age_bins+=("$1")
+        shift
+      done
+      ;;
+    --help|-h)
+      usage
+      exit 0
+      ;;
+    --)
+      shift
+      extra_args=("$@")
+      break
+      ;;
+    *)
+      echo "Unknown argument: $1" >&2
+      usage
+      exit 2
+      ;;
+  esac
+done
+
+if [[ -z "$gpu_list" ]]; then
+  echo "Error: --gpus is required (e.g. --gpus 0,1,2)." >&2
+  exit 2
+fi
+
+if [[ -z "$tau_short" ]]; then
+  echo "Error: --tau-short is required (e.g. --tau-short 0.5)." >&2
+  exit 2
+fi
+
+mkdir -p "$log_dir"
+
+IFS=',' read -r -a gpus <<< "$gpu_list"
+if [[ ${#gpus[@]} -lt 1 ]]; then
+  echo "Error: parsed 0 GPUs from --gpus '$gpu_list'" >&2
+  exit 2
+fi
+
+sanitize() {
+  # Replace any char outside [A-Za-z0-9._-] with '_'
+  local s="${1-}"
+  s="${s//[^A-Za-z0-9._-]/_}"
+  printf '%s' "$s"
+}
+
+# Discover run directories
+run_dirs=()
+if [[ -n "$run_dirs_file" ]]; then
+  if [[ ! -f "$run_dirs_file" ]]; then
-f "$run_dirs_file" ]]; then + echo "Error: --run-dirs-file not found: $run_dirs_file" >&2 + exit 2 + fi + while IFS= read -r line || [[ -n "$line" ]]; do + line="${line%$'\r'}" # handle CRLF + [[ -z "$line" ]] && continue + run_dirs+=("$line") + done < "$run_dirs_file" +else + if [[ ! -d "$runs_root" ]]; then + echo "Error: runs root not found: $runs_root" >&2 + exit 2 + fi + shopt -s nullglob + for d in "$runs_root"/$pattern; do + [[ -d "$d" ]] || continue + [[ -f "$d/best_model.pt" ]] || continue + [[ -f "$d/train_config.json" ]] || continue + run_dirs+=("$d") + done + shopt -u nullglob +fi + +if [[ ${#run_dirs[@]} -eq 0 ]]; then + echo "Error: no run directories found." >&2 + exit 1 +fi + +echo "Queued ${#run_dirs[@]} run(s) across ${#gpus[@]} GPU(s): ${gpus[*]}" + +_tmpdir="" +cleanup() { + if [[ -n "${_tmpdir}" && -d "${_tmpdir}" ]]; then + rm -rf "${_tmpdir}" + fi +} +trap cleanup EXIT + +_tmpdir="$(mktemp -d)" + +# Prepare per-GPU queue files (TSV: job_id \t run_dir) +queue_files=() +for i in "${!gpus[@]}"; do + qfile="${_tmpdir}/queue_${i}.tsv" + : > "$qfile" + queue_files+=("$qfile") +done + +job_id=0 +for run_dir in "${run_dirs[@]}"; do + slot=$((job_id % ${#gpus[@]})) + printf '%s\t%s\n' "$job_id" "$run_dir" >> "${queue_files[$slot]}" + job_id=$((job_id + 1)) +done + +pids=() +for i in "${!gpus[@]}"; do + gpu="${gpus[$i]}" + qfile="${queue_files[$i]}" + + ( + export CUDA_VISIBLE_DEVICES="$gpu" + + while IFS=$'\t' read -r jid run_dir || [[ -n "${jid-}" ]]; do + [[ -z "${jid-}" ]] && continue + [[ -z "${run_dir-}" ]] && continue + + ts="$(date +%Y%m%d-%H%M%S)" + safe_run="$(sanitize "$(basename "$run_dir")")" + log_file="${log_dir}/eval_${jid}_gpu${gpu}_${safe_run}_${ts}.log" + + { + echo "===== EVALUATION START =====" + echo "timestamp: $ts" + echo "gpu: $gpu" + echo "job_id: $jid" + echo "run_dir: $run_dir" + echo "tau_short: $tau_short" + if [[ ${#horizons[@]} -gt 0 ]]; then + echo "horizons: ${horizons[*]}" + fi + if [[ ${#age_bins[@]} -gt 0 ]]; then + echo "age_bins: ${age_bins[*]}" + fi + if [[ ${#extra_args[@]} -gt 0 ]]; then + echo "extra_args: ${extra_args[*]}" + fi + echo "============================" + } > "$log_file" + + # Build argv arrays + next_cmd=("$python_cmd" evaluate_next_event.py --run_dir "$run_dir" --tau_short "$tau_short") + if [[ ${#age_bins[@]} -gt 0 ]]; then + next_cmd+=(--age_bins "${age_bins[@]}") + fi + if [[ ${#extra_args[@]} -gt 0 ]]; then + next_cmd+=("${extra_args[@]}") + fi + + hor_cmd=("$python_cmd" evaluate_horizon.py --run_dir "$run_dir") + if [[ ${#horizons[@]} -gt 0 ]]; then + hor_cmd+=(--horizons "${horizons[@]}") + fi + if [[ ${#age_bins[@]} -gt 0 ]]; then + hor_cmd+=(--age_bins "${age_bins[@]}") + fi + if [[ ${#extra_args[@]} -gt 0 ]]; then + hor_cmd+=("${extra_args[@]}") + fi + + echo "[GPU $gpu] START job $jid: $run_dir" + + if [[ $dry_run -eq 1 ]]; then + { + echo "[DRY-RUN] next-event cmd:"; printf ' %q' "${next_cmd[@]}"; echo + echo "[DRY-RUN] horizon cmd:"; printf ' %q' "${hor_cmd[@]}"; echo + echo "[DRY-RUN] log: $log_file" + } | tee -a "$log_file" + echo "[GPU $gpu] DONE job $jid (dry-run)" + continue + fi + + set +e + { + echo "--- RUN evaluate_next_event.py ---" + printf 'cmd:'; printf ' %q' "${next_cmd[@]}"; echo + "${next_cmd[@]}" + rc1=$? + echo "exit_code_next_event: $rc1" + + echo "--- RUN evaluate_horizon.py ---" + printf 'cmd:'; printf ' %q' "${hor_cmd[@]}"; echo + "${hor_cmd[@]}" + rc2=$? 
+ echo "exit_code_horizon: $rc2" + + echo "===== EVALUATION END =======" + } >> "$log_file" 2>&1 + + set -e + + if [[ $rc1 -ne 0 || $rc2 -ne 0 ]]; then + echo "[GPU $gpu] FAIL job $jid (next=$rc1 horizon=$rc2). Log: $log_file" >&2 + exit 1 + fi + + echo "[GPU $gpu] DONE job $jid (log: $log_file)" + done < "$qfile" + ) & + + pids+=("$!") +done + +fail=0 +for pid in "${pids[@]}"; do + if ! wait "$pid"; then + fail=1 + fi +done + +if [[ $fail -ne 0 ]]; then + echo "One or more workers failed." >&2 + exit 1 +fi + +echo "All evaluations complete."