#!/usr/bin/env bash
#
# Runs evaluate_next_event.py and evaluate_horizon.py on a set of run
# directories, distributing the jobs round-robin over a list of GPUs.
# One background worker is started per GPU; each worker processes its own
# queue sequentially with CUDA_VISIBLE_DEVICES set to that GPU.
#
# Exit status: 0 when every job succeeded, 1 when no runs were found or a
# worker failed, 2 on usage errors.
set -euo pipefail

# Print the help text to stdout.
usage() {
  cat <<'USAGE'
Usage:
  ./run_evaluations_multi_gpu.sh --gpus 0,1,2 --tau-short 0.5 [options] [-- EXTRA_EVAL_ARGS...]

Description:
  Discovers trained run directories (containing best_model.pt + train_config.json)
  and runs BOTH evaluations on each run:
    1) evaluate_next_event.py (requires --tau_short)
    2) evaluate_horizon.py

  Jobs are distributed round-robin across the provided GPU list and each GPU runs
  at most one job at a time.

Options:
  --gpus           Comma-separated GPU ids (required), e.g. 0,1,2
  --tau-short      Short-window horizon (years) for evaluate_next_event.py (required)
  --runs-root      Root directory containing run subfolders (default: runs)
  --pattern        Shell glob to filter run folder basenames (default: *)
  --run-dirs-file  Text file with one run_dir per line (overrides --runs-root)
  --horizons       Horizon grid in years (space-separated list). If omitted, uses script defaults.
  --age-bins       Age bin boundaries in years (space-separated list). If omitted, uses script defaults.
  --python         Python executable/command (default: python)
  --log-dir        Directory for logs (default: eval_logs)
  --dry-run        Print commands without executing
  --help|-h        Show this help

Extra eval args:
  Anything after `--` is appended to BOTH evaluation commands.
  Examples: -- --batch_size 512 --num_workers 4 --seed 0 --min_pos 20 --no_tqdm

Examples:
  ./run_evaluations_multi_gpu.sh --gpus 0,1 --tau-short 0.5
  ./run_evaluations_multi_gpu.sh --gpus 0,1 --tau-short 0.5 --runs-root runs --pattern "delphi_*" \
    --horizons 0.25 0.5 1 2 5 10 --age-bins 40 45 50 55 60 65 70 75 inf -- --batch_size 512 --num_workers 4
USAGE
}

# Abort with a usage error unless the option still has a value after it.
# Arguments: $1 - option name, $2 - number of remaining positional
#            parameters (including the option itself).
# Without this check `shift 2` fails under `set -e` and the script dies
# without any message when the option is the last argument.
require_value() {
  if (( $2 < 2 )); then
    echo "Error: $1 requires a value." >&2
    exit 2
  fi
}

# Replace any char outside [A-Za-z0-9._-] with '_' and print the result.
sanitize() {
  local s="${1-}"
  s="${s//[^A-Za-z0-9._-]/_}"
  printf '%s' "$s"
}

# Remove the temporary queue directory, if it was created.
cleanup() {
  if [[ -n "${_tmpdir}" && -d "${_tmpdir}" ]]; then
    rm -rf -- "${_tmpdir}"
  fi
}

# Process one GPU's queue sequentially. Meant to be started in the background.
# Globals (read): log_dir, python_cmd, tau_short, horizons, age_bins,
#                 extra_args, dry_run
# Arguments: $1 - GPU id (exported as CUDA_VISIBLE_DEVICES)
#            $2 - queue file (TSV: job_id \t run_dir)
# Exits 1 on the first job where either evaluation returns non-zero; the
# remaining entries of this queue are then not run.
run_worker() {
  local gpu="$1" qfile="$2"
  local jid run_dir ts safe_run safe_gpu log_file rc1 rc2
  local -a next_cmd hor_cmd

  export CUDA_VISIBLE_DEVICES="$gpu"
  # The GPU id ends up in the log file name, so keep it path-safe.
  safe_gpu="$(sanitize "$gpu")"

  # The queue is read on fd 3 (not stdin) so that nothing started inside the
  # loop can consume pending queue entries by reading its standard input.
  while IFS=$'\t' read -r -u 3 jid run_dir || [[ -n "${jid-}" ]]; do
    [[ -n "${jid-}" && -n "${run_dir-}" ]] || continue

    ts="$(date +%Y%m%d-%H%M%S)"
    safe_run="$(sanitize "$(basename "$run_dir")")"
    log_file="${log_dir}/eval_${jid}_gpu${safe_gpu}_${safe_run}_${ts}.log"

    # Log header (truncates/creates the log file).
    {
      echo "===== EVALUATION START ====="
      echo "timestamp: $ts"
      echo "gpu: $gpu"
      echo "job_id: $jid"
      echo "run_dir: $run_dir"
      echo "tau_short: $tau_short"
      if [[ ${#horizons[@]} -gt 0 ]]; then
        echo "horizons: ${horizons[*]}"
      fi
      if [[ ${#age_bins[@]} -gt 0 ]]; then
        echo "age_bins: ${age_bins[*]}"
      fi
      if [[ ${#extra_args[@]} -gt 0 ]]; then
        echo "extra_args: ${extra_args[*]}"
      fi
      echo "============================"
    } > "$log_file"

    # Build argv arrays. Array lengths are checked before expansion so that
    # empty arrays are never expanded under `set -u`.
    next_cmd=("$python_cmd" evaluate_next_event.py --run_dir "$run_dir" --tau_short "$tau_short")
    if [[ ${#age_bins[@]} -gt 0 ]]; then
      next_cmd+=(--age_bins "${age_bins[@]}")
    fi
    if [[ ${#extra_args[@]} -gt 0 ]]; then
      next_cmd+=("${extra_args[@]}")
    fi

    hor_cmd=("$python_cmd" evaluate_horizon.py --run_dir "$run_dir")
    if [[ ${#horizons[@]} -gt 0 ]]; then
      hor_cmd+=(--horizons "${horizons[@]}")
    fi
    if [[ ${#age_bins[@]} -gt 0 ]]; then
      hor_cmd+=(--age_bins "${age_bins[@]}")
    fi
    if [[ ${#extra_args[@]} -gt 0 ]]; then
      hor_cmd+=("${extra_args[@]}")
    fi

    echo "[GPU $gpu] START job $jid: $run_dir"

    if [[ $dry_run -eq 1 ]]; then
      {
        echo "[DRY-RUN] next-event cmd:"; printf ' %q' "${next_cmd[@]}"; echo
        echo "[DRY-RUN] horizon cmd:"; printf ' %q' "${hor_cmd[@]}"; echo
        echo "[DRY-RUN] log: $log_file"
      } | tee -a "$log_file"
      echo "[GPU $gpu] DONE job $jid (dry-run)"
      continue
    fi

    # Both evaluations always run; their exit codes are captured with
    # `|| rc=$?` so a failure does not trip `set -e` before it is logged.
    rc1=0
    rc2=0
    {
      echo "--- RUN evaluate_next_event.py ---"
      printf 'cmd:'; printf ' %q' "${next_cmd[@]}"; echo
      "${next_cmd[@]}" < /dev/null || rc1=$?
      echo "exit_code_next_event: $rc1"

      echo "--- RUN evaluate_horizon.py ---"
      printf 'cmd:'; printf ' %q' "${hor_cmd[@]}"; echo
      "${hor_cmd[@]}" < /dev/null || rc2=$?
      echo "exit_code_horizon: $rc2"
      echo "===== EVALUATION END ======="
    } >> "$log_file" 2>&1

    if [[ $rc1 -ne 0 || $rc2 -ne 0 ]]; then
      echo "[GPU $gpu] FAIL job $jid (next=$rc1 horizon=$rc2). Log: $log_file" >&2
      exit 1
    fi

    echo "[GPU $gpu] DONE job $jid (log: $log_file)"
  done 3< "$qfile"
}

# ---------------------------------------------------------------------------
# Defaults and option parsing
# ---------------------------------------------------------------------------
runs_root="runs"
pattern="*"
run_dirs_file=""
gpu_list=""
python_cmd="python"
log_dir="eval_logs"
dry_run=0
tau_short=""
horizons=()
age_bins=()
extra_args=()

while [[ $# -gt 0 ]]; do
  case "$1" in
    --gpus)
      require_value "$1" "$#"
      gpu_list="$2"
      shift 2
      ;;
    --tau-short)
      require_value "$1" "$#"
      tau_short="$2"
      shift 2
      ;;
    --runs-root)
      require_value "$1" "$#"
      runs_root="$2"
      shift 2
      ;;
    --pattern)
      require_value "$1" "$#"
      pattern="$2"
      shift 2
      ;;
    --run-dirs-file)
      require_value "$1" "$#"
      run_dirs_file="$2"
      shift 2
      ;;
    --python)
      require_value "$1" "$#"
      python_cmd="$2"
      shift 2
      ;;
    --log-dir)
      require_value "$1" "$#"
      log_dir="$2"
      shift 2
      ;;
    --dry-run)
      dry_run=1
      shift
      ;;
    --horizons)
      # Collect values up to the next argument that starts with "--".
      shift
      horizons=()
      while [[ $# -gt 0 && "$1" != --* ]]; do
        horizons+=("$1")
        shift
      done
      ;;
    --age-bins)
      # Collect values up to the next argument that starts with "--".
      shift
      age_bins=()
      while [[ $# -gt 0 && "$1" != --* ]]; do
        age_bins+=("$1")
        shift
      done
      ;;
    --help|-h)
      usage
      exit 0
      ;;
    --)
      # Everything after "--" is passed through to both evaluation commands.
      shift
      extra_args=("$@")
      break
      ;;
    *)
      echo "Unknown argument: $1" >&2
      usage >&2
      exit 2
      ;;
  esac
done

if [[ -z "$gpu_list" ]]; then
  echo "Error: --gpus is required (e.g. --gpus 0,1,2)." >&2
  exit 2
fi
if [[ -z "$tau_short" ]]; then
  echo "Error: --tau-short is required (e.g. --tau-short 0.5)." >&2
  exit 2
fi

mkdir -p -- "$log_dir"

# Split the GPU list on commas, strip whitespace from each entry and drop
# empty entries, so "0, 1" and "0,,1" do not produce blank or padded ids.
IFS=',' read -r -a raw_gpus <<< "$gpu_list"
gpus=()
for g in ${raw_gpus[@]+"${raw_gpus[@]}"}; do
  g="${g//[[:space:]]/}"
  [[ -n "$g" ]] || continue
  gpus+=("$g")
done
if [[ ${#gpus[@]} -lt 1 ]]; then
  echo "Error: parsed 0 GPUs from --gpus '$gpu_list'" >&2
  exit 2
fi

# ---------------------------------------------------------------------------
# Discover run directories
# ---------------------------------------------------------------------------
run_dirs=()
if [[ -n "$run_dirs_file" ]]; then
  if [[ ! -f "$run_dirs_file" ]]; then
    echo "Error: --run-dirs-file not found: $run_dirs_file" >&2
    exit 2
  fi
  # `|| [[ -n "$line" ]]` keeps a final line that has no trailing newline.
  while IFS= read -r line || [[ -n "$line" ]]; do
    line="${line%$'\r'}" # handle CRLF
    [[ -z "$line" ]] && continue
    run_dirs+=("$line")
  done < "$run_dirs_file"
else
  if [[ ! -d "$runs_root" ]]; then
    echo "Error: runs root not found: $runs_root" >&2
    exit 2
  fi
  shopt -s nullglob
  # $pattern is intentionally unquoted so that it is expanded as a glob.
  for d in "$runs_root"/$pattern; do
    [[ -d "$d" ]] || continue
    [[ -f "$d/best_model.pt" ]] || continue
    [[ -f "$d/train_config.json" ]] || continue
    run_dirs+=("$d")
  done
  shopt -u nullglob
fi

if [[ ${#run_dirs[@]} -eq 0 ]]; then
  echo "Error: no run directories found." >&2
  exit 1
fi

echo "Queued ${#run_dirs[@]} run(s) across ${#gpus[@]} GPU(s): ${gpus[*]}"

# ---------------------------------------------------------------------------
# Build per-GPU queues and launch one worker per GPU
# ---------------------------------------------------------------------------
_tmpdir=""
trap cleanup EXIT
_tmpdir="$(mktemp -d)"

# Prepare per-GPU queue files (TSV: job_id \t run_dir)
queue_files=()
for i in "${!gpus[@]}"; do
  qfile="${_tmpdir}/queue_${i}.tsv"
  : > "$qfile"
  queue_files+=("$qfile")
done

# Round-robin assignment: job N goes to queue N mod (number of GPUs).
job_id=0
for run_dir in "${run_dirs[@]}"; do
  slot=$((job_id % ${#gpus[@]}))
  printf '%s\t%s\n' "$job_id" "$run_dir" >> "${queue_files[$slot]}"
  job_id=$((job_id + 1))
done

pids=()
for i in "${!gpus[@]}"; do
  run_worker "${gpus[$i]}" "${queue_files[$i]}" &
  pids+=("$!")
done

# Wait for every worker; a background failure does not trip `set -e`, so each
# exit status is checked explicitly.
fail=0
for pid in "${pids[@]}"; do
  if ! wait "$pid"; then
    fail=1
  fi
done

if [[ $fail -ne 0 ]]; then
  echo "One or more workers failed." >&2
  exit 1
fi

echo "All evaluations complete."