#!/usr/bin/env bash
set -euo pipefail

usage() {
  cat <<'USAGE'
Usage:
  ./run_experiments_multi_gpu.sh --gpus 0,1,2
      [--runs-file runs_to_eval.txt | --runs-root runs]
      [--cmd "python evaluate.py"] [--log-dir evaluation_logs]
      [--out-root eval_outputs] [--skip-existing] [--dry-run] [-- ]

Description:
  Distributes evaluation jobs across multiple GPUs (round-robin) and runs at
  most one job per GPU at a time.

  A job is a run directory containing:
    - train_config.json
    - best_model.pt

  By default, run directories are auto-discovered under --runs-root
  (default: runs). Alternatively, provide --runs-file with one run_dir
  per line.

Examples:
  # Auto-discover run dirs under ./runs
  ./run_experiments_multi_gpu.sh --gpus 0,1,2

  # Use an explicit list of run directories
  ./run_experiments_multi_gpu.sh --gpus 0,1 --runs-file runs_to_eval.txt

  # Centralize outputs (CSV bundle + summary JSON) under eval_outputs/
  ./run_experiments_multi_gpu.sh --gpus 0,1 --out-root eval_outputs

  # Forward args to evaluate.py
  ./run_experiments_multi_gpu.sh --gpus 0,1 -- --batch_size 512 --num_workers 8
USAGE
}

runs_file=""
runs_root="runs"
gpu_list=""
cmd_str="python evaluate.py"
log_dir="evaluation_logs"
out_root=""
skip_existing=0
dry_run=0
extra_args=()

# Abort with a clear message when an option that requires a value is the last
# argument on the command line. (A bare `shift 2` would otherwise kill the
# script silently under `set -e` with no diagnostic.)
#   $1 - option name (for the error message)
#   $2 - remaining argc at the call site ($#)
need_arg() {
  if [[ $2 -lt 2 ]]; then
    echo "Error: $1 requires a value" >&2
    exit 2
  fi
}

while [[ $# -gt 0 ]]; do
  case "$1" in
    --gpus)
      need_arg "$1" $#
      gpu_list="$2"
      shift 2
      ;;
    --runs-file|-f)
      need_arg "$1" $#
      runs_file="$2"
      shift 2
      ;;
    --runs-root)
      need_arg "$1" $#
      runs_root="$2"
      shift 2
      ;;
    --cmd)
      need_arg "$1" $#
      cmd_str="$2"
      shift 2
      ;;
    --log-dir)
      need_arg "$1" $#
      log_dir="$2"
      shift 2
      ;;
    --out-root)
      need_arg "$1" $#
      out_root="$2"
      shift 2
      ;;
    --skip-existing)
      skip_existing=1
      shift
      ;;
    --dry-run)
      dry_run=1
      shift
      ;;
    --help|-h)
      usage
      exit 0
      ;;
    --)
      # Everything after `--` is forwarded verbatim to the evaluation command.
      shift
      extra_args=("$@")
      break
      ;;
    *)
      echo "Unknown argument: $1" >&2
      usage
      exit 2
      ;;
  esac
done

if [[ -z "$gpu_list" ]]; then
  echo "Error: --gpus is required (e.g. --gpus 0,1,2)." >&2
  exit 2
fi

mkdir -p "$log_dir"

# Split the GPU list on commas, dropping empty tokens (e.g. "0,,1" or a
# trailing comma) so no worker ends up with CUDA_VISIBLE_DEVICES="".
IFS=',' read -r -a raw_gpus <<< "$gpu_list"
gpus=()
for g in "${raw_gpus[@]}"; do
  if [[ -n "$g" ]]; then
    gpus+=("$g")
  fi
done
if [[ ${#gpus[@]} -lt 1 ]]; then
  echo "Error: parsed 0 GPUs from --gpus '$gpu_list'" >&2
  exit 2
fi

# Parse cmd string into an argv array.
# Split the command string into an argv array. Word-splitting is intentional;
# quoted arguments inside --cmd are not supported.
# shellcheck disable=SC2206
cmd=($cmd_str)
if [[ ${#cmd[@]} -lt 2 ]]; then
  echo "Error: --cmd should look like 'python evaluate.py'" >&2
  exit 2
fi

_tmpdir=""
# Remove the scratch directory (queue files) on any exit path.
cleanup() {
  if [[ -n "${_tmpdir}" && -d "${_tmpdir}" ]]; then
    rm -rf "${_tmpdir}"
  fi
}
trap cleanup EXIT
_tmpdir="$(mktemp -d)"

# Prepare per-GPU queue files (one CSV of "job_id,run_dir" per GPU slot).
queue_files=()
for i in "${!gpus[@]}"; do
  qfile="${_tmpdir}/queue_${i}.csv"
  : > "$qfile"
  queue_files+=("$qfile")
done

# Print immediate subdirectories of $1, one per line, sorted.
discover_runs() {
  local root="${1-}"
  if [[ -z "$root" ]]; then
    return 0
  fi
  if [[ ! -d "$root" ]]; then
    echo "Error: runs root not found: $root" >&2
    return 2
  fi
  find "$root" -mindepth 1 -maxdepth 1 -type d -print 2>/dev/null | sort
}

run_dirs=()
if [[ -n "$runs_file" ]]; then
  if [[ ! -f "$runs_file" ]]; then
    echo "Error: runs file not found: $runs_file" >&2
    exit 2
  fi
  # One run_dir per line; blank lines and '#' comments are skipped.
  while IFS= read -r line || [[ -n "$line" ]]; do
    line="${line%$'\r'}"  # handle CRLF
    [[ -z "$line" ]] && continue
    [[ "$line" == \#* ]] && continue
    run_dirs+=("$line")
  done < "$runs_file"
else
  # Validate the root here in the main shell: a `return 2` from discover_runs
  # inside the process substitution below cannot propagate, so without this
  # check a missing root would exit 1 via the generic "no runs found" path
  # instead of the intended 2.
  if [[ ! -d "$runs_root" ]]; then
    echo "Error: runs root not found: $runs_root" >&2
    exit 2
  fi
  while IFS= read -r d || [[ -n "${d-}" ]]; do
    [[ -z "${d-}" ]] && continue
    run_dirs+=("$d")
  done < <(discover_runs "$runs_root")
fi

if [[ ${#run_dirs[@]} -eq 0 ]]; then
  if [[ -n "$runs_file" ]]; then
    echo "No run directories found in $runs_file" >&2
  else
    echo "No run directories found under $runs_root" >&2
  fi
  exit 1
fi

# A valid run dir must contain both the training config and the checkpoint.
is_valid_run_dir() {
  local d="${1-}"
  [[ -d "$d" ]] || return 1
  [[ -f "$d/train_config.json" ]] || return 1
  [[ -f "$d/best_model.pt" ]] || return 1
  return 0
}

valid_run_dirs=()
for d in "${run_dirs[@]}"; do
  if is_valid_run_dir "$d"; then
    valid_run_dirs+=("$d")
  else
    echo "Skipping invalid run_dir (missing train_config.json or best_model.pt): $d" >&2
  fi
done

if [[ ${#valid_run_dirs[@]} -eq 0 ]]; then
  echo "No valid run directories found." >&2
  exit 1
fi

# Distribute evaluation jobs round-robin.
total_jobs=0
for run_dir in "${valid_run_dirs[@]}"; do
  bucket=$(( total_jobs % ${#gpus[@]} ))
  printf '%s,%s\n' "$total_jobs" "$run_dir" >> "${queue_files[$bucket]}"
  total_jobs=$(( total_jobs + 1 ))
done

echo "Queued $total_jobs evaluation job(s) across ${#gpus[@]} GPU(s): ${gpus[*]}"

# Replace any char outside [A-Za-z0-9._-] with '_'.
sanitize() {
  local text="${1-}"
  printf '%s' "${text//[^A-Za-z0-9._-]/_}"
}

# Sequentially consume one queue file on a single GPU.
#   $1 - GPU id (becomes CUDA_VISIBLE_DEVICES)
#   $2 - path to this worker's queue CSV ("job_id,run_dir" per line)
# Runs as a background subshell; exits with the job's code on first failure.
worker() {
  local gpu="$1"
  local qfile="$2"
  export CUDA_VISIBLE_DEVICES="$gpu"

  local job_id run_dir
  while IFS=',' read -r job_id run_dir || [[ -n "${job_id-}" ]]; do
    [[ -z "${job_id-}" ]] && continue
    [[ -z "${run_dir-}" ]] && continue

    local ts safe_run job_out_dir out_json log_file rc
    ts="$(date +%Y%m%d-%H%M%S)"
    safe_run="$(sanitize "$(basename "$run_dir")")"

    # Decide output locations: centralized under out_root if given,
    # otherwise next to the run directory itself.
    local out_dir_arg=()
    out_json=""
    if [[ -n "$out_root" ]]; then
      job_out_dir="${out_root%/}/run_${job_id}_${safe_run}"
      mkdir -p "$job_out_dir"
      out_json="$job_out_dir/evaluation_summary.json"
      out_dir_arg=(--out_dir "$job_out_dir" --output "$out_json")
    else
      out_json="$run_dir/evaluation_summary.json"
    fi

    if [[ $skip_existing -eq 1 && -f "$out_json" ]]; then
      echo "[GPU $gpu] SKIP job $job_id: already exists ($out_json)"
      continue
    fi

    local full_cmd=("${cmd[@]}" --run_dir "$run_dir" --device cuda)
    if [[ ${#out_dir_arg[@]} -gt 0 ]]; then
      full_cmd+=("${out_dir_arg[@]}")
    fi
    if [[ ${#extra_args[@]} -gt 0 ]]; then
      full_cmd+=("${extra_args[@]}")
    fi

    echo "[GPU $gpu] START job $job_id: run_dir=$run_dir"
    log_file="${log_dir}/eval_${job_id}_gpu${gpu}_${safe_run}_${ts}.log"

    # Write a header describing the job before any command output.
    {
      echo "===== EVALUATION START ====="
      echo "timestamp: $ts"
      echo "gpu: $gpu"
      echo "job_id: $job_id"
      echo "run_dir: $run_dir"
      echo "out_root: ${out_root:-}"
      echo "out_json: $out_json"
      printf 'cmd:'
      printf ' %q' "${full_cmd[@]}"
      echo
      echo "============================"
    } > "$log_file"

    if [[ $dry_run -eq 1 ]]; then
      printf '[GPU %s] CMD: ' "$gpu"
      printf '%q ' "${full_cmd[@]}"
      echo
      echo "[GPU $gpu] LOG: $log_file"
    else
      # Capture the job's exit code ourselves instead of letting set -e
      # abort before the log footer is written.
      set +e
      "${full_cmd[@]}" >> "$log_file" 2>&1
      rc=$?
      set -e
      {
        echo "============================"
        echo "exit_code: $rc"
        echo "===== EVALUATION END ======="
      } >> "$log_file"
      if [[ $rc -ne 0 ]]; then
        echo "[GPU $gpu] FAIL job $job_id (exit=$rc). Log: $log_file" >&2
        exit "$rc"
      fi
    fi
    echo "[GPU $gpu] DONE job $job_id (log: $log_file)"
  done < "$qfile"
}

pids=()
for slot in "${!gpus[@]}"; do
  worker "${gpus[$slot]}" "${queue_files[$slot]}" &
  pids+=("$!")
done

# Wait for all GPU workers; remember whether any of them failed.
fail=0
for pid in "${pids[@]}"; do
  if ! wait "$pid"; then
    fail=1
  fi
done

if [[ $fail -ne 0 ]]; then
  echo "One or more workers failed." >&2
  exit 1
fi

echo "All evaluations complete."