Refactor evaluation scripts for multi-GPU execution

- Removed `run_evaluations_multi_gpu.sh` script as it was redundant.
- Updated `run_experiments_multi_gpu.sh` to handle evaluation jobs instead of training.
- Changed command-line options to support evaluation-specific parameters.
- Implemented run directory discovery and validation for evaluation jobs.
- Enhanced logging to capture evaluation details and outputs.
- Added options for centralized output management and skipping existing results.
2026-01-18 17:38:20 +08:00
parent b80d9a4256
commit 0057bc0dd9
3 changed files with 1391 additions and 413 deletions
--- a/evaluate.py
+++ b/evaluate.py
--- a/run_evaluations_multi_gpu.sh
+++ b/run_evaluations_multi_gpu.sh
@@ -1,354 +0,0 @@
 #!/usr/bin/env bash
 set -euo pipefail
 # Print the command-line help text for this script on stdout.
 # The heredoc delimiter is quoted ('USAGE'), so the text is emitted literally
 # with no variable or command expansion. Callers decide the exit status.
 usage() {
  cat <<'USAGE'
 Usage:
  ./run_evaluations_multi_gpu.sh --gpus 0,1,2 [options] [-- <common eval args>]
 Description:
  Discovers trained run directories (containing best_model.pt + train_config.json)
  and runs BOTH evaluations on each run:
    1) evaluate_next_event.py
    2) evaluate_horizon.py
  Jobs are distributed round-robin across the provided GPU list and each GPU runs
  at most one job at a time.
 Options:
  --gpus              Comma-separated GPU ids (required), e.g. 0,1,2
  --runs-root         Root directory containing run subfolders (default: runs)
  --pattern           Shell glob to filter run folder basenames (default: *)
  --run-dirs-file     Text file with one run_dir per line (overrides --runs-root)
  --horizons          Horizon grid in years (space-separated list). If omitted, uses script defaults.
  --age-bins          Age bin boundaries in years (space-separated list). If omitted, uses script defaults.
  --next-args-file     File with one CLI argument per line appended only to evaluate_next_event.py
  --horizon-args-file  File with one CLI argument per line appended only to evaluate_horizon.py
  --python            Python executable/command (default: python)
  --log-dir           Directory for logs (default: eval_logs)
  --dry-run           Print commands without executing
  --help|-h           Show this help
 Common eval args:
  Anything after `--` is appended to BOTH evaluation commands.
  Use this only for flags supported by BOTH scripts (e.g. --batch_size, --num_workers, --max_cpu_cores, --seed, --min_pos, --no_tqdm).
 Per-eval args:
  For eval-specific flags (e.g. evaluate_horizon.py --topk_list / --workload_fracs), use --horizon-args-file.
  Args files are "one argument per line"; blank lines are ignored.
 Examples:
  ./run_evaluations_multi_gpu.sh --gpus 0,1
  ./run_evaluations_multi_gpu.sh --gpus 0,1 --runs-root runs --pattern "delphi_*" \
    --horizons 0.25 0.5 1 2 5 10 --age-bins 40 45 50 55 60 65 70 75 inf -- --batch_size 512 --num_workers 4
  ./run_evaluations_multi_gpu.sh --gpus 0,1 --runs-root runs --pattern "delphi_*" \
    -- --batch_size 512 --num_workers 4 --max_cpu_cores -1
 USAGE
 }
 # Option defaults; each may be overridden by the command-line parser below.
 # Required input: stays empty until --gpus is supplied.
 gpu_list=''
 # Where to look for run directories.
 runs_root='runs'
 pattern='*'
 run_dirs_file=''
 # How to launch and where to log.
 python_cmd='python'
 log_dir='eval_logs'
 dry_run=0
 # Per-evaluation argument files (empty means "none").
 next_args_file=''
 horizon_args_file=''
 # List-valued options start out empty.
 declare -a horizons=() age_bins=() extra_args=()
 # Exit with a usage error (status 2) when an option that needs a value is the
 # last word on the command line. Without this guard `shift 2` fails and
 # `set -e` ends the script silently with status 1.
 # Arguments: $1 - option name as typed
 #            $2 - number of remaining arguments ($# at the call site)
 require_value() {
  if [[ "${2:-0}" -lt 2 ]]; then
    echo "Error: ${1-} requires a value." >&2
    exit 2
  fi
 }
 # Parse the command line into the option globals (gpu_list, runs_root,
 # pattern, run_dirs_file, next_args_file, horizon_args_file, python_cmd,
 # log_dir, dry_run, horizons, age_bins, extra_args).
 # Arguments: the script's positional parameters.
 # Exits:     0 after printing help (--help|-h); 2 on an unknown argument or
 #            an option that is missing its value.
 parse_cli() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --gpus|--runs-root|--pattern|--run-dirs-file|--next-args-file|--horizon-args-file|--python|--log-dir)
        # All single-value options share the same "need one more word" check.
        require_value "$1" "$#"
        case "$1" in
          --gpus) gpu_list="$2" ;;
          --runs-root) runs_root="$2" ;;
          --pattern) pattern="$2" ;;
          --run-dirs-file) run_dirs_file="$2" ;;
          --next-args-file) next_args_file="$2" ;;
          --horizon-args-file) horizon_args_file="$2" ;;
          --python) python_cmd="$2" ;;
          --log-dir) log_dir="$2" ;;
        esac
        shift 2
        ;;
      --dry-run)
        dry_run=1
        shift
        ;;
      --horizons)
        # Consume every following word up to the next "--"-prefixed token.
        shift
        horizons=()
        while [[ $# -gt 0 && "$1" != --* ]]; do
          horizons+=("$1")
          shift
        done
        ;;
      --age-bins)
        # Same list syntax as --horizons.
        shift
        age_bins=()
        while [[ $# -gt 0 && "$1" != --* ]]; do
          age_bins+=("$1")
          shift
        done
        ;;
      --help|-h)
        usage
        exit 0
        ;;
      --)
        # Everything after a bare "--" is forwarded to both evaluation commands.
        shift
        extra_args=("$@")
        break
        ;;
      *)
        echo "Unknown argument: $1" >&2
        usage
        exit 2
        ;;
    esac
  done
 }
 parse_cli "$@"
 # --gpus is mandatory: stop with a usage-style error (status 2) when absent.
 [[ -n "$gpu_list" ]] || {
  echo "Error: --gpus is required (e.g. --gpus 0,1,2)." >&2
  exit 2
 }
 # Print the arguments stored in an "args file" on stdout, one per line.
 # A trailing carriage return is stripped from each line (CRLF files), blank
 # lines are skipped, and a final line without a newline is still emitted.
 # Arguments: $1 - path to the args file; empty or omitted means "no file".
 # Outputs:   one argument per line on stdout; an error message on stderr.
 # Returns:   0 when no path is given or the file was read; exits with
 #            status 2 when the path does not name a regular file.
 # NOTE: this script calls the function inside a process substitution, where
 # `exit 2` ends only that subshell and does not stop the calling worker.
 read_args_file() {
  local f="${1-}"
  local line  # keep the loop variable out of the caller's scope
  if [[ -z "$f" ]]; then
    return 0
  fi
  if [[ ! -f "$f" ]]; then
    echo "Error: args file not found: $f" >&2
    exit 2
  fi
  # `|| [[ -n "$line" ]]` keeps a last line that lacks a trailing newline.
  while IFS= read -r line || [[ -n "$line" ]]; do
    line="${line%$'\r'}" # handle CRLF
    [[ -z "$line" ]] && continue
    printf '%s\n' "$line"
  done < "$f"
 }
 # Make sure the log directory exists before any worker writes into it.
 mkdir -p "$log_dir"
 # Split the comma-separated --gpus value into the gpus array.
 IFS=',' read -r -a gpus <<< "$gpu_list"
 (( ${#gpus[@]} >= 1 )) || {
  echo "Error: parsed 0 GPUs from --gpus '$gpu_list'" >&2
  exit 2
 }
 # Turn an arbitrary string into a token that is safe to embed in a file name:
 # every character outside [A-Za-z0-9._-] is replaced by '_'.
 # Arguments: $1 - input string (optional; treated as empty when omitted)
 # Outputs:   the cleaned string on stdout, without a trailing newline
 sanitize() {
  local raw cleaned
  raw="${1-}"
  cleaned="${raw//[^A-Za-z0-9._-]/_}"
  printf '%s' "$cleaned"
 }
 # Discover run directories.
 # Two sources, in priority order:
 #   1. --run-dirs-file: every non-blank line is taken as a run directory
 #      as-is (no check for best_model.pt / train_config.json is made here).
 #   2. --runs-root + --pattern: subdirectories of the root whose basename
 #      matches the glob and which contain both best_model.pt and
 #      train_config.json.
 # Exits 2 when the chosen source is missing and 1 when nothing was found.
 run_dirs=()
 if [[ -n "$run_dirs_file" ]]; then
  if [[ ! -f "$run_dirs_file" ]]; then
    echo "Error: --run-dirs-file not found: $run_dirs_file" >&2
    exit 2
  fi
  # `|| [[ -n "$line" ]]` keeps a last line that lacks a trailing newline.
  while IFS= read -r line || [[ -n "$line" ]]; do
    line="${line%$'\r'}" # handle CRLF
    [[ -z "$line" ]] && continue
    run_dirs+=("$line")
  done < "$run_dirs_file"
 else
  if [[ ! -d "$runs_root" ]]; then
    echo "Error: runs root not found: $runs_root" >&2
    exit 2
  fi
  # nullglob makes an unmatched pattern expand to nothing instead of itself;
  # it is switched off again right after the loop.
  shopt -s nullglob
  # $pattern is intentionally unquoted so the shell expands it as a glob.
  for d in "$runs_root"/$pattern; do
    [[ -d "$d" ]] || continue
    [[ -f "$d/best_model.pt" ]] || continue
    [[ -f "$d/train_config.json" ]] || continue
    run_dirs+=("$d")
  done
  shopt -u nullglob
 fi
 if [[ ${#run_dirs[@]} -eq 0 ]]; then
  echo "Error: no run directories found." >&2
  exit 1
 fi
 echo "Queued ${#run_dirs[@]} run(s) across ${#gpus[@]} GPU(s): ${gpus[*]}"
 # Scratch directory for the per-GPU queue files. The variable is initialised
 # empty and the EXIT trap is installed before mktemp runs, so cleanup is a
 # harmless no-op if the script ends before the directory exists.
 _tmpdir=""
 # Remove the scratch directory, if one was created and still exists.
 cleanup() {
  [[ -n "${_tmpdir}" ]] || return 0
  [[ -d "${_tmpdir}" ]] || return 0
  rm -rf "${_tmpdir}"
 }
 trap cleanup EXIT
 _tmpdir="$(mktemp -d)"
 # One queue file per GPU slot; each row is "<job_id>\t<run_dir>" (TSV).
 queue_files=()
 for i in "${!gpus[@]}"; do
  qfile="${_tmpdir}/queue_${i}.tsv"
  queue_files+=("$qfile")
  # Create the file empty (or truncate it) so every slot has a queue to read.
  : > "$qfile"
 done
 # Deal the run directories out to the slots round-robin, numbering jobs
 # from 0 in discovery order.
 job_id=0
 for run_dir in "${run_dirs[@]}"; do
  slot=$(( job_id % ${#gpus[@]} ))
  printf '%s\t%s\n' "$job_id" "$run_dir" >> "${queue_files[slot]}"
  (( job_id += 1 ))
 done
 # Start one background worker (subshell) per GPU slot. Each worker reads its
 # own queue file and processes the jobs in it one after another, so a GPU
 # never runs more than one job at a time. Worker PIDs are collected in pids.
 pids=()
 for i in "${!gpus[@]}"; do
  gpu="${gpus[$i]}"
  qfile="${queue_files[$i]}"
  (
    # Export the assigned GPU id for every command this worker launches.
    export CUDA_VISIBLE_DEVICES="$gpu"
    # Queue rows are "<job_id>\t<run_dir>"; the `||` clause keeps a final row
    # that lacks a trailing newline.
    while IFS=$'\t' read -r jid run_dir || [[ -n "${jid-}" ]]; do
      [[ -z "${jid-}" ]] && continue
      [[ -z "${run_dir-}" ]] && continue
      ts="$(date +%Y%m%d-%H%M%S)"
      safe_run="$(sanitize "$(basename "$run_dir")")"
      log_file="${log_dir}/eval_${jid}_gpu${gpu}_${safe_run}_${ts}.log"
      # Write the log header; `>` creates or truncates the log file.
      {
        echo "===== EVALUATION START ====="
        echo "timestamp: $ts"
        echo "gpu: $gpu"
        echo "job_id: $jid"
        echo "run_dir: $run_dir"
        if [[ ${#horizons[@]} -gt 0 ]]; then
          echo "horizons: ${horizons[*]}"
        fi
        if [[ ${#age_bins[@]} -gt 0 ]]; then
          echo "age_bins: ${age_bins[*]}"
        fi
        if [[ -n "${next_args_file}" ]]; then
          echo "next_args_file: ${next_args_file}"
        fi
        if [[ -n "${horizon_args_file}" ]]; then
          echo "horizon_args_file: ${horizon_args_file}"
        fi
        if [[ ${#extra_args[@]} -gt 0 ]]; then
          echo "extra_args: ${extra_args[*]}"
        fi
        echo "============================"
      } > "$log_file"
      # Build argv arrays
      # next-event command: --age_bins, then the per-eval args file, then the
      # common args given after `--`.
      next_cmd=("$python_cmd" evaluate_next_event.py --run_dir "$run_dir")
      if [[ ${#age_bins[@]} -gt 0 ]]; then
        next_cmd+=(--age_bins "${age_bins[@]}")
      fi
      if [[ -n "${next_args_file}" ]]; then
        while IFS= read -r a; do
          next_cmd+=("$a")
        done < <(read_args_file "${next_args_file}")
      fi
      if [[ ${#extra_args[@]} -gt 0 ]]; then
        next_cmd+=("${extra_args[@]}")
      fi
      # horizon command: --horizons and --age_bins, then the per-eval args
      # file, then the common args.
      hor_cmd=("$python_cmd" evaluate_horizon.py --run_dir "$run_dir")
      if [[ ${#horizons[@]} -gt 0 ]]; then
        hor_cmd+=(--horizons "${horizons[@]}")
      fi
      if [[ ${#age_bins[@]} -gt 0 ]]; then
        hor_cmd+=(--age_bins "${age_bins[@]}")
      fi
      if [[ -n "${horizon_args_file}" ]]; then
        while IFS= read -r a; do
          hor_cmd+=("$a")
        done < <(read_args_file "${horizon_args_file}")
      fi
      if [[ ${#extra_args[@]} -gt 0 ]]; then
        hor_cmd+=("${extra_args[@]}")
      fi
      echo "[GPU $gpu] START job $jid: $run_dir"
      # Dry run: show the shell-quoted commands on stdout and in the log,
      # then move on without executing anything.
      if [[ $dry_run -eq 1 ]]; then
        {
          echo "[DRY-RUN] next-event cmd:"; printf '  %q' "${next_cmd[@]}"; echo
          echo "[DRY-RUN] horizon    cmd:"; printf '  %q' "${hor_cmd[@]}"; echo
          echo "[DRY-RUN] log: $log_file"
        } | tee -a "$log_file"
        echo "[GPU $gpu] DONE  job $jid (dry-run)"
        continue
      fi
      # errexit is switched off so that a failing first evaluation does not
      # end the worker before the second one runs and both codes are recorded.
      set +e
      {
        echo "--- RUN evaluate_next_event.py ---"
        printf 'cmd:'; printf ' %q' "${next_cmd[@]}"; echo
        "${next_cmd[@]}"
        rc1=$?
        echo "exit_code_next_event: $rc1"
        echo "--- RUN evaluate_horizon.py ---"
        printf 'cmd:'; printf ' %q' "${hor_cmd[@]}"; echo
        "${hor_cmd[@]}"
        rc2=$?
        echo "exit_code_horizon: $rc2"
        echo "===== EVALUATION END ======="
      } >> "$log_file" 2>&1
      set -e
      # A failure ends this worker with status 1; jobs still waiting in its
      # queue are not run.
      if [[ $rc1 -ne 0 || $rc2 -ne 0 ]]; then
        echo "[GPU $gpu] FAIL job $jid (next=$rc1 horizon=$rc2). Log: $log_file" >&2
        exit 1
      fi
      echo "[GPU $gpu] DONE  job $jid (log: $log_file)"
    done < "$qfile"
  ) &
  pids+=("$!")
 done
 # Wait for every worker; remember whether any of them ended with a non-zero
 # status, but keep waiting so all workers are reaped before reporting.
 fail=0
 for pid in "${pids[@]}"; do
  wait "$pid" || fail=1
 done
 if (( fail != 0 )); then
  echo "One or more workers failed." >&2
  exit 1
 fi
 echo "All evaluations complete."
--- a/run_experiments_multi_gpu.sh
+++ b/run_experiments_multi_gpu.sh
@@ -4,23 +4,41 @@ set -euo pipefail
 usage() {
  cat <<'USAGE'
 Usage:
-  ./run_experiments_multi_gpu.sh --gpus 0,1,2 [--experiments experiments.txt] [--cmd "python train.py"] [--log-dir experiment_logs] [--dry-run] [-- <extra train.py args>]
+  ./run_experiments_multi_gpu.sh --gpus 0,1,2 [--runs-file runs_to_eval.txt | --runs-root runs] [--cmd "python evaluate.py"] [--log-dir evaluation_logs] [--out-root eval_outputs] [--skip-existing] [--dry-run] [-- <extra evaluate.py args>]
 Description:
-  Distributes rows from experiments.txt across multiple GPUs (round-robin) and runs
+  Distributes evaluation jobs across multiple GPUs (round-robin) and runs
  at most one job per GPU at a time.
  A job is a run directory containing:
    - train_config.json
    - best_model.pt
  By default, run directories are auto-discovered under --runs-root (default: runs).
  Alternatively, provide --runs-file with one run_dir per line.
 Examples:
  # Auto-discover run dirs under ./runs
  ./run_experiments_multi_gpu.sh --gpus 0,1,2
-  ./run_experiments_multi_gpu.sh --gpus 0,1 --experiments experiments.txt -- --batch_size 64 --max_epochs 50
+
-  ./run_experiments_multi_gpu.sh --gpus 3 --cmd "python train.py" -- --loss_type discrete_time_cif
+  # Use an explicit list of run directories
  ./run_experiments_multi_gpu.sh --gpus 0,1 --runs-file runs_to_eval.txt
  # Centralize outputs (CSV bundle + summary JSON) under eval_outputs/
  ./run_experiments_multi_gpu.sh --gpus 0,1 --out-root eval_outputs
  # Forward args to evaluate.py
  ./run_experiments_multi_gpu.sh --gpus 0,1 -- --batch_size 512 --num_workers 8
 USAGE
 }
-experiments_file="experiments.txt"
+runs_file=""
 runs_root="runs"
 gpu_list=""
-cmd_str="python train.py"
+cmd_str="python evaluate.py"
-log_dir="experiment_logs"
+log_dir="evaluation_logs"
 out_root=""
 skip_existing=0
 dry_run=0
 extra_args=()
@@ -30,8 +48,12 @@ while [[ $# -gt 0 ]]; do
      gpu_list="${2-}"
      shift 2
      ;;
-    --experiments|-f)
+    --runs-file|-f)
-      experiments_file="${2-}"
+      runs_file="${2-}"
      shift 2
      ;;
    --runs-root)
      runs_root="${2-}"
      shift 2
      ;;
    --cmd)
@@ -42,6 +64,14 @@ while [[ $# -gt 0 ]]; do
      log_dir="${2-}"
      shift 2
      ;;
    --out-root)
      out_root="${2-}"
      shift 2
      ;;
    --skip-existing)
      skip_existing=1
      shift
      ;;
    --dry-run)
      dry_run=1
      shift
@@ -70,11 +100,6 @@ fi
 mkdir -p "$log_dir"
 if [[ ! -f "$experiments_file" ]]; then
  echo "Error: experiments file not found: $experiments_file" >&2
  exit 2
 fi
 IFS=',' read -r -a gpus <<< "$gpu_list"
 if [[ ${#gpus[@]} -lt 1 ]]; then
  echo "Error: parsed 0 GPUs from --gpus '$gpu_list'" >&2
@@ -85,7 +110,7 @@ fi
 # shellcheck disable=SC2206
 cmd=($cmd_str)
 if [[ ${#cmd[@]} -lt 2 ]]; then
-  echo "Error: --cmd should look like 'python train.py'" >&2
+  echo "Error: --cmd should look like 'python evaluate.py'" >&2
  exit 2
 fi
@@ -107,28 +132,81 @@ for i in "${!gpus[@]}"; do
  queue_files+=("$qfile")
 done
-# Distribute experiments round-robin.
+discover_runs() {
-exp_idx=0
+  local root="${1-}"
-while IFS= read -r line || [[ -n "$line" ]]; do
+  if [[ -z "$root" ]]; then
-  line="${line%$'\r'}" # handle CRLF
+    return 0
-  [[ -z "$line" ]] && continue
+  fi
-  # Skip header if present
+  if [[ ! -d "$root" ]]; then
-  if [[ "$line" == model_type,* ]]; then
+    echo "Error: runs root not found: $root" >&2
-    continue
+    return 2
  fi
-  slot=$((exp_idx % ${#gpus[@]}))
+  # shellcheck disable=SC2016
-  # Prefix a stable experiment index for logging.
+  find "$root" -mindepth 1 -maxdepth 1 -type d -print 2>/dev/null |
-  printf '%s,%s\n' "$exp_idx" "$line" >> "${queue_files[$slot]}"
+    sort
-  exp_idx=$((exp_idx + 1))
+}
 done < "$experiments_file"
-if [[ $exp_idx -eq 0 ]]; then
+run_dirs=()
-  echo "No experiments found in $experiments_file" >&2
+if [[ -n "$runs_file" ]]; then
  if [[ ! -f "$runs_file" ]]; then
    echo "Error: runs file not found: $runs_file" >&2
    exit 2
  fi
  while IFS= read -r line || [[ -n "$line" ]]; do
    line="${line%$'\r'}" # handle CRLF
    [[ -z "$line" ]] && continue
    [[ "$line" == \#* ]] && continue
    run_dirs+=("$line")
  done < "$runs_file"
 else
  while IFS= read -r d || [[ -n "${d-}" ]]; do
    [[ -z "${d-}" ]] && continue
    run_dirs+=("$d")
  done < <(discover_runs "$runs_root")
 fi
 if [[ ${#run_dirs[@]} -eq 0 ]]; then
  if [[ -n "$runs_file" ]]; then
    echo "No run directories found in $runs_file" >&2
  else
    echo "No run directories found under $runs_root" >&2
  fi
  exit 1
 fi
-echo "Queued $exp_idx experiments across ${#gpus[@]} GPU(s): ${gpus[*]}"
+is_valid_run_dir() {
  local d="${1-}"
  [[ -d "$d" ]] || return 1
  [[ -f "$d/train_config.json" ]] || return 1
  [[ -f "$d/best_model.pt" ]] || return 1
  return 0
 }
 valid_run_dirs=()
 for d in "${run_dirs[@]}"; do
  if is_valid_run_dir "$d"; then
    valid_run_dirs+=("$d")
  else
    echo "Skipping invalid run_dir (missing train_config.json or best_model.pt): $d" >&2
  fi
 done
 if [[ ${#valid_run_dirs[@]} -eq 0 ]]; then
  echo "No valid run directories found." >&2
  exit 1
 fi
 # Distribute evaluation jobs round-robin.
 job_idx=0
 for d in "${valid_run_dirs[@]}"; do
  slot=$((job_idx % ${#gpus[@]}))
  printf '%s,%s\n' "$job_idx" "$d" >> "${queue_files[$slot]}"
  job_idx=$((job_idx + 1))
 done
 echo "Queued $job_idx evaluation job(s) across ${#gpus[@]} GPU(s): ${gpus[*]}"
 sanitize() {
  # Replace any char outside [A-Za-z0-9._-] with '_'
@@ -145,44 +223,53 @@ for i in "${!gpus[@]}"; do
  (
    export CUDA_VISIBLE_DEVICES="$gpu"
-    while IFS=',' read -r exp_id model_type loss_type age_encoder full_cov || [[ -n "${exp_id-}" ]]; do
+    while IFS=',' read -r job_id run_dir || [[ -n "${job_id-}" ]]; do
-      # Skip empty lines
+      [[ -z "${job_id-}" ]] && continue
-      [[ -z "${exp_id-}" ]] && continue
+      [[ -z "${run_dir-}" ]] && continue
-      # Normalize booleans / strip whitespace
+      ts="$(date +%Y%m%d-%H%M%S)"
-      full_cov="${full_cov//[[:space:]]/}"
+      safe_run="$(sanitize "$(basename "$run_dir")")"
      # Decide output locations.
      out_dir_arg=()
      out_json=""
      if [[ -n "$out_root" ]]; then
        job_out_dir="${out_root%/}/run_${job_id}_${safe_run}"
        mkdir -p "$job_out_dir"
        out_json="$job_out_dir/evaluation_summary.json"
        out_dir_arg=(--out_dir "$job_out_dir" --output "$out_json")
      else
        out_json="$run_dir/evaluation_summary.json"
      fi
      if [[ $skip_existing -eq 1 && -f "$out_json" ]]; then
        echo "[GPU $gpu] SKIP job $job_id: already exists ($out_json)"
        continue
      fi
      run_cmd=("${cmd[@]}" \
-        --model_type "$model_type" \
+        --run_dir "$run_dir" \
-        --loss_type "$loss_type" \
+        --device cuda)
        --age_encoder "$age_encoder")
-      if [[ "$full_cov" == "True" || "$full_cov" == "true" || "$full_cov" == "1" ]]; then
+      if [[ ${#out_dir_arg[@]} -gt 0 ]]; then
-        run_cmd+=(--full_cov)
+        run_cmd+=("${out_dir_arg[@]}")
      fi
      if [[ ${#extra_args[@]} -gt 0 ]]; then
        run_cmd+=("${extra_args[@]}")
      fi
-      echo "[GPU $gpu] START exp $exp_id: model_type=$model_type loss_type=$loss_type age_encoder=$age_encoder full_cov=$full_cov"
+      echo "[GPU $gpu] START job $job_id: run_dir=$run_dir"
      ts="$(date +%Y%m%d-%H%M%S)"
      safe_model="$(sanitize "$model_type")"
      safe_loss="$(sanitize "$loss_type")"
      safe_age="$(sanitize "$age_encoder")"
      safe_cov="$(sanitize "$full_cov")"
      log_file="${log_dir}/exp_${exp_id}_gpu${gpu}_${safe_model}_${safe_loss}_${safe_age}_${safe_cov}_${ts}.log"
      log_file="${log_dir}/eval_${job_id}_gpu${gpu}_${safe_run}_${ts}.log"
      {
-        echo "===== EXPERIMENT START ====="
+        echo "===== EVALUATION START ====="
        echo "timestamp: $ts"
        echo "gpu: $gpu"
-        echo "exp_id: $exp_id"
+        echo "job_id: $job_id"
-        echo "model_type: $model_type"
+        echo "run_dir: $run_dir"
-        echo "loss_type: $loss_type"
+        echo "out_root: ${out_root:-<default in run_dir>}"
-        echo "age_encoder: $age_encoder"
+        echo "out_json: $out_json"
        echo "full_cov: $full_cov"
        printf 'cmd:'
        printf ' %q' "${run_cmd[@]}"
        echo
@@ -203,16 +290,16 @@ for i in "${!gpus[@]}"; do
        {
          echo "============================"
          echo "exit_code: $rc"
-          echo "===== EXPERIMENT END ======="
+          echo "===== EVALUATION END ======="
        } >> "$log_file"
        if [[ $rc -ne 0 ]]; then
-          echo "[GPU $gpu] FAIL exp $exp_id (exit=$rc). Log: $log_file" >&2
+          echo "[GPU $gpu] FAIL job $job_id (exit=$rc). Log: $log_file" >&2
          exit "$rc"
        fi
      fi
-      echo "[GPU $gpu] DONE  exp $exp_id (log: $log_file)"
+      echo "[GPU $gpu] DONE  job $job_id (log: $log_file)"
    done < "$qfile"
  ) &
@@ -232,4 +319,4 @@ if [[ $fail -ne 0 ]]; then
  exit 1
 fi
-echo "All experiments complete."
+echo "All evaluations complete."