Refactor run_evaluations_multi_gpu.sh to remove --tau-short requirement and add support for next-args and horizon-args files

2026-01-17 23:39:55 +08:00
parent 4686b56336
commit 248fb09c34
1 changed files with 54 additions and 19 deletions
--- a/run_evaluations_multi_gpu.sh
+++ b/run_evaluations_multi_gpu.sh
@@ -4,12 +4,12 @@ set -euo pipefail
 usage() {
  cat <<'USAGE'
 Usage:
-  ./run_evaluations_multi_gpu.sh --gpus 0,1,2 --tau-short 0.5 [options] [-- <extra eval args>]
+  ./run_evaluations_multi_gpu.sh --gpus 0,1,2 [options] [-- <common eval args>]

 Description:
  Discovers trained run directories (containing best_model.pt + train_config.json)
  and runs BOTH evaluations on each run:
-    1) evaluate_next_event.py (requires --tau_short)
+    1) evaluate_next_event.py
    2) evaluate_horizon.py

  Jobs are distributed round-robin across the provided GPU list and each GPU runs
@@ -17,24 +17,29 @@ Description:

 Options:
  --gpus              Comma-separated GPU ids (required), e.g. 0,1,2
-  --tau-short         Short-window horizon (years) for evaluate_next_event.py (required)
  --runs-root         Root directory containing run subfolders (default: runs)
  --pattern           Shell glob to filter run folder basenames (default: *)
  --run-dirs-file     Text file with one run_dir per line (overrides --runs-root)
  --horizons          Horizon grid in years (space-separated list). If omitted, uses script defaults.
  --age-bins          Age bin boundaries in years (space-separated list). If omitted, uses script defaults.
+  --next-args-file     File with one CLI argument per line appended only to evaluate_next_event.py
+  --horizon-args-file  File with one CLI argument per line appended only to evaluate_horizon.py
  --python            Python executable/command (default: python)
  --log-dir           Directory for logs (default: eval_logs)
  --dry-run           Print commands without executing
  --help|-h           Show this help

-Extra eval args:
+Common eval args:
  Anything after `--` is appended to BOTH evaluation commands.
-  Examples: -- --batch_size 512 --num_workers 4 --seed 0 --min_pos 20 --no_tqdm
+  Use this only for flags supported by BOTH scripts (e.g. --batch_size, --num_workers, --seed, --min_pos, --no_tqdm).
+
+Per-eval args:
+  For eval-specific flags (e.g. evaluate_horizon.py --topk_list / --workload_fracs), use --horizon-args-file.
+  Args files are "one argument per line"; blank lines are ignored.

 Examples:
-  ./run_evaluations_multi_gpu.sh --gpus 0,1 --tau-short 0.5
-  ./run_evaluations_multi_gpu.sh --gpus 0,1 --tau-short 0.5 --runs-root runs --pattern "delphi_*" \
+  ./run_evaluations_multi_gpu.sh --gpus 0,1
+  ./run_evaluations_multi_gpu.sh --gpus 0,1 --runs-root runs --pattern "delphi_*" \
    --horizons 0.25 0.5 1 2 5 10 --age-bins 40 45 50 55 60 65 70 75 inf -- --batch_size 512 --num_workers 4

 USAGE
@@ -47,11 +52,11 @@ gpu_list=""
 python_cmd="python"
 log_dir="eval_logs"
 dry_run=0
-
-tau_short=""
 horizons=()
 age_bins=()
 extra_args=()
+next_args_file=""
+horizon_args_file=""

 while [[ $# -gt 0 ]]; do
  case "$1" in
@@ -59,10 +64,6 @@ while [[ $# -gt 0 ]]; do
      gpu_list="${2-}"
      shift 2
      ;;
-    --tau-short)
-      tau_short="${2-}"
-      shift 2
-      ;;
    --runs-root)
      runs_root="${2-}"
      shift 2
@@ -75,6 +76,14 @@ while [[ $# -gt 0 ]]; do
      run_dirs_file="${2-}"
      shift 2
      ;;
+    --next-args-file)
+      next_args_file="${2-}"
+      shift 2
+      ;;
+    --horizon-args-file)
+      horizon_args_file="${2-}"
+      shift 2
+      ;;
    --python)
      python_cmd="${2-}"
      shift 2
@@ -125,10 +134,21 @@ if [[ -z "$gpu_list" ]]; then
  exit 2
 fi

-if [[ -z "$tau_short" ]]; then
-  echo "Error: --tau-short is required (e.g. --tau-short 0.5)." >&2
-  exit 2
-fi
+read_args_file() { # Print every non-empty line of the file named by $1 to stdout, one per line.
+  local f="${1-}" # path to the args file; unset or empty means no file was given
+  if [[ -z "$f" ]]; then
+    return 0 # no file given: print nothing and report success
+  fi
+  if [[ ! -f "$f" ]]; then
+    echo "Error: args file not found: $f" >&2
+    exit 2 # NOTE(review): callers read this via < <(read_args_file ...), so this exit ends only that subshell and the caller just sees no args — confirm intended
+  fi
+  while IFS= read -r line || [[ -n "$line" ]]; do # the || test keeps a last line that has no trailing newline; NOTE(review): line is not declared local
+    line="${line%$'\r'}" # strip one trailing carriage return so CRLF files work
+    [[ -z "$line" ]] && continue # skip empty lines; whitespace-only lines are still printed
+    printf '%s\n' "$line"
+  done < "$f"
+}

 mkdir -p "$log_dir"

@@ -226,13 +246,18 @@ for i in "${!gpus[@]}"; do
        echo "gpu: $gpu"
        echo "job_id: $jid"
        echo "run_dir: $run_dir"
-        echo "tau_short: $tau_short"
        if [[ ${#horizons[@]} -gt 0 ]]; then
          echo "horizons: ${horizons[*]}"
        fi
        if [[ ${#age_bins[@]} -gt 0 ]]; then
          echo "age_bins: ${age_bins[*]}"
        fi
+        if [[ -n "${next_args_file}" ]]; then
+          echo "next_args_file: ${next_args_file}"
+        fi
+        if [[ -n "${horizon_args_file}" ]]; then
+          echo "horizon_args_file: ${horizon_args_file}"
+        fi
        if [[ ${#extra_args[@]} -gt 0 ]]; then
          echo "extra_args: ${extra_args[*]}"
        fi
@@ -240,10 +265,15 @@ for i in "${!gpus[@]}"; do
      } > "$log_file"

      # Build argv arrays
-      next_cmd=("$python_cmd" evaluate_next_event.py --run_dir "$run_dir" --tau_short "$tau_short")
+      next_cmd=("$python_cmd" evaluate_next_event.py --run_dir "$run_dir")
      if [[ ${#age_bins[@]} -gt 0 ]]; then
        next_cmd+=(--age_bins "${age_bins[@]}")
      fi
+      if [[ -n "${next_args_file}" ]]; then
+        while IFS= read -r a; do
+          next_cmd+=("$a")
+        done < <(read_args_file "${next_args_file}")
+      fi
      if [[ ${#extra_args[@]} -gt 0 ]]; then
        next_cmd+=("${extra_args[@]}")
      fi
@@ -255,6 +285,11 @@ for i in "${!gpus[@]}"; do
      if [[ ${#age_bins[@]} -gt 0 ]]; then
        hor_cmd+=(--age_bins "${age_bins[@]}")
      fi
+      if [[ -n "${horizon_args_file}" ]]; then
+        while IFS= read -r a; do
+          hor_cmd+=("$a")
+        done < <(read_args_file "${horizon_args_file}")
+      fi
      if [[ ${#extra_args[@]} -gt 0 ]]; then
        hor_cmd+=("${extra_args[@]}")
      fi