DeepHealth/run_evaluations_multi_gpu.sh

#!/usr/bin/env bash
set -euo pipefail

#######################################
# Print the command-line help text.
# Arguments: none
# Outputs:   the help text on stdout (callers redirect it if needed)
#######################################
usage() {
  # Quoted delimiter: the text is emitted literally (no expansion of `...`,
  # $vars or the trailing backslash in the examples).
  cat <<'HELP_TEXT'
Usage:
  ./run_evaluations_multi_gpu.sh --gpus 0,1,2 --tau-short 0.5 [options] [-- <extra eval args>]

Description:
  Discovers trained run directories (containing best_model.pt + train_config.json)
  and runs BOTH evaluations on each run:
    1) evaluate_next_event.py (requires --tau_short)
    2) evaluate_horizon.py

  Jobs are distributed round-robin across the provided GPU list and each GPU runs
  at most one job at a time.

Options:
  --gpus              Comma-separated GPU ids (required), e.g. 0,1,2
  --tau-short         Short-window horizon (years) for evaluate_next_event.py (required)
  --runs-root         Root directory containing run subfolders (default: runs)
  --pattern           Shell glob to filter run folder basenames (default: *)
  --run-dirs-file     Text file with one run_dir per line (overrides --runs-root)
  --horizons          Horizon grid in years (space-separated list). If omitted, uses script defaults.
  --age-bins          Age bin boundaries in years (space-separated list). If omitted, uses script defaults.
  --python            Python executable/command (default: python)
  --log-dir           Directory for logs (default: eval_logs)
  --dry-run           Print commands without executing
  --help|-h           Show this help

Extra eval args:
  Anything after `--` is appended to BOTH evaluation commands.
  Examples: -- --batch_size 512 --num_workers 4 --seed 0 --min_pos 20 --no_tqdm

Examples:
  ./run_evaluations_multi_gpu.sh --gpus 0,1 --tau-short 0.5
  ./run_evaluations_multi_gpu.sh --gpus 0,1 --tau-short 0.5 --runs-root runs --pattern "delphi_*" \
    --horizons 0.25 0.5 1 2 5 10 --age-bins 40 45 50 55 60 65 70 75 inf -- --batch_size 512 --num_workers 4

HELP_TEXT
}

# ---------------------------------------------------------------------------
# Option defaults. parse_args overwrites these globals from the command line.
# ---------------------------------------------------------------------------
runs_root="runs"
pattern="*"
run_dirs_file=""
gpu_list=""
python_cmd="python"
log_dir="eval_logs"
dry_run=0

tau_short=""
horizons=()
age_bins=()
extra_args=()

#######################################
# Abort with a clear message when a value-taking option has no value.
# Without this check `shift 2` fails on a trailing option, which makes the
# script die silently under `set -e`.
# Arguments: "$@" of the caller, i.e. $1 = option name, $2 = its value
# Outputs:   error message on stderr
# Exits:     2 when the value is missing
#######################################
_require_value() {
  if [[ $# -lt 2 ]]; then
    echo "Error: $1 requires a value." >&2
    exit 2
  fi
}

#######################################
# Parse command-line arguments into the option globals above.
# Globals (written): gpu_list, tau_short, runs_root, pattern, run_dirs_file,
#                    python_cmd, log_dir, dry_run, horizons, age_bins,
#                    extra_args
# Arguments: the script's command-line arguments
# Outputs:   usage/error text on stderr for invalid input; usage on stdout
#            for --help
# Exits:     0 after --help, 2 on an unknown argument or a missing value
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --gpus)
        _require_value "$@"
        gpu_list="$2"
        shift 2
        ;;
      --tau-short)
        _require_value "$@"
        tau_short="$2"
        shift 2
        ;;
      --runs-root)
        _require_value "$@"
        runs_root="$2"
        shift 2
        ;;
      --pattern)
        _require_value "$@"
        pattern="$2"
        shift 2
        ;;
      --run-dirs-file)
        _require_value "$@"
        run_dirs_file="$2"
        shift 2
        ;;
      --python)
        _require_value "$@"
        python_cmd="$2"
        shift 2
        ;;
      --log-dir)
        _require_value "$@"
        log_dir="$2"
        shift 2
        ;;
      --dry-run)
        dry_run=1
        shift
        ;;
      --horizons)
        # Multi-value option: consume words until the next `--something`
        # (which also covers the bare `--` separator).
        shift
        horizons=()
        while [[ $# -gt 0 && "$1" != --* ]]; do
          horizons+=("$1")
          shift
        done
        ;;
      --age-bins)
        # Multi-value option, same termination rule as --horizons.
        shift
        age_bins=()
        while [[ $# -gt 0 && "$1" != --* ]]; do
          age_bins+=("$1")
          shift
        done
        ;;
      --help|-h)
        usage
        exit 0
        ;;
      --)
        # Everything after `--` is passed through verbatim to both
        # evaluation commands.
        shift
        extra_args=("$@")
        break
        ;;
      *)
        # Diagnostics (including the usage text) belong on stderr here.
        echo "Unknown argument: $1" >&2
        usage >&2
        exit 2
        ;;
    esac
  done
}

parse_args "$@"

# --- Validate required options ---------------------------------------------
if [[ -z "$gpu_list" ]]; then
  echo "Error: --gpus is required (e.g. --gpus 0,1,2)." >&2
  exit 2
fi

if [[ -z "$tau_short" ]]; then
  echo "Error: --tau-short is required (e.g. --tau-short 0.5)." >&2
  exit 2
fi

# Logs are written in dry-run mode as well, so the directory is always needed.
mkdir -p "$log_dir"

# --- Parse the GPU list -----------------------------------------------------
# Split on commas, strip whitespace and drop empty fields. Otherwise inputs
# such as "0, 1", "0,,1" or "," yield blank/space-padded ids, and a worker
# started with a blank CUDA_VISIBLE_DEVICES would not see any GPU.
gpus=()
_gpu_fields=()
IFS=',' read -r -a _gpu_fields <<< "$gpu_list"
# The ${arr[@]+...} form keeps an empty array safe under `set -u` on older bash.
for _gpu_id in ${_gpu_fields[@]+"${_gpu_fields[@]}"}; do
  _gpu_id="${_gpu_id//[[:space:]]/}"
  if [[ -n "$_gpu_id" ]]; then
    gpus+=("$_gpu_id")
  fi
done

if [[ ${#gpus[@]} -lt 1 ]]; then
  echo "Error: parsed 0 GPUs from --gpus '$gpu_list'" >&2
  exit 2
fi

#######################################
# Make a string safe to embed in a file name.
# Arguments: $1 - input string (optional; treated as empty when absent)
# Outputs:   the input with every character outside [A-Za-z0-9._-]
#            replaced by '_', written to stdout without a trailing newline
#######################################
sanitize() {
  local raw="${1-}"
  # Negated bracket expression: matches exactly the characters to replace.
  local -r unsafe='[^A-Za-z0-9._-]'
  printf '%s' "${raw//$unsafe/_}"
}

# Discover run directories.
# Two sources, in priority order:
#   1) --run-dirs-file: every non-empty line is taken verbatim as a run dir
#      (no existence check is done here).
#   2) --runs-root + --pattern: sub-directories of the root whose basename
#      matches the glob and that contain best_model.pt and train_config.json.
run_dirs=()
if [[ -n "$run_dirs_file" ]]; then
  if [[ ! -f "$run_dirs_file" ]]; then
    echo "Error: --run-dirs-file not found: $run_dirs_file" >&2
    exit 2
  fi
  # `|| [[ -n "$line" ]]` keeps a final line that lacks a trailing newline.
  while IFS= read -r line || [[ -n "$line" ]]; do
    line="${line%$'\r'}" # handle CRLF
    [[ -z "$line" ]] && continue
    run_dirs+=("$line")
  done < "$run_dirs_file"
else
  if [[ ! -d "$runs_root" ]]; then
    echo "Error: runs root not found: $runs_root" >&2
    exit 2
  fi
  # $pattern must stay unquoted so it is expanded as a glob, but it must not
  # be word-split: with the default IFS a pattern containing whitespace would
  # turn into several unrelated patterns. An empty IFS disables splitting
  # while leaving pathname expansion active.
  shopt -s nullglob
  _saved_ifs="$IFS"
  IFS=
  for d in "$runs_root"/$pattern; do
    [[ -d "$d" ]] || continue
    [[ -f "$d/best_model.pt" ]] || continue
    [[ -f "$d/train_config.json" ]] || continue
    run_dirs+=("$d")
  done
  IFS="$_saved_ifs"
  shopt -u nullglob
fi

if [[ ${#run_dirs[@]} -eq 0 ]]; then
  echo "Error: no run directories found." >&2
  exit 1
fi

echo "Queued ${#run_dirs[@]} run(s) across ${#gpus[@]} GPU(s): ${gpus[*]}"

# Scratch directory holding the per-GPU queue files. It starts empty so that
# cleanup is a no-op if the script exits before mktemp has run.
_tmpdir=""

#######################################
# EXIT handler: remove the scratch directory if it was created.
# Globals: _tmpdir (read)
#######################################
cleanup() {
  # Nothing to remove when the directory was never created or is already gone.
  [[ -n "${_tmpdir}" && -d "${_tmpdir}" ]] || return 0
  rm -rf "${_tmpdir}"
}
# Install the handler before creating the directory so no exit path leaks it.
trap cleanup EXIT

_tmpdir="$(mktemp -d)"

# Build one queue file per GPU slot (TSV: job_id \t run_dir), then deal the
# run directories out to the slots round-robin.
queue_files=()
num_slots=${#gpus[@]}
for ((slot = 0; slot < num_slots; slot++)); do
  queue_files+=("${_tmpdir}/queue_${slot}.tsv")
  # Create/truncate the file so every worker has a queue, even an empty one.
  : > "${queue_files[slot]}"
done

job_id=0
for run_dir in "${run_dirs[@]}"; do
  # Job N goes to slot N mod <number of GPUs>.
  printf '%s\t%s\n' "$job_id" "$run_dir" >> "${queue_files[job_id % num_slots]}"
  job_id=$((job_id + 1))
done

pids=()
for i in "${!gpus[@]}"; do
  gpu="${gpus[$i]}"
  qfile="${queue_files[$i]}"

  (
    export CUDA_VISIBLE_DEVICES="$gpu"

    while IFS=$'\t' read -r jid run_dir || [[ -n "${jid-}" ]]; do
      [[ -z "${jid-}" ]] && continue
      [[ -z "${run_dir-}" ]] && continue

      ts="$(date +%Y%m%d-%H%M%S)"
      safe_run="$(sanitize "$(basename "$run_dir")")"
      log_file="${log_dir}/eval_${jid}_gpu${gpu}_${safe_run}_${ts}.log"

      {
        echo "===== EVALUATION START ====="
        echo "timestamp: $ts"
        echo "gpu: $gpu"
        echo "job_id: $jid"
        echo "run_dir: $run_dir"
        echo "tau_short: $tau_short"
        if [[ ${#horizons[@]} -gt 0 ]]; then
          echo "horizons: ${horizons[*]}"
        fi
        if [[ ${#age_bins[@]} -gt 0 ]]; then
          echo "age_bins: ${age_bins[*]}"
        fi
        if [[ ${#extra_args[@]} -gt 0 ]]; then
          echo "extra_args: ${extra_args[*]}"
        fi
        echo "============================"
      } > "$log_file"

      # Build argv arrays
      next_cmd=("$python_cmd" evaluate_next_event.py --run_dir "$run_dir" --tau_short "$tau_short")
      if [[ ${#age_bins[@]} -gt 0 ]]; then
        next_cmd+=(--age_bins "${age_bins[@]}")
      fi
      if [[ ${#extra_args[@]} -gt 0 ]]; then
        next_cmd+=("${extra_args[@]}")
      fi

      hor_cmd=("$python_cmd" evaluate_horizon.py --run_dir "$run_dir")
      if [[ ${#horizons[@]} -gt 0 ]]; then
        hor_cmd+=(--horizons "${horizons[@]}")
      fi
      if [[ ${#age_bins[@]} -gt 0 ]]; then
        hor_cmd+=(--age_bins "${age_bins[@]}")
      fi
      if [[ ${#extra_args[@]} -gt 0 ]]; then
        hor_cmd+=("${extra_args[@]}")
      fi

      echo "[GPU $gpu] START job $jid: $run_dir"

      if [[ $dry_run -eq 1 ]]; then
        {
          echo "[DRY-RUN] next-event cmd:"; printf '  %q' "${next_cmd[@]}"; echo
          echo "[DRY-RUN] horizon    cmd:"; printf '  %q' "${hor_cmd[@]}"; echo
          echo "[DRY-RUN] log: $log_file"
        } | tee -a "$log_file"
        echo "[GPU $gpu] DONE  job $jid (dry-run)"
        continue
      fi

      set +e
      {
        echo "--- RUN evaluate_next_event.py ---"
        printf 'cmd:'; printf ' %q' "${next_cmd[@]}"; echo
        "${next_cmd[@]}"
        rc1=$?
        echo "exit_code_next_event: $rc1"

        echo "--- RUN evaluate_horizon.py ---"
        printf 'cmd:'; printf ' %q' "${hor_cmd[@]}"; echo
        "${hor_cmd[@]}"
        rc2=$?
        echo "exit_code_horizon: $rc2"

        echo "===== EVALUATION END ======="
      } >> "$log_file" 2>&1

      set -e

      if [[ $rc1 -ne 0 || $rc2 -ne 0 ]]; then
        echo "[GPU $gpu] FAIL job $jid (next=$rc1 horizon=$rc2). Log: $log_file" >&2
        exit 1
      fi

      echo "[GPU $gpu] DONE  job $jid (log: $log_file)"
    done < "$qfile"
  ) &

  pids+=("$!")
done

# Reap every worker. All of them are waited for (no early exit) so that one
# failure does not leave other workers un-waited; any non-zero status marks
# the whole run as failed.
fail=0
for pid in "${pids[@]}"; do
  wait "$pid" || fail=1
done

[[ $fail -eq 0 ]] || {
  echo "One or more workers failed." >&2
  exit 1
}

echo "All evaluations complete."