Refactor evaluation scripts for multi-GPU execution
- Removed the `run_evaluations_multi_gpu.sh` script, which was redundant.
- Updated `run_experiments_multi_gpu.sh` to run evaluation jobs instead of training jobs.
- Changed the command-line options to support evaluation-specific parameters.
- Implemented run-directory discovery and validation for evaluation jobs.
- Enhanced logging to capture evaluation details and outputs.
- Added options for centralized output management and for skipping existing results.
This commit is contained in:
1245
evaluate.py
Normal file
1245
evaluate.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -1,354 +0,0 @@
|
|||||||
#!/usr/bin/env bash
|
|
||||||
set -euo pipefail
|
|
||||||
|
|
||||||
# Print the command-line help text to stdout.
# The heredoc delimiter is quoted, so the text is emitted literally
# (no variable or command expansion takes place inside it).
usage() {
  cat <<'USAGE'
Usage:
./run_evaluations_multi_gpu.sh --gpus 0,1,2 [options] [-- <common eval args>]

Description:
Discovers trained run directories (containing best_model.pt + train_config.json)
and runs BOTH evaluations on each run:
1) evaluate_next_event.py
2) evaluate_horizon.py

Jobs are distributed round-robin across the provided GPU list and each GPU runs
at most one job at a time.

Options:
--gpus Comma-separated GPU ids (required), e.g. 0,1,2
--runs-root Root directory containing run subfolders (default: runs)
--pattern Shell glob to filter run folder basenames (default: *)
--run-dirs-file Text file with one run_dir per line (overrides --runs-root)
--horizons Horizon grid in years (space-separated list). If omitted, uses script defaults.
--age-bins Age bin boundaries in years (space-separated list). If omitted, uses script defaults.
--next-args-file File with one CLI argument per line appended only to evaluate_next_event.py
--horizon-args-file File with one CLI argument per line appended only to evaluate_horizon.py
--python Python executable/command (default: python)
--log-dir Directory for logs (default: eval_logs)
--dry-run Print commands without executing
--help|-h Show this help

Common eval args:
Anything after `--` is appended to BOTH evaluation commands.
Use this only for flags supported by BOTH scripts (e.g. --batch_size, --num_workers, --max_cpu_cores, --seed, --min_pos, --no_tqdm).

Per-eval args:
For eval-specific flags (e.g. evaluate_horizon.py --topk_list / --workload_fracs), use --horizon-args-file.
Args files are "one argument per line"; blank lines are ignored.

Examples:
./run_evaluations_multi_gpu.sh --gpus 0,1
./run_evaluations_multi_gpu.sh --gpus 0,1 --runs-root runs --pattern "delphi_*" \
--horizons 0.25 0.5 1 2 5 10 --age-bins 40 45 50 55 60 65 70 75 inf -- --batch_size 512 --num_workers 4
./run_evaluations_multi_gpu.sh --gpus 0,1 --runs-root runs --pattern "delphi_*" \
-- --batch_size 512 --num_workers 4 --max_cpu_cores -1

USAGE
}
|
|
||||||
|
|
||||||
# Defaults for every command-line option; parse_args overrides them in place.
runs_root="runs"
pattern="*"
run_dirs_file=""
gpu_list=""
python_cmd="python"
log_dir="eval_logs"
dry_run=0
horizons=()
age_bins=()
extra_args=()
next_args_file=""
horizon_args_file=""

# Abort with exit code 2 when a value-taking option is the last word on the
# command line. Without this check `shift 2` fails and, under `set -e`, the
# script dies without printing any explanation.
#   $1 - option name (for the message)
#   $2 - number of positional parameters left, counting the option itself
# An explicitly empty value (e.g. --pattern "") is still accepted.
require_value() {
  if [[ "$2" -lt 2 ]]; then
    echo "Error: $1 requires a value." >&2
    exit 2
  fi
}

# Parse the script's command line into the global option variables above.
#   --horizons / --age-bins consume every following word up to the next
#   word that starts with "--".
#   Everything after a bare "--" is stored verbatim in extra_args.
# Exits 0 after printing help, 2 on an unknown option or a missing value.
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --gpus)
        require_value "$1" "$#"
        gpu_list="${2-}"
        shift 2
        ;;
      --runs-root)
        require_value "$1" "$#"
        runs_root="${2-}"
        shift 2
        ;;
      --pattern)
        require_value "$1" "$#"
        pattern="${2-}"
        shift 2
        ;;
      --run-dirs-file)
        require_value "$1" "$#"
        run_dirs_file="${2-}"
        shift 2
        ;;
      --next-args-file)
        require_value "$1" "$#"
        next_args_file="${2-}"
        shift 2
        ;;
      --horizon-args-file)
        require_value "$1" "$#"
        horizon_args_file="${2-}"
        shift 2
        ;;
      --python)
        require_value "$1" "$#"
        python_cmd="${2-}"
        shift 2
        ;;
      --log-dir)
        require_value "$1" "$#"
        log_dir="${2-}"
        shift 2
        ;;
      --dry-run)
        dry_run=1
        shift
        ;;
      --horizons)
        shift
        horizons=()
        while [[ $# -gt 0 && "$1" != --* ]]; do
          horizons+=("$1")
          shift
        done
        ;;
      --age-bins)
        shift
        age_bins=()
        while [[ $# -gt 0 && "$1" != --* ]]; do
          age_bins+=("$1")
          shift
        done
        ;;
      --help|-h)
        usage
        exit 0
        ;;
      --)
        shift
        extra_args=("$@")
        break
        ;;
      *)
        echo "Unknown argument: $1" >&2
        usage
        exit 2
        ;;
    esac
  done
}

parse_args "$@"
|
|
||||||
|
|
||||||
# --gpus has no default: without it there is nothing to schedule jobs on,
# so stop with a usage-style error (exit code 2).
if [[ -z "$gpu_list" ]]; then
  echo "Error: --gpus is required (e.g. --gpus 0,1,2)." >&2
  exit 2
fi
|
|
||||||
|
|
||||||
# Print the arguments stored in a "one argument per line" file.
# Arguments:
#   $1 - path to the args file; an empty/omitted path prints nothing.
# Outputs:
#   One argument per line on stdout. Blank lines are dropped and a trailing
#   carriage return (CRLF files) is stripped from each line.
# Exit status:
#   Calls `exit 2` when the file does not exist. When the function runs
#   inside a command/process substitution this only ends that subshell.
read_args_file() {
  local f="${1-}"
  # Keep the loop variable local so the caller's own $line is not clobbered.
  local line

  if [[ -z "$f" ]]; then
    return 0
  fi
  if [[ ! -f "$f" ]]; then
    echo "Error: args file not found: $f" >&2
    exit 2
  fi

  # `|| [[ -n "$line" ]]` keeps a final line that lacks a trailing newline.
  while IFS= read -r line || [[ -n "$line" ]]; do
    line="${line%$'\r'}" # handle CRLF
    [[ -z "$line" ]] && continue
    printf '%s\n' "$line"
  done < "$f"
}
|
|
||||||
|
|
||||||
mkdir -p -- "$log_dir"

# Split the comma-separated --gpus value into the `gpus` array.
# Whitespace around an id is removed ("0, 1" -> 0 and 1). An empty id
# ("0,,1" or a trailing comma followed by nothing) is rejected, because it
# would otherwise be exported as an empty CUDA_VISIBLE_DEVICES for that worker.
IFS=',' read -r -a _raw_gpus <<< "$gpu_list"
gpus=()
# The ${arr[@]+...} form keeps an empty array safe under `set -u` on old bash.
for _gpu_id in ${_raw_gpus[@]+"${_raw_gpus[@]}"}; do
  _gpu_id="${_gpu_id//[[:space:]]/}"
  if [[ -z "$_gpu_id" ]]; then
    echo "Error: empty GPU id in --gpus '$gpu_list'" >&2
    exit 2
  fi
  gpus+=("$_gpu_id")
done

if [[ ${#gpus[@]} -lt 1 ]]; then
  echo "Error: parsed 0 GPUs from --gpus '$gpu_list'" >&2
  exit 2
fi
|
|
||||||
|
|
||||||
# Make a string safe to embed in a file name.
# Arguments:
#   $1 - arbitrary text (may be empty or omitted)
# Outputs:
#   The text with every character outside [A-Za-z0-9._-] replaced by '_',
#   written to stdout without a trailing newline.
sanitize() {
  local s="${1-}"
  s="${s//[^A-Za-z0-9._-]/_}"
  printf '%s' "$s"
}
|
|
||||||
|
|
||||||
# Discover run directories.
# A run directory is only queued when it exists and contains both
# best_model.pt and train_config.json. This holds for both sources:
#   * --run-dirs-file: one path per line (blank lines ignored, CRLF tolerated);
#     entries that fail the check are reported on stderr and skipped, so a
#     typo in the list cannot abort the rest of a GPU's queue later on.
#   * otherwise: every "$runs_root"/$pattern match that passes the check.
run_dirs=()
if [[ -n "$run_dirs_file" ]]; then
  if [[ ! -f "$run_dirs_file" ]]; then
    echo "Error: --run-dirs-file not found: $run_dirs_file" >&2
    exit 2
  fi
  while IFS= read -r line || [[ -n "$line" ]]; do
    line="${line%$'\r'}" # handle CRLF
    [[ -z "$line" ]] && continue
    if [[ -d "$line" && -f "$line/best_model.pt" && -f "$line/train_config.json" ]]; then
      run_dirs+=("$line")
    else
      echo "Skipping invalid run_dir (missing train_config.json or best_model.pt): $line" >&2
    fi
  done < "$run_dirs_file"
else
  if [[ ! -d "$runs_root" ]]; then
    echo "Error: runs root not found: $runs_root" >&2
    exit 2
  fi
  # nullglob: a pattern with no match expands to nothing instead of itself.
  shopt -s nullglob
  # $pattern is intentionally unquoted so that it is expanded as a glob.
  # shellcheck disable=SC2231
  for d in "$runs_root"/$pattern; do
    [[ -d "$d" ]] || continue
    [[ -f "$d/best_model.pt" ]] || continue
    [[ -f "$d/train_config.json" ]] || continue
    run_dirs+=("$d")
  done
  shopt -u nullglob
fi

if [[ ${#run_dirs[@]} -eq 0 ]]; then
  echo "Error: no run directories found." >&2
  exit 1
fi

echo "Queued ${#run_dirs[@]} run(s) across ${#gpus[@]} GPU(s): ${gpus[*]}"
|
|
||||||
|
|
||||||
# Scratch directory that holds the per-GPU queue files. It is created below
# and removed again by the EXIT trap, whatever path the script exits through.
_tmpdir=""

# Remove the scratch directory if it was created. Safe to call repeatedly
# and before mktemp has run (an empty _tmpdir is a no-op).
cleanup() {
  if [[ -n "${_tmpdir}" && -d "${_tmpdir}" ]]; then
    # `--` stops option parsing; `:?` aborts rather than expand to nothing.
    rm -rf -- "${_tmpdir:?}"
  fi
}
# Register the trap before creating the directory so it can never leak.
trap cleanup EXIT

_tmpdir="$(mktemp -d)"
|
|
||||||
|
|
||||||
# Prepare per-GPU queue files (TSV: job_id \t run_dir) and fill them
# round-robin from run_dirs.
# Globals read:    gpus, run_dirs, _tmpdir
# Globals written: queue_files (one path per GPU, same index as gpus)
# Job ids are assigned in run_dirs order starting at 0; job N goes to the
# queue with index N modulo the number of GPUs.
build_queues() {
  local i qfile slot run_dir
  local job_id=0

  queue_files=()
  for i in "${!gpus[@]}"; do
    qfile="${_tmpdir}/queue_${i}.tsv"
    : > "$qfile" # create/truncate so every GPU has a (possibly empty) queue
    queue_files+=("$qfile")
  done

  for run_dir in "${run_dirs[@]}"; do
    slot=$((job_id % ${#gpus[@]}))
    printf '%s\t%s\n' "$job_id" "$run_dir" >> "${queue_files[$slot]}"
    job_id=$((job_id + 1))
  done
}

build_queues
|
|
||||||
|
|
||||||
# Validate and load the per-eval args files once, before any worker starts.
# Doing this inside each job (via process substitution) hid a missing file:
# the error only ended the substitution's subshell and the evaluation ran
# silently without the requested arguments.
for _args_file in "$next_args_file" "$horizon_args_file"; do
  if [[ -n "$_args_file" && ! -f "$_args_file" ]]; then
    echo "Error: args file not found: $_args_file" >&2
    exit 2
  fi
done

next_file_args=()
if [[ -n "$next_args_file" ]]; then
  while IFS= read -r _arg; do
    next_file_args+=("$_arg")
  done < <(read_args_file "$next_args_file")
fi

horizon_file_args=()
if [[ -n "$horizon_args_file" ]]; then
  while IFS= read -r _arg; do
    horizon_file_args+=("$_arg")
  done < <(read_args_file "$horizon_args_file")
fi

# Start one background worker (subshell) per GPU. Each worker pins itself to
# its GPU through CUDA_VISIBLE_DEVICES and processes its own queue file
# sequentially, so a GPU never runs more than one job at a time.
pids=()
for i in "${!gpus[@]}"; do
  gpu="${gpus[$i]}"
  qfile="${queue_files[$i]}"

  (
    export CUDA_VISIBLE_DEVICES="$gpu"

    # Queue rows are "job_id<TAB>run_dir"; `|| [[ -n ... ]]` keeps a last
    # row that has no trailing newline.
    while IFS=$'\t' read -r jid run_dir || [[ -n "${jid-}" ]]; do
      [[ -z "${jid-}" ]] && continue
      [[ -z "${run_dir-}" ]] && continue

      ts="$(date +%Y%m%d-%H%M%S)"
      safe_run="$(sanitize "$(basename "$run_dir")")"
      log_file="${log_dir}/eval_${jid}_gpu${gpu}_${safe_run}_${ts}.log"

      # Log header (truncates/creates the per-job log file).
      {
        echo "===== EVALUATION START ====="
        echo "timestamp: $ts"
        echo "gpu: $gpu"
        echo "job_id: $jid"
        echo "run_dir: $run_dir"
        if [[ ${#horizons[@]} -gt 0 ]]; then
          echo "horizons: ${horizons[*]}"
        fi
        if [[ ${#age_bins[@]} -gt 0 ]]; then
          echo "age_bins: ${age_bins[*]}"
        fi
        if [[ -n "${next_args_file}" ]]; then
          echo "next_args_file: ${next_args_file}"
        fi
        if [[ -n "${horizon_args_file}" ]]; then
          echo "horizon_args_file: ${horizon_args_file}"
        fi
        if [[ ${#extra_args[@]} -gt 0 ]]; then
          echo "extra_args: ${extra_args[*]}"
        fi
        echo "============================"
      } > "$log_file"

      # Build argv arrays (arrays keep arguments with spaces intact).
      next_cmd=("$python_cmd" evaluate_next_event.py --run_dir "$run_dir")
      if [[ ${#age_bins[@]} -gt 0 ]]; then
        next_cmd+=(--age_bins "${age_bins[@]}")
      fi
      if [[ ${#next_file_args[@]} -gt 0 ]]; then
        next_cmd+=("${next_file_args[@]}")
      fi
      if [[ ${#extra_args[@]} -gt 0 ]]; then
        next_cmd+=("${extra_args[@]}")
      fi

      hor_cmd=("$python_cmd" evaluate_horizon.py --run_dir "$run_dir")
      if [[ ${#horizons[@]} -gt 0 ]]; then
        hor_cmd+=(--horizons "${horizons[@]}")
      fi
      if [[ ${#age_bins[@]} -gt 0 ]]; then
        hor_cmd+=(--age_bins "${age_bins[@]}")
      fi
      if [[ ${#horizon_file_args[@]} -gt 0 ]]; then
        hor_cmd+=("${horizon_file_args[@]}")
      fi
      if [[ ${#extra_args[@]} -gt 0 ]]; then
        hor_cmd+=("${extra_args[@]}")
      fi

      echo "[GPU $gpu] START job $jid: $run_dir"

      if [[ $dry_run -eq 1 ]]; then
        {
          echo "[DRY-RUN] next-event cmd:"; printf ' %q' "${next_cmd[@]}"; echo
          echo "[DRY-RUN] horizon cmd:"; printf ' %q' "${hor_cmd[@]}"; echo
          echo "[DRY-RUN] log: $log_file"
        } | tee -a "$log_file"
        echo "[GPU $gpu] DONE job $jid (dry-run)"
        continue
      fi

      # Run both evaluations even if the first one fails, recording each
      # exit code; errexit is suspended only around the two commands.
      set +e
      {
        echo "--- RUN evaluate_next_event.py ---"
        printf 'cmd:'; printf ' %q' "${next_cmd[@]}"; echo
        # stdin comes from /dev/null: otherwise the command inherits the
        # queue file feeding this loop and could swallow the pending jobs.
        "${next_cmd[@]}" < /dev/null
        rc1=$?
        echo "exit_code_next_event: $rc1"

        echo "--- RUN evaluate_horizon.py ---"
        printf 'cmd:'; printf ' %q' "${hor_cmd[@]}"; echo
        "${hor_cmd[@]}" < /dev/null
        rc2=$?
        echo "exit_code_horizon: $rc2"

        echo "===== EVALUATION END ======="
      } >> "$log_file" 2>&1
      set -e

      # A failed job ends this worker; jobs still queued on this GPU are
      # not started.
      if [[ $rc1 -ne 0 || $rc2 -ne 0 ]]; then
        echo "[GPU $gpu] FAIL job $jid (next=$rc1 horizon=$rc2). Log: $log_file" >&2
        exit 1
      fi

      echo "[GPU $gpu] DONE job $jid (log: $log_file)"
    done < "$qfile"
  ) &

  pids+=("$!")
done
|
|
||||||
|
|
||||||
# Wait for every worker and remember whether any of them failed.
# `wait "$pid"` returns that worker's exit status; it sits in an `if`
# condition so a failing worker does not trip `set -e` before the remaining
# workers have been reaped.
fail=0
for pid in "${pids[@]}"; do
  if ! wait "$pid"; then
    fail=1
  fi
done

if [[ $fail -ne 0 ]]; then
  echo "One or more workers failed." >&2
  exit 1
fi

echo "All evaluations complete."
|
|
||||||
@@ -4,23 +4,41 @@ set -euo pipefail
|
|||||||
usage() {
|
usage() {
|
||||||
cat <<'USAGE'
|
cat <<'USAGE'
|
||||||
Usage:
|
Usage:
|
||||||
./run_experiments_multi_gpu.sh --gpus 0,1,2 [--experiments experiments.txt] [--cmd "python train.py"] [--log-dir experiment_logs] [--dry-run] [-- <extra train.py args>]
|
./run_experiments_multi_gpu.sh --gpus 0,1,2 [--runs-file runs_to_eval.txt | --runs-root runs] [--cmd "python evaluate.py"] [--log-dir evaluation_logs] [--out-root eval_outputs] [--skip-existing] [--dry-run] [-- <extra evaluate.py args>]
|
||||||
|
|
||||||
Description:
|
Description:
|
||||||
Distributes rows from experiments.txt across multiple GPUs (round-robin) and runs
|
Distributes evaluation jobs across multiple GPUs (round-robin) and runs
|
||||||
at most one job per GPU at a time.
|
at most one job per GPU at a time.
|
||||||
|
|
||||||
|
A job is a run directory containing:
|
||||||
|
- train_config.json
|
||||||
|
- best_model.pt
|
||||||
|
|
||||||
|
By default, run directories are auto-discovered under --runs-root (default: runs).
|
||||||
|
Alternatively, provide --runs-file with one run_dir per line.
|
||||||
|
|
||||||
Examples:
|
Examples:
|
||||||
|
# Auto-discover run dirs under ./runs
|
||||||
./run_experiments_multi_gpu.sh --gpus 0,1,2
|
./run_experiments_multi_gpu.sh --gpus 0,1,2
|
||||||
./run_experiments_multi_gpu.sh --gpus 0,1 --experiments experiments.txt -- --batch_size 64 --max_epochs 50
|
|
||||||
./run_experiments_multi_gpu.sh --gpus 3 --cmd "python train.py" -- --loss_type discrete_time_cif
|
# Use an explicit list of run directories
|
||||||
|
./run_experiments_multi_gpu.sh --gpus 0,1 --runs-file runs_to_eval.txt
|
||||||
|
|
||||||
|
# Centralize outputs (CSV bundle + summary JSON) under eval_outputs/
|
||||||
|
./run_experiments_multi_gpu.sh --gpus 0,1 --out-root eval_outputs
|
||||||
|
|
||||||
|
# Forward args to evaluate.py
|
||||||
|
./run_experiments_multi_gpu.sh --gpus 0,1 -- --batch_size 512 --num_workers 8
|
||||||
USAGE
|
USAGE
|
||||||
}
|
}
|
||||||
|
|
||||||
experiments_file="experiments.txt"
|
runs_file=""
|
||||||
|
runs_root="runs"
|
||||||
gpu_list=""
|
gpu_list=""
|
||||||
cmd_str="python train.py"
|
cmd_str="python evaluate.py"
|
||||||
log_dir="experiment_logs"
|
log_dir="evaluation_logs"
|
||||||
|
out_root=""
|
||||||
|
skip_existing=0
|
||||||
dry_run=0
|
dry_run=0
|
||||||
extra_args=()
|
extra_args=()
|
||||||
|
|
||||||
@@ -30,8 +48,12 @@ while [[ $# -gt 0 ]]; do
|
|||||||
gpu_list="${2-}"
|
gpu_list="${2-}"
|
||||||
shift 2
|
shift 2
|
||||||
;;
|
;;
|
||||||
--experiments|-f)
|
--runs-file|-f)
|
||||||
experiments_file="${2-}"
|
runs_file="${2-}"
|
||||||
|
shift 2
|
||||||
|
;;
|
||||||
|
--runs-root)
|
||||||
|
runs_root="${2-}"
|
||||||
shift 2
|
shift 2
|
||||||
;;
|
;;
|
||||||
--cmd)
|
--cmd)
|
||||||
@@ -42,6 +64,14 @@ while [[ $# -gt 0 ]]; do
|
|||||||
log_dir="${2-}"
|
log_dir="${2-}"
|
||||||
shift 2
|
shift 2
|
||||||
;;
|
;;
|
||||||
|
--out-root)
|
||||||
|
out_root="${2-}"
|
||||||
|
shift 2
|
||||||
|
;;
|
||||||
|
--skip-existing)
|
||||||
|
skip_existing=1
|
||||||
|
shift
|
||||||
|
;;
|
||||||
--dry-run)
|
--dry-run)
|
||||||
dry_run=1
|
dry_run=1
|
||||||
shift
|
shift
|
||||||
@@ -70,11 +100,6 @@ fi
|
|||||||
|
|
||||||
mkdir -p "$log_dir"
|
mkdir -p "$log_dir"
|
||||||
|
|
||||||
if [[ ! -f "$experiments_file" ]]; then
|
|
||||||
echo "Error: experiments file not found: $experiments_file" >&2
|
|
||||||
exit 2
|
|
||||||
fi
|
|
||||||
|
|
||||||
IFS=',' read -r -a gpus <<< "$gpu_list"
|
IFS=',' read -r -a gpus <<< "$gpu_list"
|
||||||
if [[ ${#gpus[@]} -lt 1 ]]; then
|
if [[ ${#gpus[@]} -lt 1 ]]; then
|
||||||
echo "Error: parsed 0 GPUs from --gpus '$gpu_list'" >&2
|
echo "Error: parsed 0 GPUs from --gpus '$gpu_list'" >&2
|
||||||
@@ -85,7 +110,7 @@ fi
|
|||||||
# shellcheck disable=SC2206
|
# shellcheck disable=SC2206
|
||||||
cmd=($cmd_str)
|
cmd=($cmd_str)
|
||||||
if [[ ${#cmd[@]} -lt 2 ]]; then
|
if [[ ${#cmd[@]} -lt 2 ]]; then
|
||||||
echo "Error: --cmd should look like 'python train.py'" >&2
|
echo "Error: --cmd should look like 'python evaluate.py'" >&2
|
||||||
exit 2
|
exit 2
|
||||||
fi
|
fi
|
||||||
|
|
||||||
@@ -107,28 +132,81 @@ for i in "${!gpus[@]}"; do
|
|||||||
queue_files+=("$qfile")
|
queue_files+=("$qfile")
|
||||||
done
|
done
|
||||||
|
|
||||||
# Distribute experiments round-robin.
|
discover_runs() {
|
||||||
exp_idx=0
|
local root="${1-}"
|
||||||
while IFS= read -r line || [[ -n "$line" ]]; do
|
if [[ -z "$root" ]]; then
|
||||||
line="${line%$'\r'}" # handle CRLF
|
return 0
|
||||||
[[ -z "$line" ]] && continue
|
fi
|
||||||
# Skip header if present
|
if [[ ! -d "$root" ]]; then
|
||||||
if [[ "$line" == model_type,* ]]; then
|
echo "Error: runs root not found: $root" >&2
|
||||||
continue
|
return 2
|
||||||
fi
|
fi
|
||||||
|
|
||||||
slot=$((exp_idx % ${#gpus[@]}))
|
# shellcheck disable=SC2016
|
||||||
# Prefix a stable experiment index for logging.
|
find "$root" -mindepth 1 -maxdepth 1 -type d -print 2>/dev/null |
|
||||||
printf '%s,%s\n' "$exp_idx" "$line" >> "${queue_files[$slot]}"
|
sort
|
||||||
exp_idx=$((exp_idx + 1))
|
}
|
||||||
done < "$experiments_file"
|
|
||||||
|
|
||||||
if [[ $exp_idx -eq 0 ]]; then
|
run_dirs=()
|
||||||
echo "No experiments found in $experiments_file" >&2
|
if [[ -n "$runs_file" ]]; then
|
||||||
|
if [[ ! -f "$runs_file" ]]; then
|
||||||
|
echo "Error: runs file not found: $runs_file" >&2
|
||||||
|
exit 2
|
||||||
|
fi
|
||||||
|
|
||||||
|
while IFS= read -r line || [[ -n "$line" ]]; do
|
||||||
|
line="${line%$'\r'}" # handle CRLF
|
||||||
|
[[ -z "$line" ]] && continue
|
||||||
|
[[ "$line" == \#* ]] && continue
|
||||||
|
run_dirs+=("$line")
|
||||||
|
done < "$runs_file"
|
||||||
|
else
|
||||||
|
while IFS= read -r d || [[ -n "${d-}" ]]; do
|
||||||
|
[[ -z "${d-}" ]] && continue
|
||||||
|
run_dirs+=("$d")
|
||||||
|
done < <(discover_runs "$runs_root")
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ ${#run_dirs[@]} -eq 0 ]]; then
|
||||||
|
if [[ -n "$runs_file" ]]; then
|
||||||
|
echo "No run directories found in $runs_file" >&2
|
||||||
|
else
|
||||||
|
echo "No run directories found under $runs_root" >&2
|
||||||
|
fi
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo "Queued $exp_idx experiments across ${#gpus[@]} GPU(s): ${gpus[*]}"
|
# Check whether a path is an evaluable run directory.
# Arguments:
#   $1 - candidate path (may be empty or omitted)
# Returns:
#   0 when the path is a directory that contains both train_config.json and
#   best_model.pt as regular files; 1 otherwise.
is_valid_run_dir() {
  local d="${1-}"
  [[ -d "$d" ]] || return 1
  [[ -f "$d/train_config.json" ]] || return 1
  [[ -f "$d/best_model.pt" ]] || return 1
  return 0
}
|
||||||
|
|
||||||
|
# Keep only the run directories accepted by is_valid_run_dir; every rejected
# entry is reported on stderr so the user can see why it was not evaluated.
valid_run_dirs=()
for d in "${run_dirs[@]}"; do
  if is_valid_run_dir "$d"; then
    valid_run_dirs+=("$d")
  else
    echo "Skipping invalid run_dir (missing train_config.json or best_model.pt): $d" >&2
  fi
done

if [[ ${#valid_run_dirs[@]} -eq 0 ]]; then
  echo "No valid run directories found." >&2
  exit 1
fi

# Distribute evaluation jobs round-robin.
# Each queue row is "job_idx,run_dir"; job N is appended to the queue file
# with index N modulo the number of GPUs.
job_idx=0
for d in "${valid_run_dirs[@]}"; do
  slot=$((job_idx % ${#gpus[@]}))
  printf '%s,%s\n' "$job_idx" "$d" >> "${queue_files[$slot]}"
  job_idx=$((job_idx + 1))
done

echo "Queued $job_idx evaluation job(s) across ${#gpus[@]} GPU(s): ${gpus[*]}"
|
||||||
|
|
||||||
sanitize() {
|
sanitize() {
|
||||||
# Replace any char outside [A-Za-z0-9._-] with '_'
|
# Replace any char outside [A-Za-z0-9._-] with '_'
|
||||||
@@ -145,44 +223,53 @@ for i in "${!gpus[@]}"; do
|
|||||||
(
|
(
|
||||||
export CUDA_VISIBLE_DEVICES="$gpu"
|
export CUDA_VISIBLE_DEVICES="$gpu"
|
||||||
|
|
||||||
while IFS=',' read -r exp_id model_type loss_type age_encoder full_cov || [[ -n "${exp_id-}" ]]; do
|
while IFS=',' read -r job_id run_dir || [[ -n "${job_id-}" ]]; do
|
||||||
# Skip empty lines
|
[[ -z "${job_id-}" ]] && continue
|
||||||
[[ -z "${exp_id-}" ]] && continue
|
[[ -z "${run_dir-}" ]] && continue
|
||||||
|
|
||||||
# Normalize booleans / strip whitespace
|
ts="$(date +%Y%m%d-%H%M%S)"
|
||||||
full_cov="${full_cov//[[:space:]]/}"
|
safe_run="$(sanitize "$(basename "$run_dir")")"
|
||||||
|
|
||||||
|
# Decide output locations.
|
||||||
|
out_dir_arg=()
|
||||||
|
out_json=""
|
||||||
|
if [[ -n "$out_root" ]]; then
|
||||||
|
job_out_dir="${out_root%/}/run_${job_id}_${safe_run}"
|
||||||
|
mkdir -p "$job_out_dir"
|
||||||
|
out_json="$job_out_dir/evaluation_summary.json"
|
||||||
|
out_dir_arg=(--out_dir "$job_out_dir" --output "$out_json")
|
||||||
|
else
|
||||||
|
out_json="$run_dir/evaluation_summary.json"
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ $skip_existing -eq 1 && -f "$out_json" ]]; then
|
||||||
|
echo "[GPU $gpu] SKIP job $job_id: already exists ($out_json)"
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
|
||||||
run_cmd=("${cmd[@]}" \
|
run_cmd=("${cmd[@]}" \
|
||||||
--model_type "$model_type" \
|
--run_dir "$run_dir" \
|
||||||
--loss_type "$loss_type" \
|
--device cuda)
|
||||||
--age_encoder "$age_encoder")
|
|
||||||
|
|
||||||
if [[ "$full_cov" == "True" || "$full_cov" == "true" || "$full_cov" == "1" ]]; then
|
if [[ ${#out_dir_arg[@]} -gt 0 ]]; then
|
||||||
run_cmd+=(--full_cov)
|
run_cmd+=("${out_dir_arg[@]}")
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [[ ${#extra_args[@]} -gt 0 ]]; then
|
if [[ ${#extra_args[@]} -gt 0 ]]; then
|
||||||
run_cmd+=("${extra_args[@]}")
|
run_cmd+=("${extra_args[@]}")
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo "[GPU $gpu] START exp $exp_id: model_type=$model_type loss_type=$loss_type age_encoder=$age_encoder full_cov=$full_cov"
|
echo "[GPU $gpu] START job $job_id: run_dir=$run_dir"
|
||||||
|
|
||||||
ts="$(date +%Y%m%d-%H%M%S)"
|
|
||||||
safe_model="$(sanitize "$model_type")"
|
|
||||||
safe_loss="$(sanitize "$loss_type")"
|
|
||||||
safe_age="$(sanitize "$age_encoder")"
|
|
||||||
safe_cov="$(sanitize "$full_cov")"
|
|
||||||
log_file="${log_dir}/exp_${exp_id}_gpu${gpu}_${safe_model}_${safe_loss}_${safe_age}_${safe_cov}_${ts}.log"
|
|
||||||
|
|
||||||
|
log_file="${log_dir}/eval_${job_id}_gpu${gpu}_${safe_run}_${ts}.log"
|
||||||
{
|
{
|
||||||
echo "===== EXPERIMENT START ====="
|
echo "===== EVALUATION START ====="
|
||||||
echo "timestamp: $ts"
|
echo "timestamp: $ts"
|
||||||
echo "gpu: $gpu"
|
echo "gpu: $gpu"
|
||||||
echo "exp_id: $exp_id"
|
echo "job_id: $job_id"
|
||||||
echo "model_type: $model_type"
|
echo "run_dir: $run_dir"
|
||||||
echo "loss_type: $loss_type"
|
echo "out_root: ${out_root:-<default in run_dir>}"
|
||||||
echo "age_encoder: $age_encoder"
|
echo "out_json: $out_json"
|
||||||
echo "full_cov: $full_cov"
|
|
||||||
printf 'cmd:'
|
printf 'cmd:'
|
||||||
printf ' %q' "${run_cmd[@]}"
|
printf ' %q' "${run_cmd[@]}"
|
||||||
echo
|
echo
|
||||||
@@ -203,16 +290,16 @@ for i in "${!gpus[@]}"; do
|
|||||||
{
|
{
|
||||||
echo "============================"
|
echo "============================"
|
||||||
echo "exit_code: $rc"
|
echo "exit_code: $rc"
|
||||||
echo "===== EXPERIMENT END ======="
|
echo "===== EVALUATION END ======="
|
||||||
} >> "$log_file"
|
} >> "$log_file"
|
||||||
|
|
||||||
if [[ $rc -ne 0 ]]; then
|
if [[ $rc -ne 0 ]]; then
|
||||||
echo "[GPU $gpu] FAIL exp $exp_id (exit=$rc). Log: $log_file" >&2
|
echo "[GPU $gpu] FAIL job $job_id (exit=$rc). Log: $log_file" >&2
|
||||||
exit "$rc"
|
exit "$rc"
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo "[GPU $gpu] DONE exp $exp_id (log: $log_file)"
|
echo "[GPU $gpu] DONE job $job_id (log: $log_file)"
|
||||||
done < "$qfile"
|
done < "$qfile"
|
||||||
) &
|
) &
|
||||||
|
|
||||||
@@ -232,4 +319,4 @@ if [[ $fail -ne 0 ]]; then
|
|||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo "All experiments complete."
|
echo "All evaluations complete."
|
||||||
|
|||||||
Reference in New Issue
Block a user