Refactor evaluation scripts for multi-GPU execution

- Removed `run_evaluations_multi_gpu.sh` script as it was redundant.
- Updated `run_experiments_multi_gpu.sh` to handle evaluation jobs instead of training.
- Changed command-line options to support evaluation-specific parameters.
- Implemented run directory discovery and validation for evaluation jobs.
- Enhanced logging to capture evaluation details and outputs.
- Added options for centralized output management and skipping existing results.
This commit is contained in:
2026-01-18 17:38:20 +08:00
parent b80d9a4256
commit 0057bc0dd9
3 changed files with 1391 additions and 413 deletions

View File

@@ -4,23 +4,41 @@ set -euo pipefail
# usage
# Writes the command-line help text to stdout using cat and a here-document.
# The here-document delimiter is quoted ('USAGE'), so the text is emitted
# literally with no variable or command expansion.
# NOTE(review): the help text below contains both a train.py-style invocation
# (--experiments, experiments.txt) and an evaluate.py-style invocation
# (--runs-file, --runs-root). This looks like removed and added diff lines
# shown together -- confirm which lines belong to the current script.
usage() {
cat <<'USAGE'
Usage:
./run_experiments_multi_gpu.sh --gpus 0,1,2 [--experiments experiments.txt] [--cmd "python train.py"] [--log-dir experiment_logs] [--dry-run] [-- <extra train.py args>]
./run_experiments_multi_gpu.sh --gpus 0,1,2 [--runs-file runs_to_eval.txt | --runs-root runs] [--cmd "python evaluate.py"] [--log-dir evaluation_logs] [--out-root eval_outputs] [--skip-existing] [--dry-run] [-- <extra evaluate.py args>]
Description:
Distributes rows from experiments.txt across multiple GPUs (round-robin) and runs
Distributes evaluation jobs across multiple GPUs (round-robin) and runs
at most one job per GPU at a time.
A job is a run directory containing:
- train_config.json
- best_model.pt
By default, run directories are auto-discovered under --runs-root (default: runs).
Alternatively, provide --runs-file with one run_dir per line.
Examples:
# Auto-discover run dirs under ./runs
./run_experiments_multi_gpu.sh --gpus 0,1,2
./run_experiments_multi_gpu.sh --gpus 0,1 --experiments experiments.txt -- --batch_size 64 --max_epochs 50
./run_experiments_multi_gpu.sh --gpus 3 --cmd "python train.py" -- --loss_type discrete_time_cif
# Use an explicit list of run directories
./run_experiments_multi_gpu.sh --gpus 0,1 --runs-file runs_to_eval.txt
# Centralize outputs (CSV bundle + summary JSON) under eval_outputs/
./run_experiments_multi_gpu.sh --gpus 0,1 --out-root eval_outputs
# Forward args to evaluate.py
./run_experiments_multi_gpu.sh --gpus 0,1 -- --batch_size 512 --num_workers 8
USAGE
}
experiments_file="experiments.txt"
runs_file=""
runs_root="runs"
gpu_list=""
cmd_str="python train.py"
log_dir="experiment_logs"
cmd_str="python evaluate.py"
log_dir="evaluation_logs"
out_root=""
skip_existing=0
dry_run=0
extra_args=()
@@ -30,8 +48,12 @@ while [[ $# -gt 0 ]]; do
gpu_list="${2-}"
shift 2
;;
--experiments|-f)
experiments_file="${2-}"
--runs-file|-f)
runs_file="${2-}"
shift 2
;;
--runs-root)
runs_root="${2-}"
shift 2
;;
--cmd)
@@ -42,6 +64,14 @@ while [[ $# -gt 0 ]]; do
log_dir="${2-}"
shift 2
;;
--out-root)
out_root="${2-}"
shift 2
;;
--skip-existing)
skip_existing=1
shift
;;
--dry-run)
dry_run=1
shift
@@ -70,11 +100,6 @@ fi
mkdir -p "$log_dir"
if [[ ! -f "$experiments_file" ]]; then
echo "Error: experiments file not found: $experiments_file" >&2
exit 2
fi
IFS=',' read -r -a gpus <<< "$gpu_list"
if [[ ${#gpus[@]} -lt 1 ]]; then
echo "Error: parsed 0 GPUs from --gpus '$gpu_list'" >&2
@@ -85,7 +110,7 @@ fi
# shellcheck disable=SC2206
cmd=($cmd_str)
if [[ ${#cmd[@]} -lt 2 ]]; then
echo "Error: --cmd should look like 'python train.py'" >&2
echo "Error: --cmd should look like 'python evaluate.py'" >&2
exit 2
fi
@@ -107,28 +132,81 @@ for i in "${!gpus[@]}"; do
queue_files+=("$qfile")
done
# Distribute experiments round-robin.
exp_idx=0
while IFS= read -r line || [[ -n "$line" ]]; do
line="${line%$'\r'}" # handle CRLF
[[ -z "$line" ]] && continue
# Skip header if present
if [[ "$line" == model_type,* ]]; then
continue
# discover_runs ROOT
# Prints the directories found directly inside ROOT, one path per line, passed
# through sort.
# Arguments: $1 - directory to scan (may be empty or omitted)
# Outputs:   directory paths on stdout; an error message on stderr if ROOT is
#            not a directory
# Returns:   0 when ROOT is empty, 2 when ROOT is not a directory
# NOTE(review): the caller reads this function through process substitution
# (`done < <(discover_runs "$runs_root")`), so the `return 2` status is not
# seen by the main script; only the stderr message is -- confirm that is intended.
discover_runs() {
local root="${1-}"
# An empty ROOT produces no output and counts as success.
if [[ -z "$root" ]]; then
return 0
fi
# A ROOT that is not an existing directory is reported and signalled with status 2.
if [[ ! -d "$root" ]]; then
echo "Error: runs root not found: $root" >&2
return 2
fi
# NOTE(review): the next lines (slot=..., printf, exp_idx=..., done < ...) refer
# to exp_idx, line and experiments_file and end with an unmatched `done`; they
# appear to be leftovers of the removed experiments loop shown by the diff --
# confirm they are not part of this function.
slot=$((exp_idx % ${#gpus[@]}))
# Prefix a stable experiment index for logging.
printf '%s,%s\n' "$exp_idx" "$line" >> "${queue_files[$slot]}"
exp_idx=$((exp_idx + 1))
done < "$experiments_file"
# Only direct children of ROOT are listed (-mindepth 1 -maxdepth 1 -type d);
# anything find writes to stderr is discarded.
# shellcheck disable=SC2016
find "$root" -mindepth 1 -maxdepth 1 -type d -print 2>/dev/null |
sort
}
if [[ $exp_idx -eq 0 ]]; then
echo "No experiments found in $experiments_file" >&2
run_dirs=()
if [[ -n "$runs_file" ]]; then
if [[ ! -f "$runs_file" ]]; then
echo "Error: runs file not found: $runs_file" >&2
exit 2
fi
while IFS= read -r line || [[ -n "$line" ]]; do
line="${line%$'\r'}" # handle CRLF
[[ -z "$line" ]] && continue
[[ "$line" == \#* ]] && continue
run_dirs+=("$line")
done < "$runs_file"
else
while IFS= read -r d || [[ -n "${d-}" ]]; do
[[ -z "${d-}" ]] && continue
run_dirs+=("$d")
done < <(discover_runs "$runs_root")
fi
if [[ ${#run_dirs[@]} -eq 0 ]]; then
if [[ -n "$runs_file" ]]; then
echo "No run directories found in $runs_file" >&2
else
echo "No run directories found under $runs_root" >&2
fi
exit 1
fi
echo "Queued $exp_idx experiments across ${#gpus[@]} GPU(s): ${gpus[*]}"
# is_valid_run_dir DIR
# Succeeds only when DIR is a directory that holds both regular files an
# evaluation job needs: train_config.json and best_model.pt.
# Arguments: $1 - candidate run directory (may be empty or omitted)
# Returns:   0 when DIR qualifies, 1 otherwise
is_valid_run_dir() {
  local run_path="${1-}"
  local required_name

  if [[ ! -d "$run_path" ]]; then
    return 1
  fi

  # Every required artifact must exist as a regular file inside the directory.
  for required_name in train_config.json best_model.pt; do
    if [[ ! -f "$run_path/$required_name" ]]; then
      return 1
    fi
  done

  return 0
}
# Keep only the run directories accepted by is_valid_run_dir; every rejected
# entry is reported on stderr and left out.
valid_run_dirs=()
for d in "${run_dirs[@]}"; do
  if ! is_valid_run_dir "$d"; then
    echo "Skipping invalid run_dir (missing train_config.json or best_model.pt): $d" >&2
    continue
  fi
  valid_run_dirs+=("$d")
done

# Nothing left to evaluate: stop with a non-zero status.
if (( ${#valid_run_dirs[@]} == 0 )); then
  echo "No valid run directories found." >&2
  exit 1
fi

# Distribute evaluation jobs round-robin.
# Each queue line is "<job index>,<run dir>"; job N goes to queue N modulo the GPU count.
job_idx=0
for d in "${valid_run_dirs[@]}"; do
  slot=$(( job_idx % ${#gpus[@]} ))
  printf '%s,%s\n' "$job_idx" "$d" >> "${queue_files[$slot]}"
  job_idx=$(( job_idx + 1 ))
done

echo "Queued $job_idx evaluation job(s) across ${#gpus[@]} GPU(s): ${gpus[*]}"
sanitize() {
# Replace any char outside [A-Za-z0-9._-] with '_'
@@ -145,44 +223,53 @@ for i in "${!gpus[@]}"; do
(
export CUDA_VISIBLE_DEVICES="$gpu"
while IFS=',' read -r exp_id model_type loss_type age_encoder full_cov || [[ -n "${exp_id-}" ]]; do
# Skip empty lines
[[ -z "${exp_id-}" ]] && continue
while IFS=',' read -r job_id run_dir || [[ -n "${job_id-}" ]]; do
[[ -z "${job_id-}" ]] && continue
[[ -z "${run_dir-}" ]] && continue
# Normalize booleans / strip whitespace
full_cov="${full_cov//[[:space:]]/}"
ts="$(date +%Y%m%d-%H%M%S)"
safe_run="$(sanitize "$(basename "$run_dir")")"
# Decide output locations.
out_dir_arg=()
out_json=""
if [[ -n "$out_root" ]]; then
job_out_dir="${out_root%/}/run_${job_id}_${safe_run}"
mkdir -p "$job_out_dir"
out_json="$job_out_dir/evaluation_summary.json"
out_dir_arg=(--out_dir "$job_out_dir" --output "$out_json")
else
out_json="$run_dir/evaluation_summary.json"
fi
if [[ $skip_existing -eq 1 && -f "$out_json" ]]; then
echo "[GPU $gpu] SKIP job $job_id: already exists ($out_json)"
continue
fi
run_cmd=("${cmd[@]}" \
--model_type "$model_type" \
--loss_type "$loss_type" \
--age_encoder "$age_encoder")
--run_dir "$run_dir" \
--device cuda)
if [[ "$full_cov" == "True" || "$full_cov" == "true" || "$full_cov" == "1" ]]; then
run_cmd+=(--full_cov)
if [[ ${#out_dir_arg[@]} -gt 0 ]]; then
run_cmd+=("${out_dir_arg[@]}")
fi
if [[ ${#extra_args[@]} -gt 0 ]]; then
run_cmd+=("${extra_args[@]}")
fi
echo "[GPU $gpu] START exp $exp_id: model_type=$model_type loss_type=$loss_type age_encoder=$age_encoder full_cov=$full_cov"
ts="$(date +%Y%m%d-%H%M%S)"
safe_model="$(sanitize "$model_type")"
safe_loss="$(sanitize "$loss_type")"
safe_age="$(sanitize "$age_encoder")"
safe_cov="$(sanitize "$full_cov")"
log_file="${log_dir}/exp_${exp_id}_gpu${gpu}_${safe_model}_${safe_loss}_${safe_age}_${safe_cov}_${ts}.log"
echo "[GPU $gpu] START job $job_id: run_dir=$run_dir"
log_file="${log_dir}/eval_${job_id}_gpu${gpu}_${safe_run}_${ts}.log"
{
echo "===== EXPERIMENT START ====="
echo "===== EVALUATION START ====="
echo "timestamp: $ts"
echo "gpu: $gpu"
echo "exp_id: $exp_id"
echo "model_type: $model_type"
echo "loss_type: $loss_type"
echo "age_encoder: $age_encoder"
echo "full_cov: $full_cov"
echo "job_id: $job_id"
echo "run_dir: $run_dir"
echo "out_root: ${out_root:-<default in run_dir>}"
echo "out_json: $out_json"
printf 'cmd:'
printf ' %q' "${run_cmd[@]}"
echo
@@ -203,16 +290,16 @@ for i in "${!gpus[@]}"; do
{
echo "============================"
echo "exit_code: $rc"
echo "===== EXPERIMENT END ======="
echo "===== EVALUATION END ======="
} >> "$log_file"
if [[ $rc -ne 0 ]]; then
echo "[GPU $gpu] FAIL exp $exp_id (exit=$rc). Log: $log_file" >&2
echo "[GPU $gpu] FAIL job $job_id (exit=$rc). Log: $log_file" >&2
exit "$rc"
fi
fi
echo "[GPU $gpu] DONE exp $exp_id (log: $log_file)"
echo "[GPU $gpu] DONE job $job_id (log: $log_file)"
done < "$qfile"
) &
@@ -232,4 +319,4 @@ if [[ $fail -ne 0 ]]; then
exit 1
fi
echo "All experiments complete."
echo "All evaluations complete."