diff --git a/run_experiments_multi_gpu.sh b/run_experiments_multi_gpu.sh
index db0c175..adcc403 100644
--- a/run_experiments_multi_gpu.sh
+++ b/run_experiments_multi_gpu.sh
@@ -4,7 +4,7 @@ set -euo pipefail
 usage() {
   cat <<'USAGE'
 Usage:
-  ./run_experiments_multi_gpu.sh --gpus 0,1,2 [--experiments experiments.txt] [--cmd "python train.py"] [--dry-run] [-- ]
+  ./run_experiments_multi_gpu.sh --gpus 0,1,2 [--experiments experiments.txt] [--cmd "python train.py"] [--log-dir experiment_logs] [--dry-run] [-- ]

 Description:
   Distributes rows from experiments.txt across multiple GPUs (round-robin) and runs
@@ -13,13 +13,14 @@ Description:
 Examples:
   ./run_experiments_multi_gpu.sh --gpus 0,1,2
   ./run_experiments_multi_gpu.sh --gpus 0,1 --experiments experiments.txt -- --batch_size 64 --max_epochs 50
-  ./run_experiments_multi_gpu.sh --gpus 3 --cmd "python train.py" -- --loss_type weibull
+  ./run_experiments_multi_gpu.sh --gpus 3 --cmd "python train.py" -- --loss_type discrete_time_cif
 USAGE
 }

 experiments_file="experiments.txt"
 gpu_list=""
 cmd_str="python train.py"
+log_dir="experiment_logs"
 dry_run=0
 extra_args=()

@@ -37,6 +38,16 @@ while [[ $# -gt 0 ]]; do
       cmd_str="${2-}"
       shift 2
       ;;
+    --log-dir)
+      # A missing or empty value would otherwise surface as a silent
+      # 'shift 2' failure under 'set -e' or an obscure mkdir error.
+      if [[ -z "${2-}" ]]; then
+        echo "Error: --log-dir requires a non-empty directory path" >&2
+        exit 2
+      fi
+      log_dir="$2"
+      shift 2
+      ;;
     --dry-run)
       dry_run=1
       shift
@@ -63,6 +74,11 @@ if [[ -z "$gpu_list" ]]; then
   exit 2
 fi

+# Create the log directory only for real runs so --dry-run touches nothing on disk.
+if [[ $dry_run -eq 0 ]]; then
+  mkdir -p -- "$log_dir"
+fi
+
 if [[ ! -f "$experiments_file" ]]; then
   echo "Error: experiments file not found: $experiments_file" >&2
   exit 2
@@ -123,6 +139,16 @@ fi

 echo "Queued $exp_idx experiments across ${#gpus[@]} GPU(s): ${gpus[*]}"

+# Map a free-form field value to a filename-safe token.
+# Arguments: $1 - raw value (may be empty or unset)
+# Outputs:   the value with every character outside [A-Za-z0-9._-]
+#            replaced by '_', written to stdout without a trailing newline
+sanitize() {
+  local s="${1-}"
+  s="${s//[^A-Za-z0-9._-]/_}"
+  printf '%s' "$s"
+}
+
 pids=()
 for i in "${!gpus[@]}"; do
   gpu="${gpus[$i]}"
@@ -153,14 +179,54 @@ for i in "${!gpus[@]}"; do

       echo "[GPU $gpu] START exp $exp_id: model_type=$model_type loss_type=$loss_type age_encoder=$age_encoder full_cov=$full_cov"

+      # Per-experiment log path; fields are sanitized so the name stays filesystem-safe.
+      ts="$(date +%Y%m%d-%H%M%S)"
+      safe_model="$(sanitize "$model_type")"
+      safe_loss="$(sanitize "$loss_type")"
+      safe_age="$(sanitize "$age_encoder")"
+      safe_cov="$(sanitize "$full_cov")"
+      log_file="${log_dir}/exp_${exp_id}_gpu${gpu}_${safe_model}_${safe_loss}_${safe_age}_${safe_cov}_${ts}.log"
+
       if [[ $dry_run -eq 1 ]]; then
         printf '[GPU %s] CMD: ' "$gpu"
         printf '%q ' "${run_cmd[@]}"
         echo
+        # Dry run: report the would-be log path without writing anything.
+        echo "[GPU $gpu] LOG: $log_file"
       else
-        "${run_cmd[@]}"
+        # Header: record the experiment's parameters and exact command line.
+        {
+          echo "===== EXPERIMENT START ====="
+          echo "timestamp: $ts"
+          echo "gpu: $gpu"
+          echo "exp_id: $exp_id"
+          echo "model_type: $model_type"
+          echo "loss_type: $loss_type"
+          echo "age_encoder: $age_encoder"
+          echo "full_cov: $full_cov"
+          printf 'cmd:'
+          printf ' %q' "${run_cmd[@]}"
+          echo
+          echo "============================"
+        } > "$log_file"
+
+        # Capture the exit status without tripping 'set -e' so the footer
+        # below is always written before deciding whether to stop.
+        rc=0
+        "${run_cmd[@]}" >> "$log_file" 2>&1 || rc=$?
+
+        {
+          echo "============================"
+          echo "exit_code: $rc"
+          echo "===== EXPERIMENT END ======="
+        } >> "$log_file"
+
+        if [[ $rc -ne 0 ]]; then
+          echo "[GPU $gpu] FAIL exp $exp_id (exit=$rc). Log: $log_file" >&2
+          exit "$rc"
+        fi
       fi

-      echo "[GPU $gpu] DONE exp $exp_id"
+      echo "[GPU $gpu] DONE exp $exp_id (log: $log_file)"
     done < "$qfile"
   ) &