Enhance multi-GPU experiment runner: add log directory option, improve logging with sanitized filenames, and capture command output in log files.
This commit is contained in:
@@ -4,7 +4,7 @@ set -euo pipefail
|
|||||||
usage() {
|
usage() {
|
||||||
cat <<'USAGE'
|
cat <<'USAGE'
|
||||||
Usage:
|
Usage:
|
||||||
./run_experiments_multi_gpu.sh --gpus 0,1,2 [--experiments experiments.txt] [--cmd "python train.py"] [--dry-run] [-- <extra train.py args>]
|
./run_experiments_multi_gpu.sh --gpus 0,1,2 [--experiments experiments.txt] [--cmd "python train.py"] [--log-dir experiment_logs] [--dry-run] [-- <extra train.py args>]
|
||||||
|
|
||||||
Description:
|
Description:
|
||||||
Distributes rows from experiments.txt across multiple GPUs (round-robin) and runs
|
Distributes rows from experiments.txt across multiple GPUs (round-robin) and runs
|
||||||
@@ -13,13 +13,14 @@ Description:
|
|||||||
Examples:
|
Examples:
|
||||||
./run_experiments_multi_gpu.sh --gpus 0,1,2
|
./run_experiments_multi_gpu.sh --gpus 0,1,2
|
||||||
./run_experiments_multi_gpu.sh --gpus 0,1 --experiments experiments.txt -- --batch_size 64 --max_epochs 50
|
./run_experiments_multi_gpu.sh --gpus 0,1 --experiments experiments.txt -- --batch_size 64 --max_epochs 50
|
||||||
./run_experiments_multi_gpu.sh --gpus 3 --cmd "python train.py" -- --loss_type weibull
|
./run_experiments_multi_gpu.sh --gpus 3 --cmd "python train.py" -- --loss_type discrete_time_cif
|
||||||
USAGE
|
USAGE
|
||||||
}
|
}
|
||||||
|
|
||||||
experiments_file="experiments.txt"
|
experiments_file="experiments.txt"
|
||||||
gpu_list=""
|
gpu_list=""
|
||||||
cmd_str="python train.py"
|
cmd_str="python train.py"
|
||||||
|
log_dir="experiment_logs"
|
||||||
dry_run=0
|
dry_run=0
|
||||||
extra_args=()
|
extra_args=()
|
||||||
|
|
||||||
@@ -37,6 +38,10 @@ while [[ $# -gt 0 ]]; do
|
|||||||
cmd_str="${2-}"
|
cmd_str="${2-}"
|
||||||
shift 2
|
shift 2
|
||||||
;;
|
;;
|
||||||
|
--log-dir)
|
||||||
|
log_dir="${2-}"
|
||||||
|
shift 2
|
||||||
|
;;
|
||||||
--dry-run)
|
--dry-run)
|
||||||
dry_run=1
|
dry_run=1
|
||||||
shift
|
shift
|
||||||
@@ -63,6 +68,8 @@ if [[ -z "$gpu_list" ]]; then
|
|||||||
exit 2
|
exit 2
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
mkdir -p "$log_dir"
|
||||||
|
|
||||||
if [[ ! -f "$experiments_file" ]]; then
|
if [[ ! -f "$experiments_file" ]]; then
|
||||||
echo "Error: experiments file not found: $experiments_file" >&2
|
echo "Error: experiments file not found: $experiments_file" >&2
|
||||||
exit 2
|
exit 2
|
||||||
@@ -123,6 +130,13 @@ fi
|
|||||||
|
|
||||||
echo "Queued $exp_idx experiments across ${#gpus[@]} GPU(s): ${gpus[*]}"
|
echo "Queued $exp_idx experiments across ${#gpus[@]} GPU(s): ${gpus[*]}"
|
||||||
|
|
||||||
|
#######################################
# Make a string safe to embed in a log filename.
# Every character outside the ASCII set [A-Za-z0-9._-] is replaced by '_'.
# Arguments: $1 - string to sanitize (optional; treated as empty if unset)
# Outputs:   the sanitized string on stdout, without a trailing newline
#######################################
sanitize() {
  local s="${1-}"
  # The allowed characters are listed explicitly instead of as A-Z / a-z / 0-9
  # ranges: range expressions in bash patterns follow the current locale's
  # collation unless globasciiranges is enabled, so a range could let
  # non-ASCII letters through. '-' is kept last so it is taken literally.
  local allowed='ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789._-'
  s="${s//[^$allowed]/_}"
  printf '%s' "$s"
}
|
||||||
|
|
||||||
pids=()
|
pids=()
|
||||||
for i in "${!gpus[@]}"; do
|
for i in "${!gpus[@]}"; do
|
||||||
gpu="${gpus[$i]}"
|
gpu="${gpus[$i]}"
|
||||||
@@ -153,15 +167,52 @@ for i in "${!gpus[@]}"; do
|
|||||||
|
|
||||||
echo "[GPU $gpu] START exp $exp_id: model_type=$model_type loss_type=$loss_type age_encoder=$age_encoder full_cov=$full_cov"
|
echo "[GPU $gpu] START exp $exp_id: model_type=$model_type loss_type=$loss_type age_encoder=$age_encoder full_cov=$full_cov"
|
||||||
|
|
||||||
|
ts="$(date +%Y%m%d-%H%M%S)"
|
||||||
|
safe_model="$(sanitize "$model_type")"
|
||||||
|
safe_loss="$(sanitize "$loss_type")"
|
||||||
|
safe_age="$(sanitize "$age_encoder")"
|
||||||
|
safe_cov="$(sanitize "$full_cov")"
|
||||||
|
log_file="${log_dir}/exp_${exp_id}_gpu${gpu}_${safe_model}_${safe_loss}_${safe_age}_${safe_cov}_${ts}.log"
|
||||||
|
|
||||||
|
{
|
||||||
|
echo "===== EXPERIMENT START ====="
|
||||||
|
echo "timestamp: $ts"
|
||||||
|
echo "gpu: $gpu"
|
||||||
|
echo "exp_id: $exp_id"
|
||||||
|
echo "model_type: $model_type"
|
||||||
|
echo "loss_type: $loss_type"
|
||||||
|
echo "age_encoder: $age_encoder"
|
||||||
|
echo "full_cov: $full_cov"
|
||||||
|
printf 'cmd:'
|
||||||
|
printf ' %q' "${run_cmd[@]}"
|
||||||
|
echo
|
||||||
|
echo "============================"
|
||||||
|
} > "$log_file"
|
||||||
|
|
||||||
if [[ $dry_run -eq 1 ]]; then
|
if [[ $dry_run -eq 1 ]]; then
|
||||||
printf '[GPU %s] CMD: ' "$gpu"
|
printf '[GPU %s] CMD: ' "$gpu"
|
||||||
printf '%q ' "${run_cmd[@]}"
|
printf '%q ' "${run_cmd[@]}"
|
||||||
echo
|
echo
|
||||||
|
echo "[GPU $gpu] LOG: $log_file"
|
||||||
else
|
else
|
||||||
"${run_cmd[@]}"
|
set +e
|
||||||
|
"${run_cmd[@]}" >> "$log_file" 2>&1
|
||||||
|
rc=$?
|
||||||
|
set -e
|
||||||
|
|
||||||
|
{
|
||||||
|
echo "============================"
|
||||||
|
echo "exit_code: $rc"
|
||||||
|
echo "===== EXPERIMENT END ======="
|
||||||
|
} >> "$log_file"
|
||||||
|
|
||||||
|
if [[ $rc -ne 0 ]]; then
|
||||||
|
echo "[GPU $gpu] FAIL exp $exp_id (exit=$rc). Log: $log_file" >&2
|
||||||
|
exit "$rc"
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo "[GPU $gpu] DONE exp $exp_id"
|
echo "[GPU $gpu] DONE exp $exp_id (log: $log_file)"
|
||||||
done < "$qfile"
|
done < "$qfile"
|
||||||
) &
|
) &
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user