Files
DeepHealth/run_experiments_multi_gpu.sh

236 lines
5.4 KiB
Bash

#!/usr/bin/env bash
set -euo pipefail
# Print the command-line help (synopsis, description, examples) to stdout.
# Takes no arguments; callers decide the exit status.
usage() {
  # One array element per output line; printf re-applies the format to each.
  local -a help_lines=(
    'Usage:'
    './run_experiments_multi_gpu.sh --gpus 0,1,2 [--experiments experiments.txt] [--cmd "python train.py"] [--log-dir experiment_logs] [--dry-run] [-- <extra train.py args>]'
    'Description:'
    'Distributes rows from experiments.txt across multiple GPUs (round-robin) and runs'
    'at most one job per GPU at a time.'
    'Examples:'
    './run_experiments_multi_gpu.sh --gpus 0,1,2'
    './run_experiments_multi_gpu.sh --gpus 0,1 --experiments experiments.txt -- --batch_size 64 --max_epochs 50'
    './run_experiments_multi_gpu.sh --gpus 3 --cmd "python train.py" -- --loss_type discrete_time_cif'
  )
  printf '%s\n' "${help_lines[@]}"
}
# Option defaults; parse_args overwrites them from the command line.
experiments_file="experiments.txt" # rows to run (--experiments / -f)
gpu_list=""                        # comma-separated GPU ids (--gpus, required)
cmd_str="python train.py"          # base command, split on whitespace later (--cmd)
log_dir="experiment_logs"          # directory for per-experiment logs (--log-dir)
dry_run=0                          # 1 => print commands instead of running them
extra_args=()                      # everything after "--", appended to each run

#######################################
# Parse command-line options into the globals above.
# Globals:   experiments_file, gpu_list, cmd_str, log_dir, dry_run,
#            extra_args (all written)
# Arguments: the script's argument vector ("$@")
# Outputs:   error text on stderr; usage on stdout for --help or an
#            unknown argument
# Returns:   0 on success. Exits the shell with 0 for --help and with 2 for
#            an unknown argument or a value-taking option without a value.
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --gpus | --experiments | -f | --cmd | --log-dir)
        # Without this check, "shift 2" fails when the option is the last
        # argument and "set -e" ends the script with no message at all.
        if [[ $# -lt 2 ]]; then
          echo "Error: $1 requires a value." >&2
          exit 2
        fi
        case "$1" in
          --gpus) gpu_list="$2" ;;
          --experiments | -f) experiments_file="$2" ;;
          --cmd) cmd_str="$2" ;;
          --log-dir) log_dir="$2" ;;
        esac
        shift 2
        ;;
      --dry-run)
        dry_run=1
        shift
        ;;
      --help | -h)
        usage
        exit 0
        ;;
      --)
        # Everything after "--" is passed through to the training command.
        shift
        extra_args=("$@")
        break
        ;;
      *)
        echo "Unknown argument: $1" >&2
        usage
        exit 2
        ;;
    esac
  done
}

parse_args "$@"
# --- Validate options, then derive the GPU array and the command argv -------
if [[ -z "$gpu_list" ]]; then
  echo "Error: --gpus is required (e.g. --gpus 0,1,2)." >&2
  exit 2
fi

mkdir -p "$log_dir"

if [[ ! -f "$experiments_file" ]]; then
  echo "Error: experiments file not found: $experiments_file" >&2
  exit 2
fi

# Split the comma-separated list; the IFS override applies to this read only.
IFS=',' read -r -a gpus <<< "$gpu_list"
if [[ ${#gpus[@]} -lt 1 ]]; then
  echo "Error: parsed 0 GPUs from --gpus '$gpu_list'" >&2
  exit 2
fi

# Normalise each id and reject empty ones. An input such as "0,,1" or ","
# would otherwise produce a worker with CUDA_VISIBLE_DEVICES="" (which is
# expected to hide every GPU -- confirm against the CUDA docs), and "0, 1"
# would leak a space into the id and into the log file names.
for i in "${!gpus[@]}"; do
  gpus[$i]="${gpus[$i]//[[:space:]]/}"
  if [[ -z "${gpus[$i]}" ]]; then
    echo "Error: empty GPU id in --gpus '$gpu_list'" >&2
    exit 2
  fi
done

# Parse cmd string into an argv array.
# Word splitting is intended here; pathname expansion is not, so globbing is
# switched off around the split (a --cmd containing * or ? stays literal).
set -f
# shellcheck disable=SC2206
cmd=($cmd_str)
set +f
if [[ ${#cmd[@]} -lt 2 ]]; then
  echo "Error: --cmd should look like 'python train.py'" >&2
  exit 2
fi
# Scratch directory that holds the per-GPU queue files.
# It is initialised empty so the EXIT trap is safe even if mktemp never ran.
_tmpdir=""

# EXIT handler: remove the scratch directory if one was created.
cleanup() {
  # Nothing to do when no directory was recorded or it is already gone.
  [[ -n "${_tmpdir}" && -d "${_tmpdir}" ]] || return 0
  rm -rf "${_tmpdir}"
}

# Install the trap before creating the directory so it is removed on any exit.
trap cleanup EXIT
_tmpdir="$(mktemp -d)"
# Prepare per-GPU queue files: one empty CSV per GPU slot inside the scratch
# directory, indexed in the same order as the gpus array.
queue_files=()
for i in "${!gpus[@]}"; do
  qfile="${_tmpdir}/queue_${i}.csv"
  : > "$qfile"
  queue_files+=("$qfile")
done

# Distribute experiments round-robin.
# exp_idx counts accepted rows only, so skipped lines do not disturb either
# the round-robin order or the experiment numbering.
exp_idx=0
while IFS= read -r line || [[ -n "$line" ]]; do
  line="${line%$'\r'}" # handle CRLF

  # Skip blank rows, including rows made only of spaces/tabs. Such a row
  # would otherwise be queued as an experiment whose fields are all blank,
  # and the failing run would make its worker abandon the rest of its queue.
  if [[ -z "${line//[[:space:]]/}" ]]; then
    continue
  fi

  # Skip header if present
  if [[ "$line" == model_type,* ]]; then
    continue
  fi

  slot=$((exp_idx % ${#gpus[@]}))
  # Prefix a stable experiment index for logging.
  printf '%s,%s\n' "$exp_idx" "$line" >> "${queue_files[$slot]}"
  exp_idx=$((exp_idx + 1))
done < "$experiments_file"

if [[ $exp_idx -eq 0 ]]; then
  echo "No experiments found in $experiments_file" >&2
  exit 1
fi
echo "Queued $exp_idx experiments across ${#gpus[@]} GPU(s): ${gpus[*]}"
# Make a string safe to embed in a file name.
# Arguments: $1 - text to clean (optional; treated as empty when missing)
# Outputs:   the text with every character outside [A-Za-z0-9._-] turned
#            into '_', written to stdout without a trailing newline
sanitize() {
  local raw="${1-}"
  printf '%s' "${raw//[^A-Za-z0-9._-]/_}"
}
#######################################
# Run every experiment queued for one GPU, one after another.
# Meant to be called inside a subshell: it exports CUDA_VISIBLE_DEVICES and
# calls "exit" when an experiment fails.
# Globals:   cmd, extra_args, log_dir, dry_run (read)
# Arguments: $1 - GPU id, exported as CUDA_VISIBLE_DEVICES
#            $2 - queue file; rows are
#                 exp_id,model_type,loss_type,age_encoder,full_cov
# Outputs:   START/DONE progress on stdout, FAIL on stderr, and one log file
#            per experiment in log_dir (also written in dry-run mode)
# Returns:   0 when the whole queue succeeded; otherwise exits the calling
#            shell with the status of the first failing experiment, leaving
#            the remaining rows of this queue unrun.
#######################################
run_gpu_worker() {
  local gpu="$1" qfile="$2"
  local exp_id model_type loss_type age_encoder full_cov
  local ts safe_model safe_loss safe_age safe_cov log_file rc
  local -a run_cmd

  export CUDA_VISIBLE_DEVICES="$gpu"

  # "|| [[ -n ... ]]" keeps a final row that lacks a trailing newline.
  while IFS=',' read -r exp_id model_type loss_type age_encoder full_cov || [[ -n "${exp_id-}" ]]; do
    # Skip empty lines
    if [[ -z "${exp_id-}" ]]; then
      continue
    fi

    # Normalize booleans / strip whitespace
    full_cov="${full_cov//[[:space:]]/}"

    run_cmd=("${cmd[@]}"
      --model_type "$model_type"
      --loss_type "$loss_type"
      --age_encoder "$age_encoder")
    if [[ "$full_cov" == "True" || "$full_cov" == "true" || "$full_cov" == "1" ]]; then
      run_cmd+=(--full_cov)
    fi
    if [[ ${#extra_args[@]} -gt 0 ]]; then
      run_cmd+=("${extra_args[@]}")
    fi

    echo "[GPU $gpu] START exp $exp_id: model_type=$model_type loss_type=$loss_type age_encoder=$age_encoder full_cov=$full_cov"

    ts="$(date +%Y%m%d-%H%M%S)"
    safe_model="$(sanitize "$model_type")"
    safe_loss="$(sanitize "$loss_type")"
    safe_age="$(sanitize "$age_encoder")"
    safe_cov="$(sanitize "$full_cov")"
    log_file="${log_dir}/exp_${exp_id}_gpu${gpu}_${safe_model}_${safe_loss}_${safe_age}_${safe_cov}_${ts}.log"

    # Log header; ">" starts a fresh file for this experiment.
    {
      echo "===== EXPERIMENT START ====="
      echo "timestamp: $ts"
      echo "gpu: $gpu"
      echo "exp_id: $exp_id"
      echo "model_type: $model_type"
      echo "loss_type: $loss_type"
      echo "age_encoder: $age_encoder"
      echo "full_cov: $full_cov"
      printf 'cmd:'
      printf ' %q' "${run_cmd[@]}"
      echo
      echo "============================"
    } > "$log_file"

    if [[ $dry_run -eq 1 ]]; then
      printf '[GPU %s] CMD: ' "$gpu"
      printf '%q ' "${run_cmd[@]}"
      echo
      echo "[GPU $gpu] LOG: $log_file"
    else
      # stdin comes from /dev/null: inside this loop the inherited stdin is
      # the queue file, so a command that reads stdin would otherwise swallow
      # the remaining rows. "|| rc=$?" records the status without tripping
      # "set -e".
      rc=0
      "${run_cmd[@]}" < /dev/null >> "$log_file" 2>&1 || rc=$?
      {
        echo "============================"
        echo "exit_code: $rc"
        echo "===== EXPERIMENT END ======="
      } >> "$log_file"
      if [[ $rc -ne 0 ]]; then
        echo "[GPU $gpu] FAIL exp $exp_id (exit=$rc). Log: $log_file" >&2
        exit "$rc"
      fi
    fi

    echo "[GPU $gpu] DONE exp $exp_id (log: $log_file)"
  done < "$qfile"
}

# Start one background worker per GPU and remember its PID for the wait step.
pids=()
for i in "${!gpus[@]}"; do
  (run_gpu_worker "${gpus[$i]}" "${queue_files[$i]}") &
  pids+=("$!")
done
# Collect every worker's exit status. All workers are waited for, even after
# one has failed, so the final verdict is only printed once all are finished.
fail=0
for pid in "${pids[@]}"; do
  wait "$pid" || fail=1
done

# Report the overall result: status 1 if any worker returned non-zero.
(( fail == 0 )) || {
  echo "One or more workers failed." >&2
  exit 1
}
echo "All experiments complete."