#!/usr/bin/env bash
#
# Runs the experiments listed in a file across several GPUs.
# The usage() text below describes the command line.

# Strict mode: stop on a failing command, on expansion of an unset variable,
# and when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

#######################################
# Print the command-line help text.
# Arguments: none
# Outputs:   the help text on stdout
#######################################
usage() {
  # One argument per output line; single quotes keep the text literal.
  printf '%s\n' \
    'Usage:' \
    './run_experiments_multi_gpu.sh --gpus 0,1,2 [--experiments experiments.txt] [--cmd "python train.py"] [--dry-run] [-- <extra train.py args>]' \
    '' \
    'Description:' \
    'Distributes rows from experiments.txt across multiple GPUs (round-robin) and runs' \
    'at most one job per GPU at a time.' \
    '' \
    'Examples:' \
    './run_experiments_multi_gpu.sh --gpus 0,1,2' \
    './run_experiments_multi_gpu.sh --gpus 0,1 --experiments experiments.txt -- --batch_size 64 --max_epochs 50' \
    './run_experiments_multi_gpu.sh --gpus 3 --cmd "python train.py" -- --loss_type weibull'
}

# Defaults; each one can be overridden on the command line.
experiments_file="experiments.txt" # -f / --experiments
gpu_list=""                        # --gpus (comma-separated list)
cmd_str="python train.py"          # --cmd
dry_run=0                          # --dry-run sets this to 1
extra_args=()                      # everything after a literal "--"

#######################################
# Parse command-line options into the globals declared above.
# Globals:   gpu_list, experiments_file, cmd_str, dry_run, extra_args (written)
# Arguments: the script's command-line arguments
# Outputs:   usage text on stdout for --help; diagnostics on stderr
# Returns:   0 on success. Exits 0 after --help, and 2 on an unknown
#            argument or an option that is missing its value.
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    # Options that take a value need a following argument. Without this
    # check `shift 2` fails when the option is last, and under `set -e` the
    # script would stop with status 1 and no explanation.
    case "$1" in
      --gpus | --experiments | -f | --cmd)
        if [[ $# -lt 2 ]]; then
          echo "Error: $1 requires a value." >&2
          exit 2
        fi
        ;;
    esac

    case "$1" in
      --gpus)
        gpu_list="$2"
        shift 2
        ;;
      --experiments | -f)
        experiments_file="$2"
        shift 2
        ;;
      --cmd)
        cmd_str="$2"
        shift 2
        ;;
      --dry-run)
        dry_run=1
        shift
        ;;
      --help | -h)
        usage
        exit 0
        ;;
      --)
        # Everything after "--" is kept verbatim for the training command.
        shift
        extra_args=("$@")
        break
        ;;
      *)
        echo "Unknown argument: $1" >&2
        # Help shown because of an error belongs on stderr with the error.
        usage >&2
        exit 2
        ;;
    esac
  done
}

parse_args "$@"

# gpu_list starts out empty, so an empty value here means no usable --gpus
# value was given.
[[ -n "$gpu_list" ]] || {
  echo "Error: --gpus is required (e.g. --gpus 0,1,2)." >&2
  exit 2
}

# The experiments list has to be an existing regular file.
[[ -f "$experiments_file" ]] || {
  echo "Error: experiments file not found: $experiments_file" >&2
  exit 2
}

# Split the comma-separated --gpus value into the gpus array, one id per entry.
IFS=',' read -r -a gpus <<< "$gpu_list"
if [[ ${#gpus[@]} -lt 1 ]]; then
  echo "Error: parsed 0 GPUs from --gpus '$gpu_list'" >&2
  exit 2
fi

# Clean up each id. Splitting only on "," keeps surrounding blanks
# ("0, 1" -> " 1"), and "0,,1" or "," produce an empty entry. An empty id
# would later be exported as an empty CUDA_VISIBLE_DEVICES, which (per the
# CUDA documentation, as far as this reviewer knows) hides every GPU from
# the job rather than failing, so it is rejected here.
for gpu_idx in "${!gpus[@]}"; do
  gpus[$gpu_idx]="${gpus[$gpu_idx]//[[:space:]]/}"
  if [[ -z "${gpus[$gpu_idx]}" ]]; then
    echo "Error: empty GPU id in --gpus '$gpu_list'" >&2
    exit 2
  fi
done

# Parse cmd string into an argv array.
# The unquoted expansion is deliberate: it splits the string on whitespace.
# An unquoted expansion also undergoes pathname expansion, so a word such as
# "*" in --cmd would be replaced by file names from the current directory.
# Globbing is therefore switched off just for the split. It is switched back
# on afterwards; nothing visible in this script enables noglob beforehand.
set -f
# shellcheck disable=SC2206
cmd=($cmd_str)
set +f
if [[ ${#cmd[@]} -lt 2 ]]; then
  echo "Error: --cmd should look like 'python train.py'" >&2
  exit 2
fi

# Path of the scratch directory. It stays empty until mktemp has succeeded,
# so the EXIT handler is harmless if the script stops before that point.
_tmpdir=""

#######################################
# EXIT handler: remove the scratch directory if it exists.
# Globals:   _tmpdir (read)
# Arguments: none
#######################################
cleanup() {
  [[ -n "${_tmpdir}" && -d "${_tmpdir}" ]] || return 0
  rm -rf "${_tmpdir}"
}
trap cleanup EXIT

_tmpdir="$(mktemp -d)"

# Prepare per-GPU queue files: one empty file per entry of gpus, stored in
# queue_files at the same index as the GPU it belongs to.
queue_files=()
for gpu_slot in "${!gpus[@]}"; do
  queue_path="${_tmpdir}/queue_${gpu_slot}.csv"
  : > "$queue_path" # create (or truncate) the file
  queue_files+=("$queue_path")
done

# Distribute experiments round-robin: experiment number k is appended to the
# queue at index k modulo the number of GPUs.
exp_idx=0
gpu_count=${#gpus[@]}
# The "|| [[ -n ... ]]" part keeps a final line that has no trailing newline.
while IFS= read -r row || [[ -n "$row" ]]; do
  row="${row%$'\r'}" # handle CRLF

  # Empty lines and a header row starting with "model_type," are skipped.
  if [[ -z "$row" || "$row" == model_type,* ]]; then
    continue
  fi

  # Prefix a stable experiment index for logging.
  printf '%s,%s\n' "$exp_idx" "$row" >> "${queue_files[exp_idx % gpu_count]}"
  exp_idx=$((exp_idx + 1))
done < "$experiments_file"

if ((exp_idx == 0)); then
  echo "No experiments found in $experiments_file" >&2
  exit 1
fi

echo "Queued $exp_idx experiments across ${#gpus[@]} GPU(s): ${gpus[*]}"

#######################################
# Run, one after another, every experiment queued for a single GPU.
# Globals:   cmd, extra_args, dry_run (read)
# Arguments: $1 - GPU id, passed to each run as CUDA_VISIBLE_DEVICES
#            $2 - queue file; each row is
#                 exp_id,model_type,loss_type,age_encoder,full_cov
# Outputs:   START/DONE lines (plus the quoted command in dry-run mode) on
#            stdout; a FAILED line on stderr when a run fails
# Returns:   0 when every run succeeded, otherwise the exit status of the
#            first failing run (later rows of the queue are not started)
#######################################
run_gpu_worker() {
  local gpu="$1" qfile="$2"
  local exp_id model_type loss_type age_encoder full_cov
  local rc
  local -a run_cmd

  # The "|| [[ -n ... ]]" part keeps a final row without a trailing newline.
  while IFS=',' read -r exp_id model_type loss_type age_encoder full_cov \
    || [[ -n "${exp_id-}" ]]; do
    # Skip empty lines
    [[ -z "${exp_id-}" ]] && continue

    # Normalize booleans / strip whitespace
    full_cov="${full_cov//[[:space:]]/}"

    run_cmd=("${cmd[@]}"
      --model_type "$model_type"
      --loss_type "$loss_type"
      --age_encoder "$age_encoder")

    if [[ "$full_cov" == "True" || "$full_cov" == "true" || "$full_cov" == "1" ]]; then
      run_cmd+=(--full_cov)
    fi

    # Guarded because expanding an empty array is an error under `set -u`
    # in older bash releases.
    if [[ ${#extra_args[@]} -gt 0 ]]; then
      run_cmd+=("${extra_args[@]}")
    fi

    echo "[GPU $gpu] START exp $exp_id: model_type=$model_type loss_type=$loss_type age_encoder=$age_encoder full_cov=$full_cov"

    if [[ $dry_run -eq 1 ]]; then
      printf '[GPU %s] CMD: ' "$gpu"
      printf '%q ' "${run_cmd[@]}"
      echo
    else
      # stdin is taken from /dev/null: inside this loop stdin is the queue
      # file, so a command that reads stdin would otherwise consume the
      # remaining rows and those experiments would silently never run.
      rc=0
      CUDA_VISIBLE_DEVICES="$gpu" "${run_cmd[@]}" < /dev/null || rc=$?
      if [[ $rc -ne 0 ]]; then
        # Say which experiment failed instead of letting `set -e` end the
        # worker without a trace. Stopping at the first failure is kept.
        echo "[GPU $gpu] FAILED exp $exp_id (exit status $rc); remaining experiments queued for this GPU were not run" >&2
        return "$rc"
      fi
    fi

    echo "[GPU $gpu] DONE exp $exp_id"
  done < "$qfile"
}

# Start one background worker per GPU and remember its PID for the wait loop.
pids=()
for i in "${!gpus[@]}"; do
  run_gpu_worker "${gpus[$i]}" "${queue_files[$i]}" &
  pids+=("$!")
done

# Wait for all GPU workers. Every PID is waited on, so one failing worker
# does not stop the others from being collected.
fail=0
for pid in "${pids[@]}"; do
  wait "$pid" || fail=1
done

if ((fail != 0)); then
  echo "One or more workers failed." >&2
  exit 1
fi

echo "All experiments complete."