DeepHealth/run_experiments_multi_gpu.sh
Jiarui Li 0057bc0dd9 Refactor evaluation scripts for multi-GPU execution
- Removed `run_evaluations_multi_gpu.sh` script as it was redundant.
- Updated `run_experiments_multi_gpu.sh` to handle evaluation jobs instead of training.
- Changed command-line options to support evaluation-specific parameters.
- Implemented run directory discovery and validation for evaluation jobs.
- Enhanced logging to capture evaluation details and outputs.
- Added options for centralized output management and skipping existing results.
2026-01-18 17:38:20 +08:00


#!/usr/bin/env bash
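# Fail fast: abort on errors, unset variables, and failed pipelines.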
set -euo pipefail
usage() {
  cat <<'USAGE'
Usage:
  ./run_experiments_multi_gpu.sh --gpus 0,1,2 [--runs-file|-f runs_to_eval.txt | --runs-root runs]
      [--cmd "python evaluate.py"] [--log-dir evaluation_logs] [--out-root eval_outputs]
      [--skip-existing] [--dry-run] [-- <extra evaluate.py args>]

Description:
  Distributes evaluation jobs across multiple GPUs (round-robin) and runs
  at most one job per GPU at a time.

  A job is a run directory containing:
    - train_config.json
    - best_model.pt

  By default, run directories are auto-discovered under --runs-root (default: runs).
  Alternatively, provide --runs-file with one run_dir per line.

Examples:
  # Auto-discover run dirs under ./runs
  ./run_experiments_multi_gpu.sh --gpus 0,1,2

  # Use an explicit list of run directories
  ./run_experiments_multi_gpu.sh --gpus 0,1 --runs-file runs_to_eval.txt

  # Centralize outputs (CSV bundle + summary JSON) under eval_outputs/
  ./run_experiments_multi_gpu.sh --gpus 0,1 --out-root eval_outputs

  # Forward args to evaluate.py
  ./run_experiments_multi_gpu.sh --gpus 0,1 -- --batch_size 512 --num_workers 8
USAGE
}
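# Defaults; each can be overridden by the corresponding flag above.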
runs_file=""
runs_root="runs"
gpu_list=""
cmd_str="python evaluate.py"
log_dir="evaluation_logs"
out_root=""
skip_existing=0
dry_run=0
extra_args=()
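# Parse CLI flags; everything after "--" is forwarded verbatim to the evaluation command.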
while [[ $# -gt 0 ]]; do
  case "$1" in
    --gpus)
      gpu_list="${2-}"
      shift 2
      ;;
    --runs-file|-f)
      runs_file="${2-}"
      shift 2
      ;;
    --runs-root)
      runs_root="${2-}"
      shift 2
      ;;
    --cmd)
      cmd_str="${2-}"
      shift 2
      ;;
    --log-dir)
      log_dir="${2-}"
      shift 2
      ;;
    --out-root)
      out_root="${2-}"
      shift 2
      ;;
    --skip-existing)
      skip_existing=1
      shift
      ;;
    --dry-run)
      dry_run=1
      shift
      ;;
    --help|-h)
      usage
      exit 0
      ;;
    --)
      shift
      extra_args=("$@")
      break
      ;;
    *)
      echo "Unknown argument: $1" >&2
      usage
      exit 2
      ;;
  esac
done
if [[ -z "$gpu_list" ]]; then
  echo "Error: --gpus is required (e.g. --gpus 0,1,2)." >&2
  exit 2
fi
mkdir -p "$log_dir"
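# Split the comma-separated GPU list into an array.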
IFS=',' read -r -a gpus <<< "$gpu_list"
if [[ ${#gpus[@]} -lt 1 ]]; then
  echo "Error: parsed 0 GPUs from --gpus '$gpu_list'" >&2
  exit 2
fi
# Parse cmd string into an argv array.
# shellcheck disable=SC2206
cmd=($cmd_str)
if [[ ${#cmd[@]} -lt 2 ]]; then
  echo "Error: --cmd should look like 'python evaluate.py'" >&2
  exit 2
fi
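# Temporary workspace for the per-GPU queue files; removed on exit by the trap below.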
_tmpdir=""
cleanup() {
  if [[ -n "${_tmpdir}" && -d "${_tmpdir}" ]]; then
    rm -rf "${_tmpdir}"
  fi
}
trap cleanup EXIT
_tmpdir="$(mktemp -d)"
# Prepare per-GPU queue files.
queue_files=()
for i in "${!gpus[@]}"; do
  qfile="${_tmpdir}/queue_${i}.csv"
  : > "$qfile"
  queue_files+=("$qfile")
done
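# List the immediate subdirectories of the runs root, sorted for deterministic job assignment.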
discover_runs() {
  local root="${1-}"
  if [[ -z "$root" ]]; then
    return 0
  fi
  if [[ ! -d "$root" ]]; then
    echo "Error: runs root not found: $root" >&2
    return 2
  fi
  find "$root" -mindepth 1 -maxdepth 1 -type d -print 2>/dev/null | sort
}
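# Build the candidate run list: from --runs-file (one run_dir per line; blank lines
# and '#' comments ignored), or by auto-discovery under --runs-root.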
run_dirs=()
if [[ -n "$runs_file" ]]; then
  if [[ ! -f "$runs_file" ]]; then
    echo "Error: runs file not found: $runs_file" >&2
    exit 2
  fi
  while IFS= read -r line || [[ -n "$line" ]]; do
    line="${line%$'\r'}" # handle CRLF
    [[ -z "$line" ]] && continue
    [[ "$line" == \#* ]] && continue
    run_dirs+=("$line")
  done < "$runs_file"
else
  while IFS= read -r d || [[ -n "${d-}" ]]; do
    [[ -z "${d-}" ]] && continue
    run_dirs+=("$d")
  done < <(discover_runs "$runs_root")
fi
if [[ ${#run_dirs[@]} -eq 0 ]]; then
  if [[ -n "$runs_file" ]]; then
    echo "No run directories found in $runs_file" >&2
  else
    echo "No run directories found under $runs_root" >&2
  fi
  exit 1
fi
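# A run directory qualifies only if both required artifacts are present.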
is_valid_run_dir() {
  local d="${1-}"
  [[ -d "$d" ]] || return 1
  [[ -f "$d/train_config.json" ]] || return 1
  [[ -f "$d/best_model.pt" ]] || return 1
  return 0
}
valid_run_dirs=()
for d in "${run_dirs[@]}"; do
  if is_valid_run_dir "$d"; then
    valid_run_dirs+=("$d")
  else
    echo "Skipping invalid run_dir (missing train_config.json or best_model.pt): $d" >&2
  fi
done
if [[ ${#valid_run_dirs[@]} -eq 0 ]]; then
  echo "No valid run directories found." >&2
  exit 1
fi
# Distribute evaluation jobs round-robin.
job_idx=0
for d in "${valid_run_dirs[@]}"; do
  slot=$((job_idx % ${#gpus[@]}))
  printf '%s,%s\n' "$job_idx" "$d" >> "${queue_files[$slot]}"
  job_idx=$((job_idx + 1))
done
echo "Queued $job_idx evaluation job(s) across ${#gpus[@]} GPU(s): ${gpus[*]}"
sanitize() {
  # Replace any char outside [A-Za-z0-9._-] with '_'.
  local s="${1-}"
  s="${s//[^A-Za-z0-9._-]/_}"
  printf '%s' "$s"
}
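# Launch one background worker per GPU; each drains its own queue sequentially,
# so at most one job runs on a given GPU at any time.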
pids=()
for i in "${!gpus[@]}"; do
  gpu="${gpus[$i]}"
  qfile="${queue_files[$i]}"
  (
    export CUDA_VISIBLE_DEVICES="$gpu"
    while IFS=',' read -r job_id run_dir || [[ -n "${job_id-}" ]]; do
      [[ -z "${job_id-}" ]] && continue
      [[ -z "${run_dir-}" ]] && continue
      ts="$(date +%Y%m%d-%H%M%S)"
      safe_run="$(sanitize "$(basename "$run_dir")")"
      # Decide output locations.
      out_dir_arg=()
      out_json=""
      if [[ -n "$out_root" ]]; then
        job_out_dir="${out_root%/}/run_${job_id}_${safe_run}"
        mkdir -p "$job_out_dir"
        out_json="$job_out_dir/evaluation_summary.json"
        out_dir_arg=(--out_dir "$job_out_dir" --output "$out_json")
      else
        out_json="$run_dir/evaluation_summary.json"
      fi
      if [[ $skip_existing -eq 1 && -f "$out_json" ]]; then
        echo "[GPU $gpu] SKIP job $job_id: already exists ($out_json)"
        continue
      fi
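      # Assemble the evaluation command: base cmd, run dir, device, optional output flags, forwarded extras.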
      run_cmd=("${cmd[@]}" --run_dir "$run_dir" --device cuda)
      if [[ ${#out_dir_arg[@]} -gt 0 ]]; then
        run_cmd+=("${out_dir_arg[@]}")
      fi
      if [[ ${#extra_args[@]} -gt 0 ]]; then
        run_cmd+=("${extra_args[@]}")
      fi
      echo "[GPU $gpu] START job $job_id: run_dir=$run_dir"
      log_file="${log_dir}/eval_${job_id}_gpu${gpu}_${safe_run}_${ts}.log"
      {
        echo "===== EVALUATION START ====="
        echo "timestamp: $ts"
        echo "gpu: $gpu"
        echo "job_id: $job_id"
        echo "run_dir: $run_dir"
        echo "out_root: ${out_root:-<default in run_dir>}"
        echo "out_json: $out_json"
        printf 'cmd:'
        printf ' %q' "${run_cmd[@]}"
        echo
        echo "============================"
      } > "$log_file"
      if [[ $dry_run -eq 1 ]]; then
        printf '[GPU %s] CMD: ' "$gpu"
        printf '%q ' "${run_cmd[@]}"
        echo
        echo "[GPU $gpu] LOG: $log_file"
      else
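        # Relax errexit around the job so a non-zero exit code can be captured and logged.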
        set +e
        "${run_cmd[@]}" >> "$log_file" 2>&1
        rc=$?
        set -e
        {
          echo "============================"
          echo "exit_code: $rc"
          echo "===== EVALUATION END ======="
        } >> "$log_file"
        if [[ $rc -ne 0 ]]; then
          echo "[GPU $gpu] FAIL job $job_id (exit=$rc). Log: $log_file" >&2
          exit "$rc"
        fi
      fi
      echo "[GPU $gpu] DONE job $job_id (log: $log_file)"
    done < "$qfile"
  ) &
  pids+=("$!")
done
# Wait for all GPU workers.
fail=0
for pid in "${pids[@]}"; do
  if ! wait "$pid"; then
    fail=1
  fi
done
if [[ $fail -ne 0 ]]; then
  echo "One or more workers failed." >&2
  exit 1
fi
echo "All evaluations complete."