Enhance _maybe_torch_compile and add _maybe_cudagraph_mark_step_begin for improved CUDA Graphs handling

2026-01-18 18:04:54 +08:00
parent a4b19b6e08
commit 6e76d67a10
1 changed files with 33 additions and 2 deletions
--- a/evaluate.py
+++ b/evaluate.py
@@ -36,18 +36,45 @@ warnings.filterwarnings('ignore')
 def _maybe_torch_compile(module: torch.nn.Module, enabled: bool = True) -> torch.nn.Module:
     """Best-effort torch.compile() wrapper (PyTorch 2.x).

     Args:
         module: The module to compile.
         enabled: When false, ``module`` is returned untouched.

     Returns:
         The compiled wrapper around ``module``, or ``module`` itself when
         compilation is disabled, ``torch.compile`` does not exist in this
         PyTorch build, or setting up compilation raises.

     Notes:
       - Some PyTorch builds run compiled graphs via CUDA Graphs in certain modes.
         If you keep references to graph outputs across steps, PyTorch may raise:
         "accessing tensor output of CUDAGraphs that has been overwritten".
       - We default to settings that avoid cudagraph output-lifetime pitfalls.
       - torch.compile() refuses ``mode`` and ``options`` given together, so only
         ``options`` is passed; with ``mode`` unset it falls back to "default".
     """
     if not enabled:
         return module

     torch_compile = getattr(torch, "compile", None)
     if torch_compile is None:
         # Pre-2.0 PyTorch: nothing to do.
         return module

     # Passing mode= and options= in the same call raises RuntimeError inside
     # torch.compile(); swallowing that error would silently leave every model
     # in eager mode. Request the cudagraph opt-out on its own instead.
     try:
         return torch_compile(module, options={"triton.cudagraphs": False})
     except Exception:
         # Options unsupported or the key is unknown to this build: retry with
         # plain defaults (the default mode does not request CUDA Graphs).
         pass

     try:
         return torch_compile(module)
     except Exception:
         # Deliberate best-effort: evaluation must still run uncompiled.
         return module
 def _maybe_cudagraph_mark_step_begin() -> None:
     """Call ``torch.compiler.cudagraph_mark_step_begin()`` if it is available.

     Does nothing when ``torch.compiler`` or the marker function is missing,
     and swallows any exception raised while looking it up or calling it.
     """
     try:
         step_marker = getattr(
             getattr(torch, "compiler", None),
             "cudagraph_mark_step_begin",
             None,
         )
         if step_marker is not None:
             step_marker()
     except Exception:
         # Best-effort only: a failing marker must never break evaluation.
         pass
 def _ensure_dir(path: str) -> str:
     """Create directory ``path`` (with any missing parents) and return ``path``.

     An already existing directory is not an error.
     """
     os.makedirs(name=path, exist_ok=True)
     return path
@@ -485,6 +512,9 @@ class LandmarkEvaluator:
            # Get model predictions at anchor points
            if has_anchor.any():
                # If torch.compile uses CUDA Graphs under the hood, mark a new step
                # before each compiled invocation to avoid output lifetime issues.
                _maybe_cudagraph_mark_step_begin()
                # Forward pass
                hidden = self.model(event_batch, time_batch,
                                    sex_batch, cont_batch, cate_batch)
@@ -1152,6 +1182,7 @@ class LandmarkEvaluator:
                batch_idx = torch.arange(B, device=self.device)
                # Backbone once per batch
                _maybe_cudagraph_mark_step_begin()
                hidden = self.model(
                    # (B, L, D)
                    event_batch, time_batch, sex_batch, cont_batch, cate_batch)