feat(models): Refactor generate function in TimeAwareGPT2 with competing risks sampling

2025-10-18 12:42:14 +08:00
parent a631ac6d59
commit 082c719975
1 changed files with 54 additions and 1 deletions
--- a/models.py
+++ b/models.py
@@ -177,9 +177,10 @@ class TimeAwareGPT2(nn.Module):
    A time-aware GPT-2 model with custom temporal features.
    """

-    def __init__(self, vocab_size: int, n_embd: int, n_layer: int, n_head: int, pdrop: float, token_pdrop: float):
+    def __init__(self, vocab_size: int, n_embd: int, n_layer: int, n_head: int, pdrop: float, token_pdrop: float, ignore_tokens: list[int] = None):
        super().__init__()
        self.token_pdrop = token_pdrop
+        self.ignore_tokens = ignore_tokens if ignore_tokens is not None else []

        self.wte = nn.Embedding(vocab_size, n_embd)
        self.age_encoder = AgeSinusoidalEncoding(n_embd)
@@ -234,6 +235,58 @@ class TimeAwareGPT2(nn.Module):
        """
        return sum(p.numel() for p in self.parameters() if p.requires_grad) / 1e6

+    @torch.no_grad()
+    def generate(self, x, t, max_new_tokens=100, max_age=85*365.25, no_repeat=True, termination_tokens=None, top_k=None):
+        """
+        Extend the sequences x (LongTensor of shape (b,t)) and ages t by competing-risks sampling.
+
+        Each step reads the last-position logits as log-rates, draws one exponential waiting time
+        per token and appends the token with the shortest wait; that wait (capped at 365*80) is
+        added to the last age. Sampling stops after max_new_tokens steps, or once every row holds
+        a termination token or its newest age exceeds max_age. Leaves the model in eval mode.
+
+        Args: no_repeat masks tokens already present in a row (token 1 is exempt);
+        termination_tokens defaults to [1269]; top_k, if set, keeps only the k largest logits.
+        Returns (x, t, final_logits): positions after the first termination token or beyond max_age
+        are set to 0 in x and -10000 in t; final_logits comes from one more forward pass, computed
+        before that padding, with the no_repeat mask re-applied per position.
+        """
+        self.eval()
+        if termination_tokens is None:
+            termination_tokens = [1269]
+        termination_tokens = torch.tensor(termination_tokens, dtype=torch.int64, device=x.device)
+        mask_time = -10000
+        x, t = x.clone(), t.clone()  # padding below writes in place; keep the caller's tensors intact
+        for _ in range(max_new_tokens):
+            logits = self(x, t)[:, -1, :]
+            if self.ignore_tokens:
+                logits[:, self.ignore_tokens] = -torch.inf
+            if no_repeat:
+                fill = x.clone()
+                fill[fill == 1] = 0  # token 1 may repeat; index 0 is masked in its place
+                logits = logits.scatter(1, fill, -torch.inf)
+            if top_k is not None:
+                kth = torch.topk(logits, min(top_k, logits.size(-1))).values[:, -1:]
+                logits = logits.masked_fill(logits < kth, -torch.inf)
+            # Pick the winner before capping: capping first ties all long waits, so a masked token could win.
+            waits = -torch.exp(-logits) * torch.rand(logits.shape, device=x.device).log()
+            t_next_val, idx_next = waits.min(1)
+            age_next = t[:, -1:] + t_next_val.clamp(min=0, max=365*80).unsqueeze(1)
+            x = torch.cat((x, idx_next.unsqueeze(1)), dim=1)
+            t = torch.cat((t, age_next), dim=1)
+            if torch.logical_or(torch.isin(x, termination_tokens).any(-1), age_next.squeeze(1) > max_age).all():
+                break
+
+        pad = (torch.cumsum(torch.cumsum(torch.isin(x, termination_tokens), 1).bool().int(), 1) > 1) | (t > max_age)
+        final_logits = self(x, t)
+        x[pad] = 0
+        t[pad] = mask_time
+        if no_repeat:
+            fill = x.clone()
+            fill[fill == 1] = 0
+            final_logits = torch.stack([final_logits[:,j].scatter(1, fill[:,:j+1], -torch.inf) for j in range(fill.shape[1])]).transpose(0,1)
+        return x, t, final_logits
+
 class CovariateAwareGPT2(nn.Module):
    """
    Extends TimeAwareGPT2 to incorporate static and time-varying covariates.