Enhance DataLoader configuration and improve tensor transfer efficiency in Trainer class

This commit is contained in:
2026-01-08 13:20:32 +08:00
parent 5382f9f159
commit 01a96d37ea
4 changed files with 81 additions and 30 deletions

View File

@@ -46,10 +46,11 @@ class SelfAttention(nn.Module):
         k = reshape_heads(k)
         v = reshape_heads(v)
+        dropout_p = self.attn_pdrop if self.training else 0.0
         attn = F.scaled_dot_product_attention(
             q, k, v,
             attn_mask=attn_mask,
-            dropout_p=self.attn_pdrop,
+            dropout_p=dropout_p,
         ) # (B, H, L, d)
         attn = attn.transpose(1, 2).contiguous().view(B, L, D) # (B, L, D)