Enhance DataLoader configuration and improve tensor transfer efficiency in Trainer class
@@ -46,10 +46,11 @@ class SelfAttention(nn.Module):
         k = reshape_heads(k)
         v = reshape_heads(v)
 
+        dropout_p = self.attn_pdrop if self.training else 0.0
         attn = F.scaled_dot_product_attention(
             q, k, v,
             attn_mask=attn_mask,
-            dropout_p=self.attn_pdrop,
+            dropout_p=dropout_p,
         )  # (B, H, L, d)
 
         attn = attn.transpose(1, 2).contiguous().view(B, L, D)  # (B, L, D)
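Why the gate is needed: `F.scaled_dot_product_attention` takes `dropout_p` as a plain float and applies dropout whenever it is non-zero, regardless of the module's train/eval mode. Unlike `nn.Dropout`, it is not toggled by `model.eval()`, so the old code kept attention dropout active during evaluation and made eval outputs stochastic. The hunk only shows the changed fragment of `forward()`; below is a minimal self-contained sketch of how such a module might look. Only `reshape_heads`, `attn_pdrop`, `attn_mask`, and the SDPA call come from the diff; the fused `qkv` projection, `proj` layer, `d_model`, and `n_head` are hypothetical names added for illustration, not the repository's actual code.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class SelfAttention(nn.Module):
    """Multi-head self-attention via F.scaled_dot_product_attention.

    Minimal sketch around the diff above; projection layout and all
    names other than reshape_heads/attn_pdrop/attn_mask are assumptions.
    """

    def __init__(self, d_model: int, n_head: int, attn_pdrop: float = 0.1):
        super().__init__()
        assert d_model % n_head == 0
        self.n_head = n_head
        self.attn_pdrop = attn_pdrop
        self.qkv = nn.Linear(d_model, 3 * d_model)  # hypothetical fused q/k/v projection
        self.proj = nn.Linear(d_model, d_model)     # hypothetical output projection

    def forward(self, x: torch.Tensor, attn_mask: torch.Tensor | None = None) -> torch.Tensor:
        B, L, D = x.shape
        q, k, v = self.qkv(x).chunk(3, dim=-1)  # three (B, L, D) tensors

        def reshape_heads(t: torch.Tensor) -> torch.Tensor:
            # (B, L, D) -> (B, H, L, d) where d = D // H
            return t.view(B, L, self.n_head, D // self.n_head).transpose(1, 2)

        q = reshape_heads(q)
        k = reshape_heads(k)
        v = reshape_heads(v)

        # SDPA applies dropout whenever dropout_p > 0, even in eval mode,
        # so the rate must be gated on self.training explicitly.
        dropout_p = self.attn_pdrop if self.training else 0.0
        attn = F.scaled_dot_product_attention(
            q, k, v,
            attn_mask=attn_mask,
            dropout_p=dropout_p,
        )  # (B, H, L, d)

        attn = attn.transpose(1, 2).contiguous().view(B, L, D)  # (B, L, D)
        return self.proj(attn)

if __name__ == "__main__":
    # With the fix, two eval-mode passes on the same input are identical;
    # before it, attention dropout left them stochastic.
    m = SelfAttention(d_model=64, n_head=8).eval()
    x = torch.randn(2, 16, 64)
    assert torch.equal(m(x), m(x))
```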