config: Tune hyperparameters for multi-GPU training
Increase model size (n_embd, n_layer, n_head) for the multi-GPU configuration. Explicitly set AdamW betas to (0.9, 0.99).
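For context, PyTorch's torch.optim.AdamW defaults to betas=(0.9, 0.999), so this change only lowers beta2; the effective averaging window of the second-moment estimate, roughly 1/(1 - beta2), shrinks from about 1000 steps to about 100. Below is a minimal sketch of how the tuned values fit together, not the repo's actual code: the TrainConfig fields mirror the diff, lr_initial and weight_decay are assumed placeholder values, and a stock nn.TransformerEncoder stands in for the real model so the optimizer call is runnable end to end.

# Sketch only: field values changed in this diff are real; lr_initial,
# weight_decay, and the placeholder model are assumptions for illustration.
from dataclasses import dataclass

from torch import nn
from torch.optim import AdamW


@dataclass
class TrainConfig:
    block_length: int = 48   # Sequence length
    n_embd: int = 256        # was 120
    n_layer: int = 16        # was 12
    n_head: int = 16         # was 12
    pdrop: float = 0.1
    token_pdrop: float = 0.1
    lr_initial: float = 3e-4       # assumed, not part of this diff
    weight_decay: float = 0.1      # assumed, not part of this diff


config = TrainConfig()
# 256 / 16 gives 16-dim heads; the old 120 / 12 gave 10-dim heads.
assert config.n_embd % config.n_head == 0

# Placeholder model so the optimizer construction below actually runs.
model = nn.TransformerEncoder(
    nn.TransformerEncoderLayer(
        d_model=config.n_embd,
        nhead=config.n_head,
        dropout=config.pdrop,
        batch_first=True,
    ),
    num_layers=config.n_layer,
)

optimizer = AdamW(
    model.parameters(),
    lr=config.lr_initial,
    weight_decay=config.weight_decay,
    betas=(0.9, 0.99),  # beta2 lowered from PyTorch's 0.999 default
)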
@@ -23,9 +23,9 @@ class TrainConfig:
     block_length = 48 # Sequence length
 
     # Model parameters
-    n_embd = 120
-    n_layer = 12
-    n_head = 12
+    n_embd = 256
+    n_layer = 16
+    n_head = 16
     pdrop = 0.1
     token_pdrop = 0.1
 
@@ -112,7 +112,7 @@ def main():
     print(f"Model initialized with {model.module.get_num_params():.2f}M trainable parameters.")
 
     loss_fn = CombinedLoss(config.ignored_token_ids)
-    optimizer = AdamW(model.parameters(), lr=config.lr_initial, weight_decay=config.weight_decay)
+    optimizer = AdamW(model.parameters(), lr=config.lr_initial, weight_decay=config.weight_decay, betas=(0.9, 0.99))
 
     # --- 3. Training Loop ---
     best_val_loss = float('inf')