diff --git a/train_multigpu.py b/train_multigpu.py index 0bfaf7b..8ce817e 100644 --- a/train_multigpu.py +++ b/train_multigpu.py @@ -23,9 +23,9 @@ class TrainConfig: block_length = 48 # Sequence length # Model parameters - n_embd = 120 - n_layer = 12 - n_head = 12 + n_embd = 256 + n_layer = 16 + n_head = 16 pdrop = 0.1 token_pdrop = 0.1 @@ -112,7 +112,7 @@ def main(): print(f"Model initialized with {model.module.get_num_params():.2f}M trainable parameters.") loss_fn = CombinedLoss(config.ignored_token_ids) - optimizer = AdamW(model.parameters(), lr=config.lr_initial, weight_decay=config.weight_decay) + optimizer = AdamW(model.parameters(), lr=config.lr_initial, weight_decay=config.weight_decay, betas=(0.9, 0.99)) # --- 3. Training Loop --- best_val_loss = float('inf')