diff --git a/best_model_n_embd_120_n_layer_12_n_head_12.pt b/best_model_n_embd_120_n_layer_12_n_head_12.pt new file mode 100644 index 0000000..bc05e60 Binary files /dev/null and b/best_model_n_embd_120_n_layer_12_n_head_12.pt differ diff --git a/best_model_n_embd_256_n_layer_16_n_head_16.pt b/best_model_n_embd_256_n_layer_16_n_head_16.pt new file mode 100644 index 0000000..3e32386 Binary files /dev/null and b/best_model_n_embd_256_n_layer_16_n_head_16.pt differ diff --git a/config_n_embd_120_n_layer_12_n_head_12.json b/config_n_embd_120_n_layer_12_n_head_12.json new file mode 100644 index 0000000..633a6e1 --- /dev/null +++ b/config_n_embd_120_n_layer_12_n_head_12.json @@ -0,0 +1,18 @@ +{ + "n_layer": 12, + "n_embd": 120, + "n_head": 12, + "max_epoch": 200, + "batch_size": 128, + "lr_initial": 0.0006, + "lr_final": 6e-05, + "weight_decay": 0.2, + "warmup_epochs": 10, + "early_stopping_patience": 10, + "pdrop": 0.0, + "token_pdrop": 0.0, + "betas": [ + 0.9, + 0.99 + ] +} \ No newline at end of file diff --git a/config_n_embd_256_n_layer_16_n_head_16.json b/config_n_embd_256_n_layer_16_n_head_16.json new file mode 100644 index 0000000..b49d775 --- /dev/null +++ b/config_n_embd_256_n_layer_16_n_head_16.json @@ -0,0 +1,18 @@ +{ + "n_layer": 16, + "n_embd": 256, + "n_head": 16, + "max_epoch": 200, + "batch_size": 128, + "lr_initial": 0.0006, + "lr_final": 6e-05, + "weight_decay": 0.2, + "warmup_epochs": 10, + "early_stopping_patience": 10, + "pdrop": 0.0, + "token_pdrop": 0.0, + "betas": [ + 0.9, + 0.99 + ] +} \ No newline at end of file