{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9972714870395634, "eval_steps": 500, "global_step": 366, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.054570259208731244, "grad_norm": 2.4109368924528636, "learning_rate": 2.702702702702703e-06, "loss": 0.7385, "step": 10 }, { "epoch": 0.10914051841746249, "grad_norm": 1.411542387667167, "learning_rate": 5.405405405405406e-06, "loss": 0.6258, "step": 20 }, { "epoch": 0.16371077762619374, "grad_norm": 1.1090603733773723, "learning_rate": 8.108108108108109e-06, "loss": 0.5777, "step": 30 }, { "epoch": 0.21828103683492497, "grad_norm": 1.0114013426864816, "learning_rate": 9.997948550797227e-06, "loss": 0.5768, "step": 40 }, { "epoch": 0.2728512960436562, "grad_norm": 1.0925544693817912, "learning_rate": 9.961525153583327e-06, "loss": 0.5571, "step": 50 }, { "epoch": 0.3274215552523875, "grad_norm": 1.1229468005660994, "learning_rate": 9.879896064123961e-06, "loss": 0.5579, "step": 60 }, { "epoch": 0.3819918144611187, "grad_norm": 1.1457306310920201, "learning_rate": 9.75380502539778e-06, "loss": 0.5437, "step": 70 }, { "epoch": 0.43656207366984995, "grad_norm": 1.0550317300389418, "learning_rate": 9.584400884284546e-06, "loss": 0.5459, "step": 80 }, { "epoch": 0.49113233287858116, "grad_norm": 1.1312778139498736, "learning_rate": 9.373227124134888e-06, "loss": 0.5455, "step": 90 }, { "epoch": 0.5457025920873124, "grad_norm": 1.0356499530726737, "learning_rate": 9.122207801708802e-06, "loss": 0.5388, "step": 100 }, { "epoch": 0.6002728512960437, "grad_norm": 1.0985324135142938, "learning_rate": 8.833630016614976e-06, "loss": 0.5489, "step": 110 }, { "epoch": 0.654843110504775, "grad_norm": 1.1083844896783392, "learning_rate": 8.51012307297624e-06, "loss": 0.5442, "step": 120 }, { "epoch": 0.7094133697135061, "grad_norm": 1.0400224538030558, "learning_rate": 8.154634523184389e-06, "loss": 0.5359, "step": 130 }, { "epoch": 0.7639836289222374, "grad_norm": 1.0744202310569617, "learning_rate": 7.77040331201572e-06, "loss": 0.5318, "step": 140 }, { "epoch": 0.8185538881309686, "grad_norm": 1.0968285492605718, "learning_rate": 7.360930265797934e-06, "loss": 0.5323, "step": 150 }, { "epoch": 0.8731241473396999, "grad_norm": 1.021879688858344, "learning_rate": 6.929946195508933e-06, "loss": 0.5171, "step": 160 }, { "epoch": 0.927694406548431, "grad_norm": 1.0678204884195794, "learning_rate": 6.481377904428171e-06, "loss": 0.5234, "step": 170 }, { "epoch": 0.9822646657571623, "grad_norm": 0.9343628148722539, "learning_rate": 6.019312410053286e-06, "loss": 0.5209, "step": 180 }, { "epoch": 1.0368349249658937, "grad_norm": 1.2427110143315105, "learning_rate": 5.547959706265068e-06, "loss": 0.4878, "step": 190 }, { "epoch": 1.0914051841746248, "grad_norm": 1.0091906053534088, "learning_rate": 5.071614405023938e-06, "loss": 0.4144, "step": 200 }, { "epoch": 1.145975443383356, "grad_norm": 1.007806012878209, "learning_rate": 4.594616607090028e-06, "loss": 0.4102, "step": 210 }, { "epoch": 1.2005457025920874, "grad_norm": 0.9761625285978685, "learning_rate": 4.121312358283464e-06, "loss": 0.3954, "step": 220 }, { "epoch": 1.2551159618008185, "grad_norm": 0.9057196288186306, "learning_rate": 3.656014051577713e-06, "loss": 0.4026, "step": 230 }, { "epoch": 1.30968622100955, "grad_norm": 0.9923780381040166, "learning_rate": 3.202961135812437e-06, "loss": 0.4061, "step": 240 }, { "epoch": 1.364256480218281, "grad_norm": 0.9366887214418035, "learning_rate": 2.766281489018482e-06, "loss": 0.4072, "step": 250 }, { "epoch": 1.4188267394270122, "grad_norm": 0.9615122411717134, "learning_rate": 2.3499538082923607e-06, "loss": 0.4081, "step": 260 }, { "epoch": 1.4733969986357436, "grad_norm": 0.9926176178093427, "learning_rate": 1.9577713588953797e-06, "loss": 0.4033, "step": 270 }, { "epoch": 1.5279672578444747, "grad_norm": 0.940790518539519, "learning_rate": 1.5933074128684333e-06, "loss": 0.4045, "step": 280 }, { "epoch": 1.5825375170532059, "grad_norm": 0.9731821866758915, "learning_rate": 1.2598826920598773e-06, "loss": 0.4101, "step": 290 }, { "epoch": 1.6371077762619373, "grad_norm": 0.9806874432871104, "learning_rate": 9.605351122011308e-07, "loss": 0.4028, "step": 300 }, { "epoch": 1.6916780354706686, "grad_norm": 0.9218297203583149, "learning_rate": 6.979921036993042e-07, "loss": 0.4015, "step": 310 }, { "epoch": 1.7462482946793996, "grad_norm": 0.9247290967153217, "learning_rate": 4.7464576133899043e-07, "loss": 0.3958, "step": 320 }, { "epoch": 1.800818553888131, "grad_norm": 0.9418425606930034, "learning_rate": 2.925310493105099e-07, "loss": 0.4015, "step": 330 }, { "epoch": 1.8553888130968623, "grad_norm": 0.9321370632360508, "learning_rate": 1.5330726014397668e-07, "loss": 0.3974, "step": 340 }, { "epoch": 1.9099590723055935, "grad_norm": 0.9562537189946939, "learning_rate": 5.824289648152126e-08, "loss": 0.3824, "step": 350 }, { "epoch": 1.9645293315143246, "grad_norm": 0.9275908894260719, "learning_rate": 8.204113433559202e-09, "loss": 0.4059, "step": 360 }, { "epoch": 1.9972714870395634, "step": 366, "total_flos": 53584849797120.0, "train_loss": 0.48227007988372134, "train_runtime": 1900.9535, "train_samples_per_second": 24.658, "train_steps_per_second": 0.193 } ], "logging_steps": 10, "max_steps": 366, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 53584849797120.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }