{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 550, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "learning_rate": 9.09090909090909e-07, "loss": 2.3833, "step": 1 }, { "epoch": 0.07, "learning_rate": 3.636363636363636e-06, "loss": 2.4779, "step": 4 }, { "epoch": 0.15, "learning_rate": 7.272727272727272e-06, "loss": 2.3099, "step": 8 }, { "epoch": 0.22, "learning_rate": 1.0909090909090909e-05, "loss": 2.3051, "step": 12 }, { "epoch": 0.29, "learning_rate": 1.4545454545454545e-05, "loss": 2.2475, "step": 16 }, { "epoch": 0.36, "learning_rate": 1.8181818181818182e-05, "loss": 2.2982, "step": 20 }, { "epoch": 0.44, "learning_rate": 2.1818181818181818e-05, "loss": 2.2629, "step": 24 }, { "epoch": 0.51, "learning_rate": 2.5454545454545454e-05, "loss": 2.4193, "step": 28 }, { "epoch": 0.58, "learning_rate": 2.909090909090909e-05, "loss": 2.2926, "step": 32 }, { "epoch": 0.65, "learning_rate": 3.272727272727273e-05, "loss": 2.1356, "step": 36 }, { "epoch": 0.73, "learning_rate": 3.6363636363636364e-05, "loss": 1.9494, "step": 40 }, { "epoch": 0.8, "learning_rate": 4e-05, "loss": 2.0152, "step": 44 }, { "epoch": 0.87, "learning_rate": 4.3636363636363636e-05, "loss": 2.1044, "step": 48 }, { "epoch": 0.95, "learning_rate": 4.7272727272727275e-05, "loss": 1.9755, "step": 52 }, { "epoch": 1.02, "learning_rate": 4.999949650182266e-05, "loss": 1.991, "step": 56 }, { "epoch": 1.09, "learning_rate": 4.9987413559579636e-05, "loss": 1.8347, "step": 60 }, { "epoch": 1.16, "learning_rate": 4.995922759815339e-05, "loss": 1.9198, "step": 64 }, { "epoch": 1.24, "learning_rate": 4.991495678185202e-05, "loss": 1.8634, "step": 68 }, { "epoch": 1.31, "learning_rate": 4.985462964079137e-05, "loss": 1.8104, "step": 72 }, { "epoch": 1.38, "learning_rate": 4.977828505250903e-05, "loss": 1.7029, "step": 76 }, { "epoch": 1.45, "learning_rate": 4.968597221690986e-05, "loss": 1.7826, "step": 80 }, { "epoch": 1.53, "learning_rate": 4.957775062455933e-05, "loss": 1.7827, "step": 84 }, { "epoch": 1.6, "learning_rate": 4.9453690018345144e-05, "loss": 1.806, "step": 88 }, { "epoch": 1.67, "learning_rate": 4.931387034853173e-05, "loss": 1.7537, "step": 92 }, { "epoch": 1.75, "learning_rate": 4.915838172123671e-05, "loss": 1.7506, "step": 96 }, { "epoch": 1.82, "learning_rate": 4.898732434036244e-05, "loss": 1.7881, "step": 100 }, { "epoch": 1.89, "learning_rate": 4.880080844302004e-05, "loss": 1.736, "step": 104 }, { "epoch": 1.96, "learning_rate": 4.859895422848767e-05, "loss": 1.6668, "step": 108 }, { "epoch": 2.0, "gpt4_scores": 0.4759999999999999, "step": 110 }, { "epoch": 2.0, "std": 0.0573103830034314, "step": 110 }, { "epoch": 2.0, "eval_loss": 1.8269920349121094, "eval_runtime": 4.9635, "eval_samples_per_second": 4.634, "eval_steps_per_second": 1.209, "step": 110 }, { "epoch": 2.04, "learning_rate": 4.838189178074867e-05, "loss": 1.8895, "step": 112 }, { "epoch": 2.11, "learning_rate": 4.8149760984659506e-05, "loss": 1.6187, "step": 116 }, { "epoch": 2.18, "learning_rate": 4.790271143580174e-05, "loss": 1.7413, "step": 120 }, { "epoch": 2.25, "learning_rate": 4.764090234407577e-05, "loss": 1.6013, "step": 124 }, { "epoch": 2.33, "learning_rate": 4.7364502431098844e-05, "loss": 1.5573, "step": 128 }, { "epoch": 2.4, "learning_rate": 4.707368982147318e-05, "loss": 1.6328, "step": 132 }, { "epoch": 2.47, "learning_rate": 4.6768651927994434e-05, "loss": 1.592, "step": 136 }, { "epoch": 2.55, "learning_rate": 4.644958533087443e-05, "loss": 1.5389, "step": 140 }, { "epoch": 2.62, "learning_rate": 4.611669565105596e-05, "loss": 1.5301, "step": 144 }, { "epoch": 2.69, "learning_rate": 4.5770197417701365e-05, "loss": 1.5516, "step": 148 }, { "epoch": 2.76, "learning_rate": 4.5410313929940244e-05, "loss": 1.5279, "step": 152 }, { "epoch": 2.84, "learning_rate": 4.503727711296538e-05, "loss": 1.6698, "step": 156 }, { "epoch": 2.91, "learning_rate": 4.465132736856969e-05, "loss": 1.7181, "step": 160 }, { "epoch": 2.98, "learning_rate": 4.425271342022039e-05, "loss": 1.5865, "step": 164 }, { "epoch": 3.0, "gpt4_scores": 0.6679999999999999, "step": 165 }, { "epoch": 3.0, "std": 0.05110303317808053, "step": 165 }, { "epoch": 3.0, "eval_loss": 1.8256371021270752, "eval_runtime": 4.9731, "eval_samples_per_second": 4.625, "eval_steps_per_second": 1.206, "step": 165 }, { "epoch": 3.05, "learning_rate": 4.384169215277041e-05, "loss": 1.3406, "step": 168 }, { "epoch": 3.13, "learning_rate": 4.341852844691012e-05, "loss": 1.4171, "step": 172 }, { "epoch": 3.2, "learning_rate": 4.2983495008466276e-05, "loss": 1.3461, "step": 176 }, { "epoch": 3.27, "learning_rate": 4.2536872192658036e-05, "loss": 1.408, "step": 180 }, { "epoch": 3.35, "learning_rate": 4.2078947823423364e-05, "loss": 1.3575, "step": 184 }, { "epoch": 3.42, "learning_rate": 4.161001700793231e-05, "loss": 1.3114, "step": 188 }, { "epoch": 3.49, "learning_rate": 4.113038194640658e-05, "loss": 1.2919, "step": 192 }, { "epoch": 3.56, "learning_rate": 4.064035173736804e-05, "loss": 1.3044, "step": 196 }, { "epoch": 3.64, "learning_rate": 4.014024217844167e-05, "loss": 1.3425, "step": 200 }, { "epoch": 3.71, "learning_rate": 3.9630375562841295e-05, "loss": 1.3524, "step": 204 }, { "epoch": 3.78, "learning_rate": 3.911108047166924e-05, "loss": 1.3514, "step": 208 }, { "epoch": 3.85, "learning_rate": 3.858269156216383e-05, "loss": 1.2055, "step": 212 }, { "epoch": 3.93, "learning_rate": 3.804554935203115e-05, "loss": 1.3618, "step": 216 }, { "epoch": 4.0, "learning_rate": 3.7500000000000003e-05, "loss": 1.264, "step": 220 }, { "epoch": 4.0, "gpt4_scores": 0.6719999999999999, "step": 220 }, { "epoch": 4.0, "std": 0.049561275205547324, "step": 220 }, { "epoch": 4.0, "eval_loss": 1.9825626611709595, "eval_runtime": 4.9575, "eval_samples_per_second": 4.639, "eval_steps_per_second": 1.21, "step": 220 }, { "epoch": 4.07, "learning_rate": 3.694639508274158e-05, "loss": 1.0472, "step": 224 }, { "epoch": 4.15, "learning_rate": 3.638509136829758e-05, "loss": 1.0674, "step": 228 }, { "epoch": 4.22, "learning_rate": 3.581645058616271e-05, "loss": 0.9661, "step": 232 }, { "epoch": 4.29, "learning_rate": 3.5240839194169885e-05, "loss": 1.0761, "step": 236 }, { "epoch": 4.36, "learning_rate": 3.465862814232822e-05, "loss": 1.0675, "step": 240 }, { "epoch": 4.44, "learning_rate": 3.4070192633766025e-05, "loss": 0.9969, "step": 244 }, { "epoch": 4.51, "learning_rate": 3.3475911882933015e-05, "loss": 1.0195, "step": 248 }, { "epoch": 4.58, "learning_rate": 3.2876168871217325e-05, "loss": 1.0246, "step": 252 }, { "epoch": 4.65, "learning_rate": 3.2271350100134975e-05, "loss": 1.0308, "step": 256 }, { "epoch": 4.73, "learning_rate": 3.166184534225087e-05, "loss": 0.9465, "step": 260 }, { "epoch": 4.8, "learning_rate": 3.104804738999169e-05, "loss": 1.0121, "step": 264 }, { "epoch": 4.87, "learning_rate": 3.0430351802512698e-05, "loss": 0.9884, "step": 268 }, { "epoch": 4.95, "learning_rate": 2.9809156650781528e-05, "loss": 0.99, "step": 272 }, { "epoch": 5.0, "gpt4_scores": 0.6259999999999999, "step": 275 }, { "epoch": 5.0, "std": 0.05452045487704591, "step": 275 }, { "epoch": 5.0, "eval_loss": 2.212796449661255, "eval_runtime": 4.9762, "eval_samples_per_second": 4.622, "eval_steps_per_second": 1.206, "step": 275 }, { "epoch": 5.02, "learning_rate": 2.918486226104327e-05, "loss": 0.9253, "step": 276 }, { "epoch": 5.09, "learning_rate": 2.8557870956832132e-05, "loss": 0.7838, "step": 280 }, { "epoch": 5.16, "learning_rate": 2.792858679969596e-05, "loss": 0.8005, "step": 284 }, { "epoch": 5.24, "learning_rate": 2.7297415328800692e-05, "loss": 0.7363, "step": 288 }, { "epoch": 5.31, "learning_rate": 2.6664763299582602e-05, "loss": 0.6807, "step": 292 }, { "epoch": 5.38, "learning_rate": 2.6031038421616683e-05, "loss": 0.7882, "step": 296 }, { "epoch": 5.45, "learning_rate": 2.5396649095870202e-05, "loss": 0.6869, "step": 300 }, { "epoch": 5.53, "learning_rate": 2.4762004151510584e-05, "loss": 0.7016, "step": 304 }, { "epoch": 5.6, "learning_rate": 2.4127512582437485e-05, "loss": 0.6709, "step": 308 }, { "epoch": 5.67, "learning_rate": 2.349358328370854e-05, "loss": 0.7557, "step": 312 }, { "epoch": 5.75, "learning_rate": 2.2860624788029013e-05, "loss": 0.7442, "step": 316 }, { "epoch": 5.82, "learning_rate": 2.222904500247473e-05, "loss": 0.7328, "step": 320 }, { "epoch": 5.89, "learning_rate": 2.1599250945618402e-05, "loss": 0.663, "step": 324 }, { "epoch": 5.96, "learning_rate": 2.09716484852284e-05, "loss": 0.7218, "step": 328 }, { "epoch": 6.0, "gpt4_scores": 0.54, "step": 330 }, { "epoch": 6.0, "std": 0.06151422599691879, "step": 330 }, { "epoch": 6.0, "eval_loss": 2.538802146911621, "eval_runtime": 4.9512, "eval_samples_per_second": 4.645, "eval_steps_per_second": 1.212, "step": 330 }, { "epoch": 6.04, "learning_rate": 2.034664207670925e-05, "loss": 0.61, "step": 332 }, { "epoch": 6.11, "learning_rate": 1.972463450245226e-05, "loss": 0.4884, "step": 336 }, { "epoch": 6.18, "learning_rate": 1.9106026612264316e-05, "loss": 0.5136, "step": 340 }, { "epoch": 6.25, "learning_rate": 1.84912170650422e-05, "loss": 0.5489, "step": 344 }, { "epoch": 6.33, "learning_rate": 1.7880602071858692e-05, "loss": 0.51, "step": 348 }, { "epoch": 6.4, "learning_rate": 1.7274575140626318e-05, "loss": 0.4517, "step": 352 }, { "epoch": 6.47, "learning_rate": 1.667352682250298e-05, "loss": 0.5259, "step": 356 }, { "epoch": 6.55, "learning_rate": 1.6077844460203206e-05, "loss": 0.5205, "step": 360 }, { "epoch": 6.62, "learning_rate": 1.5487911938376924e-05, "loss": 0.5436, "step": 364 }, { "epoch": 6.69, "learning_rate": 1.4904109436216884e-05, "loss": 0.5362, "step": 368 }, { "epoch": 6.76, "learning_rate": 1.4326813182453958e-05, "loss": 0.4788, "step": 372 }, { "epoch": 6.84, "learning_rate": 1.3756395212898359e-05, "loss": 0.4858, "step": 376 }, { "epoch": 6.91, "learning_rate": 1.3193223130682936e-05, "loss": 0.5685, "step": 380 }, { "epoch": 6.98, "learning_rate": 1.2637659869363083e-05, "loss": 0.5291, "step": 384 }, { "epoch": 7.0, "gpt4_scores": 0.5159999999999999, "step": 385 }, { "epoch": 7.0, "std": 0.058265598769771526, "step": 385 }, { "epoch": 7.0, "eval_loss": 2.774492025375366, "eval_runtime": 4.9875, "eval_samples_per_second": 4.612, "eval_steps_per_second": 1.203, "step": 385 }, { "epoch": 7.05, "learning_rate": 1.2090063459025955e-05, "loss": 0.3004, "step": 388 }, { "epoch": 7.13, "learning_rate": 1.155078679555969e-05, "loss": 0.3865, "step": 392 }, { "epoch": 7.2, "learning_rate": 1.1020177413231334e-05, "loss": 0.3657, "step": 396 }, { "epoch": 7.27, "learning_rate": 1.049857726072005e-05, "loss": 0.347, "step": 400 }, { "epoch": 7.35, "learning_rate": 9.986322480749927e-06, "loss": 0.3479, "step": 404 }, { "epoch": 7.42, "learning_rate": 9.483743193464408e-06, "loss": 0.4031, "step": 408 }, { "epoch": 7.49, "learning_rate": 8.991163283681944e-06, "loss": 0.4777, "step": 412 }, { "epoch": 7.56, "learning_rate": 8.508900192169964e-06, "loss": 0.3905, "step": 416 }, { "epoch": 7.64, "learning_rate": 8.0372647110717e-06, "loss": 0.3871, "step": 420 }, { "epoch": 7.71, "learning_rate": 7.576560783617668e-06, "loss": 0.3674, "step": 424 }, { "epoch": 7.78, "learning_rate": 7.127085308250914e-06, "loss": 0.3878, "step": 428 }, { "epoch": 7.85, "learning_rate": 6.689127947292231e-06, "loss": 0.4137, "step": 432 }, { "epoch": 7.93, "learning_rate": 6.2629709402686535e-06, "loss": 0.4006, "step": 436 }, { "epoch": 8.0, "learning_rate": 5.848888922025553e-06, "loss": 0.3732, "step": 440 }, { "epoch": 8.07, "learning_rate": 5.4471487457395225e-06, "loss": 1.2569, "step": 444 }, { "epoch": 8.15, "learning_rate": 5.058009310946119e-06, "loss": 1.2022, "step": 448 }, { "epoch": 8.22, "learning_rate": 4.681721396693303e-06, "loss": 1.0283, "step": 452 }, { "epoch": 8.29, "learning_rate": 4.318527499928074e-06, "loss": 0.8692, "step": 456 }, { "epoch": 8.36, "learning_rate": 3.968661679220468e-06, "loss": 0.9156, "step": 460 }, { "epoch": 8.44, "learning_rate": 3.632349403925664e-06, "loss": 0.9772, "step": 464 }, { "epoch": 8.51, "learning_rate": 3.3098074088812686e-06, "loss": 0.9159, "step": 468 }, { "epoch": 8.58, "learning_rate": 3.0012435547336737e-06, "loss": 0.7694, "step": 472 }, { "epoch": 8.65, "learning_rate": 2.7068566939831645e-06, "loss": 0.8599, "step": 476 }, { "epoch": 8.73, "learning_rate": 2.4268365428344736e-06, "loss": 0.8493, "step": 480 }, { "epoch": 8.8, "learning_rate": 2.1613635589349756e-06, "loss": 0.9502, "step": 484 }, { "epoch": 8.87, "learning_rate": 1.9106088250797267e-06, "loss": 0.8163, "step": 488 }, { "epoch": 8.95, "learning_rate": 1.674733938957873e-06, "loss": 0.791, "step": 492 }, { "epoch": 9.02, "learning_rate": 1.4538909090118846e-06, "loss": 0.8613, "step": 496 }, { "epoch": 9.09, "learning_rate": 1.248222056476367e-06, "loss": 0.7612, "step": 500 }, { "epoch": 9.16, "learning_rate": 1.0578599236598707e-06, "loss": 0.7711, "step": 504 }, { "epoch": 9.24, "learning_rate": 8.829271885286094e-07, "loss": 0.7856, "step": 508 }, { "epoch": 9.31, "learning_rate": 7.235365856472442e-07, "loss": 0.8414, "step": 512 }, { "epoch": 9.38, "learning_rate": 5.797908335276214e-07, "loss": 0.7963, "step": 516 }, { "epoch": 9.45, "learning_rate": 4.517825684323324e-07, "loss": 0.7588, "step": 520 }, { "epoch": 9.53, "learning_rate": 3.395942846757066e-07, "loss": 0.7257, "step": 524 }, { "epoch": 9.6, "learning_rate": 2.4329828146074095e-07, "loss": 0.8102, "step": 528 }, { "epoch": 9.67, "learning_rate": 1.6295661628624447e-07, "loss": 0.8791, "step": 532 }, { "epoch": 9.75, "learning_rate": 9.862106495415469e-08, "loss": 0.7275, "step": 536 }, { "epoch": 9.82, "learning_rate": 5.033308820289184e-08, "loss": 0.8194, "step": 540 }, { "epoch": 9.89, "learning_rate": 1.812380498815991e-08, "loss": 0.892, "step": 544 }, { "epoch": 9.96, "learning_rate": 2.0139724285161977e-09, "loss": 0.8365, "step": 548 }, { "epoch": 10.0, "step": 550, "total_flos": 1.881214300967731e+16, "train_loss": 0.0, "train_runtime": 12.8072, "train_samples_per_second": 169.436, "train_steps_per_second": 42.945 } ], "logging_steps": 4, "max_steps": 550, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 55, "total_flos": 1.881214300967731e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }