{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 550, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "learning_rate": 9.09090909090909e-07, "loss": 2.7431, "step": 1 }, { "epoch": 0.07, "learning_rate": 3.636363636363636e-06, "loss": 2.889, "step": 4 }, { "epoch": 0.15, "learning_rate": 7.272727272727272e-06, "loss": 2.7193, "step": 8 }, { "epoch": 0.22, "learning_rate": 1.0909090909090909e-05, "loss": 2.6675, "step": 12 }, { "epoch": 0.29, "learning_rate": 1.4545454545454545e-05, "loss": 2.608, "step": 16 }, { "epoch": 0.36, "learning_rate": 1.8181818181818182e-05, "loss": 2.6623, "step": 20 }, { "epoch": 0.44, "learning_rate": 2.1818181818181818e-05, "loss": 2.7249, "step": 24 }, { "epoch": 0.51, "learning_rate": 2.5454545454545454e-05, "loss": 2.7954, "step": 28 }, { "epoch": 0.58, "learning_rate": 2.909090909090909e-05, "loss": 2.6101, "step": 32 }, { "epoch": 0.65, "learning_rate": 3.272727272727273e-05, "loss": 2.4391, "step": 36 }, { "epoch": 0.73, "learning_rate": 3.6363636363636364e-05, "loss": 2.2331, "step": 40 }, { "epoch": 0.8, "learning_rate": 4e-05, "loss": 2.2489, "step": 44 }, { "epoch": 0.87, "learning_rate": 4.3636363636363636e-05, "loss": 2.1787, "step": 48 }, { "epoch": 0.95, "learning_rate": 4.7272727272727275e-05, "loss": 2.2181, "step": 52 }, { "epoch": 1.02, "learning_rate": 4.999949650182266e-05, "loss": 2.2997, "step": 56 }, { "epoch": 1.09, "learning_rate": 4.9987413559579636e-05, "loss": 2.0619, "step": 60 }, { "epoch": 1.16, "learning_rate": 4.995922759815339e-05, "loss": 2.1645, "step": 64 }, { "epoch": 1.24, "learning_rate": 4.991495678185202e-05, "loss": 1.9819, "step": 68 }, { "epoch": 1.31, "learning_rate": 4.985462964079137e-05, "loss": 2.0363, "step": 72 }, { "epoch": 1.38, "learning_rate": 4.977828505250903e-05, "loss": 1.9152, "step": 76 }, { "epoch": 1.45, "learning_rate": 4.968597221690986e-05, "loss": 1.9794, "step": 80 }, { "epoch": 1.53, "learning_rate": 4.957775062455933e-05, "loss": 1.9805, "step": 84 }, { "epoch": 1.6, "learning_rate": 4.9453690018345144e-05, "loss": 1.9941, "step": 88 }, { "epoch": 1.67, "learning_rate": 4.931387034853173e-05, "loss": 1.9186, "step": 92 }, { "epoch": 1.75, "learning_rate": 4.915838172123671e-05, "loss": 1.9123, "step": 96 }, { "epoch": 1.82, "learning_rate": 4.898732434036244e-05, "loss": 1.965, "step": 100 }, { "epoch": 1.89, "learning_rate": 4.880080844302004e-05, "loss": 1.8973, "step": 104 }, { "epoch": 1.96, "learning_rate": 4.859895422848767e-05, "loss": 1.8473, "step": 108 }, { "epoch": 2.0, "pls_score": 58.0, "std": 4.354308211415448, "step": 110 }, { "epoch": 2.0, "eval_loss": 1.9843206405639648, "eval_runtime": 4.904, "eval_samples_per_second": 4.69, "eval_steps_per_second": 1.223, "step": 110 }, { "epoch": 2.04, "learning_rate": 4.838189178074867e-05, "loss": 2.0581, "step": 112 }, { "epoch": 2.11, "learning_rate": 4.8149760984659506e-05, "loss": 1.8467, "step": 116 }, { "epoch": 2.18, "learning_rate": 4.790271143580174e-05, "loss": 1.9691, "step": 120 }, { "epoch": 2.25, "learning_rate": 4.764090234407577e-05, "loss": 1.8183, "step": 124 }, { "epoch": 2.33, "learning_rate": 4.7364502431098844e-05, "loss": 1.7403, "step": 128 }, { "epoch": 2.4, "learning_rate": 4.707368982147318e-05, "loss": 1.7877, "step": 132 }, { "epoch": 2.47, "learning_rate": 4.6768651927994434e-05, "loss": 1.7595, "step": 136 }, { "epoch": 2.55, "learning_rate": 4.644958533087443e-05, "loss": 1.7273, "step": 140 }, { "epoch": 2.62, "learning_rate": 4.611669565105596e-05, "loss": 1.669, "step": 144 }, { "epoch": 2.69, "learning_rate": 4.5770197417701365e-05, "loss": 1.742, "step": 148 }, { "epoch": 2.76, "learning_rate": 4.5410313929940244e-05, "loss": 1.6976, "step": 152 }, { "epoch": 2.84, "learning_rate": 4.503727711296538e-05, "loss": 1.8632, "step": 156 }, { "epoch": 2.91, "learning_rate": 4.465132736856969e-05, "loss": 1.9186, "step": 160 }, { "epoch": 2.98, "learning_rate": 4.425271342022039e-05, "loss": 1.7645, "step": 164 }, { "epoch": 3.0, "pls_score": 64.0, "std": 3.7523326078587433, "step": 165 }, { "epoch": 3.0, "eval_loss": 1.968030333518982, "eval_runtime": 4.9015, "eval_samples_per_second": 4.692, "eval_steps_per_second": 1.224, "step": 165 }, { "epoch": 3.05, "learning_rate": 4.384169215277041e-05, "loss": 1.4812, "step": 168 }, { "epoch": 3.13, "learning_rate": 4.341852844691012e-05, "loss": 1.6649, "step": 172 }, { "epoch": 3.2, "learning_rate": 4.2983495008466276e-05, "loss": 1.5835, "step": 176 }, { "epoch": 3.27, "learning_rate": 4.2536872192658036e-05, "loss": 1.6401, "step": 180 }, { "epoch": 3.35, "learning_rate": 4.2078947823423364e-05, "loss": 1.573, "step": 184 }, { "epoch": 3.42, "learning_rate": 4.161001700793231e-05, "loss": 1.4984, "step": 188 }, { "epoch": 3.49, "learning_rate": 4.113038194640658e-05, "loss": 1.5315, "step": 192 }, { "epoch": 3.56, "learning_rate": 4.064035173736804e-05, "loss": 1.541, "step": 196 }, { "epoch": 3.64, "learning_rate": 4.014024217844167e-05, "loss": 1.5492, "step": 200 }, { "epoch": 3.71, "learning_rate": 3.9630375562841295e-05, "loss": 1.6265, "step": 204 }, { "epoch": 3.78, "learning_rate": 3.911108047166924e-05, "loss": 1.5753, "step": 208 }, { "epoch": 3.85, "learning_rate": 3.858269156216383e-05, "loss": 1.4876, "step": 212 }, { "epoch": 3.93, "learning_rate": 3.804554935203115e-05, "loss": 1.5884, "step": 216 }, { "epoch": 4.0, "learning_rate": 3.7500000000000003e-05, "loss": 1.4795, "step": 220 }, { "epoch": 4.0, "pls_score": 60.8, "std": 3.750626614313932, "step": 220 }, { "epoch": 4.0, "eval_loss": 2.076031446456909, "eval_runtime": 4.9029, "eval_samples_per_second": 4.691, "eval_steps_per_second": 1.224, "step": 220 }, { "epoch": 4.07, "learning_rate": 3.694639508274158e-05, "loss": 1.3002, "step": 224 }, { "epoch": 4.15, "learning_rate": 3.638509136829758e-05, "loss": 1.344, "step": 228 }, { "epoch": 4.22, "learning_rate": 3.581645058616271e-05, "loss": 1.209, "step": 232 }, { "epoch": 4.29, "learning_rate": 3.5240839194169885e-05, "loss": 1.3386, "step": 236 }, { "epoch": 4.36, "learning_rate": 3.465862814232822e-05, "loss": 1.3247, "step": 240 }, { "epoch": 4.44, "learning_rate": 3.4070192633766025e-05, "loss": 1.2247, "step": 244 }, { "epoch": 4.51, "learning_rate": 3.3475911882933015e-05, "loss": 1.2683, "step": 248 }, { "epoch": 4.58, "learning_rate": 3.2876168871217325e-05, "loss": 1.2856, "step": 252 }, { "epoch": 4.65, "learning_rate": 3.2271350100134975e-05, "loss": 1.271, "step": 256 }, { "epoch": 4.73, "learning_rate": 3.166184534225087e-05, "loss": 1.207, "step": 260 }, { "epoch": 4.8, "learning_rate": 3.104804738999169e-05, "loss": 1.2884, "step": 264 }, { "epoch": 4.87, "learning_rate": 3.0430351802512698e-05, "loss": 1.2375, "step": 268 }, { "epoch": 4.95, "learning_rate": 2.9809156650781528e-05, "loss": 1.2467, "step": 272 }, { "epoch": 5.0, "pls_score": 62.8, "std": 4.15730682052696, "step": 275 }, { "epoch": 5.0, "eval_loss": 2.2714545726776123, "eval_runtime": 4.904, "eval_samples_per_second": 4.69, "eval_steps_per_second": 1.224, "step": 275 }, { "epoch": 5.02, "learning_rate": 2.918486226104327e-05, "loss": 1.1868, "step": 276 }, { "epoch": 5.09, "learning_rate": 2.8557870956832132e-05, "loss": 1.099, "step": 280 }, { "epoch": 5.16, "learning_rate": 2.792858679969596e-05, "loss": 1.0657, "step": 284 }, { "epoch": 5.24, "learning_rate": 2.7297415328800692e-05, "loss": 1.0596, "step": 288 }, { "epoch": 5.31, "learning_rate": 2.6664763299582602e-05, "loss": 0.9201, "step": 292 }, { "epoch": 5.38, "learning_rate": 2.6031038421616683e-05, "loss": 1.0683, "step": 296 }, { "epoch": 5.45, "learning_rate": 2.5396649095870202e-05, "loss": 1.0069, "step": 300 }, { "epoch": 5.53, "learning_rate": 2.4762004151510584e-05, "loss": 0.9486, "step": 304 }, { "epoch": 5.6, "learning_rate": 2.4127512582437485e-05, "loss": 0.937, "step": 308 }, { "epoch": 5.67, "learning_rate": 2.349358328370854e-05, "loss": 1.0102, "step": 312 }, { "epoch": 5.75, "learning_rate": 2.2860624788029013e-05, "loss": 1.0058, "step": 316 }, { "epoch": 5.82, "learning_rate": 2.222904500247473e-05, "loss": 1.0564, "step": 320 }, { "epoch": 5.89, "learning_rate": 2.1599250945618402e-05, "loss": 0.9122, "step": 324 }, { "epoch": 5.96, "learning_rate": 2.09716484852284e-05, "loss": 1.0034, "step": 328 }, { "epoch": 6.0, "pls_score": 59.6, "std": 4.059162475191158, "step": 330 }, { "epoch": 6.0, "eval_loss": 2.565575361251831, "eval_runtime": 4.9106, "eval_samples_per_second": 4.684, "eval_steps_per_second": 1.222, "step": 330 }, { "epoch": 6.04, "learning_rate": 2.034664207670925e-05, "loss": 0.8958, "step": 332 }, { "epoch": 6.11, "learning_rate": 1.972463450245226e-05, "loss": 0.8214, "step": 336 }, { "epoch": 6.18, "learning_rate": 1.9106026612264316e-05, "loss": 0.7499, "step": 340 }, { "epoch": 6.25, "learning_rate": 1.84912170650422e-05, "loss": 0.7933, "step": 344 }, { "epoch": 6.33, "learning_rate": 1.7880602071858692e-05, "loss": 0.8006, "step": 348 }, { "epoch": 6.4, "learning_rate": 1.7274575140626318e-05, "loss": 0.7292, "step": 352 }, { "epoch": 6.47, "learning_rate": 1.667352682250298e-05, "loss": 0.8303, "step": 356 }, { "epoch": 6.55, "learning_rate": 1.6077844460203206e-05, "loss": 0.7661, "step": 360 }, { "epoch": 6.62, "learning_rate": 1.5487911938376924e-05, "loss": 0.8356, "step": 364 }, { "epoch": 6.69, "learning_rate": 1.4904109436216884e-05, "loss": 0.7956, "step": 368 }, { "epoch": 6.76, "learning_rate": 1.4326813182453958e-05, "loss": 0.7794, "step": 372 }, { "epoch": 6.84, "learning_rate": 1.3756395212898359e-05, "loss": 0.7468, "step": 376 }, { "epoch": 6.91, "learning_rate": 1.3193223130682936e-05, "loss": 0.8426, "step": 380 }, { "epoch": 6.98, "learning_rate": 1.2637659869363083e-05, "loss": 0.8124, "step": 384 }, { "epoch": 7.0, "pls_score": 56.326530612244895, "std": 4.027124583051197, "step": 385 }, { "epoch": 7.0, "eval_loss": 2.80517578125, "eval_runtime": 4.9124, "eval_samples_per_second": 4.682, "eval_steps_per_second": 1.221, "step": 385 }, { "epoch": 7.05, "learning_rate": 1.2090063459025955e-05, "loss": 0.5562, "step": 388 }, { "epoch": 7.13, "learning_rate": 1.155078679555969e-05, "loss": 0.6813, "step": 392 }, { "epoch": 7.2, "learning_rate": 1.1020177413231334e-05, "loss": 0.6176, "step": 396 }, { "epoch": 7.27, "learning_rate": 1.049857726072005e-05, "loss": 0.5879, "step": 400 }, { "epoch": 7.35, "learning_rate": 9.986322480749927e-06, "loss": 0.6449, "step": 404 }, { "epoch": 7.42, "learning_rate": 9.483743193464408e-06, "loss": 0.6934, "step": 408 }, { "epoch": 7.49, "learning_rate": 8.991163283681944e-06, "loss": 0.7495, "step": 412 }, { "epoch": 7.56, "learning_rate": 8.508900192169964e-06, "loss": 0.6499, "step": 416 }, { "epoch": 7.64, "learning_rate": 8.0372647110717e-06, "loss": 0.6279, "step": 420 }, { "epoch": 7.71, "learning_rate": 7.576560783617668e-06, "loss": 0.6939, "step": 424 }, { "epoch": 7.78, "learning_rate": 7.127085308250914e-06, "loss": 0.6682, "step": 428 }, { "epoch": 7.85, "learning_rate": 6.689127947292231e-06, "loss": 0.7422, "step": 432 }, { "epoch": 7.93, "learning_rate": 6.2629709402686535e-06, "loss": 0.6291, "step": 436 }, { "epoch": 8.0, "learning_rate": 5.848888922025553e-06, "loss": 0.6269, "step": 440 }, { "epoch": 8.0, "pls_score": 57.95918367346939, "std": 4.290372510588257, "step": 440 }, { "epoch": 8.0, "eval_loss": 2.9865965843200684, "eval_runtime": 4.9092, "eval_samples_per_second": 4.685, "eval_steps_per_second": 1.222, "step": 440 }, { "epoch": 8.07, "learning_rate": 5.4471487457395225e-06, "loss": 0.6428, "step": 444 }, { "epoch": 8.15, "learning_rate": 5.058009310946119e-06, "loss": 0.6155, "step": 448 }, { "epoch": 8.22, "learning_rate": 4.681721396693303e-06, "loss": 0.5257, "step": 452 }, { "epoch": 8.29, "learning_rate": 4.318527499928074e-06, "loss": 0.5577, "step": 456 }, { "epoch": 8.36, "learning_rate": 3.968661679220468e-06, "loss": 0.6239, "step": 460 }, { "epoch": 8.44, "learning_rate": 3.632349403925664e-06, "loss": 0.6014, "step": 464 }, { "epoch": 8.51, "learning_rate": 3.3098074088812686e-06, "loss": 0.5279, "step": 468 }, { "epoch": 8.58, "learning_rate": 3.0012435547336737e-06, "loss": 0.5482, "step": 472 }, { "epoch": 8.65, "learning_rate": 2.7068566939831645e-06, "loss": 0.5768, "step": 476 }, { "epoch": 8.73, "learning_rate": 2.4268365428344736e-06, "loss": 0.5444, "step": 480 }, { "epoch": 8.8, "learning_rate": 2.1613635589349756e-06, "loss": 0.674, "step": 484 }, { "epoch": 8.87, "learning_rate": 1.9106088250797267e-06, "loss": 0.6033, "step": 488 }, { "epoch": 8.95, "learning_rate": 1.674733938957873e-06, "loss": 0.5743, "step": 492 }, { "epoch": 9.0, "pls_score": 55.51020408163265, "std": 4.29116489632049, "step": 495 }, { "epoch": 9.0, "eval_loss": 3.064878463745117, "eval_runtime": 4.9105, "eval_samples_per_second": 4.684, "eval_steps_per_second": 1.222, "step": 495 }, { "epoch": 9.02, "learning_rate": 1.4538909090118846e-06, "loss": 0.5749, "step": 496 }, { "epoch": 9.09, "learning_rate": 1.248222056476367e-06, "loss": 0.6153, "step": 500 }, { "epoch": 9.16, "learning_rate": 1.0578599236598707e-06, "loss": 0.5336, "step": 504 }, { "epoch": 9.24, "learning_rate": 8.829271885286094e-07, "loss": 0.519, "step": 508 }, { "epoch": 9.31, "learning_rate": 7.235365856472442e-07, "loss": 0.6333, "step": 512 }, { "epoch": 9.38, "learning_rate": 5.797908335276214e-07, "loss": 0.5356, "step": 516 }, { "epoch": 9.45, "learning_rate": 4.517825684323324e-07, "loss": 0.557, "step": 520 }, { "epoch": 9.53, "learning_rate": 3.395942846757066e-07, "loss": 0.6558, "step": 524 }, { "epoch": 9.6, "learning_rate": 2.4329828146074095e-07, "loss": 0.5504, "step": 528 }, { "epoch": 9.67, "learning_rate": 1.6295661628624447e-07, "loss": 0.5123, "step": 532 }, { "epoch": 9.75, "learning_rate": 9.862106495415469e-08, "loss": 0.5928, "step": 536 }, { "epoch": 9.82, "learning_rate": 5.033308820289184e-08, "loss": 0.5185, "step": 540 }, { "epoch": 9.89, "learning_rate": 1.812380498815991e-08, "loss": 0.5582, "step": 544 }, { "epoch": 9.96, "learning_rate": 2.0139724285161977e-09, "loss": 0.5465, "step": 548 }, { "epoch": 10.0, "step": 550, "total_flos": 1.876652342808576e+16, "train_loss": 0.0, "train_runtime": 9.9064, "train_samples_per_second": 219.05, "train_steps_per_second": 55.52 } ], "logging_steps": 4, "max_steps": 550, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 55, "total_flos": 1.876652342808576e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }