{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9992972593113141, "eval_steps": 500, "global_step": 711, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 2.777777777777778e-06, "loss": 1.8297, "step": 1 }, { "epoch": 0.01, "learning_rate": 1.388888888888889e-05, "loss": 1.8176, "step": 5 }, { "epoch": 0.01, "learning_rate": 2.777777777777778e-05, "loss": 1.8447, "step": 10 }, { "epoch": 0.02, "learning_rate": 4.166666666666667e-05, "loss": 1.8067, "step": 15 }, { "epoch": 0.03, "learning_rate": 5.555555555555556e-05, "loss": 1.7216, "step": 20 }, { "epoch": 0.04, "learning_rate": 6.944444444444444e-05, "loss": 1.5794, "step": 25 }, { "epoch": 0.04, "learning_rate": 8.333333333333334e-05, "loss": 1.5115, "step": 30 }, { "epoch": 0.05, "learning_rate": 9.722222222222223e-05, "loss": 1.4649, "step": 35 }, { "epoch": 0.06, "learning_rate": 0.00011111111111111112, "loss": 1.4083, "step": 40 }, { "epoch": 0.06, "learning_rate": 0.000125, "loss": 1.3655, "step": 45 }, { "epoch": 0.07, "learning_rate": 0.0001388888888888889, "loss": 1.3277, "step": 50 }, { "epoch": 0.08, "learning_rate": 0.00015277777777777777, "loss": 1.3157, "step": 55 }, { "epoch": 0.08, "learning_rate": 0.0001666666666666667, "loss": 1.2691, "step": 60 }, { "epoch": 0.09, "learning_rate": 0.00018055555555555557, "loss": 1.25, "step": 65 }, { "epoch": 0.1, "learning_rate": 0.00019444444444444446, "loss": 1.2222, "step": 70 }, { "epoch": 0.11, "learning_rate": 0.0001999891231617599, "loss": 1.0998, "step": 75 }, { "epoch": 0.11, "learning_rate": 0.00019992266216318035, "loss": 0.9863, "step": 80 }, { "epoch": 0.12, "learning_rate": 0.0001997958229642588, "loss": 1.0027, "step": 85 }, { "epoch": 0.13, "learning_rate": 0.00019960868220749448, "loss": 0.9667, "step": 90 }, { "epoch": 0.13, "learning_rate": 0.00019936135297256185, "loss": 0.9511, "step": 95 }, { "epoch": 0.14, "learning_rate": 0.00019905398470798207, "loss": 0.9538, "step": 100 }, { "epoch": 0.15, "learning_rate": 0.00019868676314081904, "loss": 0.9471, "step": 105 }, { "epoch": 0.15, "learning_rate": 0.00019825991016445387, "loss": 0.9514, "step": 110 }, { "epoch": 0.16, "learning_rate": 0.0001977736837045058, "loss": 0.936, "step": 115 }, { "epoch": 0.17, "learning_rate": 0.00019722837756298113, "loss": 0.9268, "step": 120 }, { "epoch": 0.18, "learning_rate": 0.00019662432124074327, "loss": 0.93, "step": 125 }, { "epoch": 0.18, "learning_rate": 0.00019596187973841217, "loss": 0.9328, "step": 130 }, { "epoch": 0.19, "learning_rate": 0.00019524145333581317, "loss": 0.9228, "step": 135 }, { "epoch": 0.2, "learning_rate": 0.00019446347735010763, "loss": 0.9346, "step": 140 }, { "epoch": 0.2, "learning_rate": 0.00019362842187275355, "loss": 0.9255, "step": 145 }, { "epoch": 0.21, "learning_rate": 0.00019273679148545245, "loss": 0.9112, "step": 150 }, { "epoch": 0.22, "learning_rate": 0.00019178912495525677, "loss": 0.9124, "step": 155 }, { "epoch": 0.22, "learning_rate": 0.00019078599490901984, "loss": 0.9172, "step": 160 }, { "epoch": 0.23, "learning_rate": 0.0001897280074873868, "loss": 0.9099, "step": 165 }, { "epoch": 0.24, "learning_rate": 0.00018861580197853422, "loss": 0.9153, "step": 170 }, { "epoch": 0.25, "learning_rate": 0.00018745005043188103, "loss": 0.9142, "step": 175 }, { "epoch": 0.25, "learning_rate": 0.00018623145725200278, "loss": 0.9315, "step": 180 }, { "epoch": 0.26, "learning_rate": 0.00018496075877299584, "loss": 0.9145, "step": 185 }, { "epoch": 0.27, "learning_rate": 0.00018363872281354797, "loss": 0.8927, "step": 190 }, { "epoch": 0.27, "learning_rate": 0.0001822661482129844, "loss": 0.9047, "step": 195 }, { "epoch": 0.28, "learning_rate": 0.0001808438643485698, "loss": 0.8972, "step": 200 }, { "epoch": 0.29, "learning_rate": 0.00017937273063435737, "loss": 0.9032, "step": 205 }, { "epoch": 0.3, "learning_rate": 0.00017785363600188894, "loss": 0.9133, "step": 210 }, { "epoch": 0.3, "learning_rate": 0.0001762874983630582, "loss": 0.9044, "step": 215 }, { "epoch": 0.31, "learning_rate": 0.00017467526405546343, "loss": 0.8999, "step": 220 }, { "epoch": 0.32, "learning_rate": 0.00017301790727058345, "loss": 0.9027, "step": 225 }, { "epoch": 0.32, "learning_rate": 0.00017131642946512313, "loss": 0.8985, "step": 230 }, { "epoch": 0.33, "learning_rate": 0.000169571858755884, "loss": 0.909, "step": 235 }, { "epoch": 0.34, "learning_rate": 0.00016778524929852512, "loss": 0.8904, "step": 240 }, { "epoch": 0.34, "learning_rate": 0.00016595768065059047, "loss": 0.8983, "step": 245 }, { "epoch": 0.35, "learning_rate": 0.0001640902571191869, "loss": 0.8829, "step": 250 }, { "epoch": 0.36, "learning_rate": 0.00016218410709370736, "loss": 0.9014, "step": 255 }, { "epoch": 0.37, "learning_rate": 0.00016024038236400246, "loss": 0.8854, "step": 260 }, { "epoch": 0.37, "learning_rate": 0.00015826025742441207, "loss": 0.8947, "step": 265 }, { "epoch": 0.38, "learning_rate": 0.0001562449287640781, "loss": 0.894, "step": 270 }, { "epoch": 0.39, "learning_rate": 0.00015419561414396657, "loss": 0.898, "step": 275 }, { "epoch": 0.39, "learning_rate": 0.00015211355186103655, "loss": 0.9012, "step": 280 }, { "epoch": 0.4, "learning_rate": 0.00015000000000000001, "loss": 0.8836, "step": 285 }, { "epoch": 0.41, "learning_rate": 0.00014785623567312492, "loss": 0.8784, "step": 290 }, { "epoch": 0.41, "learning_rate": 0.00014568355424854113, "loss": 0.8905, "step": 295 }, { "epoch": 0.42, "learning_rate": 0.00014348326856751496, "loss": 0.8873, "step": 300 }, { "epoch": 0.43, "learning_rate": 0.00014125670815116588, "loss": 0.8773, "step": 305 }, { "epoch": 0.44, "learning_rate": 0.00013900521839710426, "loss": 0.9042, "step": 310 }, { "epoch": 0.44, "learning_rate": 0.00013673015976647568, "loss": 0.9007, "step": 315 }, { "epoch": 0.45, "learning_rate": 0.00013443290696190334, "loss": 0.8909, "step": 320 }, { "epoch": 0.46, "learning_rate": 0.00013211484809682483, "loss": 0.887, "step": 325 }, { "epoch": 0.46, "learning_rate": 0.00012977738385672557, "loss": 0.8729, "step": 330 }, { "epoch": 0.47, "learning_rate": 0.00012742192665277568, "loss": 0.8833, "step": 335 }, { "epoch": 0.48, "learning_rate": 0.00012504989976838132, "loss": 0.8891, "step": 340 }, { "epoch": 0.48, "learning_rate": 0.0001226627364991667, "loss": 0.884, "step": 345 }, { "epoch": 0.49, "learning_rate": 0.00012026187928690629, "loss": 0.886, "step": 350 }, { "epoch": 0.5, "learning_rate": 0.00011784877884793031, "loss": 0.8876, "step": 355 }, { "epoch": 0.51, "learning_rate": 0.00011542489329653024, "loss": 0.8865, "step": 360 }, { "epoch": 0.51, "learning_rate": 0.00011299168726389448, "loss": 0.8861, "step": 365 }, { "epoch": 0.52, "learning_rate": 0.00011055063101310581, "loss": 0.874, "step": 370 }, { "epoch": 0.53, "learning_rate": 0.00010810319955073601, "loss": 0.8807, "step": 375 }, { "epoch": 0.53, "learning_rate": 0.00010565087173557395, "loss": 0.8913, "step": 380 }, { "epoch": 0.54, "learning_rate": 0.00010319512938502654, "loss": 0.8937, "step": 385 }, { "epoch": 0.55, "learning_rate": 0.00010073745637973124, "loss": 0.8738, "step": 390 }, { "epoch": 0.56, "learning_rate": 9.827933776692235e-05, "loss": 0.8868, "step": 395 }, { "epoch": 0.56, "learning_rate": 9.582225886309217e-05, "loss": 0.8901, "step": 400 }, { "epoch": 0.57, "learning_rate": 9.336770435648964e-05, "loss": 0.881, "step": 405 }, { "epoch": 0.58, "learning_rate": 9.091715740999828e-05, "loss": 0.8922, "step": 410 }, { "epoch": 0.58, "learning_rate": 8.84720987649363e-05, "loss": 0.8695, "step": 415 }, { "epoch": 0.59, "learning_rate": 8.60340058463194e-05, "loss": 0.8959, "step": 420 }, { "epoch": 0.6, "learning_rate": 8.360435187012788e-05, "loss": 0.8757, "step": 425 }, { "epoch": 0.6, "learning_rate": 8.118460495311686e-05, "loss": 0.8832, "step": 430 }, { "epoch": 0.61, "learning_rate": 7.877622722570771e-05, "loss": 0.8807, "step": 435 }, { "epoch": 0.62, "learning_rate": 7.638067394849671e-05, "loss": 0.8664, "step": 440 }, { "epoch": 0.63, "learning_rate": 7.399939263291493e-05, "loss": 0.8539, "step": 445 }, { "epoch": 0.63, "learning_rate": 7.163382216657034e-05, "loss": 0.8616, "step": 450 }, { "epoch": 0.64, "learning_rate": 6.928539194380102e-05, "loss": 0.8619, "step": 455 }, { "epoch": 0.65, "learning_rate": 6.695552100196452e-05, "loss": 0.8686, "step": 460 }, { "epoch": 0.65, "learning_rate": 6.464561716398565e-05, "loss": 0.8732, "step": 465 }, { "epoch": 0.66, "learning_rate": 6.235707618768032e-05, "loss": 0.8701, "step": 470 }, { "epoch": 0.67, "learning_rate": 6.009128092236983e-05, "loss": 0.8812, "step": 475 }, { "epoch": 0.67, "learning_rate": 5.784960047329519e-05, "loss": 0.8758, "step": 480 }, { "epoch": 0.68, "learning_rate": 5.563338937433622e-05, "loss": 0.877, "step": 485 }, { "epoch": 0.69, "learning_rate": 5.344398676953526e-05, "loss": 0.8754, "step": 490 }, { "epoch": 0.7, "learning_rate": 5.1282715603920374e-05, "loss": 0.8702, "step": 495 }, { "epoch": 0.7, "learning_rate": 4.915088182411675e-05, "loss": 0.8798, "step": 500 }, { "epoch": 0.71, "learning_rate": 4.7049773589229306e-05, "loss": 0.8813, "step": 505 }, { "epoch": 0.72, "learning_rate": 4.498066049247344e-05, "loss": 0.8857, "step": 510 }, { "epoch": 0.72, "learning_rate": 4.29447927940242e-05, "loss": 0.8769, "step": 515 }, { "epoch": 0.73, "learning_rate": 4.094340066554743e-05, "loss": 0.8794, "step": 520 }, { "epoch": 0.74, "learning_rate": 3.897769344686929e-05, "loss": 0.8766, "step": 525 }, { "epoch": 0.74, "learning_rate": 3.7048858915233664e-05, "loss": 0.8678, "step": 530 }, { "epoch": 0.75, "learning_rate": 3.515806256758847e-05, "loss": 0.8538, "step": 535 }, { "epoch": 0.76, "learning_rate": 3.330644691633492e-05, "loss": 0.8692, "step": 540 }, { "epoch": 0.77, "learning_rate": 3.149513079896521e-05, "loss": 0.8638, "step": 545 }, { "epoch": 0.77, "learning_rate": 2.9725208702005734e-05, "loss": 0.8612, "step": 550 }, { "epoch": 0.78, "learning_rate": 2.799775009967428e-05, "loss": 0.872, "step": 555 }, { "epoch": 0.79, "learning_rate": 2.631379880765107e-05, "loss": 0.8836, "step": 560 }, { "epoch": 0.79, "learning_rate": 2.4674372352353782e-05, "loss": 0.8773, "step": 565 }, { "epoch": 0.8, "learning_rate": 2.3080461356097937e-05, "loss": 0.8609, "step": 570 }, { "epoch": 0.81, "learning_rate": 2.1533028938514012e-05, "loss": 0.8571, "step": 575 }, { "epoch": 0.82, "learning_rate": 2.0033010134583086e-05, "loss": 0.8782, "step": 580 }, { "epoch": 0.82, "learning_rate": 1.858131132964259e-05, "loss": 0.878, "step": 585 }, { "epoch": 0.83, "learning_rate": 1.7178809711703523e-05, "loss": 0.8755, "step": 590 }, { "epoch": 0.84, "learning_rate": 1.5826352741410334e-05, "loss": 0.8753, "step": 595 }, { "epoch": 0.84, "learning_rate": 1.452475763996326e-05, "loss": 0.8657, "step": 600 }, { "epoch": 0.85, "learning_rate": 1.3274810895313083e-05, "loss": 0.8544, "step": 605 }, { "epoch": 0.86, "learning_rate": 1.207726778692625e-05, "loss": 0.8521, "step": 610 }, { "epoch": 0.86, "learning_rate": 1.0932851929407827e-05, "loss": 0.8825, "step": 615 }, { "epoch": 0.87, "learning_rate": 9.842254835257791e-06, "loss": 0.8695, "step": 620 }, { "epoch": 0.88, "learning_rate": 8.80613549702518e-06, "loss": 0.8845, "step": 625 }, { "epoch": 0.89, "learning_rate": 7.825119989112173e-06, "loss": 0.8821, "step": 630 }, { "epoch": 0.89, "learning_rate": 6.899801089469204e-06, "loss": 0.8722, "step": 635 }, { "epoch": 0.9, "learning_rate": 6.030737921409169e-06, "loss": 0.8653, "step": 640 }, { "epoch": 0.91, "learning_rate": 5.2184556157576e-06, "loss": 0.8717, "step": 645 }, { "epoch": 0.91, "learning_rate": 4.463444993542721e-06, "loss": 0.8811, "step": 650 }, { "epoch": 0.92, "learning_rate": 3.7661622694171394e-06, "loss": 0.8849, "step": 655 }, { "epoch": 0.93, "learning_rate": 3.127028775990515e-06, "loss": 0.8598, "step": 660 }, { "epoch": 0.93, "learning_rate": 2.546430709239578e-06, "loss": 0.8475, "step": 665 }, { "epoch": 0.94, "learning_rate": 2.02471889514948e-06, "loss": 0.8658, "step": 670 }, { "epoch": 0.95, "learning_rate": 1.562208577727442e-06, "loss": 0.8674, "step": 675 }, { "epoch": 0.96, "learning_rate": 1.1591792285167603e-06, "loss": 0.8774, "step": 680 }, { "epoch": 0.96, "learning_rate": 8.158743777263333e-07, "loss": 0.8786, "step": 685 }, { "epoch": 0.97, "learning_rate": 5.325014670776951e-07, "loss": 0.8766, "step": 690 }, { "epoch": 0.98, "learning_rate": 3.092317244584919e-07, "loss": 0.874, "step": 695 }, { "epoch": 0.98, "learning_rate": 1.4620006045816815e-07, "loss": 0.8683, "step": 700 }, { "epoch": 0.99, "learning_rate": 4.350498684829729e-08, "loss": 0.8548, "step": 705 }, { "epoch": 1.0, "learning_rate": 1.20855705696421e-09, "loss": 0.8799, "step": 710 }, { "epoch": 1.0, "eval_loss": 0.8712352514266968, "eval_runtime": 637.1016, "eval_samples_per_second": 0.995, "eval_steps_per_second": 0.126, "step": 711 }, { "epoch": 1.0, "step": 711, "total_flos": 9.02655718774014e+17, "train_loss": 0.9507392455421587, "train_runtime": 19724.4028, "train_samples_per_second": 0.288, "train_steps_per_second": 0.036 } ], "logging_steps": 5, "max_steps": 711, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 9.02655718774014e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }