|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9999209511896846, |
|
"eval_steps": 500, |
|
"global_step": 1680, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0005951910423748122, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 1.1904761904761904e-06, |
|
"loss": 1.0942, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0029759552118740614, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 5.9523809523809525e-06, |
|
"loss": 1.1063, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.005951910423748123, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 1.1904761904761905e-05, |
|
"loss": 1.0913, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.008927865635622183, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 1.785714285714286e-05, |
|
"loss": 1.0858, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.011903820847496246, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 2.380952380952381e-05, |
|
"loss": 1.0707, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.014879776059370306, |
|
"grad_norm": 0.2451171875, |
|
"learning_rate": 2.9761904761904762e-05, |
|
"loss": 1.0274, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.017855731271244367, |
|
"grad_norm": 0.171875, |
|
"learning_rate": 3.571428571428572e-05, |
|
"loss": 0.9783, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.02083168648311843, |
|
"grad_norm": 0.16796875, |
|
"learning_rate": 4.166666666666667e-05, |
|
"loss": 0.9564, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.02380764169499249, |
|
"grad_norm": 0.1328125, |
|
"learning_rate": 4.761904761904762e-05, |
|
"loss": 0.9325, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.02678359690686655, |
|
"grad_norm": 0.11767578125, |
|
"learning_rate": 5.3571428571428575e-05, |
|
"loss": 0.9242, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.029759552118740613, |
|
"grad_norm": 0.0859375, |
|
"learning_rate": 5.9523809523809524e-05, |
|
"loss": 0.8951, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.03273550733061467, |
|
"grad_norm": 0.0849609375, |
|
"learning_rate": 6.547619047619048e-05, |
|
"loss": 0.8703, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.035711462542488734, |
|
"grad_norm": 0.076171875, |
|
"learning_rate": 7.142857142857143e-05, |
|
"loss": 0.8513, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.038687417754362796, |
|
"grad_norm": 0.06494140625, |
|
"learning_rate": 7.738095238095239e-05, |
|
"loss": 0.8488, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.04166337296623686, |
|
"grad_norm": 0.06494140625, |
|
"learning_rate": 8.333333333333334e-05, |
|
"loss": 0.8218, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.04463932817811092, |
|
"grad_norm": 0.068359375, |
|
"learning_rate": 8.92857142857143e-05, |
|
"loss": 0.8321, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.04761528338998498, |
|
"grad_norm": 0.06640625, |
|
"learning_rate": 9.523809523809524e-05, |
|
"loss": 0.8271, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.050591238601859045, |
|
"grad_norm": 0.0537109375, |
|
"learning_rate": 0.0001011904761904762, |
|
"loss": 0.8097, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.0535671938137331, |
|
"grad_norm": 0.0546875, |
|
"learning_rate": 0.00010714285714285715, |
|
"loss": 0.7871, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.05654314902560716, |
|
"grad_norm": 0.050537109375, |
|
"learning_rate": 0.00011309523809523809, |
|
"loss": 0.8205, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.059519104237481225, |
|
"grad_norm": 0.051513671875, |
|
"learning_rate": 0.00011904761904761905, |
|
"loss": 0.8013, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.06249505944935529, |
|
"grad_norm": 0.059814453125, |
|
"learning_rate": 0.000125, |
|
"loss": 0.7996, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.06547101466122934, |
|
"grad_norm": 0.056396484375, |
|
"learning_rate": 0.00013095238095238096, |
|
"loss": 0.8194, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.0684469698731034, |
|
"grad_norm": 0.052734375, |
|
"learning_rate": 0.0001369047619047619, |
|
"loss": 0.7937, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.07142292508497747, |
|
"grad_norm": 0.0517578125, |
|
"learning_rate": 0.00014285714285714287, |
|
"loss": 0.7972, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.07439888029685153, |
|
"grad_norm": 0.056640625, |
|
"learning_rate": 0.00014880952380952382, |
|
"loss": 0.8139, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.07737483550872559, |
|
"grad_norm": 0.056640625, |
|
"learning_rate": 0.00015476190476190478, |
|
"loss": 0.7933, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.08035079072059965, |
|
"grad_norm": 0.05126953125, |
|
"learning_rate": 0.00016071428571428573, |
|
"loss": 0.7852, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.08332674593247372, |
|
"grad_norm": 0.048095703125, |
|
"learning_rate": 0.0001666666666666667, |
|
"loss": 0.8041, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.08630270114434778, |
|
"grad_norm": 0.055419921875, |
|
"learning_rate": 0.00017261904761904764, |
|
"loss": 0.8021, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.08927865635622184, |
|
"grad_norm": 0.0498046875, |
|
"learning_rate": 0.0001785714285714286, |
|
"loss": 0.7883, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.0922546115680959, |
|
"grad_norm": 0.0537109375, |
|
"learning_rate": 0.00018452380952380955, |
|
"loss": 0.8034, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.09523056677996997, |
|
"grad_norm": 0.05322265625, |
|
"learning_rate": 0.00019047619047619048, |
|
"loss": 0.7865, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.09820652199184403, |
|
"grad_norm": 0.0546875, |
|
"learning_rate": 0.00019642857142857144, |
|
"loss": 0.7945, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.10118247720371809, |
|
"grad_norm": 0.04833984375, |
|
"learning_rate": 0.0001999991365731819, |
|
"loss": 0.7968, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.10415843241559214, |
|
"grad_norm": 0.058349609375, |
|
"learning_rate": 0.00019998942319271077, |
|
"loss": 0.7942, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.1071343876274662, |
|
"grad_norm": 0.0517578125, |
|
"learning_rate": 0.00019996891820008164, |
|
"loss": 0.8003, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.11011034283934026, |
|
"grad_norm": 0.052734375, |
|
"learning_rate": 0.00019993762380834785, |
|
"loss": 0.7792, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.11308629805121433, |
|
"grad_norm": 0.05029296875, |
|
"learning_rate": 0.00019989554339503612, |
|
"loss": 0.7829, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.11606225326308839, |
|
"grad_norm": 0.05322265625, |
|
"learning_rate": 0.00019984268150178167, |
|
"loss": 0.7946, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.11903820847496245, |
|
"grad_norm": 0.04931640625, |
|
"learning_rate": 0.0001997790438338385, |
|
"loss": 0.7961, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.12201416368683651, |
|
"grad_norm": 0.053466796875, |
|
"learning_rate": 0.00019970463725946336, |
|
"loss": 0.7896, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.12499011889871058, |
|
"grad_norm": 0.05224609375, |
|
"learning_rate": 0.00019961946980917456, |
|
"loss": 0.782, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.12796607411058464, |
|
"grad_norm": 0.049072265625, |
|
"learning_rate": 0.00019952355067488523, |
|
"loss": 0.8018, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.1309420293224587, |
|
"grad_norm": 0.051513671875, |
|
"learning_rate": 0.0001994168902089112, |
|
"loss": 0.7943, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.13391798453433276, |
|
"grad_norm": 0.052001953125, |
|
"learning_rate": 0.00019929949992285396, |
|
"loss": 0.8005, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.1368939397462068, |
|
"grad_norm": 0.055908203125, |
|
"learning_rate": 0.00019917139248635786, |
|
"loss": 0.7776, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.1398698949580809, |
|
"grad_norm": 0.052734375, |
|
"learning_rate": 0.00019903258172574302, |
|
"loss": 0.7905, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.14284585016995494, |
|
"grad_norm": 0.051513671875, |
|
"learning_rate": 0.00019888308262251285, |
|
"loss": 0.7782, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.145821805381829, |
|
"grad_norm": 0.07958984375, |
|
"learning_rate": 0.00019872291131173742, |
|
"loss": 0.7798, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.14879776059370306, |
|
"grad_norm": 0.0615234375, |
|
"learning_rate": 0.0001985520850803117, |
|
"loss": 0.7842, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.15177371580557714, |
|
"grad_norm": 0.054443359375, |
|
"learning_rate": 0.00019837062236509014, |
|
"loss": 0.7778, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.15474967101745118, |
|
"grad_norm": 0.04833984375, |
|
"learning_rate": 0.0001981785427508966, |
|
"loss": 0.7813, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.15772562622932526, |
|
"grad_norm": 0.0546875, |
|
"learning_rate": 0.00019797586696841072, |
|
"loss": 0.7956, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.1607015814411993, |
|
"grad_norm": 0.052978515625, |
|
"learning_rate": 0.00019776261689193048, |
|
"loss": 0.8056, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.16367753665307336, |
|
"grad_norm": 0.0498046875, |
|
"learning_rate": 0.00019753881553701138, |
|
"loss": 0.7886, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.16665349186494743, |
|
"grad_norm": 0.0576171875, |
|
"learning_rate": 0.00019730448705798239, |
|
"loss": 0.8025, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.16962944707682148, |
|
"grad_norm": 0.05712890625, |
|
"learning_rate": 0.0001970596567453391, |
|
"loss": 0.7803, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.17260540228869556, |
|
"grad_norm": 0.049560546875, |
|
"learning_rate": 0.00019680435102301412, |
|
"loss": 0.7844, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.1755813575005696, |
|
"grad_norm": 0.052978515625, |
|
"learning_rate": 0.0001965385974455251, |
|
"loss": 0.797, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.17855731271244368, |
|
"grad_norm": 0.051513671875, |
|
"learning_rate": 0.0001962624246950012, |
|
"loss": 0.7774, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.18153326792431773, |
|
"grad_norm": 0.05029296875, |
|
"learning_rate": 0.00019597586257808712, |
|
"loss": 0.7756, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.1845092231361918, |
|
"grad_norm": 0.052734375, |
|
"learning_rate": 0.0001956789420227262, |
|
"loss": 0.7703, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.18748517834806586, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 0.0001953716950748227, |
|
"loss": 0.7901, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.19046113355993993, |
|
"grad_norm": 0.05029296875, |
|
"learning_rate": 0.0001950541548947829, |
|
"loss": 0.78, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.19343708877181398, |
|
"grad_norm": 0.052978515625, |
|
"learning_rate": 0.0001947263557539363, |
|
"loss": 0.767, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.19641304398368806, |
|
"grad_norm": 0.050048828125, |
|
"learning_rate": 0.00019438833303083678, |
|
"loss": 0.7963, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.1993889991955621, |
|
"grad_norm": 0.051025390625, |
|
"learning_rate": 0.00019404012320744417, |
|
"loss": 0.783, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.20236495440743618, |
|
"grad_norm": 0.049560546875, |
|
"learning_rate": 0.0001936817638651871, |
|
"loss": 0.7814, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.20534090961931023, |
|
"grad_norm": 0.05322265625, |
|
"learning_rate": 0.00019331329368090666, |
|
"loss": 0.7791, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.20831686483118428, |
|
"grad_norm": 0.04833984375, |
|
"learning_rate": 0.00019293475242268223, |
|
"loss": 0.7782, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.21129282004305835, |
|
"grad_norm": 0.049560546875, |
|
"learning_rate": 0.00019254618094553949, |
|
"loss": 0.7835, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.2142687752549324, |
|
"grad_norm": 0.048095703125, |
|
"learning_rate": 0.00019214762118704076, |
|
"loss": 0.7886, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.21724473046680648, |
|
"grad_norm": 0.048583984375, |
|
"learning_rate": 0.00019173911616275917, |
|
"loss": 0.7766, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.22022068567868053, |
|
"grad_norm": 0.0478515625, |
|
"learning_rate": 0.00019132070996163568, |
|
"loss": 0.7838, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.2231966408905546, |
|
"grad_norm": 0.0478515625, |
|
"learning_rate": 0.0001908924477412211, |
|
"loss": 0.7837, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.22617259610242865, |
|
"grad_norm": 0.044921875, |
|
"learning_rate": 0.00019045437572280194, |
|
"loss": 0.7961, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.22914855131430273, |
|
"grad_norm": 0.04638671875, |
|
"learning_rate": 0.00019000654118641211, |
|
"loss": 0.7903, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.23212450652617678, |
|
"grad_norm": 0.048828125, |
|
"learning_rate": 0.0001895489924657301, |
|
"loss": 0.7776, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.23510046173805085, |
|
"grad_norm": 0.05029296875, |
|
"learning_rate": 0.00018908177894286232, |
|
"loss": 0.7768, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.2380764169499249, |
|
"grad_norm": 0.0478515625, |
|
"learning_rate": 0.00018860495104301345, |
|
"loss": 0.7842, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.24105237216179898, |
|
"grad_norm": 0.0517578125, |
|
"learning_rate": 0.00018811856022904423, |
|
"loss": 0.7728, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.24402832737367303, |
|
"grad_norm": 0.050537109375, |
|
"learning_rate": 0.00018762265899591722, |
|
"loss": 0.7787, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.24700428258554707, |
|
"grad_norm": 0.05078125, |
|
"learning_rate": 0.000187117300865031, |
|
"loss": 0.7893, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.24998023779742115, |
|
"grad_norm": 0.05078125, |
|
"learning_rate": 0.00018660254037844388, |
|
"loss": 0.7846, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.2529561930092952, |
|
"grad_norm": 0.055908203125, |
|
"learning_rate": 0.00018607843309298723, |
|
"loss": 0.785, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.2559321482211693, |
|
"grad_norm": 0.052001953125, |
|
"learning_rate": 0.00018554503557426948, |
|
"loss": 0.7861, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.2589081034330433, |
|
"grad_norm": 0.052490234375, |
|
"learning_rate": 0.0001850024053905709, |
|
"loss": 0.7605, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.2618840586449174, |
|
"grad_norm": 0.051025390625, |
|
"learning_rate": 0.0001844506011066308, |
|
"loss": 0.7691, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.2648600138567915, |
|
"grad_norm": 0.05322265625, |
|
"learning_rate": 0.00018388968227732626, |
|
"loss": 0.7743, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.2678359690686655, |
|
"grad_norm": 0.047607421875, |
|
"learning_rate": 0.0001833197094412449, |
|
"loss": 0.7834, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.2708119242805396, |
|
"grad_norm": 0.050048828125, |
|
"learning_rate": 0.00018274074411415105, |
|
"loss": 0.7804, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.2737878794924136, |
|
"grad_norm": 0.051025390625, |
|
"learning_rate": 0.00018215284878234642, |
|
"loss": 0.7985, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.2767638347042877, |
|
"grad_norm": 0.050048828125, |
|
"learning_rate": 0.00018155608689592604, |
|
"loss": 0.7852, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.2797397899161618, |
|
"grad_norm": 0.048828125, |
|
"learning_rate": 0.0001809505228619304, |
|
"loss": 0.7697, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.2827157451280358, |
|
"grad_norm": 0.04736328125, |
|
"learning_rate": 0.0001803362220373942, |
|
"loss": 0.772, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.28569170033990987, |
|
"grad_norm": 0.05126953125, |
|
"learning_rate": 0.00017971325072229226, |
|
"loss": 0.7613, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.2886676555517839, |
|
"grad_norm": 0.04833984375, |
|
"learning_rate": 0.00017908167615238415, |
|
"loss": 0.7737, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.291643610763658, |
|
"grad_norm": 0.047607421875, |
|
"learning_rate": 0.00017844156649195759, |
|
"loss": 0.7813, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.29461956597553207, |
|
"grad_norm": 0.04931640625, |
|
"learning_rate": 0.00017779299082647148, |
|
"loss": 0.7709, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.2975955211874061, |
|
"grad_norm": 0.046875, |
|
"learning_rate": 0.0001771360191551, |
|
"loss": 0.7838, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.30057147639928017, |
|
"grad_norm": 0.050537109375, |
|
"learning_rate": 0.00017647072238317728, |
|
"loss": 0.7638, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.3035474316111543, |
|
"grad_norm": 0.04833984375, |
|
"learning_rate": 0.0001757971723145453, |
|
"loss": 0.7838, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.3065233868230283, |
|
"grad_norm": 0.05126953125, |
|
"learning_rate": 0.00017511544164380388, |
|
"loss": 0.7852, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.30949934203490237, |
|
"grad_norm": 0.0478515625, |
|
"learning_rate": 0.00017442560394846516, |
|
"loss": 0.7664, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.3124752972467764, |
|
"grad_norm": 0.05029296875, |
|
"learning_rate": 0.0001737277336810124, |
|
"loss": 0.7715, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.3154512524586505, |
|
"grad_norm": 0.05029296875, |
|
"learning_rate": 0.00017302190616086464, |
|
"loss": 0.7474, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.31842720767052457, |
|
"grad_norm": 0.046630859375, |
|
"learning_rate": 0.0001723081975662476, |
|
"loss": 0.7906, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.3214031628823986, |
|
"grad_norm": 0.05029296875, |
|
"learning_rate": 0.00017158668492597186, |
|
"loss": 0.7694, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.32437911809427267, |
|
"grad_norm": 0.0478515625, |
|
"learning_rate": 0.00017085744611111957, |
|
"loss": 0.7855, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.3273550733061467, |
|
"grad_norm": 0.049072265625, |
|
"learning_rate": 0.0001701205598266398, |
|
"loss": 0.7674, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.3303310285180208, |
|
"grad_norm": 0.048095703125, |
|
"learning_rate": 0.00016937610560285418, |
|
"loss": 0.7619, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.33330698372989487, |
|
"grad_norm": 0.0458984375, |
|
"learning_rate": 0.0001686241637868734, |
|
"loss": 0.7777, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.3362829389417689, |
|
"grad_norm": 0.052490234375, |
|
"learning_rate": 0.00016786481553392548, |
|
"loss": 0.7792, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.33925889415364296, |
|
"grad_norm": 0.048095703125, |
|
"learning_rate": 0.00016709814279859702, |
|
"loss": 0.7601, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.34223484936551707, |
|
"grad_norm": 0.050048828125, |
|
"learning_rate": 0.00016632422832598795, |
|
"loss": 0.7817, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.3452108045773911, |
|
"grad_norm": 0.04931640625, |
|
"learning_rate": 0.000165543155642781, |
|
"loss": 0.7905, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.34818675978926517, |
|
"grad_norm": 0.04736328125, |
|
"learning_rate": 0.00016475500904822706, |
|
"loss": 0.7581, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.3511627150011392, |
|
"grad_norm": 0.049072265625, |
|
"learning_rate": 0.00016395987360504668, |
|
"loss": 0.7693, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.3541386702130133, |
|
"grad_norm": 0.051513671875, |
|
"learning_rate": 0.00016315783513024977, |
|
"loss": 0.7861, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.35711462542488737, |
|
"grad_norm": 0.049072265625, |
|
"learning_rate": 0.00016234898018587337, |
|
"loss": 0.7765, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.3600905806367614, |
|
"grad_norm": 0.0517578125, |
|
"learning_rate": 0.0001615333960696393, |
|
"loss": 0.7682, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.36306653584863546, |
|
"grad_norm": 0.05029296875, |
|
"learning_rate": 0.00016071117080553236, |
|
"loss": 0.7768, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.3660424910605095, |
|
"grad_norm": 0.048095703125, |
|
"learning_rate": 0.00015988239313430004, |
|
"loss": 0.7684, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.3690184462723836, |
|
"grad_norm": 0.047119140625, |
|
"learning_rate": 0.00015904715250387498, |
|
"loss": 0.7677, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.37199440148425766, |
|
"grad_norm": 0.0478515625, |
|
"learning_rate": 0.0001582055390597212, |
|
"loss": 0.7866, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.3749703566961317, |
|
"grad_norm": 0.04736328125, |
|
"learning_rate": 0.0001573576436351046, |
|
"loss": 0.7753, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.37794631190800576, |
|
"grad_norm": 0.049072265625, |
|
"learning_rate": 0.00015650355774129, |
|
"loss": 0.7854, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.38092226711987986, |
|
"grad_norm": 0.05078125, |
|
"learning_rate": 0.00015564337355766412, |
|
"loss": 0.7639, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.3838982223317539, |
|
"grad_norm": 0.05029296875, |
|
"learning_rate": 0.00015477718392178716, |
|
"loss": 0.7574, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.38687417754362796, |
|
"grad_norm": 0.0458984375, |
|
"learning_rate": 0.00015390508231937297, |
|
"loss": 0.7663, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.389850132755502, |
|
"grad_norm": 0.0478515625, |
|
"learning_rate": 0.00015302716287419945, |
|
"loss": 0.7746, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.3928260879673761, |
|
"grad_norm": 0.052001953125, |
|
"learning_rate": 0.0001521435203379498, |
|
"loss": 0.7617, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.39580204317925016, |
|
"grad_norm": 0.04736328125, |
|
"learning_rate": 0.00015125425007998653, |
|
"loss": 0.7993, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.3987779983911242, |
|
"grad_norm": 0.048583984375, |
|
"learning_rate": 0.0001503594480770581, |
|
"loss": 0.7887, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.40175395360299826, |
|
"grad_norm": 0.04931640625, |
|
"learning_rate": 0.00014945921090294076, |
|
"loss": 0.7648, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.40472990881487236, |
|
"grad_norm": 0.04638671875, |
|
"learning_rate": 0.00014855363571801523, |
|
"loss": 0.7811, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.4077058640267464, |
|
"grad_norm": 0.04833984375, |
|
"learning_rate": 0.00014764282025878068, |
|
"loss": 0.7674, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 0.41068181923862046, |
|
"grad_norm": 0.0498046875, |
|
"learning_rate": 0.0001467268628273062, |
|
"loss": 0.7684, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.4136577744504945, |
|
"grad_norm": 0.050048828125, |
|
"learning_rate": 0.00014580586228062122, |
|
"loss": 0.7957, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 0.41663372966236856, |
|
"grad_norm": 0.050537109375, |
|
"learning_rate": 0.00014487991802004623, |
|
"loss": 0.7778, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.41960968487424266, |
|
"grad_norm": 0.056396484375, |
|
"learning_rate": 0.0001439491299804645, |
|
"loss": 0.7593, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 0.4225856400861167, |
|
"grad_norm": 0.0498046875, |
|
"learning_rate": 0.0001430135986195365, |
|
"loss": 0.7576, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.42556159529799076, |
|
"grad_norm": 0.057373046875, |
|
"learning_rate": 0.00014207342490685774, |
|
"loss": 0.7772, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 0.4285375505098648, |
|
"grad_norm": 0.05078125, |
|
"learning_rate": 0.00014112871031306119, |
|
"loss": 0.7811, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.4315135057217389, |
|
"grad_norm": 0.0478515625, |
|
"learning_rate": 0.00014017955679886598, |
|
"loss": 0.7644, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.43448946093361296, |
|
"grad_norm": 0.05029296875, |
|
"learning_rate": 0.00013922606680407307, |
|
"loss": 0.7757, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.437465416145487, |
|
"grad_norm": 0.050048828125, |
|
"learning_rate": 0.000138268343236509, |
|
"loss": 0.7764, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 0.44044137135736106, |
|
"grad_norm": 0.05029296875, |
|
"learning_rate": 0.0001373064894609194, |
|
"loss": 0.7784, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.44341732656923516, |
|
"grad_norm": 0.04833984375, |
|
"learning_rate": 0.0001363406092878131, |
|
"loss": 0.7663, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 0.4463932817811092, |
|
"grad_norm": 0.04736328125, |
|
"learning_rate": 0.00013537080696225814, |
|
"loss": 0.7828, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.44936923699298326, |
|
"grad_norm": 0.0517578125, |
|
"learning_rate": 0.0001343971871526307, |
|
"loss": 0.7764, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 0.4523451922048573, |
|
"grad_norm": 0.053466796875, |
|
"learning_rate": 0.00013341985493931877, |
|
"loss": 0.7753, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.45532114741673135, |
|
"grad_norm": 0.0498046875, |
|
"learning_rate": 0.00013243891580338072, |
|
"loss": 0.7663, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 0.45829710262860546, |
|
"grad_norm": 0.052001953125, |
|
"learning_rate": 0.00013145447561516138, |
|
"loss": 0.7801, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.4612730578404795, |
|
"grad_norm": 0.053466796875, |
|
"learning_rate": 0.00013046664062286545, |
|
"loss": 0.7917, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.46424901305235355, |
|
"grad_norm": 0.051513671875, |
|
"learning_rate": 0.00012947551744109043, |
|
"loss": 0.7751, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.4672249682642276, |
|
"grad_norm": 0.049560546875, |
|
"learning_rate": 0.00012848121303932013, |
|
"loss": 0.7585, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 0.4702009234761017, |
|
"grad_norm": 0.04736328125, |
|
"learning_rate": 0.00012748383473037948, |
|
"loss": 0.7888, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.47317687868797575, |
|
"grad_norm": 0.0537109375, |
|
"learning_rate": 0.00012648349015885273, |
|
"loss": 0.7766, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 0.4761528338998498, |
|
"grad_norm": 0.052490234375, |
|
"learning_rate": 0.0001254802872894655, |
|
"loss": 0.7707, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.47912878911172385, |
|
"grad_norm": 0.04931640625, |
|
"learning_rate": 0.0001244743343954324, |
|
"loss": 0.7678, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 0.48210474432359796, |
|
"grad_norm": 0.048828125, |
|
"learning_rate": 0.00012346574004677154, |
|
"loss": 0.7477, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.485080699535472, |
|
"grad_norm": 0.0498046875, |
|
"learning_rate": 0.0001224546130985867, |
|
"loss": 0.7758, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 0.48805665474734605, |
|
"grad_norm": 0.047119140625, |
|
"learning_rate": 0.00012144106267931876, |
|
"loss": 0.7797, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.4910326099592201, |
|
"grad_norm": 0.046142578125, |
|
"learning_rate": 0.00012042519817896804, |
|
"loss": 0.764, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.49400856517109415, |
|
"grad_norm": 0.048583984375, |
|
"learning_rate": 0.00011940712923728783, |
|
"loss": 0.7602, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.49698452038296825, |
|
"grad_norm": 0.048583984375, |
|
"learning_rate": 0.00011838696573195139, |
|
"loss": 0.7631, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 0.4999604755948423, |
|
"grad_norm": 0.051513671875, |
|
"learning_rate": 0.00011736481776669306, |
|
"loss": 0.7793, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.5029364308067164, |
|
"grad_norm": 0.048828125, |
|
"learning_rate": 0.00011634079565942497, |
|
"loss": 0.7658, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 0.5059123860185905, |
|
"grad_norm": 0.04931640625, |
|
"learning_rate": 0.00011531500993033093, |
|
"loss": 0.7499, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.5088883412304644, |
|
"grad_norm": 0.047119140625, |
|
"learning_rate": 0.00011428757128993802, |
|
"loss": 0.7706, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 0.5118642964423386, |
|
"grad_norm": 0.05029296875, |
|
"learning_rate": 0.00011325859062716795, |
|
"loss": 0.7757, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.5148402516542127, |
|
"grad_norm": 0.0478515625, |
|
"learning_rate": 0.00011222817899736914, |
|
"loss": 0.7828, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 0.5178162068660866, |
|
"grad_norm": 0.048095703125, |
|
"learning_rate": 0.00011119644761033078, |
|
"loss": 0.7687, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.5207921620779608, |
|
"grad_norm": 0.05029296875, |
|
"learning_rate": 0.00011016350781828019, |
|
"loss": 0.775, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.5237681172898347, |
|
"grad_norm": 0.05078125, |
|
"learning_rate": 0.00010912947110386484, |
|
"loss": 0.7664, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.5267440725017088, |
|
"grad_norm": 0.050048828125, |
|
"learning_rate": 0.00010809444906812033, |
|
"loss": 0.7962, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 0.529720027713583, |
|
"grad_norm": 0.048583984375, |
|
"learning_rate": 0.00010705855341842563, |
|
"loss": 0.7806, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.532695982925457, |
|
"grad_norm": 0.046875, |
|
"learning_rate": 0.0001060218959564466, |
|
"loss": 0.7626, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 0.535671938137331, |
|
"grad_norm": 0.04931640625, |
|
"learning_rate": 0.00010498458856606972, |
|
"loss": 0.7747, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.538647893349205, |
|
"grad_norm": 0.049072265625, |
|
"learning_rate": 0.00010394674320132662, |
|
"loss": 0.7813, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 0.5416238485610791, |
|
"grad_norm": 0.0498046875, |
|
"learning_rate": 0.00010290847187431113, |
|
"loss": 0.7913, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.5445998037729533, |
|
"grad_norm": 0.05029296875, |
|
"learning_rate": 0.00010186988664309023, |
|
"loss": 0.7648, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 0.5475757589848272, |
|
"grad_norm": 0.04833984375, |
|
"learning_rate": 0.00010083109959960973, |
|
"loss": 0.7837, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.5505517141967013, |
|
"grad_norm": 0.04638671875, |
|
"learning_rate": 9.979222285759651e-05, |
|
"loss": 0.7756, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 0.5535276694085755, |
|
"grad_norm": 0.045166015625, |
|
"learning_rate": 9.875336854045851e-05, |
|
"loss": 0.7512, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.5565036246204494, |
|
"grad_norm": 0.04833984375, |
|
"learning_rate": 9.771464876918331e-05, |
|
"loss": 0.7696, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 0.5594795798323235, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 9.667617565023735e-05, |
|
"loss": 0.7855, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.5624555350441975, |
|
"grad_norm": 0.061279296875, |
|
"learning_rate": 9.563806126346642e-05, |
|
"loss": 0.7613, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 0.5654314902560716, |
|
"grad_norm": 0.049072265625, |
|
"learning_rate": 9.460041764999928e-05, |
|
"loss": 0.7782, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.5684074454679457, |
|
"grad_norm": 0.05029296875, |
|
"learning_rate": 9.356335680015534e-05, |
|
"loss": 0.7781, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 0.5713834006798197, |
|
"grad_norm": 0.0498046875, |
|
"learning_rate": 9.252699064135758e-05, |
|
"loss": 0.7953, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.5743593558916938, |
|
"grad_norm": 0.049560546875, |
|
"learning_rate": 9.149143102605294e-05, |
|
"loss": 0.7649, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 0.5773353111035678, |
|
"grad_norm": 0.048583984375, |
|
"learning_rate": 9.045678971963988e-05, |
|
"loss": 0.7635, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.5803112663154419, |
|
"grad_norm": 0.050048828125, |
|
"learning_rate": 8.942317838840623e-05, |
|
"loss": 0.7689, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 0.583287221527316, |
|
"grad_norm": 0.0517578125, |
|
"learning_rate": 8.839070858747697e-05, |
|
"loss": 0.7869, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.58626317673919, |
|
"grad_norm": 0.0498046875, |
|
"learning_rate": 8.735949174877466e-05, |
|
"loss": 0.7606, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 0.5892391319510641, |
|
"grad_norm": 0.0498046875, |
|
"learning_rate": 8.632963916899268e-05, |
|
"loss": 0.7665, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.5922150871629382, |
|
"grad_norm": 0.04931640625, |
|
"learning_rate": 8.530126199758323e-05, |
|
"loss": 0.7967, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 0.5951910423748122, |
|
"grad_norm": 0.052001953125, |
|
"learning_rate": 8.427447122476148e-05, |
|
"loss": 0.7784, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.5981669975866863, |
|
"grad_norm": 0.048583984375, |
|
"learning_rate": 8.324937766952638e-05, |
|
"loss": 0.7651, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 0.6011429527985603, |
|
"grad_norm": 0.05810546875, |
|
"learning_rate": 8.222609196770036e-05, |
|
"loss": 0.7773, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.6041189080104344, |
|
"grad_norm": 0.05224609375, |
|
"learning_rate": 8.120472455998882e-05, |
|
"loss": 0.7509, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 0.6070948632223085, |
|
"grad_norm": 0.048828125, |
|
"learning_rate": 8.018538568006027e-05, |
|
"loss": 0.7623, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.6100708184341825, |
|
"grad_norm": 0.0693359375, |
|
"learning_rate": 7.916818534264921e-05, |
|
"loss": 0.7492, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 0.6130467736460566, |
|
"grad_norm": 0.0478515625, |
|
"learning_rate": 7.815323333168262e-05, |
|
"loss": 0.7612, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.6160227288579306, |
|
"grad_norm": 0.0556640625, |
|
"learning_rate": 7.714063918843106e-05, |
|
"loss": 0.7799, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 0.6189986840698047, |
|
"grad_norm": 0.049072265625, |
|
"learning_rate": 7.613051219968623e-05, |
|
"loss": 0.7611, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.6219746392816788, |
|
"grad_norm": 0.05029296875, |
|
"learning_rate": 7.512296138596601e-05, |
|
"loss": 0.7782, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 0.6249505944935528, |
|
"grad_norm": 0.048095703125, |
|
"learning_rate": 7.411809548974792e-05, |
|
"loss": 0.7671, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.6279265497054269, |
|
"grad_norm": 0.05078125, |
|
"learning_rate": 7.31160229637331e-05, |
|
"loss": 0.7753, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 0.630902504917301, |
|
"grad_norm": 0.05078125, |
|
"learning_rate": 7.211685195914097e-05, |
|
"loss": 0.7663, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.633878460129175, |
|
"grad_norm": 0.055419921875, |
|
"learning_rate": 7.112069031403704e-05, |
|
"loss": 0.7702, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 0.6368544153410491, |
|
"grad_norm": 0.0517578125, |
|
"learning_rate": 7.012764554169393e-05, |
|
"loss": 0.7724, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.6398303705529231, |
|
"grad_norm": 0.0478515625, |
|
"learning_rate": 6.913782481898789e-05, |
|
"loss": 0.7607, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 0.6428063257647972, |
|
"grad_norm": 0.049560546875, |
|
"learning_rate": 6.815133497483157e-05, |
|
"loss": 0.7879, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.6457822809766713, |
|
"grad_norm": 0.04931640625, |
|
"learning_rate": 6.71682824786439e-05, |
|
"loss": 0.7736, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 0.6487582361885453, |
|
"grad_norm": 0.049560546875, |
|
"learning_rate": 6.618877342885945e-05, |
|
"loss": 0.7452, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.6517341914004194, |
|
"grad_norm": 0.05126953125, |
|
"learning_rate": 6.521291354147727e-05, |
|
"loss": 0.7774, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 0.6547101466122934, |
|
"grad_norm": 0.0498046875, |
|
"learning_rate": 6.424080813865138e-05, |
|
"loss": 0.7825, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.6576861018241675, |
|
"grad_norm": 0.04736328125, |
|
"learning_rate": 6.327256213732344e-05, |
|
"loss": 0.7568, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 0.6606620570360416, |
|
"grad_norm": 0.048583984375, |
|
"learning_rate": 6.230828003789949e-05, |
|
"loss": 0.7697, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.6636380122479156, |
|
"grad_norm": 0.052490234375, |
|
"learning_rate": 6.134806591297133e-05, |
|
"loss": 0.7536, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 0.6666139674597897, |
|
"grad_norm": 0.04931640625, |
|
"learning_rate": 6.039202339608432e-05, |
|
"loss": 0.7799, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.6695899226716638, |
|
"grad_norm": 0.049560546875, |
|
"learning_rate": 5.944025567055251e-05, |
|
"loss": 0.7605, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 0.6725658778835378, |
|
"grad_norm": 0.051025390625, |
|
"learning_rate": 5.849286545832211e-05, |
|
"loss": 0.7747, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.6755418330954119, |
|
"grad_norm": 0.05078125, |
|
"learning_rate": 5.7549955008885294e-05, |
|
"loss": 0.7746, |
|
"step": 1135 |
|
}, |
|
{ |
|
"epoch": 0.6785177883072859, |
|
"grad_norm": 0.04833984375, |
|
"learning_rate": 5.6611626088244194e-05, |
|
"loss": 0.7533, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.68149374351916, |
|
"grad_norm": 0.049072265625, |
|
"learning_rate": 5.567797996792813e-05, |
|
"loss": 0.7644, |
|
"step": 1145 |
|
}, |
|
{ |
|
"epoch": 0.6844696987310341, |
|
"grad_norm": 0.048828125, |
|
"learning_rate": 5.47491174140631e-05, |
|
"loss": 0.7807, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.6874456539429081, |
|
"grad_norm": 0.047607421875, |
|
"learning_rate": 5.382513867649663e-05, |
|
"loss": 0.7398, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 0.6904216091547822, |
|
"grad_norm": 0.055908203125, |
|
"learning_rate": 5.290614347797802e-05, |
|
"loss": 0.8002, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.6933975643666562, |
|
"grad_norm": 0.047119140625, |
|
"learning_rate": 5.199223100339539e-05, |
|
"loss": 0.76, |
|
"step": 1165 |
|
}, |
|
{ |
|
"epoch": 0.6963735195785303, |
|
"grad_norm": 0.050048828125, |
|
"learning_rate": 5.108349988907111e-05, |
|
"loss": 0.7712, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.6993494747904044, |
|
"grad_norm": 0.04931640625, |
|
"learning_rate": 5.0180048212115924e-05, |
|
"loss": 0.781, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 0.7023254300022784, |
|
"grad_norm": 0.0478515625, |
|
"learning_rate": 4.92819734798441e-05, |
|
"loss": 0.7758, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.7053013852141525, |
|
"grad_norm": 0.048828125, |
|
"learning_rate": 4.8389372619249326e-05, |
|
"loss": 0.7727, |
|
"step": 1185 |
|
}, |
|
{ |
|
"epoch": 0.7082773404260266, |
|
"grad_norm": 0.048095703125, |
|
"learning_rate": 4.7502341966544e-05, |
|
"loss": 0.7605, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.7112532956379006, |
|
"grad_norm": 0.051513671875, |
|
"learning_rate": 4.6620977256761514e-05, |
|
"loss": 0.7649, |
|
"step": 1195 |
|
}, |
|
{ |
|
"epoch": 0.7142292508497747, |
|
"grad_norm": 0.0478515625, |
|
"learning_rate": 4.574537361342407e-05, |
|
"loss": 0.7786, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.7172052060616487, |
|
"grad_norm": 0.04931640625, |
|
"learning_rate": 4.487562553827622e-05, |
|
"loss": 0.753, |
|
"step": 1205 |
|
}, |
|
{ |
|
"epoch": 0.7201811612735228, |
|
"grad_norm": 0.048583984375, |
|
"learning_rate": 4.401182690108534e-05, |
|
"loss": 0.7849, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.7231571164853969, |
|
"grad_norm": 0.052978515625, |
|
"learning_rate": 4.315407092951078e-05, |
|
"loss": 0.7742, |
|
"step": 1215 |
|
}, |
|
{ |
|
"epoch": 0.7261330716972709, |
|
"grad_norm": 0.05224609375, |
|
"learning_rate": 4.23024501990417e-05, |
|
"loss": 0.7956, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.729109026909145, |
|
"grad_norm": 0.0498046875, |
|
"learning_rate": 4.145705662300595e-05, |
|
"loss": 0.7837, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 0.732084982121019, |
|
"grad_norm": 0.05029296875, |
|
"learning_rate": 4.0617981442649855e-05, |
|
"loss": 0.7597, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.7350609373328931, |
|
"grad_norm": 0.048583984375, |
|
"learning_rate": 3.978531521729084e-05, |
|
"loss": 0.7705, |
|
"step": 1235 |
|
}, |
|
{ |
|
"epoch": 0.7380368925447672, |
|
"grad_norm": 0.049560546875, |
|
"learning_rate": 3.89591478145437e-05, |
|
"loss": 0.7775, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.7410128477566412, |
|
"grad_norm": 0.05078125, |
|
"learning_rate": 3.813956840062118e-05, |
|
"loss": 0.7534, |
|
"step": 1245 |
|
}, |
|
{ |
|
"epoch": 0.7439888029685153, |
|
"grad_norm": 0.04833984375, |
|
"learning_rate": 3.732666543071079e-05, |
|
"loss": 0.7725, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.7469647581803894, |
|
"grad_norm": 0.047119140625, |
|
"learning_rate": 3.652052663942769e-05, |
|
"loss": 0.7661, |
|
"step": 1255 |
|
}, |
|
{ |
|
"epoch": 0.7499407133922634, |
|
"grad_norm": 0.05078125, |
|
"learning_rate": 3.5721239031346066e-05, |
|
"loss": 0.7778, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.7529166686041375, |
|
"grad_norm": 0.048095703125, |
|
"learning_rate": 3.492888887160866e-05, |
|
"loss": 0.7704, |
|
"step": 1265 |
|
}, |
|
{ |
|
"epoch": 0.7558926238160115, |
|
"grad_norm": 0.04736328125, |
|
"learning_rate": 3.4143561676616575e-05, |
|
"loss": 0.7785, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.7588685790278856, |
|
"grad_norm": 0.05615234375, |
|
"learning_rate": 3.336534220479961e-05, |
|
"loss": 0.7834, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 0.7618445342397597, |
|
"grad_norm": 0.051513671875, |
|
"learning_rate": 3.259431444746846e-05, |
|
"loss": 0.7778, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.7648204894516337, |
|
"grad_norm": 0.053955078125, |
|
"learning_rate": 3.1830561619749863e-05, |
|
"loss": 0.7658, |
|
"step": 1285 |
|
}, |
|
{ |
|
"epoch": 0.7677964446635078, |
|
"grad_norm": 0.05029296875, |
|
"learning_rate": 3.10741661516053e-05, |
|
"loss": 0.7822, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.7707723998753819, |
|
"grad_norm": 0.049560546875, |
|
"learning_rate": 3.032520967893453e-05, |
|
"loss": 0.7763, |
|
"step": 1295 |
|
}, |
|
{ |
|
"epoch": 0.7737483550872559, |
|
"grad_norm": 0.0498046875, |
|
"learning_rate": 2.9583773034764826e-05, |
|
"loss": 0.7855, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.77672431029913, |
|
"grad_norm": 0.049560546875, |
|
"learning_rate": 2.8849936240527008e-05, |
|
"loss": 0.754, |
|
"step": 1305 |
|
}, |
|
{ |
|
"epoch": 0.779700265511004, |
|
"grad_norm": 0.047607421875, |
|
"learning_rate": 2.8123778497418685e-05, |
|
"loss": 0.772, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.7826762207228781, |
|
"grad_norm": 0.04736328125, |
|
"learning_rate": 2.74053781778565e-05, |
|
"loss": 0.7646, |
|
"step": 1315 |
|
}, |
|
{ |
|
"epoch": 0.7856521759347522, |
|
"grad_norm": 0.050048828125, |
|
"learning_rate": 2.669481281701739e-05, |
|
"loss": 0.7848, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.7886281311466262, |
|
"grad_norm": 0.048583984375, |
|
"learning_rate": 2.5992159104470526e-05, |
|
"loss": 0.7679, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 0.7916040863585003, |
|
"grad_norm": 0.050537109375, |
|
"learning_rate": 2.529749287590042e-05, |
|
"loss": 0.7532, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.7945800415703743, |
|
"grad_norm": 0.048095703125, |
|
"learning_rate": 2.461088910492202e-05, |
|
"loss": 0.77, |
|
"step": 1335 |
|
}, |
|
{ |
|
"epoch": 0.7975559967822484, |
|
"grad_norm": 0.0478515625, |
|
"learning_rate": 2.3932421894989167e-05, |
|
"loss": 0.7768, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.8005319519941225, |
|
"grad_norm": 0.050048828125, |
|
"learning_rate": 2.326216447139663e-05, |
|
"loss": 0.7543, |
|
"step": 1345 |
|
}, |
|
{ |
|
"epoch": 0.8035079072059965, |
|
"grad_norm": 0.050048828125, |
|
"learning_rate": 2.260018917337726e-05, |
|
"loss": 0.7606, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.8064838624178706, |
|
"grad_norm": 0.047607421875, |
|
"learning_rate": 2.194656744629442e-05, |
|
"loss": 0.7629, |
|
"step": 1355 |
|
}, |
|
{ |
|
"epoch": 0.8094598176297447, |
|
"grad_norm": 0.049560546875, |
|
"learning_rate": 2.1301369833931117e-05, |
|
"loss": 0.7619, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.8124357728416187, |
|
"grad_norm": 0.04931640625, |
|
"learning_rate": 2.0664665970876496e-05, |
|
"loss": 0.7534, |
|
"step": 1365 |
|
}, |
|
{ |
|
"epoch": 0.8154117280534928, |
|
"grad_norm": 0.048828125, |
|
"learning_rate": 2.0036524575010172e-05, |
|
"loss": 0.7765, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.8183876832653668, |
|
"grad_norm": 0.052001953125, |
|
"learning_rate": 1.9417013440085864e-05, |
|
"loss": 0.7622, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 0.8213636384772409, |
|
"grad_norm": 0.04833984375, |
|
"learning_rate": 1.880619942841435e-05, |
|
"loss": 0.7795, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.824339593689115, |
|
"grad_norm": 0.049072265625, |
|
"learning_rate": 1.8204148463647453e-05, |
|
"loss": 0.7599, |
|
"step": 1385 |
|
}, |
|
{ |
|
"epoch": 0.827315548900989, |
|
"grad_norm": 0.052734375, |
|
"learning_rate": 1.7610925523662835e-05, |
|
"loss": 0.7617, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.8302915041128631, |
|
"grad_norm": 0.047607421875, |
|
"learning_rate": 1.702659463355125e-05, |
|
"loss": 0.7608, |
|
"step": 1395 |
|
}, |
|
{ |
|
"epoch": 0.8332674593247371, |
|
"grad_norm": 0.0478515625, |
|
"learning_rate": 1.6451218858706374e-05, |
|
"loss": 0.782, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.8362434145366112, |
|
"grad_norm": 0.049072265625, |
|
"learning_rate": 1.5884860298018322e-05, |
|
"loss": 0.7622, |
|
"step": 1405 |
|
}, |
|
{ |
|
"epoch": 0.8392193697484853, |
|
"grad_norm": 0.04736328125, |
|
"learning_rate": 1.5327580077171587e-05, |
|
"loss": 0.771, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.8421953249603593, |
|
"grad_norm": 0.047119140625, |
|
"learning_rate": 1.4779438342047713e-05, |
|
"loss": 0.7684, |
|
"step": 1415 |
|
}, |
|
{ |
|
"epoch": 0.8451712801722334, |
|
"grad_norm": 0.048583984375, |
|
"learning_rate": 1.4240494252234049e-05, |
|
"loss": 0.7886, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.8481472353841075, |
|
"grad_norm": 0.050537109375, |
|
"learning_rate": 1.3710805974638696e-05, |
|
"loss": 0.7813, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 0.8511231905959815, |
|
"grad_norm": 0.052978515625, |
|
"learning_rate": 1.3190430677212794e-05, |
|
"loss": 0.7557, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.8540991458078556, |
|
"grad_norm": 0.048095703125, |
|
"learning_rate": 1.2679424522780426e-05, |
|
"loss": 0.7487, |
|
"step": 1435 |
|
}, |
|
{ |
|
"epoch": 0.8570751010197296, |
|
"grad_norm": 0.049560546875, |
|
"learning_rate": 1.2177842662977135e-05, |
|
"loss": 0.777, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.8600510562316037, |
|
"grad_norm": 0.050048828125, |
|
"learning_rate": 1.1685739232297643e-05, |
|
"loss": 0.7724, |
|
"step": 1445 |
|
}, |
|
{ |
|
"epoch": 0.8630270114434778, |
|
"grad_norm": 0.050537109375, |
|
"learning_rate": 1.1203167342253062e-05, |
|
"loss": 0.7689, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.8660029666553518, |
|
"grad_norm": 0.054443359375, |
|
"learning_rate": 1.0730179075638868e-05, |
|
"loss": 0.7415, |
|
"step": 1455 |
|
}, |
|
{ |
|
"epoch": 0.8689789218672259, |
|
"grad_norm": 0.049072265625, |
|
"learning_rate": 1.0266825480913611e-05, |
|
"loss": 0.7772, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.8719548770790999, |
|
"grad_norm": 0.04833984375, |
|
"learning_rate": 9.813156566689518e-06, |
|
"loss": 0.7715, |
|
"step": 1465 |
|
}, |
|
{ |
|
"epoch": 0.874930832290974, |
|
"grad_norm": 0.04931640625, |
|
"learning_rate": 9.369221296335006e-06, |
|
"loss": 0.7646, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.8779067875028481, |
|
"grad_norm": 0.048583984375, |
|
"learning_rate": 8.935067582690382e-06, |
|
"loss": 0.7446, |
|
"step": 1475 |
|
}, |
|
{ |
|
"epoch": 0.8808827427147221, |
|
"grad_norm": 0.0478515625, |
|
"learning_rate": 8.510742282896544e-06, |
|
"loss": 0.7768, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.8838586979265962, |
|
"grad_norm": 0.047607421875, |
|
"learning_rate": 8.096291193337934e-06, |
|
"loss": 0.7535, |
|
"step": 1485 |
|
}, |
|
{ |
|
"epoch": 0.8868346531384703, |
|
"grad_norm": 0.048828125, |
|
"learning_rate": 7.69175904469982e-06, |
|
"loss": 0.7691, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.8898106083503443, |
|
"grad_norm": 0.050537109375, |
|
"learning_rate": 7.2971894971405665e-06, |
|
"loss": 0.7562, |
|
"step": 1495 |
|
}, |
|
{ |
|
"epoch": 0.8927865635622184, |
|
"grad_norm": 0.05224609375, |
|
"learning_rate": 6.9126251355795864e-06, |
|
"loss": 0.7574, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.8957625187740924, |
|
"grad_norm": 0.04931640625, |
|
"learning_rate": 6.538107465101162e-06, |
|
"loss": 0.7733, |
|
"step": 1505 |
|
}, |
|
{ |
|
"epoch": 0.8987384739859665, |
|
"grad_norm": 0.04736328125, |
|
"learning_rate": 6.173676906475012e-06, |
|
"loss": 0.7656, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.9017144291978406, |
|
"grad_norm": 0.04931640625, |
|
"learning_rate": 5.8193727917936536e-06, |
|
"loss": 0.7604, |
|
"step": 1515 |
|
}, |
|
{ |
|
"epoch": 0.9046903844097146, |
|
"grad_norm": 0.04931640625, |
|
"learning_rate": 5.475233360227516e-06, |
|
"loss": 0.7688, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.9076663396215887, |
|
"grad_norm": 0.0478515625, |
|
"learning_rate": 5.14129575389779e-06, |
|
"loss": 0.7556, |
|
"step": 1525 |
|
}, |
|
{ |
|
"epoch": 0.9106422948334627, |
|
"grad_norm": 0.0537109375, |
|
"learning_rate": 4.817596013867764e-06, |
|
"loss": 0.7765, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.9136182500453368, |
|
"grad_norm": 0.0517578125, |
|
"learning_rate": 4.504169076253084e-06, |
|
"loss": 0.7788, |
|
"step": 1535 |
|
}, |
|
{ |
|
"epoch": 0.9165942052572109, |
|
"grad_norm": 0.04931640625, |
|
"learning_rate": 4.20104876845111e-06, |
|
"loss": 0.7723, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.9195701604690849, |
|
"grad_norm": 0.04736328125, |
|
"learning_rate": 3.908267805490051e-06, |
|
"loss": 0.7748, |
|
"step": 1545 |
|
}, |
|
{ |
|
"epoch": 0.922546115680959, |
|
"grad_norm": 0.048095703125, |
|
"learning_rate": 3.625857786498055e-06, |
|
"loss": 0.762, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.9255220708928331, |
|
"grad_norm": 0.049560546875, |
|
"learning_rate": 3.3538491912928792e-06, |
|
"loss": 0.7728, |
|
"step": 1555 |
|
}, |
|
{ |
|
"epoch": 0.9284980261047071, |
|
"grad_norm": 0.047119140625, |
|
"learning_rate": 3.092271377092215e-06, |
|
"loss": 0.7595, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.9314739813165812, |
|
"grad_norm": 0.049072265625, |
|
"learning_rate": 2.8411525753452185e-06, |
|
"loss": 0.7776, |
|
"step": 1565 |
|
}, |
|
{ |
|
"epoch": 0.9344499365284552, |
|
"grad_norm": 0.07470703125, |
|
"learning_rate": 2.6005198886856487e-06, |
|
"loss": 0.7814, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.9374258917403293, |
|
"grad_norm": 0.05224609375, |
|
"learning_rate": 2.3703992880066638e-06, |
|
"loss": 0.7533, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 0.9404018469522034, |
|
"grad_norm": 0.049072265625, |
|
"learning_rate": 2.150815609657875e-06, |
|
"loss": 0.7801, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.9433778021640774, |
|
"grad_norm": 0.05126953125, |
|
"learning_rate": 1.9417925527648096e-06, |
|
"loss": 0.766, |
|
"step": 1585 |
|
}, |
|
{ |
|
"epoch": 0.9463537573759515, |
|
"grad_norm": 0.048095703125, |
|
"learning_rate": 1.7433526766711728e-06, |
|
"loss": 0.7911, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.9493297125878255, |
|
"grad_norm": 0.051513671875, |
|
"learning_rate": 1.5555173985039918e-06, |
|
"loss": 0.7798, |
|
"step": 1595 |
|
}, |
|
{ |
|
"epoch": 0.9523056677996996, |
|
"grad_norm": 0.05029296875, |
|
"learning_rate": 1.378306990862177e-06, |
|
"loss": 0.7833, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.9552816230115737, |
|
"grad_norm": 0.046875, |
|
"learning_rate": 1.2117405796285286e-06, |
|
"loss": 0.7826, |
|
"step": 1605 |
|
}, |
|
{ |
|
"epoch": 0.9582575782234477, |
|
"grad_norm": 0.055908203125, |
|
"learning_rate": 1.055836141905553e-06, |
|
"loss": 0.7792, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.9612335334353218, |
|
"grad_norm": 0.0517578125, |
|
"learning_rate": 9.106105040751822e-07, |
|
"loss": 0.7687, |
|
"step": 1615 |
|
}, |
|
{ |
|
"epoch": 0.9642094886471959, |
|
"grad_norm": 0.048583984375, |
|
"learning_rate": 7.760793399827937e-07, |
|
"loss": 0.7452, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.9671854438590699, |
|
"grad_norm": 0.0576171875, |
|
"learning_rate": 6.522571692455736e-07, |
|
"loss": 0.7721, |
|
"step": 1625 |
|
}, |
|
{ |
|
"epoch": 0.970161399070944, |
|
"grad_norm": 0.1337890625, |
|
"learning_rate": 5.391573556854157e-07, |
|
"loss": 0.7848, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.973137354282818, |
|
"grad_norm": 0.048583984375, |
|
"learning_rate": 4.3679210588661866e-07, |
|
"loss": 0.7767, |
|
"step": 1635 |
|
}, |
|
{ |
|
"epoch": 0.9761133094946921, |
|
"grad_norm": 0.048095703125, |
|
"learning_rate": 3.451724678784518e-07, |
|
"loss": 0.7726, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.9790892647065662, |
|
"grad_norm": 0.04931640625, |
|
"learning_rate": 2.643083299427751e-07, |
|
"loss": 0.7683, |
|
"step": 1645 |
|
}, |
|
{ |
|
"epoch": 0.9820652199184402, |
|
"grad_norm": 0.054931640625, |
|
"learning_rate": 1.9420841954681525e-07, |
|
"loss": 0.7589, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.9850411751303143, |
|
"grad_norm": 0.05078125, |
|
"learning_rate": 1.3488030240123017e-07, |
|
"loss": 0.7689, |
|
"step": 1655 |
|
}, |
|
{ |
|
"epoch": 0.9880171303421883, |
|
"grad_norm": 0.052001953125, |
|
"learning_rate": 8.633038164358454e-08, |
|
"loss": 0.7738, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.9909930855540624, |
|
"grad_norm": 0.07275390625, |
|
"learning_rate": 4.856389714723575e-08, |
|
"loss": 0.7728, |
|
"step": 1665 |
|
}, |
|
{ |
|
"epoch": 0.9939690407659365, |
|
"grad_norm": 0.05322265625, |
|
"learning_rate": 2.1584924955819764e-08, |
|
"loss": 0.7646, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.9969449959778105, |
|
"grad_norm": 0.04931640625, |
|
"learning_rate": 5.396376843369577e-09, |
|
"loss": 0.7784, |
|
"step": 1675 |
|
}, |
|
{ |
|
"epoch": 0.9999209511896846, |
|
"grad_norm": 0.048583984375, |
|
"learning_rate": 0.0, |
|
"loss": 0.7765, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.9999209511896846, |
|
"step": 1680, |
|
"total_flos": 4.668038789401149e+18, |
|
"train_loss": 0.0, |
|
"train_runtime": 0.023, |
|
"train_samples_per_second": 9337043.614, |
|
"train_steps_per_second": 72939.887 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 1680, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 20, |
|
"total_flos": 4.668038789401149e+18, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|