{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.050793650793650794, "eval_steps": 25, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005079365079365079, "grad_norm": 6.519351482391357, "learning_rate": 2e-05, "loss": 3.2267, "step": 1 }, { "epoch": 0.0005079365079365079, "eval_loss": 3.624514102935791, "eval_runtime": 204.3923, "eval_samples_per_second": 4.056, "eval_steps_per_second": 2.03, "step": 1 }, { "epoch": 0.0010158730158730158, "grad_norm": 5.146887302398682, "learning_rate": 4e-05, "loss": 3.3216, "step": 2 }, { "epoch": 0.0015238095238095239, "grad_norm": 5.531928062438965, "learning_rate": 6e-05, "loss": 3.0883, "step": 3 }, { "epoch": 0.0020317460317460317, "grad_norm": 5.524338245391846, "learning_rate": 8e-05, "loss": 3.7995, "step": 4 }, { "epoch": 0.0025396825396825397, "grad_norm": 13.655096054077148, "learning_rate": 0.0001, "loss": 3.1211, "step": 5 }, { "epoch": 0.0030476190476190477, "grad_norm": 4.864871501922607, "learning_rate": 0.00012, "loss": 3.0722, "step": 6 }, { "epoch": 0.0035555555555555557, "grad_norm": 3.7362592220306396, "learning_rate": 0.00014, "loss": 3.1339, "step": 7 }, { "epoch": 0.004063492063492063, "grad_norm": 3.1778581142425537, "learning_rate": 0.00016, "loss": 1.9299, "step": 8 }, { "epoch": 0.004571428571428572, "grad_norm": 1.8088997602462769, "learning_rate": 0.00018, "loss": 1.8856, "step": 9 }, { "epoch": 0.005079365079365079, "grad_norm": 1.6604986190795898, "learning_rate": 0.0002, "loss": 1.6712, "step": 10 }, { "epoch": 0.005587301587301587, "grad_norm": 0.8797962069511414, "learning_rate": 0.0001999390827019096, "loss": 1.8232, "step": 11 }, { "epoch": 0.006095238095238095, "grad_norm": 0.8502254486083984, "learning_rate": 0.00019975640502598244, "loss": 1.82, "step": 12 }, { "epoch": 0.006603174603174603, "grad_norm": 1.8626749515533447, "learning_rate": 0.00019945218953682734, "loss": 1.4507, "step": 13 }, { "epoch": 0.0071111111111111115, "grad_norm": 0.8453308939933777, "learning_rate": 0.00019902680687415705, "loss": 1.5455, "step": 14 }, { "epoch": 0.007619047619047619, "grad_norm": 0.7060232162475586, "learning_rate": 0.00019848077530122083, "loss": 1.9394, "step": 15 }, { "epoch": 0.008126984126984127, "grad_norm": 0.6864155530929565, "learning_rate": 0.00019781476007338058, "loss": 1.3181, "step": 16 }, { "epoch": 0.008634920634920634, "grad_norm": 0.5969262719154358, "learning_rate": 0.00019702957262759965, "loss": 1.4341, "step": 17 }, { "epoch": 0.009142857142857144, "grad_norm": 0.31484541296958923, "learning_rate": 0.0001961261695938319, "loss": 1.4729, "step": 18 }, { "epoch": 0.009650793650793651, "grad_norm": 0.46714216470718384, "learning_rate": 0.00019510565162951537, "loss": 2.0658, "step": 19 }, { "epoch": 0.010158730158730159, "grad_norm": 0.44908955693244934, "learning_rate": 0.00019396926207859084, "loss": 1.3463, "step": 20 }, { "epoch": 0.010666666666666666, "grad_norm": 0.6508448719978333, "learning_rate": 0.00019271838545667876, "loss": 1.3049, "step": 21 }, { "epoch": 0.011174603174603174, "grad_norm": 0.5306481719017029, "learning_rate": 0.0001913545457642601, "loss": 1.34, "step": 22 }, { "epoch": 0.011682539682539683, "grad_norm": 0.5218005776405334, "learning_rate": 0.0001898794046299167, "loss": 1.4621, "step": 23 }, { "epoch": 0.01219047619047619, "grad_norm": 0.4745350480079651, "learning_rate": 0.00018829475928589271, "loss": 1.5099, "step": 24 }, { "epoch": 0.012698412698412698, "grad_norm": 0.6343680620193481, "learning_rate": 0.00018660254037844388, "loss": 1.1358, "step": 25 }, { "epoch": 0.012698412698412698, "eval_loss": 1.5563476085662842, "eval_runtime": 206.0859, "eval_samples_per_second": 4.023, "eval_steps_per_second": 2.014, "step": 25 }, { "epoch": 0.013206349206349206, "grad_norm": 0.3028874695301056, "learning_rate": 0.0001848048096156426, "loss": 1.6123, "step": 26 }, { "epoch": 0.013714285714285714, "grad_norm": 0.5397087335586548, "learning_rate": 0.00018290375725550417, "loss": 1.6102, "step": 27 }, { "epoch": 0.014222222222222223, "grad_norm": 0.28948265314102173, "learning_rate": 0.00018090169943749476, "loss": 1.3875, "step": 28 }, { "epoch": 0.01473015873015873, "grad_norm": 0.41503822803497314, "learning_rate": 0.00017880107536067218, "loss": 1.5284, "step": 29 }, { "epoch": 0.015238095238095238, "grad_norm": 0.6742813587188721, "learning_rate": 0.0001766044443118978, "loss": 1.3041, "step": 30 }, { "epoch": 0.015746031746031747, "grad_norm": 0.35028955340385437, "learning_rate": 0.00017431448254773944, "loss": 1.5403, "step": 31 }, { "epoch": 0.016253968253968253, "grad_norm": 0.39901459217071533, "learning_rate": 0.0001719339800338651, "loss": 1.1394, "step": 32 }, { "epoch": 0.016761904761904763, "grad_norm": 0.9433032274246216, "learning_rate": 0.00016946583704589973, "loss": 2.1009, "step": 33 }, { "epoch": 0.01726984126984127, "grad_norm": 0.2524099051952362, "learning_rate": 0.00016691306063588583, "loss": 1.1299, "step": 34 }, { "epoch": 0.017777777777777778, "grad_norm": 0.548650324344635, "learning_rate": 0.00016427876096865394, "loss": 1.3585, "step": 35 }, { "epoch": 0.018285714285714287, "grad_norm": 0.6572702527046204, "learning_rate": 0.0001615661475325658, "loss": 1.2422, "step": 36 }, { "epoch": 0.018793650793650793, "grad_norm": 0.4808460772037506, "learning_rate": 0.00015877852522924732, "loss": 1.6923, "step": 37 }, { "epoch": 0.019301587301587302, "grad_norm": 0.4411121904850006, "learning_rate": 0.0001559192903470747, "loss": 1.2018, "step": 38 }, { "epoch": 0.019809523809523808, "grad_norm": 0.6042914986610413, "learning_rate": 0.0001529919264233205, "loss": 1.5242, "step": 39 }, { "epoch": 0.020317460317460317, "grad_norm": 0.4709426462650299, "learning_rate": 0.00015000000000000001, "loss": 1.6164, "step": 40 }, { "epoch": 0.020825396825396827, "grad_norm": 0.307210236787796, "learning_rate": 0.00014694715627858908, "loss": 1.642, "step": 41 }, { "epoch": 0.021333333333333333, "grad_norm": 0.3629468083381653, "learning_rate": 0.00014383711467890774, "loss": 1.3277, "step": 42 }, { "epoch": 0.021841269841269842, "grad_norm": 0.47343459725379944, "learning_rate": 0.00014067366430758004, "loss": 1.3657, "step": 43 }, { "epoch": 0.022349206349206348, "grad_norm": 0.5083032250404358, "learning_rate": 0.00013746065934159123, "loss": 1.7898, "step": 44 }, { "epoch": 0.022857142857142857, "grad_norm": 0.34503334760665894, "learning_rate": 0.00013420201433256689, "loss": 1.4534, "step": 45 }, { "epoch": 0.023365079365079366, "grad_norm": 0.513176679611206, "learning_rate": 0.00013090169943749476, "loss": 2.0591, "step": 46 }, { "epoch": 0.023873015873015872, "grad_norm": 0.6394305229187012, "learning_rate": 0.0001275637355816999, "loss": 1.4114, "step": 47 }, { "epoch": 0.02438095238095238, "grad_norm": 0.3904706537723541, "learning_rate": 0.00012419218955996676, "loss": 1.468, "step": 48 }, { "epoch": 0.024888888888888887, "grad_norm": 0.33670979738235474, "learning_rate": 0.00012079116908177593, "loss": 1.4328, "step": 49 }, { "epoch": 0.025396825396825397, "grad_norm": 0.31365063786506653, "learning_rate": 0.00011736481776669306, "loss": 1.2578, "step": 50 }, { "epoch": 0.025396825396825397, "eval_loss": 1.503420352935791, "eval_runtime": 206.1156, "eval_samples_per_second": 4.022, "eval_steps_per_second": 2.013, "step": 50 }, { "epoch": 0.025904761904761906, "grad_norm": 0.33596453070640564, "learning_rate": 0.00011391731009600654, "loss": 1.5421, "step": 51 }, { "epoch": 0.026412698412698412, "grad_norm": 0.2696133852005005, "learning_rate": 0.00011045284632676536, "loss": 1.3148, "step": 52 }, { "epoch": 0.02692063492063492, "grad_norm": 0.5372976660728455, "learning_rate": 0.00010697564737441252, "loss": 1.64, "step": 53 }, { "epoch": 0.027428571428571427, "grad_norm": 0.6545993089675903, "learning_rate": 0.00010348994967025012, "loss": 1.2776, "step": 54 }, { "epoch": 0.027936507936507936, "grad_norm": 0.2829376757144928, "learning_rate": 0.0001, "loss": 1.2245, "step": 55 }, { "epoch": 0.028444444444444446, "grad_norm": 0.3656099736690521, "learning_rate": 9.651005032974994e-05, "loss": 1.4617, "step": 56 }, { "epoch": 0.02895238095238095, "grad_norm": 0.3836618959903717, "learning_rate": 9.302435262558747e-05, "loss": 1.2219, "step": 57 }, { "epoch": 0.02946031746031746, "grad_norm": 0.30446094274520874, "learning_rate": 8.954715367323468e-05, "loss": 1.5095, "step": 58 }, { "epoch": 0.029968253968253967, "grad_norm": 0.4856981635093689, "learning_rate": 8.608268990399349e-05, "loss": 1.5771, "step": 59 }, { "epoch": 0.030476190476190476, "grad_norm": 0.45834746956825256, "learning_rate": 8.263518223330697e-05, "loss": 1.3313, "step": 60 }, { "epoch": 0.030984126984126985, "grad_norm": 0.3143262267112732, "learning_rate": 7.920883091822408e-05, "loss": 1.1435, "step": 61 }, { "epoch": 0.031492063492063495, "grad_norm": 0.40068405866622925, "learning_rate": 7.580781044003324e-05, "loss": 1.6526, "step": 62 }, { "epoch": 0.032, "grad_norm": 0.2322355955839157, "learning_rate": 7.243626441830009e-05, "loss": 1.2496, "step": 63 }, { "epoch": 0.032507936507936507, "grad_norm": 0.44400423765182495, "learning_rate": 6.909830056250527e-05, "loss": 1.3121, "step": 64 }, { "epoch": 0.03301587301587302, "grad_norm": 0.46300458908081055, "learning_rate": 6.579798566743314e-05, "loss": 1.5739, "step": 65 }, { "epoch": 0.033523809523809525, "grad_norm": 0.27108630537986755, "learning_rate": 6.25393406584088e-05, "loss": 0.9874, "step": 66 }, { "epoch": 0.03403174603174603, "grad_norm": 0.23532073199748993, "learning_rate": 5.9326335692419995e-05, "loss": 1.2079, "step": 67 }, { "epoch": 0.03453968253968254, "grad_norm": 0.4134567379951477, "learning_rate": 5.616288532109225e-05, "loss": 1.5086, "step": 68 }, { "epoch": 0.03504761904761905, "grad_norm": 0.3667893707752228, "learning_rate": 5.305284372141095e-05, "loss": 1.5459, "step": 69 }, { "epoch": 0.035555555555555556, "grad_norm": 0.5127963423728943, "learning_rate": 5.000000000000002e-05, "loss": 1.4526, "step": 70 }, { "epoch": 0.03606349206349206, "grad_norm": 0.23248711228370667, "learning_rate": 4.700807357667952e-05, "loss": 1.7219, "step": 71 }, { "epoch": 0.036571428571428574, "grad_norm": 0.36493247747421265, "learning_rate": 4.4080709652925336e-05, "loss": 1.2866, "step": 72 }, { "epoch": 0.03707936507936508, "grad_norm": 0.3120146691799164, "learning_rate": 4.12214747707527e-05, "loss": 1.7468, "step": 73 }, { "epoch": 0.037587301587301586, "grad_norm": 0.2617102265357971, "learning_rate": 3.843385246743417e-05, "loss": 1.3033, "step": 74 }, { "epoch": 0.0380952380952381, "grad_norm": 0.37346601486206055, "learning_rate": 3.5721239031346066e-05, "loss": 1.3576, "step": 75 }, { "epoch": 0.0380952380952381, "eval_loss": 1.4916026592254639, "eval_runtime": 206.1501, "eval_samples_per_second": 4.021, "eval_steps_per_second": 2.013, "step": 75 }, { "epoch": 0.038603174603174605, "grad_norm": 0.28713932633399963, "learning_rate": 3.308693936411421e-05, "loss": 1.5865, "step": 76 }, { "epoch": 0.03911111111111111, "grad_norm": 0.26906633377075195, "learning_rate": 3.053416295410026e-05, "loss": 1.1854, "step": 77 }, { "epoch": 0.039619047619047616, "grad_norm": 0.4078941345214844, "learning_rate": 2.8066019966134904e-05, "loss": 1.2432, "step": 78 }, { "epoch": 0.04012698412698413, "grad_norm": 0.32490774989128113, "learning_rate": 2.5685517452260567e-05, "loss": 1.4465, "step": 79 }, { "epoch": 0.040634920634920635, "grad_norm": 0.30676373839378357, "learning_rate": 2.339555568810221e-05, "loss": 1.4783, "step": 80 }, { "epoch": 0.04114285714285714, "grad_norm": 0.3845832943916321, "learning_rate": 2.119892463932781e-05, "loss": 1.3254, "step": 81 }, { "epoch": 0.041650793650793654, "grad_norm": 0.41841885447502136, "learning_rate": 1.9098300562505266e-05, "loss": 1.756, "step": 82 }, { "epoch": 0.04215873015873016, "grad_norm": 0.26209527254104614, "learning_rate": 1.7096242744495837e-05, "loss": 1.1009, "step": 83 }, { "epoch": 0.042666666666666665, "grad_norm": 0.4069371819496155, "learning_rate": 1.5195190384357404e-05, "loss": 1.3217, "step": 84 }, { "epoch": 0.04317460317460318, "grad_norm": 0.29204556345939636, "learning_rate": 1.339745962155613e-05, "loss": 1.437, "step": 85 }, { "epoch": 0.043682539682539684, "grad_norm": 0.36702778935432434, "learning_rate": 1.1705240714107302e-05, "loss": 1.2403, "step": 86 }, { "epoch": 0.04419047619047619, "grad_norm": 0.2762695252895355, "learning_rate": 1.0120595370083318e-05, "loss": 1.2892, "step": 87 }, { "epoch": 0.044698412698412696, "grad_norm": 0.30418360233306885, "learning_rate": 8.645454235739903e-06, "loss": 1.4427, "step": 88 }, { "epoch": 0.04520634920634921, "grad_norm": 0.30312514305114746, "learning_rate": 7.281614543321269e-06, "loss": 1.4674, "step": 89 }, { "epoch": 0.045714285714285714, "grad_norm": 0.36299508810043335, "learning_rate": 6.030737921409169e-06, "loss": 1.2849, "step": 90 }, { "epoch": 0.04622222222222222, "grad_norm": 0.3190608024597168, "learning_rate": 4.8943483704846475e-06, "loss": 1.3682, "step": 91 }, { "epoch": 0.04673015873015873, "grad_norm": 0.27337679266929626, "learning_rate": 3.873830406168111e-06, "loss": 1.2428, "step": 92 }, { "epoch": 0.04723809523809524, "grad_norm": 0.3793995976448059, "learning_rate": 2.970427372400353e-06, "loss": 1.9111, "step": 93 }, { "epoch": 0.047746031746031745, "grad_norm": 0.3148786723613739, "learning_rate": 2.1852399266194314e-06, "loss": 1.2274, "step": 94 }, { "epoch": 0.04825396825396826, "grad_norm": 0.487860769033432, "learning_rate": 1.5192246987791981e-06, "loss": 1.401, "step": 95 }, { "epoch": 0.04876190476190476, "grad_norm": 0.2578515410423279, "learning_rate": 9.731931258429638e-07, "loss": 1.0545, "step": 96 }, { "epoch": 0.04926984126984127, "grad_norm": 0.30165234208106995, "learning_rate": 5.478104631726711e-07, "loss": 1.0555, "step": 97 }, { "epoch": 0.049777777777777775, "grad_norm": 0.27550289034843445, "learning_rate": 2.4359497401758024e-07, "loss": 1.542, "step": 98 }, { "epoch": 0.05028571428571429, "grad_norm": 0.21497561037540436, "learning_rate": 6.09172980904238e-08, "loss": 1.19, "step": 99 }, { "epoch": 0.050793650793650794, "grad_norm": 0.5045850872993469, "learning_rate": 0.0, "loss": 1.5234, "step": 100 }, { "epoch": 0.050793650793650794, "eval_loss": 1.4889129400253296, "eval_runtime": 206.2008, "eval_samples_per_second": 4.02, "eval_steps_per_second": 2.013, "step": 100 } ], "logging_steps": 1, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.318461275897856e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }