{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.0380952380952381, "eval_steps": 25, "global_step": 75, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005079365079365079, "grad_norm": 6.519351482391357, "learning_rate": 2e-05, "loss": 3.2267, "step": 1 }, { "epoch": 0.0005079365079365079, "eval_loss": 3.624514102935791, "eval_runtime": 204.3923, "eval_samples_per_second": 4.056, "eval_steps_per_second": 2.03, "step": 1 }, { "epoch": 0.0010158730158730158, "grad_norm": 5.146887302398682, "learning_rate": 4e-05, "loss": 3.3216, "step": 2 }, { "epoch": 0.0015238095238095239, "grad_norm": 5.531928062438965, "learning_rate": 6e-05, "loss": 3.0883, "step": 3 }, { "epoch": 0.0020317460317460317, "grad_norm": 5.524338245391846, "learning_rate": 8e-05, "loss": 3.7995, "step": 4 }, { "epoch": 0.0025396825396825397, "grad_norm": 13.655096054077148, "learning_rate": 0.0001, "loss": 3.1211, "step": 5 }, { "epoch": 0.0030476190476190477, "grad_norm": 4.864871501922607, "learning_rate": 0.00012, "loss": 3.0722, "step": 6 }, { "epoch": 0.0035555555555555557, "grad_norm": 3.7362592220306396, "learning_rate": 0.00014, "loss": 3.1339, "step": 7 }, { "epoch": 0.004063492063492063, "grad_norm": 3.1778581142425537, "learning_rate": 0.00016, "loss": 1.9299, "step": 8 }, { "epoch": 0.004571428571428572, "grad_norm": 1.8088997602462769, "learning_rate": 0.00018, "loss": 1.8856, "step": 9 }, { "epoch": 0.005079365079365079, "grad_norm": 1.6604986190795898, "learning_rate": 0.0002, "loss": 1.6712, "step": 10 }, { "epoch": 0.005587301587301587, "grad_norm": 0.8797962069511414, "learning_rate": 0.0001999390827019096, "loss": 1.8232, "step": 11 }, { "epoch": 0.006095238095238095, "grad_norm": 0.8502254486083984, "learning_rate": 0.00019975640502598244, "loss": 1.82, "step": 12 }, { "epoch": 0.006603174603174603, "grad_norm": 1.8626749515533447, "learning_rate": 0.00019945218953682734, "loss": 1.4507, "step": 13 }, { "epoch": 0.0071111111111111115, "grad_norm": 0.8453308939933777, "learning_rate": 0.00019902680687415705, "loss": 1.5455, "step": 14 }, { "epoch": 0.007619047619047619, "grad_norm": 0.7060232162475586, "learning_rate": 0.00019848077530122083, "loss": 1.9394, "step": 15 }, { "epoch": 0.008126984126984127, "grad_norm": 0.6864155530929565, "learning_rate": 0.00019781476007338058, "loss": 1.3181, "step": 16 }, { "epoch": 0.008634920634920634, "grad_norm": 0.5969262719154358, "learning_rate": 0.00019702957262759965, "loss": 1.4341, "step": 17 }, { "epoch": 0.009142857142857144, "grad_norm": 0.31484541296958923, "learning_rate": 0.0001961261695938319, "loss": 1.4729, "step": 18 }, { "epoch": 0.009650793650793651, "grad_norm": 0.46714216470718384, "learning_rate": 0.00019510565162951537, "loss": 2.0658, "step": 19 }, { "epoch": 0.010158730158730159, "grad_norm": 0.44908955693244934, "learning_rate": 0.00019396926207859084, "loss": 1.3463, "step": 20 }, { "epoch": 0.010666666666666666, "grad_norm": 0.6508448719978333, "learning_rate": 0.00019271838545667876, "loss": 1.3049, "step": 21 }, { "epoch": 0.011174603174603174, "grad_norm": 0.5306481719017029, "learning_rate": 0.0001913545457642601, "loss": 1.34, "step": 22 }, { "epoch": 0.011682539682539683, "grad_norm": 0.5218005776405334, "learning_rate": 0.0001898794046299167, "loss": 1.4621, "step": 23 }, { "epoch": 0.01219047619047619, "grad_norm": 0.4745350480079651, "learning_rate": 0.00018829475928589271, "loss": 1.5099, "step": 24 }, { "epoch": 0.012698412698412698, "grad_norm": 0.6343680620193481, "learning_rate": 0.00018660254037844388, "loss": 1.1358, "step": 25 }, { "epoch": 0.012698412698412698, "eval_loss": 1.5563476085662842, "eval_runtime": 206.0859, "eval_samples_per_second": 4.023, "eval_steps_per_second": 2.014, "step": 25 }, { "epoch": 0.013206349206349206, "grad_norm": 0.3028874695301056, "learning_rate": 0.0001848048096156426, "loss": 1.6123, "step": 26 }, { "epoch": 0.013714285714285714, "grad_norm": 0.5397087335586548, "learning_rate": 0.00018290375725550417, "loss": 1.6102, "step": 27 }, { "epoch": 0.014222222222222223, "grad_norm": 0.28948265314102173, "learning_rate": 0.00018090169943749476, "loss": 1.3875, "step": 28 }, { "epoch": 0.01473015873015873, "grad_norm": 0.41503822803497314, "learning_rate": 0.00017880107536067218, "loss": 1.5284, "step": 29 }, { "epoch": 0.015238095238095238, "grad_norm": 0.6742813587188721, "learning_rate": 0.0001766044443118978, "loss": 1.3041, "step": 30 }, { "epoch": 0.015746031746031747, "grad_norm": 0.35028955340385437, "learning_rate": 0.00017431448254773944, "loss": 1.5403, "step": 31 }, { "epoch": 0.016253968253968253, "grad_norm": 0.39901459217071533, "learning_rate": 0.0001719339800338651, "loss": 1.1394, "step": 32 }, { "epoch": 0.016761904761904763, "grad_norm": 0.9433032274246216, "learning_rate": 0.00016946583704589973, "loss": 2.1009, "step": 33 }, { "epoch": 0.01726984126984127, "grad_norm": 0.2524099051952362, "learning_rate": 0.00016691306063588583, "loss": 1.1299, "step": 34 }, { "epoch": 0.017777777777777778, "grad_norm": 0.548650324344635, "learning_rate": 0.00016427876096865394, "loss": 1.3585, "step": 35 }, { "epoch": 0.018285714285714287, "grad_norm": 0.6572702527046204, "learning_rate": 0.0001615661475325658, "loss": 1.2422, "step": 36 }, { "epoch": 0.018793650793650793, "grad_norm": 0.4808460772037506, "learning_rate": 0.00015877852522924732, "loss": 1.6923, "step": 37 }, { "epoch": 0.019301587301587302, "grad_norm": 0.4411121904850006, "learning_rate": 0.0001559192903470747, "loss": 1.2018, "step": 38 }, { "epoch": 0.019809523809523808, "grad_norm": 0.6042914986610413, "learning_rate": 0.0001529919264233205, "loss": 1.5242, "step": 39 }, { "epoch": 0.020317460317460317, "grad_norm": 0.4709426462650299, "learning_rate": 0.00015000000000000001, "loss": 1.6164, "step": 40 }, { "epoch": 0.020825396825396827, "grad_norm": 0.307210236787796, "learning_rate": 0.00014694715627858908, "loss": 1.642, "step": 41 }, { "epoch": 0.021333333333333333, "grad_norm": 0.3629468083381653, "learning_rate": 0.00014383711467890774, "loss": 1.3277, "step": 42 }, { "epoch": 0.021841269841269842, "grad_norm": 0.47343459725379944, "learning_rate": 0.00014067366430758004, "loss": 1.3657, "step": 43 }, { "epoch": 0.022349206349206348, "grad_norm": 0.5083032250404358, "learning_rate": 0.00013746065934159123, "loss": 1.7898, "step": 44 }, { "epoch": 0.022857142857142857, "grad_norm": 0.34503334760665894, "learning_rate": 0.00013420201433256689, "loss": 1.4534, "step": 45 }, { "epoch": 0.023365079365079366, "grad_norm": 0.513176679611206, "learning_rate": 0.00013090169943749476, "loss": 2.0591, "step": 46 }, { "epoch": 0.023873015873015872, "grad_norm": 0.6394305229187012, "learning_rate": 0.0001275637355816999, "loss": 1.4114, "step": 47 }, { "epoch": 0.02438095238095238, "grad_norm": 0.3904706537723541, "learning_rate": 0.00012419218955996676, "loss": 1.468, "step": 48 }, { "epoch": 0.024888888888888887, "grad_norm": 0.33670979738235474, "learning_rate": 0.00012079116908177593, "loss": 1.4328, "step": 49 }, { "epoch": 0.025396825396825397, "grad_norm": 0.31365063786506653, "learning_rate": 0.00011736481776669306, "loss": 1.2578, "step": 50 }, { "epoch": 0.025396825396825397, "eval_loss": 1.503420352935791, "eval_runtime": 206.1156, "eval_samples_per_second": 4.022, "eval_steps_per_second": 2.013, "step": 50 }, { "epoch": 0.025904761904761906, "grad_norm": 0.33596453070640564, "learning_rate": 0.00011391731009600654, "loss": 1.5421, "step": 51 }, { "epoch": 0.026412698412698412, "grad_norm": 0.2696133852005005, "learning_rate": 0.00011045284632676536, "loss": 1.3148, "step": 52 }, { "epoch": 0.02692063492063492, "grad_norm": 0.5372976660728455, "learning_rate": 0.00010697564737441252, "loss": 1.64, "step": 53 }, { "epoch": 0.027428571428571427, "grad_norm": 0.6545993089675903, "learning_rate": 0.00010348994967025012, "loss": 1.2776, "step": 54 }, { "epoch": 0.027936507936507936, "grad_norm": 0.2829376757144928, "learning_rate": 0.0001, "loss": 1.2245, "step": 55 }, { "epoch": 0.028444444444444446, "grad_norm": 0.3656099736690521, "learning_rate": 9.651005032974994e-05, "loss": 1.4617, "step": 56 }, { "epoch": 0.02895238095238095, "grad_norm": 0.3836618959903717, "learning_rate": 9.302435262558747e-05, "loss": 1.2219, "step": 57 }, { "epoch": 0.02946031746031746, "grad_norm": 0.30446094274520874, "learning_rate": 8.954715367323468e-05, "loss": 1.5095, "step": 58 }, { "epoch": 0.029968253968253967, "grad_norm": 0.4856981635093689, "learning_rate": 8.608268990399349e-05, "loss": 1.5771, "step": 59 }, { "epoch": 0.030476190476190476, "grad_norm": 0.45834746956825256, "learning_rate": 8.263518223330697e-05, "loss": 1.3313, "step": 60 }, { "epoch": 0.030984126984126985, "grad_norm": 0.3143262267112732, "learning_rate": 7.920883091822408e-05, "loss": 1.1435, "step": 61 }, { "epoch": 0.031492063492063495, "grad_norm": 0.40068405866622925, "learning_rate": 7.580781044003324e-05, "loss": 1.6526, "step": 62 }, { "epoch": 0.032, "grad_norm": 0.2322355955839157, "learning_rate": 7.243626441830009e-05, "loss": 1.2496, "step": 63 }, { "epoch": 0.032507936507936507, "grad_norm": 0.44400423765182495, "learning_rate": 6.909830056250527e-05, "loss": 1.3121, "step": 64 }, { "epoch": 0.03301587301587302, "grad_norm": 0.46300458908081055, "learning_rate": 6.579798566743314e-05, "loss": 1.5739, "step": 65 }, { "epoch": 0.033523809523809525, "grad_norm": 0.27108630537986755, "learning_rate": 6.25393406584088e-05, "loss": 0.9874, "step": 66 }, { "epoch": 0.03403174603174603, "grad_norm": 0.23532073199748993, "learning_rate": 5.9326335692419995e-05, "loss": 1.2079, "step": 67 }, { "epoch": 0.03453968253968254, "grad_norm": 0.4134567379951477, "learning_rate": 5.616288532109225e-05, "loss": 1.5086, "step": 68 }, { "epoch": 0.03504761904761905, "grad_norm": 0.3667893707752228, "learning_rate": 5.305284372141095e-05, "loss": 1.5459, "step": 69 }, { "epoch": 0.035555555555555556, "grad_norm": 0.5127963423728943, "learning_rate": 5.000000000000002e-05, "loss": 1.4526, "step": 70 }, { "epoch": 0.03606349206349206, "grad_norm": 0.23248711228370667, "learning_rate": 4.700807357667952e-05, "loss": 1.7219, "step": 71 }, { "epoch": 0.036571428571428574, "grad_norm": 0.36493247747421265, "learning_rate": 4.4080709652925336e-05, "loss": 1.2866, "step": 72 }, { "epoch": 0.03707936507936508, "grad_norm": 0.3120146691799164, "learning_rate": 4.12214747707527e-05, "loss": 1.7468, "step": 73 }, { "epoch": 0.037587301587301586, "grad_norm": 0.2617102265357971, "learning_rate": 3.843385246743417e-05, "loss": 1.3033, "step": 74 }, { "epoch": 0.0380952380952381, "grad_norm": 0.37346601486206055, "learning_rate": 3.5721239031346066e-05, "loss": 1.3576, "step": 75 }, { "epoch": 0.0380952380952381, "eval_loss": 1.4916026592254639, "eval_runtime": 206.1501, "eval_samples_per_second": 4.021, "eval_steps_per_second": 2.013, "step": 75 } ], "logging_steps": 1, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 9.88845956923392e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }