TR-Qwen2-7B-001-241129 / trainer_state.json
Quardo's picture
Upload folder using huggingface_hub
944a9d6 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 0,
"global_step": 185,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005405405405405406,
"grad_norm": 4.0049285888671875,
"learning_rate": 3.3333333333333335e-05,
"loss": 1.4543,
"step": 1
},
{
"epoch": 0.010810810810810811,
"grad_norm": 3.6172754764556885,
"learning_rate": 6.666666666666667e-05,
"loss": 1.4798,
"step": 2
},
{
"epoch": 0.016216216216216217,
"grad_norm": 15.047930717468262,
"learning_rate": 0.0001,
"loss": 2.3166,
"step": 3
},
{
"epoch": 0.021621621621621623,
"grad_norm": 30.211719512939453,
"learning_rate": 0.00013333333333333334,
"loss": 3.0968,
"step": 4
},
{
"epoch": 0.02702702702702703,
"grad_norm": 85.43743133544922,
"learning_rate": 0.0001666666666666667,
"loss": 9.2485,
"step": 5
},
{
"epoch": 0.032432432432432434,
"grad_norm": 31.648372650146484,
"learning_rate": 0.0002,
"loss": 5.6812,
"step": 6
},
{
"epoch": 0.03783783783783784,
"grad_norm": 28.307153701782227,
"learning_rate": 0.00019899441340782124,
"loss": 4.923,
"step": 7
},
{
"epoch": 0.043243243243243246,
"grad_norm": 72.10746002197266,
"learning_rate": 0.00019798882681564247,
"loss": 5.2033,
"step": 8
},
{
"epoch": 0.04864864864864865,
"grad_norm": 245.99508666992188,
"learning_rate": 0.00019698324022346367,
"loss": 4.7967,
"step": 9
},
{
"epoch": 0.05405405405405406,
"grad_norm": 19.966106414794922,
"learning_rate": 0.00019597765363128493,
"loss": 4.4746,
"step": 10
},
{
"epoch": 0.05945945945945946,
"grad_norm": 13.21216869354248,
"learning_rate": 0.00019497206703910616,
"loss": 3.853,
"step": 11
},
{
"epoch": 0.06486486486486487,
"grad_norm": 22.403423309326172,
"learning_rate": 0.00019396648044692737,
"loss": 3.8436,
"step": 12
},
{
"epoch": 0.07027027027027027,
"grad_norm": 19.907819747924805,
"learning_rate": 0.0001929608938547486,
"loss": 3.4515,
"step": 13
},
{
"epoch": 0.07567567567567568,
"grad_norm": 13.797648429870605,
"learning_rate": 0.00019195530726256985,
"loss": 3.1302,
"step": 14
},
{
"epoch": 0.08108108108108109,
"grad_norm": 13.124836921691895,
"learning_rate": 0.00019094972067039108,
"loss": 2.9113,
"step": 15
},
{
"epoch": 0.08648648648648649,
"grad_norm": 5.7703022956848145,
"learning_rate": 0.0001899441340782123,
"loss": 2.63,
"step": 16
},
{
"epoch": 0.0918918918918919,
"grad_norm": 10.830033302307129,
"learning_rate": 0.00018893854748603352,
"loss": 2.3445,
"step": 17
},
{
"epoch": 0.0972972972972973,
"grad_norm": 6.587128162384033,
"learning_rate": 0.00018793296089385475,
"loss": 2.4421,
"step": 18
},
{
"epoch": 0.10270270270270271,
"grad_norm": 8.627386093139648,
"learning_rate": 0.000186927374301676,
"loss": 2.55,
"step": 19
},
{
"epoch": 0.10810810810810811,
"grad_norm": 6.208451747894287,
"learning_rate": 0.0001859217877094972,
"loss": 2.3098,
"step": 20
},
{
"epoch": 0.11351351351351352,
"grad_norm": 3.859027624130249,
"learning_rate": 0.00018491620111731844,
"loss": 2.1369,
"step": 21
},
{
"epoch": 0.11891891891891893,
"grad_norm": 4.324620723724365,
"learning_rate": 0.00018391061452513967,
"loss": 2.2362,
"step": 22
},
{
"epoch": 0.12432432432432433,
"grad_norm": 4.892393112182617,
"learning_rate": 0.00018290502793296093,
"loss": 2.1266,
"step": 23
},
{
"epoch": 0.12972972972972974,
"grad_norm": 5.422415733337402,
"learning_rate": 0.00018189944134078213,
"loss": 2.0899,
"step": 24
},
{
"epoch": 0.13513513513513514,
"grad_norm": 5.536099910736084,
"learning_rate": 0.00018089385474860333,
"loss": 1.9812,
"step": 25
},
{
"epoch": 0.14054054054054055,
"grad_norm": 3.9717321395874023,
"learning_rate": 0.0001798882681564246,
"loss": 1.8578,
"step": 26
},
{
"epoch": 0.14594594594594595,
"grad_norm": 4.257945537567139,
"learning_rate": 0.00017888268156424582,
"loss": 1.9502,
"step": 27
},
{
"epoch": 0.15135135135135136,
"grad_norm": 3.4768412113189697,
"learning_rate": 0.00017787709497206705,
"loss": 1.8821,
"step": 28
},
{
"epoch": 0.15675675675675677,
"grad_norm": 5.640548229217529,
"learning_rate": 0.00017687150837988826,
"loss": 1.8934,
"step": 29
},
{
"epoch": 0.16216216216216217,
"grad_norm": 3.2775862216949463,
"learning_rate": 0.00017586592178770951,
"loss": 1.7708,
"step": 30
},
{
"epoch": 0.16756756756756758,
"grad_norm": 3.5710699558258057,
"learning_rate": 0.00017486033519553075,
"loss": 1.7744,
"step": 31
},
{
"epoch": 0.17297297297297298,
"grad_norm": 2.6257646083831787,
"learning_rate": 0.00017385474860335198,
"loss": 1.6934,
"step": 32
},
{
"epoch": 0.1783783783783784,
"grad_norm": 3.9629523754119873,
"learning_rate": 0.00017284916201117318,
"loss": 1.7363,
"step": 33
},
{
"epoch": 0.1837837837837838,
"grad_norm": 2.3908259868621826,
"learning_rate": 0.00017184357541899444,
"loss": 1.7277,
"step": 34
},
{
"epoch": 0.1891891891891892,
"grad_norm": 2.75465989112854,
"learning_rate": 0.00017083798882681567,
"loss": 1.7093,
"step": 35
},
{
"epoch": 0.1945945945945946,
"grad_norm": 8.39667797088623,
"learning_rate": 0.00016983240223463687,
"loss": 1.8453,
"step": 36
},
{
"epoch": 0.2,
"grad_norm": 55.12360382080078,
"learning_rate": 0.0001688268156424581,
"loss": 2.7956,
"step": 37
},
{
"epoch": 0.20540540540540542,
"grad_norm": 7.613361358642578,
"learning_rate": 0.00016782122905027933,
"loss": 1.9827,
"step": 38
},
{
"epoch": 0.21081081081081082,
"grad_norm": 4.211851119995117,
"learning_rate": 0.00016681564245810056,
"loss": 1.8414,
"step": 39
},
{
"epoch": 0.21621621621621623,
"grad_norm": 3.3652498722076416,
"learning_rate": 0.0001658100558659218,
"loss": 1.8754,
"step": 40
},
{
"epoch": 0.22162162162162163,
"grad_norm": 4.680253982543945,
"learning_rate": 0.00016480446927374302,
"loss": 1.8269,
"step": 41
},
{
"epoch": 0.22702702702702704,
"grad_norm": 2.4161698818206787,
"learning_rate": 0.00016379888268156425,
"loss": 1.7132,
"step": 42
},
{
"epoch": 0.23243243243243245,
"grad_norm": 2.6352972984313965,
"learning_rate": 0.00016279329608938548,
"loss": 1.7756,
"step": 43
},
{
"epoch": 0.23783783783783785,
"grad_norm": 2.8735787868499756,
"learning_rate": 0.00016178770949720671,
"loss": 1.7029,
"step": 44
},
{
"epoch": 0.24324324324324326,
"grad_norm": 2.2981088161468506,
"learning_rate": 0.00016078212290502792,
"loss": 1.7449,
"step": 45
},
{
"epoch": 0.24864864864864866,
"grad_norm": 3.6938095092773438,
"learning_rate": 0.00015977653631284918,
"loss": 1.6877,
"step": 46
},
{
"epoch": 0.25405405405405407,
"grad_norm": 2.38474702835083,
"learning_rate": 0.00015877094972067038,
"loss": 1.5995,
"step": 47
},
{
"epoch": 0.2594594594594595,
"grad_norm": 2.466663360595703,
"learning_rate": 0.00015776536312849164,
"loss": 1.5037,
"step": 48
},
{
"epoch": 0.2648648648648649,
"grad_norm": 2.7223317623138428,
"learning_rate": 0.00015675977653631284,
"loss": 1.5707,
"step": 49
},
{
"epoch": 0.2702702702702703,
"grad_norm": 2.1740567684173584,
"learning_rate": 0.0001557541899441341,
"loss": 1.5779,
"step": 50
},
{
"epoch": 0.2756756756756757,
"grad_norm": 2.130438804626465,
"learning_rate": 0.0001547486033519553,
"loss": 1.581,
"step": 51
},
{
"epoch": 0.2810810810810811,
"grad_norm": 3.1053080558776855,
"learning_rate": 0.00015374301675977656,
"loss": 1.6141,
"step": 52
},
{
"epoch": 0.2864864864864865,
"grad_norm": 2.1347055435180664,
"learning_rate": 0.00015273743016759776,
"loss": 1.5882,
"step": 53
},
{
"epoch": 0.2918918918918919,
"grad_norm": 2.012467384338379,
"learning_rate": 0.000151731843575419,
"loss": 1.5219,
"step": 54
},
{
"epoch": 0.2972972972972973,
"grad_norm": 2.5574147701263428,
"learning_rate": 0.00015072625698324022,
"loss": 1.6298,
"step": 55
},
{
"epoch": 0.3027027027027027,
"grad_norm": 3.091801881790161,
"learning_rate": 0.00014972067039106145,
"loss": 1.515,
"step": 56
},
{
"epoch": 0.3081081081081081,
"grad_norm": 2.234355926513672,
"learning_rate": 0.00014871508379888268,
"loss": 1.5501,
"step": 57
},
{
"epoch": 0.31351351351351353,
"grad_norm": 3.299154281616211,
"learning_rate": 0.00014770949720670391,
"loss": 1.6285,
"step": 58
},
{
"epoch": 0.31891891891891894,
"grad_norm": 2.0043587684631348,
"learning_rate": 0.00014670391061452514,
"loss": 1.5218,
"step": 59
},
{
"epoch": 0.32432432432432434,
"grad_norm": 2.3809549808502197,
"learning_rate": 0.00014569832402234638,
"loss": 1.4872,
"step": 60
},
{
"epoch": 0.32972972972972975,
"grad_norm": 2.2580623626708984,
"learning_rate": 0.0001446927374301676,
"loss": 1.4445,
"step": 61
},
{
"epoch": 0.33513513513513515,
"grad_norm": 1.822764277458191,
"learning_rate": 0.00014368715083798884,
"loss": 1.551,
"step": 62
},
{
"epoch": 0.34054054054054056,
"grad_norm": 1.8461302518844604,
"learning_rate": 0.00014268156424581004,
"loss": 1.4394,
"step": 63
},
{
"epoch": 0.34594594594594597,
"grad_norm": 1.7777131795883179,
"learning_rate": 0.0001416759776536313,
"loss": 1.3559,
"step": 64
},
{
"epoch": 0.35135135135135137,
"grad_norm": 1.775188684463501,
"learning_rate": 0.00014067039106145253,
"loss": 1.506,
"step": 65
},
{
"epoch": 0.3567567567567568,
"grad_norm": 1.8579380512237549,
"learning_rate": 0.00013966480446927376,
"loss": 1.5416,
"step": 66
},
{
"epoch": 0.3621621621621622,
"grad_norm": 2.0866875648498535,
"learning_rate": 0.00013865921787709496,
"loss": 1.5526,
"step": 67
},
{
"epoch": 0.3675675675675676,
"grad_norm": 2.1562321186065674,
"learning_rate": 0.00013765363128491622,
"loss": 1.4969,
"step": 68
},
{
"epoch": 0.372972972972973,
"grad_norm": 1.928788185119629,
"learning_rate": 0.00013664804469273745,
"loss": 1.5515,
"step": 69
},
{
"epoch": 0.3783783783783784,
"grad_norm": 2.124756336212158,
"learning_rate": 0.00013564245810055868,
"loss": 1.4266,
"step": 70
},
{
"epoch": 0.3837837837837838,
"grad_norm": 1.4556527137756348,
"learning_rate": 0.00013463687150837988,
"loss": 1.4718,
"step": 71
},
{
"epoch": 0.3891891891891892,
"grad_norm": 1.5127558708190918,
"learning_rate": 0.00013363128491620111,
"loss": 1.4323,
"step": 72
},
{
"epoch": 0.3945945945945946,
"grad_norm": 1.6139051914215088,
"learning_rate": 0.00013262569832402237,
"loss": 1.3959,
"step": 73
},
{
"epoch": 0.4,
"grad_norm": 2.1050167083740234,
"learning_rate": 0.00013162011173184358,
"loss": 1.508,
"step": 74
},
{
"epoch": 0.40540540540540543,
"grad_norm": 2.010802745819092,
"learning_rate": 0.0001306145251396648,
"loss": 1.4919,
"step": 75
},
{
"epoch": 0.41081081081081083,
"grad_norm": 5.8437981605529785,
"learning_rate": 0.00012960893854748604,
"loss": 1.4919,
"step": 76
},
{
"epoch": 0.41621621621621624,
"grad_norm": 3.061354637145996,
"learning_rate": 0.0001286033519553073,
"loss": 1.5347,
"step": 77
},
{
"epoch": 0.42162162162162165,
"grad_norm": 1.7366749048233032,
"learning_rate": 0.0001275977653631285,
"loss": 1.4741,
"step": 78
},
{
"epoch": 0.42702702702702705,
"grad_norm": 3.3530421257019043,
"learning_rate": 0.00012659217877094973,
"loss": 1.5562,
"step": 79
},
{
"epoch": 0.43243243243243246,
"grad_norm": 1.7036885023117065,
"learning_rate": 0.00012558659217877096,
"loss": 1.467,
"step": 80
},
{
"epoch": 0.43783783783783786,
"grad_norm": 1.7056543827056885,
"learning_rate": 0.00012458100558659222,
"loss": 1.5217,
"step": 81
},
{
"epoch": 0.44324324324324327,
"grad_norm": 1.5444260835647583,
"learning_rate": 0.0001235754189944134,
"loss": 1.3959,
"step": 82
},
{
"epoch": 0.4486486486486487,
"grad_norm": 1.461717963218689,
"learning_rate": 0.00012256983240223462,
"loss": 1.3684,
"step": 83
},
{
"epoch": 0.4540540540540541,
"grad_norm": 2.405122756958008,
"learning_rate": 0.00012156424581005588,
"loss": 1.4736,
"step": 84
},
{
"epoch": 0.4594594594594595,
"grad_norm": 2.271719455718994,
"learning_rate": 0.0001205586592178771,
"loss": 1.4197,
"step": 85
},
{
"epoch": 0.4648648648648649,
"grad_norm": 1.6628215312957764,
"learning_rate": 0.00011955307262569834,
"loss": 1.4345,
"step": 86
},
{
"epoch": 0.4702702702702703,
"grad_norm": 1.5047897100448608,
"learning_rate": 0.00011854748603351954,
"loss": 1.4675,
"step": 87
},
{
"epoch": 0.4756756756756757,
"grad_norm": 2.330070972442627,
"learning_rate": 0.0001175418994413408,
"loss": 1.5053,
"step": 88
},
{
"epoch": 0.4810810810810811,
"grad_norm": 1.5151646137237549,
"learning_rate": 0.00011653631284916202,
"loss": 1.4277,
"step": 89
},
{
"epoch": 0.4864864864864865,
"grad_norm": 2.1238574981689453,
"learning_rate": 0.00011553072625698326,
"loss": 1.4749,
"step": 90
},
{
"epoch": 0.4918918918918919,
"grad_norm": 1.6031622886657715,
"learning_rate": 0.00011452513966480447,
"loss": 1.3802,
"step": 91
},
{
"epoch": 0.4972972972972973,
"grad_norm": 1.80471932888031,
"learning_rate": 0.0001135195530726257,
"loss": 1.4286,
"step": 92
},
{
"epoch": 0.5027027027027027,
"grad_norm": 1.7329200506210327,
"learning_rate": 0.00011251396648044694,
"loss": 1.4543,
"step": 93
},
{
"epoch": 0.5081081081081081,
"grad_norm": 2.031339406967163,
"learning_rate": 0.00011150837988826817,
"loss": 1.3877,
"step": 94
},
{
"epoch": 0.5135135135135135,
"grad_norm": 1.5385271310806274,
"learning_rate": 0.00011050279329608939,
"loss": 1.3736,
"step": 95
},
{
"epoch": 0.518918918918919,
"grad_norm": 2.143282413482666,
"learning_rate": 0.00010949720670391062,
"loss": 1.7171,
"step": 96
},
{
"epoch": 0.5243243243243243,
"grad_norm": 1.5371496677398682,
"learning_rate": 0.00010849162011173184,
"loss": 1.3208,
"step": 97
},
{
"epoch": 0.5297297297297298,
"grad_norm": 2.0396029949188232,
"learning_rate": 0.00010748603351955308,
"loss": 1.3534,
"step": 98
},
{
"epoch": 0.5351351351351351,
"grad_norm": 1.9422366619110107,
"learning_rate": 0.00010648044692737431,
"loss": 1.421,
"step": 99
},
{
"epoch": 0.5405405405405406,
"grad_norm": 1.494828462600708,
"learning_rate": 0.00010547486033519554,
"loss": 1.2746,
"step": 100
},
{
"epoch": 0.5459459459459459,
"grad_norm": 2.1698765754699707,
"learning_rate": 0.00010446927374301676,
"loss": 1.3989,
"step": 101
},
{
"epoch": 0.5513513513513514,
"grad_norm": 1.3124092817306519,
"learning_rate": 0.000103463687150838,
"loss": 1.3855,
"step": 102
},
{
"epoch": 0.5567567567567567,
"grad_norm": 1.4328157901763916,
"learning_rate": 0.00010245810055865923,
"loss": 1.4112,
"step": 103
},
{
"epoch": 0.5621621621621622,
"grad_norm": 1.3698210716247559,
"learning_rate": 0.00010145251396648045,
"loss": 1.3152,
"step": 104
},
{
"epoch": 0.5675675675675675,
"grad_norm": 1.291865348815918,
"learning_rate": 0.00010044692737430168,
"loss": 1.3646,
"step": 105
},
{
"epoch": 0.572972972972973,
"grad_norm": 1.4178961515426636,
"learning_rate": 9.944134078212291e-05,
"loss": 1.2852,
"step": 106
},
{
"epoch": 0.5783783783783784,
"grad_norm": 1.1877104043960571,
"learning_rate": 9.843575418994413e-05,
"loss": 1.3402,
"step": 107
},
{
"epoch": 0.5837837837837838,
"grad_norm": 1.4503647089004517,
"learning_rate": 9.743016759776537e-05,
"loss": 1.3028,
"step": 108
},
{
"epoch": 0.5891891891891892,
"grad_norm": 1.3877456188201904,
"learning_rate": 9.64245810055866e-05,
"loss": 1.3231,
"step": 109
},
{
"epoch": 0.5945945945945946,
"grad_norm": 1.4520429372787476,
"learning_rate": 9.541899441340782e-05,
"loss": 1.2608,
"step": 110
},
{
"epoch": 0.6,
"grad_norm": 1.1344528198242188,
"learning_rate": 9.441340782122905e-05,
"loss": 1.3409,
"step": 111
},
{
"epoch": 0.6054054054054054,
"grad_norm": 1.9510555267333984,
"learning_rate": 9.34078212290503e-05,
"loss": 1.2762,
"step": 112
},
{
"epoch": 0.6108108108108108,
"grad_norm": 1.6498372554779053,
"learning_rate": 9.240223463687152e-05,
"loss": 1.3558,
"step": 113
},
{
"epoch": 0.6162162162162163,
"grad_norm": 1.3852072954177856,
"learning_rate": 9.139664804469274e-05,
"loss": 1.3515,
"step": 114
},
{
"epoch": 0.6216216216216216,
"grad_norm": 1.516605019569397,
"learning_rate": 9.039106145251397e-05,
"loss": 1.3595,
"step": 115
},
{
"epoch": 0.6270270270270271,
"grad_norm": 1.387160062789917,
"learning_rate": 8.938547486033519e-05,
"loss": 1.2759,
"step": 116
},
{
"epoch": 0.6324324324324324,
"grad_norm": 1.218645691871643,
"learning_rate": 8.837988826815642e-05,
"loss": 1.3228,
"step": 117
},
{
"epoch": 0.6378378378378379,
"grad_norm": 1.1323994398117065,
"learning_rate": 8.737430167597766e-05,
"loss": 1.3792,
"step": 118
},
{
"epoch": 0.6432432432432432,
"grad_norm": 1.492294430732727,
"learning_rate": 8.63687150837989e-05,
"loss": 1.2875,
"step": 119
},
{
"epoch": 0.6486486486486487,
"grad_norm": 1.343035340309143,
"learning_rate": 8.536312849162011e-05,
"loss": 1.2473,
"step": 120
},
{
"epoch": 0.654054054054054,
"grad_norm": 1.3583838939666748,
"learning_rate": 8.435754189944134e-05,
"loss": 1.367,
"step": 121
},
{
"epoch": 0.6594594594594595,
"grad_norm": 1.4810901880264282,
"learning_rate": 8.335195530726259e-05,
"loss": 1.3429,
"step": 122
},
{
"epoch": 0.6648648648648648,
"grad_norm": 1.1570441722869873,
"learning_rate": 8.234636871508382e-05,
"loss": 1.269,
"step": 123
},
{
"epoch": 0.6702702702702703,
"grad_norm": 1.2883822917938232,
"learning_rate": 8.134078212290503e-05,
"loss": 1.2602,
"step": 124
},
{
"epoch": 0.6756756756756757,
"grad_norm": 1.3522834777832031,
"learning_rate": 8.033519553072626e-05,
"loss": 1.2891,
"step": 125
},
{
"epoch": 0.6810810810810811,
"grad_norm": 1.0803565979003906,
"learning_rate": 7.932960893854748e-05,
"loss": 1.2391,
"step": 126
},
{
"epoch": 0.6864864864864865,
"grad_norm": 1.2738792896270752,
"learning_rate": 7.832402234636872e-05,
"loss": 1.2819,
"step": 127
},
{
"epoch": 0.6918918918918919,
"grad_norm": 1.2565838098526,
"learning_rate": 7.731843575418995e-05,
"loss": 1.3168,
"step": 128
},
{
"epoch": 0.6972972972972973,
"grad_norm": 1.1551238298416138,
"learning_rate": 7.631284916201119e-05,
"loss": 1.3582,
"step": 129
},
{
"epoch": 0.7027027027027027,
"grad_norm": 1.153626799583435,
"learning_rate": 7.53072625698324e-05,
"loss": 1.2153,
"step": 130
},
{
"epoch": 0.7081081081081081,
"grad_norm": 1.3645069599151611,
"learning_rate": 7.430167597765365e-05,
"loss": 1.2957,
"step": 131
},
{
"epoch": 0.7135135135135136,
"grad_norm": 1.1639150381088257,
"learning_rate": 7.329608938547488e-05,
"loss": 1.2426,
"step": 132
},
{
"epoch": 0.7189189189189189,
"grad_norm": 1.2247486114501953,
"learning_rate": 7.22905027932961e-05,
"loss": 1.2479,
"step": 133
},
{
"epoch": 0.7243243243243244,
"grad_norm": 1.3136733770370483,
"learning_rate": 7.128491620111732e-05,
"loss": 1.2577,
"step": 134
},
{
"epoch": 0.7297297297297297,
"grad_norm": 1.129638910293579,
"learning_rate": 7.027932960893855e-05,
"loss": 1.2942,
"step": 135
},
{
"epoch": 0.7351351351351352,
"grad_norm": 1.5812125205993652,
"learning_rate": 6.927374301675977e-05,
"loss": 1.3149,
"step": 136
},
{
"epoch": 0.7405405405405405,
"grad_norm": 1.1031138896942139,
"learning_rate": 6.826815642458102e-05,
"loss": 1.2825,
"step": 137
},
{
"epoch": 0.745945945945946,
"grad_norm": 1.4569562673568726,
"learning_rate": 6.726256983240225e-05,
"loss": 1.3553,
"step": 138
},
{
"epoch": 0.7513513513513513,
"grad_norm": 1.351787805557251,
"learning_rate": 6.625698324022346e-05,
"loss": 1.3805,
"step": 139
},
{
"epoch": 0.7567567567567568,
"grad_norm": 1.4399067163467407,
"learning_rate": 6.52513966480447e-05,
"loss": 1.3488,
"step": 140
},
{
"epoch": 0.7621621621621621,
"grad_norm": 1.097604751586914,
"learning_rate": 6.424581005586592e-05,
"loss": 1.2444,
"step": 141
},
{
"epoch": 0.7675675675675676,
"grad_norm": 1.2245107889175415,
"learning_rate": 6.324022346368715e-05,
"loss": 1.2415,
"step": 142
},
{
"epoch": 0.772972972972973,
"grad_norm": 1.2633042335510254,
"learning_rate": 6.223463687150838e-05,
"loss": 1.1964,
"step": 143
},
{
"epoch": 0.7783783783783784,
"grad_norm": 1.1645766496658325,
"learning_rate": 6.122905027932962e-05,
"loss": 1.1874,
"step": 144
},
{
"epoch": 0.7837837837837838,
"grad_norm": 1.013777494430542,
"learning_rate": 6.022346368715084e-05,
"loss": 1.2065,
"step": 145
},
{
"epoch": 0.7891891891891892,
"grad_norm": 1.1051980257034302,
"learning_rate": 5.921787709497206e-05,
"loss": 1.2878,
"step": 146
},
{
"epoch": 0.7945945945945946,
"grad_norm": 1.0944007635116577,
"learning_rate": 5.82122905027933e-05,
"loss": 1.2634,
"step": 147
},
{
"epoch": 0.8,
"grad_norm": 1.1613037586212158,
"learning_rate": 5.720670391061454e-05,
"loss": 1.3074,
"step": 148
},
{
"epoch": 0.8054054054054054,
"grad_norm": 1.0192413330078125,
"learning_rate": 5.620111731843576e-05,
"loss": 1.1785,
"step": 149
},
{
"epoch": 0.8108108108108109,
"grad_norm": 1.0536669492721558,
"learning_rate": 5.5195530726256985e-05,
"loss": 1.2221,
"step": 150
},
{
"epoch": 0.8162162162162162,
"grad_norm": 1.260764241218567,
"learning_rate": 5.418994413407821e-05,
"loss": 1.1202,
"step": 151
},
{
"epoch": 0.8216216216216217,
"grad_norm": 1.1566349267959595,
"learning_rate": 5.3184357541899446e-05,
"loss": 1.2774,
"step": 152
},
{
"epoch": 0.827027027027027,
"grad_norm": 1.132250189781189,
"learning_rate": 5.2178770949720676e-05,
"loss": 1.2011,
"step": 153
},
{
"epoch": 0.8324324324324325,
"grad_norm": 1.0502394437789917,
"learning_rate": 5.11731843575419e-05,
"loss": 1.2538,
"step": 154
},
{
"epoch": 0.8378378378378378,
"grad_norm": 2.7037501335144043,
"learning_rate": 5.016759776536313e-05,
"loss": 1.4687,
"step": 155
},
{
"epoch": 0.8432432432432433,
"grad_norm": 1.1927903890609741,
"learning_rate": 4.916201117318436e-05,
"loss": 1.2044,
"step": 156
},
{
"epoch": 0.8486486486486486,
"grad_norm": 1.0077776908874512,
"learning_rate": 4.815642458100559e-05,
"loss": 1.1471,
"step": 157
},
{
"epoch": 0.8540540540540541,
"grad_norm": 0.9799959659576416,
"learning_rate": 4.715083798882682e-05,
"loss": 1.2181,
"step": 158
},
{
"epoch": 0.8594594594594595,
"grad_norm": 1.0267740488052368,
"learning_rate": 4.614525139664805e-05,
"loss": 1.244,
"step": 159
},
{
"epoch": 0.8648648648648649,
"grad_norm": 3.7082221508026123,
"learning_rate": 4.5139664804469276e-05,
"loss": 2.2304,
"step": 160
},
{
"epoch": 0.8702702702702703,
"grad_norm": 1.0884734392166138,
"learning_rate": 4.413407821229051e-05,
"loss": 1.1469,
"step": 161
},
{
"epoch": 0.8756756756756757,
"grad_norm": 1.1083415746688843,
"learning_rate": 4.312849162011173e-05,
"loss": 1.2713,
"step": 162
},
{
"epoch": 0.8810810810810811,
"grad_norm": 1.0334668159484863,
"learning_rate": 4.212290502793296e-05,
"loss": 1.0654,
"step": 163
},
{
"epoch": 0.8864864864864865,
"grad_norm": 0.9540927410125732,
"learning_rate": 4.111731843575419e-05,
"loss": 1.1151,
"step": 164
},
{
"epoch": 0.8918918918918919,
"grad_norm": 0.9412338137626648,
"learning_rate": 4.0111731843575415e-05,
"loss": 1.2111,
"step": 165
},
{
"epoch": 0.8972972972972973,
"grad_norm": 1.0096511840820312,
"learning_rate": 3.910614525139665e-05,
"loss": 1.3087,
"step": 166
},
{
"epoch": 0.9027027027027027,
"grad_norm": 1.0874724388122559,
"learning_rate": 3.8100558659217876e-05,
"loss": 1.1717,
"step": 167
},
{
"epoch": 0.9081081081081082,
"grad_norm": 0.9713255167007446,
"learning_rate": 3.709497206703911e-05,
"loss": 1.1705,
"step": 168
},
{
"epoch": 0.9135135135135135,
"grad_norm": 0.9664978981018066,
"learning_rate": 3.608938547486034e-05,
"loss": 1.2627,
"step": 169
},
{
"epoch": 0.918918918918919,
"grad_norm": 0.966834306716919,
"learning_rate": 3.508379888268157e-05,
"loss": 1.1916,
"step": 170
},
{
"epoch": 0.9243243243243243,
"grad_norm": 1.0459810495376587,
"learning_rate": 3.40782122905028e-05,
"loss": 1.1613,
"step": 171
},
{
"epoch": 0.9297297297297298,
"grad_norm": 1.3951828479766846,
"learning_rate": 3.307262569832403e-05,
"loss": 1.1818,
"step": 172
},
{
"epoch": 0.9351351351351351,
"grad_norm": 1.0367494821548462,
"learning_rate": 3.206703910614525e-05,
"loss": 1.0986,
"step": 173
},
{
"epoch": 0.9405405405405406,
"grad_norm": 0.9425138831138611,
"learning_rate": 3.106145251396648e-05,
"loss": 1.1385,
"step": 174
},
{
"epoch": 0.9459459459459459,
"grad_norm": 0.9617014527320862,
"learning_rate": 3.0055865921787714e-05,
"loss": 1.1954,
"step": 175
},
{
"epoch": 0.9513513513513514,
"grad_norm": 0.9694061279296875,
"learning_rate": 2.9050279329608944e-05,
"loss": 1.2628,
"step": 176
},
{
"epoch": 0.9567567567567568,
"grad_norm": 0.9338908195495605,
"learning_rate": 2.8044692737430168e-05,
"loss": 1.1221,
"step": 177
},
{
"epoch": 0.9621621621621622,
"grad_norm": 0.8477990031242371,
"learning_rate": 2.70391061452514e-05,
"loss": 1.1914,
"step": 178
},
{
"epoch": 0.9675675675675676,
"grad_norm": 0.8513604998588562,
"learning_rate": 2.603351955307263e-05,
"loss": 1.1203,
"step": 179
},
{
"epoch": 0.972972972972973,
"grad_norm": 1.0121779441833496,
"learning_rate": 2.5027932960893856e-05,
"loss": 1.1387,
"step": 180
},
{
"epoch": 0.9783783783783784,
"grad_norm": 0.8904174566268921,
"learning_rate": 2.4022346368715086e-05,
"loss": 1.1362,
"step": 181
},
{
"epoch": 0.9837837837837838,
"grad_norm": 0.9687130451202393,
"learning_rate": 2.3016759776536314e-05,
"loss": 1.1407,
"step": 182
},
{
"epoch": 0.9891891891891892,
"grad_norm": 0.936590850353241,
"learning_rate": 2.2011173184357544e-05,
"loss": 1.1859,
"step": 183
},
{
"epoch": 0.9945945945945946,
"grad_norm": 0.8786422610282898,
"learning_rate": 2.100558659217877e-05,
"loss": 1.184,
"step": 184
},
{
"epoch": 1.0,
"grad_norm": 0.894349217414856,
"learning_rate": 2e-05,
"loss": 0.9644,
"step": 185
}
],
"logging_steps": 1.0,
"max_steps": 185,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 0,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 6.38021889121321e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}