{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.979631425800194,
"eval_steps": 5000,
"global_step": 384,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.015518913676042677,
"grad_norm": 0.416015625,
"learning_rate": 0.0001,
"loss": 0.831,
"num_input_tokens_seen": 524288,
"step": 2
},
{
"epoch": 0.031037827352085354,
"grad_norm": 0.49609375,
"learning_rate": 9.999323662872997e-05,
"loss": 0.7398,
"num_input_tokens_seen": 1048576,
"step": 4
},
{
"epoch": 0.04655674102812803,
"grad_norm": 0.333984375,
"learning_rate": 9.99729483446475e-05,
"loss": 0.6438,
"num_input_tokens_seen": 1572864,
"step": 6
},
{
"epoch": 0.06207565470417071,
"grad_norm": 1.0703125,
"learning_rate": 9.993914063644052e-05,
"loss": 0.6032,
"num_input_tokens_seen": 2097152,
"step": 8
},
{
"epoch": 0.07759456838021339,
"grad_norm": 0.314453125,
"learning_rate": 9.989182265027232e-05,
"loss": 0.5433,
"num_input_tokens_seen": 2621440,
"step": 10
},
{
"epoch": 0.09311348205625607,
"grad_norm": 0.2236328125,
"learning_rate": 9.98310071873072e-05,
"loss": 0.5228,
"num_input_tokens_seen": 3145728,
"step": 12
},
{
"epoch": 0.10863239573229874,
"grad_norm": 0.3359375,
"learning_rate": 9.97567107002474e-05,
"loss": 0.4702,
"num_input_tokens_seen": 3670016,
"step": 14
},
{
"epoch": 0.12415130940834142,
"grad_norm": 0.2109375,
"learning_rate": 9.966895328888194e-05,
"loss": 0.4574,
"num_input_tokens_seen": 4194304,
"step": 16
},
{
"epoch": 0.1396702230843841,
"grad_norm": 0.224609375,
"learning_rate": 9.956775869464901e-05,
"loss": 0.5093,
"num_input_tokens_seen": 4718592,
"step": 18
},
{
"epoch": 0.15518913676042678,
"grad_norm": 0.25390625,
"learning_rate": 9.945315429421306e-05,
"loss": 0.4771,
"num_input_tokens_seen": 5242880,
"step": 20
},
{
"epoch": 0.17070805043646944,
"grad_norm": 0.201171875,
"learning_rate": 9.932517109205849e-05,
"loss": 0.4343,
"num_input_tokens_seen": 5767168,
"step": 22
},
{
"epoch": 0.18622696411251213,
"grad_norm": 0.1650390625,
"learning_rate": 9.918384371210176e-05,
"loss": 0.4455,
"num_input_tokens_seen": 6291456,
"step": 24
},
{
"epoch": 0.2017458777885548,
"grad_norm": 3.265625,
"learning_rate": 9.902921038832455e-05,
"loss": 0.4669,
"num_input_tokens_seen": 6815744,
"step": 26
},
{
"epoch": 0.21726479146459748,
"grad_norm": 2.421875,
"learning_rate": 9.886131295443003e-05,
"loss": 0.4723,
"num_input_tokens_seen": 7340032,
"step": 28
},
{
"epoch": 0.23278370514064015,
"grad_norm": 0.2001953125,
"learning_rate": 9.868019683252543e-05,
"loss": 0.4364,
"num_input_tokens_seen": 7864320,
"step": 30
},
{
"epoch": 0.24830261881668284,
"grad_norm": 2.0625,
"learning_rate": 9.848591102083375e-05,
"loss": 0.4013,
"num_input_tokens_seen": 8388608,
"step": 32
},
{
"epoch": 0.2638215324927255,
"grad_norm": 0.1435546875,
"learning_rate": 9.82785080804381e-05,
"loss": 0.3875,
"num_input_tokens_seen": 8912896,
"step": 34
},
{
"epoch": 0.2793404461687682,
"grad_norm": 0.125,
"learning_rate": 9.805804412106198e-05,
"loss": 0.4187,
"num_input_tokens_seen": 9437184,
"step": 36
},
{
"epoch": 0.2948593598448109,
"grad_norm": 0.12255859375,
"learning_rate": 9.782457878588977e-05,
"loss": 0.3981,
"num_input_tokens_seen": 9961472,
"step": 38
},
{
"epoch": 0.31037827352085356,
"grad_norm": 0.10302734375,
"learning_rate": 9.757817523543109e-05,
"loss": 0.4121,
"num_input_tokens_seen": 10485760,
"step": 40
},
{
"epoch": 0.3258971871968962,
"grad_norm": 0.1318359375,
"learning_rate": 9.731890013043368e-05,
"loss": 0.392,
"num_input_tokens_seen": 11010048,
"step": 42
},
{
"epoch": 0.3414161008729389,
"grad_norm": 0.10546875,
"learning_rate": 9.704682361384941e-05,
"loss": 0.3845,
"num_input_tokens_seen": 11534336,
"step": 44
},
{
"epoch": 0.3569350145489816,
"grad_norm": 0.0859375,
"learning_rate": 9.676201929185809e-05,
"loss": 0.397,
"num_input_tokens_seen": 12058624,
"step": 46
},
{
"epoch": 0.37245392822502427,
"grad_norm": 0.083984375,
"learning_rate": 9.646456421395446e-05,
"loss": 0.3753,
"num_input_tokens_seen": 12582912,
"step": 48
},
{
"epoch": 0.3879728419010669,
"grad_norm": 0.0888671875,
"learning_rate": 9.615453885210369e-05,
"loss": 0.387,
"num_input_tokens_seen": 13107200,
"step": 50
},
{
"epoch": 0.4034917555771096,
"grad_norm": 0.0703125,
"learning_rate": 9.583202707897074e-05,
"loss": 0.3724,
"num_input_tokens_seen": 13631488,
"step": 52
},
{
"epoch": 0.4190106692531523,
"grad_norm": 0.07958984375,
"learning_rate": 9.549711614523007e-05,
"loss": 0.4394,
"num_input_tokens_seen": 14155776,
"step": 54
},
{
"epoch": 0.43452958292919497,
"grad_norm": 0.0751953125,
"learning_rate": 9.514989665596114e-05,
"loss": 0.4177,
"num_input_tokens_seen": 14680064,
"step": 56
},
{
"epoch": 0.45004849660523766,
"grad_norm": 0.078125,
"learning_rate": 9.479046254613673e-05,
"loss": 0.3939,
"num_input_tokens_seen": 15204352,
"step": 58
},
{
"epoch": 0.4655674102812803,
"grad_norm": 0.076171875,
"learning_rate": 9.441891105521006e-05,
"loss": 0.4207,
"num_input_tokens_seen": 15728640,
"step": 60
},
{
"epoch": 0.481086323957323,
"grad_norm": 0.07177734375,
"learning_rate": 9.403534270080829e-05,
"loss": 0.3653,
"num_input_tokens_seen": 16252928,
"step": 62
},
{
"epoch": 0.49660523763336567,
"grad_norm": 0.0966796875,
"learning_rate": 9.3639861251539e-05,
"loss": 0.3925,
"num_input_tokens_seen": 16777216,
"step": 64
},
{
"epoch": 0.5121241513094084,
"grad_norm": 0.0859375,
"learning_rate": 9.323257369891703e-05,
"loss": 0.3982,
"num_input_tokens_seen": 17301504,
"step": 66
},
{
"epoch": 0.527643064985451,
"grad_norm": 0.07080078125,
"learning_rate": 9.281359022841965e-05,
"loss": 0.3709,
"num_input_tokens_seen": 17825792,
"step": 68
},
{
"epoch": 0.5431619786614937,
"grad_norm": 0.068359375,
"learning_rate": 9.238302418967756e-05,
"loss": 0.3744,
"num_input_tokens_seen": 18350080,
"step": 70
},
{
"epoch": 0.5586808923375364,
"grad_norm": 0.07666015625,
"learning_rate": 9.194099206580982e-05,
"loss": 0.3929,
"num_input_tokens_seen": 18874368,
"step": 72
},
{
"epoch": 0.574199806013579,
"grad_norm": 0.0771484375,
"learning_rate": 9.148761344191109e-05,
"loss": 0.3716,
"num_input_tokens_seen": 19398656,
"step": 74
},
{
"epoch": 0.5897187196896218,
"grad_norm": 0.0751953125,
"learning_rate": 9.102301097269974e-05,
"loss": 0.3959,
"num_input_tokens_seen": 19922944,
"step": 76
},
{
"epoch": 0.6052376333656644,
"grad_norm": 0.07666015625,
"learning_rate": 9.054731034933549e-05,
"loss": 0.3514,
"num_input_tokens_seen": 20447232,
"step": 78
},
{
"epoch": 0.6207565470417071,
"grad_norm": 0.0751953125,
"learning_rate": 9.006064026541548e-05,
"loss": 0.3767,
"num_input_tokens_seen": 20971520,
"step": 80
},
{
"epoch": 0.6362754607177498,
"grad_norm": 0.1376953125,
"learning_rate": 8.956313238215824e-05,
"loss": 0.371,
"num_input_tokens_seen": 21495808,
"step": 82
},
{
"epoch": 0.6517943743937924,
"grad_norm": 0.1171875,
"learning_rate": 8.905492129278478e-05,
"loss": 0.3529,
"num_input_tokens_seen": 22020096,
"step": 84
},
{
"epoch": 0.6673132880698351,
"grad_norm": 0.06640625,
"learning_rate": 8.853614448610631e-05,
"loss": 0.3044,
"num_input_tokens_seen": 22544384,
"step": 86
},
{
"epoch": 0.6828322017458778,
"grad_norm": 0.072265625,
"learning_rate": 8.800694230932884e-05,
"loss": 0.3532,
"num_input_tokens_seen": 23068672,
"step": 88
},
{
"epoch": 0.6983511154219205,
"grad_norm": 0.06787109375,
"learning_rate": 8.74674579300843e-05,
"loss": 0.3461,
"num_input_tokens_seen": 23592960,
"step": 90
},
{
"epoch": 0.7138700290979632,
"grad_norm": 0.0693359375,
"learning_rate": 8.691783729769874e-05,
"loss": 0.3513,
"num_input_tokens_seen": 24117248,
"step": 92
},
{
"epoch": 0.7293889427740058,
"grad_norm": 0.06689453125,
"learning_rate": 8.635822910370792e-05,
"loss": 0.3842,
"num_input_tokens_seen": 24641536,
"step": 94
},
{
"epoch": 0.7449078564500485,
"grad_norm": 0.11083984375,
"learning_rate": 8.578878474163115e-05,
"loss": 0.363,
"num_input_tokens_seen": 25165824,
"step": 96
},
{
"epoch": 0.7604267701260912,
"grad_norm": 0.06787109375,
"learning_rate": 8.520965826601394e-05,
"loss": 0.3079,
"num_input_tokens_seen": 25690112,
"step": 98
},
{
"epoch": 0.7759456838021338,
"grad_norm": 0.08203125,
"learning_rate": 8.462100635075097e-05,
"loss": 0.3769,
"num_input_tokens_seen": 26214400,
"step": 100
},
{
"epoch": 0.7914645974781765,
"grad_norm": 0.07470703125,
"learning_rate": 8.40229882467003e-05,
"loss": 0.3907,
"num_input_tokens_seen": 26738688,
"step": 102
},
{
"epoch": 0.8069835111542192,
"grad_norm": 0.07080078125,
"learning_rate": 8.341576573860048e-05,
"loss": 0.3457,
"num_input_tokens_seen": 27262976,
"step": 104
},
{
"epoch": 0.8225024248302619,
"grad_norm": 0.07666015625,
"learning_rate": 8.279950310130217e-05,
"loss": 0.3889,
"num_input_tokens_seen": 27787264,
"step": 106
},
{
"epoch": 0.8380213385063046,
"grad_norm": 0.06494140625,
"learning_rate": 8.2174367055326e-05,
"loss": 0.3142,
"num_input_tokens_seen": 28311552,
"step": 108
},
{
"epoch": 0.8535402521823472,
"grad_norm": 0.07275390625,
"learning_rate": 8.154052672175887e-05,
"loss": 0.3299,
"num_input_tokens_seen": 28835840,
"step": 110
},
{
"epoch": 0.8690591658583899,
"grad_norm": 0.0712890625,
"learning_rate": 8.089815357650089e-05,
"loss": 0.3425,
"num_input_tokens_seen": 29360128,
"step": 112
},
{
"epoch": 0.8845780795344326,
"grad_norm": 0.0712890625,
"learning_rate": 8.024742140387506e-05,
"loss": 0.3363,
"num_input_tokens_seen": 29884416,
"step": 114
},
{
"epoch": 0.9000969932104753,
"grad_norm": 0.083984375,
"learning_rate": 7.95885062496126e-05,
"loss": 0.3725,
"num_input_tokens_seen": 30408704,
"step": 116
},
{
"epoch": 0.915615906886518,
"grad_norm": 0.07666015625,
"learning_rate": 7.892158637322646e-05,
"loss": 0.3397,
"num_input_tokens_seen": 30932992,
"step": 118
},
{
"epoch": 0.9311348205625606,
"grad_norm": 0.0751953125,
"learning_rate": 7.824684219978591e-05,
"loss": 0.2812,
"num_input_tokens_seen": 31457280,
"step": 120
},
{
"epoch": 0.9466537342386033,
"grad_norm": 0.1015625,
"learning_rate": 7.756445627110523e-05,
"loss": 0.3555,
"num_input_tokens_seen": 31981568,
"step": 122
},
{
"epoch": 0.962172647914646,
"grad_norm": 0.072265625,
"learning_rate": 7.687461319635981e-05,
"loss": 0.3362,
"num_input_tokens_seen": 32505856,
"step": 124
},
{
"epoch": 0.9776915615906887,
"grad_norm": 0.07177734375,
"learning_rate": 7.6177499602143e-05,
"loss": 0.3133,
"num_input_tokens_seen": 33030144,
"step": 126
},
{
"epoch": 0.9932104752667313,
"grad_norm": 0.06982421875,
"learning_rate": 7.547330408197695e-05,
"loss": 0.3119,
"num_input_tokens_seen": 33554432,
"step": 128
},
{
"epoch": 1.008729388942774,
"grad_norm": 0.07666015625,
"learning_rate": 7.476221714529167e-05,
"loss": 0.3117,
"num_input_tokens_seen": 34078720,
"step": 130
},
{
"epoch": 1.0242483026188167,
"grad_norm": 0.07958984375,
"learning_rate": 7.404443116588548e-05,
"loss": 0.329,
"num_input_tokens_seen": 34603008,
"step": 132
},
{
"epoch": 1.0397672162948595,
"grad_norm": 0.078125,
"learning_rate": 7.332014032988123e-05,
"loss": 0.279,
"num_input_tokens_seen": 35127296,
"step": 134
},
{
"epoch": 1.055286129970902,
"grad_norm": 0.0703125,
"learning_rate": 7.258954058319216e-05,
"loss": 0.2682,
"num_input_tokens_seen": 35651584,
"step": 136
},
{
"epoch": 1.0708050436469447,
"grad_norm": 0.0732421875,
"learning_rate": 7.185282957851175e-05,
"loss": 0.293,
"num_input_tokens_seen": 36175872,
"step": 138
},
{
"epoch": 1.0863239573229875,
"grad_norm": 0.08154296875,
"learning_rate": 7.111020662184174e-05,
"loss": 0.315,
"num_input_tokens_seen": 36700160,
"step": 140
},
{
"epoch": 1.10184287099903,
"grad_norm": 0.072265625,
"learning_rate": 7.036187261857289e-05,
"loss": 0.289,
"num_input_tokens_seen": 37224448,
"step": 142
},
{
"epoch": 1.1173617846750727,
"grad_norm": 0.07763671875,
"learning_rate": 6.960803001913314e-05,
"loss": 0.2808,
"num_input_tokens_seen": 37748736,
"step": 144
},
{
"epoch": 1.1328806983511155,
"grad_norm": 0.07958984375,
"learning_rate": 6.884888276421766e-05,
"loss": 0.318,
"num_input_tokens_seen": 38273024,
"step": 146
},
{
"epoch": 1.148399612027158,
"grad_norm": 0.08251953125,
"learning_rate": 6.808463622961578e-05,
"loss": 0.2685,
"num_input_tokens_seen": 38797312,
"step": 148
},
{
"epoch": 1.1639185257032008,
"grad_norm": 0.0791015625,
"learning_rate": 6.731549717064974e-05,
"loss": 0.3121,
"num_input_tokens_seen": 39321600,
"step": 150
},
{
"epoch": 1.1794374393792435,
"grad_norm": 0.0830078125,
"learning_rate": 6.654167366624009e-05,
"loss": 0.2835,
"num_input_tokens_seen": 39845888,
"step": 152
},
{
"epoch": 1.1949563530552862,
"grad_norm": 0.0830078125,
"learning_rate": 6.576337506261314e-05,
"loss": 0.2905,
"num_input_tokens_seen": 40370176,
"step": 154
},
{
"epoch": 1.2104752667313288,
"grad_norm": 0.08984375,
"learning_rate": 6.498081191666548e-05,
"loss": 0.3277,
"num_input_tokens_seen": 40894464,
"step": 156
},
{
"epoch": 1.2259941804073715,
"grad_norm": 0.0859375,
"learning_rate": 6.419419593900108e-05,
"loss": 0.2788,
"num_input_tokens_seen": 41418752,
"step": 158
},
{
"epoch": 1.2415130940834143,
"grad_norm": 0.0791015625,
"learning_rate": 6.340373993665607e-05,
"loss": 0.2971,
"num_input_tokens_seen": 41943040,
"step": 160
},
{
"epoch": 1.2570320077594568,
"grad_norm": 0.091796875,
"learning_rate": 6.260965775552712e-05,
"loss": 0.287,
"num_input_tokens_seen": 42467328,
"step": 162
},
{
"epoch": 1.2725509214354995,
"grad_norm": 0.0849609375,
"learning_rate": 6.181216422251862e-05,
"loss": 0.3196,
"num_input_tokens_seen": 42991616,
"step": 164
},
{
"epoch": 1.2880698351115423,
"grad_norm": 0.083984375,
"learning_rate": 6.101147508742455e-05,
"loss": 0.3021,
"num_input_tokens_seen": 43515904,
"step": 166
},
{
"epoch": 1.3035887487875848,
"grad_norm": 0.0810546875,
"learning_rate": 6.0207806964560584e-05,
"loss": 0.2329,
"num_input_tokens_seen": 44040192,
"step": 168
},
{
"epoch": 1.3191076624636275,
"grad_norm": 0.08984375,
"learning_rate": 5.940137727416246e-05,
"loss": 0.2803,
"num_input_tokens_seen": 44564480,
"step": 170
},
{
"epoch": 1.3346265761396703,
"grad_norm": 0.0869140625,
"learning_rate": 5.8592404183566144e-05,
"loss": 0.2744,
"num_input_tokens_seen": 45088768,
"step": 172
},
{
"epoch": 1.3501454898157128,
"grad_norm": 0.08544921875,
"learning_rate": 5.778110654818601e-05,
"loss": 0.3332,
"num_input_tokens_seen": 45613056,
"step": 174
},
{
"epoch": 1.3656644034917556,
"grad_norm": 0.09814453125,
"learning_rate": 5.6967703852306786e-05,
"loss": 0.3223,
"num_input_tokens_seen": 46137344,
"step": 176
},
{
"epoch": 1.3811833171677983,
"grad_norm": 0.083984375,
"learning_rate": 5.6152416149705455e-05,
"loss": 0.3127,
"num_input_tokens_seen": 46661632,
"step": 178
},
{
"epoch": 1.3967022308438408,
"grad_norm": 0.09326171875,
"learning_rate": 5.5335464004118986e-05,
"loss": 0.2908,
"num_input_tokens_seen": 47185920,
"step": 180
},
{
"epoch": 1.4122211445198836,
"grad_norm": 0.08984375,
"learning_rate": 5.4517068429574215e-05,
"loss": 0.2918,
"num_input_tokens_seen": 47710208,
"step": 182
},
{
"epoch": 1.4277400581959263,
"grad_norm": 0.10400390625,
"learning_rate": 5.3697450830595774e-05,
"loss": 0.268,
"num_input_tokens_seen": 48234496,
"step": 184
},
{
"epoch": 1.4432589718719688,
"grad_norm": 0.0830078125,
"learning_rate": 5.287683294230855e-05,
"loss": 0.2862,
"num_input_tokens_seen": 48758784,
"step": 186
},
{
"epoch": 1.4587778855480116,
"grad_norm": 0.0966796875,
"learning_rate": 5.205543677045049e-05,
"loss": 0.3054,
"num_input_tokens_seen": 49283072,
"step": 188
},
{
"epoch": 1.4742967992240543,
"grad_norm": 0.087890625,
"learning_rate": 5.1233484531312414e-05,
"loss": 0.2814,
"num_input_tokens_seen": 49807360,
"step": 190
},
{
"epoch": 1.489815712900097,
"grad_norm": 0.1123046875,
"learning_rate": 5.0411198591620676e-05,
"loss": 0.2703,
"num_input_tokens_seen": 50331648,
"step": 192
},
{
"epoch": 1.5053346265761398,
"grad_norm": 0.11181640625,
"learning_rate": 4.958880140837933e-05,
"loss": 0.2689,
"num_input_tokens_seen": 50855936,
"step": 194
},
{
"epoch": 1.5208535402521823,
"grad_norm": 0.07861328125,
"learning_rate": 4.876651546868759e-05,
"loss": 0.3013,
"num_input_tokens_seen": 51380224,
"step": 196
},
{
"epoch": 1.536372453928225,
"grad_norm": 0.0849609375,
"learning_rate": 4.794456322954952e-05,
"loss": 0.2751,
"num_input_tokens_seen": 51904512,
"step": 198
},
{
"epoch": 1.5518913676042678,
"grad_norm": 0.2119140625,
"learning_rate": 4.712316705769145e-05,
"loss": 0.3178,
"num_input_tokens_seen": 52428800,
"step": 200
},
{
"epoch": 1.5674102812803103,
"grad_norm": 0.0947265625,
"learning_rate": 4.630254916940424e-05,
"loss": 0.2742,
"num_input_tokens_seen": 52953088,
"step": 202
},
{
"epoch": 1.582929194956353,
"grad_norm": 0.0888671875,
"learning_rate": 4.548293157042581e-05,
"loss": 0.2751,
"num_input_tokens_seen": 53477376,
"step": 204
},
{
"epoch": 1.5984481086323958,
"grad_norm": 0.09619140625,
"learning_rate": 4.466453599588103e-05,
"loss": 0.3256,
"num_input_tokens_seen": 54001664,
"step": 206
},
{
"epoch": 1.6139670223084384,
"grad_norm": 0.09423828125,
"learning_rate": 4.384758385029457e-05,
"loss": 0.2603,
"num_input_tokens_seen": 54525952,
"step": 208
},
{
"epoch": 1.629485935984481,
"grad_norm": 0.08740234375,
"learning_rate": 4.3032296147693225e-05,
"loss": 0.2598,
"num_input_tokens_seen": 55050240,
"step": 210
},
{
"epoch": 1.6450048496605238,
"grad_norm": 0.0908203125,
"learning_rate": 4.2218893451814005e-05,
"loss": 0.2811,
"num_input_tokens_seen": 55574528,
"step": 212
},
{
"epoch": 1.6605237633365664,
"grad_norm": 0.08544921875,
"learning_rate": 4.140759581643386e-05,
"loss": 0.2386,
"num_input_tokens_seen": 56098816,
"step": 214
},
{
"epoch": 1.6760426770126091,
"grad_norm": 0.09326171875,
"learning_rate": 4.059862272583755e-05,
"loss": 0.2999,
"num_input_tokens_seen": 56623104,
"step": 216
},
{
"epoch": 1.6915615906886519,
"grad_norm": 0.08935546875,
"learning_rate": 3.979219303543942e-05,
"loss": 0.2857,
"num_input_tokens_seen": 57147392,
"step": 218
},
{
"epoch": 1.7070805043646944,
"grad_norm": 0.09228515625,
"learning_rate": 3.898852491257546e-05,
"loss": 0.2533,
"num_input_tokens_seen": 57671680,
"step": 220
},
{
"epoch": 1.7225994180407371,
"grad_norm": 0.10009765625,
"learning_rate": 3.818783577748138e-05,
"loss": 0.306,
"num_input_tokens_seen": 58195968,
"step": 222
},
{
"epoch": 1.7381183317167799,
"grad_norm": 0.0927734375,
"learning_rate": 3.739034224447289e-05,
"loss": 0.2594,
"num_input_tokens_seen": 58720256,
"step": 224
},
{
"epoch": 1.7536372453928224,
"grad_norm": 0.09716796875,
"learning_rate": 3.659626006334395e-05,
"loss": 0.284,
"num_input_tokens_seen": 59244544,
"step": 226
},
{
"epoch": 1.7691561590688651,
"grad_norm": 0.10546875,
"learning_rate": 3.580580406099893e-05,
"loss": 0.33,
"num_input_tokens_seen": 59768832,
"step": 228
},
{
"epoch": 1.7846750727449079,
"grad_norm": 0.10009765625,
"learning_rate": 3.501918808333453e-05,
"loss": 0.2968,
"num_input_tokens_seen": 60293120,
"step": 230
},
{
"epoch": 1.8001939864209504,
"grad_norm": 0.0869140625,
"learning_rate": 3.4236624937386876e-05,
"loss": 0.2836,
"num_input_tokens_seen": 60817408,
"step": 232
},
{
"epoch": 1.8157129000969934,
"grad_norm": 0.09716796875,
"learning_rate": 3.3458326333759925e-05,
"loss": 0.2452,
"num_input_tokens_seen": 61341696,
"step": 234
},
{
"epoch": 1.831231813773036,
"grad_norm": 0.09326171875,
"learning_rate": 3.268450282935026e-05,
"loss": 0.2526,
"num_input_tokens_seen": 61865984,
"step": 236
},
{
"epoch": 1.8467507274490784,
"grad_norm": 0.0927734375,
"learning_rate": 3.191536377038422e-05,
"loss": 0.2578,
"num_input_tokens_seen": 62390272,
"step": 238
},
{
"epoch": 1.8622696411251214,
"grad_norm": 0.09375,
"learning_rate": 3.115111723578235e-05,
"loss": 0.2895,
"num_input_tokens_seen": 62914560,
"step": 240
},
{
"epoch": 1.877788554801164,
"grad_norm": 0.134765625,
"learning_rate": 3.0391969980866875e-05,
"loss": 0.3047,
"num_input_tokens_seen": 63438848,
"step": 242
},
{
"epoch": 1.8933074684772064,
"grad_norm": 0.09521484375,
"learning_rate": 2.963812738142713e-05,
"loss": 0.2958,
"num_input_tokens_seen": 63963136,
"step": 244
},
{
"epoch": 1.9088263821532494,
"grad_norm": 0.1015625,
"learning_rate": 2.888979337815828e-05,
"loss": 0.2598,
"num_input_tokens_seen": 64487424,
"step": 246
},
{
"epoch": 1.924345295829292,
"grad_norm": 0.09375,
"learning_rate": 2.8147170421488272e-05,
"loss": 0.2699,
"num_input_tokens_seen": 65011712,
"step": 248
},
{
"epoch": 1.9398642095053347,
"grad_norm": 0.09130859375,
"learning_rate": 2.7410459416807853e-05,
"loss": 0.2827,
"num_input_tokens_seen": 65536000,
"step": 250
},
{
"epoch": 1.9553831231813774,
"grad_norm": 0.1953125,
"learning_rate": 2.6679859670118783e-05,
"loss": 0.3119,
"num_input_tokens_seen": 66060288,
"step": 252
},
{
"epoch": 1.97090203685742,
"grad_norm": 0.09716796875,
"learning_rate": 2.5955568834114524e-05,
"loss": 0.2837,
"num_input_tokens_seen": 66584576,
"step": 254
},
{
"epoch": 1.9864209505334627,
"grad_norm": 0.08935546875,
"learning_rate": 2.5237782854708348e-05,
"loss": 0.2511,
"num_input_tokens_seen": 67108864,
"step": 256
},
{
"epoch": 2.0019398642095054,
"grad_norm": 0.0966796875,
"learning_rate": 2.452669591802307e-05,
"loss": 0.2501,
"num_input_tokens_seen": 67633152,
"step": 258
},
{
"epoch": 2.017458777885548,
"grad_norm": 0.09619140625,
"learning_rate": 2.3822500397857018e-05,
"loss": 0.2296,
"num_input_tokens_seen": 68157440,
"step": 260
},
{
"epoch": 2.0329776915615905,
"grad_norm": 0.0908203125,
"learning_rate": 2.3125386803640187e-05,
"loss": 0.2333,
"num_input_tokens_seen": 68681728,
"step": 262
},
{
"epoch": 2.0484966052376334,
"grad_norm": 0.09716796875,
"learning_rate": 2.2435543728894792e-05,
"loss": 0.2119,
"num_input_tokens_seen": 69206016,
"step": 264
},
{
"epoch": 2.064015518913676,
"grad_norm": 0.099609375,
"learning_rate": 2.175315780021411e-05,
"loss": 0.2676,
"num_input_tokens_seen": 69730304,
"step": 266
},
{
"epoch": 2.079534432589719,
"grad_norm": 0.08837890625,
"learning_rate": 2.1078413626773546e-05,
"loss": 0.2285,
"num_input_tokens_seen": 70254592,
"step": 268
},
{
"epoch": 2.0950533462657615,
"grad_norm": 0.10546875,
"learning_rate": 2.0411493750387423e-05,
"loss": 0.2281,
"num_input_tokens_seen": 70778880,
"step": 270
},
{
"epoch": 2.110572259941804,
"grad_norm": 0.1025390625,
"learning_rate": 1.9752578596124954e-05,
"loss": 0.2701,
"num_input_tokens_seen": 71303168,
"step": 272
},
{
"epoch": 2.126091173617847,
"grad_norm": 0.08984375,
"learning_rate": 1.9101846423499116e-05,
"loss": 0.2033,
"num_input_tokens_seen": 71827456,
"step": 274
},
{
"epoch": 2.1416100872938895,
"grad_norm": 0.10546875,
"learning_rate": 1.8459473278241126e-05,
"loss": 0.2489,
"num_input_tokens_seen": 72351744,
"step": 276
},
{
"epoch": 2.157129000969932,
"grad_norm": 0.1015625,
"learning_rate": 1.7825632944674015e-05,
"loss": 0.2294,
"num_input_tokens_seen": 72876032,
"step": 278
},
{
"epoch": 2.172647914645975,
"grad_norm": 0.10302734375,
"learning_rate": 1.7200496898697832e-05,
"loss": 0.2452,
"num_input_tokens_seen": 73400320,
"step": 280
},
{
"epoch": 2.1881668283220175,
"grad_norm": 0.09423828125,
"learning_rate": 1.6584234261399534e-05,
"loss": 0.242,
"num_input_tokens_seen": 73924608,
"step": 282
},
{
"epoch": 2.20368574199806,
"grad_norm": 0.10888671875,
"learning_rate": 1.5977011753299725e-05,
"loss": 0.2894,
"num_input_tokens_seen": 74448896,
"step": 284
},
{
"epoch": 2.219204655674103,
"grad_norm": 0.0947265625,
"learning_rate": 1.537899364924905e-05,
"loss": 0.231,
"num_input_tokens_seen": 74973184,
"step": 286
},
{
"epoch": 2.2347235693501455,
"grad_norm": 0.11279296875,
"learning_rate": 1.4790341733986085e-05,
"loss": 0.2412,
"num_input_tokens_seen": 75497472,
"step": 288
},
{
"epoch": 2.250242483026188,
"grad_norm": 0.10595703125,
"learning_rate": 1.4211215258368866e-05,
"loss": 0.2464,
"num_input_tokens_seen": 76021760,
"step": 290
},
{
"epoch": 2.265761396702231,
"grad_norm": 0.10693359375,
"learning_rate": 1.3641770896292084e-05,
"loss": 0.2231,
"num_input_tokens_seen": 76546048,
"step": 292
},
{
"epoch": 2.2812803103782735,
"grad_norm": 0.0986328125,
"learning_rate": 1.3082162702301276e-05,
"loss": 0.2432,
"num_input_tokens_seen": 77070336,
"step": 294
},
{
"epoch": 2.296799224054316,
"grad_norm": 0.09423828125,
"learning_rate": 1.253254206991572e-05,
"loss": 0.2147,
"num_input_tokens_seen": 77594624,
"step": 296
},
{
"epoch": 2.312318137730359,
"grad_norm": 0.09521484375,
"learning_rate": 1.1993057690671173e-05,
"loss": 0.249,
"num_input_tokens_seen": 78118912,
"step": 298
},
{
"epoch": 2.3278370514064015,
"grad_norm": 0.0927734375,
"learning_rate": 1.1463855513893695e-05,
"loss": 0.2362,
"num_input_tokens_seen": 78643200,
"step": 300
},
{
"epoch": 2.343355965082444,
"grad_norm": 0.10009765625,
"learning_rate": 1.0945078707215222e-05,
"loss": 0.2232,
"num_input_tokens_seen": 79167488,
"step": 302
},
{
"epoch": 2.358874878758487,
"grad_norm": 0.10595703125,
"learning_rate": 1.0436867617841768e-05,
"loss": 0.2569,
"num_input_tokens_seen": 79691776,
"step": 304
},
{
"epoch": 2.3743937924345295,
"grad_norm": 0.10546875,
"learning_rate": 9.939359734584553e-06,
"loss": 0.214,
"num_input_tokens_seen": 80216064,
"step": 306
},
{
"epoch": 2.3899127061105725,
"grad_norm": 0.09765625,
"learning_rate": 9.452689650664515e-06,
"loss": 0.2451,
"num_input_tokens_seen": 80740352,
"step": 308
},
{
"epoch": 2.405431619786615,
"grad_norm": 0.09375,
"learning_rate": 8.976989027300264e-06,
"loss": 0.2288,
"num_input_tokens_seen": 81264640,
"step": 310
},
{
"epoch": 2.4209505334626575,
"grad_norm": 0.09228515625,
"learning_rate": 8.51238655808892e-06,
"loss": 0.2332,
"num_input_tokens_seen": 81788928,
"step": 312
},
{
"epoch": 2.4364694471387,
"grad_norm": 0.095703125,
"learning_rate": 8.059007934190194e-06,
"loss": 0.202,
"num_input_tokens_seen": 82313216,
"step": 314
},
{
"epoch": 2.451988360814743,
"grad_norm": 0.0908203125,
"learning_rate": 7.61697581032243e-06,
"loss": 0.227,
"num_input_tokens_seen": 82837504,
"step": 316
},
{
"epoch": 2.4675072744907856,
"grad_norm": 0.1025390625,
"learning_rate": 7.186409771580354e-06,
"loss": 0.2429,
"num_input_tokens_seen": 83361792,
"step": 318
},
{
"epoch": 2.4830261881668285,
"grad_norm": 0.09375,
"learning_rate": 6.76742630108298e-06,
"loss": 0.2147,
"num_input_tokens_seen": 83886080,
"step": 320
},
{
"epoch": 2.498545101842871,
"grad_norm": 0.1025390625,
"learning_rate": 6.3601387484610145e-06,
"loss": 0.2423,
"num_input_tokens_seen": 84410368,
"step": 322
},
{
"epoch": 2.5140640155189136,
"grad_norm": 0.09326171875,
"learning_rate": 5.9646572991917116e-06,
"loss": 0.2828,
"num_input_tokens_seen": 84934656,
"step": 324
},
{
"epoch": 2.529582929194956,
"grad_norm": 0.09716796875,
"learning_rate": 5.581088944789953e-06,
"loss": 0.2461,
"num_input_tokens_seen": 85458944,
"step": 326
},
{
"epoch": 2.545101842870999,
"grad_norm": 0.10400390625,
"learning_rate": 5.209537453863289e-06,
"loss": 0.296,
"num_input_tokens_seen": 85983232,
"step": 328
},
{
"epoch": 2.5606207565470416,
"grad_norm": 0.08984375,
"learning_rate": 4.850103344038853e-06,
"loss": 0.2061,
"num_input_tokens_seen": 86507520,
"step": 330
},
{
"epoch": 2.5761396702230845,
"grad_norm": 0.0966796875,
"learning_rate": 4.502883854769935e-06,
"loss": 0.2323,
"num_input_tokens_seen": 87031808,
"step": 332
},
{
"epoch": 2.591658583899127,
"grad_norm": 0.0966796875,
"learning_rate": 4.167972921029262e-06,
"loss": 0.2156,
"num_input_tokens_seen": 87556096,
"step": 334
},
{
"epoch": 2.6071774975751696,
"grad_norm": 0.09228515625,
"learning_rate": 3.845461147896323e-06,
"loss": 0.2393,
"num_input_tokens_seen": 88080384,
"step": 336
},
{
"epoch": 2.6226964112512126,
"grad_norm": 0.09130859375,
"learning_rate": 3.535435786045538e-06,
"loss": 0.2165,
"num_input_tokens_seen": 88604672,
"step": 338
},
{
"epoch": 2.638215324927255,
"grad_norm": 0.0986328125,
"learning_rate": 3.2379807081419187e-06,
"loss": 0.2313,
"num_input_tokens_seen": 89128960,
"step": 340
},
{
"epoch": 2.653734238603298,
"grad_norm": 0.099609375,
"learning_rate": 2.9531763861505966e-06,
"loss": 0.2336,
"num_input_tokens_seen": 89653248,
"step": 342
},
{
"epoch": 2.6692531522793406,
"grad_norm": 0.1005859375,
"learning_rate": 2.6810998695663282e-06,
"loss": 0.2311,
"num_input_tokens_seen": 90177536,
"step": 344
},
{
"epoch": 2.684772065955383,
"grad_norm": 0.09716796875,
"learning_rate": 2.4218247645689307e-06,
"loss": 0.213,
"num_input_tokens_seen": 90701824,
"step": 346
},
{
"epoch": 2.7002909796314256,
"grad_norm": 0.09912109375,
"learning_rate": 2.1754212141102346e-06,
"loss": 0.2364,
"num_input_tokens_seen": 91226112,
"step": 348
},
{
"epoch": 2.7158098933074686,
"grad_norm": 0.0947265625,
"learning_rate": 1.941955878938029e-06,
"loss": 0.2147,
"num_input_tokens_seen": 91750400,
"step": 350
},
{
"epoch": 2.731328806983511,
"grad_norm": 0.10205078125,
"learning_rate": 1.7214919195619127e-06,
"loss": 0.2316,
"num_input_tokens_seen": 92274688,
"step": 352
},
{
"epoch": 2.746847720659554,
"grad_norm": 0.10791015625,
"learning_rate": 1.514088979166256e-06,
"loss": 0.2263,
"num_input_tokens_seen": 92798976,
"step": 354
},
{
"epoch": 2.7623666343355966,
"grad_norm": 0.09423828125,
"learning_rate": 1.3198031674745813e-06,
"loss": 0.2323,
"num_input_tokens_seen": 93323264,
"step": 356
},
{
"epoch": 2.777885548011639,
"grad_norm": 0.09912109375,
"learning_rate": 1.138687045569975e-06,
"loss": 0.2246,
"num_input_tokens_seen": 93847552,
"step": 358
},
{
"epoch": 2.7934044616876816,
"grad_norm": 0.10107421875,
"learning_rate": 9.707896116754488e-07,
"loss": 0.2287,
"num_input_tokens_seen": 94371840,
"step": 360
},
{
"epoch": 2.8089233753637246,
"grad_norm": 0.09521484375,
"learning_rate": 8.161562878982398e-07,
"loss": 0.2081,
"num_input_tokens_seen": 94896128,
"step": 362
},
{
"epoch": 2.824442289039767,
"grad_norm": 0.10009765625,
"learning_rate": 6.74828907941516e-07,
"loss": 0.226,
"num_input_tokens_seen": 95420416,
"step": 364
},
{
"epoch": 2.83996120271581,
"grad_norm": 0.10498046875,
"learning_rate": 5.468457057869358e-07,
"loss": 0.273,
"num_input_tokens_seen": 95944704,
"step": 366
},
{
"epoch": 2.8554801163918526,
"grad_norm": 0.107421875,
"learning_rate": 4.322413053509944e-07,
"loss": 0.2634,
"num_input_tokens_seen": 96468992,
"step": 368
},
{
"epoch": 2.870999030067895,
"grad_norm": 0.10498046875,
"learning_rate": 3.3104671111806593e-07,
"loss": 0.2592,
"num_input_tokens_seen": 96993280,
"step": 370
},
{
"epoch": 2.8865179437439377,
"grad_norm": 0.10595703125,
"learning_rate": 2.432892997526026e-07,
"loss": 0.2566,
"num_input_tokens_seen": 97517568,
"step": 372
},
{
"epoch": 2.9020368574199806,
"grad_norm": 0.099609375,
"learning_rate": 1.6899281269279755e-07,
"loss": 0.2575,
"num_input_tokens_seen": 98041856,
"step": 374
},
{
"epoch": 2.917555771096023,
"grad_norm": 0.09423828125,
"learning_rate": 1.0817734972768944e-07,
"loss": 0.2482,
"num_input_tokens_seen": 98566144,
"step": 376
},
{
"epoch": 2.933074684772066,
"grad_norm": 0.09814453125,
"learning_rate": 6.085936355947897e-08,
"loss": 0.2483,
"num_input_tokens_seen": 99090432,
"step": 378
},
{
"epoch": 2.9485935984481086,
"grad_norm": 0.10205078125,
"learning_rate": 2.7051655352494652e-08,
"loss": 0.2359,
"num_input_tokens_seen": 99614720,
"step": 380
},
{
"epoch": 2.964112512124151,
"grad_norm": 0.10595703125,
"learning_rate": 6.763371270035457e-09,
"loss": 0.2434,
"num_input_tokens_seen": 100139008,
"step": 382
},
{
"epoch": 2.979631425800194,
"grad_norm": 0.09619140625,
"learning_rate": 0.0,
"loss": 0.2042,
"num_input_tokens_seen": 100663296,
"step": 384
},
{
"epoch": 2.979631425800194,
"num_input_tokens_seen": 100663296,
"step": 384,
"total_flos": 4.2827022437921587e+18,
"train_loss": 0.3106601850595325,
"train_runtime": 8133.8849,
"train_samples_per_second": 12.157,
"train_steps_per_second": 0.047
}
],
"logging_steps": 2,
"max_steps": 384,
"num_input_tokens_seen": 100663296,
"num_train_epochs": 3,
"save_steps": 5000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4.2827022437921587e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}