falcon-7b-ft-alpaca-cleaned-dutch / trainer_state.json
BramVanroy's picture
init model
227a3af
raw
history blame
161 kB
{
"best_metric": 1.5448263883590698,
"best_model_checkpoint": "/home/ampere/vanroy/llm-finetuning/instruct-tuning/results/falcon-7b-ft-alpaca-cleaned-dutch/checkpoint-1140",
"epoch": 2.9931618365353305,
"global_step": 1149,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"learning_rate": 2.8571428571428575e-07,
"loss": 2.0637,
"step": 1
},
{
"epoch": 0.01,
"learning_rate": 5.714285714285715e-07,
"loss": 2.0295,
"step": 2
},
{
"epoch": 0.01,
"learning_rate": 8.571428571428572e-07,
"loss": 1.9624,
"step": 3
},
{
"epoch": 0.01,
"learning_rate": 1.142857142857143e-06,
"loss": 1.9473,
"step": 4
},
{
"epoch": 0.01,
"learning_rate": 1.4285714285714286e-06,
"loss": 1.9286,
"step": 5
},
{
"epoch": 0.02,
"learning_rate": 1.7142857142857145e-06,
"loss": 1.9895,
"step": 6
},
{
"epoch": 0.02,
"learning_rate": 2.0000000000000003e-06,
"loss": 1.9534,
"step": 7
},
{
"epoch": 0.02,
"learning_rate": 2.285714285714286e-06,
"loss": 1.9382,
"step": 8
},
{
"epoch": 0.02,
"learning_rate": 2.571428571428571e-06,
"loss": 1.9521,
"step": 9
},
{
"epoch": 0.03,
"learning_rate": 2.8571428571428573e-06,
"loss": 1.9832,
"step": 10
},
{
"epoch": 0.03,
"eval_loss": 1.8889312744140625,
"eval_runtime": 221.7455,
"eval_samples_per_second": 11.662,
"eval_steps_per_second": 0.731,
"step": 10
},
{
"epoch": 0.03,
"learning_rate": 3.142857142857143e-06,
"loss": 1.9819,
"step": 11
},
{
"epoch": 0.03,
"learning_rate": 3.428571428571429e-06,
"loss": 1.9702,
"step": 12
},
{
"epoch": 0.03,
"learning_rate": 3.7142857142857146e-06,
"loss": 1.9717,
"step": 13
},
{
"epoch": 0.04,
"learning_rate": 4.000000000000001e-06,
"loss": 2.0242,
"step": 14
},
{
"epoch": 0.04,
"learning_rate": 4.2857142857142855e-06,
"loss": 2.0544,
"step": 15
},
{
"epoch": 0.04,
"learning_rate": 4.571428571428572e-06,
"loss": 1.8782,
"step": 16
},
{
"epoch": 0.04,
"learning_rate": 4.857142857142858e-06,
"loss": 2.0745,
"step": 17
},
{
"epoch": 0.05,
"learning_rate": 5.142857142857142e-06,
"loss": 1.9383,
"step": 18
},
{
"epoch": 0.05,
"learning_rate": 5.428571428571429e-06,
"loss": 1.9806,
"step": 19
},
{
"epoch": 0.05,
"learning_rate": 5.7142857142857145e-06,
"loss": 1.9355,
"step": 20
},
{
"epoch": 0.05,
"eval_loss": 1.8833638429641724,
"eval_runtime": 221.7609,
"eval_samples_per_second": 11.661,
"eval_steps_per_second": 0.731,
"step": 20
},
{
"epoch": 0.05,
"learning_rate": 6e-06,
"loss": 1.9604,
"step": 21
},
{
"epoch": 0.06,
"learning_rate": 6.285714285714286e-06,
"loss": 1.8709,
"step": 22
},
{
"epoch": 0.06,
"learning_rate": 6.571428571428572e-06,
"loss": 1.9495,
"step": 23
},
{
"epoch": 0.06,
"learning_rate": 6.857142857142858e-06,
"loss": 1.9632,
"step": 24
},
{
"epoch": 0.07,
"learning_rate": 7.1428571428571436e-06,
"loss": 1.9315,
"step": 25
},
{
"epoch": 0.07,
"learning_rate": 7.428571428571429e-06,
"loss": 1.9402,
"step": 26
},
{
"epoch": 0.07,
"learning_rate": 7.714285714285716e-06,
"loss": 1.9944,
"step": 27
},
{
"epoch": 0.07,
"learning_rate": 8.000000000000001e-06,
"loss": 1.9004,
"step": 28
},
{
"epoch": 0.08,
"learning_rate": 8.285714285714287e-06,
"loss": 2.0983,
"step": 29
},
{
"epoch": 0.08,
"learning_rate": 8.571428571428571e-06,
"loss": 1.9694,
"step": 30
},
{
"epoch": 0.08,
"eval_loss": 1.8670501708984375,
"eval_runtime": 221.6801,
"eval_samples_per_second": 11.665,
"eval_steps_per_second": 0.731,
"step": 30
},
{
"epoch": 0.08,
"learning_rate": 8.857142857142858e-06,
"loss": 1.9217,
"step": 31
},
{
"epoch": 0.08,
"learning_rate": 9.142857142857144e-06,
"loss": 1.878,
"step": 32
},
{
"epoch": 0.09,
"learning_rate": 9.42857142857143e-06,
"loss": 1.9016,
"step": 33
},
{
"epoch": 0.09,
"learning_rate": 9.714285714285715e-06,
"loss": 1.9793,
"step": 34
},
{
"epoch": 0.09,
"learning_rate": 1e-05,
"loss": 1.961,
"step": 35
},
{
"epoch": 0.09,
"learning_rate": 9.999980117587285e-06,
"loss": 1.9236,
"step": 36
},
{
"epoch": 0.1,
"learning_rate": 9.999920470507263e-06,
"loss": 1.9858,
"step": 37
},
{
"epoch": 0.1,
"learning_rate": 9.999821059234308e-06,
"loss": 1.9147,
"step": 38
},
{
"epoch": 0.1,
"learning_rate": 9.999681884559027e-06,
"loss": 1.8903,
"step": 39
},
{
"epoch": 0.1,
"learning_rate": 9.999502947588279e-06,
"loss": 1.9048,
"step": 40
},
{
"epoch": 0.1,
"eval_loss": 1.8328224420547485,
"eval_runtime": 221.7913,
"eval_samples_per_second": 11.66,
"eval_steps_per_second": 0.73,
"step": 40
},
{
"epoch": 0.11,
"learning_rate": 9.999284249745143e-06,
"loss": 1.8736,
"step": 41
},
{
"epoch": 0.11,
"learning_rate": 9.99902579276891e-06,
"loss": 1.9191,
"step": 42
},
{
"epoch": 0.11,
"learning_rate": 9.998727578715083e-06,
"loss": 1.903,
"step": 43
},
{
"epoch": 0.11,
"learning_rate": 9.998389609955348e-06,
"loss": 1.8531,
"step": 44
},
{
"epoch": 0.12,
"learning_rate": 9.998011889177558e-06,
"loss": 2.0097,
"step": 45
},
{
"epoch": 0.12,
"learning_rate": 9.997594419385712e-06,
"loss": 1.7101,
"step": 46
},
{
"epoch": 0.12,
"learning_rate": 9.997137203899935e-06,
"loss": 1.9653,
"step": 47
},
{
"epoch": 0.13,
"learning_rate": 9.996640246356446e-06,
"loss": 1.9002,
"step": 48
},
{
"epoch": 0.13,
"learning_rate": 9.996103550707528e-06,
"loss": 1.9036,
"step": 49
},
{
"epoch": 0.13,
"learning_rate": 9.995527121221504e-06,
"loss": 1.8443,
"step": 50
},
{
"epoch": 0.13,
"eval_loss": 1.7970249652862549,
"eval_runtime": 221.713,
"eval_samples_per_second": 11.664,
"eval_steps_per_second": 0.731,
"step": 50
},
{
"epoch": 0.13,
"learning_rate": 9.9949109624827e-06,
"loss": 1.9025,
"step": 51
},
{
"epoch": 0.14,
"learning_rate": 9.994255079391402e-06,
"loss": 1.7249,
"step": 52
},
{
"epoch": 0.14,
"learning_rate": 9.993559477163827e-06,
"loss": 1.8214,
"step": 53
},
{
"epoch": 0.14,
"learning_rate": 9.992824161332073e-06,
"loss": 1.785,
"step": 54
},
{
"epoch": 0.14,
"learning_rate": 9.992049137744084e-06,
"loss": 1.8474,
"step": 55
},
{
"epoch": 0.15,
"learning_rate": 9.991234412563594e-06,
"loss": 1.8446,
"step": 56
},
{
"epoch": 0.15,
"learning_rate": 9.990379992270084e-06,
"loss": 1.9928,
"step": 57
},
{
"epoch": 0.15,
"learning_rate": 9.989485883658729e-06,
"loss": 1.7823,
"step": 58
},
{
"epoch": 0.15,
"learning_rate": 9.988552093840344e-06,
"loss": 1.8821,
"step": 59
},
{
"epoch": 0.16,
"learning_rate": 9.987578630241326e-06,
"loss": 1.7448,
"step": 60
},
{
"epoch": 0.16,
"eval_loss": 1.7710908651351929,
"eval_runtime": 221.7207,
"eval_samples_per_second": 11.663,
"eval_steps_per_second": 0.731,
"step": 60
},
{
"epoch": 0.16,
"learning_rate": 9.986565500603598e-06,
"loss": 1.8721,
"step": 61
},
{
"epoch": 0.16,
"learning_rate": 9.985512712984543e-06,
"loss": 1.8571,
"step": 62
},
{
"epoch": 0.16,
"learning_rate": 9.984420275756945e-06,
"loss": 1.9442,
"step": 63
},
{
"epoch": 0.17,
"learning_rate": 9.98328819760892e-06,
"loss": 1.8412,
"step": 64
},
{
"epoch": 0.17,
"learning_rate": 9.982116487543844e-06,
"loss": 1.9281,
"step": 65
},
{
"epoch": 0.17,
"learning_rate": 9.980905154880288e-06,
"loss": 1.8329,
"step": 66
},
{
"epoch": 0.17,
"learning_rate": 9.979654209251939e-06,
"loss": 1.8401,
"step": 67
},
{
"epoch": 0.18,
"learning_rate": 9.978363660607522e-06,
"loss": 1.8755,
"step": 68
},
{
"epoch": 0.18,
"learning_rate": 9.977033519210725e-06,
"loss": 1.8779,
"step": 69
},
{
"epoch": 0.18,
"learning_rate": 9.975663795640118e-06,
"loss": 1.8004,
"step": 70
},
{
"epoch": 0.18,
"eval_loss": 1.7521506547927856,
"eval_runtime": 221.7459,
"eval_samples_per_second": 11.662,
"eval_steps_per_second": 0.731,
"step": 70
},
{
"epoch": 0.18,
"learning_rate": 9.974254500789065e-06,
"loss": 1.8134,
"step": 71
},
{
"epoch": 0.19,
"learning_rate": 9.972805645865637e-06,
"loss": 1.7562,
"step": 72
},
{
"epoch": 0.19,
"learning_rate": 9.971317242392527e-06,
"loss": 1.9037,
"step": 73
},
{
"epoch": 0.19,
"learning_rate": 9.969789302206957e-06,
"loss": 1.7513,
"step": 74
},
{
"epoch": 0.2,
"learning_rate": 9.968221837460578e-06,
"loss": 1.8005,
"step": 75
},
{
"epoch": 0.2,
"learning_rate": 9.96661486061939e-06,
"loss": 1.7634,
"step": 76
},
{
"epoch": 0.2,
"learning_rate": 9.964968384463616e-06,
"loss": 1.7449,
"step": 77
},
{
"epoch": 0.2,
"learning_rate": 9.963282422087628e-06,
"loss": 1.8057,
"step": 78
},
{
"epoch": 0.21,
"learning_rate": 9.961556986899824e-06,
"loss": 1.7487,
"step": 79
},
{
"epoch": 0.21,
"learning_rate": 9.959792092622532e-06,
"loss": 1.7767,
"step": 80
},
{
"epoch": 0.21,
"eval_loss": 1.7370113134384155,
"eval_runtime": 221.8285,
"eval_samples_per_second": 11.658,
"eval_steps_per_second": 0.73,
"step": 80
},
{
"epoch": 0.21,
"learning_rate": 9.95798775329189e-06,
"loss": 1.8068,
"step": 81
},
{
"epoch": 0.21,
"learning_rate": 9.95614398325775e-06,
"loss": 1.7506,
"step": 82
},
{
"epoch": 0.22,
"learning_rate": 9.95426079718355e-06,
"loss": 1.814,
"step": 83
},
{
"epoch": 0.22,
"learning_rate": 9.952338210046202e-06,
"loss": 1.7255,
"step": 84
},
{
"epoch": 0.22,
"learning_rate": 9.950376237135974e-06,
"loss": 1.7329,
"step": 85
},
{
"epoch": 0.22,
"learning_rate": 9.94837489405637e-06,
"loss": 1.7829,
"step": 86
},
{
"epoch": 0.23,
"learning_rate": 9.946334196724e-06,
"loss": 1.8209,
"step": 87
},
{
"epoch": 0.23,
"learning_rate": 9.944254161368457e-06,
"loss": 1.9201,
"step": 88
},
{
"epoch": 0.23,
"learning_rate": 9.942134804532194e-06,
"loss": 1.7912,
"step": 89
},
{
"epoch": 0.23,
"learning_rate": 9.939976143070378e-06,
"loss": 1.7733,
"step": 90
},
{
"epoch": 0.23,
"eval_loss": 1.7247874736785889,
"eval_runtime": 221.7361,
"eval_samples_per_second": 11.663,
"eval_steps_per_second": 0.731,
"step": 90
},
{
"epoch": 0.24,
"learning_rate": 9.937778194150771e-06,
"loss": 1.7701,
"step": 91
},
{
"epoch": 0.24,
"learning_rate": 9.935540975253582e-06,
"loss": 1.8053,
"step": 92
},
{
"epoch": 0.24,
"learning_rate": 9.933264504171337e-06,
"loss": 1.8445,
"step": 93
},
{
"epoch": 0.24,
"learning_rate": 9.930948799008728e-06,
"loss": 1.7114,
"step": 94
},
{
"epoch": 0.25,
"learning_rate": 9.92859387818248e-06,
"loss": 1.8548,
"step": 95
},
{
"epoch": 0.25,
"learning_rate": 9.926199760421196e-06,
"loss": 1.8339,
"step": 96
},
{
"epoch": 0.25,
"learning_rate": 9.923766464765208e-06,
"loss": 1.7359,
"step": 97
},
{
"epoch": 0.26,
"learning_rate": 9.921294010566434e-06,
"loss": 1.8538,
"step": 98
},
{
"epoch": 0.26,
"learning_rate": 9.918782417488216e-06,
"loss": 1.7922,
"step": 99
},
{
"epoch": 0.26,
"learning_rate": 9.916231705505166e-06,
"loss": 1.7926,
"step": 100
},
{
"epoch": 0.26,
"eval_loss": 1.7149242162704468,
"eval_runtime": 221.7644,
"eval_samples_per_second": 11.661,
"eval_steps_per_second": 0.731,
"step": 100
},
{
"epoch": 0.26,
"learning_rate": 9.913641894903006e-06,
"loss": 1.7366,
"step": 101
},
{
"epoch": 0.27,
"learning_rate": 9.91101300627841e-06,
"loss": 1.7165,
"step": 102
},
{
"epoch": 0.27,
"learning_rate": 9.908345060538838e-06,
"loss": 1.7273,
"step": 103
},
{
"epoch": 0.27,
"learning_rate": 9.905638078902367e-06,
"loss": 1.7634,
"step": 104
},
{
"epoch": 0.27,
"learning_rate": 9.90289208289753e-06,
"loss": 1.7922,
"step": 105
},
{
"epoch": 0.28,
"learning_rate": 9.900107094363139e-06,
"loss": 1.6726,
"step": 106
},
{
"epoch": 0.28,
"learning_rate": 9.897283135448106e-06,
"loss": 1.9,
"step": 107
},
{
"epoch": 0.28,
"learning_rate": 9.89442022861128e-06,
"loss": 1.8644,
"step": 108
},
{
"epoch": 0.28,
"learning_rate": 9.891518396621257e-06,
"loss": 1.7699,
"step": 109
},
{
"epoch": 0.29,
"learning_rate": 9.888577662556211e-06,
"loss": 1.8258,
"step": 110
},
{
"epoch": 0.29,
"eval_loss": 1.7066211700439453,
"eval_runtime": 221.7672,
"eval_samples_per_second": 11.661,
"eval_steps_per_second": 0.73,
"step": 110
},
{
"epoch": 0.29,
"learning_rate": 9.885598049803693e-06,
"loss": 1.7653,
"step": 111
},
{
"epoch": 0.29,
"learning_rate": 9.882579582060459e-06,
"loss": 1.716,
"step": 112
},
{
"epoch": 0.29,
"learning_rate": 9.87952228333228e-06,
"loss": 1.8161,
"step": 113
},
{
"epoch": 0.3,
"learning_rate": 9.876426177933743e-06,
"loss": 1.6403,
"step": 114
},
{
"epoch": 0.3,
"learning_rate": 9.873291290488068e-06,
"loss": 1.8212,
"step": 115
},
{
"epoch": 0.3,
"learning_rate": 9.870117645926907e-06,
"loss": 1.7464,
"step": 116
},
{
"epoch": 0.3,
"learning_rate": 9.866905269490141e-06,
"loss": 1.812,
"step": 117
},
{
"epoch": 0.31,
"learning_rate": 9.863654186725688e-06,
"loss": 1.7119,
"step": 118
},
{
"epoch": 0.31,
"learning_rate": 9.860364423489299e-06,
"loss": 1.7449,
"step": 119
},
{
"epoch": 0.31,
"learning_rate": 9.857036005944344e-06,
"loss": 1.6709,
"step": 120
},
{
"epoch": 0.31,
"eval_loss": 1.699345588684082,
"eval_runtime": 221.7101,
"eval_samples_per_second": 11.664,
"eval_steps_per_second": 0.731,
"step": 120
},
{
"epoch": 0.32,
"learning_rate": 9.853668960561611e-06,
"loss": 1.8353,
"step": 121
},
{
"epoch": 0.32,
"learning_rate": 9.850263314119095e-06,
"loss": 1.8131,
"step": 122
},
{
"epoch": 0.32,
"learning_rate": 9.846819093701782e-06,
"loss": 1.8758,
"step": 123
},
{
"epoch": 0.32,
"learning_rate": 9.84333632670144e-06,
"loss": 1.7605,
"step": 124
},
{
"epoch": 0.33,
"learning_rate": 9.839815040816391e-06,
"loss": 1.8341,
"step": 125
},
{
"epoch": 0.33,
"learning_rate": 9.8362552640513e-06,
"loss": 1.7422,
"step": 126
},
{
"epoch": 0.33,
"learning_rate": 9.832657024716944e-06,
"loss": 1.7603,
"step": 127
},
{
"epoch": 0.33,
"learning_rate": 9.829020351429999e-06,
"loss": 1.6797,
"step": 128
},
{
"epoch": 0.34,
"learning_rate": 9.825345273112796e-06,
"loss": 1.778,
"step": 129
},
{
"epoch": 0.34,
"learning_rate": 9.82163181899311e-06,
"loss": 1.6612,
"step": 130
},
{
"epoch": 0.34,
"eval_loss": 1.692633032798767,
"eval_runtime": 221.7638,
"eval_samples_per_second": 11.661,
"eval_steps_per_second": 0.731,
"step": 130
},
{
"epoch": 0.34,
"learning_rate": 9.81788001860391e-06,
"loss": 1.799,
"step": 131
},
{
"epoch": 0.34,
"learning_rate": 9.81408990178313e-06,
"loss": 1.6594,
"step": 132
},
{
"epoch": 0.35,
"learning_rate": 9.810261498673441e-06,
"loss": 1.8236,
"step": 133
},
{
"epoch": 0.35,
"learning_rate": 9.806394839722e-06,
"loss": 1.8106,
"step": 134
},
{
"epoch": 0.35,
"learning_rate": 9.802489955680206e-06,
"loss": 1.6582,
"step": 135
},
{
"epoch": 0.35,
"learning_rate": 9.798546877603468e-06,
"loss": 1.8047,
"step": 136
},
{
"epoch": 0.36,
"learning_rate": 9.794565636850948e-06,
"loss": 1.7607,
"step": 137
},
{
"epoch": 0.36,
"learning_rate": 9.790546265085317e-06,
"loss": 1.7379,
"step": 138
},
{
"epoch": 0.36,
"learning_rate": 9.786488794272494e-06,
"loss": 1.7817,
"step": 139
},
{
"epoch": 0.36,
"learning_rate": 9.782393256681406e-06,
"loss": 1.8463,
"step": 140
},
{
"epoch": 0.36,
"eval_loss": 1.686733365058899,
"eval_runtime": 221.7968,
"eval_samples_per_second": 11.659,
"eval_steps_per_second": 0.73,
"step": 140
},
{
"epoch": 0.37,
"learning_rate": 9.77825968488372e-06,
"loss": 1.6746,
"step": 141
},
{
"epoch": 0.37,
"learning_rate": 9.774088111753586e-06,
"loss": 1.81,
"step": 142
},
{
"epoch": 0.37,
"learning_rate": 9.769878570467382e-06,
"loss": 1.7666,
"step": 143
},
{
"epoch": 0.38,
"learning_rate": 9.765631094503442e-06,
"loss": 1.7299,
"step": 144
},
{
"epoch": 0.38,
"learning_rate": 9.761345717641794e-06,
"loss": 1.6813,
"step": 145
},
{
"epoch": 0.38,
"learning_rate": 9.757022473963891e-06,
"loss": 1.8533,
"step": 146
},
{
"epoch": 0.38,
"learning_rate": 9.752661397852338e-06,
"loss": 1.7705,
"step": 147
},
{
"epoch": 0.39,
"learning_rate": 9.748262523990621e-06,
"loss": 1.7602,
"step": 148
},
{
"epoch": 0.39,
"learning_rate": 9.743825887362832e-06,
"loss": 1.8127,
"step": 149
},
{
"epoch": 0.39,
"learning_rate": 9.739351523253386e-06,
"loss": 1.8413,
"step": 150
},
{
"epoch": 0.39,
"eval_loss": 1.6814411878585815,
"eval_runtime": 222.7399,
"eval_samples_per_second": 11.61,
"eval_steps_per_second": 0.727,
"step": 150
},
{
"epoch": 0.39,
"learning_rate": 9.734839467246744e-06,
"loss": 1.7403,
"step": 151
},
{
"epoch": 0.4,
"learning_rate": 9.730289755227131e-06,
"loss": 1.7251,
"step": 152
},
{
"epoch": 0.4,
"learning_rate": 9.725702423378248e-06,
"loss": 1.793,
"step": 153
},
{
"epoch": 0.4,
"learning_rate": 9.721077508182983e-06,
"loss": 1.8803,
"step": 154
},
{
"epoch": 0.4,
"learning_rate": 9.716415046423126e-06,
"loss": 1.877,
"step": 155
},
{
"epoch": 0.41,
"learning_rate": 9.711715075179075e-06,
"loss": 1.7526,
"step": 156
},
{
"epoch": 0.41,
"learning_rate": 9.706977631829535e-06,
"loss": 1.6221,
"step": 157
},
{
"epoch": 0.41,
"learning_rate": 9.702202754051227e-06,
"loss": 1.7265,
"step": 158
},
{
"epoch": 0.41,
"learning_rate": 9.69739047981859e-06,
"loss": 1.7845,
"step": 159
},
{
"epoch": 0.42,
"learning_rate": 9.692540847403468e-06,
"loss": 1.7659,
"step": 160
},
{
"epoch": 0.42,
"eval_loss": 1.6765244007110596,
"eval_runtime": 223.2206,
"eval_samples_per_second": 11.585,
"eval_steps_per_second": 0.726,
"step": 160
},
{
"epoch": 0.42,
"learning_rate": 9.687653895374824e-06,
"loss": 1.7978,
"step": 161
},
{
"epoch": 0.42,
"learning_rate": 9.682729662598412e-06,
"loss": 1.7052,
"step": 162
},
{
"epoch": 0.42,
"learning_rate": 9.677768188236487e-06,
"loss": 1.8091,
"step": 163
},
{
"epoch": 0.43,
"learning_rate": 9.67276951174748e-06,
"loss": 1.7734,
"step": 164
},
{
"epoch": 0.43,
"learning_rate": 9.667733672885688e-06,
"loss": 1.7277,
"step": 165
},
{
"epoch": 0.43,
"learning_rate": 9.662660711700967e-06,
"loss": 1.8164,
"step": 166
},
{
"epoch": 0.44,
"learning_rate": 9.657550668538396e-06,
"loss": 1.7124,
"step": 167
},
{
"epoch": 0.44,
"learning_rate": 9.652403584037973e-06,
"loss": 1.7963,
"step": 168
},
{
"epoch": 0.44,
"learning_rate": 9.647219499134278e-06,
"loss": 1.6524,
"step": 169
},
{
"epoch": 0.44,
"learning_rate": 9.641998455056158e-06,
"loss": 1.69,
"step": 170
},
{
"epoch": 0.44,
"eval_loss": 1.6714941263198853,
"eval_runtime": 221.7019,
"eval_samples_per_second": 11.664,
"eval_steps_per_second": 0.731,
"step": 170
},
{
"epoch": 0.45,
"learning_rate": 9.636740493326398e-06,
"loss": 1.7079,
"step": 171
},
{
"epoch": 0.45,
"learning_rate": 9.631445655761378e-06,
"loss": 1.6603,
"step": 172
},
{
"epoch": 0.45,
"learning_rate": 9.626113984470761e-06,
"loss": 1.6752,
"step": 173
},
{
"epoch": 0.45,
"learning_rate": 9.62074552185714e-06,
"loss": 1.7644,
"step": 174
},
{
"epoch": 0.46,
"learning_rate": 9.615340310615713e-06,
"loss": 1.6803,
"step": 175
},
{
"epoch": 0.46,
"learning_rate": 9.609898393733933e-06,
"loss": 1.6749,
"step": 176
},
{
"epoch": 0.46,
"learning_rate": 9.604419814491179e-06,
"loss": 1.6519,
"step": 177
},
{
"epoch": 0.46,
"learning_rate": 9.598904616458398e-06,
"loss": 1.7138,
"step": 178
},
{
"epoch": 0.47,
"learning_rate": 9.593352843497768e-06,
"loss": 1.6877,
"step": 179
},
{
"epoch": 0.47,
"learning_rate": 9.587764539762345e-06,
"loss": 1.7219,
"step": 180
},
{
"epoch": 0.47,
"eval_loss": 1.667332410812378,
"eval_runtime": 221.7085,
"eval_samples_per_second": 11.664,
"eval_steps_per_second": 0.731,
"step": 180
},
{
"epoch": 0.47,
"learning_rate": 9.582139749695713e-06,
"loss": 1.7226,
"step": 181
},
{
"epoch": 0.47,
"learning_rate": 9.576478518031634e-06,
"loss": 1.7295,
"step": 182
},
{
"epoch": 0.48,
"learning_rate": 9.570780889793681e-06,
"loss": 1.721,
"step": 183
},
{
"epoch": 0.48,
"learning_rate": 9.565046910294895e-06,
"loss": 1.6858,
"step": 184
},
{
"epoch": 0.48,
"learning_rate": 9.559276625137416e-06,
"loss": 1.7445,
"step": 185
},
{
"epoch": 0.48,
"learning_rate": 9.553470080212122e-06,
"loss": 1.6903,
"step": 186
},
{
"epoch": 0.49,
"learning_rate": 9.547627321698257e-06,
"loss": 1.7992,
"step": 187
},
{
"epoch": 0.49,
"learning_rate": 9.541748396063077e-06,
"loss": 1.7254,
"step": 188
},
{
"epoch": 0.49,
"learning_rate": 9.535833350061473e-06,
"loss": 1.6893,
"step": 189
},
{
"epoch": 0.49,
"learning_rate": 9.5298822307356e-06,
"loss": 1.6755,
"step": 190
},
{
"epoch": 0.49,
"eval_loss": 1.662742257118225,
"eval_runtime": 221.8841,
"eval_samples_per_second": 11.655,
"eval_steps_per_second": 0.73,
"step": 190
},
{
"epoch": 0.5,
"learning_rate": 9.523895085414503e-06,
"loss": 1.6372,
"step": 191
},
{
"epoch": 0.5,
"learning_rate": 9.517871961713736e-06,
"loss": 1.8021,
"step": 192
},
{
"epoch": 0.5,
"learning_rate": 9.511812907534995e-06,
"loss": 1.6815,
"step": 193
},
{
"epoch": 0.51,
"learning_rate": 9.505717971065724e-06,
"loss": 1.7048,
"step": 194
},
{
"epoch": 0.51,
"learning_rate": 9.499587200778744e-06,
"loss": 1.6524,
"step": 195
},
{
"epoch": 0.51,
"learning_rate": 9.493420645431853e-06,
"loss": 1.6642,
"step": 196
},
{
"epoch": 0.51,
"learning_rate": 9.48721835406745e-06,
"loss": 1.7152,
"step": 197
},
{
"epoch": 0.52,
"learning_rate": 9.480980376012145e-06,
"loss": 1.6411,
"step": 198
},
{
"epoch": 0.52,
"learning_rate": 9.474706760876356e-06,
"loss": 1.7822,
"step": 199
},
{
"epoch": 0.52,
"learning_rate": 9.468397558553928e-06,
"loss": 1.7823,
"step": 200
},
{
"epoch": 0.52,
"eval_loss": 1.6584206819534302,
"eval_runtime": 221.589,
"eval_samples_per_second": 11.67,
"eval_steps_per_second": 0.731,
"step": 200
},
{
"epoch": 0.52,
"learning_rate": 9.462052819221726e-06,
"loss": 1.6793,
"step": 201
},
{
"epoch": 0.53,
"learning_rate": 9.455672593339241e-06,
"loss": 1.7228,
"step": 202
},
{
"epoch": 0.53,
"learning_rate": 9.449256931648185e-06,
"loss": 1.6657,
"step": 203
},
{
"epoch": 0.53,
"learning_rate": 9.442805885172092e-06,
"loss": 1.7024,
"step": 204
},
{
"epoch": 0.53,
"learning_rate": 9.43631950521591e-06,
"loss": 1.6369,
"step": 205
},
{
"epoch": 0.54,
"learning_rate": 9.429797843365594e-06,
"loss": 1.7064,
"step": 206
},
{
"epoch": 0.54,
"learning_rate": 9.42324095148769e-06,
"loss": 1.6594,
"step": 207
},
{
"epoch": 0.54,
"learning_rate": 9.41664888172893e-06,
"loss": 1.6718,
"step": 208
},
{
"epoch": 0.54,
"learning_rate": 9.410021686515815e-06,
"loss": 1.7348,
"step": 209
},
{
"epoch": 0.55,
"learning_rate": 9.4033594185542e-06,
"loss": 1.7635,
"step": 210
},
{
"epoch": 0.55,
"eval_loss": 1.6544885635375977,
"eval_runtime": 221.753,
"eval_samples_per_second": 11.662,
"eval_steps_per_second": 0.731,
"step": 210
},
{
"epoch": 0.55,
"learning_rate": 9.396662130828869e-06,
"loss": 1.7155,
"step": 211
},
{
"epoch": 0.55,
"learning_rate": 9.389929876603112e-06,
"loss": 1.6977,
"step": 212
},
{
"epoch": 0.55,
"learning_rate": 9.38316270941832e-06,
"loss": 1.8122,
"step": 213
},
{
"epoch": 0.56,
"learning_rate": 9.37636068309353e-06,
"loss": 1.7403,
"step": 214
},
{
"epoch": 0.56,
"learning_rate": 9.369523851725024e-06,
"loss": 1.7272,
"step": 215
},
{
"epoch": 0.56,
"learning_rate": 9.362652269685881e-06,
"loss": 1.6559,
"step": 216
},
{
"epoch": 0.57,
"learning_rate": 9.355745991625556e-06,
"loss": 1.6567,
"step": 217
},
{
"epoch": 0.57,
"learning_rate": 9.348805072469435e-06,
"loss": 1.6923,
"step": 218
},
{
"epoch": 0.57,
"learning_rate": 9.341829567418407e-06,
"loss": 1.7745,
"step": 219
},
{
"epoch": 0.57,
"learning_rate": 9.334819531948418e-06,
"loss": 1.7335,
"step": 220
},
{
"epoch": 0.57,
"eval_loss": 1.6505930423736572,
"eval_runtime": 221.6702,
"eval_samples_per_second": 11.666,
"eval_steps_per_second": 0.731,
"step": 220
},
{
"epoch": 0.58,
"learning_rate": 9.327775021810037e-06,
"loss": 1.6126,
"step": 221
},
{
"epoch": 0.58,
"learning_rate": 9.320696093028009e-06,
"loss": 1.7739,
"step": 222
},
{
"epoch": 0.58,
"learning_rate": 9.313582801900802e-06,
"loss": 1.7429,
"step": 223
},
{
"epoch": 0.58,
"learning_rate": 9.306435205000178e-06,
"loss": 1.6804,
"step": 224
},
{
"epoch": 0.59,
"learning_rate": 9.299253359170722e-06,
"loss": 1.731,
"step": 225
},
{
"epoch": 0.59,
"learning_rate": 9.292037321529404e-06,
"loss": 1.6897,
"step": 226
},
{
"epoch": 0.59,
"learning_rate": 9.284787149465119e-06,
"loss": 1.6957,
"step": 227
},
{
"epoch": 0.59,
"learning_rate": 9.277502900638233e-06,
"loss": 1.7935,
"step": 228
},
{
"epoch": 0.6,
"learning_rate": 9.270184632980121e-06,
"loss": 1.7074,
"step": 229
},
{
"epoch": 0.6,
"learning_rate": 9.262832404692714e-06,
"loss": 1.7272,
"step": 230
},
{
"epoch": 0.6,
"eval_loss": 1.647078037261963,
"eval_runtime": 221.5996,
"eval_samples_per_second": 11.67,
"eval_steps_per_second": 0.731,
"step": 230
},
{
"epoch": 0.6,
"learning_rate": 9.255446274248023e-06,
"loss": 1.644,
"step": 231
},
{
"epoch": 0.6,
"learning_rate": 9.248026300387688e-06,
"loss": 1.7362,
"step": 232
},
{
"epoch": 0.61,
"learning_rate": 9.240572542122502e-06,
"loss": 1.6591,
"step": 233
},
{
"epoch": 0.61,
"learning_rate": 9.23308505873194e-06,
"loss": 1.7088,
"step": 234
},
{
"epoch": 0.61,
"learning_rate": 9.225563909763701e-06,
"loss": 1.6153,
"step": 235
},
{
"epoch": 0.61,
"learning_rate": 9.218009155033218e-06,
"loss": 1.7105,
"step": 236
},
{
"epoch": 0.62,
"learning_rate": 9.210420854623191e-06,
"loss": 1.7702,
"step": 237
},
{
"epoch": 0.62,
"learning_rate": 9.202799068883113e-06,
"loss": 1.6542,
"step": 238
},
{
"epoch": 0.62,
"learning_rate": 9.195143858428773e-06,
"loss": 1.6457,
"step": 239
},
{
"epoch": 0.63,
"learning_rate": 9.187455284141798e-06,
"loss": 1.718,
"step": 240
},
{
"epoch": 0.63,
"eval_loss": 1.6436262130737305,
"eval_runtime": 221.6177,
"eval_samples_per_second": 11.669,
"eval_steps_per_second": 0.731,
"step": 240
},
{
"epoch": 0.63,
"learning_rate": 9.179733407169146e-06,
"loss": 1.7206,
"step": 241
},
{
"epoch": 0.63,
"learning_rate": 9.171978288922638e-06,
"loss": 1.7417,
"step": 242
},
{
"epoch": 0.63,
"learning_rate": 9.164189991078458e-06,
"loss": 1.6934,
"step": 243
},
{
"epoch": 0.64,
"learning_rate": 9.156368575576667e-06,
"loss": 1.6417,
"step": 244
},
{
"epoch": 0.64,
"learning_rate": 9.148514104620711e-06,
"loss": 1.6462,
"step": 245
},
{
"epoch": 0.64,
"learning_rate": 9.14062664067692e-06,
"loss": 1.5682,
"step": 246
},
{
"epoch": 0.64,
"learning_rate": 9.13270624647402e-06,
"loss": 1.7027,
"step": 247
},
{
"epoch": 0.65,
"learning_rate": 9.124752985002632e-06,
"loss": 1.7179,
"step": 248
},
{
"epoch": 0.65,
"learning_rate": 9.116766919514765e-06,
"loss": 1.7081,
"step": 249
},
{
"epoch": 0.65,
"learning_rate": 9.10874811352332e-06,
"loss": 1.6899,
"step": 250
},
{
"epoch": 0.65,
"eval_loss": 1.6402881145477295,
"eval_runtime": 221.6301,
"eval_samples_per_second": 11.668,
"eval_steps_per_second": 0.731,
"step": 250
},
{
"epoch": 0.65,
"learning_rate": 9.10069663080158e-06,
"loss": 1.7782,
"step": 251
},
{
"epoch": 0.66,
"learning_rate": 9.092612535382705e-06,
"loss": 1.5032,
"step": 252
},
{
"epoch": 0.66,
"learning_rate": 9.084495891559227e-06,
"loss": 1.6477,
"step": 253
},
{
"epoch": 0.66,
"learning_rate": 9.076346763882529e-06,
"loss": 1.6938,
"step": 254
},
{
"epoch": 0.66,
"learning_rate": 9.068165217162337e-06,
"loss": 1.7275,
"step": 255
},
{
"epoch": 0.67,
"learning_rate": 9.059951316466209e-06,
"loss": 1.6538,
"step": 256
},
{
"epoch": 0.67,
"learning_rate": 9.05170512711901e-06,
"loss": 1.7431,
"step": 257
},
{
"epoch": 0.67,
"learning_rate": 9.043426714702398e-06,
"loss": 1.7214,
"step": 258
},
{
"epoch": 0.67,
"learning_rate": 9.035116145054292e-06,
"loss": 1.6552,
"step": 259
},
{
"epoch": 0.68,
"learning_rate": 9.026773484268368e-06,
"loss": 1.622,
"step": 260
},
{
"epoch": 0.68,
"eval_loss": 1.6370071172714233,
"eval_runtime": 221.6656,
"eval_samples_per_second": 11.666,
"eval_steps_per_second": 0.731,
"step": 260
},
{
"epoch": 0.68,
"learning_rate": 9.018398798693512e-06,
"loss": 1.7426,
"step": 261
},
{
"epoch": 0.68,
"learning_rate": 9.009992154933309e-06,
"loss": 1.6963,
"step": 262
},
{
"epoch": 0.69,
"learning_rate": 9.001553619845502e-06,
"loss": 1.6488,
"step": 263
},
{
"epoch": 0.69,
"learning_rate": 8.993083260541467e-06,
"loss": 1.6403,
"step": 264
},
{
"epoch": 0.69,
"learning_rate": 8.984581144385673e-06,
"loss": 1.7677,
"step": 265
},
{
"epoch": 0.69,
"learning_rate": 8.976047338995156e-06,
"loss": 1.7812,
"step": 266
},
{
"epoch": 0.7,
"learning_rate": 8.967481912238971e-06,
"loss": 1.8025,
"step": 267
},
{
"epoch": 0.7,
"learning_rate": 8.958884932237658e-06,
"loss": 1.7102,
"step": 268
},
{
"epoch": 0.7,
"learning_rate": 8.9502564673627e-06,
"loss": 1.7224,
"step": 269
},
{
"epoch": 0.7,
"learning_rate": 8.941596586235972e-06,
"loss": 1.6556,
"step": 270
},
{
"epoch": 0.7,
"eval_loss": 1.6336780786514282,
"eval_runtime": 221.5029,
"eval_samples_per_second": 11.675,
"eval_steps_per_second": 0.731,
"step": 270
},
{
"epoch": 0.71,
"learning_rate": 8.932905357729213e-06,
"loss": 1.6515,
"step": 271
},
{
"epoch": 0.71,
"learning_rate": 8.924182850963457e-06,
"loss": 1.6204,
"step": 272
},
{
"epoch": 0.71,
"learning_rate": 8.915429135308496e-06,
"loss": 1.7968,
"step": 273
},
{
"epoch": 0.71,
"learning_rate": 8.906644280382325e-06,
"loss": 1.706,
"step": 274
},
{
"epoch": 0.72,
"learning_rate": 8.897828356050587e-06,
"loss": 1.7528,
"step": 275
},
{
"epoch": 0.72,
"learning_rate": 8.88898143242602e-06,
"loss": 1.6825,
"step": 276
},
{
"epoch": 0.72,
"learning_rate": 8.880103579867903e-06,
"loss": 1.663,
"step": 277
},
{
"epoch": 0.72,
"learning_rate": 8.871194868981483e-06,
"loss": 1.7998,
"step": 278
},
{
"epoch": 0.73,
"learning_rate": 8.86225537061743e-06,
"loss": 1.6817,
"step": 279
},
{
"epoch": 0.73,
"learning_rate": 8.853285155871258e-06,
"loss": 1.7912,
"step": 280
},
{
"epoch": 0.73,
"eval_loss": 1.6304128170013428,
"eval_runtime": 221.5812,
"eval_samples_per_second": 11.671,
"eval_steps_per_second": 0.731,
"step": 280
},
{
"epoch": 0.73,
"learning_rate": 8.844284296082776e-06,
"loss": 1.5936,
"step": 281
},
{
"epoch": 0.73,
"learning_rate": 8.835252862835508e-06,
"loss": 1.7402,
"step": 282
},
{
"epoch": 0.74,
"learning_rate": 8.826190927956123e-06,
"loss": 1.6576,
"step": 283
},
{
"epoch": 0.74,
"learning_rate": 8.817098563513874e-06,
"loss": 1.658,
"step": 284
},
{
"epoch": 0.74,
"learning_rate": 8.807975841820023e-06,
"loss": 1.6579,
"step": 285
},
{
"epoch": 0.75,
"learning_rate": 8.79882283542725e-06,
"loss": 1.6569,
"step": 286
},
{
"epoch": 0.75,
"learning_rate": 8.7896396171291e-06,
"loss": 1.7584,
"step": 287
},
{
"epoch": 0.75,
"learning_rate": 8.780426259959385e-06,
"loss": 1.7632,
"step": 288
},
{
"epoch": 0.75,
"learning_rate": 8.771182837191614e-06,
"loss": 1.778,
"step": 289
},
{
"epoch": 0.76,
"learning_rate": 8.761909422338404e-06,
"loss": 1.6025,
"step": 290
},
{
"epoch": 0.76,
"eval_loss": 1.6273553371429443,
"eval_runtime": 221.6644,
"eval_samples_per_second": 11.666,
"eval_steps_per_second": 0.731,
"step": 290
},
{
"epoch": 0.76,
"learning_rate": 8.752606089150903e-06,
"loss": 1.7473,
"step": 291
},
{
"epoch": 0.76,
"learning_rate": 8.743272911618193e-06,
"loss": 1.7449,
"step": 292
},
{
"epoch": 0.76,
"learning_rate": 8.733909963966709e-06,
"loss": 1.6992,
"step": 293
},
{
"epoch": 0.77,
"learning_rate": 8.724517320659644e-06,
"loss": 1.7442,
"step": 294
},
{
"epoch": 0.77,
"learning_rate": 8.715095056396369e-06,
"loss": 1.7049,
"step": 295
},
{
"epoch": 0.77,
"learning_rate": 8.705643246111817e-06,
"loss": 1.6719,
"step": 296
},
{
"epoch": 0.77,
"learning_rate": 8.696161964975907e-06,
"loss": 1.6751,
"step": 297
},
{
"epoch": 0.78,
"learning_rate": 8.686651288392937e-06,
"loss": 1.7948,
"step": 298
},
{
"epoch": 0.78,
"learning_rate": 8.677111292000985e-06,
"loss": 1.6709,
"step": 299
},
{
"epoch": 0.78,
"learning_rate": 8.66754205167131e-06,
"loss": 1.7181,
"step": 300
},
{
"epoch": 0.78,
"eval_loss": 1.6246178150177002,
"eval_runtime": 221.5384,
"eval_samples_per_second": 11.673,
"eval_steps_per_second": 0.731,
"step": 300
},
{
"epoch": 0.78,
"learning_rate": 8.657943643507747e-06,
"loss": 1.7374,
"step": 301
},
{
"epoch": 0.79,
"learning_rate": 8.6483161438461e-06,
"loss": 1.6974,
"step": 302
},
{
"epoch": 0.79,
"learning_rate": 8.638659629253536e-06,
"loss": 1.7096,
"step": 303
},
{
"epoch": 0.79,
"learning_rate": 8.628974176527982e-06,
"loss": 1.7283,
"step": 304
},
{
"epoch": 0.79,
"learning_rate": 8.619259862697504e-06,
"loss": 1.7745,
"step": 305
},
{
"epoch": 0.8,
"learning_rate": 8.609516765019699e-06,
"loss": 1.6235,
"step": 306
},
{
"epoch": 0.8,
"learning_rate": 8.599744960981085e-06,
"loss": 1.679,
"step": 307
},
{
"epoch": 0.8,
"learning_rate": 8.589944528296476e-06,
"loss": 1.7144,
"step": 308
},
{
"epoch": 0.8,
"learning_rate": 8.580115544908374e-06,
"loss": 1.6576,
"step": 309
},
{
"epoch": 0.81,
"learning_rate": 8.570258088986339e-06,
"loss": 1.7452,
"step": 310
},
{
"epoch": 0.81,
"eval_loss": 1.6217124462127686,
"eval_runtime": 221.6724,
"eval_samples_per_second": 11.666,
"eval_steps_per_second": 0.731,
"step": 310
},
{
"epoch": 0.81,
"learning_rate": 8.560372238926372e-06,
"loss": 1.7452,
"step": 311
},
{
"epoch": 0.81,
"learning_rate": 8.550458073350296e-06,
"loss": 1.7251,
"step": 312
},
{
"epoch": 0.82,
"learning_rate": 8.540515671105122e-06,
"loss": 1.7005,
"step": 313
},
{
"epoch": 0.82,
"learning_rate": 8.530545111262432e-06,
"loss": 1.7335,
"step": 314
},
{
"epoch": 0.82,
"learning_rate": 8.520546473117735e-06,
"loss": 1.6461,
"step": 315
},
{
"epoch": 0.82,
"learning_rate": 8.510519836189853e-06,
"loss": 1.7743,
"step": 316
},
{
"epoch": 0.83,
"learning_rate": 8.500465280220278e-06,
"loss": 1.6977,
"step": 317
},
{
"epoch": 0.83,
"learning_rate": 8.490382885172545e-06,
"loss": 1.6567,
"step": 318
},
{
"epoch": 0.83,
"learning_rate": 8.480272731231591e-06,
"loss": 1.7074,
"step": 319
},
{
"epoch": 0.83,
"learning_rate": 8.470134898803113e-06,
"loss": 1.5975,
"step": 320
},
{
"epoch": 0.83,
"eval_loss": 1.618903636932373,
"eval_runtime": 221.623,
"eval_samples_per_second": 11.668,
"eval_steps_per_second": 0.731,
"step": 320
},
{
"epoch": 0.84,
"learning_rate": 8.459969468512943e-06,
"loss": 1.7499,
"step": 321
},
{
"epoch": 0.84,
"learning_rate": 8.44977652120639e-06,
"loss": 1.6924,
"step": 322
},
{
"epoch": 0.84,
"learning_rate": 8.439556137947607e-06,
"loss": 1.7727,
"step": 323
},
{
"epoch": 0.84,
"learning_rate": 8.429308400018949e-06,
"loss": 1.6344,
"step": 324
},
{
"epoch": 0.85,
"learning_rate": 8.419033388920315e-06,
"loss": 1.689,
"step": 325
},
{
"epoch": 0.85,
"learning_rate": 8.40873118636851e-06,
"loss": 1.6717,
"step": 326
},
{
"epoch": 0.85,
"learning_rate": 8.398401874296595e-06,
"loss": 1.6416,
"step": 327
},
{
"epoch": 0.85,
"learning_rate": 8.388045534853222e-06,
"loss": 1.7411,
"step": 328
},
{
"epoch": 0.86,
"learning_rate": 8.377662250402001e-06,
"loss": 1.6613,
"step": 329
},
{
"epoch": 0.86,
"learning_rate": 8.36725210352083e-06,
"loss": 1.5754,
"step": 330
},
{
"epoch": 0.86,
"eval_loss": 1.6161595582962036,
"eval_runtime": 221.5348,
"eval_samples_per_second": 11.673,
"eval_steps_per_second": 0.731,
"step": 330
},
{
"epoch": 0.86,
"learning_rate": 8.356815177001243e-06,
"loss": 1.7595,
"step": 331
},
{
"epoch": 0.86,
"learning_rate": 8.346351553847754e-06,
"loss": 1.6056,
"step": 332
},
{
"epoch": 0.87,
"learning_rate": 8.33586131727719e-06,
"loss": 1.7163,
"step": 333
},
{
"epoch": 0.87,
"learning_rate": 8.325344550718037e-06,
"loss": 1.6641,
"step": 334
},
{
"epoch": 0.87,
"learning_rate": 8.314801337809775e-06,
"loss": 1.5893,
"step": 335
},
{
"epoch": 0.88,
"learning_rate": 8.304231762402203e-06,
"loss": 1.6807,
"step": 336
},
{
"epoch": 0.88,
"learning_rate": 8.29363590855479e-06,
"loss": 1.6655,
"step": 337
},
{
"epoch": 0.88,
"learning_rate": 8.28301386053599e-06,
"loss": 1.7629,
"step": 338
},
{
"epoch": 0.88,
"learning_rate": 8.272365702822577e-06,
"loss": 1.667,
"step": 339
},
{
"epoch": 0.89,
"learning_rate": 8.261691520098985e-06,
"loss": 1.7077,
"step": 340
},
{
"epoch": 0.89,
"eval_loss": 1.6135671138763428,
"eval_runtime": 221.6285,
"eval_samples_per_second": 11.668,
"eval_steps_per_second": 0.731,
"step": 340
},
{
"epoch": 0.89,
"learning_rate": 8.250991397256609e-06,
"loss": 1.6214,
"step": 341
},
{
"epoch": 0.89,
"learning_rate": 8.240265419393157e-06,
"loss": 1.7911,
"step": 342
},
{
"epoch": 0.89,
"learning_rate": 8.229513671811953e-06,
"loss": 1.6051,
"step": 343
},
{
"epoch": 0.9,
"learning_rate": 8.218736240021271e-06,
"loss": 1.7498,
"step": 344
},
{
"epoch": 0.9,
"learning_rate": 8.207933209733654e-06,
"loss": 1.6407,
"step": 345
},
{
"epoch": 0.9,
"learning_rate": 8.197104666865218e-06,
"loss": 1.6836,
"step": 346
},
{
"epoch": 0.9,
"learning_rate": 8.186250697534993e-06,
"loss": 1.6793,
"step": 347
},
{
"epoch": 0.91,
"learning_rate": 8.175371388064212e-06,
"loss": 1.7107,
"step": 348
},
{
"epoch": 0.91,
"learning_rate": 8.164466824975648e-06,
"loss": 1.6544,
"step": 349
},
{
"epoch": 0.91,
"learning_rate": 8.153537094992907e-06,
"loss": 1.5848,
"step": 350
},
{
"epoch": 0.91,
"eval_loss": 1.6112371683120728,
"eval_runtime": 221.5593,
"eval_samples_per_second": 11.672,
"eval_steps_per_second": 0.731,
"step": 350
},
{
"epoch": 0.91,
"learning_rate": 8.142582285039753e-06,
"loss": 1.6878,
"step": 351
},
{
"epoch": 0.92,
"learning_rate": 8.131602482239405e-06,
"loss": 1.7152,
"step": 352
},
{
"epoch": 0.92,
"learning_rate": 8.120597773913851e-06,
"loss": 1.6608,
"step": 353
},
{
"epoch": 0.92,
"learning_rate": 8.109568247583155e-06,
"loss": 1.7256,
"step": 354
},
{
"epoch": 0.92,
"learning_rate": 8.098513990964754e-06,
"loss": 1.6854,
"step": 355
},
{
"epoch": 0.93,
"learning_rate": 8.087435091972762e-06,
"loss": 1.6055,
"step": 356
},
{
"epoch": 0.93,
"learning_rate": 8.076331638717278e-06,
"loss": 1.6369,
"step": 357
},
{
"epoch": 0.93,
"learning_rate": 8.065203719503679e-06,
"loss": 1.6198,
"step": 358
},
{
"epoch": 0.94,
"learning_rate": 8.054051422831918e-06,
"loss": 1.6972,
"step": 359
},
{
"epoch": 0.94,
"learning_rate": 8.042874837395816e-06,
"loss": 1.7011,
"step": 360
},
{
"epoch": 0.94,
"eval_loss": 1.6087462902069092,
"eval_runtime": 221.6783,
"eval_samples_per_second": 11.666,
"eval_steps_per_second": 0.731,
"step": 360
},
{
"epoch": 0.94,
"learning_rate": 8.031674052082372e-06,
"loss": 1.699,
"step": 361
},
{
"epoch": 0.94,
"learning_rate": 8.02044915597104e-06,
"loss": 1.6248,
"step": 362
},
{
"epoch": 0.95,
"learning_rate": 8.009200238333028e-06,
"loss": 1.6441,
"step": 363
},
{
"epoch": 0.95,
"learning_rate": 7.99792738863058e-06,
"loss": 1.6321,
"step": 364
},
{
"epoch": 0.95,
"learning_rate": 7.986630696516281e-06,
"loss": 1.7742,
"step": 365
},
{
"epoch": 0.95,
"learning_rate": 7.975310251832328e-06,
"loss": 1.5576,
"step": 366
},
{
"epoch": 0.96,
"learning_rate": 7.963966144609821e-06,
"loss": 1.6902,
"step": 367
},
{
"epoch": 0.96,
"learning_rate": 7.95259846506805e-06,
"loss": 1.5873,
"step": 368
},
{
"epoch": 0.96,
"learning_rate": 7.941207303613773e-06,
"loss": 1.7496,
"step": 369
},
{
"epoch": 0.96,
"learning_rate": 7.929792750840499e-06,
"loss": 1.6697,
"step": 370
},
{
"epoch": 0.96,
"eval_loss": 1.6065231561660767,
"eval_runtime": 221.5733,
"eval_samples_per_second": 11.671,
"eval_steps_per_second": 0.731,
"step": 370
},
{
"epoch": 0.97,
"learning_rate": 7.918354897527767e-06,
"loss": 1.6903,
"step": 371
},
{
"epoch": 0.97,
"learning_rate": 7.906893834640428e-06,
"loss": 1.6335,
"step": 372
},
{
"epoch": 0.97,
"learning_rate": 7.89540965332791e-06,
"loss": 1.6909,
"step": 373
},
{
"epoch": 0.97,
"learning_rate": 7.883902444923513e-06,
"loss": 1.6651,
"step": 374
},
{
"epoch": 0.98,
"learning_rate": 7.872372300943657e-06,
"loss": 1.7373,
"step": 375
},
{
"epoch": 0.98,
"learning_rate": 7.860819313087177e-06,
"loss": 1.7589,
"step": 376
},
{
"epoch": 0.98,
"learning_rate": 7.849243573234582e-06,
"loss": 1.5605,
"step": 377
},
{
"epoch": 0.98,
"learning_rate": 7.837645173447329e-06,
"loss": 1.5867,
"step": 378
},
{
"epoch": 0.99,
"learning_rate": 7.826024205967084e-06,
"loss": 1.6352,
"step": 379
},
{
"epoch": 0.99,
"learning_rate": 7.814380763214996e-06,
"loss": 1.6633,
"step": 380
},
{
"epoch": 0.99,
"eval_loss": 1.6041749715805054,
"eval_runtime": 221.6305,
"eval_samples_per_second": 11.668,
"eval_steps_per_second": 0.731,
"step": 380
},
{
"epoch": 0.99,
"learning_rate": 7.80271493779096e-06,
"loss": 1.6448,
"step": 381
},
{
"epoch": 1.0,
"learning_rate": 7.791026822472876e-06,
"loss": 1.6371,
"step": 382
},
{
"epoch": 1.0,
"learning_rate": 7.779316510215919e-06,
"loss": 1.6645,
"step": 383
},
{
"epoch": 1.0,
"learning_rate": 7.767584094151793e-06,
"loss": 1.757,
"step": 384
},
{
"epoch": 1.0,
"learning_rate": 7.755829667587993e-06,
"loss": 1.7246,
"step": 385
},
{
"epoch": 1.01,
"learning_rate": 7.744053324007064e-06,
"loss": 1.72,
"step": 386
},
{
"epoch": 1.01,
"learning_rate": 7.732255157065854e-06,
"loss": 1.7041,
"step": 387
},
{
"epoch": 1.01,
"learning_rate": 7.720435260594774e-06,
"loss": 1.7085,
"step": 388
},
{
"epoch": 1.01,
"learning_rate": 7.708593728597047e-06,
"loss": 1.6751,
"step": 389
},
{
"epoch": 1.02,
"learning_rate": 7.696730655247963e-06,
"loss": 1.6722,
"step": 390
},
{
"epoch": 1.02,
"eval_loss": 1.601514220237732,
"eval_runtime": 221.608,
"eval_samples_per_second": 11.669,
"eval_steps_per_second": 0.731,
"step": 390
},
{
"epoch": 1.02,
"learning_rate": 7.684846134894133e-06,
"loss": 1.6057,
"step": 391
},
{
"epoch": 1.02,
"learning_rate": 7.67294026205273e-06,
"loss": 1.6744,
"step": 392
},
{
"epoch": 1.02,
"learning_rate": 7.661013131410745e-06,
"loss": 1.6633,
"step": 393
},
{
"epoch": 1.03,
"learning_rate": 7.649064837824231e-06,
"loss": 1.5247,
"step": 394
},
{
"epoch": 1.03,
"learning_rate": 7.637095476317553e-06,
"loss": 1.7302,
"step": 395
},
{
"epoch": 1.03,
"learning_rate": 7.6251051420826224e-06,
"loss": 1.645,
"step": 396
},
{
"epoch": 1.03,
"learning_rate": 7.613093930478148e-06,
"loss": 1.5705,
"step": 397
},
{
"epoch": 1.04,
"learning_rate": 7.601061937028881e-06,
"loss": 1.6186,
"step": 398
},
{
"epoch": 1.04,
"learning_rate": 7.58900925742484e-06,
"loss": 1.6353,
"step": 399
},
{
"epoch": 1.04,
"learning_rate": 7.576935987520566e-06,
"loss": 1.7181,
"step": 400
},
{
"epoch": 1.04,
"eval_loss": 1.5993202924728394,
"eval_runtime": 221.5695,
"eval_samples_per_second": 11.671,
"eval_steps_per_second": 0.731,
"step": 400
},
{
"epoch": 1.04,
"learning_rate": 7.5648422233343564e-06,
"loss": 1.7363,
"step": 401
},
{
"epoch": 1.05,
"learning_rate": 7.552728061047492e-06,
"loss": 1.5845,
"step": 402
},
{
"epoch": 1.05,
"learning_rate": 7.54059359700348e-06,
"loss": 1.6761,
"step": 403
},
{
"epoch": 1.05,
"learning_rate": 7.528438927707298e-06,
"loss": 1.7179,
"step": 404
},
{
"epoch": 1.06,
"learning_rate": 7.5162641498246e-06,
"loss": 1.66,
"step": 405
},
{
"epoch": 1.06,
"learning_rate": 7.504069360180971e-06,
"loss": 1.5965,
"step": 406
},
{
"epoch": 1.06,
"learning_rate": 7.491854655761149e-06,
"loss": 1.6249,
"step": 407
},
{
"epoch": 1.06,
"learning_rate": 7.479620133708246e-06,
"loss": 1.7097,
"step": 408
},
{
"epoch": 1.07,
"learning_rate": 7.467365891322996e-06,
"loss": 1.6683,
"step": 409
},
{
"epoch": 1.07,
"learning_rate": 7.455092026062955e-06,
"loss": 1.6414,
"step": 410
},
{
"epoch": 1.07,
"eval_loss": 1.597200870513916,
"eval_runtime": 221.5771,
"eval_samples_per_second": 11.671,
"eval_steps_per_second": 0.731,
"step": 410
},
{
"epoch": 1.07,
"learning_rate": 7.442798635541749e-06,
"loss": 1.6526,
"step": 411
},
{
"epoch": 1.07,
"learning_rate": 7.430485817528281e-06,
"loss": 1.7332,
"step": 412
},
{
"epoch": 1.08,
"learning_rate": 7.418153669945966e-06,
"loss": 1.7415,
"step": 413
},
{
"epoch": 1.08,
"learning_rate": 7.40580229087194e-06,
"loss": 1.681,
"step": 414
},
{
"epoch": 1.08,
"learning_rate": 7.3934317785362905e-06,
"loss": 1.6391,
"step": 415
},
{
"epoch": 1.08,
"learning_rate": 7.381042231321269e-06,
"loss": 1.6842,
"step": 416
},
{
"epoch": 1.09,
"learning_rate": 7.368633747760515e-06,
"loss": 1.6552,
"step": 417
},
{
"epoch": 1.09,
"learning_rate": 7.356206426538262e-06,
"loss": 1.5381,
"step": 418
},
{
"epoch": 1.09,
"learning_rate": 7.343760366488564e-06,
"loss": 1.6983,
"step": 419
},
{
"epoch": 1.09,
"learning_rate": 7.3312956665945e-06,
"loss": 1.6856,
"step": 420
},
{
"epoch": 1.09,
"eval_loss": 1.5951831340789795,
"eval_runtime": 221.5238,
"eval_samples_per_second": 11.674,
"eval_steps_per_second": 0.731,
"step": 420
},
{
"epoch": 1.1,
"learning_rate": 7.3188124259873946e-06,
"loss": 1.6686,
"step": 421
},
{
"epoch": 1.1,
"learning_rate": 7.306310743946024e-06,
"loss": 1.7121,
"step": 422
},
{
"epoch": 1.1,
"learning_rate": 7.29379071989583e-06,
"loss": 1.6359,
"step": 423
},
{
"epoch": 1.1,
"learning_rate": 7.281252453408125e-06,
"loss": 1.5617,
"step": 424
},
{
"epoch": 1.11,
"learning_rate": 7.268696044199305e-06,
"loss": 1.5664,
"step": 425
},
{
"epoch": 1.11,
"learning_rate": 7.2561215921300545e-06,
"loss": 1.5337,
"step": 426
},
{
"epoch": 1.11,
"learning_rate": 7.243529197204552e-06,
"loss": 1.6212,
"step": 427
},
{
"epoch": 1.11,
"learning_rate": 7.230918959569675e-06,
"loss": 1.5584,
"step": 428
},
{
"epoch": 1.12,
"learning_rate": 7.218290979514202e-06,
"loss": 1.579,
"step": 429
},
{
"epoch": 1.12,
"learning_rate": 7.205645357468016e-06,
"loss": 1.6491,
"step": 430
},
{
"epoch": 1.12,
"eval_loss": 1.5929887294769287,
"eval_runtime": 221.6187,
"eval_samples_per_second": 11.669,
"eval_steps_per_second": 0.731,
"step": 430
},
{
"epoch": 1.12,
"learning_rate": 7.192982194001312e-06,
"loss": 1.6226,
"step": 431
},
{
"epoch": 1.13,
"learning_rate": 7.180301589823784e-06,
"loss": 1.5809,
"step": 432
},
{
"epoch": 1.13,
"learning_rate": 7.167603645783835e-06,
"loss": 1.6546,
"step": 433
},
{
"epoch": 1.13,
"learning_rate": 7.154888462867771e-06,
"loss": 1.7025,
"step": 434
},
{
"epoch": 1.13,
"learning_rate": 7.142156142198997e-06,
"loss": 1.7083,
"step": 435
},
{
"epoch": 1.14,
"learning_rate": 7.129406785037214e-06,
"loss": 1.674,
"step": 436
},
{
"epoch": 1.14,
"learning_rate": 7.116640492777617e-06,
"loss": 1.6955,
"step": 437
},
{
"epoch": 1.14,
"learning_rate": 7.103857366950081e-06,
"loss": 1.5851,
"step": 438
},
{
"epoch": 1.14,
"learning_rate": 7.091057509218357e-06,
"loss": 1.6826,
"step": 439
},
{
"epoch": 1.15,
"learning_rate": 7.078241021379272e-06,
"loss": 1.6736,
"step": 440
},
{
"epoch": 1.15,
"eval_loss": 1.591227412223816,
"eval_runtime": 221.512,
"eval_samples_per_second": 11.674,
"eval_steps_per_second": 0.731,
"step": 440
},
{
"epoch": 1.15,
"learning_rate": 7.065408005361902e-06,
"loss": 1.5673,
"step": 441
},
{
"epoch": 1.15,
"learning_rate": 7.052558563226777e-06,
"loss": 1.7022,
"step": 442
},
{
"epoch": 1.15,
"learning_rate": 7.039692797165061e-06,
"loss": 1.6071,
"step": 443
},
{
"epoch": 1.16,
"learning_rate": 7.026810809497744e-06,
"loss": 1.5948,
"step": 444
},
{
"epoch": 1.16,
"learning_rate": 7.013912702674821e-06,
"loss": 1.5293,
"step": 445
},
{
"epoch": 1.16,
"learning_rate": 7.000998579274487e-06,
"loss": 1.6306,
"step": 446
},
{
"epoch": 1.16,
"learning_rate": 6.988068542002316e-06,
"loss": 1.732,
"step": 447
},
{
"epoch": 1.17,
"learning_rate": 6.9751226936904415e-06,
"loss": 1.6915,
"step": 448
},
{
"epoch": 1.17,
"learning_rate": 6.9621611372967436e-06,
"loss": 1.6018,
"step": 449
},
{
"epoch": 1.17,
"learning_rate": 6.949183975904027e-06,
"loss": 1.619,
"step": 450
},
{
"epoch": 1.17,
"eval_loss": 1.5893467664718628,
"eval_runtime": 221.6889,
"eval_samples_per_second": 11.665,
"eval_steps_per_second": 0.731,
"step": 450
},
{
"epoch": 1.17,
"learning_rate": 6.9361913127192026e-06,
"loss": 1.6266,
"step": 451
},
{
"epoch": 1.18,
"learning_rate": 6.923183251072468e-06,
"loss": 1.5814,
"step": 452
},
{
"epoch": 1.18,
"learning_rate": 6.910159894416485e-06,
"loss": 1.6962,
"step": 453
},
{
"epoch": 1.18,
"learning_rate": 6.897121346325552e-06,
"loss": 1.6047,
"step": 454
},
{
"epoch": 1.19,
"learning_rate": 6.884067710494788e-06,
"loss": 1.6322,
"step": 455
},
{
"epoch": 1.19,
"learning_rate": 6.870999090739301e-06,
"loss": 1.6904,
"step": 456
},
{
"epoch": 1.19,
"learning_rate": 6.857915590993371e-06,
"loss": 1.6751,
"step": 457
},
{
"epoch": 1.19,
"learning_rate": 6.844817315309611e-06,
"loss": 1.6209,
"step": 458
},
{
"epoch": 1.2,
"learning_rate": 6.831704367858154e-06,
"loss": 1.5679,
"step": 459
},
{
"epoch": 1.2,
"learning_rate": 6.818576852925809e-06,
"loss": 1.6452,
"step": 460
},
{
"epoch": 1.2,
"eval_loss": 1.5870401859283447,
"eval_runtime": 221.5795,
"eval_samples_per_second": 11.671,
"eval_steps_per_second": 0.731,
"step": 460
},
{
"epoch": 1.2,
"learning_rate": 6.805434874915249e-06,
"loss": 1.5774,
"step": 461
},
{
"epoch": 1.2,
"learning_rate": 6.792278538344161e-06,
"loss": 1.5994,
"step": 462
},
{
"epoch": 1.21,
"learning_rate": 6.779107947844434e-06,
"loss": 1.7274,
"step": 463
},
{
"epoch": 1.21,
"learning_rate": 6.765923208161313e-06,
"loss": 1.673,
"step": 464
},
{
"epoch": 1.21,
"learning_rate": 6.752724424152575e-06,
"loss": 1.5622,
"step": 465
},
{
"epoch": 1.21,
"learning_rate": 6.739511700787683e-06,
"loss": 1.5549,
"step": 466
},
{
"epoch": 1.22,
"learning_rate": 6.726285143146969e-06,
"loss": 1.7017,
"step": 467
},
{
"epoch": 1.22,
"learning_rate": 6.713044856420781e-06,
"loss": 1.651,
"step": 468
},
{
"epoch": 1.22,
"learning_rate": 6.699790945908662e-06,
"loss": 1.6616,
"step": 469
},
{
"epoch": 1.22,
"learning_rate": 6.686523517018494e-06,
"loss": 1.6498,
"step": 470
},
{
"epoch": 1.22,
"eval_loss": 1.5853816270828247,
"eval_runtime": 221.6093,
"eval_samples_per_second": 11.669,
"eval_steps_per_second": 0.731,
"step": 470
},
{
"epoch": 1.23,
"learning_rate": 6.67324267526568e-06,
"loss": 1.7092,
"step": 471
},
{
"epoch": 1.23,
"learning_rate": 6.6599485262722885e-06,
"loss": 1.5833,
"step": 472
},
{
"epoch": 1.23,
"learning_rate": 6.646641175766222e-06,
"loss": 1.543,
"step": 473
},
{
"epoch": 1.23,
"learning_rate": 6.633320729580376e-06,
"loss": 1.7974,
"step": 474
},
{
"epoch": 1.24,
"learning_rate": 6.6199872936517915e-06,
"loss": 1.598,
"step": 475
},
{
"epoch": 1.24,
"learning_rate": 6.606640974020824e-06,
"loss": 1.5978,
"step": 476
},
{
"epoch": 1.24,
"learning_rate": 6.593281876830281e-06,
"loss": 1.6066,
"step": 477
},
{
"epoch": 1.25,
"learning_rate": 6.5799101083246e-06,
"loss": 1.5975,
"step": 478
},
{
"epoch": 1.25,
"learning_rate": 6.566525774848988e-06,
"loss": 1.6255,
"step": 479
},
{
"epoch": 1.25,
"learning_rate": 6.553128982848584e-06,
"loss": 1.675,
"step": 480
},
{
"epoch": 1.25,
"eval_loss": 1.5839300155639648,
"eval_runtime": 221.6131,
"eval_samples_per_second": 11.669,
"eval_steps_per_second": 0.731,
"step": 480
},
{
"epoch": 1.25,
"learning_rate": 6.539719838867604e-06,
"loss": 1.6697,
"step": 481
},
{
"epoch": 1.26,
"learning_rate": 6.526298449548502e-06,
"loss": 1.6142,
"step": 482
},
{
"epoch": 1.26,
"learning_rate": 6.512864921631121e-06,
"loss": 1.5982,
"step": 483
},
{
"epoch": 1.26,
"learning_rate": 6.499419361951837e-06,
"loss": 1.8056,
"step": 484
},
{
"epoch": 1.26,
"learning_rate": 6.4859618774427195e-06,
"loss": 1.6077,
"step": 485
},
{
"epoch": 1.27,
"learning_rate": 6.472492575130671e-06,
"loss": 1.6571,
"step": 486
},
{
"epoch": 1.27,
"learning_rate": 6.459011562136582e-06,
"loss": 1.5264,
"step": 487
},
{
"epoch": 1.27,
"learning_rate": 6.44551894567448e-06,
"loss": 1.6544,
"step": 488
},
{
"epoch": 1.27,
"learning_rate": 6.432014833050671e-06,
"loss": 1.6281,
"step": 489
},
{
"epoch": 1.28,
"learning_rate": 6.41849933166289e-06,
"loss": 1.684,
"step": 490
},
{
"epoch": 1.28,
"eval_loss": 1.5822992324829102,
"eval_runtime": 221.5143,
"eval_samples_per_second": 11.674,
"eval_steps_per_second": 0.731,
"step": 490
},
{
"epoch": 1.28,
"learning_rate": 6.404972548999453e-06,
"loss": 1.5226,
"step": 491
},
{
"epoch": 1.28,
"learning_rate": 6.3914345926383855e-06,
"loss": 1.5802,
"step": 492
},
{
"epoch": 1.28,
"learning_rate": 6.3778855702465835e-06,
"loss": 1.6287,
"step": 493
},
{
"epoch": 1.29,
"learning_rate": 6.364325589578948e-06,
"loss": 1.5856,
"step": 494
},
{
"epoch": 1.29,
"learning_rate": 6.350754758477534e-06,
"loss": 1.6461,
"step": 495
},
{
"epoch": 1.29,
"learning_rate": 6.337173184870683e-06,
"loss": 1.7742,
"step": 496
},
{
"epoch": 1.29,
"learning_rate": 6.323580976772181e-06,
"loss": 1.6065,
"step": 497
},
{
"epoch": 1.3,
"learning_rate": 6.309978242280382e-06,
"loss": 1.647,
"step": 498
},
{
"epoch": 1.3,
"learning_rate": 6.2963650895773566e-06,
"loss": 1.5884,
"step": 499
},
{
"epoch": 1.3,
"learning_rate": 6.282741626928036e-06,
"loss": 1.6379,
"step": 500
},
{
"epoch": 1.3,
"eval_loss": 1.5801777839660645,
"eval_runtime": 221.7292,
"eval_samples_per_second": 11.663,
"eval_steps_per_second": 0.731,
"step": 500
},
{
"epoch": 1.31,
"learning_rate": 6.269107962679344e-06,
"loss": 1.633,
"step": 501
},
{
"epoch": 1.31,
"learning_rate": 6.255464205259332e-06,
"loss": 1.7188,
"step": 502
},
{
"epoch": 1.31,
"learning_rate": 6.241810463176329e-06,
"loss": 1.558,
"step": 503
},
{
"epoch": 1.31,
"learning_rate": 6.228146845018068e-06,
"loss": 1.5924,
"step": 504
},
{
"epoch": 1.32,
"learning_rate": 6.214473459450828e-06,
"loss": 1.6427,
"step": 505
},
{
"epoch": 1.32,
"learning_rate": 6.200790415218568e-06,
"loss": 1.6439,
"step": 506
},
{
"epoch": 1.32,
"learning_rate": 6.187097821142061e-06,
"loss": 1.6606,
"step": 507
},
{
"epoch": 1.32,
"learning_rate": 6.173395786118026e-06,
"loss": 1.7101,
"step": 508
},
{
"epoch": 1.33,
"learning_rate": 6.159684419118274e-06,
"loss": 1.6778,
"step": 509
},
{
"epoch": 1.33,
"learning_rate": 6.145963829188823e-06,
"loss": 1.5173,
"step": 510
},
{
"epoch": 1.33,
"eval_loss": 1.5786410570144653,
"eval_runtime": 221.549,
"eval_samples_per_second": 11.672,
"eval_steps_per_second": 0.731,
"step": 510
},
{
"epoch": 1.33,
"learning_rate": 6.1322341254490515e-06,
"loss": 1.6827,
"step": 511
},
{
"epoch": 1.33,
"learning_rate": 6.118495417090809e-06,
"loss": 1.5186,
"step": 512
},
{
"epoch": 1.34,
"learning_rate": 6.104747813377567e-06,
"loss": 1.633,
"step": 513
},
{
"epoch": 1.34,
"learning_rate": 6.0909914236435355e-06,
"loss": 1.6671,
"step": 514
},
{
"epoch": 1.34,
"learning_rate": 6.077226357292802e-06,
"loss": 1.6258,
"step": 515
},
{
"epoch": 1.34,
"learning_rate": 6.063452723798461e-06,
"loss": 1.6377,
"step": 516
},
{
"epoch": 1.35,
"learning_rate": 6.0496706327017355e-06,
"loss": 1.5934,
"step": 517
},
{
"epoch": 1.35,
"learning_rate": 6.0358801936111145e-06,
"loss": 1.5484,
"step": 518
},
{
"epoch": 1.35,
"learning_rate": 6.022081516201483e-06,
"loss": 1.6516,
"step": 519
},
{
"epoch": 1.35,
"learning_rate": 6.008274710213235e-06,
"loss": 1.6443,
"step": 520
},
{
"epoch": 1.35,
"eval_loss": 1.577327013015747,
"eval_runtime": 221.6766,
"eval_samples_per_second": 11.666,
"eval_steps_per_second": 0.731,
"step": 520
},
{
"epoch": 1.36,
"learning_rate": 5.994459885451423e-06,
"loss": 1.6277,
"step": 521
},
{
"epoch": 1.36,
"learning_rate": 5.9806371517848605e-06,
"loss": 1.6301,
"step": 522
},
{
"epoch": 1.36,
"learning_rate": 5.966806619145268e-06,
"loss": 1.6091,
"step": 523
},
{
"epoch": 1.37,
"learning_rate": 5.952968397526388e-06,
"loss": 1.6848,
"step": 524
},
{
"epoch": 1.37,
"learning_rate": 5.9391225969831145e-06,
"loss": 1.5756,
"step": 525
},
{
"epoch": 1.37,
"learning_rate": 5.925269327630615e-06,
"loss": 1.6308,
"step": 526
},
{
"epoch": 1.37,
"learning_rate": 5.911408699643458e-06,
"loss": 1.5454,
"step": 527
},
{
"epoch": 1.38,
"learning_rate": 5.897540823254735e-06,
"loss": 1.6046,
"step": 528
},
{
"epoch": 1.38,
"learning_rate": 5.883665808755179e-06,
"loss": 1.6638,
"step": 529
},
{
"epoch": 1.38,
"learning_rate": 5.8697837664923e-06,
"loss": 1.5628,
"step": 530
},
{
"epoch": 1.38,
"eval_loss": 1.5755133628845215,
"eval_runtime": 221.5842,
"eval_samples_per_second": 11.671,
"eval_steps_per_second": 0.731,
"step": 530
},
{
"epoch": 1.38,
"learning_rate": 5.855894806869493e-06,
"loss": 1.6173,
"step": 531
},
{
"epoch": 1.39,
"learning_rate": 5.841999040345168e-06,
"loss": 1.5758,
"step": 532
},
{
"epoch": 1.39,
"learning_rate": 5.828096577431874e-06,
"loss": 1.6503,
"step": 533
},
{
"epoch": 1.39,
"learning_rate": 5.814187528695412e-06,
"loss": 1.6151,
"step": 534
},
{
"epoch": 1.39,
"learning_rate": 5.800272004753961e-06,
"loss": 1.5789,
"step": 535
},
{
"epoch": 1.4,
"learning_rate": 5.786350116277195e-06,
"loss": 1.5589,
"step": 536
},
{
"epoch": 1.4,
"learning_rate": 5.772421973985412e-06,
"loss": 1.6134,
"step": 537
},
{
"epoch": 1.4,
"learning_rate": 5.758487688648635e-06,
"loss": 1.5238,
"step": 538
},
{
"epoch": 1.4,
"learning_rate": 5.744547371085752e-06,
"loss": 1.6672,
"step": 539
},
{
"epoch": 1.41,
"learning_rate": 5.730601132163623e-06,
"loss": 1.7287,
"step": 540
},
{
"epoch": 1.41,
"eval_loss": 1.5738086700439453,
"eval_runtime": 221.6243,
"eval_samples_per_second": 11.668,
"eval_steps_per_second": 0.731,
"step": 540
},
{
"epoch": 1.41,
"learning_rate": 5.716649082796199e-06,
"loss": 1.5054,
"step": 541
},
{
"epoch": 1.41,
"learning_rate": 5.702691333943638e-06,
"loss": 1.7154,
"step": 542
},
{
"epoch": 1.41,
"learning_rate": 5.688727996611434e-06,
"loss": 1.5585,
"step": 543
},
{
"epoch": 1.42,
"learning_rate": 5.6747591818495185e-06,
"loss": 1.625,
"step": 544
},
{
"epoch": 1.42,
"learning_rate": 5.6607850007513876e-06,
"loss": 1.6745,
"step": 545
},
{
"epoch": 1.42,
"learning_rate": 5.646805564453216e-06,
"loss": 1.748,
"step": 546
},
{
"epoch": 1.42,
"learning_rate": 5.632820984132973e-06,
"loss": 1.6759,
"step": 547
},
{
"epoch": 1.43,
"learning_rate": 5.6188313710095375e-06,
"loss": 1.636,
"step": 548
},
{
"epoch": 1.43,
"learning_rate": 5.604836836341816e-06,
"loss": 1.5845,
"step": 549
},
{
"epoch": 1.43,
"learning_rate": 5.5908374914278495e-06,
"loss": 1.5615,
"step": 550
},
{
"epoch": 1.43,
"eval_loss": 1.5724763870239258,
"eval_runtime": 221.5669,
"eval_samples_per_second": 11.671,
"eval_steps_per_second": 0.731,
"step": 550
},
{
"epoch": 1.44,
"learning_rate": 5.576833447603943e-06,
"loss": 1.5203,
"step": 551
},
{
"epoch": 1.44,
"learning_rate": 5.562824816243769e-06,
"loss": 1.6132,
"step": 552
},
{
"epoch": 1.44,
"learning_rate": 5.5488117087574785e-06,
"loss": 1.6566,
"step": 553
},
{
"epoch": 1.44,
"learning_rate": 5.5347942365908315e-06,
"loss": 1.6915,
"step": 554
},
{
"epoch": 1.45,
"learning_rate": 5.520772511224293e-06,
"loss": 1.739,
"step": 555
},
{
"epoch": 1.45,
"learning_rate": 5.506746644172154e-06,
"loss": 1.5929,
"step": 556
},
{
"epoch": 1.45,
"learning_rate": 5.492716746981646e-06,
"loss": 1.5804,
"step": 557
},
{
"epoch": 1.45,
"learning_rate": 5.478682931232053e-06,
"loss": 1.6294,
"step": 558
},
{
"epoch": 1.46,
"learning_rate": 5.46464530853382e-06,
"loss": 1.6411,
"step": 559
},
{
"epoch": 1.46,
"learning_rate": 5.45060399052767e-06,
"loss": 1.6129,
"step": 560
},
{
"epoch": 1.46,
"eval_loss": 1.5711848735809326,
"eval_runtime": 221.6895,
"eval_samples_per_second": 11.665,
"eval_steps_per_second": 0.731,
"step": 560
},
{
"epoch": 1.46,
"learning_rate": 5.4365590888837175e-06,
"loss": 1.6629,
"step": 561
},
{
"epoch": 1.46,
"learning_rate": 5.4225107153005715e-06,
"loss": 1.6555,
"step": 562
},
{
"epoch": 1.47,
"learning_rate": 5.408458981504458e-06,
"loss": 1.5144,
"step": 563
},
{
"epoch": 1.47,
"learning_rate": 5.394403999248327e-06,
"loss": 1.5986,
"step": 564
},
{
"epoch": 1.47,
"learning_rate": 5.3803458803109606e-06,
"loss": 1.6898,
"step": 565
},
{
"epoch": 1.47,
"learning_rate": 5.3662847364960855e-06,
"loss": 1.6196,
"step": 566
},
{
"epoch": 1.48,
"learning_rate": 5.352220679631491e-06,
"loss": 1.6847,
"step": 567
},
{
"epoch": 1.48,
"learning_rate": 5.338153821568127e-06,
"loss": 1.6208,
"step": 568
},
{
"epoch": 1.48,
"learning_rate": 5.324084274179228e-06,
"loss": 1.6083,
"step": 569
},
{
"epoch": 1.48,
"learning_rate": 5.310012149359411e-06,
"loss": 1.6709,
"step": 570
},
{
"epoch": 1.48,
"eval_loss": 1.5699827671051025,
"eval_runtime": 221.5944,
"eval_samples_per_second": 11.67,
"eval_steps_per_second": 0.731,
"step": 570
},
{
"epoch": 1.49,
"learning_rate": 5.295937559023794e-06,
"loss": 1.565,
"step": 571
},
{
"epoch": 1.49,
"learning_rate": 5.2818606151071015e-06,
"loss": 1.667,
"step": 572
},
{
"epoch": 1.49,
"learning_rate": 5.26778142956278e-06,
"loss": 1.4954,
"step": 573
},
{
"epoch": 1.5,
"learning_rate": 5.253700114362096e-06,
"loss": 1.5335,
"step": 574
},
{
"epoch": 1.5,
"learning_rate": 5.23961678149326e-06,
"loss": 1.645,
"step": 575
},
{
"epoch": 1.5,
"learning_rate": 5.225531542960528e-06,
"loss": 1.5962,
"step": 576
},
{
"epoch": 1.5,
"learning_rate": 5.211444510783309e-06,
"loss": 1.6939,
"step": 577
},
{
"epoch": 1.51,
"learning_rate": 5.197355796995277e-06,
"loss": 1.6092,
"step": 578
},
{
"epoch": 1.51,
"learning_rate": 5.183265513643484e-06,
"loss": 1.5633,
"step": 579
},
{
"epoch": 1.51,
"learning_rate": 5.169173772787458e-06,
"loss": 1.5818,
"step": 580
},
{
"epoch": 1.51,
"eval_loss": 1.5683159828186035,
"eval_runtime": 221.5828,
"eval_samples_per_second": 11.671,
"eval_steps_per_second": 0.731,
"step": 580
},
{
"epoch": 1.51,
"learning_rate": 5.1550806864983235e-06,
"loss": 1.6241,
"step": 581
},
{
"epoch": 1.52,
"learning_rate": 5.140986366857904e-06,
"loss": 1.6045,
"step": 582
},
{
"epoch": 1.52,
"learning_rate": 5.126890925957832e-06,
"loss": 1.6047,
"step": 583
},
{
"epoch": 1.52,
"learning_rate": 5.1127944758986545e-06,
"loss": 1.6186,
"step": 584
},
{
"epoch": 1.52,
"learning_rate": 5.098697128788951e-06,
"loss": 1.5565,
"step": 585
},
{
"epoch": 1.53,
"learning_rate": 5.084598996744426e-06,
"loss": 1.6093,
"step": 586
},
{
"epoch": 1.53,
"learning_rate": 5.070500191887034e-06,
"loss": 1.5773,
"step": 587
},
{
"epoch": 1.53,
"learning_rate": 5.056400826344078e-06,
"loss": 1.6378,
"step": 588
},
{
"epoch": 1.53,
"learning_rate": 5.042301012247317e-06,
"loss": 1.6531,
"step": 589
},
{
"epoch": 1.54,
"learning_rate": 5.028200861732083e-06,
"loss": 1.6358,
"step": 590
},
{
"epoch": 1.54,
"eval_loss": 1.567191481590271,
"eval_runtime": 221.5558,
"eval_samples_per_second": 11.672,
"eval_steps_per_second": 0.731,
"step": 590
},
{
"epoch": 1.54,
"learning_rate": 5.014100486936383e-06,
"loss": 1.6578,
"step": 591
},
{
"epoch": 1.54,
"learning_rate": 5e-06,
"loss": 1.5503,
"step": 592
},
{
"epoch": 1.54,
"learning_rate": 4.985899513063618e-06,
"loss": 1.5313,
"step": 593
},
{
"epoch": 1.55,
"learning_rate": 4.971799138267918e-06,
"loss": 1.5539,
"step": 594
},
{
"epoch": 1.55,
"learning_rate": 4.957698987752684e-06,
"loss": 1.6979,
"step": 595
},
{
"epoch": 1.55,
"learning_rate": 4.943599173655924e-06,
"loss": 1.6068,
"step": 596
},
{
"epoch": 1.56,
"learning_rate": 4.929499808112969e-06,
"loss": 1.6521,
"step": 597
},
{
"epoch": 1.56,
"learning_rate": 4.915401003255577e-06,
"loss": 1.6487,
"step": 598
},
{
"epoch": 1.56,
"learning_rate": 4.9013028712110526e-06,
"loss": 1.7284,
"step": 599
},
{
"epoch": 1.56,
"learning_rate": 4.8872055241013455e-06,
"loss": 1.6513,
"step": 600
},
{
"epoch": 1.56,
"eval_loss": 1.566185474395752,
"eval_runtime": 221.56,
"eval_samples_per_second": 11.672,
"eval_steps_per_second": 0.731,
"step": 600
},
{
"epoch": 1.57,
"learning_rate": 4.873109074042169e-06,
"loss": 1.541,
"step": 601
},
{
"epoch": 1.57,
"learning_rate": 4.859013633142096e-06,
"loss": 1.6876,
"step": 602
},
{
"epoch": 1.57,
"learning_rate": 4.844919313501677e-06,
"loss": 1.607,
"step": 603
},
{
"epoch": 1.57,
"learning_rate": 4.830826227212543e-06,
"loss": 1.6963,
"step": 604
},
{
"epoch": 1.58,
"learning_rate": 4.816734486356518e-06,
"loss": 1.5876,
"step": 605
},
{
"epoch": 1.58,
"learning_rate": 4.802644203004724e-06,
"loss": 1.6434,
"step": 606
},
{
"epoch": 1.58,
"learning_rate": 4.788555489216692e-06,
"loss": 1.678,
"step": 607
},
{
"epoch": 1.58,
"learning_rate": 4.7744684570394734e-06,
"loss": 1.5858,
"step": 608
},
{
"epoch": 1.59,
"learning_rate": 4.760383218506742e-06,
"loss": 1.5771,
"step": 609
},
{
"epoch": 1.59,
"learning_rate": 4.7462998856379065e-06,
"loss": 1.5637,
"step": 610
},
{
"epoch": 1.59,
"eval_loss": 1.5654144287109375,
"eval_runtime": 221.6983,
"eval_samples_per_second": 11.665,
"eval_steps_per_second": 0.731,
"step": 610
},
{
"epoch": 1.59,
"learning_rate": 4.732218570437224e-06,
"loss": 1.6174,
"step": 611
},
{
"epoch": 1.59,
"learning_rate": 4.7181393848929e-06,
"loss": 1.6466,
"step": 612
},
{
"epoch": 1.6,
"learning_rate": 4.704062440976209e-06,
"loss": 1.5726,
"step": 613
},
{
"epoch": 1.6,
"learning_rate": 4.6899878506405904e-06,
"loss": 1.6325,
"step": 614
},
{
"epoch": 1.6,
"learning_rate": 4.675915725820773e-06,
"loss": 1.7229,
"step": 615
},
{
"epoch": 1.6,
"learning_rate": 4.661846178431873e-06,
"loss": 1.5601,
"step": 616
},
{
"epoch": 1.61,
"learning_rate": 4.64777932036851e-06,
"loss": 1.6655,
"step": 617
},
{
"epoch": 1.61,
"learning_rate": 4.633715263503915e-06,
"loss": 1.5783,
"step": 618
},
{
"epoch": 1.61,
"learning_rate": 4.619654119689041e-06,
"loss": 1.6456,
"step": 619
},
{
"epoch": 1.62,
"learning_rate": 4.6055960007516734e-06,
"loss": 1.612,
"step": 620
},
{
"epoch": 1.62,
"eval_loss": 1.5643430948257446,
"eval_runtime": 221.536,
"eval_samples_per_second": 11.673,
"eval_steps_per_second": 0.731,
"step": 620
},
{
"epoch": 1.62,
"learning_rate": 4.591541018495543e-06,
"loss": 1.5746,
"step": 621
},
{
"epoch": 1.62,
"learning_rate": 4.577489284699429e-06,
"loss": 1.6141,
"step": 622
},
{
"epoch": 1.62,
"learning_rate": 4.563440911116283e-06,
"loss": 1.5741,
"step": 623
},
{
"epoch": 1.63,
"learning_rate": 4.549396009472331e-06,
"loss": 1.6693,
"step": 624
},
{
"epoch": 1.63,
"learning_rate": 4.535354691466181e-06,
"loss": 1.5201,
"step": 625
},
{
"epoch": 1.63,
"learning_rate": 4.521317068767949e-06,
"loss": 1.6772,
"step": 626
},
{
"epoch": 1.63,
"learning_rate": 4.507283253018355e-06,
"loss": 1.6413,
"step": 627
},
{
"epoch": 1.64,
"learning_rate": 4.493253355827846e-06,
"loss": 1.6118,
"step": 628
},
{
"epoch": 1.64,
"learning_rate": 4.479227488775707e-06,
"loss": 1.6488,
"step": 629
},
{
"epoch": 1.64,
"learning_rate": 4.465205763409169e-06,
"loss": 1.6396,
"step": 630
},
{
"epoch": 1.64,
"eval_loss": 1.5630210638046265,
"eval_runtime": 221.6501,
"eval_samples_per_second": 11.667,
"eval_steps_per_second": 0.731,
"step": 630
},
{
"epoch": 1.64,
"learning_rate": 4.4511882912425214e-06,
"loss": 1.6083,
"step": 631
},
{
"epoch": 1.65,
"learning_rate": 4.437175183756233e-06,
"loss": 1.6805,
"step": 632
},
{
"epoch": 1.65,
"learning_rate": 4.4231665523960574e-06,
"loss": 1.6304,
"step": 633
},
{
"epoch": 1.65,
"learning_rate": 4.409162508572151e-06,
"loss": 1.6537,
"step": 634
},
{
"epoch": 1.65,
"learning_rate": 4.395163163658186e-06,
"loss": 1.6465,
"step": 635
},
{
"epoch": 1.66,
"learning_rate": 4.381168628990463e-06,
"loss": 1.5872,
"step": 636
},
{
"epoch": 1.66,
"learning_rate": 4.367179015867028e-06,
"loss": 1.6445,
"step": 637
},
{
"epoch": 1.66,
"learning_rate": 4.3531944355467855e-06,
"loss": 1.7094,
"step": 638
},
{
"epoch": 1.66,
"learning_rate": 4.339214999248614e-06,
"loss": 1.5845,
"step": 639
},
{
"epoch": 1.67,
"learning_rate": 4.325240818150485e-06,
"loss": 1.6414,
"step": 640
},
{
"epoch": 1.67,
"eval_loss": 1.5620007514953613,
"eval_runtime": 221.5034,
"eval_samples_per_second": 11.675,
"eval_steps_per_second": 0.731,
"step": 640
},
{
"epoch": 1.67,
"learning_rate": 4.311272003388569e-06,
"loss": 1.6654,
"step": 641
},
{
"epoch": 1.67,
"learning_rate": 4.297308666056363e-06,
"loss": 1.5831,
"step": 642
},
{
"epoch": 1.68,
"learning_rate": 4.283350917203802e-06,
"loss": 1.6675,
"step": 643
},
{
"epoch": 1.68,
"learning_rate": 4.269398867836377e-06,
"loss": 1.5759,
"step": 644
},
{
"epoch": 1.68,
"learning_rate": 4.255452628914248e-06,
"loss": 1.6232,
"step": 645
},
{
"epoch": 1.68,
"learning_rate": 4.2415123113513665e-06,
"loss": 1.6168,
"step": 646
},
{
"epoch": 1.69,
"learning_rate": 4.22757802601459e-06,
"loss": 1.6575,
"step": 647
},
{
"epoch": 1.69,
"learning_rate": 4.213649883722806e-06,
"loss": 1.5819,
"step": 648
},
{
"epoch": 1.69,
"learning_rate": 4.199727995246041e-06,
"loss": 1.6676,
"step": 649
},
{
"epoch": 1.69,
"learning_rate": 4.185812471304589e-06,
"loss": 1.6096,
"step": 650
},
{
"epoch": 1.69,
"eval_loss": 1.5610833168029785,
"eval_runtime": 221.6192,
"eval_samples_per_second": 11.669,
"eval_steps_per_second": 0.731,
"step": 650
},
{
"epoch": 1.7,
"learning_rate": 4.171903422568128e-06,
"loss": 1.6859,
"step": 651
},
{
"epoch": 1.7,
"learning_rate": 4.158000959654833e-06,
"loss": 1.6561,
"step": 652
},
{
"epoch": 1.7,
"learning_rate": 4.1441051931305095e-06,
"loss": 1.7081,
"step": 653
},
{
"epoch": 1.7,
"learning_rate": 4.130216233507702e-06,
"loss": 1.6924,
"step": 654
},
{
"epoch": 1.71,
"learning_rate": 4.116334191244823e-06,
"loss": 1.6573,
"step": 655
},
{
"epoch": 1.71,
"learning_rate": 4.102459176745267e-06,
"loss": 1.5603,
"step": 656
},
{
"epoch": 1.71,
"learning_rate": 4.088591300356543e-06,
"loss": 1.5204,
"step": 657
},
{
"epoch": 1.71,
"learning_rate": 4.074730672369386e-06,
"loss": 1.5972,
"step": 658
},
{
"epoch": 1.72,
"learning_rate": 4.060877403016886e-06,
"loss": 1.6257,
"step": 659
},
{
"epoch": 1.72,
"learning_rate": 4.047031602473613e-06,
"loss": 1.6149,
"step": 660
},
{
"epoch": 1.72,
"eval_loss": 1.560268521308899,
"eval_runtime": 221.5662,
"eval_samples_per_second": 11.671,
"eval_steps_per_second": 0.731,
"step": 660
},
{
"epoch": 1.72,
"learning_rate": 4.033193380854733e-06,
"loss": 1.6141,
"step": 661
},
{
"epoch": 1.72,
"learning_rate": 4.019362848215141e-06,
"loss": 1.646,
"step": 662
},
{
"epoch": 1.73,
"learning_rate": 4.00554011454858e-06,
"loss": 1.6241,
"step": 663
},
{
"epoch": 1.73,
"learning_rate": 3.991725289786766e-06,
"loss": 1.5474,
"step": 664
},
{
"epoch": 1.73,
"learning_rate": 3.977918483798519e-06,
"loss": 1.6931,
"step": 665
},
{
"epoch": 1.73,
"learning_rate": 3.964119806388887e-06,
"loss": 1.6239,
"step": 666
},
{
"epoch": 1.74,
"learning_rate": 3.950329367298268e-06,
"loss": 1.716,
"step": 667
},
{
"epoch": 1.74,
"learning_rate": 3.936547276201542e-06,
"loss": 1.5429,
"step": 668
},
{
"epoch": 1.74,
"learning_rate": 3.9227736427071995e-06,
"loss": 1.628,
"step": 669
},
{
"epoch": 1.75,
"learning_rate": 3.909008576356467e-06,
"loss": 1.5886,
"step": 670
},
{
"epoch": 1.75,
"eval_loss": 1.5592665672302246,
"eval_runtime": 221.6247,
"eval_samples_per_second": 11.668,
"eval_steps_per_second": 0.731,
"step": 670
},
{
"epoch": 1.75,
"learning_rate": 3.895252186622433e-06,
"loss": 1.7161,
"step": 671
},
{
"epoch": 1.75,
"learning_rate": 3.8815045829091915e-06,
"loss": 1.6288,
"step": 672
},
{
"epoch": 1.75,
"learning_rate": 3.867765874550949e-06,
"loss": 1.5689,
"step": 673
},
{
"epoch": 1.76,
"learning_rate": 3.854036170811176e-06,
"loss": 1.6022,
"step": 674
},
{
"epoch": 1.76,
"learning_rate": 3.840315580881728e-06,
"loss": 1.531,
"step": 675
},
{
"epoch": 1.76,
"learning_rate": 3.826604213881975e-06,
"loss": 1.6102,
"step": 676
},
{
"epoch": 1.76,
"learning_rate": 3.812902178857941e-06,
"loss": 1.6612,
"step": 677
},
{
"epoch": 1.77,
"learning_rate": 3.7992095847814337e-06,
"loss": 1.6132,
"step": 678
},
{
"epoch": 1.77,
"learning_rate": 3.785526540549173e-06,
"loss": 1.5645,
"step": 679
},
{
"epoch": 1.77,
"learning_rate": 3.771853154981934e-06,
"loss": 1.537,
"step": 680
},
{
"epoch": 1.77,
"eval_loss": 1.5582250356674194,
"eval_runtime": 221.5377,
"eval_samples_per_second": 11.673,
"eval_steps_per_second": 0.731,
"step": 680
},
{
"epoch": 1.77,
"learning_rate": 3.758189536823673e-06,
"loss": 1.6137,
"step": 681
},
{
"epoch": 1.78,
"learning_rate": 3.7445357947406714e-06,
"loss": 1.6117,
"step": 682
},
{
"epoch": 1.78,
"learning_rate": 3.730892037320659e-06,
"loss": 1.5968,
"step": 683
},
{
"epoch": 1.78,
"learning_rate": 3.717258373071965e-06,
"loss": 1.5811,
"step": 684
},
{
"epoch": 1.78,
"learning_rate": 3.7036349104226434e-06,
"loss": 1.6562,
"step": 685
},
{
"epoch": 1.79,
"learning_rate": 3.6900217577196183e-06,
"loss": 1.5226,
"step": 686
},
{
"epoch": 1.79,
"learning_rate": 3.67641902322782e-06,
"loss": 1.641,
"step": 687
},
{
"epoch": 1.79,
"learning_rate": 3.662826815129317e-06,
"loss": 1.5802,
"step": 688
},
{
"epoch": 1.79,
"learning_rate": 3.6492452415224675e-06,
"loss": 1.6854,
"step": 689
},
{
"epoch": 1.8,
"learning_rate": 3.6356744104210528e-06,
"loss": 1.5883,
"step": 690
},
{
"epoch": 1.8,
"eval_loss": 1.5573909282684326,
"eval_runtime": 221.6972,
"eval_samples_per_second": 11.665,
"eval_steps_per_second": 0.731,
"step": 690
},
{
"epoch": 1.8,
"learning_rate": 3.6221144297534178e-06,
"loss": 1.6555,
"step": 691
},
{
"epoch": 1.8,
"learning_rate": 3.608565407361615e-06,
"loss": 1.5577,
"step": 692
},
{
"epoch": 1.81,
"learning_rate": 3.595027451000549e-06,
"loss": 1.5049,
"step": 693
},
{
"epoch": 1.81,
"learning_rate": 3.58150066833711e-06,
"loss": 1.4957,
"step": 694
},
{
"epoch": 1.81,
"learning_rate": 3.567985166949331e-06,
"loss": 1.6509,
"step": 695
},
{
"epoch": 1.81,
"learning_rate": 3.554481054325522e-06,
"loss": 1.6039,
"step": 696
},
{
"epoch": 1.82,
"learning_rate": 3.540988437863421e-06,
"loss": 1.5138,
"step": 697
},
{
"epoch": 1.82,
"learning_rate": 3.527507424869332e-06,
"loss": 1.6161,
"step": 698
},
{
"epoch": 1.82,
"learning_rate": 3.5140381225572826e-06,
"loss": 1.6566,
"step": 699
},
{
"epoch": 1.82,
"learning_rate": 3.5005806380481634e-06,
"loss": 1.6512,
"step": 700
},
{
"epoch": 1.82,
"eval_loss": 1.556625247001648,
"eval_runtime": 221.5959,
"eval_samples_per_second": 11.67,
"eval_steps_per_second": 0.731,
"step": 700
},
{
"epoch": 1.83,
"learning_rate": 3.4871350783688795e-06,
"loss": 1.5832,
"step": 701
},
{
"epoch": 1.83,
"learning_rate": 3.4737015504514993e-06,
"loss": 1.6393,
"step": 702
},
{
"epoch": 1.83,
"learning_rate": 3.4602801611323977e-06,
"loss": 1.5706,
"step": 703
},
{
"epoch": 1.83,
"learning_rate": 3.4468710171514175e-06,
"loss": 1.5715,
"step": 704
},
{
"epoch": 1.84,
"learning_rate": 3.4334742251510127e-06,
"loss": 1.5803,
"step": 705
},
{
"epoch": 1.84,
"learning_rate": 3.420089891675401e-06,
"loss": 1.7129,
"step": 706
},
{
"epoch": 1.84,
"learning_rate": 3.40671812316972e-06,
"loss": 1.6066,
"step": 707
},
{
"epoch": 1.84,
"learning_rate": 3.393359025979178e-06,
"loss": 1.6188,
"step": 708
},
{
"epoch": 1.85,
"learning_rate": 3.3800127063482097e-06,
"loss": 1.7055,
"step": 709
},
{
"epoch": 1.85,
"learning_rate": 3.366679270419626e-06,
"loss": 1.683,
"step": 710
},
{
"epoch": 1.85,
"eval_loss": 1.5559163093566895,
"eval_runtime": 221.6474,
"eval_samples_per_second": 11.667,
"eval_steps_per_second": 0.731,
"step": 710
},
{
"epoch": 1.85,
"learning_rate": 3.35335882423378e-06,
"loss": 1.6224,
"step": 711
},
{
"epoch": 1.85,
"learning_rate": 3.3400514737277144e-06,
"loss": 1.5804,
"step": 712
},
{
"epoch": 1.86,
"learning_rate": 3.326757324734322e-06,
"loss": 1.6535,
"step": 713
},
{
"epoch": 1.86,
"learning_rate": 3.3134764829815064e-06,
"loss": 1.6257,
"step": 714
},
{
"epoch": 1.86,
"learning_rate": 3.300209054091339e-06,
"loss": 1.6455,
"step": 715
},
{
"epoch": 1.87,
"learning_rate": 3.2869551435792185e-06,
"loss": 1.5455,
"step": 716
},
{
"epoch": 1.87,
"learning_rate": 3.273714856853033e-06,
"loss": 1.5906,
"step": 717
},
{
"epoch": 1.87,
"learning_rate": 3.260488299212319e-06,
"loss": 1.5916,
"step": 718
},
{
"epoch": 1.87,
"learning_rate": 3.247275575847427e-06,
"loss": 1.7453,
"step": 719
},
{
"epoch": 1.88,
"learning_rate": 3.2340767918386883e-06,
"loss": 1.7059,
"step": 720
},
{
"epoch": 1.88,
"eval_loss": 1.5549125671386719,
"eval_runtime": 221.5512,
"eval_samples_per_second": 11.672,
"eval_steps_per_second": 0.731,
"step": 720
},
{
"epoch": 1.88,
"learning_rate": 3.2208920521555677e-06,
"loss": 1.7162,
"step": 721
},
{
"epoch": 1.88,
"learning_rate": 3.20772146165584e-06,
"loss": 1.6342,
"step": 722
},
{
"epoch": 1.88,
"learning_rate": 3.194565125084753e-06,
"loss": 1.6197,
"step": 723
},
{
"epoch": 1.89,
"learning_rate": 3.181423147074192e-06,
"loss": 1.6359,
"step": 724
},
{
"epoch": 1.89,
"learning_rate": 3.1682956321418484e-06,
"loss": 1.6265,
"step": 725
},
{
"epoch": 1.89,
"learning_rate": 3.15518268469039e-06,
"loss": 1.4916,
"step": 726
},
{
"epoch": 1.89,
"learning_rate": 3.1420844090066315e-06,
"loss": 1.5581,
"step": 727
},
{
"epoch": 1.9,
"learning_rate": 3.1290009092606988e-06,
"loss": 1.6289,
"step": 728
},
{
"epoch": 1.9,
"learning_rate": 3.1159322895052135e-06,
"loss": 1.5356,
"step": 729
},
{
"epoch": 1.9,
"learning_rate": 3.1028786536744495e-06,
"loss": 1.5453,
"step": 730
},
{
"epoch": 1.9,
"eval_loss": 1.554166316986084,
"eval_runtime": 221.6837,
"eval_samples_per_second": 11.665,
"eval_steps_per_second": 0.731,
"step": 730
},
{
"epoch": 1.9,
"learning_rate": 3.089840105583516e-06,
"loss": 1.6232,
"step": 731
},
{
"epoch": 1.91,
"learning_rate": 3.0768167489275325e-06,
"loss": 1.6116,
"step": 732
},
{
"epoch": 1.91,
"learning_rate": 3.0638086872807987e-06,
"loss": 1.5252,
"step": 733
},
{
"epoch": 1.91,
"learning_rate": 3.050816024095975e-06,
"loss": 1.6361,
"step": 734
},
{
"epoch": 1.91,
"learning_rate": 3.037838862703258e-06,
"loss": 1.6747,
"step": 735
},
{
"epoch": 1.92,
"learning_rate": 3.0248773063095606e-06,
"loss": 1.5886,
"step": 736
},
{
"epoch": 1.92,
"learning_rate": 3.0119314579976854e-06,
"loss": 1.6214,
"step": 737
},
{
"epoch": 1.92,
"learning_rate": 2.9990014207255134e-06,
"loss": 1.583,
"step": 738
},
{
"epoch": 1.93,
"learning_rate": 2.9860872973251815e-06,
"loss": 1.5699,
"step": 739
},
{
"epoch": 1.93,
"learning_rate": 2.9731891905022593e-06,
"loss": 1.5738,
"step": 740
},
{
"epoch": 1.93,
"eval_loss": 1.553594708442688,
"eval_runtime": 221.5462,
"eval_samples_per_second": 11.673,
"eval_steps_per_second": 0.731,
"step": 740
},
{
"epoch": 1.93,
"learning_rate": 2.960307202834941e-06,
"loss": 1.632,
"step": 741
},
{
"epoch": 1.93,
"learning_rate": 2.947441436773224e-06,
"loss": 1.6873,
"step": 742
},
{
"epoch": 1.94,
"learning_rate": 2.9345919946380985e-06,
"loss": 1.6665,
"step": 743
},
{
"epoch": 1.94,
"learning_rate": 2.9217589786207296e-06,
"loss": 1.585,
"step": 744
},
{
"epoch": 1.94,
"learning_rate": 2.9089424907816433e-06,
"loss": 1.5554,
"step": 745
},
{
"epoch": 1.94,
"learning_rate": 2.896142633049922e-06,
"loss": 1.69,
"step": 746
},
{
"epoch": 1.95,
"learning_rate": 2.8833595072223842e-06,
"loss": 1.5192,
"step": 747
},
{
"epoch": 1.95,
"learning_rate": 2.870593214962787e-06,
"loss": 1.5848,
"step": 748
},
{
"epoch": 1.95,
"learning_rate": 2.8578438578010053e-06,
"loss": 1.6617,
"step": 749
},
{
"epoch": 1.95,
"learning_rate": 2.8451115371322302e-06,
"loss": 1.6004,
"step": 750
},
{
"epoch": 1.95,
"eval_loss": 1.5529882907867432,
"eval_runtime": 221.6028,
"eval_samples_per_second": 11.67,
"eval_steps_per_second": 0.731,
"step": 750
},
{
"epoch": 1.96,
"learning_rate": 2.8323963542161665e-06,
"loss": 1.6171,
"step": 751
},
{
"epoch": 1.96,
"learning_rate": 2.8196984101762182e-06,
"loss": 1.6354,
"step": 752
},
{
"epoch": 1.96,
"learning_rate": 2.807017805998689e-06,
"loss": 1.6368,
"step": 753
},
{
"epoch": 1.96,
"learning_rate": 2.7943546425319857e-06,
"loss": 1.5776,
"step": 754
},
{
"epoch": 1.97,
"learning_rate": 2.7817090204857997e-06,
"loss": 1.6396,
"step": 755
},
{
"epoch": 1.97,
"learning_rate": 2.7690810404303276e-06,
"loss": 1.6176,
"step": 756
},
{
"epoch": 1.97,
"learning_rate": 2.756470802795449e-06,
"loss": 1.5653,
"step": 757
},
{
"epoch": 1.97,
"learning_rate": 2.743878407869947e-06,
"loss": 1.6045,
"step": 758
},
{
"epoch": 1.98,
"learning_rate": 2.7313039558006952e-06,
"loss": 1.621,
"step": 759
},
{
"epoch": 1.98,
"learning_rate": 2.7187475465918768e-06,
"loss": 1.6753,
"step": 760
},
{
"epoch": 1.98,
"eval_loss": 1.5522700548171997,
"eval_runtime": 221.543,
"eval_samples_per_second": 11.673,
"eval_steps_per_second": 0.731,
"step": 760
},
{
"epoch": 1.98,
"learning_rate": 2.7062092801041717e-06,
"loss": 1.5738,
"step": 761
},
{
"epoch": 1.99,
"learning_rate": 2.693689256053976e-06,
"loss": 1.672,
"step": 762
},
{
"epoch": 1.99,
"learning_rate": 2.6811875740126063e-06,
"loss": 1.5926,
"step": 763
},
{
"epoch": 1.99,
"learning_rate": 2.6687043334055017e-06,
"loss": 1.5286,
"step": 764
},
{
"epoch": 1.99,
"learning_rate": 2.656239633511437e-06,
"loss": 1.5802,
"step": 765
},
{
"epoch": 2.0,
"learning_rate": 2.643793573461739e-06,
"loss": 1.6207,
"step": 766
},
{
"epoch": 2.0,
"learning_rate": 2.631366252239488e-06,
"loss": 1.582,
"step": 767
},
{
"epoch": 2.0,
"learning_rate": 2.6189577686787317e-06,
"loss": 1.6199,
"step": 768
},
{
"epoch": 2.0,
"learning_rate": 2.6065682214637124e-06,
"loss": 1.6498,
"step": 769
},
{
"epoch": 2.01,
"learning_rate": 2.5941977091280614e-06,
"loss": 1.6362,
"step": 770
},
{
"epoch": 2.01,
"eval_loss": 1.551666498184204,
"eval_runtime": 221.6901,
"eval_samples_per_second": 11.665,
"eval_steps_per_second": 0.731,
"step": 770
},
{
"epoch": 2.01,
"learning_rate": 2.581846330054034e-06,
"loss": 1.6039,
"step": 771
},
{
"epoch": 2.01,
"learning_rate": 2.5695141824717183e-06,
"loss": 1.4543,
"step": 772
},
{
"epoch": 2.01,
"learning_rate": 2.557201364458252e-06,
"loss": 1.5505,
"step": 773
},
{
"epoch": 2.02,
"learning_rate": 2.5449079739370454e-06,
"loss": 1.5936,
"step": 774
},
{
"epoch": 2.02,
"learning_rate": 2.532634108677006e-06,
"loss": 1.5993,
"step": 775
},
{
"epoch": 2.02,
"learning_rate": 2.5203798662917555e-06,
"loss": 1.5803,
"step": 776
},
{
"epoch": 2.02,
"learning_rate": 2.508145344238854e-06,
"loss": 1.5481,
"step": 777
},
{
"epoch": 2.03,
"learning_rate": 2.4959306398190304e-06,
"loss": 1.5412,
"step": 778
},
{
"epoch": 2.03,
"learning_rate": 2.483735850175402e-06,
"loss": 1.5478,
"step": 779
},
{
"epoch": 2.03,
"learning_rate": 2.471561072292703e-06,
"loss": 1.5805,
"step": 780
},
{
"epoch": 2.03,
"eval_loss": 1.5511072874069214,
"eval_runtime": 221.428,
"eval_samples_per_second": 11.679,
"eval_steps_per_second": 0.732,
"step": 780
},
{
"epoch": 2.03,
"learning_rate": 2.4594064029965197e-06,
"loss": 1.6623,
"step": 781
},
{
"epoch": 2.04,
"learning_rate": 2.44727193895251e-06,
"loss": 1.5497,
"step": 782
},
{
"epoch": 2.04,
"learning_rate": 2.4351577766656465e-06,
"loss": 1.6082,
"step": 783
},
{
"epoch": 2.04,
"learning_rate": 2.4230640124794364e-06,
"loss": 1.6368,
"step": 784
},
{
"epoch": 2.04,
"learning_rate": 2.4109907425751616e-06,
"loss": 1.6386,
"step": 785
},
{
"epoch": 2.05,
"learning_rate": 2.3989380629711197e-06,
"loss": 1.6387,
"step": 786
},
{
"epoch": 2.05,
"learning_rate": 2.3869060695218513e-06,
"loss": 1.5591,
"step": 787
},
{
"epoch": 2.05,
"learning_rate": 2.3748948579173792e-06,
"loss": 1.5081,
"step": 788
},
{
"epoch": 2.06,
"learning_rate": 2.362904523682447e-06,
"loss": 1.5427,
"step": 789
},
{
"epoch": 2.06,
"learning_rate": 2.350935162175769e-06,
"loss": 1.6416,
"step": 790
},
{
"epoch": 2.06,
"eval_loss": 1.5508249998092651,
"eval_runtime": 221.6253,
"eval_samples_per_second": 11.668,
"eval_steps_per_second": 0.731,
"step": 790
},
{
"epoch": 2.06,
"learning_rate": 2.3389868685892573e-06,
"loss": 1.6477,
"step": 791
},
{
"epoch": 2.06,
"learning_rate": 2.3270597379472713e-06,
"loss": 1.5434,
"step": 792
},
{
"epoch": 2.07,
"learning_rate": 2.3151538651058687e-06,
"loss": 1.6604,
"step": 793
},
{
"epoch": 2.07,
"learning_rate": 2.303269344752039e-06,
"loss": 1.5617,
"step": 794
},
{
"epoch": 2.07,
"learning_rate": 2.2914062714029545e-06,
"loss": 1.6272,
"step": 795
},
{
"epoch": 2.07,
"learning_rate": 2.2795647394052284e-06,
"loss": 1.5248,
"step": 796
},
{
"epoch": 2.08,
"learning_rate": 2.267744842934147e-06,
"loss": 1.7009,
"step": 797
},
{
"epoch": 2.08,
"learning_rate": 2.255946675992938e-06,
"loss": 1.5608,
"step": 798
},
{
"epoch": 2.08,
"learning_rate": 2.2441703324120095e-06,
"loss": 1.6098,
"step": 799
},
{
"epoch": 2.08,
"learning_rate": 2.2324159058482086e-06,
"loss": 1.5755,
"step": 800
},
{
"epoch": 2.08,
"eval_loss": 1.5505614280700684,
"eval_runtime": 221.4712,
"eval_samples_per_second": 11.676,
"eval_steps_per_second": 0.731,
"step": 800
},
{
"epoch": 2.09,
"learning_rate": 2.2206834897840814e-06,
"loss": 1.5641,
"step": 801
},
{
"epoch": 2.09,
"learning_rate": 2.208973177527125e-06,
"loss": 1.586,
"step": 802
},
{
"epoch": 2.09,
"learning_rate": 2.1972850622090426e-06,
"loss": 1.7206,
"step": 803
},
{
"epoch": 2.09,
"learning_rate": 2.185619236785005e-06,
"loss": 1.4538,
"step": 804
},
{
"epoch": 2.1,
"learning_rate": 2.1739757940329177e-06,
"loss": 1.6007,
"step": 805
},
{
"epoch": 2.1,
"learning_rate": 2.1623548265526734e-06,
"loss": 1.5956,
"step": 806
},
{
"epoch": 2.1,
"learning_rate": 2.1507564267654187e-06,
"loss": 1.5959,
"step": 807
},
{
"epoch": 2.1,
"learning_rate": 2.139180686912825e-06,
"loss": 1.5359,
"step": 808
},
{
"epoch": 2.11,
"learning_rate": 2.127627699056345e-06,
"loss": 1.6186,
"step": 809
},
{
"epoch": 2.11,
"learning_rate": 2.11609755507649e-06,
"loss": 1.5763,
"step": 810
},
{
"epoch": 2.11,
"eval_loss": 1.550114631652832,
"eval_runtime": 221.6795,
"eval_samples_per_second": 11.665,
"eval_steps_per_second": 0.731,
"step": 810
},
{
"epoch": 2.11,
"learning_rate": 2.1045903466720915e-06,
"loss": 1.5873,
"step": 811
},
{
"epoch": 2.12,
"learning_rate": 2.093106165359574e-06,
"loss": 1.512,
"step": 812
},
{
"epoch": 2.12,
"learning_rate": 2.0816451024722344e-06,
"loss": 1.5757,
"step": 813
},
{
"epoch": 2.12,
"learning_rate": 2.0702072491595023e-06,
"loss": 1.6093,
"step": 814
},
{
"epoch": 2.12,
"learning_rate": 2.0587926963862287e-06,
"loss": 1.628,
"step": 815
},
{
"epoch": 2.13,
"learning_rate": 2.0474015349319505e-06,
"loss": 1.5597,
"step": 816
},
{
"epoch": 2.13,
"learning_rate": 2.03603385539018e-06,
"loss": 1.628,
"step": 817
},
{
"epoch": 2.13,
"learning_rate": 2.0246897481676735e-06,
"loss": 1.6856,
"step": 818
},
{
"epoch": 2.13,
"learning_rate": 2.013369303483719e-06,
"loss": 1.5875,
"step": 819
},
{
"epoch": 2.14,
"learning_rate": 2.0020726113694204e-06,
"loss": 1.7112,
"step": 820
},
{
"epoch": 2.14,
"eval_loss": 1.5496630668640137,
"eval_runtime": 221.4899,
"eval_samples_per_second": 11.675,
"eval_steps_per_second": 0.731,
"step": 820
},
{
"epoch": 2.14,
"learning_rate": 1.990799761666975e-06,
"loss": 1.6253,
"step": 821
},
{
"epoch": 2.14,
"learning_rate": 1.97955084402896e-06,
"loss": 1.6424,
"step": 822
},
{
"epoch": 2.14,
"learning_rate": 1.9683259479176294e-06,
"loss": 1.6307,
"step": 823
},
{
"epoch": 2.15,
"learning_rate": 1.9571251626041847e-06,
"loss": 1.5608,
"step": 824
},
{
"epoch": 2.15,
"learning_rate": 1.945948577168086e-06,
"loss": 1.5342,
"step": 825
},
{
"epoch": 2.15,
"learning_rate": 1.9347962804963238e-06,
"loss": 1.6104,
"step": 826
},
{
"epoch": 2.15,
"learning_rate": 1.923668361282723e-06,
"loss": 1.6282,
"step": 827
},
{
"epoch": 2.16,
"learning_rate": 1.9125649080272383e-06,
"loss": 1.5276,
"step": 828
},
{
"epoch": 2.16,
"learning_rate": 1.9014860090352477e-06,
"loss": 1.5614,
"step": 829
},
{
"epoch": 2.16,
"learning_rate": 1.8904317524168458e-06,
"loss": 1.6533,
"step": 830
},
{
"epoch": 2.16,
"eval_loss": 1.5492993593215942,
"eval_runtime": 221.6236,
"eval_samples_per_second": 11.668,
"eval_steps_per_second": 0.731,
"step": 830
},
{
"epoch": 2.16,
"learning_rate": 1.8794022260861483e-06,
"loss": 1.5863,
"step": 831
},
{
"epoch": 2.17,
"learning_rate": 1.8683975177605968e-06,
"loss": 1.7112,
"step": 832
},
{
"epoch": 2.17,
"learning_rate": 1.8574177149602496e-06,
"loss": 1.601,
"step": 833
},
{
"epoch": 2.17,
"learning_rate": 1.8464629050070941e-06,
"loss": 1.5975,
"step": 834
},
{
"epoch": 2.18,
"learning_rate": 1.835533175024355e-06,
"loss": 1.623,
"step": 835
},
{
"epoch": 2.18,
"learning_rate": 1.8246286119357903e-06,
"loss": 1.5591,
"step": 836
},
{
"epoch": 2.18,
"learning_rate": 1.8137493024650094e-06,
"loss": 1.6076,
"step": 837
},
{
"epoch": 2.18,
"learning_rate": 1.802895333134783e-06,
"loss": 1.5775,
"step": 838
},
{
"epoch": 2.19,
"learning_rate": 1.792066790266348e-06,
"loss": 1.608,
"step": 839
},
{
"epoch": 2.19,
"learning_rate": 1.7812637599787298e-06,
"loss": 1.6008,
"step": 840
},
{
"epoch": 2.19,
"eval_loss": 1.5489221811294556,
"eval_runtime": 221.4324,
"eval_samples_per_second": 11.679,
"eval_steps_per_second": 0.732,
"step": 840
},
{
"epoch": 2.19,
"learning_rate": 1.7704863281880496e-06,
"loss": 1.6018,
"step": 841
},
{
"epoch": 2.19,
"learning_rate": 1.759734580606845e-06,
"loss": 1.6342,
"step": 842
},
{
"epoch": 2.2,
"learning_rate": 1.749008602743391e-06,
"loss": 1.6145,
"step": 843
},
{
"epoch": 2.2,
"learning_rate": 1.7383084799010164e-06,
"loss": 1.6211,
"step": 844
},
{
"epoch": 2.2,
"learning_rate": 1.7276342971774225e-06,
"loss": 1.6228,
"step": 845
},
{
"epoch": 2.2,
"learning_rate": 1.7169861394640108e-06,
"loss": 1.553,
"step": 846
},
{
"epoch": 2.21,
"learning_rate": 1.7063640914452113e-06,
"loss": 1.5691,
"step": 847
},
{
"epoch": 2.21,
"learning_rate": 1.6957682375977986e-06,
"loss": 1.5527,
"step": 848
},
{
"epoch": 2.21,
"learning_rate": 1.6851986621902267e-06,
"loss": 1.5086,
"step": 849
},
{
"epoch": 2.21,
"learning_rate": 1.6746554492819638e-06,
"loss": 1.5731,
"step": 850
},
{
"epoch": 2.21,
"eval_loss": 1.5485070943832397,
"eval_runtime": 221.5833,
"eval_samples_per_second": 11.671,
"eval_steps_per_second": 0.731,
"step": 850
},
{
"epoch": 2.22,
"learning_rate": 1.6641386827228107e-06,
"loss": 1.5955,
"step": 851
},
{
"epoch": 2.22,
"learning_rate": 1.653648446152248e-06,
"loss": 1.537,
"step": 852
},
{
"epoch": 2.22,
"learning_rate": 1.6431848229987586e-06,
"loss": 1.5893,
"step": 853
},
{
"epoch": 2.22,
"learning_rate": 1.6327478964791705e-06,
"loss": 1.543,
"step": 854
},
{
"epoch": 2.23,
"learning_rate": 1.6223377495980003e-06,
"loss": 1.7022,
"step": 855
},
{
"epoch": 2.23,
"learning_rate": 1.61195446514678e-06,
"loss": 1.563,
"step": 856
},
{
"epoch": 2.23,
"learning_rate": 1.601598125703407e-06,
"loss": 1.4853,
"step": 857
},
{
"epoch": 2.24,
"learning_rate": 1.5912688136314886e-06,
"loss": 1.5955,
"step": 858
},
{
"epoch": 2.24,
"learning_rate": 1.5809666110796856e-06,
"loss": 1.5711,
"step": 859
},
{
"epoch": 2.24,
"learning_rate": 1.570691599981053e-06,
"loss": 1.4975,
"step": 860
},
{
"epoch": 2.24,
"eval_loss": 1.5480471849441528,
"eval_runtime": 221.4835,
"eval_samples_per_second": 11.676,
"eval_steps_per_second": 0.731,
"step": 860
},
{
"epoch": 2.24,
"learning_rate": 1.5604438620523932e-06,
"loss": 1.6568,
"step": 861
},
{
"epoch": 2.25,
"learning_rate": 1.550223478793612e-06,
"loss": 1.6204,
"step": 862
},
{
"epoch": 2.25,
"learning_rate": 1.5400305314870596e-06,
"loss": 1.5778,
"step": 863
},
{
"epoch": 2.25,
"learning_rate": 1.5298651011968868e-06,
"loss": 1.5217,
"step": 864
},
{
"epoch": 2.25,
"learning_rate": 1.5197272687684106e-06,
"loss": 1.6217,
"step": 865
},
{
"epoch": 2.26,
"learning_rate": 1.5096171148274546e-06,
"loss": 1.6251,
"step": 866
},
{
"epoch": 2.26,
"learning_rate": 1.4995347197797227e-06,
"loss": 1.6151,
"step": 867
},
{
"epoch": 2.26,
"learning_rate": 1.4894801638101502e-06,
"loss": 1.6027,
"step": 868
},
{
"epoch": 2.26,
"learning_rate": 1.4794535268822674e-06,
"loss": 1.6239,
"step": 869
},
{
"epoch": 2.27,
"learning_rate": 1.469454888737571e-06,
"loss": 1.6158,
"step": 870
},
{
"epoch": 2.27,
"eval_loss": 1.5477726459503174,
"eval_runtime": 221.5786,
"eval_samples_per_second": 11.671,
"eval_steps_per_second": 0.731,
"step": 870
},
{
"epoch": 2.27,
"learning_rate": 1.4594843288948773e-06,
"loss": 1.5614,
"step": 871
},
{
"epoch": 2.27,
"learning_rate": 1.449541926649705e-06,
"loss": 1.5896,
"step": 872
},
{
"epoch": 2.27,
"learning_rate": 1.4396277610736287e-06,
"loss": 1.5643,
"step": 873
},
{
"epoch": 2.28,
"learning_rate": 1.4297419110136628e-06,
"loss": 1.5279,
"step": 874
},
{
"epoch": 2.28,
"learning_rate": 1.419884455091628e-06,
"loss": 1.5952,
"step": 875
},
{
"epoch": 2.28,
"learning_rate": 1.4100554717035242e-06,
"loss": 1.644,
"step": 876
},
{
"epoch": 2.28,
"learning_rate": 1.4002550390189162e-06,
"loss": 1.6332,
"step": 877
},
{
"epoch": 2.29,
"learning_rate": 1.3904832349803011e-06,
"loss": 1.5595,
"step": 878
},
{
"epoch": 2.29,
"learning_rate": 1.380740137302497e-06,
"loss": 1.6731,
"step": 879
},
{
"epoch": 2.29,
"learning_rate": 1.3710258234720191e-06,
"loss": 1.6063,
"step": 880
},
{
"epoch": 2.29,
"eval_loss": 1.5473620891571045,
"eval_runtime": 221.5086,
"eval_samples_per_second": 11.674,
"eval_steps_per_second": 0.731,
"step": 880
},
{
"epoch": 2.3,
"learning_rate": 1.361340370746464e-06,
"loss": 1.6298,
"step": 881
},
{
"epoch": 2.3,
"learning_rate": 1.3516838561539019e-06,
"loss": 1.6664,
"step": 882
},
{
"epoch": 2.3,
"learning_rate": 1.342056356492255e-06,
"loss": 1.5638,
"step": 883
},
{
"epoch": 2.3,
"learning_rate": 1.332457948328691e-06,
"loss": 1.6128,
"step": 884
},
{
"epoch": 2.31,
"learning_rate": 1.3228887079990155e-06,
"loss": 1.6247,
"step": 885
},
{
"epoch": 2.31,
"learning_rate": 1.3133487116070643e-06,
"loss": 1.5627,
"step": 886
},
{
"epoch": 2.31,
"learning_rate": 1.3038380350240948e-06,
"loss": 1.6161,
"step": 887
},
{
"epoch": 2.31,
"learning_rate": 1.2943567538881841e-06,
"loss": 1.6489,
"step": 888
},
{
"epoch": 2.32,
"learning_rate": 1.2849049436036325e-06,
"loss": 1.5597,
"step": 889
},
{
"epoch": 2.32,
"learning_rate": 1.2754826793403563e-06,
"loss": 1.628,
"step": 890
},
{
"epoch": 2.32,
"eval_loss": 1.5470210313796997,
"eval_runtime": 221.6295,
"eval_samples_per_second": 11.668,
"eval_steps_per_second": 0.731,
"step": 890
},
{
"epoch": 2.32,
"learning_rate": 1.2660900360332927e-06,
"loss": 1.6245,
"step": 891
},
{
"epoch": 2.32,
"learning_rate": 1.256727088381809e-06,
"loss": 1.6468,
"step": 892
},
{
"epoch": 2.33,
"learning_rate": 1.2473939108490974e-06,
"loss": 1.6272,
"step": 893
},
{
"epoch": 2.33,
"learning_rate": 1.238090577661596e-06,
"loss": 1.6395,
"step": 894
},
{
"epoch": 2.33,
"learning_rate": 1.2288171628083883e-06,
"loss": 1.5804,
"step": 895
},
{
"epoch": 2.33,
"learning_rate": 1.2195737400406165e-06,
"loss": 1.6033,
"step": 896
},
{
"epoch": 2.34,
"learning_rate": 1.210360382870902e-06,
"loss": 1.6096,
"step": 897
},
{
"epoch": 2.34,
"learning_rate": 1.201177164572752e-06,
"loss": 1.5839,
"step": 898
},
{
"epoch": 2.34,
"learning_rate": 1.192024158179979e-06,
"loss": 1.5953,
"step": 899
},
{
"epoch": 2.34,
"learning_rate": 1.1829014364861252e-06,
"loss": 1.6177,
"step": 900
},
{
"epoch": 2.34,
"eval_loss": 1.546774983406067,
"eval_runtime": 221.4788,
"eval_samples_per_second": 11.676,
"eval_steps_per_second": 0.731,
"step": 900
},
{
"epoch": 2.35,
"learning_rate": 1.1738090720438782e-06,
"loss": 1.5881,
"step": 901
},
{
"epoch": 2.35,
"learning_rate": 1.1647471371644943e-06,
"loss": 1.555,
"step": 902
},
{
"epoch": 2.35,
"learning_rate": 1.155715703917224e-06,
"loss": 1.5393,
"step": 903
},
{
"epoch": 2.35,
"learning_rate": 1.1467148441287423e-06,
"loss": 1.5533,
"step": 904
},
{
"epoch": 2.36,
"learning_rate": 1.1377446293825717e-06,
"loss": 1.603,
"step": 905
},
{
"epoch": 2.36,
"learning_rate": 1.1288051310185182e-06,
"loss": 1.5723,
"step": 906
},
{
"epoch": 2.36,
"learning_rate": 1.1198964201320994e-06,
"loss": 1.6016,
"step": 907
},
{
"epoch": 2.37,
"learning_rate": 1.1110185675739804e-06,
"loss": 1.577,
"step": 908
},
{
"epoch": 2.37,
"learning_rate": 1.1021716439494157e-06,
"loss": 1.667,
"step": 909
},
{
"epoch": 2.37,
"learning_rate": 1.093355719617678e-06,
"loss": 1.5646,
"step": 910
},
{
"epoch": 2.37,
"eval_loss": 1.546728491783142,
"eval_runtime": 221.5999,
"eval_samples_per_second": 11.67,
"eval_steps_per_second": 0.731,
"step": 910
},
{
"epoch": 2.37,
"learning_rate": 1.0845708646915054e-06,
"loss": 1.6052,
"step": 911
},
{
"epoch": 2.38,
"learning_rate": 1.0758171490365444e-06,
"loss": 1.6043,
"step": 912
},
{
"epoch": 2.38,
"learning_rate": 1.0670946422707883e-06,
"loss": 1.587,
"step": 913
},
{
"epoch": 2.38,
"learning_rate": 1.058403413764028e-06,
"loss": 1.6036,
"step": 914
},
{
"epoch": 2.38,
"learning_rate": 1.0497435326373023e-06,
"loss": 1.4276,
"step": 915
},
{
"epoch": 2.39,
"learning_rate": 1.0411150677623438e-06,
"loss": 1.5751,
"step": 916
},
{
"epoch": 2.39,
"learning_rate": 1.0325180877610313e-06,
"loss": 1.5523,
"step": 917
},
{
"epoch": 2.39,
"learning_rate": 1.023952661004845e-06,
"loss": 1.6882,
"step": 918
},
{
"epoch": 2.39,
"learning_rate": 1.0154188556143286e-06,
"loss": 1.5225,
"step": 919
},
{
"epoch": 2.4,
"learning_rate": 1.006916739458535e-06,
"loss": 1.5272,
"step": 920
},
{
"epoch": 2.4,
"eval_loss": 1.5465571880340576,
"eval_runtime": 221.6882,
"eval_samples_per_second": 11.665,
"eval_steps_per_second": 0.731,
"step": 920
},
{
"epoch": 2.4,
"learning_rate": 9.984463801544992e-07,
"loss": 1.6817,
"step": 921
},
{
"epoch": 2.4,
"learning_rate": 9.900078450666929e-07,
"loss": 1.5175,
"step": 922
},
{
"epoch": 2.4,
"learning_rate": 9.81601201306489e-07,
"loss": 1.6615,
"step": 923
},
{
"epoch": 2.41,
"learning_rate": 9.732265157316344e-07,
"loss": 1.6376,
"step": 924
},
{
"epoch": 2.41,
"learning_rate": 9.648838549457101e-07,
"loss": 1.5621,
"step": 925
},
{
"epoch": 2.41,
"learning_rate": 9.56573285297605e-07,
"loss": 1.7235,
"step": 926
},
{
"epoch": 2.41,
"learning_rate": 9.482948728809909e-07,
"loss": 1.6235,
"step": 927
},
{
"epoch": 2.42,
"learning_rate": 9.400486835337913e-07,
"loss": 1.68,
"step": 928
},
{
"epoch": 2.42,
"learning_rate": 9.318347828376639e-07,
"loss": 1.6075,
"step": 929
},
{
"epoch": 2.42,
"learning_rate": 9.236532361174727e-07,
"loss": 1.5402,
"step": 930
},
{
"epoch": 2.42,
"eval_loss": 1.5463948249816895,
"eval_runtime": 221.5101,
"eval_samples_per_second": 11.674,
"eval_steps_per_second": 0.731,
"step": 930
},
{
"epoch": 2.43,
"learning_rate": 9.15504108440774e-07,
"loss": 1.5344,
"step": 931
},
{
"epoch": 2.43,
"learning_rate": 9.073874646172958e-07,
"loss": 1.6023,
"step": 932
},
{
"epoch": 2.43,
"learning_rate": 8.993033691984215e-07,
"loss": 1.6561,
"step": 933
},
{
"epoch": 2.43,
"learning_rate": 8.912518864766817e-07,
"loss": 1.5915,
"step": 934
},
{
"epoch": 2.44,
"learning_rate": 8.832330804852351e-07,
"loss": 1.5965,
"step": 935
},
{
"epoch": 2.44,
"learning_rate": 8.752470149973686e-07,
"loss": 1.5433,
"step": 936
},
{
"epoch": 2.44,
"learning_rate": 8.672937535259812e-07,
"loss": 1.6243,
"step": 937
},
{
"epoch": 2.44,
"learning_rate": 8.593733593230813e-07,
"loss": 1.5851,
"step": 938
},
{
"epoch": 2.45,
"learning_rate": 8.51485895379291e-07,
"loss": 1.5311,
"step": 939
},
{
"epoch": 2.45,
"learning_rate": 8.43631424423334e-07,
"loss": 1.5815,
"step": 940
},
{
"epoch": 2.45,
"eval_loss": 1.5460950136184692,
"eval_runtime": 221.552,
"eval_samples_per_second": 11.672,
"eval_steps_per_second": 0.731,
"step": 940
},
{
"epoch": 2.45,
"learning_rate": 8.358100089215426e-07,
"loss": 1.6118,
"step": 941
},
{
"epoch": 2.45,
"learning_rate": 8.280217110773625e-07,
"loss": 1.6851,
"step": 942
},
{
"epoch": 2.46,
"learning_rate": 8.202665928308551e-07,
"loss": 1.5943,
"step": 943
},
{
"epoch": 2.46,
"learning_rate": 8.125447158582045e-07,
"loss": 1.524,
"step": 944
},
{
"epoch": 2.46,
"learning_rate": 8.048561415712269e-07,
"loss": 1.6237,
"step": 945
},
{
"epoch": 2.46,
"learning_rate": 7.972009311168883e-07,
"loss": 1.5489,
"step": 946
},
{
"epoch": 2.47,
"learning_rate": 7.895791453768076e-07,
"loss": 1.5575,
"step": 947
},
{
"epoch": 2.47,
"learning_rate": 7.819908449667824e-07,
"loss": 1.6436,
"step": 948
},
{
"epoch": 2.47,
"learning_rate": 7.744360902363002e-07,
"loss": 1.6002,
"step": 949
},
{
"epoch": 2.47,
"learning_rate": 7.669149412680604e-07,
"loss": 1.4857,
"step": 950
},
{
"epoch": 2.47,
"eval_loss": 1.545936107635498,
"eval_runtime": 221.5545,
"eval_samples_per_second": 11.672,
"eval_steps_per_second": 0.731,
"step": 950
},
{
"epoch": 2.48,
"learning_rate": 7.594274578775007e-07,
"loss": 1.6765,
"step": 951
},
{
"epoch": 2.48,
"learning_rate": 7.519736996123139e-07,
"loss": 1.5792,
"step": 952
},
{
"epoch": 2.48,
"learning_rate": 7.445537257519775e-07,
"loss": 1.677,
"step": 953
},
{
"epoch": 2.49,
"learning_rate": 7.371675953072871e-07,
"loss": 1.5159,
"step": 954
},
{
"epoch": 2.49,
"learning_rate": 7.298153670198799e-07,
"loss": 1.6549,
"step": 955
},
{
"epoch": 2.49,
"learning_rate": 7.224970993617685e-07,
"loss": 1.619,
"step": 956
},
{
"epoch": 2.49,
"learning_rate": 7.152128505348821e-07,
"loss": 1.6101,
"step": 957
},
{
"epoch": 2.5,
"learning_rate": 7.079626784705978e-07,
"loss": 1.5972,
"step": 958
},
{
"epoch": 2.5,
"learning_rate": 7.007466408292801e-07,
"loss": 1.5516,
"step": 959
},
{
"epoch": 2.5,
"learning_rate": 6.935647949998231e-07,
"loss": 1.5923,
"step": 960
},
{
"epoch": 2.5,
"eval_loss": 1.5458210706710815,
"eval_runtime": 221.6403,
"eval_samples_per_second": 11.668,
"eval_steps_per_second": 0.731,
"step": 960
},
{
"epoch": 2.5,
"learning_rate": 6.864171980991985e-07,
"loss": 1.5649,
"step": 961
},
{
"epoch": 2.51,
"learning_rate": 6.793039069719925e-07,
"loss": 1.6615,
"step": 962
},
{
"epoch": 2.51,
"learning_rate": 6.722249781899631e-07,
"loss": 1.5616,
"step": 963
},
{
"epoch": 2.51,
"learning_rate": 6.651804680515828e-07,
"loss": 1.5953,
"step": 964
},
{
"epoch": 2.51,
"learning_rate": 6.581704325815941e-07,
"loss": 1.631,
"step": 965
},
{
"epoch": 2.52,
"learning_rate": 6.511949275305657e-07,
"loss": 1.5349,
"step": 966
},
{
"epoch": 2.52,
"learning_rate": 6.442540083744453e-07,
"loss": 1.5914,
"step": 967
},
{
"epoch": 2.52,
"learning_rate": 6.37347730314119e-07,
"loss": 1.5806,
"step": 968
},
{
"epoch": 2.52,
"learning_rate": 6.304761482749777e-07,
"loss": 1.6937,
"step": 969
},
{
"epoch": 2.53,
"learning_rate": 6.23639316906472e-07,
"loss": 1.6167,
"step": 970
},
{
"epoch": 2.53,
"eval_loss": 1.5456454753875732,
"eval_runtime": 221.6009,
"eval_samples_per_second": 11.67,
"eval_steps_per_second": 0.731,
"step": 970
},
{
"epoch": 2.53,
"learning_rate": 6.168372905816822e-07,
"loss": 1.5585,
"step": 971
},
{
"epoch": 2.53,
"learning_rate": 6.100701233968876e-07,
"loss": 1.6041,
"step": 972
},
{
"epoch": 2.53,
"learning_rate": 6.033378691711333e-07,
"loss": 1.6218,
"step": 973
},
{
"epoch": 2.54,
"learning_rate": 5.966405814457999e-07,
"loss": 1.619,
"step": 974
},
{
"epoch": 2.54,
"learning_rate": 5.899783134841846e-07,
"loss": 1.48,
"step": 975
},
{
"epoch": 2.54,
"learning_rate": 5.833511182710716e-07,
"loss": 1.6509,
"step": 976
},
{
"epoch": 2.55,
"learning_rate": 5.76759048512312e-07,
"loss": 1.5868,
"step": 977
},
{
"epoch": 2.55,
"learning_rate": 5.702021566344079e-07,
"loss": 1.5942,
"step": 978
},
{
"epoch": 2.55,
"learning_rate": 5.636804947840907e-07,
"loss": 1.6148,
"step": 979
},
{
"epoch": 2.55,
"learning_rate": 5.571941148279081e-07,
"loss": 1.7214,
"step": 980
},
{
"epoch": 2.55,
"eval_loss": 1.5455540418624878,
"eval_runtime": 221.5314,
"eval_samples_per_second": 11.673,
"eval_steps_per_second": 0.731,
"step": 980
},
{
"epoch": 2.56,
"learning_rate": 5.507430683518161e-07,
"loss": 1.5196,
"step": 981
},
{
"epoch": 2.56,
"learning_rate": 5.443274066607607e-07,
"loss": 1.5658,
"step": 982
},
{
"epoch": 2.56,
"learning_rate": 5.379471807782743e-07,
"loss": 1.6647,
"step": 983
},
{
"epoch": 2.56,
"learning_rate": 5.316024414460729e-07,
"loss": 1.6552,
"step": 984
},
{
"epoch": 2.57,
"learning_rate": 5.252932391236443e-07,
"loss": 1.6959,
"step": 985
},
{
"epoch": 2.57,
"learning_rate": 5.19019623987857e-07,
"loss": 1.6183,
"step": 986
},
{
"epoch": 2.57,
"learning_rate": 5.127816459325508e-07,
"loss": 1.5426,
"step": 987
},
{
"epoch": 2.57,
"learning_rate": 5.065793545681491e-07,
"loss": 1.4803,
"step": 988
},
{
"epoch": 2.58,
"learning_rate": 5.00412799221257e-07,
"loss": 1.663,
"step": 989
},
{
"epoch": 2.58,
"learning_rate": 4.942820289342759e-07,
"loss": 1.5467,
"step": 990
},
{
"epoch": 2.58,
"eval_loss": 1.545507788658142,
"eval_runtime": 221.6193,
"eval_samples_per_second": 11.669,
"eval_steps_per_second": 0.731,
"step": 990
},
{
"epoch": 2.58,
"learning_rate": 4.881870924650062e-07,
"loss": 1.5791,
"step": 991
},
{
"epoch": 2.58,
"learning_rate": 4.821280382862647e-07,
"loss": 1.5733,
"step": 992
},
{
"epoch": 2.59,
"learning_rate": 4.76104914585499e-07,
"loss": 1.6028,
"step": 993
},
{
"epoch": 2.59,
"learning_rate": 4.70117769264401e-07,
"loss": 1.5632,
"step": 994
},
{
"epoch": 2.59,
"learning_rate": 4.641666499385278e-07,
"loss": 1.6126,
"step": 995
},
{
"epoch": 2.59,
"learning_rate": 4.582516039369245e-07,
"loss": 1.5865,
"step": 996
},
{
"epoch": 2.6,
"learning_rate": 4.523726783017457e-07,
"loss": 1.676,
"step": 997
},
{
"epoch": 2.6,
"learning_rate": 4.4652991978787975e-07,
"loss": 1.4612,
"step": 998
},
{
"epoch": 2.6,
"learning_rate": 4.407233748625839e-07,
"loss": 1.6977,
"step": 999
},
{
"epoch": 2.61,
"learning_rate": 4.3495308970510463e-07,
"loss": 1.6455,
"step": 1000
},
{
"epoch": 2.61,
"eval_loss": 1.5453382730484009,
"eval_runtime": 221.6265,
"eval_samples_per_second": 11.668,
"eval_steps_per_second": 0.731,
"step": 1000
},
{
"epoch": 2.61,
"learning_rate": 4.2921911020631926e-07,
"loss": 1.6091,
"step": 1001
},
{
"epoch": 2.61,
"learning_rate": 4.235214819683681e-07,
"loss": 1.644,
"step": 1002
},
{
"epoch": 2.61,
"learning_rate": 4.1786025030428776e-07,
"loss": 1.5587,
"step": 1003
},
{
"epoch": 2.62,
"learning_rate": 4.1223546023765604e-07,
"loss": 1.61,
"step": 1004
},
{
"epoch": 2.62,
"learning_rate": 4.0664715650223343e-07,
"loss": 1.583,
"step": 1005
},
{
"epoch": 2.62,
"learning_rate": 4.010953835416037e-07,
"loss": 1.5307,
"step": 1006
},
{
"epoch": 2.62,
"learning_rate": 3.9558018550882204e-07,
"loss": 1.5984,
"step": 1007
},
{
"epoch": 2.63,
"learning_rate": 3.901016062660673e-07,
"loss": 1.6404,
"step": 1008
},
{
"epoch": 2.63,
"learning_rate": 3.846596893842891e-07,
"loss": 1.5449,
"step": 1009
},
{
"epoch": 2.63,
"learning_rate": 3.792544781428609e-07,
"loss": 1.6137,
"step": 1010
},
{
"epoch": 2.63,
"eval_loss": 1.5452983379364014,
"eval_runtime": 221.4626,
"eval_samples_per_second": 11.677,
"eval_steps_per_second": 0.732,
"step": 1010
},
{
"epoch": 2.63,
"learning_rate": 3.7388601552924066e-07,
"loss": 1.5802,
"step": 1011
},
{
"epoch": 2.64,
"learning_rate": 3.6855434423862356e-07,
"loss": 1.5109,
"step": 1012
},
{
"epoch": 2.64,
"learning_rate": 3.6325950667360443e-07,
"loss": 1.5823,
"step": 1013
},
{
"epoch": 2.64,
"learning_rate": 3.5800154494384176e-07,
"loss": 1.5131,
"step": 1014
},
{
"epoch": 2.64,
"learning_rate": 3.5278050086572313e-07,
"loss": 1.6175,
"step": 1015
},
{
"epoch": 2.65,
"learning_rate": 3.4759641596202766e-07,
"loss": 1.6387,
"step": 1016
},
{
"epoch": 2.65,
"learning_rate": 3.4244933146160395e-07,
"loss": 1.6567,
"step": 1017
},
{
"epoch": 2.65,
"learning_rate": 3.3733928829903396e-07,
"loss": 1.5627,
"step": 1018
},
{
"epoch": 2.65,
"learning_rate": 3.322663271143112e-07,
"loss": 1.7035,
"step": 1019
},
{
"epoch": 2.66,
"learning_rate": 3.2723048825252177e-07,
"loss": 1.6104,
"step": 1020
},
{
"epoch": 2.66,
"eval_loss": 1.5452524423599243,
"eval_runtime": 221.6335,
"eval_samples_per_second": 11.668,
"eval_steps_per_second": 0.731,
"step": 1020
},
{
"epoch": 2.66,
"learning_rate": 3.222318117635143e-07,
"loss": 1.5946,
"step": 1021
},
{
"epoch": 2.66,
"learning_rate": 3.172703374015884e-07,
"loss": 1.6396,
"step": 1022
},
{
"epoch": 2.66,
"learning_rate": 3.12346104625178e-07,
"loss": 1.5768,
"step": 1023
},
{
"epoch": 2.67,
"learning_rate": 3.0745915259653314e-07,
"loss": 1.6198,
"step": 1024
},
{
"epoch": 2.67,
"learning_rate": 3.026095201814122e-07,
"loss": 1.4602,
"step": 1025
},
{
"epoch": 2.67,
"learning_rate": 2.9779724594877377e-07,
"loss": 1.6006,
"step": 1026
},
{
"epoch": 2.68,
"learning_rate": 2.9302236817046636e-07,
"loss": 1.6586,
"step": 1027
},
{
"epoch": 2.68,
"learning_rate": 2.8828492482092576e-07,
"loss": 1.6442,
"step": 1028
},
{
"epoch": 2.68,
"learning_rate": 2.8358495357687366e-07,
"loss": 1.4891,
"step": 1029
},
{
"epoch": 2.68,
"learning_rate": 2.78922491817018e-07,
"loss": 1.6756,
"step": 1030
},
{
"epoch": 2.68,
"eval_loss": 1.545109748840332,
"eval_runtime": 221.6902,
"eval_samples_per_second": 11.665,
"eval_steps_per_second": 0.731,
"step": 1030
},
{
"epoch": 2.69,
"learning_rate": 2.7429757662175316e-07,
"loss": 1.6241,
"step": 1031
},
{
"epoch": 2.69,
"learning_rate": 2.6971024477287e-07,
"loss": 1.5359,
"step": 1032
},
{
"epoch": 2.69,
"learning_rate": 2.651605327532569e-07,
"loss": 1.5156,
"step": 1033
},
{
"epoch": 2.69,
"learning_rate": 2.60648476746615e-07,
"loss": 1.5417,
"step": 1034
},
{
"epoch": 2.7,
"learning_rate": 2.561741126371692e-07,
"loss": 1.5033,
"step": 1035
},
{
"epoch": 2.7,
"learning_rate": 2.5173747600937994e-07,
"loss": 1.58,
"step": 1036
},
{
"epoch": 2.7,
"learning_rate": 2.4733860214766315e-07,
"loss": 1.5353,
"step": 1037
},
{
"epoch": 2.7,
"learning_rate": 2.429775260361106e-07,
"loss": 1.5453,
"step": 1038
},
{
"epoch": 2.71,
"learning_rate": 2.3865428235820775e-07,
"loss": 1.6029,
"step": 1039
},
{
"epoch": 2.71,
"learning_rate": 2.3436890549655922e-07,
"loss": 1.5818,
"step": 1040
},
{
"epoch": 2.71,
"eval_loss": 1.5450130701065063,
"eval_runtime": 221.5386,
"eval_samples_per_second": 11.673,
"eval_steps_per_second": 0.731,
"step": 1040
},
{
"epoch": 2.71,
"learning_rate": 2.301214295326193e-07,
"loss": 1.5887,
"step": 1041
},
{
"epoch": 2.71,
"learning_rate": 2.2591188824641508e-07,
"loss": 1.6142,
"step": 1042
},
{
"epoch": 2.72,
"learning_rate": 2.217403151162817e-07,
"loss": 1.5788,
"step": 1043
},
{
"epoch": 2.72,
"learning_rate": 2.176067433185952e-07,
"loss": 1.6359,
"step": 1044
},
{
"epoch": 2.72,
"learning_rate": 2.1351120572750737e-07,
"loss": 1.5598,
"step": 1045
},
{
"epoch": 2.72,
"learning_rate": 2.0945373491468468e-07,
"loss": 1.5478,
"step": 1046
},
{
"epoch": 2.73,
"learning_rate": 2.0543436314905242e-07,
"loss": 1.6472,
"step": 1047
},
{
"epoch": 2.73,
"learning_rate": 2.0145312239653325e-07,
"loss": 1.5053,
"step": 1048
},
{
"epoch": 2.73,
"learning_rate": 1.975100443197958e-07,
"loss": 1.6325,
"step": 1049
},
{
"epoch": 2.74,
"learning_rate": 1.9360516027800258e-07,
"loss": 1.5829,
"step": 1050
},
{
"epoch": 2.74,
"eval_loss": 1.5450440645217896,
"eval_runtime": 221.5885,
"eval_samples_per_second": 11.67,
"eval_steps_per_second": 0.731,
"step": 1050
},
{
"epoch": 2.74,
"learning_rate": 1.8973850132655957e-07,
"loss": 1.5113,
"step": 1051
},
{
"epoch": 2.74,
"learning_rate": 1.8591009821687044e-07,
"loss": 1.6341,
"step": 1052
},
{
"epoch": 2.74,
"learning_rate": 1.8211998139609222e-07,
"loss": 1.7143,
"step": 1053
},
{
"epoch": 2.75,
"learning_rate": 1.78368181006891e-07,
"loss": 1.5249,
"step": 1054
},
{
"epoch": 2.75,
"learning_rate": 1.7465472688720397e-07,
"loss": 1.6237,
"step": 1055
},
{
"epoch": 2.75,
"learning_rate": 1.7097964857000326e-07,
"loss": 1.6128,
"step": 1056
},
{
"epoch": 2.75,
"learning_rate": 1.6734297528305687e-07,
"loss": 1.5549,
"step": 1057
},
{
"epoch": 2.76,
"learning_rate": 1.6374473594870155e-07,
"loss": 1.5354,
"step": 1058
},
{
"epoch": 2.76,
"learning_rate": 1.6018495918360965e-07,
"loss": 1.6438,
"step": 1059
},
{
"epoch": 2.76,
"learning_rate": 1.5666367329856046e-07,
"loss": 1.5753,
"step": 1060
},
{
"epoch": 2.76,
"eval_loss": 1.5450438261032104,
"eval_runtime": 221.7053,
"eval_samples_per_second": 11.664,
"eval_steps_per_second": 0.731,
"step": 1060
},
{
"epoch": 2.76,
"learning_rate": 1.5318090629821757e-07,
"loss": 1.5775,
"step": 1061
},
{
"epoch": 2.77,
"learning_rate": 1.4973668588090572e-07,
"loss": 1.5399,
"step": 1062
},
{
"epoch": 2.77,
"learning_rate": 1.4633103943839045e-07,
"loss": 1.6173,
"step": 1063
},
{
"epoch": 2.77,
"learning_rate": 1.429639940556571e-07,
"loss": 1.6818,
"step": 1064
},
{
"epoch": 2.77,
"learning_rate": 1.396355765107016e-07,
"loss": 1.5952,
"step": 1065
},
{
"epoch": 2.78,
"learning_rate": 1.363458132743123e-07,
"loss": 1.6716,
"step": 1066
},
{
"epoch": 2.78,
"learning_rate": 1.3309473050986067e-07,
"loss": 1.7006,
"step": 1067
},
{
"epoch": 2.78,
"learning_rate": 1.298823540730948e-07,
"loss": 1.6409,
"step": 1068
},
{
"epoch": 2.78,
"learning_rate": 1.2670870951193293e-07,
"loss": 1.4846,
"step": 1069
},
{
"epoch": 2.79,
"learning_rate": 1.2357382206625802e-07,
"loss": 1.6484,
"step": 1070
},
{
"epoch": 2.79,
"eval_loss": 1.5450248718261719,
"eval_runtime": 221.5954,
"eval_samples_per_second": 11.67,
"eval_steps_per_second": 0.731,
"step": 1070
},
{
"epoch": 2.79,
"learning_rate": 1.2047771666772124e-07,
"loss": 1.5518,
"step": 1071
},
{
"epoch": 2.79,
"learning_rate": 1.1742041793954162e-07,
"loss": 1.5571,
"step": 1072
},
{
"epoch": 2.8,
"learning_rate": 1.1440195019630784e-07,
"loss": 1.6202,
"step": 1073
},
{
"epoch": 2.8,
"learning_rate": 1.114223374437895e-07,
"loss": 1.6521,
"step": 1074
},
{
"epoch": 2.8,
"learning_rate": 1.0848160337874225e-07,
"loss": 1.596,
"step": 1075
},
{
"epoch": 2.8,
"learning_rate": 1.0557977138872133e-07,
"loss": 1.5768,
"step": 1076
},
{
"epoch": 2.81,
"learning_rate": 1.0271686455189556e-07,
"loss": 1.5506,
"step": 1077
},
{
"epoch": 2.81,
"learning_rate": 9.989290563686305e-08,
"loss": 1.6101,
"step": 1078
},
{
"epoch": 2.81,
"learning_rate": 9.710791710247025e-08,
"loss": 1.5509,
"step": 1079
},
{
"epoch": 2.81,
"learning_rate": 9.436192109763376e-08,
"loss": 1.6765,
"step": 1080
},
{
"epoch": 2.81,
"eval_loss": 1.5449930429458618,
"eval_runtime": 221.5701,
"eval_samples_per_second": 11.671,
"eval_steps_per_second": 0.731,
"step": 1080
},
{
"epoch": 2.82,
"learning_rate": 9.165493946116432e-08,
"loss": 1.5969,
"step": 1081
},
{
"epoch": 2.82,
"learning_rate": 8.898699372159147e-08,
"loss": 1.4659,
"step": 1082
},
{
"epoch": 2.82,
"learning_rate": 8.635810509699583e-08,
"loss": 1.5853,
"step": 1083
},
{
"epoch": 2.82,
"learning_rate": 8.376829449483537e-08,
"loss": 1.5234,
"step": 1084
},
{
"epoch": 2.83,
"learning_rate": 8.121758251178391e-08,
"loss": 1.6246,
"step": 1085
},
{
"epoch": 2.83,
"learning_rate": 7.870598943356622e-08,
"loss": 1.5908,
"step": 1086
},
{
"epoch": 2.83,
"learning_rate": 7.62335352347926e-08,
"loss": 1.5624,
"step": 1087
},
{
"epoch": 2.83,
"learning_rate": 7.380023957880511e-08,
"loss": 1.6097,
"step": 1088
},
{
"epoch": 2.84,
"learning_rate": 7.140612181752049e-08,
"loss": 1.53,
"step": 1089
},
{
"epoch": 2.84,
"learning_rate": 6.905120099127249e-08,
"loss": 1.623,
"step": 1090
},
{
"epoch": 2.84,
"eval_loss": 1.5449092388153076,
"eval_runtime": 221.6089,
"eval_samples_per_second": 11.669,
"eval_steps_per_second": 0.731,
"step": 1090
},
{
"epoch": 2.84,
"learning_rate": 6.673549582866368e-08,
"loss": 1.5162,
"step": 1091
},
{
"epoch": 2.84,
"learning_rate": 6.44590247464183e-08,
"loss": 1.6031,
"step": 1092
},
{
"epoch": 2.85,
"learning_rate": 6.222180584923021e-08,
"loss": 1.6486,
"step": 1093
},
{
"epoch": 2.85,
"learning_rate": 6.002385692962242e-08,
"loss": 1.4915,
"step": 1094
},
{
"epoch": 2.85,
"learning_rate": 5.7865195467807775e-08,
"loss": 1.4412,
"step": 1095
},
{
"epoch": 2.86,
"learning_rate": 5.5745838631544036e-08,
"loss": 1.5536,
"step": 1096
},
{
"epoch": 2.86,
"learning_rate": 5.3665803276002906e-08,
"loss": 1.5892,
"step": 1097
},
{
"epoch": 2.86,
"learning_rate": 5.162510594363235e-08,
"loss": 1.6356,
"step": 1098
},
{
"epoch": 2.86,
"learning_rate": 4.9623762864027815e-08,
"loss": 1.6506,
"step": 1099
},
{
"epoch": 2.87,
"learning_rate": 4.7661789953799553e-08,
"loss": 1.6901,
"step": 1100
},
{
"epoch": 2.87,
"eval_loss": 1.5449464321136475,
"eval_runtime": 221.5857,
"eval_samples_per_second": 11.67,
"eval_steps_per_second": 0.731,
"step": 1100
},
{
"epoch": 2.87,
"learning_rate": 4.573920281645161e-08,
"loss": 1.6219,
"step": 1101
},
{
"epoch": 2.87,
"learning_rate": 4.385601674225082e-08,
"loss": 1.6609,
"step": 1102
},
{
"epoch": 2.87,
"learning_rate": 4.2012246708110774e-08,
"loss": 1.6263,
"step": 1103
},
{
"epoch": 2.88,
"learning_rate": 4.020790737746971e-08,
"loss": 1.5852,
"step": 1104
},
{
"epoch": 2.88,
"learning_rate": 3.844301310017673e-08,
"loss": 1.5669,
"step": 1105
},
{
"epoch": 2.88,
"learning_rate": 3.6717577912372406e-08,
"loss": 1.4975,
"step": 1106
},
{
"epoch": 2.88,
"learning_rate": 3.503161553638445e-08,
"loss": 1.5413,
"step": 1107
},
{
"epoch": 2.89,
"learning_rate": 3.338513938061172e-08,
"loss": 1.5464,
"step": 1108
},
{
"epoch": 2.89,
"learning_rate": 3.177816253942145e-08,
"loss": 1.5708,
"step": 1109
},
{
"epoch": 2.89,
"learning_rate": 3.021069779304498e-08,
"loss": 1.6601,
"step": 1110
},
{
"epoch": 2.89,
"eval_loss": 1.5448739528656006,
"eval_runtime": 221.6432,
"eval_samples_per_second": 11.667,
"eval_steps_per_second": 0.731,
"step": 1110
},
{
"epoch": 2.89,
"learning_rate": 2.868275760747441e-08,
"loss": 1.5739,
"step": 1111
},
{
"epoch": 2.9,
"learning_rate": 2.7194354134363886e-08,
"loss": 1.5899,
"step": 1112
},
{
"epoch": 2.9,
"learning_rate": 2.5745499210936274e-08,
"loss": 1.5463,
"step": 1113
},
{
"epoch": 2.9,
"learning_rate": 2.4336204359882153e-08,
"loss": 1.5981,
"step": 1114
},
{
"epoch": 2.9,
"learning_rate": 2.2966480789275438e-08,
"loss": 1.5515,
"step": 1115
},
{
"epoch": 2.91,
"learning_rate": 2.1636339392479553e-08,
"loss": 1.6135,
"step": 1116
},
{
"epoch": 2.91,
"learning_rate": 2.0345790748062532e-08,
"loss": 1.5344,
"step": 1117
},
{
"epoch": 2.91,
"learning_rate": 1.9094845119712603e-08,
"loss": 1.6426,
"step": 1118
},
{
"epoch": 2.92,
"learning_rate": 1.788351245615716e-08,
"loss": 1.5534,
"step": 1119
},
{
"epoch": 2.92,
"learning_rate": 1.6711802391081723e-08,
"loss": 1.6763,
"step": 1120
},
{
"epoch": 2.92,
"eval_loss": 1.5449295043945312,
"eval_runtime": 221.4937,
"eval_samples_per_second": 11.675,
"eval_steps_per_second": 0.731,
"step": 1120
},
{
"epoch": 2.92,
"learning_rate": 1.557972424305665e-08,
"loss": 1.6222,
"step": 1121
},
{
"epoch": 2.92,
"learning_rate": 1.4487287015458872e-08,
"loss": 1.5905,
"step": 1122
},
{
"epoch": 2.93,
"learning_rate": 1.3434499396404176e-08,
"loss": 1.6102,
"step": 1123
},
{
"epoch": 2.93,
"learning_rate": 1.2421369758675027e-08,
"loss": 1.6438,
"step": 1124
},
{
"epoch": 2.93,
"learning_rate": 1.1447906159656741e-08,
"loss": 1.5647,
"step": 1125
},
{
"epoch": 2.93,
"learning_rate": 1.0514116341271419e-08,
"loss": 1.6007,
"step": 1126
},
{
"epoch": 2.94,
"learning_rate": 9.620007729916337e-09,
"loss": 1.6131,
"step": 1127
},
{
"epoch": 2.94,
"learning_rate": 8.765587436406765e-09,
"loss": 1.6681,
"step": 1128
},
{
"epoch": 2.94,
"learning_rate": 7.95086225591657e-09,
"loss": 1.6304,
"step": 1129
},
{
"epoch": 2.94,
"learning_rate": 7.175838667927149e-09,
"loss": 1.6203,
"step": 1130
},
{
"epoch": 2.94,
"eval_loss": 1.5448760986328125,
"eval_runtime": 221.5511,
"eval_samples_per_second": 11.672,
"eval_steps_per_second": 0.731,
"step": 1130
},
{
"epoch": 2.95,
"learning_rate": 6.440522836174135e-09,
"loss": 1.5979,
"step": 1131
},
{
"epoch": 2.95,
"learning_rate": 5.744920608598547e-09,
"loss": 1.4959,
"step": 1132
},
{
"epoch": 2.95,
"learning_rate": 5.089037517300721e-09,
"loss": 1.553,
"step": 1133
},
{
"epoch": 2.95,
"learning_rate": 4.472878778495892e-09,
"loss": 1.5942,
"step": 1134
},
{
"epoch": 2.96,
"learning_rate": 3.896449292473125e-09,
"loss": 1.6051,
"step": 1135
},
{
"epoch": 2.96,
"learning_rate": 3.359753643555341e-09,
"loss": 1.6249,
"step": 1136
},
{
"epoch": 2.96,
"learning_rate": 2.862796100065457e-09,
"loss": 1.6316,
"step": 1137
},
{
"epoch": 2.96,
"learning_rate": 2.4055806142880835e-09,
"loss": 1.6181,
"step": 1138
},
{
"epoch": 2.97,
"learning_rate": 1.988110822443434e-09,
"loss": 1.5719,
"step": 1139
},
{
"epoch": 2.97,
"learning_rate": 1.6103900446534648e-09,
"loss": 1.5113,
"step": 1140
},
{
"epoch": 2.97,
"eval_loss": 1.5448263883590698,
"eval_runtime": 221.6166,
"eval_samples_per_second": 11.669,
"eval_steps_per_second": 0.731,
"step": 1140
},
{
"epoch": 2.97,
"learning_rate": 1.2724212849180019e-09,
"loss": 1.6375,
"step": 1141
},
{
"epoch": 2.97,
"learning_rate": 9.742072310908735e-10,
"loss": 1.6044,
"step": 1142
},
{
"epoch": 2.98,
"learning_rate": 7.157502548588158e-10,
"loss": 1.5811,
"step": 1143
},
{
"epoch": 2.98,
"learning_rate": 4.97052411720933e-10,
"loss": 1.587,
"step": 1144
},
{
"epoch": 2.98,
"learning_rate": 3.181154409725995e-10,
"loss": 1.702,
"step": 1145
},
{
"epoch": 2.99,
"learning_rate": 1.7894076569435759e-10,
"loss": 1.5643,
"step": 1146
},
{
"epoch": 2.99,
"learning_rate": 7.952949273748456e-11,
"loss": 1.4459,
"step": 1147
},
{
"epoch": 2.99,
"learning_rate": 1.9882412715110932e-11,
"loss": 1.4922,
"step": 1148
},
{
"epoch": 2.99,
"learning_rate": 0.0,
"loss": 1.5717,
"step": 1149
},
{
"epoch": 2.99,
"step": 1149,
"total_flos": 2.7036241629701734e+18,
"train_loss": 1.6580357963464072,
"train_runtime": 61956.043,
"train_samples_per_second": 2.379,
"train_steps_per_second": 0.019
}
],
"max_steps": 1149,
"num_train_epochs": 3,
"total_flos": 2.7036241629701734e+18,
"trial_name": null,
"trial_params": null
}