{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9968454258675079,
"eval_steps": 60,
"global_step": 237,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004206098843322818,
"grad_norm": 0.5299676656723022,
"learning_rate": 2e-05,
"loss": 1.77,
"step": 1
},
{
"epoch": 0.004206098843322818,
"eval_loss": 1.9898090362548828,
"eval_runtime": 65.4901,
"eval_samples_per_second": 41.167,
"eval_steps_per_second": 20.583,
"step": 1
},
{
"epoch": 0.008412197686645636,
"grad_norm": 0.5349143743515015,
"learning_rate": 4e-05,
"loss": 1.745,
"step": 2
},
{
"epoch": 0.012618296529968454,
"grad_norm": 0.5094612240791321,
"learning_rate": 6e-05,
"loss": 1.7007,
"step": 3
},
{
"epoch": 0.016824395373291272,
"grad_norm": 0.5268917083740234,
"learning_rate": 8e-05,
"loss": 1.6582,
"step": 4
},
{
"epoch": 0.02103049421661409,
"grad_norm": 0.5398459434509277,
"learning_rate": 0.0001,
"loss": 1.7391,
"step": 5
},
{
"epoch": 0.025236593059936908,
"grad_norm": 0.5613242983818054,
"learning_rate": 0.00012,
"loss": 1.6436,
"step": 6
},
{
"epoch": 0.029442691903259727,
"grad_norm": 0.45200833678245544,
"learning_rate": 0.00014,
"loss": 1.5859,
"step": 7
},
{
"epoch": 0.033648790746582544,
"grad_norm": 0.3515471816062927,
"learning_rate": 0.00016,
"loss": 1.4385,
"step": 8
},
{
"epoch": 0.03785488958990536,
"grad_norm": 0.2859397530555725,
"learning_rate": 0.00018,
"loss": 1.427,
"step": 9
},
{
"epoch": 0.04206098843322818,
"grad_norm": 0.3456244170665741,
"learning_rate": 0.0002,
"loss": 1.408,
"step": 10
},
{
"epoch": 0.046267087276550996,
"grad_norm": 0.42806366086006165,
"learning_rate": 0.0001999904234053922,
"loss": 1.4541,
"step": 11
},
{
"epoch": 0.050473186119873815,
"grad_norm": 0.5130056142807007,
"learning_rate": 0.00019996169545579207,
"loss": 1.3664,
"step": 12
},
{
"epoch": 0.054679284963196635,
"grad_norm": 0.39732199907302856,
"learning_rate": 0.00019991382165351814,
"loss": 1.3276,
"step": 13
},
{
"epoch": 0.058885383806519455,
"grad_norm": 0.3794059157371521,
"learning_rate": 0.00019984681116793038,
"loss": 1.3153,
"step": 14
},
{
"epoch": 0.06309148264984227,
"grad_norm": 0.27593305706977844,
"learning_rate": 0.00019976067683367385,
"loss": 1.2554,
"step": 15
},
{
"epoch": 0.06729758149316509,
"grad_norm": 0.28591713309288025,
"learning_rate": 0.00019965543514822062,
"loss": 1.283,
"step": 16
},
{
"epoch": 0.07150368033648791,
"grad_norm": 0.26724520325660706,
"learning_rate": 0.00019953110626870979,
"loss": 1.1646,
"step": 17
},
{
"epoch": 0.07570977917981073,
"grad_norm": 0.24611811339855194,
"learning_rate": 0.0001993877140080869,
"loss": 1.1762,
"step": 18
},
{
"epoch": 0.07991587802313355,
"grad_norm": 0.2281356304883957,
"learning_rate": 0.000199225285830543,
"loss": 1.1467,
"step": 19
},
{
"epoch": 0.08412197686645637,
"grad_norm": 0.22052225470542908,
"learning_rate": 0.00019904385284625424,
"loss": 1.1377,
"step": 20
},
{
"epoch": 0.08832807570977919,
"grad_norm": 0.23453611135482788,
"learning_rate": 0.00019884344980542338,
"loss": 1.1162,
"step": 21
},
{
"epoch": 0.09253417455310199,
"grad_norm": 0.22467325627803802,
"learning_rate": 0.00019862411509162406,
"loss": 1.155,
"step": 22
},
{
"epoch": 0.09674027339642481,
"grad_norm": 0.2170630544424057,
"learning_rate": 0.00019838589071444903,
"loss": 1.1279,
"step": 23
},
{
"epoch": 0.10094637223974763,
"grad_norm": 0.21346993744373322,
"learning_rate": 0.00019812882230146398,
"loss": 1.0946,
"step": 24
},
{
"epoch": 0.10515247108307045,
"grad_norm": 0.21408380568027496,
"learning_rate": 0.00019785295908946848,
"loss": 1.0889,
"step": 25
},
{
"epoch": 0.10935856992639327,
"grad_norm": 0.22000430524349213,
"learning_rate": 0.0001975583539150655,
"loss": 1.0476,
"step": 26
},
{
"epoch": 0.11356466876971609,
"grad_norm": 0.20778758823871613,
"learning_rate": 0.00019724506320454153,
"loss": 1.0954,
"step": 27
},
{
"epoch": 0.11777076761303891,
"grad_norm": 0.22037693858146667,
"learning_rate": 0.00019691314696305913,
"loss": 1.055,
"step": 28
},
{
"epoch": 0.12197686645636173,
"grad_norm": 0.20428280532360077,
"learning_rate": 0.0001965626687631641,
"loss": 1.0159,
"step": 29
},
{
"epoch": 0.12618296529968454,
"grad_norm": 0.20502522587776184,
"learning_rate": 0.00019619369573260924,
"loss": 1.0254,
"step": 30
},
{
"epoch": 0.13038906414300735,
"grad_norm": 0.2062043696641922,
"learning_rate": 0.0001958062985414972,
"loss": 0.9779,
"step": 31
},
{
"epoch": 0.13459516298633017,
"grad_norm": 0.22229152917861938,
"learning_rate": 0.00019540055138874505,
"loss": 1.0201,
"step": 32
},
{
"epoch": 0.138801261829653,
"grad_norm": 0.21910454332828522,
"learning_rate": 0.00019497653198787264,
"loss": 0.9958,
"step": 33
},
{
"epoch": 0.14300736067297581,
"grad_norm": 0.22630847990512848,
"learning_rate": 0.0001945343215521182,
"loss": 0.9892,
"step": 34
},
{
"epoch": 0.14721345951629863,
"grad_norm": 0.21370179951190948,
"learning_rate": 0.00019407400477888315,
"loss": 0.9409,
"step": 35
},
{
"epoch": 0.15141955835962145,
"grad_norm": 0.22368259727954865,
"learning_rate": 0.00019359566983351013,
"loss": 0.9626,
"step": 36
},
{
"epoch": 0.15562565720294427,
"grad_norm": 0.24231955409049988,
"learning_rate": 0.00019309940833239626,
"loss": 0.9914,
"step": 37
},
{
"epoch": 0.1598317560462671,
"grad_norm": 0.24762062728405,
"learning_rate": 0.00019258531532544585,
"loss": 0.9311,
"step": 38
},
{
"epoch": 0.1640378548895899,
"grad_norm": 0.21248659491539001,
"learning_rate": 0.00019205348927786532,
"loss": 0.9399,
"step": 39
},
{
"epoch": 0.16824395373291273,
"grad_norm": 0.2374017834663391,
"learning_rate": 0.00019150403205130383,
"loss": 0.9664,
"step": 40
},
{
"epoch": 0.17245005257623555,
"grad_norm": 0.25241079926490784,
"learning_rate": 0.0001909370488843436,
"loss": 0.9475,
"step": 41
},
{
"epoch": 0.17665615141955837,
"grad_norm": 0.24083252251148224,
"learning_rate": 0.00019035264837234347,
"loss": 0.9602,
"step": 42
},
{
"epoch": 0.1808622502628812,
"grad_norm": 0.24024806916713715,
"learning_rate": 0.0001897509424466393,
"loss": 0.9167,
"step": 43
},
{
"epoch": 0.18506834910620398,
"grad_norm": 0.2538228929042816,
"learning_rate": 0.0001891320463531055,
"loss": 0.904,
"step": 44
},
{
"epoch": 0.1892744479495268,
"grad_norm": 0.2393723875284195,
"learning_rate": 0.00018849607863008193,
"loss": 0.8927,
"step": 45
},
{
"epoch": 0.19348054679284962,
"grad_norm": 0.2394389659166336,
"learning_rate": 0.00018784316108566996,
"loss": 0.8675,
"step": 46
},
{
"epoch": 0.19768664563617244,
"grad_norm": 0.24351197481155396,
"learning_rate": 0.00018717341877440226,
"loss": 0.873,
"step": 47
},
{
"epoch": 0.20189274447949526,
"grad_norm": 0.2396727055311203,
"learning_rate": 0.000186486979973291,
"loss": 0.8972,
"step": 48
},
{
"epoch": 0.20609884332281808,
"grad_norm": 0.2674885392189026,
"learning_rate": 0.0001857839761572586,
"loss": 0.8613,
"step": 49
},
{
"epoch": 0.2103049421661409,
"grad_norm": 0.25012922286987305,
"learning_rate": 0.00018506454197395606,
"loss": 0.8481,
"step": 50
},
{
"epoch": 0.21451104100946372,
"grad_norm": 0.23941218852996826,
"learning_rate": 0.0001843288152179739,
"loss": 0.8638,
"step": 51
},
{
"epoch": 0.21871713985278654,
"grad_norm": 0.25679612159729004,
"learning_rate": 0.00018357693680444976,
"loss": 0.8928,
"step": 52
},
{
"epoch": 0.22292323869610936,
"grad_norm": 0.25766387581825256,
"learning_rate": 0.00018280905074207884,
"loss": 0.8997,
"step": 53
},
{
"epoch": 0.22712933753943218,
"grad_norm": 0.24009671807289124,
"learning_rate": 0.00018202530410553163,
"loss": 0.8536,
"step": 54
},
{
"epoch": 0.231335436382755,
"grad_norm": 0.24763701856136322,
"learning_rate": 0.00018122584700728443,
"loss": 0.8581,
"step": 55
},
{
"epoch": 0.23554153522607782,
"grad_norm": 0.2651236653327942,
"learning_rate": 0.0001804108325688679,
"loss": 0.8164,
"step": 56
},
{
"epoch": 0.23974763406940064,
"grad_norm": 0.23978441953659058,
"learning_rate": 0.0001795804168915396,
"loss": 0.8321,
"step": 57
},
{
"epoch": 0.24395373291272346,
"grad_norm": 0.2508217394351959,
"learning_rate": 0.00017873475902638553,
"loss": 0.815,
"step": 58
},
{
"epoch": 0.24815983175604628,
"grad_norm": 0.2765346169471741,
"learning_rate": 0.00017787402094385666,
"loss": 0.8674,
"step": 59
},
{
"epoch": 0.25236593059936907,
"grad_norm": 0.27468088269233704,
"learning_rate": 0.00017699836750274662,
"loss": 0.8841,
"step": 60
},
{
"epoch": 0.25236593059936907,
"eval_loss": 1.075273036956787,
"eval_runtime": 66.5472,
"eval_samples_per_second": 40.513,
"eval_steps_per_second": 20.256,
"step": 60
},
{
"epoch": 0.2565720294426919,
"grad_norm": 0.27056071162223816,
"learning_rate": 0.00017610796641861581,
"loss": 0.8459,
"step": 61
},
{
"epoch": 0.2607781282860147,
"grad_norm": 0.2631956934928894,
"learning_rate": 0.00017520298823166873,
"loss": 0.8853,
"step": 62
},
{
"epoch": 0.26498422712933756,
"grad_norm": 0.28352680802345276,
"learning_rate": 0.00017428360627408978,
"loss": 0.8625,
"step": 63
},
{
"epoch": 0.26919032597266035,
"grad_norm": 0.24897028505802155,
"learning_rate": 0.00017334999663684504,
"loss": 0.8627,
"step": 64
},
{
"epoch": 0.2733964248159832,
"grad_norm": 0.2620624303817749,
"learning_rate": 0.00017240233813595478,
"loss": 0.8088,
"step": 65
},
{
"epoch": 0.277602523659306,
"grad_norm": 0.24983716011047363,
"learning_rate": 0.0001714408122782448,
"loss": 0.8318,
"step": 66
},
{
"epoch": 0.28180862250262884,
"grad_norm": 0.2667708694934845,
"learning_rate": 0.000170465603226582,
"loss": 0.8368,
"step": 67
},
{
"epoch": 0.28601472134595163,
"grad_norm": 0.2828388214111328,
"learning_rate": 0.0001694768977646013,
"loss": 0.8282,
"step": 68
},
{
"epoch": 0.2902208201892745,
"grad_norm": 0.2581498324871063,
"learning_rate": 0.0001684748852609306,
"loss": 0.8375,
"step": 69
},
{
"epoch": 0.29442691903259727,
"grad_norm": 0.27101799845695496,
"learning_rate": 0.0001674597576329207,
"loss": 0.818,
"step": 70
},
{
"epoch": 0.29863301787592006,
"grad_norm": 0.27231255173683167,
"learning_rate": 0.00016643170930988698,
"loss": 0.843,
"step": 71
},
{
"epoch": 0.3028391167192429,
"grad_norm": 0.2566690444946289,
"learning_rate": 0.00016539093719586994,
"loss": 0.8348,
"step": 72
},
{
"epoch": 0.3070452155625657,
"grad_norm": 0.2482360601425171,
"learning_rate": 0.00016433764063192194,
"loss": 0.8122,
"step": 73
},
{
"epoch": 0.31125131440588855,
"grad_norm": 0.25742995738983154,
"learning_rate": 0.00016327202135792685,
"loss": 0.776,
"step": 74
},
{
"epoch": 0.31545741324921134,
"grad_norm": 0.25104233622550964,
"learning_rate": 0.00016219428347396053,
"loss": 0.7823,
"step": 75
},
{
"epoch": 0.3196635120925342,
"grad_norm": 0.2921640872955322,
"learning_rate": 0.00016110463340119913,
"loss": 0.8127,
"step": 76
},
{
"epoch": 0.323869610935857,
"grad_norm": 0.26554426550865173,
"learning_rate": 0.00016000327984238292,
"loss": 0.7716,
"step": 77
},
{
"epoch": 0.3280757097791798,
"grad_norm": 0.24784542620182037,
"learning_rate": 0.00015889043374184286,
"loss": 0.7714,
"step": 78
},
{
"epoch": 0.3322818086225026,
"grad_norm": 0.26592087745666504,
"learning_rate": 0.0001577663082450984,
"loss": 0.7397,
"step": 79
},
{
"epoch": 0.33648790746582546,
"grad_norm": 0.3072431683540344,
"learning_rate": 0.00015663111865803285,
"loss": 0.7579,
"step": 80
},
{
"epoch": 0.34069400630914826,
"grad_norm": 0.29445305466651917,
"learning_rate": 0.00015548508240565583,
"loss": 0.7998,
"step": 81
},
{
"epoch": 0.3449001051524711,
"grad_norm": 0.26053521037101746,
"learning_rate": 0.0001543284189904592,
"loss": 0.7832,
"step": 82
},
{
"epoch": 0.3491062039957939,
"grad_norm": 0.2956802248954773,
"learning_rate": 0.00015316134995037545,
"loss": 0.8054,
"step": 83
},
{
"epoch": 0.35331230283911674,
"grad_norm": 0.2673921287059784,
"learning_rate": 0.00015198409881634617,
"loss": 0.8061,
"step": 84
},
{
"epoch": 0.35751840168243953,
"grad_norm": 0.2793889045715332,
"learning_rate": 0.00015079689106950854,
"loss": 0.7902,
"step": 85
},
{
"epoch": 0.3617245005257624,
"grad_norm": 0.26718223094940186,
"learning_rate": 0.00014959995409800873,
"loss": 0.7769,
"step": 86
},
{
"epoch": 0.3659305993690852,
"grad_norm": 0.300536572933197,
"learning_rate": 0.00014839351715344968,
"loss": 0.8245,
"step": 87
},
{
"epoch": 0.37013669821240797,
"grad_norm": 0.2824515998363495,
"learning_rate": 0.00014717781130698212,
"loss": 0.8122,
"step": 88
},
{
"epoch": 0.3743427970557308,
"grad_norm": 0.28050506114959717,
"learning_rate": 0.00014595306940504716,
"loss": 0.778,
"step": 89
},
{
"epoch": 0.3785488958990536,
"grad_norm": 0.2906787395477295,
"learning_rate": 0.00014471952602477866,
"loss": 0.7703,
"step": 90
},
{
"epoch": 0.38275499474237645,
"grad_norm": 0.298177033662796,
"learning_rate": 0.00014347741742907433,
"loss": 0.7672,
"step": 91
},
{
"epoch": 0.38696109358569925,
"grad_norm": 0.27583765983581543,
"learning_rate": 0.00014222698152134374,
"loss": 0.7784,
"step": 92
},
{
"epoch": 0.3911671924290221,
"grad_norm": 0.28834670782089233,
"learning_rate": 0.0001409684577999423,
"loss": 0.8278,
"step": 93
},
{
"epoch": 0.3953732912723449,
"grad_norm": 0.29721811413764954,
"learning_rate": 0.00013970208731229974,
"loss": 0.7997,
"step": 94
},
{
"epoch": 0.39957939011566773,
"grad_norm": 0.2688146233558655,
"learning_rate": 0.00013842811260875168,
"loss": 0.7465,
"step": 95
},
{
"epoch": 0.4037854889589905,
"grad_norm": 0.27095234394073486,
"learning_rate": 0.0001371467776960837,
"loss": 0.757,
"step": 96
},
{
"epoch": 0.40799158780231337,
"grad_norm": 0.30743858218193054,
"learning_rate": 0.0001358583279907961,
"loss": 0.7882,
"step": 97
},
{
"epoch": 0.41219768664563616,
"grad_norm": 0.274873822927475,
"learning_rate": 0.00013456301027209882,
"loss": 0.7737,
"step": 98
},
{
"epoch": 0.416403785488959,
"grad_norm": 0.25485867261886597,
"learning_rate": 0.00013326107263464558,
"loss": 0.7454,
"step": 99
},
{
"epoch": 0.4206098843322818,
"grad_norm": 0.2994694709777832,
"learning_rate": 0.00013195276444101547,
"loss": 0.8133,
"step": 100
},
{
"epoch": 0.42481598317560465,
"grad_norm": 0.2943129241466522,
"learning_rate": 0.0001306383362739523,
"loss": 0.7501,
"step": 101
},
{
"epoch": 0.42902208201892744,
"grad_norm": 0.2888595163822174,
"learning_rate": 0.0001293180398883701,
"loss": 0.7522,
"step": 102
},
{
"epoch": 0.4332281808622503,
"grad_norm": 0.28455743193626404,
"learning_rate": 0.00012799212816313376,
"loss": 0.7278,
"step": 103
},
{
"epoch": 0.4374342797055731,
"grad_norm": 0.32477039098739624,
"learning_rate": 0.00012666085505262485,
"loss": 0.819,
"step": 104
},
{
"epoch": 0.4416403785488959,
"grad_norm": 0.28067031502723694,
"learning_rate": 0.00012532447553810126,
"loss": 0.7979,
"step": 105
},
{
"epoch": 0.4458464773922187,
"grad_norm": 0.26430413126945496,
"learning_rate": 0.00012398324557885994,
"loss": 0.7497,
"step": 106
},
{
"epoch": 0.4500525762355415,
"grad_norm": 0.27110588550567627,
"learning_rate": 0.00012263742206321287,
"loss": 0.7937,
"step": 107
},
{
"epoch": 0.45425867507886436,
"grad_norm": 0.287041574716568,
"learning_rate": 0.0001212872627592845,
"loss": 0.7897,
"step": 108
},
{
"epoch": 0.45846477392218715,
"grad_norm": 0.28561776876449585,
"learning_rate": 0.00011993302626564102,
"loss": 0.8011,
"step": 109
},
{
"epoch": 0.46267087276551,
"grad_norm": 0.2852155566215515,
"learning_rate": 0.00011857497196176049,
"loss": 0.7426,
"step": 110
},
{
"epoch": 0.4668769716088328,
"grad_norm": 0.2712121903896332,
"learning_rate": 0.00011721335995835336,
"loss": 0.7277,
"step": 111
},
{
"epoch": 0.47108307045215564,
"grad_norm": 0.2779647409915924,
"learning_rate": 0.00011584845104754304,
"loss": 0.7698,
"step": 112
},
{
"epoch": 0.47528916929547843,
"grad_norm": 0.2774654030799866,
"learning_rate": 0.00011448050665291587,
"loss": 0.7583,
"step": 113
},
{
"epoch": 0.4794952681388013,
"grad_norm": 0.3046507239341736,
"learning_rate": 0.00011310978877945007,
"loss": 0.7987,
"step": 114
},
{
"epoch": 0.48370136698212407,
"grad_norm": 0.2816363573074341,
"learning_rate": 0.00011173655996333357,
"loss": 0.7898,
"step": 115
},
{
"epoch": 0.4879074658254469,
"grad_norm": 0.27383196353912354,
"learning_rate": 0.00011036108322167988,
"loss": 0.7248,
"step": 116
},
{
"epoch": 0.4921135646687697,
"grad_norm": 0.28104445338249207,
"learning_rate": 0.00010898362200215197,
"loss": 0.7144,
"step": 117
},
{
"epoch": 0.49631966351209256,
"grad_norm": 0.28643152117729187,
"learning_rate": 0.0001076044401325036,
"loss": 0.7856,
"step": 118
},
{
"epoch": 0.5005257623554153,
"grad_norm": 0.261483371257782,
"learning_rate": 0.0001062238017700478,
"loss": 0.7429,
"step": 119
},
{
"epoch": 0.5047318611987381,
"grad_norm": 0.2796306908130646,
"learning_rate": 0.00010484197135106263,
"loss": 0.7772,
"step": 120
},
{
"epoch": 0.5047318611987381,
"eval_loss": 0.9960550665855408,
"eval_runtime": 65.8413,
"eval_samples_per_second": 40.947,
"eval_steps_per_second": 20.473,
"step": 120
},
{
"epoch": 0.508937960042061,
"grad_norm": 0.3079998791217804,
"learning_rate": 0.00010345921354014279,
"loss": 0.7497,
"step": 121
},
{
"epoch": 0.5131440588853838,
"grad_norm": 0.3106074929237366,
"learning_rate": 0.00010207579317950827,
"loss": 0.7568,
"step": 122
},
{
"epoch": 0.5173501577287066,
"grad_norm": 0.27859166264533997,
"learning_rate": 0.00010069197523827833,
"loss": 0.7695,
"step": 123
},
{
"epoch": 0.5215562565720294,
"grad_norm": 0.2840277850627899,
"learning_rate": 9.930802476172169e-05,
"loss": 0.7815,
"step": 124
},
{
"epoch": 0.5257623554153522,
"grad_norm": 0.28042981028556824,
"learning_rate": 9.792420682049174e-05,
"loss": 0.7546,
"step": 125
},
{
"epoch": 0.5299684542586751,
"grad_norm": 0.2857164144515991,
"learning_rate": 9.654078645985722e-05,
"loss": 0.7617,
"step": 126
},
{
"epoch": 0.5341745531019979,
"grad_norm": 0.29590827226638794,
"learning_rate": 9.515802864893739e-05,
"loss": 0.748,
"step": 127
},
{
"epoch": 0.5383806519453207,
"grad_norm": 0.29375162720680237,
"learning_rate": 9.377619822995219e-05,
"loss": 0.7532,
"step": 128
},
{
"epoch": 0.5425867507886435,
"grad_norm": 0.28436464071273804,
"learning_rate": 9.239555986749645e-05,
"loss": 0.7511,
"step": 129
},
{
"epoch": 0.5467928496319664,
"grad_norm": 0.29677248001098633,
"learning_rate": 9.101637799784804e-05,
"loss": 0.7456,
"step": 130
},
{
"epoch": 0.5509989484752892,
"grad_norm": 0.27983585000038147,
"learning_rate": 8.963891677832011e-05,
"loss": 0.6888,
"step": 131
},
{
"epoch": 0.555205047318612,
"grad_norm": 0.27517008781433105,
"learning_rate": 8.826344003666647e-05,
"loss": 0.7431,
"step": 132
},
{
"epoch": 0.5594111461619348,
"grad_norm": 0.28197160363197327,
"learning_rate": 8.689021122054996e-05,
"loss": 0.7379,
"step": 133
},
{
"epoch": 0.5636172450052577,
"grad_norm": 0.29125264286994934,
"learning_rate": 8.551949334708415e-05,
"loss": 0.7639,
"step": 134
},
{
"epoch": 0.5678233438485805,
"grad_norm": 0.2851899266242981,
"learning_rate": 8.415154895245697e-05,
"loss": 0.7764,
"step": 135
},
{
"epoch": 0.5720294426919033,
"grad_norm": 0.2771802544593811,
"learning_rate": 8.278664004164665e-05,
"loss": 0.6961,
"step": 136
},
{
"epoch": 0.576235541535226,
"grad_norm": 0.27956414222717285,
"learning_rate": 8.142502803823955e-05,
"loss": 0.7454,
"step": 137
},
{
"epoch": 0.580441640378549,
"grad_norm": 0.30068668723106384,
"learning_rate": 8.0066973734359e-05,
"loss": 0.7683,
"step": 138
},
{
"epoch": 0.5846477392218717,
"grad_norm": 0.2820778489112854,
"learning_rate": 7.871273724071553e-05,
"loss": 0.7412,
"step": 139
},
{
"epoch": 0.5888538380651945,
"grad_norm": 0.2672085165977478,
"learning_rate": 7.736257793678714e-05,
"loss": 0.716,
"step": 140
},
{
"epoch": 0.5930599369085173,
"grad_norm": 0.27900293469429016,
"learning_rate": 7.601675442114009e-05,
"loss": 0.7259,
"step": 141
},
{
"epoch": 0.5972660357518401,
"grad_norm": 0.2954063415527344,
"learning_rate": 7.46755244618988e-05,
"loss": 0.7047,
"step": 142
},
{
"epoch": 0.601472134595163,
"grad_norm": 0.3212134838104248,
"learning_rate": 7.333914494737514e-05,
"loss": 0.7657,
"step": 143
},
{
"epoch": 0.6056782334384858,
"grad_norm": 0.30651283264160156,
"learning_rate": 7.200787183686625e-05,
"loss": 0.7489,
"step": 144
},
{
"epoch": 0.6098843322818086,
"grad_norm": 0.26834797859191895,
"learning_rate": 7.068196011162994e-05,
"loss": 0.7484,
"step": 145
},
{
"epoch": 0.6140904311251314,
"grad_norm": 0.2777973711490631,
"learning_rate": 6.936166372604773e-05,
"loss": 0.7245,
"step": 146
},
{
"epoch": 0.6182965299684543,
"grad_norm": 0.293694406747818,
"learning_rate": 6.804723555898458e-05,
"loss": 0.7211,
"step": 147
},
{
"epoch": 0.6225026288117771,
"grad_norm": 0.28515610098838806,
"learning_rate": 6.673892736535448e-05,
"loss": 0.7439,
"step": 148
},
{
"epoch": 0.6267087276550999,
"grad_norm": 0.2929891049861908,
"learning_rate": 6.543698972790117e-05,
"loss": 0.7434,
"step": 149
},
{
"epoch": 0.6309148264984227,
"grad_norm": 0.29031944274902344,
"learning_rate": 6.414167200920391e-05,
"loss": 0.7176,
"step": 150
},
{
"epoch": 0.6351209253417456,
"grad_norm": 0.2764637768268585,
"learning_rate": 6.28532223039163e-05,
"loss": 0.7503,
"step": 151
},
{
"epoch": 0.6393270241850684,
"grad_norm": 0.2900468707084656,
"learning_rate": 6.157188739124834e-05,
"loss": 0.6879,
"step": 152
},
{
"epoch": 0.6435331230283912,
"grad_norm": 0.2989012897014618,
"learning_rate": 6.029791268770029e-05,
"loss": 0.7135,
"step": 153
},
{
"epoch": 0.647739221871714,
"grad_norm": 0.2998535931110382,
"learning_rate": 5.903154220005771e-05,
"loss": 0.7171,
"step": 154
},
{
"epoch": 0.6519453207150369,
"grad_norm": 0.27283868193626404,
"learning_rate": 5.777301847865629e-05,
"loss": 0.7112,
"step": 155
},
{
"epoch": 0.6561514195583596,
"grad_norm": 0.2988041341304779,
"learning_rate": 5.652258257092569e-05,
"loss": 0.7444,
"step": 156
},
{
"epoch": 0.6603575184016824,
"grad_norm": 0.2845938205718994,
"learning_rate": 5.528047397522133e-05,
"loss": 0.716,
"step": 157
},
{
"epoch": 0.6645636172450052,
"grad_norm": 0.29695218801498413,
"learning_rate": 5.404693059495285e-05,
"loss": 0.7585,
"step": 158
},
{
"epoch": 0.668769716088328,
"grad_norm": 0.28558245301246643,
"learning_rate": 5.282218869301788e-05,
"loss": 0.6908,
"step": 159
},
{
"epoch": 0.6729758149316509,
"grad_norm": 0.280200719833374,
"learning_rate": 5.160648284655032e-05,
"loss": 0.7508,
"step": 160
},
{
"epoch": 0.6771819137749737,
"grad_norm": 0.2981257438659668,
"learning_rate": 5.040004590199128e-05,
"loss": 0.7147,
"step": 161
},
{
"epoch": 0.6813880126182965,
"grad_norm": 0.2873106598854065,
"learning_rate": 4.920310893049146e-05,
"loss": 0.7011,
"step": 162
},
{
"epoch": 0.6855941114616193,
"grad_norm": 0.2717635929584503,
"learning_rate": 4.801590118365383e-05,
"loss": 0.6668,
"step": 163
},
{
"epoch": 0.6898002103049422,
"grad_norm": 0.27607038617134094,
"learning_rate": 4.683865004962452e-05,
"loss": 0.7033,
"step": 164
},
{
"epoch": 0.694006309148265,
"grad_norm": 0.2881218194961548,
"learning_rate": 4.567158100954083e-05,
"loss": 0.7275,
"step": 165
},
{
"epoch": 0.6982124079915878,
"grad_norm": 0.2758018672466278,
"learning_rate": 4.4514917594344184e-05,
"loss": 0.737,
"step": 166
},
{
"epoch": 0.7024185068349106,
"grad_norm": 0.29527172446250916,
"learning_rate": 4.3368881341967135e-05,
"loss": 0.7433,
"step": 167
},
{
"epoch": 0.7066246056782335,
"grad_norm": 0.2847643792629242,
"learning_rate": 4.223369175490162e-05,
"loss": 0.7471,
"step": 168
},
{
"epoch": 0.7108307045215563,
"grad_norm": 0.2958676815032959,
"learning_rate": 4.110956625815713e-05,
"loss": 0.6838,
"step": 169
},
{
"epoch": 0.7150368033648791,
"grad_norm": 0.28350576758384705,
"learning_rate": 3.9996720157617094e-05,
"loss": 0.7306,
"step": 170
},
{
"epoch": 0.7192429022082019,
"grad_norm": 0.2808986008167267,
"learning_rate": 3.8895366598800896e-05,
"loss": 0.6823,
"step": 171
},
{
"epoch": 0.7234490010515248,
"grad_norm": 0.2684039771556854,
"learning_rate": 3.780571652603949e-05,
"loss": 0.7105,
"step": 172
},
{
"epoch": 0.7276550998948476,
"grad_norm": 0.28138425946235657,
"learning_rate": 3.672797864207316e-05,
"loss": 0.7221,
"step": 173
},
{
"epoch": 0.7318611987381703,
"grad_norm": 0.2772335708141327,
"learning_rate": 3.566235936807808e-05,
"loss": 0.6835,
"step": 174
},
{
"epoch": 0.7360672975814931,
"grad_norm": 0.27244430780410767,
"learning_rate": 3.460906280413007e-05,
"loss": 0.6577,
"step": 175
},
{
"epoch": 0.7402733964248159,
"grad_norm": 0.2977088689804077,
"learning_rate": 3.3568290690113034e-05,
"loss": 0.7213,
"step": 176
},
{
"epoch": 0.7444794952681388,
"grad_norm": 0.289736270904541,
"learning_rate": 3.25402423670793e-05,
"loss": 0.7154,
"step": 177
},
{
"epoch": 0.7486855941114616,
"grad_norm": 0.287818044424057,
"learning_rate": 3.1525114739069415e-05,
"loss": 0.6977,
"step": 178
},
{
"epoch": 0.7528916929547844,
"grad_norm": 0.31408464908599854,
"learning_rate": 3.0523102235398714e-05,
"loss": 0.781,
"step": 179
},
{
"epoch": 0.7570977917981072,
"grad_norm": 0.27790582180023193,
"learning_rate": 2.9534396773417994e-05,
"loss": 0.7169,
"step": 180
},
{
"epoch": 0.7570977917981072,
"eval_loss": 0.9679059386253357,
"eval_runtime": 66.127,
"eval_samples_per_second": 40.77,
"eval_steps_per_second": 20.385,
"step": 180
},
{
"epoch": 0.7613038906414301,
"grad_norm": 0.28392866253852844,
"learning_rate": 2.855918772175522e-05,
"loss": 0.6662,
"step": 181
},
{
"epoch": 0.7655099894847529,
"grad_norm": 0.2941664159297943,
"learning_rate": 2.7597661864045233e-05,
"loss": 0.6816,
"step": 182
},
{
"epoch": 0.7697160883280757,
"grad_norm": 0.2740324139595032,
"learning_rate": 2.6650003363154963e-05,
"loss": 0.7046,
"step": 183
},
{
"epoch": 0.7739221871713985,
"grad_norm": 0.2933352291584015,
"learning_rate": 2.5716393725910215e-05,
"loss": 0.7208,
"step": 184
},
{
"epoch": 0.7781282860147214,
"grad_norm": 0.2843799591064453,
"learning_rate": 2.47970117683313e-05,
"loss": 0.685,
"step": 185
},
{
"epoch": 0.7823343848580442,
"grad_norm": 0.27152329683303833,
"learning_rate": 2.389203358138419e-05,
"loss": 0.7176,
"step": 186
},
{
"epoch": 0.786540483701367,
"grad_norm": 0.2916063964366913,
"learning_rate": 2.3001632497253424e-05,
"loss": 0.7439,
"step": 187
},
{
"epoch": 0.7907465825446898,
"grad_norm": 0.27915897965431213,
"learning_rate": 2.2125979056143364e-05,
"loss": 0.7,
"step": 188
},
{
"epoch": 0.7949526813880127,
"grad_norm": 0.30191752314567566,
"learning_rate": 2.1265240973614486e-05,
"loss": 0.7377,
"step": 189
},
{
"epoch": 0.7991587802313355,
"grad_norm": 0.286101758480072,
"learning_rate": 2.0419583108460418e-05,
"loss": 0.6916,
"step": 190
},
{
"epoch": 0.8033648790746583,
"grad_norm": 0.2800692319869995,
"learning_rate": 1.958916743113214e-05,
"loss": 0.7374,
"step": 191
},
{
"epoch": 0.807570977917981,
"grad_norm": 0.27292168140411377,
"learning_rate": 1.877415299271561e-05,
"loss": 0.6757,
"step": 192
},
{
"epoch": 0.8117770767613038,
"grad_norm": 0.28094640374183655,
"learning_rate": 1.7974695894468384e-05,
"loss": 0.7024,
"step": 193
},
{
"epoch": 0.8159831756046267,
"grad_norm": 0.2871862053871155,
"learning_rate": 1.7190949257921196e-05,
"loss": 0.7173,
"step": 194
},
{
"epoch": 0.8201892744479495,
"grad_norm": 0.27189600467681885,
"learning_rate": 1.642306319555027e-05,
"loss": 0.7019,
"step": 195
},
{
"epoch": 0.8243953732912723,
"grad_norm": 0.28526559472084045,
"learning_rate": 1.5671184782026106e-05,
"loss": 0.7113,
"step": 196
},
{
"epoch": 0.8286014721345951,
"grad_norm": 0.2855590283870697,
"learning_rate": 1.4935458026043959e-05,
"loss": 0.6977,
"step": 197
},
{
"epoch": 0.832807570977918,
"grad_norm": 0.28118449449539185,
"learning_rate": 1.4216023842741455e-05,
"loss": 0.7241,
"step": 198
},
{
"epoch": 0.8370136698212408,
"grad_norm": 0.28818827867507935,
"learning_rate": 1.3513020026709023e-05,
"loss": 0.6964,
"step": 199
},
{
"epoch": 0.8412197686645636,
"grad_norm": 0.3235337436199188,
"learning_rate": 1.2826581225597767e-05,
"loss": 0.7406,
"step": 200
},
{
"epoch": 0.8454258675078864,
"grad_norm": 0.2899198830127716,
"learning_rate": 1.2156838914330072e-05,
"loss": 0.7374,
"step": 201
},
{
"epoch": 0.8496319663512093,
"grad_norm": 0.28662335872650146,
"learning_rate": 1.1503921369918091e-05,
"loss": 0.7039,
"step": 202
},
{
"epoch": 0.8538380651945321,
"grad_norm": 0.2748032510280609,
"learning_rate": 1.0867953646894525e-05,
"loss": 0.7517,
"step": 203
},
{
"epoch": 0.8580441640378549,
"grad_norm": 0.27125102281570435,
"learning_rate": 1.0249057553360742e-05,
"loss": 0.6948,
"step": 204
},
{
"epoch": 0.8622502628811777,
"grad_norm": 0.2795623242855072,
"learning_rate": 9.647351627656543e-06,
"loss": 0.7123,
"step": 205
},
{
"epoch": 0.8664563617245006,
"grad_norm": 0.28939002752304077,
"learning_rate": 9.062951115656403e-06,
"loss": 0.7266,
"step": 206
},
{
"epoch": 0.8706624605678234,
"grad_norm": 0.2878707945346832,
"learning_rate": 8.495967948696192e-06,
"loss": 0.7335,
"step": 207
},
{
"epoch": 0.8748685594111462,
"grad_norm": 0.27489086985588074,
"learning_rate": 7.946510722134692e-06,
"loss": 0.692,
"step": 208
},
{
"epoch": 0.879074658254469,
"grad_norm": 0.2869216799736023,
"learning_rate": 7.4146846745541506e-06,
"loss": 0.7193,
"step": 209
},
{
"epoch": 0.8832807570977917,
"grad_norm": 0.2801933288574219,
"learning_rate": 6.900591667603751e-06,
"loss": 0.7178,
"step": 210
},
{
"epoch": 0.8874868559411146,
"grad_norm": 0.2767332196235657,
"learning_rate": 6.40433016648988e-06,
"loss": 0.7499,
"step": 211
},
{
"epoch": 0.8916929547844374,
"grad_norm": 0.2783336043357849,
"learning_rate": 5.925995221116853e-06,
"loss": 0.7152,
"step": 212
},
{
"epoch": 0.8958990536277602,
"grad_norm": 0.27832481265068054,
"learning_rate": 5.465678447881828e-06,
"loss": 0.6977,
"step": 213
},
{
"epoch": 0.900105152471083,
"grad_norm": 0.2835717499256134,
"learning_rate": 5.023468012127364e-06,
"loss": 0.7251,
"step": 214
},
{
"epoch": 0.9043112513144059,
"grad_norm": 0.27503538131713867,
"learning_rate": 4.599448611254964e-06,
"loss": 0.7166,
"step": 215
},
{
"epoch": 0.9085173501577287,
"grad_norm": 0.26619476079940796,
"learning_rate": 4.193701458502807e-06,
"loss": 0.7095,
"step": 216
},
{
"epoch": 0.9127234490010515,
"grad_norm": 0.2752280533313751,
"learning_rate": 3.80630426739077e-06,
"loss": 0.7412,
"step": 217
},
{
"epoch": 0.9169295478443743,
"grad_norm": 0.281093567609787,
"learning_rate": 3.4373312368358944e-06,
"loss": 0.7592,
"step": 218
},
{
"epoch": 0.9211356466876972,
"grad_norm": 0.28015753626823425,
"learning_rate": 3.086853036940862e-06,
"loss": 0.7104,
"step": 219
},
{
"epoch": 0.92534174553102,
"grad_norm": 0.2644014358520508,
"learning_rate": 2.754936795458485e-06,
"loss": 0.6985,
"step": 220
},
{
"epoch": 0.9295478443743428,
"grad_norm": 0.2755027413368225,
"learning_rate": 2.4416460849345123e-06,
"loss": 0.7159,
"step": 221
},
{
"epoch": 0.9337539432176656,
"grad_norm": 0.28020283579826355,
"learning_rate": 2.1470409105315283e-06,
"loss": 0.7389,
"step": 222
},
{
"epoch": 0.9379600420609885,
"grad_norm": 0.2773683965206146,
"learning_rate": 1.8711776985360308e-06,
"loss": 0.686,
"step": 223
},
{
"epoch": 0.9421661409043113,
"grad_norm": 0.2784758508205414,
"learning_rate": 1.61410928555098e-06,
"loss": 0.6857,
"step": 224
},
{
"epoch": 0.9463722397476341,
"grad_norm": 0.2857016623020172,
"learning_rate": 1.3758849083759352e-06,
"loss": 0.6982,
"step": 225
},
{
"epoch": 0.9505783385909569,
"grad_norm": 0.27618998289108276,
"learning_rate": 1.1565501945766222e-06,
"loss": 0.7328,
"step": 226
},
{
"epoch": 0.9547844374342797,
"grad_norm": 0.273423969745636,
"learning_rate": 9.56147153745779e-07,
"loss": 0.6762,
"step": 227
},
{
"epoch": 0.9589905362776026,
"grad_norm": 0.2603454291820526,
"learning_rate": 7.747141694570026e-07,
"loss": 0.6784,
"step": 228
},
{
"epoch": 0.9631966351209253,
"grad_norm": 0.2638219892978668,
"learning_rate": 6.122859919130974e-07,
"loss": 0.731,
"step": 229
},
{
"epoch": 0.9674027339642481,
"grad_norm": 0.28604456782341003,
"learning_rate": 4.6889373129022085e-07,
"loss": 0.6937,
"step": 230
},
{
"epoch": 0.9716088328075709,
"grad_norm": 0.2867179811000824,
"learning_rate": 3.445648517793942e-07,
"loss": 0.7492,
"step": 231
},
{
"epoch": 0.9758149316508938,
"grad_norm": 0.27991774678230286,
"learning_rate": 2.3932316632614416e-07,
"loss": 0.7411,
"step": 232
},
{
"epoch": 0.9800210304942166,
"grad_norm": 0.2658878266811371,
"learning_rate": 1.5318883206962842e-07,
"loss": 0.7317,
"step": 233
},
{
"epoch": 0.9842271293375394,
"grad_norm": 0.26533135771751404,
"learning_rate": 8.617834648185774e-08,
"loss": 0.6636,
"step": 234
},
{
"epoch": 0.9884332281808622,
"grad_norm": 0.26577314734458923,
"learning_rate": 3.8304544207945495e-08,
"loss": 0.7273,
"step": 235
},
{
"epoch": 0.9926393270241851,
"grad_norm": 0.2715383768081665,
"learning_rate": 9.576594607807465e-09,
"loss": 0.7253,
"step": 236
},
{
"epoch": 0.9968454258675079,
"grad_norm": 0.28140708804130554,
"learning_rate": 0.0,
"loss": 0.6756,
"step": 237
}
],
"logging_steps": 1,
"max_steps": 237,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.319764496895181e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}