{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9968454258675079,
"eval_steps": 60,
"global_step": 237,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004206098843322818,
"grad_norm": 0.5299676656723022,
"learning_rate": 2e-05,
"loss": 1.77,
"step": 1
},
{
"epoch": 0.004206098843322818,
"eval_loss": 1.9898090362548828,
"eval_runtime": 65.4901,
"eval_samples_per_second": 41.167,
"eval_steps_per_second": 20.583,
"step": 1
},
{
"epoch": 0.008412197686645636,
"grad_norm": 0.5349143743515015,
"learning_rate": 4e-05,
"loss": 1.745,
"step": 2
},
{
"epoch": 0.012618296529968454,
"grad_norm": 0.5094612240791321,
"learning_rate": 6e-05,
"loss": 1.7007,
"step": 3
},
{
"epoch": 0.016824395373291272,
"grad_norm": 0.5268917083740234,
"learning_rate": 8e-05,
"loss": 1.6582,
"step": 4
},
{
"epoch": 0.02103049421661409,
"grad_norm": 0.5398459434509277,
"learning_rate": 0.0001,
"loss": 1.7391,
"step": 5
},
{
"epoch": 0.025236593059936908,
"grad_norm": 0.5613242983818054,
"learning_rate": 0.00012,
"loss": 1.6436,
"step": 6
},
{
"epoch": 0.029442691903259727,
"grad_norm": 0.45200833678245544,
"learning_rate": 0.00014,
"loss": 1.5859,
"step": 7
},
{
"epoch": 0.033648790746582544,
"grad_norm": 0.3515471816062927,
"learning_rate": 0.00016,
"loss": 1.4385,
"step": 8
},
{
"epoch": 0.03785488958990536,
"grad_norm": 0.2859397530555725,
"learning_rate": 0.00018,
"loss": 1.427,
"step": 9
},
{
"epoch": 0.04206098843322818,
"grad_norm": 0.3456244170665741,
"learning_rate": 0.0002,
"loss": 1.408,
"step": 10
},
{
"epoch": 0.046267087276550996,
"grad_norm": 0.42806366086006165,
"learning_rate": 0.0001999904234053922,
"loss": 1.4541,
"step": 11
},
{
"epoch": 0.050473186119873815,
"grad_norm": 0.5130056142807007,
"learning_rate": 0.00019996169545579207,
"loss": 1.3664,
"step": 12
},
{
"epoch": 0.054679284963196635,
"grad_norm": 0.39732199907302856,
"learning_rate": 0.00019991382165351814,
"loss": 1.3276,
"step": 13
},
{
"epoch": 0.058885383806519455,
"grad_norm": 0.3794059157371521,
"learning_rate": 0.00019984681116793038,
"loss": 1.3153,
"step": 14
},
{
"epoch": 0.06309148264984227,
"grad_norm": 0.27593305706977844,
"learning_rate": 0.00019976067683367385,
"loss": 1.2554,
"step": 15
},
{
"epoch": 0.06729758149316509,
"grad_norm": 0.28591713309288025,
"learning_rate": 0.00019965543514822062,
"loss": 1.283,
"step": 16
},
{
"epoch": 0.07150368033648791,
"grad_norm": 0.26724520325660706,
"learning_rate": 0.00019953110626870979,
"loss": 1.1646,
"step": 17
},
{
"epoch": 0.07570977917981073,
"grad_norm": 0.24611811339855194,
"learning_rate": 0.0001993877140080869,
"loss": 1.1762,
"step": 18
},
{
"epoch": 0.07991587802313355,
"grad_norm": 0.2281356304883957,
"learning_rate": 0.000199225285830543,
"loss": 1.1467,
"step": 19
},
{
"epoch": 0.08412197686645637,
"grad_norm": 0.22052225470542908,
"learning_rate": 0.00019904385284625424,
"loss": 1.1377,
"step": 20
},
{
"epoch": 0.08832807570977919,
"grad_norm": 0.23453611135482788,
"learning_rate": 0.00019884344980542338,
"loss": 1.1162,
"step": 21
},
{
"epoch": 0.09253417455310199,
"grad_norm": 0.22467325627803802,
"learning_rate": 0.00019862411509162406,
"loss": 1.155,
"step": 22
},
{
"epoch": 0.09674027339642481,
"grad_norm": 0.2170630544424057,
"learning_rate": 0.00019838589071444903,
"loss": 1.1279,
"step": 23
},
{
"epoch": 0.10094637223974763,
"grad_norm": 0.21346993744373322,
"learning_rate": 0.00019812882230146398,
"loss": 1.0946,
"step": 24
},
{
"epoch": 0.10515247108307045,
"grad_norm": 0.21408380568027496,
"learning_rate": 0.00019785295908946848,
"loss": 1.0889,
"step": 25
},
{
"epoch": 0.10935856992639327,
"grad_norm": 0.22000430524349213,
"learning_rate": 0.0001975583539150655,
"loss": 1.0476,
"step": 26
},
{
"epoch": 0.11356466876971609,
"grad_norm": 0.20778758823871613,
"learning_rate": 0.00019724506320454153,
"loss": 1.0954,
"step": 27
},
{
"epoch": 0.11777076761303891,
"grad_norm": 0.22037693858146667,
"learning_rate": 0.00019691314696305913,
"loss": 1.055,
"step": 28
},
{
"epoch": 0.12197686645636173,
"grad_norm": 0.20428280532360077,
"learning_rate": 0.0001965626687631641,
"loss": 1.0159,
"step": 29
},
{
"epoch": 0.12618296529968454,
"grad_norm": 0.20502522587776184,
"learning_rate": 0.00019619369573260924,
"loss": 1.0254,
"step": 30
},
{
"epoch": 0.13038906414300735,
"grad_norm": 0.2062043696641922,
"learning_rate": 0.0001958062985414972,
"loss": 0.9779,
"step": 31
},
{
"epoch": 0.13459516298633017,
"grad_norm": 0.22229152917861938,
"learning_rate": 0.00019540055138874505,
"loss": 1.0201,
"step": 32
},
{
"epoch": 0.138801261829653,
"grad_norm": 0.21910454332828522,
"learning_rate": 0.00019497653198787264,
"loss": 0.9958,
"step": 33
},
{
"epoch": 0.14300736067297581,
"grad_norm": 0.22630847990512848,
"learning_rate": 0.0001945343215521182,
"loss": 0.9892,
"step": 34
},
{
"epoch": 0.14721345951629863,
"grad_norm": 0.21370179951190948,
"learning_rate": 0.00019407400477888315,
"loss": 0.9409,
"step": 35
},
{
"epoch": 0.15141955835962145,
"grad_norm": 0.22368259727954865,
"learning_rate": 0.00019359566983351013,
"loss": 0.9626,
"step": 36
},
{
"epoch": 0.15562565720294427,
"grad_norm": 0.24231955409049988,
"learning_rate": 0.00019309940833239626,
"loss": 0.9914,
"step": 37
},
{
"epoch": 0.1598317560462671,
"grad_norm": 0.24762062728405,
"learning_rate": 0.00019258531532544585,
"loss": 0.9311,
"step": 38
},
{
"epoch": 0.1640378548895899,
"grad_norm": 0.21248659491539001,
"learning_rate": 0.00019205348927786532,
"loss": 0.9399,
"step": 39
},
{
"epoch": 0.16824395373291273,
"grad_norm": 0.2374017834663391,
"learning_rate": 0.00019150403205130383,
"loss": 0.9664,
"step": 40
},
{
"epoch": 0.17245005257623555,
"grad_norm": 0.25241079926490784,
"learning_rate": 0.0001909370488843436,
"loss": 0.9475,
"step": 41
},
{
"epoch": 0.17665615141955837,
"grad_norm": 0.24083252251148224,
"learning_rate": 0.00019035264837234347,
"loss": 0.9602,
"step": 42
},
{
"epoch": 0.1808622502628812,
"grad_norm": 0.24024806916713715,
"learning_rate": 0.0001897509424466393,
"loss": 0.9167,
"step": 43
},
{
"epoch": 0.18506834910620398,
"grad_norm": 0.2538228929042816,
"learning_rate": 0.0001891320463531055,
"loss": 0.904,
"step": 44
},
{
"epoch": 0.1892744479495268,
"grad_norm": 0.2393723875284195,
"learning_rate": 0.00018849607863008193,
"loss": 0.8927,
"step": 45
},
{
"epoch": 0.19348054679284962,
"grad_norm": 0.2394389659166336,
"learning_rate": 0.00018784316108566996,
"loss": 0.8675,
"step": 46
},
{
"epoch": 0.19768664563617244,
"grad_norm": 0.24351197481155396,
"learning_rate": 0.00018717341877440226,
"loss": 0.873,
"step": 47
},
{
"epoch": 0.20189274447949526,
"grad_norm": 0.2396727055311203,
"learning_rate": 0.000186486979973291,
"loss": 0.8972,
"step": 48
},
{
"epoch": 0.20609884332281808,
"grad_norm": 0.2674885392189026,
"learning_rate": 0.0001857839761572586,
"loss": 0.8613,
"step": 49
},
{
"epoch": 0.2103049421661409,
"grad_norm": 0.25012922286987305,
"learning_rate": 0.00018506454197395606,
"loss": 0.8481,
"step": 50
},
{
"epoch": 0.21451104100946372,
"grad_norm": 0.23941218852996826,
"learning_rate": 0.0001843288152179739,
"loss": 0.8638,
"step": 51
},
{
"epoch": 0.21871713985278654,
"grad_norm": 0.25679612159729004,
"learning_rate": 0.00018357693680444976,
"loss": 0.8928,
"step": 52
},
{
"epoch": 0.22292323869610936,
"grad_norm": 0.25766387581825256,
"learning_rate": 0.00018280905074207884,
"loss": 0.8997,
"step": 53
},
{
"epoch": 0.22712933753943218,
"grad_norm": 0.24009671807289124,
"learning_rate": 0.00018202530410553163,
"loss": 0.8536,
"step": 54
},
{
"epoch": 0.231335436382755,
"grad_norm": 0.24763701856136322,
"learning_rate": 0.00018122584700728443,
"loss": 0.8581,
"step": 55
},
{
"epoch": 0.23554153522607782,
"grad_norm": 0.2651236653327942,
"learning_rate": 0.0001804108325688679,
"loss": 0.8164,
"step": 56
},
{
"epoch": 0.23974763406940064,
"grad_norm": 0.23978441953659058,
"learning_rate": 0.0001795804168915396,
"loss": 0.8321,
"step": 57
},
{
"epoch": 0.24395373291272346,
"grad_norm": 0.2508217394351959,
"learning_rate": 0.00017873475902638553,
"loss": 0.815,
"step": 58
},
{
"epoch": 0.24815983175604628,
"grad_norm": 0.2765346169471741,
"learning_rate": 0.00017787402094385666,
"loss": 0.8674,
"step": 59
},
{
"epoch": 0.25236593059936907,
"grad_norm": 0.27468088269233704,
"learning_rate": 0.00017699836750274662,
"loss": 0.8841,
"step": 60
},
{
"epoch": 0.25236593059936907,
"eval_loss": 1.075273036956787,
"eval_runtime": 66.5472,
"eval_samples_per_second": 40.513,
"eval_steps_per_second": 20.256,
"step": 60
},
{
"epoch": 0.2565720294426919,
"grad_norm": 0.27056071162223816,
"learning_rate": 0.00017610796641861581,
"loss": 0.8459,
"step": 61
},
{
"epoch": 0.2607781282860147,
"grad_norm": 0.2631956934928894,
"learning_rate": 0.00017520298823166873,
"loss": 0.8853,
"step": 62
},
{
"epoch": 0.26498422712933756,
"grad_norm": 0.28352680802345276,
"learning_rate": 0.00017428360627408978,
"loss": 0.8625,
"step": 63
},
{
"epoch": 0.26919032597266035,
"grad_norm": 0.24897028505802155,
"learning_rate": 0.00017334999663684504,
"loss": 0.8627,
"step": 64
},
{
"epoch": 0.2733964248159832,
"grad_norm": 0.2620624303817749,
"learning_rate": 0.00017240233813595478,
"loss": 0.8088,
"step": 65
},
{
"epoch": 0.277602523659306,
"grad_norm": 0.24983716011047363,
"learning_rate": 0.0001714408122782448,
"loss": 0.8318,
"step": 66
},
{
"epoch": 0.28180862250262884,
"grad_norm": 0.2667708694934845,
"learning_rate": 0.000170465603226582,
"loss": 0.8368,
"step": 67
},
{
"epoch": 0.28601472134595163,
"grad_norm": 0.2828388214111328,
"learning_rate": 0.0001694768977646013,
"loss": 0.8282,
"step": 68
},
{
"epoch": 0.2902208201892745,
"grad_norm": 0.2581498324871063,
"learning_rate": 0.0001684748852609306,
"loss": 0.8375,
"step": 69
},
{
"epoch": 0.29442691903259727,
"grad_norm": 0.27101799845695496,
"learning_rate": 0.0001674597576329207,
"loss": 0.818,
"step": 70
},
{
"epoch": 0.29863301787592006,
"grad_norm": 0.27231255173683167,
"learning_rate": 0.00016643170930988698,
"loss": 0.843,
"step": 71
},
{
"epoch": 0.3028391167192429,
"grad_norm": 0.2566690444946289,
"learning_rate": 0.00016539093719586994,
"loss": 0.8348,
"step": 72
},
{
"epoch": 0.3070452155625657,
"grad_norm": 0.2482360601425171,
"learning_rate": 0.00016433764063192194,
"loss": 0.8122,
"step": 73
},
{
"epoch": 0.31125131440588855,
"grad_norm": 0.25742995738983154,
"learning_rate": 0.00016327202135792685,
"loss": 0.776,
"step": 74
},
{
"epoch": 0.31545741324921134,
"grad_norm": 0.25104233622550964,
"learning_rate": 0.00016219428347396053,
"loss": 0.7823,
"step": 75
},
{
"epoch": 0.3196635120925342,
"grad_norm": 0.2921640872955322,
"learning_rate": 0.00016110463340119913,
"loss": 0.8127,
"step": 76
},
{
"epoch": 0.323869610935857,
"grad_norm": 0.26554426550865173,
"learning_rate": 0.00016000327984238292,
"loss": 0.7716,
"step": 77
},
{
"epoch": 0.3280757097791798,
"grad_norm": 0.24784542620182037,
"learning_rate": 0.00015889043374184286,
"loss": 0.7714,
"step": 78
},
{
"epoch": 0.3322818086225026,
"grad_norm": 0.26592087745666504,
"learning_rate": 0.0001577663082450984,
"loss": 0.7397,
"step": 79
},
{
"epoch": 0.33648790746582546,
"grad_norm": 0.3072431683540344,
"learning_rate": 0.00015663111865803285,
"loss": 0.7579,
"step": 80
},
{
"epoch": 0.34069400630914826,
"grad_norm": 0.29445305466651917,
"learning_rate": 0.00015548508240565583,
"loss": 0.7998,
"step": 81
},
{
"epoch": 0.3449001051524711,
"grad_norm": 0.26053521037101746,
"learning_rate": 0.0001543284189904592,
"loss": 0.7832,
"step": 82
},
{
"epoch": 0.3491062039957939,
"grad_norm": 0.2956802248954773,
"learning_rate": 0.00015316134995037545,
"loss": 0.8054,
"step": 83
},
{
"epoch": 0.35331230283911674,
"grad_norm": 0.2673921287059784,
"learning_rate": 0.00015198409881634617,
"loss": 0.8061,
"step": 84
},
{
"epoch": 0.35751840168243953,
"grad_norm": 0.2793889045715332,
"learning_rate": 0.00015079689106950854,
"loss": 0.7902,
"step": 85
},
{
"epoch": 0.3617245005257624,
"grad_norm": 0.26718223094940186,
"learning_rate": 0.00014959995409800873,
"loss": 0.7769,
"step": 86
},
{
"epoch": 0.3659305993690852,
"grad_norm": 0.300536572933197,
"learning_rate": 0.00014839351715344968,
"loss": 0.8245,
"step": 87
},
{
"epoch": 0.37013669821240797,
"grad_norm": 0.2824515998363495,
"learning_rate": 0.00014717781130698212,
"loss": 0.8122,
"step": 88
},
{
"epoch": 0.3743427970557308,
"grad_norm": 0.28050506114959717,
"learning_rate": 0.00014595306940504716,
"loss": 0.778,
"step": 89
},
{
"epoch": 0.3785488958990536,
"grad_norm": 0.2906787395477295,
"learning_rate": 0.00014471952602477866,
"loss": 0.7703,
"step": 90
},
{
"epoch": 0.38275499474237645,
"grad_norm": 0.298177033662796,
"learning_rate": 0.00014347741742907433,
"loss": 0.7672,
"step": 91
},
{
"epoch": 0.38696109358569925,
"grad_norm": 0.27583765983581543,
"learning_rate": 0.00014222698152134374,
"loss": 0.7784,
"step": 92
},
{
"epoch": 0.3911671924290221,
"grad_norm": 0.28834670782089233,
"learning_rate": 0.0001409684577999423,
"loss": 0.8278,
"step": 93
},
{
"epoch": 0.3953732912723449,
"grad_norm": 0.29721811413764954,
"learning_rate": 0.00013970208731229974,
"loss": 0.7997,
"step": 94
},
{
"epoch": 0.39957939011566773,
"grad_norm": 0.2688146233558655,
"learning_rate": 0.00013842811260875168,
"loss": 0.7465,
"step": 95
},
{
"epoch": 0.4037854889589905,
"grad_norm": 0.27095234394073486,
"learning_rate": 0.0001371467776960837,
"loss": 0.757,
"step": 96
},
{
"epoch": 0.40799158780231337,
"grad_norm": 0.30743858218193054,
"learning_rate": 0.0001358583279907961,
"loss": 0.7882,
"step": 97
},
{
"epoch": 0.41219768664563616,
"grad_norm": 0.274873822927475,
"learning_rate": 0.00013456301027209882,
"loss": 0.7737,
"step": 98
},
{
"epoch": 0.416403785488959,
"grad_norm": 0.25485867261886597,
"learning_rate": 0.00013326107263464558,
"loss": 0.7454,
"step": 99
},
{
"epoch": 0.4206098843322818,
"grad_norm": 0.2994694709777832,
"learning_rate": 0.00013195276444101547,
"loss": 0.8133,
"step": 100
},
{
"epoch": 0.42481598317560465,
"grad_norm": 0.2943129241466522,
"learning_rate": 0.0001306383362739523,
"loss": 0.7501,
"step": 101
},
{
"epoch": 0.42902208201892744,
"grad_norm": 0.2888595163822174,
"learning_rate": 0.0001293180398883701,
"loss": 0.7522,
"step": 102
},
{
"epoch": 0.4332281808622503,
"grad_norm": 0.28455743193626404,
"learning_rate": 0.00012799212816313376,
"loss": 0.7278,
"step": 103
},
{
"epoch": 0.4374342797055731,
"grad_norm": 0.32477039098739624,
"learning_rate": 0.00012666085505262485,
"loss": 0.819,
"step": 104
},
{
"epoch": 0.4416403785488959,
"grad_norm": 0.28067031502723694,
"learning_rate": 0.00012532447553810126,
"loss": 0.7979,
"step": 105
},
{
"epoch": 0.4458464773922187,
"grad_norm": 0.26430413126945496,
"learning_rate": 0.00012398324557885994,
"loss": 0.7497,
"step": 106
},
{
"epoch": 0.4500525762355415,
"grad_norm": 0.27110588550567627,
"learning_rate": 0.00012263742206321287,
"loss": 0.7937,
"step": 107
},
{
"epoch": 0.45425867507886436,
"grad_norm": 0.287041574716568,
"learning_rate": 0.0001212872627592845,
"loss": 0.7897,
"step": 108
},
{
"epoch": 0.45846477392218715,
"grad_norm": 0.28561776876449585,
"learning_rate": 0.00011993302626564102,
"loss": 0.8011,
"step": 109
},
{
"epoch": 0.46267087276551,
"grad_norm": 0.2852155566215515,
"learning_rate": 0.00011857497196176049,
"loss": 0.7426,
"step": 110
},
{
"epoch": 0.4668769716088328,
"grad_norm": 0.2712121903896332,
"learning_rate": 0.00011721335995835336,
"loss": 0.7277,
"step": 111
},
{
"epoch": 0.47108307045215564,
"grad_norm": 0.2779647409915924,
"learning_rate": 0.00011584845104754304,
"loss": 0.7698,
"step": 112
},
{
"epoch": 0.47528916929547843,
"grad_norm": 0.2774654030799866,
"learning_rate": 0.00011448050665291587,
"loss": 0.7583,
"step": 113
},
{
"epoch": 0.4794952681388013,
"grad_norm": 0.3046507239341736,
"learning_rate": 0.00011310978877945007,
"loss": 0.7987,
"step": 114
},
{
"epoch": 0.48370136698212407,
"grad_norm": 0.2816363573074341,
"learning_rate": 0.00011173655996333357,
"loss": 0.7898,
"step": 115
},
{
"epoch": 0.4879074658254469,
"grad_norm": 0.27383196353912354,
"learning_rate": 0.00011036108322167988,
"loss": 0.7248,
"step": 116
},
{
"epoch": 0.4921135646687697,
"grad_norm": 0.28104445338249207,
"learning_rate": 0.00010898362200215197,
"loss": 0.7144,
"step": 117
},
{
"epoch": 0.49631966351209256,
"grad_norm": 0.28643152117729187,
"learning_rate": 0.0001076044401325036,
"loss": 0.7856,
"step": 118
},
{
"epoch": 0.5005257623554153,
"grad_norm": 0.261483371257782,
"learning_rate": 0.0001062238017700478,
"loss": 0.7429,
"step": 119
},
{
"epoch": 0.5047318611987381,
"grad_norm": 0.2796306908130646,
"learning_rate": 0.00010484197135106263,
"loss": 0.7772,
"step": 120
},
{
"epoch": 0.5047318611987381,
"eval_loss": 0.9960550665855408,
"eval_runtime": 65.8413,
"eval_samples_per_second": 40.947,
"eval_steps_per_second": 20.473,
"step": 120
},
{
"epoch": 0.508937960042061,
"grad_norm": 0.3079998791217804,
"learning_rate": 0.00010345921354014279,
"loss": 0.7497,
"step": 121
},
{
"epoch": 0.5131440588853838,
"grad_norm": 0.3106074929237366,
"learning_rate": 0.00010207579317950827,
"loss": 0.7568,
"step": 122
},
{
"epoch": 0.5173501577287066,
"grad_norm": 0.27859166264533997,
"learning_rate": 0.00010069197523827833,
"loss": 0.7695,
"step": 123
},
{
"epoch": 0.5215562565720294,
"grad_norm": 0.2840277850627899,
"learning_rate": 9.930802476172169e-05,
"loss": 0.7815,
"step": 124
},
{
"epoch": 0.5257623554153522,
"grad_norm": 0.28042981028556824,
"learning_rate": 9.792420682049174e-05,
"loss": 0.7546,
"step": 125
},
{
"epoch": 0.5299684542586751,
"grad_norm": 0.2857164144515991,
"learning_rate": 9.654078645985722e-05,
"loss": 0.7617,
"step": 126
},
{
"epoch": 0.5341745531019979,
"grad_norm": 0.29590827226638794,
"learning_rate": 9.515802864893739e-05,
"loss": 0.748,
"step": 127
},
{
"epoch": 0.5383806519453207,
"grad_norm": 0.29375162720680237,
"learning_rate": 9.377619822995219e-05,
"loss": 0.7532,
"step": 128
},
{
"epoch": 0.5425867507886435,
"grad_norm": 0.28436464071273804,
"learning_rate": 9.239555986749645e-05,
"loss": 0.7511,
"step": 129
},
{
"epoch": 0.5467928496319664,
"grad_norm": 0.29677248001098633,
"learning_rate": 9.101637799784804e-05,
"loss": 0.7456,
"step": 130
},
{
"epoch": 0.5509989484752892,
"grad_norm": 0.27983585000038147,
"learning_rate": 8.963891677832011e-05,
"loss": 0.6888,
"step": 131
},
{
"epoch": 0.555205047318612,
"grad_norm": 0.27517008781433105,
"learning_rate": 8.826344003666647e-05,
"loss": 0.7431,
"step": 132
},
{
"epoch": 0.5594111461619348,
"grad_norm": 0.28197160363197327,
"learning_rate": 8.689021122054996e-05,
"loss": 0.7379,
"step": 133
},
{
"epoch": 0.5636172450052577,
"grad_norm": 0.29125264286994934,
"learning_rate": 8.551949334708415e-05,
"loss": 0.7639,
"step": 134
},
{
"epoch": 0.5678233438485805,
"grad_norm": 0.2851899266242981,
"learning_rate": 8.415154895245697e-05,
"loss": 0.7764,
"step": 135
},
{
"epoch": 0.5720294426919033,
"grad_norm": 0.2771802544593811,
"learning_rate": 8.278664004164665e-05,
"loss": 0.6961,
"step": 136
},
{
"epoch": 0.576235541535226,
"grad_norm": 0.27956414222717285,
"learning_rate": 8.142502803823955e-05,
"loss": 0.7454,
"step": 137
},
{
"epoch": 0.580441640378549,
"grad_norm": 0.30068668723106384,
"learning_rate": 8.0066973734359e-05,
"loss": 0.7683,
"step": 138
},
{
"epoch": 0.5846477392218717,
"grad_norm": 0.2820778489112854,
"learning_rate": 7.871273724071553e-05,
"loss": 0.7412,
"step": 139
},
{
"epoch": 0.5888538380651945,
"grad_norm": 0.2672085165977478,
"learning_rate": 7.736257793678714e-05,
"loss": 0.716,
"step": 140
},
{
"epoch": 0.5930599369085173,
"grad_norm": 0.27900293469429016,
"learning_rate": 7.601675442114009e-05,
"loss": 0.7259,
"step": 141
},
{
"epoch": 0.5972660357518401,
"grad_norm": 0.2954063415527344,
"learning_rate": 7.46755244618988e-05,
"loss": 0.7047,
"step": 142
},
{
"epoch": 0.601472134595163,
"grad_norm": 0.3212134838104248,
"learning_rate": 7.333914494737514e-05,
"loss": 0.7657,
"step": 143
},
{
"epoch": 0.6056782334384858,
"grad_norm": 0.30651283264160156,
"learning_rate": 7.200787183686625e-05,
"loss": 0.7489,
"step": 144
},
{
"epoch": 0.6098843322818086,
"grad_norm": 0.26834797859191895,
"learning_rate": 7.068196011162994e-05,
"loss": 0.7484,
"step": 145
},
{
"epoch": 0.6140904311251314,
"grad_norm": 0.2777973711490631,
"learning_rate": 6.936166372604773e-05,
"loss": 0.7245,
"step": 146
},
{
"epoch": 0.6182965299684543,
"grad_norm": 0.293694406747818,
"learning_rate": 6.804723555898458e-05,
"loss": 0.7211,
"step": 147
},
{
"epoch": 0.6225026288117771,
"grad_norm": 0.28515610098838806,
"learning_rate": 6.673892736535448e-05,
"loss": 0.7439,
"step": 148
},
{
"epoch": 0.6267087276550999,
"grad_norm": 0.2929891049861908,
"learning_rate": 6.543698972790117e-05,
"loss": 0.7434,
"step": 149
},
{
"epoch": 0.6309148264984227,
"grad_norm": 0.29031944274902344,
"learning_rate": 6.414167200920391e-05,
"loss": 0.7176,
"step": 150
},
{
"epoch": 0.6351209253417456,
"grad_norm": 0.2764637768268585,
"learning_rate": 6.28532223039163e-05,
"loss": 0.7503,
"step": 151
},
{
"epoch": 0.6393270241850684,
"grad_norm": 0.2900468707084656,
"learning_rate": 6.157188739124834e-05,
"loss": 0.6879,
"step": 152
},
{
"epoch": 0.6435331230283912,
"grad_norm": 0.2989012897014618,
"learning_rate": 6.029791268770029e-05,
"loss": 0.7135,
"step": 153
},
{
"epoch": 0.647739221871714,
"grad_norm": 0.2998535931110382,
"learning_rate": 5.903154220005771e-05,
"loss": 0.7171,
"step": 154
},
{
"epoch": 0.6519453207150369,
"grad_norm": 0.27283868193626404,
"learning_rate": 5.777301847865629e-05,
"loss": 0.7112,
"step": 155
},
{
"epoch": 0.6561514195583596,
"grad_norm": 0.2988041341304779,
"learning_rate": 5.652258257092569e-05,
"loss": 0.7444,
"step": 156
},
{
"epoch": 0.6603575184016824,
"grad_norm": 0.2845938205718994,
"learning_rate": 5.528047397522133e-05,
"loss": 0.716,
"step": 157
},
{
"epoch": 0.6645636172450052,
"grad_norm": 0.29695218801498413,
"learning_rate": 5.404693059495285e-05,
"loss": 0.7585,
"step": 158
},
{
"epoch": 0.668769716088328,
"grad_norm": 0.28558245301246643,
"learning_rate": 5.282218869301788e-05,
"loss": 0.6908,
"step": 159
},
{
"epoch": 0.6729758149316509,
"grad_norm": 0.280200719833374,
"learning_rate": 5.160648284655032e-05,
"loss": 0.7508,
"step": 160
},
{
"epoch": 0.6771819137749737,
"grad_norm": 0.2981257438659668,
"learning_rate": 5.040004590199128e-05,
"loss": 0.7147,
"step": 161
},
{
"epoch": 0.6813880126182965,
"grad_norm": 0.2873106598854065,
"learning_rate": 4.920310893049146e-05,
"loss": 0.7011,
"step": 162
},
{
"epoch": 0.6855941114616193,
"grad_norm": 0.2717635929584503,
"learning_rate": 4.801590118365383e-05,
"loss": 0.6668,
"step": 163
},
{
"epoch": 0.6898002103049422,
"grad_norm": 0.27607038617134094,
"learning_rate": 4.683865004962452e-05,
"loss": 0.7033,
"step": 164
},
{
"epoch": 0.694006309148265,
"grad_norm": 0.2881218194961548,
"learning_rate": 4.567158100954083e-05,
"loss": 0.7275,
"step": 165
},
{
"epoch": 0.6982124079915878,
"grad_norm": 0.2758018672466278,
"learning_rate": 4.4514917594344184e-05,
"loss": 0.737,
"step": 166
},
{
"epoch": 0.7024185068349106,
"grad_norm": 0.29527172446250916,
"learning_rate": 4.3368881341967135e-05,
"loss": 0.7433,
"step": 167
},
{
"epoch": 0.7066246056782335,
"grad_norm": 0.2847643792629242,
"learning_rate": 4.223369175490162e-05,
"loss": 0.7471,
"step": 168
},
{
"epoch": 0.7108307045215563,
"grad_norm": 0.2958676815032959,
"learning_rate": 4.110956625815713e-05,
"loss": 0.6838,
"step": 169
},
{
"epoch": 0.7150368033648791,
"grad_norm": 0.28350576758384705,
"learning_rate": 3.9996720157617094e-05,
"loss": 0.7306,
"step": 170
},
{
"epoch": 0.7192429022082019,
"grad_norm": 0.2808986008167267,
"learning_rate": 3.8895366598800896e-05,
"loss": 0.6823,
"step": 171
},
{
"epoch": 0.7234490010515248,
"grad_norm": 0.2684039771556854,
"learning_rate": 3.780571652603949e-05,
"loss": 0.7105,
"step": 172
},
{
"epoch": 0.7276550998948476,
"grad_norm": 0.28138425946235657,
"learning_rate": 3.672797864207316e-05,
"loss": 0.7221,
"step": 173
},
{
"epoch": 0.7318611987381703,
"grad_norm": 0.2772335708141327,
"learning_rate": 3.566235936807808e-05,
"loss": 0.6835,
"step": 174
},
{
"epoch": 0.7360672975814931,
"grad_norm": 0.27244430780410767,
"learning_rate": 3.460906280413007e-05,
"loss": 0.6577,
"step": 175
},
{
"epoch": 0.7402733964248159,
"grad_norm": 0.2977088689804077,
"learning_rate": 3.3568290690113034e-05,
"loss": 0.7213,
"step": 176
},
{
"epoch": 0.7444794952681388,
"grad_norm": 0.289736270904541,
"learning_rate": 3.25402423670793e-05,
"loss": 0.7154,
"step": 177
},
{
"epoch": 0.7486855941114616,
"grad_norm": 0.287818044424057,
"learning_rate": 3.1525114739069415e-05,
"loss": 0.6977,
"step": 178
},
{
"epoch": 0.7528916929547844,
"grad_norm": 0.31408464908599854,
"learning_rate": 3.0523102235398714e-05,
"loss": 0.781,
"step": 179
},
{
"epoch": 0.7570977917981072,
"grad_norm": 0.27790582180023193,
"learning_rate": 2.9534396773417994e-05,
"loss": 0.7169,
"step": 180
},
{
"epoch": 0.7570977917981072,
"eval_loss": 0.9679059386253357,
"eval_runtime": 66.127,
"eval_samples_per_second": 40.77,
"eval_steps_per_second": 20.385,
"step": 180
},
{
"epoch": 0.7613038906414301,
"grad_norm": 0.28392866253852844,
"learning_rate": 2.855918772175522e-05,
"loss": 0.6662,
"step": 181
},
{
"epoch": 0.7655099894847529,
"grad_norm": 0.2941664159297943,
"learning_rate": 2.7597661864045233e-05,
"loss": 0.6816,
"step": 182
},
{
"epoch": 0.7697160883280757,
"grad_norm": 0.2740324139595032,
"learning_rate": 2.6650003363154963e-05,
"loss": 0.7046,
"step": 183
},
{
"epoch": 0.7739221871713985,
"grad_norm": 0.2933352291584015,
"learning_rate": 2.5716393725910215e-05,
"loss": 0.7208,
"step": 184
},
{
"epoch": 0.7781282860147214,
"grad_norm": 0.2843799591064453,
"learning_rate": 2.47970117683313e-05,
"loss": 0.685,
"step": 185
},
{
"epoch": 0.7823343848580442,
"grad_norm": 0.27152329683303833,
"learning_rate": 2.389203358138419e-05,
"loss": 0.7176,
"step": 186
},
{
"epoch": 0.786540483701367,
"grad_norm": 0.2916063964366913,
"learning_rate": 2.3001632497253424e-05,
"loss": 0.7439,
"step": 187
},
{
"epoch": 0.7907465825446898,
"grad_norm": 0.27915897965431213,
"learning_rate": 2.2125979056143364e-05,
"loss": 0.7,
"step": 188
},
{
"epoch": 0.7949526813880127,
"grad_norm": 0.30191752314567566,
"learning_rate": 2.1265240973614486e-05,
"loss": 0.7377,
"step": 189
},
{
"epoch": 0.7991587802313355,
"grad_norm": 0.286101758480072,
"learning_rate": 2.0419583108460418e-05,
"loss": 0.6916,
"step": 190
},
{
"epoch": 0.8033648790746583,
"grad_norm": 0.2800692319869995,
"learning_rate": 1.958916743113214e-05,
"loss": 0.7374,
"step": 191
},
{
"epoch": 0.807570977917981,
"grad_norm": 0.27292168140411377,
"learning_rate": 1.877415299271561e-05,
"loss": 0.6757,
"step": 192
},
{
"epoch": 0.8117770767613038,
"grad_norm": 0.28094640374183655,
"learning_rate": 1.7974695894468384e-05,
"loss": 0.7024,
"step": 193
},
{
"epoch": 0.8159831756046267,
"grad_norm": 0.2871862053871155,
"learning_rate": 1.7190949257921196e-05,
"loss": 0.7173,
"step": 194
},
{
"epoch": 0.8201892744479495,
"grad_norm": 0.27189600467681885,
"learning_rate": 1.642306319555027e-05,
"loss": 0.7019,
"step": 195
},
{
"epoch": 0.8243953732912723,
"grad_norm": 0.28526559472084045,
"learning_rate": 1.5671184782026106e-05,
"loss": 0.7113,
"step": 196
},
{
"epoch": 0.8286014721345951,
"grad_norm": 0.2855590283870697,
"learning_rate": 1.4935458026043959e-05,
"loss": 0.6977,
"step": 197
},
{
"epoch": 0.832807570977918,
"grad_norm": 0.28118449449539185,
"learning_rate": 1.4216023842741455e-05,
"loss": 0.7241,
"step": 198
},
{
"epoch": 0.8370136698212408,
"grad_norm": 0.28818827867507935,
"learning_rate": 1.3513020026709023e-05,
"loss": 0.6964,
"step": 199
},
{
"epoch": 0.8412197686645636,
"grad_norm": 0.3235337436199188,
"learning_rate": 1.2826581225597767e-05,
"loss": 0.7406,
"step": 200
},
{
"epoch": 0.8454258675078864,
"grad_norm": 0.2899198830127716,
"learning_rate": 1.2156838914330072e-05,
"loss": 0.7374,
"step": 201
},
{
"epoch": 0.8496319663512093,
"grad_norm": 0.28662335872650146,
"learning_rate": 1.1503921369918091e-05,
"loss": 0.7039,
"step": 202
},
{
"epoch": 0.8538380651945321,
"grad_norm": 0.2748032510280609,
"learning_rate": 1.0867953646894525e-05,
"loss": 0.7517,
"step": 203
},
{
"epoch": 0.8580441640378549,
"grad_norm": 0.27125102281570435,
"learning_rate": 1.0249057553360742e-05,
"loss": 0.6948,
"step": 204
},
{
"epoch": 0.8622502628811777,
"grad_norm": 0.2795623242855072,
"learning_rate": 9.647351627656543e-06,
"loss": 0.7123,
"step": 205
},
{
"epoch": 0.8664563617245006,
"grad_norm": 0.28939002752304077,
"learning_rate": 9.062951115656403e-06,
"loss": 0.7266,
"step": 206
},
{
"epoch": 0.8706624605678234,
"grad_norm": 0.2878707945346832,
"learning_rate": 8.495967948696192e-06,
"loss": 0.7335,
"step": 207
},
{
"epoch": 0.8748685594111462,
"grad_norm": 0.27489086985588074,
"learning_rate": 7.946510722134692e-06,
"loss": 0.692,
"step": 208
},
{
"epoch": 0.879074658254469,
"grad_norm": 0.2869216799736023,
"learning_rate": 7.4146846745541506e-06,
"loss": 0.7193,
"step": 209
},
{
"epoch": 0.8832807570977917,
"grad_norm": 0.2801933288574219,
"learning_rate": 6.900591667603751e-06,
"loss": 0.7178,
"step": 210
},
{
"epoch": 0.8874868559411146,
"grad_norm": 0.2767332196235657,
"learning_rate": 6.40433016648988e-06,
"loss": 0.7499,
"step": 211
},
{
"epoch": 0.8916929547844374,
"grad_norm": 0.2783336043357849,
"learning_rate": 5.925995221116853e-06,
"loss": 0.7152,
"step": 212
},
{
"epoch": 0.8958990536277602,
"grad_norm": 0.27832481265068054,
"learning_rate": 5.465678447881828e-06,
"loss": 0.6977,
"step": 213
},
{
"epoch": 0.900105152471083,
"grad_norm": 0.2835717499256134,
"learning_rate": 5.023468012127364e-06,
"loss": 0.7251,
"step": 214
},
{
"epoch": 0.9043112513144059,
"grad_norm": 0.27503538131713867,
"learning_rate": 4.599448611254964e-06,
"loss": 0.7166,
"step": 215
},
{
"epoch": 0.9085173501577287,
"grad_norm": 0.26619476079940796,
"learning_rate": 4.193701458502807e-06,
"loss": 0.7095,
"step": 216
},
{
"epoch": 0.9127234490010515,
"grad_norm": 0.2752280533313751,
"learning_rate": 3.80630426739077e-06,
"loss": 0.7412,
"step": 217
},
{
"epoch": 0.9169295478443743,
"grad_norm": 0.281093567609787,
"learning_rate": 3.4373312368358944e-06,
"loss": 0.7592,
"step": 218
},
{
"epoch": 0.9211356466876972,
"grad_norm": 0.28015753626823425,
"learning_rate": 3.086853036940862e-06,
"loss": 0.7104,
"step": 219
},
{
"epoch": 0.92534174553102,
"grad_norm": 0.2644014358520508,
"learning_rate": 2.754936795458485e-06,
"loss": 0.6985,
"step": 220
},
{
"epoch": 0.9295478443743428,
"grad_norm": 0.2755027413368225,
"learning_rate": 2.4416460849345123e-06,
"loss": 0.7159,
"step": 221
},
{
"epoch": 0.9337539432176656,
"grad_norm": 0.28020283579826355,
"learning_rate": 2.1470409105315283e-06,
"loss": 0.7389,
"step": 222
},
{
"epoch": 0.9379600420609885,
"grad_norm": 0.2773683965206146,
"learning_rate": 1.8711776985360308e-06,
"loss": 0.686,
"step": 223
},
{
"epoch": 0.9421661409043113,
"grad_norm": 0.2784758508205414,
"learning_rate": 1.61410928555098e-06,
"loss": 0.6857,
"step": 224
},
{
"epoch": 0.9463722397476341,
"grad_norm": 0.2857016623020172,
"learning_rate": 1.3758849083759352e-06,
"loss": 0.6982,
"step": 225
},
{
"epoch": 0.9505783385909569,
"grad_norm": 0.27618998289108276,
"learning_rate": 1.1565501945766222e-06,
"loss": 0.7328,
"step": 226
},
{
"epoch": 0.9547844374342797,
"grad_norm": 0.273423969745636,
"learning_rate": 9.56147153745779e-07,
"loss": 0.6762,
"step": 227
},
{
"epoch": 0.9589905362776026,
"grad_norm": 0.2603454291820526,
"learning_rate": 7.747141694570026e-07,
"loss": 0.6784,
"step": 228
},
{
"epoch": 0.9631966351209253,
"grad_norm": 0.2638219892978668,
"learning_rate": 6.122859919130974e-07,
"loss": 0.731,
"step": 229
},
{
"epoch": 0.9674027339642481,
"grad_norm": 0.28604456782341003,
"learning_rate": 4.6889373129022085e-07,
"loss": 0.6937,
"step": 230
},
{
"epoch": 0.9716088328075709,
"grad_norm": 0.2867179811000824,
"learning_rate": 3.445648517793942e-07,
"loss": 0.7492,
"step": 231
},
{
"epoch": 0.9758149316508938,
"grad_norm": 0.27991774678230286,
"learning_rate": 2.3932316632614416e-07,
"loss": 0.7411,
"step": 232
},
{
"epoch": 0.9800210304942166,
"grad_norm": 0.2658878266811371,
"learning_rate": 1.5318883206962842e-07,
"loss": 0.7317,
"step": 233
},
{
"epoch": 0.9842271293375394,
"grad_norm": 0.26533135771751404,
"learning_rate": 8.617834648185774e-08,
"loss": 0.6636,
"step": 234
},
{
"epoch": 0.9884332281808622,
"grad_norm": 0.26577314734458923,
"learning_rate": 3.8304544207945495e-08,
"loss": 0.7273,
"step": 235
},
{
"epoch": 0.9926393270241851,
"grad_norm": 0.2715383768081665,
"learning_rate": 9.576594607807465e-09,
"loss": 0.7253,
"step": 236
},
{
"epoch": 0.9968454258675079,
"grad_norm": 0.28140708804130554,
"learning_rate": 0.0,
"loss": 0.6756,
"step": 237
}
],
"logging_steps": 1,
"max_steps": 237,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.319764496895181e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}