{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.023342261748452116,
"eval_steps": 500,
"global_step": 1000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00023342261748452117,
"grad_norm": 0.2070077508687973,
"learning_rate": 0.0002,
"loss": 1.7672,
"step": 10
},
{
"epoch": 0.00046684523496904234,
"grad_norm": 0.2195936143398285,
"learning_rate": 0.0001959183673469388,
"loss": 1.1291,
"step": 20
},
{
"epoch": 0.0007002678524535634,
"grad_norm": 0.1822710633277893,
"learning_rate": 0.00019183673469387756,
"loss": 0.6041,
"step": 30
},
{
"epoch": 0.0009336904699380847,
"grad_norm": 0.10866066068410873,
"learning_rate": 0.00018775510204081634,
"loss": 0.5399,
"step": 40
},
{
"epoch": 0.0011671130874226058,
"grad_norm": 0.06832244247198105,
"learning_rate": 0.00018367346938775512,
"loss": 0.4337,
"step": 50
},
{
"epoch": 0.0014005357049071269,
"grad_norm": 0.13112975656986237,
"learning_rate": 0.0001795918367346939,
"loss": 0.4785,
"step": 60
},
{
"epoch": 0.0016339583223916481,
"grad_norm": 0.05374117195606232,
"learning_rate": 0.00017551020408163265,
"loss": 0.4458,
"step": 70
},
{
"epoch": 0.0018673809398761694,
"grad_norm": 0.049559202045202255,
"learning_rate": 0.00017142857142857143,
"loss": 0.4517,
"step": 80
},
{
"epoch": 0.0021008035573606906,
"grad_norm": 0.10584782809019089,
"learning_rate": 0.00016734693877551023,
"loss": 0.4592,
"step": 90
},
{
"epoch": 0.0023342261748452117,
"grad_norm": 0.062141530215740204,
"learning_rate": 0.00016326530612244898,
"loss": 0.4625,
"step": 100
},
{
"epoch": 0.0025676487923297327,
"grad_norm": 0.14701640605926514,
"learning_rate": 0.00015918367346938776,
"loss": 0.4818,
"step": 110
},
{
"epoch": 0.0028010714098142537,
"grad_norm": 0.048784978687763214,
"learning_rate": 0.00015510204081632654,
"loss": 0.4687,
"step": 120
},
{
"epoch": 0.003034494027298775,
"grad_norm": 0.05522393435239792,
"learning_rate": 0.0001510204081632653,
"loss": 0.4576,
"step": 130
},
{
"epoch": 0.0032679166447832962,
"grad_norm": 0.05478575825691223,
"learning_rate": 0.0001469387755102041,
"loss": 0.4666,
"step": 140
},
{
"epoch": 0.0035013392622678173,
"grad_norm": 0.09066344797611237,
"learning_rate": 0.00014285714285714287,
"loss": 0.4168,
"step": 150
},
{
"epoch": 0.0037347618797523388,
"grad_norm": 0.054524753242731094,
"learning_rate": 0.00013877551020408165,
"loss": 0.4813,
"step": 160
},
{
"epoch": 0.00396818449723686,
"grad_norm": 0.12929686903953552,
"learning_rate": 0.0001346938775510204,
"loss": 0.4981,
"step": 170
},
{
"epoch": 0.004201607114721381,
"grad_norm": 0.05895541235804558,
"learning_rate": 0.00013061224489795917,
"loss": 0.4078,
"step": 180
},
{
"epoch": 0.004435029732205902,
"grad_norm": 0.05701744183897972,
"learning_rate": 0.00012653061224489798,
"loss": 0.4323,
"step": 190
},
{
"epoch": 0.004668452349690423,
"grad_norm": 0.10815092921257019,
"learning_rate": 0.00012244897959183676,
"loss": 0.5232,
"step": 200
},
{
"epoch": 0.004901874967174945,
"grad_norm": 0.1338973492383957,
"learning_rate": 0.00011836734693877552,
"loss": 0.5053,
"step": 210
},
{
"epoch": 0.005135297584659465,
"grad_norm": 0.04165051504969597,
"learning_rate": 0.00011428571428571428,
"loss": 0.4149,
"step": 220
},
{
"epoch": 0.005368720202143987,
"grad_norm": 0.05300717428326607,
"learning_rate": 0.00011020408163265306,
"loss": 0.444,
"step": 230
},
{
"epoch": 0.0056021428196285075,
"grad_norm": 0.1370624154806137,
"learning_rate": 0.00010612244897959185,
"loss": 0.4525,
"step": 240
},
{
"epoch": 0.005835565437113029,
"grad_norm": 0.049909207969903946,
"learning_rate": 0.00010204081632653062,
"loss": 0.4497,
"step": 250
},
{
"epoch": 0.00606898805459755,
"grad_norm": 0.110743448138237,
"learning_rate": 9.79591836734694e-05,
"loss": 0.4837,
"step": 260
},
{
"epoch": 0.006302410672082071,
"grad_norm": 0.09541227668523788,
"learning_rate": 9.387755102040817e-05,
"loss": 0.49,
"step": 270
},
{
"epoch": 0.0065358332895665925,
"grad_norm": 0.05263066291809082,
"learning_rate": 8.979591836734695e-05,
"loss": 0.4437,
"step": 280
},
{
"epoch": 0.006769255907051114,
"grad_norm": 0.09211356937885284,
"learning_rate": 8.571428571428571e-05,
"loss": 0.4479,
"step": 290
},
{
"epoch": 0.007002678524535635,
"grad_norm": 0.05164729803800583,
"learning_rate": 8.163265306122449e-05,
"loss": 0.4329,
"step": 300
},
{
"epoch": 0.007236101142020156,
"grad_norm": 0.08837030827999115,
"learning_rate": 7.755102040816327e-05,
"loss": 0.4533,
"step": 310
},
{
"epoch": 0.0074695237595046775,
"grad_norm": 0.0369272343814373,
"learning_rate": 7.346938775510205e-05,
"loss": 0.3667,
"step": 320
},
{
"epoch": 0.007702946376989198,
"grad_norm": 0.059746578335762024,
"learning_rate": 6.938775510204082e-05,
"loss": 0.424,
"step": 330
},
{
"epoch": 0.00793636899447372,
"grad_norm": 0.04736114665865898,
"learning_rate": 6.530612244897959e-05,
"loss": 0.4538,
"step": 340
},
{
"epoch": 0.00816979161195824,
"grad_norm": 0.04814208671450615,
"learning_rate": 6.122448979591838e-05,
"loss": 0.4894,
"step": 350
},
{
"epoch": 0.008403214229442763,
"grad_norm": 0.04663668945431709,
"learning_rate": 5.714285714285714e-05,
"loss": 0.5158,
"step": 360
},
{
"epoch": 0.008636636846927283,
"grad_norm": 0.08329813182353973,
"learning_rate": 5.3061224489795926e-05,
"loss": 0.5901,
"step": 370
},
{
"epoch": 0.008870059464411804,
"grad_norm": 0.0947406217455864,
"learning_rate": 4.89795918367347e-05,
"loss": 0.4438,
"step": 380
},
{
"epoch": 0.009103482081896326,
"grad_norm": 0.048670731484889984,
"learning_rate": 4.4897959183673474e-05,
"loss": 0.4304,
"step": 390
},
{
"epoch": 0.009336904699380847,
"grad_norm": 0.12740883231163025,
"learning_rate": 4.0816326530612245e-05,
"loss": 0.5186,
"step": 400
},
{
"epoch": 0.009570327316865367,
"grad_norm": 0.13359272480010986,
"learning_rate": 3.673469387755102e-05,
"loss": 0.5146,
"step": 410
},
{
"epoch": 0.00980374993434989,
"grad_norm": 0.07435787469148636,
"learning_rate": 3.265306122448979e-05,
"loss": 0.4666,
"step": 420
},
{
"epoch": 0.01003717255183441,
"grad_norm": 0.05466726794838905,
"learning_rate": 2.857142857142857e-05,
"loss": 0.3812,
"step": 430
},
{
"epoch": 0.01027059516931893,
"grad_norm": 0.05390426889061928,
"learning_rate": 2.448979591836735e-05,
"loss": 0.4026,
"step": 440
},
{
"epoch": 0.010504017786803453,
"grad_norm": 0.055242184549570084,
"learning_rate": 2.0408163265306123e-05,
"loss": 0.437,
"step": 450
},
{
"epoch": 0.010737440404287974,
"grad_norm": 0.03994165360927582,
"learning_rate": 1.6326530612244897e-05,
"loss": 0.4343,
"step": 460
},
{
"epoch": 0.010970863021772494,
"grad_norm": 0.04847300797700882,
"learning_rate": 1.2244897959183674e-05,
"loss": 0.4618,
"step": 470
},
{
"epoch": 0.011204285639257015,
"grad_norm": 0.08686497807502747,
"learning_rate": 8.163265306122448e-06,
"loss": 0.4264,
"step": 480
},
{
"epoch": 0.011437708256741537,
"grad_norm": 0.09176526963710785,
"learning_rate": 4.081632653061224e-06,
"loss": 0.5168,
"step": 490
},
{
"epoch": 0.011671130874226058,
"grad_norm": 0.10465481132268906,
"learning_rate": 0.0,
"loss": 0.4519,
"step": 500
},
{
"epoch": 0.011904553491710579,
"grad_norm": 0.051657382398843765,
"learning_rate": 9.8989898989899e-05,
"loss": 0.4728,
"step": 510
},
{
"epoch": 0.0121379761091951,
"grad_norm": 0.062193650752305984,
"learning_rate": 9.696969696969698e-05,
"loss": 0.4483,
"step": 520
},
{
"epoch": 0.012371398726679621,
"grad_norm": 0.06362653523683548,
"learning_rate": 9.494949494949495e-05,
"loss": 0.4215,
"step": 530
},
{
"epoch": 0.012604821344164142,
"grad_norm": 0.06238653138279915,
"learning_rate": 9.292929292929293e-05,
"loss": 0.4224,
"step": 540
},
{
"epoch": 0.012838243961648664,
"grad_norm": 0.0477604866027832,
"learning_rate": 9.090909090909092e-05,
"loss": 0.448,
"step": 550
},
{
"epoch": 0.013071666579133185,
"grad_norm": 0.09850312024354935,
"learning_rate": 8.888888888888889e-05,
"loss": 0.4424,
"step": 560
},
{
"epoch": 0.013305089196617706,
"grad_norm": 0.06217048689723015,
"learning_rate": 8.686868686868688e-05,
"loss": 0.3644,
"step": 570
},
{
"epoch": 0.013538511814102228,
"grad_norm": 0.043189432471990585,
"learning_rate": 8.484848484848486e-05,
"loss": 0.4564,
"step": 580
},
{
"epoch": 0.013771934431586749,
"grad_norm": 0.10206077247858047,
"learning_rate": 8.282828282828283e-05,
"loss": 0.4176,
"step": 590
},
{
"epoch": 0.01400535704907127,
"grad_norm": 0.05712655559182167,
"learning_rate": 8.080808080808081e-05,
"loss": 0.3896,
"step": 600
},
{
"epoch": 0.014238779666555791,
"grad_norm": 0.04486239328980446,
"learning_rate": 7.878787878787879e-05,
"loss": 0.3761,
"step": 610
},
{
"epoch": 0.014472202284040312,
"grad_norm": 0.043401289731264114,
"learning_rate": 7.676767676767676e-05,
"loss": 0.4471,
"step": 620
},
{
"epoch": 0.014705624901524833,
"grad_norm": 0.4940922260284424,
"learning_rate": 7.474747474747475e-05,
"loss": 0.4569,
"step": 630
},
{
"epoch": 0.014939047519009355,
"grad_norm": 0.10270397365093231,
"learning_rate": 7.272727272727273e-05,
"loss": 0.4805,
"step": 640
},
{
"epoch": 0.015172470136493876,
"grad_norm": 0.13152533769607544,
"learning_rate": 7.07070707070707e-05,
"loss": 0.5194,
"step": 650
},
{
"epoch": 0.015405892753978396,
"grad_norm": 0.07382863759994507,
"learning_rate": 6.86868686868687e-05,
"loss": 0.4161,
"step": 660
},
{
"epoch": 0.015639315371462917,
"grad_norm": 0.08843934535980225,
"learning_rate": 6.666666666666667e-05,
"loss": 0.5265,
"step": 670
},
{
"epoch": 0.01587273798894744,
"grad_norm": 0.053686585277318954,
"learning_rate": 6.464646464646466e-05,
"loss": 0.4667,
"step": 680
},
{
"epoch": 0.01610616060643196,
"grad_norm": 0.05910225212574005,
"learning_rate": 6.262626262626264e-05,
"loss": 0.4254,
"step": 690
},
{
"epoch": 0.01633958322391648,
"grad_norm": 0.039652127772569656,
"learning_rate": 6.060606060606061e-05,
"loss": 0.4511,
"step": 700
},
{
"epoch": 0.016573005841401003,
"grad_norm": 0.0999956876039505,
"learning_rate": 5.858585858585859e-05,
"loss": 0.4396,
"step": 710
},
{
"epoch": 0.016806428458885525,
"grad_norm": 0.03926937282085419,
"learning_rate": 5.6565656565656563e-05,
"loss": 0.4178,
"step": 720
},
{
"epoch": 0.017039851076370044,
"grad_norm": 0.09462181478738785,
"learning_rate": 5.4545454545454546e-05,
"loss": 0.4092,
"step": 730
},
{
"epoch": 0.017273273693854566,
"grad_norm": 0.05022445321083069,
"learning_rate": 5.2525252525252536e-05,
"loss": 0.422,
"step": 740
},
{
"epoch": 0.01750669631133909,
"grad_norm": 0.10167255997657776,
"learning_rate": 5.050505050505051e-05,
"loss": 0.4028,
"step": 750
},
{
"epoch": 0.017740118928823607,
"grad_norm": 0.0910029336810112,
"learning_rate": 4.848484848484849e-05,
"loss": 0.4341,
"step": 760
},
{
"epoch": 0.01797354154630813,
"grad_norm": 0.047616615891456604,
"learning_rate": 4.6464646464646464e-05,
"loss": 0.411,
"step": 770
},
{
"epoch": 0.018206964163792652,
"grad_norm": 0.08828525990247726,
"learning_rate": 4.4444444444444447e-05,
"loss": 0.4616,
"step": 780
},
{
"epoch": 0.01844038678127717,
"grad_norm": 0.044807884842157364,
"learning_rate": 4.242424242424243e-05,
"loss": 0.4865,
"step": 790
},
{
"epoch": 0.018673809398761693,
"grad_norm": 0.08502307534217834,
"learning_rate": 4.0404040404040405e-05,
"loss": 0.4624,
"step": 800
},
{
"epoch": 0.018907232016246216,
"grad_norm": 0.1129627451300621,
"learning_rate": 3.838383838383838e-05,
"loss": 0.4338,
"step": 810
},
{
"epoch": 0.019140654633730735,
"grad_norm": 0.10634730011224747,
"learning_rate": 3.6363636363636364e-05,
"loss": 0.5142,
"step": 820
},
{
"epoch": 0.019374077251215257,
"grad_norm": 0.04792294651269913,
"learning_rate": 3.434343434343435e-05,
"loss": 0.4286,
"step": 830
},
{
"epoch": 0.01960749986869978,
"grad_norm": 0.046725083142519,
"learning_rate": 3.232323232323233e-05,
"loss": 0.4116,
"step": 840
},
{
"epoch": 0.019840922486184298,
"grad_norm": 0.052620138972997665,
"learning_rate": 3.0303030303030306e-05,
"loss": 0.4141,
"step": 850
},
{
"epoch": 0.02007434510366882,
"grad_norm": 0.10660973191261292,
"learning_rate": 2.8282828282828282e-05,
"loss": 0.4347,
"step": 860
},
{
"epoch": 0.020307767721153343,
"grad_norm": 0.0386226549744606,
"learning_rate": 2.6262626262626268e-05,
"loss": 0.4461,
"step": 870
},
{
"epoch": 0.02054119033863786,
"grad_norm": 0.07292847335338593,
"learning_rate": 2.4242424242424244e-05,
"loss": 0.4355,
"step": 880
},
{
"epoch": 0.020774612956122384,
"grad_norm": 0.06434612721204758,
"learning_rate": 2.2222222222222223e-05,
"loss": 0.5035,
"step": 890
},
{
"epoch": 0.021008035573606906,
"grad_norm": 0.10716721415519714,
"learning_rate": 2.0202020202020203e-05,
"loss": 0.3866,
"step": 900
},
{
"epoch": 0.021241458191091425,
"grad_norm": 0.04890590161085129,
"learning_rate": 1.8181818181818182e-05,
"loss": 0.3767,
"step": 910
},
{
"epoch": 0.021474880808575948,
"grad_norm": 0.06173992156982422,
"learning_rate": 1.6161616161616165e-05,
"loss": 0.4063,
"step": 920
},
{
"epoch": 0.021708303426060466,
"grad_norm": 0.053141020238399506,
"learning_rate": 1.4141414141414141e-05,
"loss": 0.4751,
"step": 930
},
{
"epoch": 0.02194172604354499,
"grad_norm": 0.05243794620037079,
"learning_rate": 1.2121212121212122e-05,
"loss": 0.4625,
"step": 940
},
{
"epoch": 0.02217514866102951,
"grad_norm": 0.061262525618076324,
"learning_rate": 1.0101010101010101e-05,
"loss": 0.4492,
"step": 950
},
{
"epoch": 0.02240857127851403,
"grad_norm": 0.10454926639795303,
"learning_rate": 8.080808080808082e-06,
"loss": 0.4522,
"step": 960
},
{
"epoch": 0.022641993895998552,
"grad_norm": 0.05348167195916176,
"learning_rate": 6.060606060606061e-06,
"loss": 0.434,
"step": 970
},
{
"epoch": 0.022875416513483075,
"grad_norm": 0.1315009742975235,
"learning_rate": 4.040404040404041e-06,
"loss": 0.5003,
"step": 980
},
{
"epoch": 0.023108839130967593,
"grad_norm": 0.0520632266998291,
"learning_rate": 2.0202020202020206e-06,
"loss": 0.3953,
"step": 990
},
{
"epoch": 0.023342261748452116,
"grad_norm": 0.08680278062820435,
"learning_rate": 0.0,
"loss": 0.432,
"step": 1000
}
],
"logging_steps": 10,
"max_steps": 1000,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 9.170835390089626e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}