{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.08804754567466432,
"eval_steps": 25,
"global_step": 100,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0008804754567466431,
"grad_norm": 7.369410991668701,
"learning_rate": 2e-05,
"loss": 12.8418,
"step": 1
},
{
"epoch": 0.0008804754567466431,
"eval_loss": 12.954377174377441,
"eval_runtime": 117.5732,
"eval_samples_per_second": 4.074,
"eval_steps_per_second": 2.041,
"step": 1
},
{
"epoch": 0.0017609509134932863,
"grad_norm": 7.059070110321045,
"learning_rate": 4e-05,
"loss": 11.8143,
"step": 2
},
{
"epoch": 0.0026414263702399295,
"grad_norm": 6.777024745941162,
"learning_rate": 6e-05,
"loss": 13.3913,
"step": 3
},
{
"epoch": 0.0035219018269865726,
"grad_norm": 7.668654918670654,
"learning_rate": 8e-05,
"loss": 12.8971,
"step": 4
},
{
"epoch": 0.004402377283733216,
"grad_norm": 9.06992244720459,
"learning_rate": 0.0001,
"loss": 12.9941,
"step": 5
},
{
"epoch": 0.005282852740479859,
"grad_norm": 12.04774284362793,
"learning_rate": 0.00012,
"loss": 12.8961,
"step": 6
},
{
"epoch": 0.0061633281972265025,
"grad_norm": 12.445601463317871,
"learning_rate": 0.00014,
"loss": 11.8487,
"step": 7
},
{
"epoch": 0.007043803653973145,
"grad_norm": 15.487993240356445,
"learning_rate": 0.00016,
"loss": 9.6627,
"step": 8
},
{
"epoch": 0.007924279110719789,
"grad_norm": 16.53350257873535,
"learning_rate": 0.00018,
"loss": 6.7407,
"step": 9
},
{
"epoch": 0.008804754567466431,
"grad_norm": 12.909174919128418,
"learning_rate": 0.0002,
"loss": 4.0144,
"step": 10
},
{
"epoch": 0.009685230024213076,
"grad_norm": 11.34801197052002,
"learning_rate": 0.0001999390827019096,
"loss": 2.473,
"step": 11
},
{
"epoch": 0.010565705480959718,
"grad_norm": 11.02859878540039,
"learning_rate": 0.00019975640502598244,
"loss": 2.9376,
"step": 12
},
{
"epoch": 0.01144618093770636,
"grad_norm": 13.622664451599121,
"learning_rate": 0.00019945218953682734,
"loss": 1.4846,
"step": 13
},
{
"epoch": 0.012326656394453005,
"grad_norm": 13.476359367370605,
"learning_rate": 0.00019902680687415705,
"loss": 1.1088,
"step": 14
},
{
"epoch": 0.013207131851199648,
"grad_norm": 10.137425422668457,
"learning_rate": 0.00019848077530122083,
"loss": 1.0127,
"step": 15
},
{
"epoch": 0.01408760730794629,
"grad_norm": 17.802549362182617,
"learning_rate": 0.00019781476007338058,
"loss": 2.9471,
"step": 16
},
{
"epoch": 0.014968082764692935,
"grad_norm": 12.524669647216797,
"learning_rate": 0.00019702957262759965,
"loss": 0.7211,
"step": 17
},
{
"epoch": 0.015848558221439577,
"grad_norm": 9.665834426879883,
"learning_rate": 0.0001961261695938319,
"loss": 1.6121,
"step": 18
},
{
"epoch": 0.01672903367818622,
"grad_norm": 8.007771492004395,
"learning_rate": 0.00019510565162951537,
"loss": 1.1061,
"step": 19
},
{
"epoch": 0.017609509134932862,
"grad_norm": 4.475860118865967,
"learning_rate": 0.00019396926207859084,
"loss": 0.6912,
"step": 20
},
{
"epoch": 0.01848998459167951,
"grad_norm": 4.555602550506592,
"learning_rate": 0.00019271838545667876,
"loss": 0.6382,
"step": 21
},
{
"epoch": 0.01937046004842615,
"grad_norm": 5.3586883544921875,
"learning_rate": 0.0001913545457642601,
"loss": 0.3292,
"step": 22
},
{
"epoch": 0.020250935505172794,
"grad_norm": 8.260420799255371,
"learning_rate": 0.0001898794046299167,
"loss": 1.0672,
"step": 23
},
{
"epoch": 0.021131410961919436,
"grad_norm": 11.399406433105469,
"learning_rate": 0.00018829475928589271,
"loss": 1.183,
"step": 24
},
{
"epoch": 0.02201188641866608,
"grad_norm": 6.585690021514893,
"learning_rate": 0.00018660254037844388,
"loss": 0.5138,
"step": 25
},
{
"epoch": 0.02201188641866608,
"eval_loss": 0.798155665397644,
"eval_runtime": 118.5422,
"eval_samples_per_second": 4.041,
"eval_steps_per_second": 2.025,
"step": 25
},
{
"epoch": 0.02289236187541272,
"grad_norm": 3.101707696914673,
"learning_rate": 0.0001848048096156426,
"loss": 0.6762,
"step": 26
},
{
"epoch": 0.023772837332159368,
"grad_norm": 4.0726189613342285,
"learning_rate": 0.00018290375725550417,
"loss": 0.4531,
"step": 27
},
{
"epoch": 0.02465331278890601,
"grad_norm": 7.052011013031006,
"learning_rate": 0.00018090169943749476,
"loss": 1.0759,
"step": 28
},
{
"epoch": 0.025533788245652653,
"grad_norm": 1.7110868692398071,
"learning_rate": 0.00017880107536067218,
"loss": 0.4769,
"step": 29
},
{
"epoch": 0.026414263702399295,
"grad_norm": 2.779916763305664,
"learning_rate": 0.0001766044443118978,
"loss": 0.5725,
"step": 30
},
{
"epoch": 0.027294739159145938,
"grad_norm": 4.19709587097168,
"learning_rate": 0.00017431448254773944,
"loss": 0.6394,
"step": 31
},
{
"epoch": 0.02817521461589258,
"grad_norm": 6.238762378692627,
"learning_rate": 0.0001719339800338651,
"loss": 0.7417,
"step": 32
},
{
"epoch": 0.029055690072639227,
"grad_norm": 4.305187702178955,
"learning_rate": 0.00016946583704589973,
"loss": 0.754,
"step": 33
},
{
"epoch": 0.02993616552938587,
"grad_norm": 13.725184440612793,
"learning_rate": 0.00016691306063588583,
"loss": 2.4752,
"step": 34
},
{
"epoch": 0.030816640986132512,
"grad_norm": 2.0460703372955322,
"learning_rate": 0.00016427876096865394,
"loss": 0.4011,
"step": 35
},
{
"epoch": 0.031697116442879154,
"grad_norm": 1.824874758720398,
"learning_rate": 0.0001615661475325658,
"loss": 0.4695,
"step": 36
},
{
"epoch": 0.0325775918996258,
"grad_norm": 1.9817159175872803,
"learning_rate": 0.00015877852522924732,
"loss": 0.3925,
"step": 37
},
{
"epoch": 0.03345806735637244,
"grad_norm": 1.2084903717041016,
"learning_rate": 0.0001559192903470747,
"loss": 0.38,
"step": 38
},
{
"epoch": 0.034338542813119086,
"grad_norm": 5.959559440612793,
"learning_rate": 0.0001529919264233205,
"loss": 1.468,
"step": 39
},
{
"epoch": 0.035219018269865725,
"grad_norm": 2.2862613201141357,
"learning_rate": 0.00015000000000000001,
"loss": 0.5494,
"step": 40
},
{
"epoch": 0.03609949372661237,
"grad_norm": 1.3760128021240234,
"learning_rate": 0.00014694715627858908,
"loss": 0.3777,
"step": 41
},
{
"epoch": 0.03697996918335902,
"grad_norm": 2.4577219486236572,
"learning_rate": 0.00014383711467890774,
"loss": 0.6612,
"step": 42
},
{
"epoch": 0.037860444640105656,
"grad_norm": 4.882229328155518,
"learning_rate": 0.00014067366430758004,
"loss": 0.9137,
"step": 43
},
{
"epoch": 0.0387409200968523,
"grad_norm": 1.15999174118042,
"learning_rate": 0.00013746065934159123,
"loss": 0.2879,
"step": 44
},
{
"epoch": 0.03962139555359894,
"grad_norm": 3.1867716312408447,
"learning_rate": 0.00013420201433256689,
"loss": 0.7579,
"step": 45
},
{
"epoch": 0.04050187101034559,
"grad_norm": 3.6683099269866943,
"learning_rate": 0.00013090169943749476,
"loss": 0.5053,
"step": 46
},
{
"epoch": 0.041382346467092226,
"grad_norm": 1.5074632167816162,
"learning_rate": 0.0001275637355816999,
"loss": 0.5215,
"step": 47
},
{
"epoch": 0.04226282192383887,
"grad_norm": 1.734575867652893,
"learning_rate": 0.00012419218955996676,
"loss": 0.4402,
"step": 48
},
{
"epoch": 0.04314329738058552,
"grad_norm": 3.334616184234619,
"learning_rate": 0.00012079116908177593,
"loss": 0.5652,
"step": 49
},
{
"epoch": 0.04402377283733216,
"grad_norm": 3.017249822616577,
"learning_rate": 0.00011736481776669306,
"loss": 0.5196,
"step": 50
},
{
"epoch": 0.04402377283733216,
"eval_loss": 0.536153256893158,
"eval_runtime": 118.572,
"eval_samples_per_second": 4.04,
"eval_steps_per_second": 2.024,
"step": 50
},
{
"epoch": 0.044904248294078804,
"grad_norm": 2.4015002250671387,
"learning_rate": 0.00011391731009600654,
"loss": 0.8285,
"step": 51
},
{
"epoch": 0.04578472375082544,
"grad_norm": 1.8211569786071777,
"learning_rate": 0.00011045284632676536,
"loss": 0.7426,
"step": 52
},
{
"epoch": 0.04666519920757209,
"grad_norm": 1.4902381896972656,
"learning_rate": 0.00010697564737441252,
"loss": 0.4468,
"step": 53
},
{
"epoch": 0.047545674664318735,
"grad_norm": 1.00511634349823,
"learning_rate": 0.00010348994967025012,
"loss": 0.4147,
"step": 54
},
{
"epoch": 0.048426150121065374,
"grad_norm": 2.31351637840271,
"learning_rate": 0.0001,
"loss": 0.5817,
"step": 55
},
{
"epoch": 0.04930662557781202,
"grad_norm": 2.334179162979126,
"learning_rate": 9.651005032974994e-05,
"loss": 0.3195,
"step": 56
},
{
"epoch": 0.05018710103455866,
"grad_norm": 1.681875228881836,
"learning_rate": 9.302435262558747e-05,
"loss": 0.7227,
"step": 57
},
{
"epoch": 0.051067576491305305,
"grad_norm": 2.5601327419281006,
"learning_rate": 8.954715367323468e-05,
"loss": 0.6287,
"step": 58
},
{
"epoch": 0.05194805194805195,
"grad_norm": 2.3683104515075684,
"learning_rate": 8.608268990399349e-05,
"loss": 0.7883,
"step": 59
},
{
"epoch": 0.05282852740479859,
"grad_norm": 1.4547897577285767,
"learning_rate": 8.263518223330697e-05,
"loss": 0.5512,
"step": 60
},
{
"epoch": 0.05370900286154524,
"grad_norm": 2.2935614585876465,
"learning_rate": 7.920883091822408e-05,
"loss": 0.5621,
"step": 61
},
{
"epoch": 0.054589478318291876,
"grad_norm": 1.791729211807251,
"learning_rate": 7.580781044003324e-05,
"loss": 0.5211,
"step": 62
},
{
"epoch": 0.05546995377503852,
"grad_norm": 1.9423251152038574,
"learning_rate": 7.243626441830009e-05,
"loss": 0.5148,
"step": 63
},
{
"epoch": 0.05635042923178516,
"grad_norm": 1.6972748041152954,
"learning_rate": 6.909830056250527e-05,
"loss": 0.5665,
"step": 64
},
{
"epoch": 0.05723090468853181,
"grad_norm": 1.5219111442565918,
"learning_rate": 6.579798566743314e-05,
"loss": 0.4329,
"step": 65
},
{
"epoch": 0.05811138014527845,
"grad_norm": 3.782133102416992,
"learning_rate": 6.25393406584088e-05,
"loss": 0.5416,
"step": 66
},
{
"epoch": 0.05899185560202509,
"grad_norm": 3.464118719100952,
"learning_rate": 5.9326335692419995e-05,
"loss": 0.6234,
"step": 67
},
{
"epoch": 0.05987233105877174,
"grad_norm": 5.132722854614258,
"learning_rate": 5.616288532109225e-05,
"loss": 0.7638,
"step": 68
},
{
"epoch": 0.06075280651551838,
"grad_norm": 2.7802538871765137,
"learning_rate": 5.305284372141095e-05,
"loss": 0.4478,
"step": 69
},
{
"epoch": 0.061633281972265024,
"grad_norm": 2.032424211502075,
"learning_rate": 5.000000000000002e-05,
"loss": 0.4982,
"step": 70
},
{
"epoch": 0.06251375742901166,
"grad_norm": 1.9181748628616333,
"learning_rate": 4.700807357667952e-05,
"loss": 0.4599,
"step": 71
},
{
"epoch": 0.06339423288575831,
"grad_norm": 2.8431079387664795,
"learning_rate": 4.4080709652925336e-05,
"loss": 0.5564,
"step": 72
},
{
"epoch": 0.06427470834250495,
"grad_norm": 2.748572587966919,
"learning_rate": 4.12214747707527e-05,
"loss": 0.5392,
"step": 73
},
{
"epoch": 0.0651551837992516,
"grad_norm": 3.2076663970947266,
"learning_rate": 3.843385246743417e-05,
"loss": 0.7945,
"step": 74
},
{
"epoch": 0.06603565925599823,
"grad_norm": 0.9875300526618958,
"learning_rate": 3.5721239031346066e-05,
"loss": 0.388,
"step": 75
},
{
"epoch": 0.06603565925599823,
"eval_loss": 0.4547451436519623,
"eval_runtime": 118.536,
"eval_samples_per_second": 4.041,
"eval_steps_per_second": 2.025,
"step": 75
},
{
"epoch": 0.06691613471274488,
"grad_norm": 1.0145089626312256,
"learning_rate": 3.308693936411421e-05,
"loss": 0.4273,
"step": 76
},
{
"epoch": 0.06779661016949153,
"grad_norm": 1.8886373043060303,
"learning_rate": 3.053416295410026e-05,
"loss": 0.4417,
"step": 77
},
{
"epoch": 0.06867708562623817,
"grad_norm": 1.1678515672683716,
"learning_rate": 2.8066019966134904e-05,
"loss": 0.4121,
"step": 78
},
{
"epoch": 0.06955756108298482,
"grad_norm": 3.4151155948638916,
"learning_rate": 2.5685517452260567e-05,
"loss": 0.5006,
"step": 79
},
{
"epoch": 0.07043803653973145,
"grad_norm": 1.1192302703857422,
"learning_rate": 2.339555568810221e-05,
"loss": 0.3849,
"step": 80
},
{
"epoch": 0.0713185119964781,
"grad_norm": 1.1415249109268188,
"learning_rate": 2.119892463932781e-05,
"loss": 0.39,
"step": 81
},
{
"epoch": 0.07219898745322474,
"grad_norm": 1.7569003105163574,
"learning_rate": 1.9098300562505266e-05,
"loss": 0.6394,
"step": 82
},
{
"epoch": 0.07307946290997139,
"grad_norm": 3.5374794006347656,
"learning_rate": 1.7096242744495837e-05,
"loss": 1.0147,
"step": 83
},
{
"epoch": 0.07395993836671803,
"grad_norm": 0.9409904479980469,
"learning_rate": 1.5195190384357404e-05,
"loss": 0.3722,
"step": 84
},
{
"epoch": 0.07484041382346467,
"grad_norm": 1.6608474254608154,
"learning_rate": 1.339745962155613e-05,
"loss": 0.4311,
"step": 85
},
{
"epoch": 0.07572088928021131,
"grad_norm": 2.132925271987915,
"learning_rate": 1.1705240714107302e-05,
"loss": 0.6121,
"step": 86
},
{
"epoch": 0.07660136473695796,
"grad_norm": 1.2028199434280396,
"learning_rate": 1.0120595370083318e-05,
"loss": 0.33,
"step": 87
},
{
"epoch": 0.0774818401937046,
"grad_norm": 1.059144139289856,
"learning_rate": 8.645454235739903e-06,
"loss": 0.458,
"step": 88
},
{
"epoch": 0.07836231565045125,
"grad_norm": 1.6935112476348877,
"learning_rate": 7.281614543321269e-06,
"loss": 0.5158,
"step": 89
},
{
"epoch": 0.07924279110719788,
"grad_norm": 1.4320807456970215,
"learning_rate": 6.030737921409169e-06,
"loss": 0.4924,
"step": 90
},
{
"epoch": 0.08012326656394453,
"grad_norm": 2.2760002613067627,
"learning_rate": 4.8943483704846475e-06,
"loss": 0.4067,
"step": 91
},
{
"epoch": 0.08100374202069117,
"grad_norm": 2.197097063064575,
"learning_rate": 3.873830406168111e-06,
"loss": 0.6285,
"step": 92
},
{
"epoch": 0.08188421747743782,
"grad_norm": 1.5171617269515991,
"learning_rate": 2.970427372400353e-06,
"loss": 0.5067,
"step": 93
},
{
"epoch": 0.08276469293418445,
"grad_norm": 1.208709478378296,
"learning_rate": 2.1852399266194314e-06,
"loss": 0.419,
"step": 94
},
{
"epoch": 0.0836451683909311,
"grad_norm": 1.862436056137085,
"learning_rate": 1.5192246987791981e-06,
"loss": 0.567,
"step": 95
},
{
"epoch": 0.08452564384767774,
"grad_norm": 1.5693188905715942,
"learning_rate": 9.731931258429638e-07,
"loss": 0.6098,
"step": 96
},
{
"epoch": 0.08540611930442439,
"grad_norm": 2.0004355907440186,
"learning_rate": 5.478104631726711e-07,
"loss": 0.5554,
"step": 97
},
{
"epoch": 0.08628659476117104,
"grad_norm": 1.4339842796325684,
"learning_rate": 2.4359497401758024e-07,
"loss": 0.3237,
"step": 98
},
{
"epoch": 0.08716707021791767,
"grad_norm": 2.115415573120117,
"learning_rate": 6.09172980904238e-08,
"loss": 0.5879,
"step": 99
},
{
"epoch": 0.08804754567466432,
"grad_norm": 15.940089225769043,
"learning_rate": 0.0,
"loss": 0.7996,
"step": 100
},
{
"epoch": 0.08804754567466432,
"eval_loss": 0.4441887140274048,
"eval_runtime": 118.611,
"eval_samples_per_second": 4.038,
"eval_steps_per_second": 2.023,
"step": 100
}
],
"logging_steps": 1,
"max_steps": 100,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 25,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.318461275897856e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}