|
{
  "best_metric": 0.9695621132850647,
  "best_model_checkpoint": "/kaggle/working/LLaMA-Factory/output/dolphin-2.9-llama3-8b-GER/checkpoint-400",
  "epoch": 0.4711425206124853,
  "eval_steps": 100,
  "global_step": 400,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
"log_history": [ |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 0.5445218086242676, |
|
"learning_rate": 2.5e-06, |
|
"loss": 1.2105, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 0.4362960457801819, |
|
"learning_rate": 5e-06, |
|
"loss": 0.9311, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 0.5098525881767273, |
|
"learning_rate": 7.5e-06, |
|
"loss": 1.3604, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 1.0830402374267578, |
|
"learning_rate": 1e-05, |
|
"loss": 1.5808, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.5936903953552246, |
|
"learning_rate": 1.25e-05, |
|
"loss": 1.248, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.7212775945663452, |
|
"learning_rate": 1.5e-05, |
|
"loss": 1.2712, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.4733128845691681, |
|
"learning_rate": 1.75e-05, |
|
"loss": 1.3005, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.6236618757247925, |
|
"learning_rate": 2e-05, |
|
"loss": 1.3398, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.5326029658317566, |
|
"learning_rate": 2.25e-05, |
|
"loss": 1.0476, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.5021428465843201, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.1245, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.46588796377182007, |
|
"learning_rate": 2.7500000000000004e-05, |
|
"loss": 1.1041, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.936045229434967, |
|
"learning_rate": 3e-05, |
|
"loss": 1.3201, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.5649005770683289, |
|
"learning_rate": 3.2500000000000004e-05, |
|
"loss": 1.1596, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.5679148435592651, |
|
"learning_rate": 3.5e-05, |
|
"loss": 1.2501, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.8638319373130798, |
|
"learning_rate": 3.7500000000000003e-05, |
|
"loss": 1.237, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.5722486972808838, |
|
"learning_rate": 4e-05, |
|
"loss": 1.2616, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.5802999138832092, |
|
"learning_rate": 4.25e-05, |
|
"loss": 0.9578, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.2806150913238525, |
|
"learning_rate": 4.5e-05, |
|
"loss": 1.3815, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.5839210748672485, |
|
"learning_rate": 4.75e-05, |
|
"loss": 1.2619, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.7132797837257385, |
|
"learning_rate": 5e-05, |
|
"loss": 1.2842, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.7936571836471558, |
|
"learning_rate": 5.25e-05, |
|
"loss": 1.3991, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.7268731594085693, |
|
"learning_rate": 5.500000000000001e-05, |
|
"loss": 1.2217, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.6649657487869263, |
|
"learning_rate": 5.7499999999999995e-05, |
|
"loss": 1.2485, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.45912331342697144, |
|
"learning_rate": 6e-05, |
|
"loss": 1.1452, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.49475613236427307, |
|
"learning_rate": 6.25e-05, |
|
"loss": 1.256, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.6027999520301819, |
|
"learning_rate": 6.500000000000001e-05, |
|
"loss": 1.2267, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.48380520939826965, |
|
"learning_rate": 6.750000000000001e-05, |
|
"loss": 1.202, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.6623361706733704, |
|
"learning_rate": 7e-05, |
|
"loss": 1.3924, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.6632773280143738, |
|
"learning_rate": 7.25e-05, |
|
"loss": 1.2443, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.5974243879318237, |
|
"learning_rate": 7.500000000000001e-05, |
|
"loss": 1.0591, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.7902927398681641, |
|
"learning_rate": 7.75e-05, |
|
"loss": 1.0192, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.7081462144851685, |
|
"learning_rate": 8e-05, |
|
"loss": 1.0955, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.4744333326816559, |
|
"learning_rate": 8.25e-05, |
|
"loss": 1.0564, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.703999400138855, |
|
"learning_rate": 8.5e-05, |
|
"loss": 1.1022, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.5378603339195251, |
|
"learning_rate": 8.75e-05, |
|
"loss": 1.063, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.6105419397354126, |
|
"learning_rate": 9e-05, |
|
"loss": 0.9748, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.6250177025794983, |
|
"learning_rate": 9.250000000000001e-05, |
|
"loss": 0.9951, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.6306005716323853, |
|
"learning_rate": 9.5e-05, |
|
"loss": 0.9875, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.8635210990905762, |
|
"learning_rate": 9.75e-05, |
|
"loss": 1.1825, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.5938963890075684, |
|
"learning_rate": 0.0001, |
|
"loss": 0.9678, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.5765062570571899, |
|
"learning_rate": 0.0001025, |
|
"loss": 1.1031, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.5162666440010071, |
|
"learning_rate": 0.000105, |
|
"loss": 1.0831, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.8675170540809631, |
|
"learning_rate": 0.0001075, |
|
"loss": 1.0034, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.44026264548301697, |
|
"learning_rate": 0.00011000000000000002, |
|
"loss": 0.9607, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.4922897219657898, |
|
"learning_rate": 0.00011250000000000001, |
|
"loss": 1.0251, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.5505048632621765, |
|
"learning_rate": 0.00011499999999999999, |
|
"loss": 0.6948, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.5226988792419434, |
|
"learning_rate": 0.00011750000000000001, |
|
"loss": 1.2259, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.5105571150779724, |
|
"learning_rate": 0.00012, |
|
"loss": 1.1239, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 2.0914857387542725, |
|
"learning_rate": 0.00012250000000000002, |
|
"loss": 1.4032, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.4611998498439789, |
|
"learning_rate": 0.000125, |
|
"loss": 1.1481, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.5243968367576599, |
|
"learning_rate": 0.0001275, |
|
"loss": 1.0514, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.48045778274536133, |
|
"learning_rate": 0.00013000000000000002, |
|
"loss": 1.1552, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.5543797612190247, |
|
"learning_rate": 0.0001325, |
|
"loss": 1.035, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.5148913264274597, |
|
"learning_rate": 0.00013500000000000003, |
|
"loss": 0.9244, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.5708130598068237, |
|
"learning_rate": 0.0001375, |
|
"loss": 1.2272, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.7507888078689575, |
|
"learning_rate": 0.00014, |
|
"loss": 1.1196, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.5411068201065063, |
|
"learning_rate": 0.00014250000000000002, |
|
"loss": 1.1444, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.7314138412475586, |
|
"learning_rate": 0.000145, |
|
"loss": 1.3451, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.5134022831916809, |
|
"learning_rate": 0.0001475, |
|
"loss": 0.8021, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.7859703302383423, |
|
"learning_rate": 0.00015000000000000001, |
|
"loss": 0.952, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.4740566611289978, |
|
"learning_rate": 0.0001525, |
|
"loss": 0.8878, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.8245677947998047, |
|
"learning_rate": 0.000155, |
|
"loss": 1.4341, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.5067218542098999, |
|
"learning_rate": 0.0001575, |
|
"loss": 0.8404, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.40824806690216064, |
|
"learning_rate": 0.00016, |
|
"loss": 1.0943, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.428151935338974, |
|
"learning_rate": 0.00016250000000000002, |
|
"loss": 1.0819, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.437532901763916, |
|
"learning_rate": 0.000165, |
|
"loss": 1.0461, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.4619809091091156, |
|
"learning_rate": 0.0001675, |
|
"loss": 1.0232, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 1.3247042894363403, |
|
"learning_rate": 0.00017, |
|
"loss": 1.218, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.5896454453468323, |
|
"learning_rate": 0.00017250000000000002, |
|
"loss": 1.1953, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.5605736970901489, |
|
"learning_rate": 0.000175, |
|
"loss": 1.2542, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.36923155188560486, |
|
"learning_rate": 0.0001775, |
|
"loss": 0.7096, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.6809273362159729, |
|
"learning_rate": 0.00018, |
|
"loss": 1.1959, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.42645493149757385, |
|
"learning_rate": 0.0001825, |
|
"loss": 1.1829, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.7621450424194336, |
|
"learning_rate": 0.00018500000000000002, |
|
"loss": 1.1599, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.5299301743507385, |
|
"learning_rate": 0.0001875, |
|
"loss": 1.0546, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.484611451625824, |
|
"learning_rate": 0.00019, |
|
"loss": 1.0854, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.4027417004108429, |
|
"learning_rate": 0.00019250000000000002, |
|
"loss": 0.7812, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.7537221908569336, |
|
"learning_rate": 0.000195, |
|
"loss": 0.9719, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.45811474323272705, |
|
"learning_rate": 0.00019750000000000003, |
|
"loss": 0.9684, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.5426852107048035, |
|
"learning_rate": 0.0002, |
|
"loss": 0.9451, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.48964977264404297, |
|
"learning_rate": 0.000199739921976593, |
|
"loss": 1.0807, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.6225113272666931, |
|
"learning_rate": 0.00019947984395318597, |
|
"loss": 1.1913, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.47157713770866394, |
|
"learning_rate": 0.00019921976592977893, |
|
"loss": 1.0392, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.42048129439353943, |
|
"learning_rate": 0.0001989596879063719, |
|
"loss": 0.8439, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.4129664897918701, |
|
"learning_rate": 0.0001986996098829649, |
|
"loss": 1.1025, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.5610629320144653, |
|
"learning_rate": 0.00019843953185955788, |
|
"loss": 0.9368, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.4288642406463623, |
|
"learning_rate": 0.00019817945383615086, |
|
"loss": 0.9711, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.572507917881012, |
|
"learning_rate": 0.0001979193758127438, |
|
"loss": 0.9739, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.5115912556648254, |
|
"learning_rate": 0.0001976592977893368, |
|
"loss": 0.9356, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.608063280582428, |
|
"learning_rate": 0.00019739921976592978, |
|
"loss": 1.2459, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 1.0171502828598022, |
|
"learning_rate": 0.00019713914174252276, |
|
"loss": 1.0629, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.514552116394043, |
|
"learning_rate": 0.00019687906371911574, |
|
"loss": 0.9646, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.4363428056240082, |
|
"learning_rate": 0.00019661898569570872, |
|
"loss": 0.8659, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.4470008611679077, |
|
"learning_rate": 0.0001963589076723017, |
|
"loss": 1.0338, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.5710871815681458, |
|
"learning_rate": 0.0001960988296488947, |
|
"loss": 0.946, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.6456964612007141, |
|
"learning_rate": 0.00019583875162548767, |
|
"loss": 1.0956, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.45742425322532654, |
|
"learning_rate": 0.00019557867360208065, |
|
"loss": 0.7985, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.9953218698501587, |
|
"learning_rate": 0.0001953185955786736, |
|
"loss": 0.9476, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.5075244307518005, |
|
"learning_rate": 0.0001950585175552666, |
|
"loss": 1.1704, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.7076822519302368, |
|
"learning_rate": 0.00019479843953185957, |
|
"loss": 1.2054, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"eval_loss": 1.0368528366088867, |
|
"eval_runtime": 547.1826, |
|
"eval_samples_per_second": 2.76, |
|
"eval_steps_per_second": 1.38, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.7408495545387268, |
|
"learning_rate": 0.00019453836150845255, |
|
"loss": 1.3149, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.5010321736335754, |
|
"learning_rate": 0.00019427828348504554, |
|
"loss": 0.8483, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.5388755798339844, |
|
"learning_rate": 0.0001940182054616385, |
|
"loss": 1.034, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.32918357849121094, |
|
"learning_rate": 0.00019375812743823147, |
|
"loss": 1.0762, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.4812282919883728, |
|
"learning_rate": 0.00019349804941482446, |
|
"loss": 1.0797, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.6160157918930054, |
|
"learning_rate": 0.00019323797139141744, |
|
"loss": 0.9488, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.41306182742118835, |
|
"learning_rate": 0.00019297789336801042, |
|
"loss": 1.0742, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.5411396026611328, |
|
"learning_rate": 0.00019271781534460338, |
|
"loss": 1.1162, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.45688003301620483, |
|
"learning_rate": 0.00019245773732119636, |
|
"loss": 0.8042, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.4660446047782898, |
|
"learning_rate": 0.00019219765929778934, |
|
"loss": 0.8234, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.6277762055397034, |
|
"learning_rate": 0.00019193758127438232, |
|
"loss": 1.0324, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.6119320392608643, |
|
"learning_rate": 0.0001916775032509753, |
|
"loss": 0.6912, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.5463618040084839, |
|
"learning_rate": 0.00019141742522756826, |
|
"loss": 1.1297, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.4918029010295868, |
|
"learning_rate": 0.00019115734720416124, |
|
"loss": 1.0857, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.4734187424182892, |
|
"learning_rate": 0.00019089726918075422, |
|
"loss": 1.0246, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.5416173934936523, |
|
"learning_rate": 0.0001906371911573472, |
|
"loss": 1.0262, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.486284464597702, |
|
"learning_rate": 0.0001903771131339402, |
|
"loss": 1.2198, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.4003067910671234, |
|
"learning_rate": 0.00019011703511053317, |
|
"loss": 0.8525, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.38142916560173035, |
|
"learning_rate": 0.00018985695708712615, |
|
"loss": 1.1475, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.42364147305488586, |
|
"learning_rate": 0.00018959687906371913, |
|
"loss": 0.8207, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.384432852268219, |
|
"learning_rate": 0.00018933680104031212, |
|
"loss": 0.9451, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.38188567757606506, |
|
"learning_rate": 0.0001890767230169051, |
|
"loss": 0.921, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.4212167263031006, |
|
"learning_rate": 0.00018881664499349805, |
|
"loss": 0.8532, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.7674171924591064, |
|
"learning_rate": 0.00018855656697009104, |
|
"loss": 1.1751, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.4111984074115753, |
|
"learning_rate": 0.00018829648894668402, |
|
"loss": 0.906, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.6682639718055725, |
|
"learning_rate": 0.000188036410923277, |
|
"loss": 1.1284, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.3887438178062439, |
|
"learning_rate": 0.00018777633289986998, |
|
"loss": 1.0324, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.5829350352287292, |
|
"learning_rate": 0.00018751625487646294, |
|
"loss": 1.038, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.5861855149269104, |
|
"learning_rate": 0.00018725617685305592, |
|
"loss": 0.9453, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.40044718980789185, |
|
"learning_rate": 0.0001869960988296489, |
|
"loss": 1.1971, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.4322098195552826, |
|
"learning_rate": 0.00018673602080624188, |
|
"loss": 0.9792, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.4132952392101288, |
|
"learning_rate": 0.00018647594278283487, |
|
"loss": 0.9597, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.4141194224357605, |
|
"learning_rate": 0.00018621586475942782, |
|
"loss": 1.0785, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.5526332855224609, |
|
"learning_rate": 0.0001859557867360208, |
|
"loss": 1.0762, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.6953147053718567, |
|
"learning_rate": 0.00018569570871261379, |
|
"loss": 1.1048, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.4306538701057434, |
|
"learning_rate": 0.00018543563068920677, |
|
"loss": 0.9798, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.32605746388435364, |
|
"learning_rate": 0.00018517555266579975, |
|
"loss": 0.9825, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.48711439967155457, |
|
"learning_rate": 0.0001849154746423927, |
|
"loss": 0.9685, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.7728849649429321, |
|
"learning_rate": 0.0001846553966189857, |
|
"loss": 1.2843, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.38121744990348816, |
|
"learning_rate": 0.00018439531859557867, |
|
"loss": 0.8289, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.5053502321243286, |
|
"learning_rate": 0.00018413524057217165, |
|
"loss": 1.0824, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.44003796577453613, |
|
"learning_rate": 0.00018387516254876463, |
|
"loss": 1.0957, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.35713884234428406, |
|
"learning_rate": 0.00018361508452535762, |
|
"loss": 0.7869, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.6612699627876282, |
|
"learning_rate": 0.0001833550065019506, |
|
"loss": 1.1621, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.4270091950893402, |
|
"learning_rate": 0.00018309492847854358, |
|
"loss": 0.8964, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.42207202315330505, |
|
"learning_rate": 0.00018283485045513656, |
|
"loss": 1.1403, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.48486125469207764, |
|
"learning_rate": 0.00018257477243172954, |
|
"loss": 0.9993, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.4205480217933655, |
|
"learning_rate": 0.0001823146944083225, |
|
"loss": 1.0404, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.3420919179916382, |
|
"learning_rate": 0.00018205461638491548, |
|
"loss": 1.1434, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.6411290168762207, |
|
"learning_rate": 0.00018179453836150846, |
|
"loss": 1.2055, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.7193886637687683, |
|
"learning_rate": 0.00018153446033810145, |
|
"loss": 0.9436, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.34661969542503357, |
|
"learning_rate": 0.00018127438231469443, |
|
"loss": 0.8158, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.4028843939304352, |
|
"learning_rate": 0.00018101430429128738, |
|
"loss": 0.9863, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.3898251950740814, |
|
"learning_rate": 0.00018075422626788037, |
|
"loss": 0.9215, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.3827955424785614, |
|
"learning_rate": 0.00018049414824447335, |
|
"loss": 0.9513, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.3840239644050598, |
|
"learning_rate": 0.00018023407022106633, |
|
"loss": 0.8303, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.42413073778152466, |
|
"learning_rate": 0.0001799739921976593, |
|
"loss": 0.8909, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.47216784954071045, |
|
"learning_rate": 0.00017971391417425227, |
|
"loss": 1.0413, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.35447853803634644, |
|
"learning_rate": 0.00017945383615084525, |
|
"loss": 0.9781, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.46554017066955566, |
|
"learning_rate": 0.00017919375812743823, |
|
"loss": 0.7186, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.3696407973766327, |
|
"learning_rate": 0.00017893368010403121, |
|
"loss": 1.0137, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.5633952617645264, |
|
"learning_rate": 0.0001786736020806242, |
|
"loss": 1.0944, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.3334963917732239, |
|
"learning_rate": 0.00017841352405721715, |
|
"loss": 1.2606, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.4501505494117737, |
|
"learning_rate": 0.00017815344603381013, |
|
"loss": 0.7306, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.44553884863853455, |
|
"learning_rate": 0.00017789336801040312, |
|
"loss": 0.871, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.474342405796051, |
|
"learning_rate": 0.0001776332899869961, |
|
"loss": 0.7593, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.7258349061012268, |
|
"learning_rate": 0.00017737321196358908, |
|
"loss": 1.4035, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.43586501479148865, |
|
"learning_rate": 0.00017711313394018206, |
|
"loss": 1.1679, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.4580634832382202, |
|
"learning_rate": 0.00017685305591677504, |
|
"loss": 0.9496, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.40509316325187683, |
|
"learning_rate": 0.00017659297789336803, |
|
"loss": 0.9515, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.3496972620487213, |
|
"learning_rate": 0.000176332899869961, |
|
"loss": 1.1846, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.6468595862388611, |
|
"learning_rate": 0.000176072821846554, |
|
"loss": 1.0307, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.3840126693248749, |
|
"learning_rate": 0.00017581274382314695, |
|
"loss": 1.0248, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.4715554118156433, |
|
"learning_rate": 0.00017555266579973993, |
|
"loss": 1.1017, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.38534194231033325, |
|
"learning_rate": 0.0001752925877763329, |
|
"loss": 1.0073, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.3881916403770447, |
|
"learning_rate": 0.0001750325097529259, |
|
"loss": 0.8961, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.49220091104507446, |
|
"learning_rate": 0.00017477243172951887, |
|
"loss": 1.0184, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.38170936703681946, |
|
"learning_rate": 0.00017451235370611186, |
|
"loss": 1.0939, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.3692156672477722, |
|
"learning_rate": 0.0001742522756827048, |
|
"loss": 1.085, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.43161171674728394, |
|
"learning_rate": 0.0001739921976592978, |
|
"loss": 0.9098, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.3874380886554718, |
|
"learning_rate": 0.00017373211963589078, |
|
"loss": 1.0108, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.37556958198547363, |
|
"learning_rate": 0.00017347204161248376, |
|
"loss": 1.0344, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.4288795292377472, |
|
"learning_rate": 0.00017321196358907674, |
|
"loss": 0.9329, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.6587222218513489, |
|
"learning_rate": 0.0001729518855656697, |
|
"loss": 1.0312, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.6700361371040344, |
|
"learning_rate": 0.00017269180754226268, |
|
"loss": 1.035, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.5208828449249268, |
|
"learning_rate": 0.00017243172951885566, |
|
"loss": 1.0334, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.5774813294410706, |
|
"learning_rate": 0.00017217165149544864, |
|
"loss": 1.0366, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.3492725193500519, |
|
"learning_rate": 0.00017191157347204162, |
|
"loss": 1.0225, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.3714265525341034, |
|
"learning_rate": 0.00017165149544863458, |
|
"loss": 1.0355, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.4425973892211914, |
|
"learning_rate": 0.00017139141742522756, |
|
"loss": 0.974, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.43257299065589905, |
|
"learning_rate": 0.00017113133940182054, |
|
"loss": 0.8614, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.37084296345710754, |
|
"learning_rate": 0.00017087126137841353, |
|
"loss": 0.7135, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.6695109009742737, |
|
"learning_rate": 0.0001706111833550065, |
|
"loss": 0.7722, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.5144183039665222, |
|
"learning_rate": 0.0001703511053315995, |
|
"loss": 0.9364, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.58673095703125, |
|
"learning_rate": 0.00017009102730819247, |
|
"loss": 1.1141, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.4033479392528534, |
|
"learning_rate": 0.00016983094928478545, |
|
"loss": 1.0002, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.38111910223960876, |
|
"learning_rate": 0.00016957087126137844, |
|
"loss": 0.9318, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.4174363613128662, |
|
"learning_rate": 0.00016931079323797142, |
|
"loss": 0.9381, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.3726828098297119, |
|
"learning_rate": 0.00016905071521456437, |
|
"loss": 1.0405, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.5414443612098694, |
|
"learning_rate": 0.00016879063719115736, |
|
"loss": 1.0667, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"eval_loss": 1.0012115240097046, |
|
"eval_runtime": 546.9226, |
|
"eval_samples_per_second": 2.761, |
|
"eval_steps_per_second": 1.38, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.523889422416687, |
|
"learning_rate": 0.00016853055916775034, |
|
"loss": 1.0408, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.47460106015205383, |
|
"learning_rate": 0.00016827048114434332, |
|
"loss": 1.0327, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.5161871314048767, |
|
"learning_rate": 0.0001680104031209363, |
|
"loss": 0.9104, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.3816511631011963, |
|
"learning_rate": 0.00016775032509752926, |
|
"loss": 0.6875, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.3574850261211395, |
|
"learning_rate": 0.00016749024707412224, |
|
"loss": 0.8079, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.37416210770606995, |
|
"learning_rate": 0.00016723016905071522, |
|
"loss": 0.7479, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.648332417011261, |
|
"learning_rate": 0.0001669700910273082, |
|
"loss": 1.0868, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.37010350823402405, |
|
"learning_rate": 0.0001667100130039012, |
|
"loss": 0.8634, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.5206050872802734, |
|
"learning_rate": 0.00016644993498049414, |
|
"loss": 0.8827, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.4001045525074005, |
|
"learning_rate": 0.00016618985695708712, |
|
"loss": 1.1029, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.3948250114917755, |
|
"learning_rate": 0.0001659297789336801, |
|
"loss": 0.7248, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.4833560287952423, |
|
"learning_rate": 0.0001656697009102731, |
|
"loss": 0.7379, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.4951235055923462, |
|
"learning_rate": 0.00016540962288686607, |
|
"loss": 1.0003, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.7607774138450623, |
|
"learning_rate": 0.00016514954486345903, |
|
"loss": 0.8599, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.3604733347892761, |
|
"learning_rate": 0.000164889466840052, |
|
"loss": 1.0301, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.3514990508556366, |
|
"learning_rate": 0.000164629388816645, |
|
"loss": 1.072, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.8136312365531921, |
|
"learning_rate": 0.00016436931079323797, |
|
"loss": 1.0795, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.4305706024169922, |
|
"learning_rate": 0.00016410923276983095, |
|
"loss": 0.8288, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.4228506088256836, |
|
"learning_rate": 0.00016384915474642394, |
|
"loss": 0.863, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.3303283751010895, |
|
"learning_rate": 0.00016358907672301692, |
|
"loss": 0.8992, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.4840926229953766, |
|
"learning_rate": 0.0001633289986996099, |
|
"loss": 0.9309, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.3939206600189209, |
|
"learning_rate": 0.00016306892067620288, |
|
"loss": 1.0456, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.4611961245536804, |
|
"learning_rate": 0.00016280884265279587, |
|
"loss": 1.1839, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.4443643093109131, |
|
"learning_rate": 0.00016254876462938882, |
|
"loss": 0.8566, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.38910534977912903, |
|
"learning_rate": 0.0001622886866059818, |
|
"loss": 0.9426, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.6585952043533325, |
|
"learning_rate": 0.00016202860858257478, |
|
"loss": 1.0632, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.2883739173412323, |
|
"learning_rate": 0.00016176853055916777, |
|
"loss": 0.8594, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.4793940484523773, |
|
"learning_rate": 0.00016150845253576075, |
|
"loss": 0.9258, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.45249128341674805, |
|
"learning_rate": 0.0001612483745123537, |
|
"loss": 0.875, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.7601549029350281, |
|
"learning_rate": 0.00016098829648894669, |
|
"loss": 1.0903, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.4474358856678009, |
|
"learning_rate": 0.00016072821846553967, |
|
"loss": 0.8636, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.4709658920764923, |
|
"learning_rate": 0.00016046814044213265, |
|
"loss": 1.01, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.49074116349220276, |
|
"learning_rate": 0.00016020806241872563, |
|
"loss": 1.129, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.40011027455329895, |
|
"learning_rate": 0.0001599479843953186, |
|
"loss": 0.9789, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.36765021085739136, |
|
"learning_rate": 0.00015968790637191157, |
|
"loss": 0.9647, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.4226991832256317, |
|
"learning_rate": 0.00015942782834850455, |
|
"loss": 1.0593, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.5633476376533508, |
|
"learning_rate": 0.00015916775032509753, |
|
"loss": 1.139, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.4227031171321869, |
|
"learning_rate": 0.00015890767230169052, |
|
"loss": 0.9766, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.7903754711151123, |
|
"learning_rate": 0.00015864759427828347, |
|
"loss": 1.0215, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.5319734811782837, |
|
"learning_rate": 0.00015838751625487645, |
|
"loss": 1.0351, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.38870692253112793, |
|
"learning_rate": 0.00015812743823146944, |
|
"loss": 1.1089, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.5061522126197815, |
|
"learning_rate": 0.00015786736020806242, |
|
"loss": 0.9956, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.6587597131729126, |
|
"learning_rate": 0.0001576072821846554, |
|
"loss": 0.9433, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.500415027141571, |
|
"learning_rate": 0.00015734720416124838, |
|
"loss": 0.9071, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.49865788221359253, |
|
"learning_rate": 0.00015708712613784136, |
|
"loss": 1.013, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.472517192363739, |
|
"learning_rate": 0.00015682704811443435, |
|
"loss": 1.0552, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.5091108083724976, |
|
"learning_rate": 0.00015656697009102733, |
|
"loss": 0.9919, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.39484265446662903, |
|
"learning_rate": 0.0001563068920676203, |
|
"loss": 0.9945, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.47099944949150085, |
|
"learning_rate": 0.00015604681404421327, |
|
"loss": 0.847, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.4681323766708374, |
|
"learning_rate": 0.00015578673602080625, |
|
"loss": 0.8998, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.35557135939598083, |
|
"learning_rate": 0.00015552665799739923, |
|
"loss": 0.7108, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.7873208522796631, |
|
"learning_rate": 0.0001552665799739922, |
|
"loss": 1.1322, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.45806723833084106, |
|
"learning_rate": 0.0001550065019505852, |
|
"loss": 0.9095, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.639227032661438, |
|
"learning_rate": 0.00015474642392717815, |
|
"loss": 1.1761, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.6849457621574402, |
|
"learning_rate": 0.00015448634590377113, |
|
"loss": 1.1391, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.5833932757377625, |
|
"learning_rate": 0.00015422626788036411, |
|
"loss": 1.0434, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.684618353843689, |
|
"learning_rate": 0.0001539661898569571, |
|
"loss": 0.9719, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.4258657991886139, |
|
"learning_rate": 0.00015370611183355008, |
|
"loss": 1.1534, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.5763025879859924, |
|
"learning_rate": 0.00015344603381014303, |
|
"loss": 0.9974, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.5415167808532715, |
|
"learning_rate": 0.00015318595578673602, |
|
"loss": 1.0696, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.43196481466293335, |
|
"learning_rate": 0.000152925877763329, |
|
"loss": 0.8191, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.3914147913455963, |
|
"learning_rate": 0.00015266579973992198, |
|
"loss": 0.8465, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.4838383197784424, |
|
"learning_rate": 0.00015240572171651496, |
|
"loss": 1.0027, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.3581622540950775, |
|
"learning_rate": 0.00015214564369310792, |
|
"loss": 1.1247, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.9383752942085266, |
|
"learning_rate": 0.0001518855656697009, |
|
"loss": 1.2207, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.5058914422988892, |
|
"learning_rate": 0.00015162548764629388, |
|
"loss": 1.0036, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.4513028860092163, |
|
"learning_rate": 0.0001513654096228869, |
|
"loss": 1.0949, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.5779305696487427, |
|
"learning_rate": 0.00015110533159947987, |
|
"loss": 1.0035, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.6758905649185181, |
|
"learning_rate": 0.00015084525357607283, |
|
"loss": 1.0163, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.5109132528305054, |
|
"learning_rate": 0.0001505851755526658, |
|
"loss": 0.7357, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.42248305678367615, |
|
"learning_rate": 0.0001503250975292588, |
|
"loss": 1.0664, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.7441749572753906, |
|
"learning_rate": 0.00015006501950585178, |
|
"loss": 0.9483, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.5364653468132019, |
|
"learning_rate": 0.00014980494148244476, |
|
"loss": 1.1671, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.3390597701072693, |
|
"learning_rate": 0.0001495448634590377, |
|
"loss": 0.9616, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.3810036778450012, |
|
"learning_rate": 0.0001492847854356307, |
|
"loss": 0.9419, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.39790648221969604, |
|
"learning_rate": 0.00014902470741222368, |
|
"loss": 0.947, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.5461649894714355, |
|
"learning_rate": 0.00014876462938881666, |
|
"loss": 1.0065, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.376022607088089, |
|
"learning_rate": 0.00014850455136540964, |
|
"loss": 0.8626, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.47341254353523254, |
|
"learning_rate": 0.0001482444733420026, |
|
"loss": 0.9812, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.5151678323745728, |
|
"learning_rate": 0.00014798439531859558, |
|
"loss": 0.9688, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.4801676273345947, |
|
"learning_rate": 0.00014772431729518856, |
|
"loss": 0.9348, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.43506500124931335, |
|
"learning_rate": 0.00014746423927178154, |
|
"loss": 1.0792, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.5347827076911926, |
|
"learning_rate": 0.00014720416124837452, |
|
"loss": 1.1285, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.5323308706283569, |
|
"learning_rate": 0.00014694408322496748, |
|
"loss": 1.047, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.364046573638916, |
|
"learning_rate": 0.00014668400520156046, |
|
"loss": 1.0206, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.41780295968055725, |
|
"learning_rate": 0.00014642392717815344, |
|
"loss": 1.011, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.6010937690734863, |
|
"learning_rate": 0.00014616384915474643, |
|
"loss": 0.9836, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.3574303090572357, |
|
"learning_rate": 0.0001459037711313394, |
|
"loss": 0.9051, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.3346600830554962, |
|
"learning_rate": 0.0001456436931079324, |
|
"loss": 1.0333, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.48559319972991943, |
|
"learning_rate": 0.00014538361508452537, |
|
"loss": 0.8363, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.4872112274169922, |
|
"learning_rate": 0.00014512353706111836, |
|
"loss": 0.9565, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.37863266468048096, |
|
"learning_rate": 0.00014486345903771134, |
|
"loss": 0.9869, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.6248475909233093, |
|
"learning_rate": 0.00014460338101430432, |
|
"loss": 1.1117, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.40380629897117615, |
|
"learning_rate": 0.00014434330299089727, |
|
"loss": 0.895, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.4236217439174652, |
|
"learning_rate": 0.00014408322496749026, |
|
"loss": 0.8849, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.35017380118370056, |
|
"learning_rate": 0.00014382314694408324, |
|
"loss": 0.8472, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.5927186608314514, |
|
"learning_rate": 0.00014356306892067622, |
|
"loss": 1.0073, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.7567859292030334, |
|
"learning_rate": 0.0001433029908972692, |
|
"loss": 0.9784, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.4356285035610199, |
|
"learning_rate": 0.00014304291287386216, |
|
"loss": 0.8339, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.3144444525241852, |
|
"learning_rate": 0.00014278283485045514, |
|
"loss": 1.0751, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"eval_loss": 0.9849200248718262, |
|
"eval_runtime": 546.1357, |
|
"eval_samples_per_second": 2.765, |
|
"eval_steps_per_second": 1.382, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.5712672472000122, |
|
"learning_rate": 0.00014252275682704812, |
|
"loss": 0.9618, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.7324238419532776, |
|
"learning_rate": 0.0001422626788036411, |
|
"loss": 1.1554, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.4014214277267456, |
|
"learning_rate": 0.0001420026007802341, |
|
"loss": 0.9471, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.3690221905708313, |
|
"learning_rate": 0.00014174252275682704, |
|
"loss": 0.691, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.3683489263057709, |
|
"learning_rate": 0.00014148244473342002, |
|
"loss": 0.7806, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.6884939670562744, |
|
"learning_rate": 0.000141222366710013, |
|
"loss": 1.1163, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.5952924489974976, |
|
"learning_rate": 0.000140962288686606, |
|
"loss": 1.0318, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.4473588764667511, |
|
"learning_rate": 0.00014070221066319897, |
|
"loss": 1.0514, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.3944532871246338, |
|
"learning_rate": 0.00014044213263979193, |
|
"loss": 0.8776, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.6479160189628601, |
|
"learning_rate": 0.0001401820546163849, |
|
"loss": 1.0586, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.4719555974006653, |
|
"learning_rate": 0.0001399219765929779, |
|
"loss": 1.1372, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.4791443347930908, |
|
"learning_rate": 0.00013966189856957087, |
|
"loss": 1.0131, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.6476565599441528, |
|
"learning_rate": 0.00013940182054616385, |
|
"loss": 0.9054, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.4465392529964447, |
|
"learning_rate": 0.00013914174252275684, |
|
"loss": 1.09, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.4066149890422821, |
|
"learning_rate": 0.00013888166449934982, |
|
"loss": 0.8515, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.3584599196910858, |
|
"learning_rate": 0.0001386215864759428, |
|
"loss": 0.8008, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.6454095840454102, |
|
"learning_rate": 0.00013836150845253578, |
|
"loss": 1.1542, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.6096594333648682, |
|
"learning_rate": 0.00013810143042912877, |
|
"loss": 1.26, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.42694196105003357, |
|
"learning_rate": 0.00013784135240572172, |
|
"loss": 0.754, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.5280473232269287, |
|
"learning_rate": 0.0001375812743823147, |
|
"loss": 1.0522, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.46211832761764526, |
|
"learning_rate": 0.00013732119635890769, |
|
"loss": 1.094, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.3849136531352997, |
|
"learning_rate": 0.00013706111833550067, |
|
"loss": 0.9067, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.37398576736450195, |
|
"learning_rate": 0.00013680104031209365, |
|
"loss": 0.9948, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.38779109716415405, |
|
"learning_rate": 0.0001365409622886866, |
|
"loss": 1.0286, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.4133855700492859, |
|
"learning_rate": 0.0001362808842652796, |
|
"loss": 0.977, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.44233980774879456, |
|
"learning_rate": 0.00013602080624187257, |
|
"loss": 0.908, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.4096635580062866, |
|
"learning_rate": 0.00013576072821846555, |
|
"loss": 0.88, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.4307350814342499, |
|
"learning_rate": 0.00013550065019505853, |
|
"loss": 0.8848, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.33225497603416443, |
|
"learning_rate": 0.0001352405721716515, |
|
"loss": 0.8369, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.5546039342880249, |
|
"learning_rate": 0.00013498049414824447, |
|
"loss": 0.6756, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.4454500377178192, |
|
"learning_rate": 0.00013472041612483745, |
|
"loss": 0.994, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.37814733386039734, |
|
"learning_rate": 0.00013446033810143044, |
|
"loss": 1.028, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.8602733612060547, |
|
"learning_rate": 0.00013420026007802342, |
|
"loss": 1.0618, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.5205318331718445, |
|
"learning_rate": 0.00013394018205461637, |
|
"loss": 0.9995, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.3141395151615143, |
|
"learning_rate": 0.00013368010403120935, |
|
"loss": 1.0196, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.3242855668067932, |
|
"learning_rate": 0.00013342002600780234, |
|
"loss": 0.9072, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.37035492062568665, |
|
"learning_rate": 0.00013315994798439532, |
|
"loss": 0.8574, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.46976369619369507, |
|
"learning_rate": 0.0001328998699609883, |
|
"loss": 1.0797, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.4217787981033325, |
|
"learning_rate": 0.00013263979193758128, |
|
"loss": 0.9538, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.7571201920509338, |
|
"learning_rate": 0.00013237971391417427, |
|
"loss": 1.0524, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.7066246271133423, |
|
"learning_rate": 0.00013211963589076725, |
|
"loss": 0.9775, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.3496040403842926, |
|
"learning_rate": 0.00013185955786736023, |
|
"loss": 0.9926, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.49747544527053833, |
|
"learning_rate": 0.0001315994798439532, |
|
"loss": 1.0803, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.406820148229599, |
|
"learning_rate": 0.00013133940182054617, |
|
"loss": 0.8144, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.3256113827228546, |
|
"learning_rate": 0.00013107932379713915, |
|
"loss": 0.8676, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.48141512274742126, |
|
"learning_rate": 0.00013081924577373213, |
|
"loss": 0.9067, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.35172903537750244, |
|
"learning_rate": 0.0001305591677503251, |
|
"loss": 0.9535, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.39796268939971924, |
|
"learning_rate": 0.0001302990897269181, |
|
"loss": 1.0817, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.4287821650505066, |
|
"learning_rate": 0.00013003901170351105, |
|
"loss": 0.9363, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.45490118861198425, |
|
"learning_rate": 0.00012977893368010403, |
|
"loss": 1.0131, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.4730069935321808, |
|
"learning_rate": 0.00012951885565669702, |
|
"loss": 1.0037, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.34408411383628845, |
|
"learning_rate": 0.00012925877763329, |
|
"loss": 0.737, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.5118675827980042, |
|
"learning_rate": 0.00012899869960988298, |
|
"loss": 1.0871, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.45648622512817383, |
|
"learning_rate": 0.00012873862158647593, |
|
"loss": 0.8413, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.30808356404304504, |
|
"learning_rate": 0.00012847854356306892, |
|
"loss": 0.7535, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.4459214508533478, |
|
"learning_rate": 0.0001282184655396619, |
|
"loss": 0.9316, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.7930001020431519, |
|
"learning_rate": 0.00012795838751625488, |
|
"loss": 0.9959, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.5756102204322815, |
|
"learning_rate": 0.00012769830949284786, |
|
"loss": 1.0373, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.37661170959472656, |
|
"learning_rate": 0.00012743823146944082, |
|
"loss": 0.885, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.5622540712356567, |
|
"learning_rate": 0.0001271781534460338, |
|
"loss": 0.8901, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.42885056138038635, |
|
"learning_rate": 0.00012691807542262678, |
|
"loss": 0.9577, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.38707178831100464, |
|
"learning_rate": 0.00012665799739921976, |
|
"loss": 0.9606, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.3369719684123993, |
|
"learning_rate": 0.00012639791937581275, |
|
"loss": 0.8205, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.35598838329315186, |
|
"learning_rate": 0.00012613784135240573, |
|
"loss": 0.7119, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.6167771816253662, |
|
"learning_rate": 0.0001258777633289987, |
|
"loss": 1.2462, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.3976896107196808, |
|
"learning_rate": 0.0001256176853055917, |
|
"loss": 0.981, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.4538115859031677, |
|
"learning_rate": 0.00012535760728218468, |
|
"loss": 1.0327, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.7170857787132263, |
|
"learning_rate": 0.00012509752925877766, |
|
"loss": 0.9929, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.45370182394981384, |
|
"learning_rate": 0.00012483745123537064, |
|
"loss": 0.7871, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.4275226891040802, |
|
"learning_rate": 0.0001245773732119636, |
|
"loss": 0.7853, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.4282675087451935, |
|
"learning_rate": 0.00012431729518855658, |
|
"loss": 1.0487, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.6588276624679565, |
|
"learning_rate": 0.00012405721716514956, |
|
"loss": 0.9411, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.6060877442359924, |
|
"learning_rate": 0.00012379713914174254, |
|
"loss": 0.9601, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.5504083037376404, |
|
"learning_rate": 0.00012353706111833552, |
|
"loss": 0.9798, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.40328875184059143, |
|
"learning_rate": 0.00012327698309492848, |
|
"loss": 0.9134, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.5537497401237488, |
|
"learning_rate": 0.00012301690507152146, |
|
"loss": 0.8478, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.4089404046535492, |
|
"learning_rate": 0.00012275682704811444, |
|
"loss": 0.9013, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.47143158316612244, |
|
"learning_rate": 0.00012249674902470743, |
|
"loss": 0.9138, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.7318865060806274, |
|
"learning_rate": 0.0001222366710013004, |
|
"loss": 1.1403, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.537233293056488, |
|
"learning_rate": 0.00012197659297789336, |
|
"loss": 0.9555, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.3424953520298004, |
|
"learning_rate": 0.00012171651495448635, |
|
"loss": 0.8932, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.33310917019844055, |
|
"learning_rate": 0.00012145643693107933, |
|
"loss": 1.0403, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.46227535605430603, |
|
"learning_rate": 0.00012119635890767231, |
|
"loss": 0.9926, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.5558334589004517, |
|
"learning_rate": 0.00012093628088426529, |
|
"loss": 0.9385, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.648505449295044, |
|
"learning_rate": 0.00012067620286085826, |
|
"loss": 1.2165, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.4283152222633362, |
|
"learning_rate": 0.00012041612483745124, |
|
"loss": 0.8768, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.38274627923965454, |
|
"learning_rate": 0.00012015604681404422, |
|
"loss": 0.8584, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.3748539090156555, |
|
"learning_rate": 0.0001198959687906372, |
|
"loss": 0.951, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.509125828742981, |
|
"learning_rate": 0.00011963589076723019, |
|
"loss": 0.9077, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.4202275276184082, |
|
"learning_rate": 0.00011937581274382314, |
|
"loss": 0.9849, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.4720480144023895, |
|
"learning_rate": 0.00011911573472041613, |
|
"loss": 1.2436, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.365005761384964, |
|
"learning_rate": 0.00011885565669700911, |
|
"loss": 0.7927, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.7129966616630554, |
|
"learning_rate": 0.00011859557867360209, |
|
"loss": 1.2607, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.3527359068393707, |
|
"learning_rate": 0.00011833550065019507, |
|
"loss": 0.9059, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.42210349440574646, |
|
"learning_rate": 0.00011807542262678804, |
|
"loss": 0.8577, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.6976379752159119, |
|
"learning_rate": 0.00011781534460338102, |
|
"loss": 1.1473, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.38397690653800964, |
|
"learning_rate": 0.000117555266579974, |
|
"loss": 0.8791, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.42243918776512146, |
|
"learning_rate": 0.00011729518855656699, |
|
"loss": 1.0964, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.41610145568847656, |
|
"learning_rate": 0.00011703511053315997, |
|
"loss": 0.7124, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.4049556255340576, |
|
"learning_rate": 0.00011677503250975293, |
|
"loss": 0.8838, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"eval_loss": 0.9695621132850647, |
|
"eval_runtime": 547.2222, |
|
"eval_samples_per_second": 2.759, |
|
"eval_steps_per_second": 1.38, |
|
"step": 400 |
|
} |
|
], |
|
"logging_steps": 1.0, |
|
"max_steps": 849, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 100, |
|
"total_flos": 1.8664571225820365e+17, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|