|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.04349086691794723, |
|
"eval_steps": 9, |
|
"global_step": 75, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0005798782255726297, |
|
"grad_norm": 72.19591522216797, |
|
"learning_rate": 1e-05, |
|
"loss": 9.7221, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0005798782255726297, |
|
"eval_loss": 9.690610885620117, |
|
"eval_runtime": 20.6228, |
|
"eval_samples_per_second": 281.63, |
|
"eval_steps_per_second": 8.825, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0011597564511452595, |
|
"grad_norm": 84.83391571044922, |
|
"learning_rate": 2e-05, |
|
"loss": 9.8583, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.0017396346767178893, |
|
"grad_norm": 76.03058624267578, |
|
"learning_rate": 3e-05, |
|
"loss": 9.6866, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.002319512902290519, |
|
"grad_norm": 75.7887954711914, |
|
"learning_rate": 4e-05, |
|
"loss": 9.2374, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.0028993911278631487, |
|
"grad_norm": 73.26809692382812, |
|
"learning_rate": 5e-05, |
|
"loss": 8.4066, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.0034792693534357786, |
|
"grad_norm": 51.59004592895508, |
|
"learning_rate": 6e-05, |
|
"loss": 7.7976, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.004059147579008408, |
|
"grad_norm": 41.35436248779297, |
|
"learning_rate": 7e-05, |
|
"loss": 7.2723, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.004639025804581038, |
|
"grad_norm": 21.733489990234375, |
|
"learning_rate": 8e-05, |
|
"loss": 6.6234, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.005218904030153668, |
|
"grad_norm": 29.629690170288086, |
|
"learning_rate": 9e-05, |
|
"loss": 5.7096, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.005218904030153668, |
|
"eval_loss": 4.918065547943115, |
|
"eval_runtime": 20.5791, |
|
"eval_samples_per_second": 282.228, |
|
"eval_steps_per_second": 8.844, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.0057987822557262975, |
|
"grad_norm": 24.306255340576172, |
|
"learning_rate": 0.0001, |
|
"loss": 4.5784, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.006378660481298927, |
|
"grad_norm": 22.750164031982422, |
|
"learning_rate": 9.99695413509548e-05, |
|
"loss": 4.1386, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.006958538706871557, |
|
"grad_norm": 16.85099983215332, |
|
"learning_rate": 9.987820251299122e-05, |
|
"loss": 3.424, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.007538416932444187, |
|
"grad_norm": 11.644445419311523, |
|
"learning_rate": 9.972609476841367e-05, |
|
"loss": 3.5163, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.008118295158016816, |
|
"grad_norm": 13.419211387634277, |
|
"learning_rate": 9.951340343707852e-05, |
|
"loss": 3.0114, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.008698173383589447, |
|
"grad_norm": 11.04304313659668, |
|
"learning_rate": 9.924038765061042e-05, |
|
"loss": 3.1268, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.009278051609162076, |
|
"grad_norm": 14.95022201538086, |
|
"learning_rate": 9.890738003669029e-05, |
|
"loss": 2.5739, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.009857929834734706, |
|
"grad_norm": 8.970131874084473, |
|
"learning_rate": 9.851478631379982e-05, |
|
"loss": 1.9381, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.010437808060307335, |
|
"grad_norm": 15.41360092163086, |
|
"learning_rate": 9.806308479691595e-05, |
|
"loss": 2.1968, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.010437808060307335, |
|
"eval_loss": 2.0878777503967285, |
|
"eval_runtime": 25.0521, |
|
"eval_samples_per_second": 231.837, |
|
"eval_steps_per_second": 7.265, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.011017686285879966, |
|
"grad_norm": 26.915355682373047, |
|
"learning_rate": 9.755282581475769e-05, |
|
"loss": 1.7957, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.011597564511452595, |
|
"grad_norm": 14.224507331848145, |
|
"learning_rate": 9.698463103929542e-05, |
|
"loss": 1.5277, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.012177442737025224, |
|
"grad_norm": 15.548272132873535, |
|
"learning_rate": 9.635919272833938e-05, |
|
"loss": 1.6963, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.012757320962597855, |
|
"grad_norm": 20.450119018554688, |
|
"learning_rate": 9.567727288213005e-05, |
|
"loss": 2.0978, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.013337199188170484, |
|
"grad_norm": 30.09837532043457, |
|
"learning_rate": 9.493970231495835e-05, |
|
"loss": 1.7468, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.013917077413743114, |
|
"grad_norm": 29.925216674804688, |
|
"learning_rate": 9.414737964294636e-05, |
|
"loss": 1.5683, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.014496955639315743, |
|
"grad_norm": 10.300623893737793, |
|
"learning_rate": 9.330127018922194e-05, |
|
"loss": 1.3637, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.015076833864888374, |
|
"grad_norm": 12.566941261291504, |
|
"learning_rate": 9.24024048078213e-05, |
|
"loss": 1.5909, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.015656712090461003, |
|
"grad_norm": 11.902721405029297, |
|
"learning_rate": 9.145187862775209e-05, |
|
"loss": 1.3469, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.015656712090461003, |
|
"eval_loss": 1.393225073814392, |
|
"eval_runtime": 45.6636, |
|
"eval_samples_per_second": 127.191, |
|
"eval_steps_per_second": 3.986, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.016236590316033632, |
|
"grad_norm": 6.123865604400635, |
|
"learning_rate": 9.045084971874738e-05, |
|
"loss": 1.4863, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.016816468541606264, |
|
"grad_norm": 10.206008911132812, |
|
"learning_rate": 8.940053768033609e-05, |
|
"loss": 1.3196, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.017396346767178893, |
|
"grad_norm": 9.587065696716309, |
|
"learning_rate": 8.83022221559489e-05, |
|
"loss": 1.1059, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.017976224992751522, |
|
"grad_norm": 9.408559799194336, |
|
"learning_rate": 8.715724127386972e-05, |
|
"loss": 1.5897, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.01855610321832415, |
|
"grad_norm": 6.678525447845459, |
|
"learning_rate": 8.596699001693255e-05, |
|
"loss": 1.1822, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.01913598144389678, |
|
"grad_norm": 5.290743827819824, |
|
"learning_rate": 8.473291852294987e-05, |
|
"loss": 0.9327, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.019715859669469413, |
|
"grad_norm": 5.287943363189697, |
|
"learning_rate": 8.345653031794292e-05, |
|
"loss": 0.9831, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.02029573789504204, |
|
"grad_norm": 4.547323226928711, |
|
"learning_rate": 8.213938048432697e-05, |
|
"loss": 1.0122, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.02087561612061467, |
|
"grad_norm": 4.139932155609131, |
|
"learning_rate": 8.07830737662829e-05, |
|
"loss": 0.9118, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.02087561612061467, |
|
"eval_loss": 0.9454571008682251, |
|
"eval_runtime": 20.6133, |
|
"eval_samples_per_second": 281.76, |
|
"eval_steps_per_second": 8.829, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.0214554943461873, |
|
"grad_norm": 4.820893287658691, |
|
"learning_rate": 7.938926261462366e-05, |
|
"loss": 0.7848, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.022035372571759932, |
|
"grad_norm": 7.449838638305664, |
|
"learning_rate": 7.795964517353735e-05, |
|
"loss": 1.1475, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.02261525079733256, |
|
"grad_norm": 6.734663963317871, |
|
"learning_rate": 7.649596321166024e-05, |
|
"loss": 0.9721, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.02319512902290519, |
|
"grad_norm": 4.044934272766113, |
|
"learning_rate": 7.500000000000001e-05, |
|
"loss": 0.8632, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.02377500724847782, |
|
"grad_norm": 6.2852702140808105, |
|
"learning_rate": 7.347357813929454e-05, |
|
"loss": 0.8913, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.024354885474050448, |
|
"grad_norm": 5.198728561401367, |
|
"learning_rate": 7.191855733945387e-05, |
|
"loss": 0.9631, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.02493476369962308, |
|
"grad_norm": 12.389552116394043, |
|
"learning_rate": 7.033683215379002e-05, |
|
"loss": 0.8648, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.02551464192519571, |
|
"grad_norm": 16.09233856201172, |
|
"learning_rate": 6.873032967079561e-05, |
|
"loss": 1.2013, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.02609452015076834, |
|
"grad_norm": 9.62452507019043, |
|
"learning_rate": 6.710100716628344e-05, |
|
"loss": 0.8578, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.02609452015076834, |
|
"eval_loss": 0.8617116808891296, |
|
"eval_runtime": 20.6142, |
|
"eval_samples_per_second": 281.747, |
|
"eval_steps_per_second": 8.829, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.026674398376340967, |
|
"grad_norm": 5.45116662979126, |
|
"learning_rate": 6.545084971874738e-05, |
|
"loss": 0.8589, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.0272542766019136, |
|
"grad_norm": 8.586546897888184, |
|
"learning_rate": 6.378186779084995e-05, |
|
"loss": 1.0707, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.02783415482748623, |
|
"grad_norm": 7.391688823699951, |
|
"learning_rate": 6.209609477998338e-05, |
|
"loss": 1.0134, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.028414033053058858, |
|
"grad_norm": 6.437960624694824, |
|
"learning_rate": 6.0395584540887963e-05, |
|
"loss": 0.877, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.028993911278631487, |
|
"grad_norm": 3.7490339279174805, |
|
"learning_rate": 5.868240888334653e-05, |
|
"loss": 0.8931, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.029573789504204116, |
|
"grad_norm": 5.76321268081665, |
|
"learning_rate": 5.695865504800327e-05, |
|
"loss": 0.8438, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.030153667729776748, |
|
"grad_norm": 6.54787540435791, |
|
"learning_rate": 5.522642316338268e-05, |
|
"loss": 0.8288, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.030733545955349377, |
|
"grad_norm": 5.9420084953308105, |
|
"learning_rate": 5.348782368720626e-05, |
|
"loss": 0.8325, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.031313424180922006, |
|
"grad_norm": 4.388433456420898, |
|
"learning_rate": 5.174497483512506e-05, |
|
"loss": 0.806, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.031313424180922006, |
|
"eval_loss": 0.8188083171844482, |
|
"eval_runtime": 20.6473, |
|
"eval_samples_per_second": 281.296, |
|
"eval_steps_per_second": 8.815, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.031893302406494635, |
|
"grad_norm": 4.667327404022217, |
|
"learning_rate": 5e-05, |
|
"loss": 0.8285, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.032473180632067264, |
|
"grad_norm": 4.210194110870361, |
|
"learning_rate": 4.825502516487497e-05, |
|
"loss": 0.8421, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.03305305885763989, |
|
"grad_norm": 6.773632049560547, |
|
"learning_rate": 4.6512176312793736e-05, |
|
"loss": 0.8522, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.03363293708321253, |
|
"grad_norm": 4.837597846984863, |
|
"learning_rate": 4.477357683661734e-05, |
|
"loss": 0.7556, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.03421281530878516, |
|
"grad_norm": 7.49063777923584, |
|
"learning_rate": 4.3041344951996746e-05, |
|
"loss": 0.8624, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.03479269353435779, |
|
"grad_norm": 2.9477198123931885, |
|
"learning_rate": 4.131759111665349e-05, |
|
"loss": 0.7574, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.035372571759930416, |
|
"grad_norm": 4.244167327880859, |
|
"learning_rate": 3.960441545911204e-05, |
|
"loss": 0.8535, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.035952449985503045, |
|
"grad_norm": 4.019453525543213, |
|
"learning_rate": 3.790390522001662e-05, |
|
"loss": 0.7877, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.036532328211075674, |
|
"grad_norm": 4.17199182510376, |
|
"learning_rate": 3.6218132209150045e-05, |
|
"loss": 0.8334, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.036532328211075674, |
|
"eval_loss": 0.789574921131134, |
|
"eval_runtime": 20.9404, |
|
"eval_samples_per_second": 277.358, |
|
"eval_steps_per_second": 8.691, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.0371122064366483, |
|
"grad_norm": 3.5703322887420654, |
|
"learning_rate": 3.4549150281252636e-05, |
|
"loss": 0.8385, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.03769208466222093, |
|
"grad_norm": 3.0590972900390625, |
|
"learning_rate": 3.289899283371657e-05, |
|
"loss": 0.8761, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.03827196288779356, |
|
"grad_norm": 3.206594944000244, |
|
"learning_rate": 3.12696703292044e-05, |
|
"loss": 0.7679, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.038851841113366196, |
|
"grad_norm": 2.16125750541687, |
|
"learning_rate": 2.9663167846209998e-05, |
|
"loss": 0.8023, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.039431719338938825, |
|
"grad_norm": 3.744929313659668, |
|
"learning_rate": 2.8081442660546125e-05, |
|
"loss": 0.7697, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.040011597564511454, |
|
"grad_norm": 4.342959403991699, |
|
"learning_rate": 2.6526421860705473e-05, |
|
"loss": 0.8038, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.04059147579008408, |
|
"grad_norm": 2.113845109939575, |
|
"learning_rate": 2.500000000000001e-05, |
|
"loss": 0.7157, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.04117135401565671, |
|
"grad_norm": 3.42581844329834, |
|
"learning_rate": 2.350403678833976e-05, |
|
"loss": 0.7877, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.04175123224122934, |
|
"grad_norm": 3.2024004459381104, |
|
"learning_rate": 2.2040354826462668e-05, |
|
"loss": 0.8178, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.04175123224122934, |
|
"eval_loss": 0.757203996181488, |
|
"eval_runtime": 20.7013, |
|
"eval_samples_per_second": 280.562, |
|
"eval_steps_per_second": 8.792, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.04233111046680197, |
|
"grad_norm": 3.951155662536621, |
|
"learning_rate": 2.061073738537635e-05, |
|
"loss": 0.7732, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.0429109886923746, |
|
"grad_norm": 2.8352742195129395, |
|
"learning_rate": 1.9216926233717085e-05, |
|
"loss": 0.7979, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.04349086691794723, |
|
"grad_norm": 3.528505563735962, |
|
"learning_rate": 1.7860619515673033e-05, |
|
"loss": 0.836, |
|
"step": 75 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 100, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 25, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3.08810631610368e+16, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|