{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.03931976801336872, "eval_steps": 13, "global_step": 50, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0007863953602673744, "grad_norm": 1.727420449256897, "learning_rate": 1e-05, "loss": 1.2759, "step": 1 }, { "epoch": 0.0007863953602673744, "eval_loss": 2.9491188526153564, "eval_runtime": 802.2962, "eval_samples_per_second": 2.67, "eval_steps_per_second": 1.335, "step": 1 }, { "epoch": 0.0015727907205347487, "grad_norm": 2.06775164604187, "learning_rate": 2e-05, "loss": 1.2877, "step": 2 }, { "epoch": 0.0023591860808021233, "grad_norm": 2.407663583755493, "learning_rate": 3e-05, "loss": 1.367, "step": 3 }, { "epoch": 0.0031455814410694975, "grad_norm": 1.9768824577331543, "learning_rate": 4e-05, "loss": 1.2932, "step": 4 }, { "epoch": 0.003931976801336872, "grad_norm": 2.2676198482513428, "learning_rate": 5e-05, "loss": 1.4012, "step": 5 }, { "epoch": 0.004718372161604247, "grad_norm": 2.2389116287231445, "learning_rate": 6e-05, "loss": 1.4455, "step": 6 }, { "epoch": 0.005504767521871621, "grad_norm": 3.368894577026367, "learning_rate": 7e-05, "loss": 1.6058, "step": 7 }, { "epoch": 0.006291162882138995, "grad_norm": 2.1887245178222656, "learning_rate": 8e-05, "loss": 1.315, "step": 8 }, { "epoch": 0.00707755824240637, "grad_norm": 2.749788284301758, "learning_rate": 9e-05, "loss": 1.4487, "step": 9 }, { "epoch": 0.007863953602673744, "grad_norm": 2.44626784324646, "learning_rate": 0.0001, "loss": 0.9548, "step": 10 }, { "epoch": 0.008650348962941118, "grad_norm": 2.37338924407959, "learning_rate": 9.98458666866564e-05, "loss": 0.8456, "step": 11 }, { "epoch": 0.009436744323208493, "grad_norm": 1.9337718486785889, "learning_rate": 9.938441702975689e-05, "loss": 0.8131, "step": 12 }, { "epoch": 0.010223139683475867, "grad_norm": 5.305741786956787, "learning_rate": 9.861849601988383e-05, "loss": 1.3641, "step": 13 }, { "epoch": 0.010223139683475867, "eval_loss": 1.1911835670471191, "eval_runtime": 790.9924, "eval_samples_per_second": 2.708, "eval_steps_per_second": 1.354, "step": 13 }, { "epoch": 0.011009535043743243, "grad_norm": 1.8393322229385376, "learning_rate": 9.755282581475769e-05, "loss": 0.8994, "step": 14 }, { "epoch": 0.011795930404010616, "grad_norm": 2.288557529449463, "learning_rate": 9.619397662556435e-05, "loss": 0.964, "step": 15 }, { "epoch": 0.01258232576427799, "grad_norm": 2.239872455596924, "learning_rate": 9.45503262094184e-05, "loss": 0.8643, "step": 16 }, { "epoch": 0.013368721124545365, "grad_norm": 2.4298009872436523, "learning_rate": 9.263200821770461e-05, "loss": 0.9466, "step": 17 }, { "epoch": 0.01415511648481274, "grad_norm": 3.0432651042938232, "learning_rate": 9.045084971874738e-05, "loss": 1.0756, "step": 18 }, { "epoch": 0.014941511845080115, "grad_norm": 1.3065857887268066, "learning_rate": 8.802029828000156e-05, "loss": 0.6739, "step": 19 }, { "epoch": 0.01572790720534749, "grad_norm": 2.0720527172088623, "learning_rate": 8.535533905932738e-05, "loss": 0.8697, "step": 20 }, { "epoch": 0.016514302565614862, "grad_norm": 1.8250558376312256, "learning_rate": 8.247240241650918e-05, "loss": 0.7318, "step": 21 }, { "epoch": 0.017300697925882236, "grad_norm": 2.166428565979004, "learning_rate": 7.938926261462366e-05, "loss": 0.7315, "step": 22 }, { "epoch": 0.018087093286149613, "grad_norm": 2.2534689903259277, "learning_rate": 7.612492823579745e-05, "loss": 0.9042, "step": 23 }, { "epoch": 0.018873488646416987, 
"grad_norm": 1.4026483297348022, "learning_rate": 7.269952498697734e-05, "loss": 0.7183, "step": 24 }, { "epoch": 0.01965988400668436, "grad_norm": 1.6705015897750854, "learning_rate": 6.91341716182545e-05, "loss": 0.6787, "step": 25 }, { "epoch": 0.020446279366951734, "grad_norm": 2.1395044326782227, "learning_rate": 6.545084971874738e-05, "loss": 0.9407, "step": 26 }, { "epoch": 0.020446279366951734, "eval_loss": 0.8785327672958374, "eval_runtime": 781.2326, "eval_samples_per_second": 2.742, "eval_steps_per_second": 1.371, "step": 26 }, { "epoch": 0.021232674727219108, "grad_norm": 1.2242342233657837, "learning_rate": 6.167226819279528e-05, "loss": 0.6702, "step": 27 }, { "epoch": 0.022019070087486485, "grad_norm": 1.7602314949035645, "learning_rate": 5.782172325201155e-05, "loss": 0.9246, "step": 28 }, { "epoch": 0.02280546544775386, "grad_norm": 2.0709216594696045, "learning_rate": 5.392295478639225e-05, "loss": 0.8103, "step": 29 }, { "epoch": 0.023591860808021232, "grad_norm": 1.5635908842086792, "learning_rate": 5e-05, "loss": 0.8027, "step": 30 }, { "epoch": 0.024378256168288606, "grad_norm": 1.6824203729629517, "learning_rate": 4.607704521360776e-05, "loss": 0.776, "step": 31 }, { "epoch": 0.02516465152855598, "grad_norm": 1.457542061805725, "learning_rate": 4.2178276747988446e-05, "loss": 0.5255, "step": 32 }, { "epoch": 0.025951046888823357, "grad_norm": 1.5966757535934448, "learning_rate": 3.832773180720475e-05, "loss": 0.6032, "step": 33 }, { "epoch": 0.02673744224909073, "grad_norm": 1.5422358512878418, "learning_rate": 3.4549150281252636e-05, "loss": 0.548, "step": 34 }, { "epoch": 0.027523837609358105, "grad_norm": 1.623889446258545, "learning_rate": 3.086582838174551e-05, "loss": 0.7702, "step": 35 }, { "epoch": 0.02831023296962548, "grad_norm": 1.665121078491211, "learning_rate": 2.7300475013022663e-05, "loss": 0.6063, "step": 36 }, { "epoch": 0.029096628329892852, "grad_norm": 1.3971141576766968, "learning_rate": 2.3875071764202563e-05, "loss": 0.48, "step": 37 }, { "epoch": 0.02988302369016023, "grad_norm": 1.3747204542160034, "learning_rate": 2.061073738537635e-05, "loss": 0.6483, "step": 38 }, { "epoch": 0.030669419050427603, "grad_norm": 1.3398278951644897, "learning_rate": 1.7527597583490822e-05, "loss": 0.7969, "step": 39 }, { "epoch": 0.030669419050427603, "eval_loss": 0.8240298628807068, "eval_runtime": 792.48, "eval_samples_per_second": 2.703, "eval_steps_per_second": 1.351, "step": 39 }, { "epoch": 0.03145581441069498, "grad_norm": 1.3485429286956787, "learning_rate": 1.4644660940672627e-05, "loss": 0.6484, "step": 40 }, { "epoch": 0.03224220977096235, "grad_norm": 1.2892612218856812, "learning_rate": 1.1979701719998453e-05, "loss": 0.7116, "step": 41 }, { "epoch": 0.033028605131229724, "grad_norm": 1.8880107402801514, "learning_rate": 9.549150281252633e-06, "loss": 0.5925, "step": 42 }, { "epoch": 0.0338150004914971, "grad_norm": 1.4518492221832275, "learning_rate": 7.367991782295391e-06, "loss": 0.6824, "step": 43 }, { "epoch": 0.03460139585176447, "grad_norm": 2.3983242511749268, "learning_rate": 5.449673790581611e-06, "loss": 0.9552, "step": 44 }, { "epoch": 0.03538779121203185, "grad_norm": 1.543707013130188, "learning_rate": 3.8060233744356633e-06, "loss": 0.8833, "step": 45 }, { "epoch": 0.036174186572299226, "grad_norm": 1.7945361137390137, "learning_rate": 2.4471741852423237e-06, "loss": 0.8763, "step": 46 }, { "epoch": 0.0369605819325666, "grad_norm": 1.6415727138519287, "learning_rate": 1.3815039801161721e-06, "loss": 0.7373, "step": 47 }, { "epoch": 
0.03774697729283397, "grad_norm": 1.7929890155792236, "learning_rate": 6.15582970243117e-07, "loss": 0.6606, "step": 48 }, { "epoch": 0.03853337265310135, "grad_norm": 1.3461647033691406, "learning_rate": 1.5413331334360182e-07, "loss": 0.7808, "step": 49 }, { "epoch": 0.03931976801336872, "grad_norm": 1.1689928770065308, "learning_rate": 0.0, "loss": 0.7459, "step": 50 } ], "logging_steps": 1, "max_steps": 50, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 13, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.067577446170624e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }