{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9957924263674616, "eval_steps": 500, "global_step": 534, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "learning_rate": 1.296296296296296e-06, "loss": 1.1525, "step": 1 }, { "epoch": 0.03, "learning_rate": 6.48148148148148e-06, "loss": 1.1069, "step": 5 }, { "epoch": 0.06, "learning_rate": 1.296296296296296e-05, "loss": 1.0522, "step": 10 }, { "epoch": 0.08, "learning_rate": 1.9444444444444442e-05, "loss": 0.977, "step": 15 }, { "epoch": 0.11, "learning_rate": 2.592592592592592e-05, "loss": 0.9307, "step": 20 }, { "epoch": 0.14, "learning_rate": 3.240740740740741e-05, "loss": 0.9144, "step": 25 }, { "epoch": 0.17, "learning_rate": 3.8888888888888884e-05, "loss": 0.9218, "step": 30 }, { "epoch": 0.2, "learning_rate": 4.537037037037037e-05, "loss": 0.8808, "step": 35 }, { "epoch": 0.22, "learning_rate": 5.185185185185184e-05, "loss": 0.8981, "step": 40 }, { "epoch": 0.25, "learning_rate": 5.833333333333333e-05, "loss": 0.8721, "step": 45 }, { "epoch": 0.28, "learning_rate": 6.481481481481482e-05, "loss": 0.8714, "step": 50 }, { "epoch": 0.31, "learning_rate": 6.999925035824452e-05, "loss": 0.8616, "step": 55 }, { "epoch": 0.34, "learning_rate": 6.99730162684253e-05, "loss": 0.8686, "step": 60 }, { "epoch": 0.36, "learning_rate": 6.99093321968073e-05, "loss": 0.823, "step": 65 }, { "epoch": 0.39, "learning_rate": 6.980826633788956e-05, "loss": 0.8186, "step": 70 }, { "epoch": 0.42, "learning_rate": 6.96699269155252e-05, "loss": 0.8349, "step": 75 }, { "epoch": 0.45, "learning_rate": 6.949446206703272e-05, "loss": 0.8436, "step": 80 }, { "epoch": 0.48, "learning_rate": 6.928205968456678e-05, "loss": 0.8353, "step": 85 }, { "epoch": 0.5, "learning_rate": 6.903294721391867e-05, "loss": 0.8553, "step": 90 }, { "epoch": 0.53, "learning_rate": 6.874739141096151e-05, "loss": 0.8244, "step": 95 }, { "epoch": 0.56, "learning_rate": 6.842569805600153e-05, "loss": 0.8424, "step": 100 }, { "epoch": 0.59, "learning_rate": 6.806821162634073e-05, "loss": 0.8328, "step": 105 }, { "epoch": 0.62, "learning_rate": 6.767531492740205e-05, "loss": 0.821, "step": 110 }, { "epoch": 0.65, "learning_rate": 6.724742868281173e-05, "loss": 0.7997, "step": 115 }, { "epoch": 0.67, "learning_rate": 6.678501108387784e-05, "loss": 0.8393, "step": 120 }, { "epoch": 0.7, "learning_rate": 6.628855729894762e-05, "loss": 0.8226, "step": 125 }, { "epoch": 0.73, "learning_rate": 6.575859894316877e-05, "loss": 0.7994, "step": 130 }, { "epoch": 0.76, "learning_rate": 6.519570350922271e-05, "loss": 0.7988, "step": 135 }, { "epoch": 0.79, "learning_rate": 6.460047375963906e-05, "loss": 0.7932, "step": 140 }, { "epoch": 0.81, "learning_rate": 6.397354708134257e-05, "loss": 0.786, "step": 145 }, { "epoch": 0.84, "learning_rate": 6.331559480312315e-05, "loss": 0.7943, "step": 150 }, { "epoch": 0.87, "learning_rate": 6.262732147676025e-05, "loss": 0.8187, "step": 155 }, { "epoch": 0.9, "learning_rate": 6.190946412257108e-05, "loss": 0.7774, "step": 160 }, { "epoch": 0.93, "learning_rate": 6.116279144019089e-05, "loss": 0.7925, "step": 165 }, { "epoch": 0.95, "learning_rate": 6.038810298543006e-05, "loss": 0.7877, "step": 170 }, { "epoch": 0.98, "learning_rate": 5.9586228314089804e-05, "loss": 0.7614, "step": 175 }, { "epoch": 1.0, "eval_loss": 0.7861344218254089, "eval_runtime": 8.8077, "eval_samples_per_second": 23.616, "eval_steps_per_second": 5.904, "step": 178 }, { "epoch": 1.01, "learning_rate": 5.8758026093652964e-05, "loss": 0.7171, "step": 180 }, { "epoch": 1.04, "learning_rate": 5.790438318380136e-05, "loss": 0.5443, "step": 185 }, { "epoch": 1.07, "learning_rate": 5.702621368674431e-05, "loss": 0.5642, "step": 190 }, { "epoch": 1.09, "learning_rate": 5.6124457968374994e-05, "loss": 0.5541, "step": 195 }, { "epoch": 1.12, "learning_rate": 5.520008165130318e-05, "loss": 0.5469, "step": 200 }, { "epoch": 1.15, "learning_rate": 5.425407458084227e-05, "loss": 0.5585, "step": 205 }, { "epoch": 1.18, "learning_rate": 5.3287449765058205e-05, "loss": 0.5547, "step": 210 }, { "epoch": 1.21, "learning_rate": 5.230124229001506e-05, "loss": 0.5497, "step": 215 }, { "epoch": 1.23, "learning_rate": 5.1296508211378896e-05, "loss": 0.5525, "step": 220 }, { "epoch": 1.26, "learning_rate": 5.027432342356697e-05, "loss": 0.5793, "step": 225 }, { "epoch": 1.29, "learning_rate": 4.9235782507653006e-05, "loss": 0.5504, "step": 230 }, { "epoch": 1.32, "learning_rate": 4.818199755926237e-05, "loss": 0.5546, "step": 235 }, { "epoch": 1.35, "learning_rate": 4.711409699771225e-05, "loss": 0.5674, "step": 240 }, { "epoch": 1.37, "learning_rate": 4.603322435767194e-05, "loss": 0.5422, "step": 245 }, { "epoch": 1.4, "learning_rate": 4.49405370646373e-05, "loss": 0.5694, "step": 250 }, { "epoch": 1.43, "learning_rate": 4.3837205195530523e-05, "loss": 0.5359, "step": 255 }, { "epoch": 1.46, "learning_rate": 4.2724410225752544e-05, "loss": 0.5271, "step": 260 }, { "epoch": 1.49, "learning_rate": 4.160334376402943e-05, "loss": 0.5414, "step": 265 }, { "epoch": 1.51, "learning_rate": 4.047520627640808e-05, "loss": 0.5243, "step": 270 }, { "epoch": 1.54, "learning_rate": 3.934120580076699e-05, "loss": 0.5551, "step": 275 }, { "epoch": 1.57, "learning_rate": 3.820255665321909e-05, "loss": 0.5291, "step": 280 }, { "epoch": 1.6, "learning_rate": 3.706047812779161e-05, "loss": 0.5604, "step": 285 }, { "epoch": 1.63, "learning_rate": 3.5916193190775565e-05, "loss": 0.531, "step": 290 }, { "epoch": 1.65, "learning_rate": 3.4770927171142685e-05, "loss": 0.5531, "step": 295 }, { "epoch": 1.68, "learning_rate": 3.362590644843261e-05, "loss": 0.5574, "step": 300 }, { "epoch": 1.71, "learning_rate": 3.248235713951481e-05, "loss": 0.5286, "step": 305 }, { "epoch": 1.74, "learning_rate": 3.134150378563213e-05, "loss": 0.5286, "step": 310 }, { "epoch": 1.77, "learning_rate": 3.020456804113112e-05, "loss": 0.5335, "step": 315 }, { "epoch": 1.8, "learning_rate": 2.9072767365284138e-05, "loss": 0.5593, "step": 320 }, { "epoch": 1.82, "learning_rate": 2.7947313718603398e-05, "loss": 0.537, "step": 325 }, { "epoch": 1.85, "learning_rate": 2.6829412265043318e-05, "loss": 0.535, "step": 330 }, { "epoch": 1.88, "learning_rate": 2.5720260081480713e-05, "loss": 0.5187, "step": 335 }, { "epoch": 1.91, "learning_rate": 2.4621044875855016e-05, "loss": 0.5157, "step": 340 }, { "epoch": 1.94, "learning_rate": 2.353294371534073e-05, "loss": 0.5321, "step": 345 }, { "epoch": 1.96, "learning_rate": 2.245712176591449e-05, "loss": 0.5577, "step": 350 }, { "epoch": 1.99, "learning_rate": 2.139473104466615e-05, "loss": 0.5033, "step": 355 }, { "epoch": 2.0, "eval_loss": 0.7809656858444214, "eval_runtime": 8.805, "eval_samples_per_second": 23.623, "eval_steps_per_second": 5.906, "step": 356 }, { "epoch": 2.02, "learning_rate": 2.0346909186190027e-05, "loss": 0.433, "step": 360 }, { "epoch": 2.05, "learning_rate": 1.931477822437721e-05, "loss": 0.3788, "step": 365 }, { "epoch": 2.08, "learning_rate": 1.8299443390913702e-05, "loss": 0.3838, "step": 370 }, { "epoch": 2.1, "learning_rate": 1.7301991931770543e-05, "loss": 0.3723, "step": 375 }, { "epoch": 2.13, "learning_rate": 1.6323491942953597e-05, "loss": 0.3813, "step": 380 }, { "epoch": 2.16, "learning_rate": 1.5364991226759663e-05, "loss": 0.3772, "step": 385 }, { "epoch": 2.19, "learning_rate": 1.4427516169763444e-05, "loss": 0.3917, "step": 390 }, { "epoch": 2.22, "learning_rate": 1.351207064373717e-05, "loss": 0.3745, "step": 395 }, { "epoch": 2.24, "learning_rate": 1.2619634930679534e-05, "loss": 0.3923, "step": 400 }, { "epoch": 2.27, "learning_rate": 1.175116467310515e-05, "loss": 0.3817, "step": 405 }, { "epoch": 2.3, "learning_rate": 1.0907589850718605e-05, "loss": 0.4024, "step": 410 }, { "epoch": 2.33, "learning_rate": 1.0089813784569004e-05, "loss": 0.3807, "step": 415 }, { "epoch": 2.36, "learning_rate": 9.298712169751012e-06, "loss": 0.3739, "step": 420 }, { "epoch": 2.38, "learning_rate": 8.535132137688704e-06, "loss": 0.3415, "step": 425 }, { "epoch": 2.41, "learning_rate": 7.799891349006018e-06, "loss": 0.3652, "step": 430 }, { "epoch": 2.44, "learning_rate": 7.093777117955255e-06, "loss": 0.3785, "step": 435 }, { "epoch": 2.47, "learning_rate": 6.417545569341238e-06, "loss": 0.369, "step": 440 }, { "epoch": 2.5, "learning_rate": 5.771920828843995e-06, "loss": 0.3693, "step": 445 }, { "epoch": 2.52, "learning_rate": 5.157594247606773e-06, "loss": 0.3763, "step": 450 }, { "epoch": 2.55, "learning_rate": 4.575223661919969e-06, "loss": 0.3589, "step": 455 }, { "epoch": 2.58, "learning_rate": 4.025432688793538e-06, "loss": 0.372, "step": 460 }, { "epoch": 2.61, "learning_rate": 3.5088100581722544e-06, "loss": 0.3694, "step": 465 }, { "epoch": 2.64, "learning_rate": 3.0259089825089655e-06, "loss": 0.3791, "step": 470 }, { "epoch": 2.66, "learning_rate": 2.577246564370866e-06, "loss": 0.3838, "step": 475 }, { "epoch": 2.69, "learning_rate": 2.163303242713058e-06, "loss": 0.3786, "step": 480 }, { "epoch": 2.72, "learning_rate": 1.7845222784125152e-06, "loss": 0.4005, "step": 485 }, { "epoch": 2.75, "learning_rate": 1.4413092796132431e-06, "loss": 0.358, "step": 490 }, { "epoch": 2.78, "learning_rate": 1.1340317673909082e-06, "loss": 0.3938, "step": 495 }, { "epoch": 2.81, "learning_rate": 8.630187822020567e-07, "loss": 0.3751, "step": 500 }, { "epoch": 2.83, "learning_rate": 6.28560531539366e-07, "loss": 0.3627, "step": 505 }, { "epoch": 2.86, "learning_rate": 4.309080791701819e-07, "loss": 0.3779, "step": 510 }, { "epoch": 2.89, "learning_rate": 2.702730762911531e-07, "loss": 0.3841, "step": 515 }, { "epoch": 2.92, "learning_rate": 1.4682753488684196e-07, "loss": 0.3566, "step": 520 }, { "epoch": 2.95, "learning_rate": 6.070364353494617e-08, "loss": 0.3684, "step": 525 }, { "epoch": 2.97, "learning_rate": 1.1993625855495903e-08, "loss": 0.3561, "step": 530 }, { "epoch": 3.0, "eval_loss": 0.8389958739280701, "eval_runtime": 8.8169, "eval_samples_per_second": 23.591, "eval_steps_per_second": 5.898, "step": 534 }, { "epoch": 3.0, "step": 534, "total_flos": 1.0858665717058765e+17, "train_loss": 0.5910449749075072, "train_runtime": 1507.9641, "train_samples_per_second": 5.672, "train_steps_per_second": 0.354 } ], "logging_steps": 5, "max_steps": 534, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "total_flos": 1.0858665717058765e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }