{ "best_metric": 11.921277046203613, "best_model_checkpoint": "miner_id_24/checkpoint-100", "epoch": 2.008298755186722, "eval_steps": 25, "global_step": 121, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.016597510373443983, "grad_norm": 0.008619977161288261, "learning_rate": 2.9999999999999997e-05, "loss": 11.9327, "step": 1 }, { "epoch": 0.016597510373443983, "eval_loss": 11.933255195617676, "eval_runtime": 0.0685, "eval_samples_per_second": 729.868, "eval_steps_per_second": 29.195, "step": 1 }, { "epoch": 0.03319502074688797, "grad_norm": 0.007508904207497835, "learning_rate": 5.9999999999999995e-05, "loss": 11.9321, "step": 2 }, { "epoch": 0.04979253112033195, "grad_norm": 0.007870020344853401, "learning_rate": 8.999999999999999e-05, "loss": 11.9325, "step": 3 }, { "epoch": 0.06639004149377593, "grad_norm": 0.008674144744873047, "learning_rate": 0.00011999999999999999, "loss": 11.9323, "step": 4 }, { "epoch": 0.08298755186721991, "grad_norm": 0.00782778114080429, "learning_rate": 0.00015, "loss": 11.9326, "step": 5 }, { "epoch": 0.0995850622406639, "grad_norm": 0.007341116201132536, "learning_rate": 0.00017999999999999998, "loss": 11.9326, "step": 6 }, { "epoch": 0.11618257261410789, "grad_norm": 0.00769679993391037, "learning_rate": 0.00020999999999999998, "loss": 11.9325, "step": 7 }, { "epoch": 0.13278008298755187, "grad_norm": 0.008552188985049725, "learning_rate": 0.00023999999999999998, "loss": 11.9325, "step": 8 }, { "epoch": 0.14937759336099585, "grad_norm": 0.0071903131902217865, "learning_rate": 0.00027, "loss": 11.9329, "step": 9 }, { "epoch": 0.16597510373443983, "grad_norm": 0.006793375127017498, "learning_rate": 0.0003, "loss": 11.9332, "step": 10 }, { "epoch": 0.1825726141078838, "grad_norm": 0.006766727194190025, "learning_rate": 0.00029993992606774825, "loss": 11.9326, "step": 11 }, { "epoch": 0.1991701244813278, "grad_norm": 0.006954336538910866, "learning_rate": 0.00029975975238935744, "loss": 11.9319, "step": 12 }, { "epoch": 0.2157676348547718, "grad_norm": 0.008698610588908195, "learning_rate": 0.00029945962328137895, "loss": 11.9319, "step": 13 }, { "epoch": 0.23236514522821577, "grad_norm": 0.008457913063466549, "learning_rate": 0.0002990397791429554, "loss": 11.9322, "step": 14 }, { "epoch": 0.24896265560165975, "grad_norm": 0.008619188331067562, "learning_rate": 0.0002985005562632645, "loss": 11.9318, "step": 15 }, { "epoch": 0.26556016597510373, "grad_norm": 0.009035887196660042, "learning_rate": 0.00029784238655215626, "loss": 11.9322, "step": 16 }, { "epoch": 0.2821576763485477, "grad_norm": 0.010372682474553585, "learning_rate": 0.000297065797194199, "loss": 11.9315, "step": 17 }, { "epoch": 0.2987551867219917, "grad_norm": 0.009721986949443817, "learning_rate": 0.00029617141022641243, "loss": 11.932, "step": 18 }, { "epoch": 0.3153526970954357, "grad_norm": 0.011710022576153278, "learning_rate": 0.00029515994204002484, "loss": 11.9316, "step": 19 }, { "epoch": 0.33195020746887965, "grad_norm": 0.011379418894648552, "learning_rate": 0.00029403220280665337, "loss": 11.932, "step": 20 }, { "epoch": 0.34854771784232363, "grad_norm": 0.012264162302017212, "learning_rate": 0.0002927890958293689, "loss": 11.9319, "step": 21 }, { "epoch": 0.3651452282157676, "grad_norm": 0.012727621011435986, "learning_rate": 0.0002914316168191626, "loss": 11.9315, "step": 22 }, { "epoch": 0.3817427385892116, "grad_norm": 0.013410990126430988, "learning_rate": 0.0002899608530973956, "loss": 11.9308, "step": 23 }, { "epoch": 0.3983402489626556, "grad_norm": 0.014416859485208988, "learning_rate": 0.00028837798272487026, "loss": 11.932, "step": 24 }, { "epoch": 0.4149377593360996, "grad_norm": 0.01126072183251381, "learning_rate": 0.00028668427355822034, "loss": 11.9328, "step": 25 }, { "epoch": 0.4149377593360996, "eval_loss": 11.931723594665527, "eval_runtime": 0.0673, "eval_samples_per_second": 743.046, "eval_steps_per_second": 29.722, "step": 25 }, { "epoch": 0.4315352697095436, "grad_norm": 0.019695591181516647, "learning_rate": 0.0002848810822343755, "loss": 11.931, "step": 26 }, { "epoch": 0.44813278008298757, "grad_norm": 0.01740470714867115, "learning_rate": 0.00028296985308391476, "loss": 11.9307, "step": 27 }, { "epoch": 0.46473029045643155, "grad_norm": 0.021143503487110138, "learning_rate": 0.0002809521169741782, "loss": 11.9306, "step": 28 }, { "epoch": 0.48132780082987553, "grad_norm": 0.023337863385677338, "learning_rate": 0.0002788294900830639, "loss": 11.9307, "step": 29 }, { "epoch": 0.4979253112033195, "grad_norm": 0.026668556034564972, "learning_rate": 0.00027660367260449255, "loss": 11.9305, "step": 30 }, { "epoch": 0.5145228215767634, "grad_norm": 0.02838689088821411, "learning_rate": 0.0002742764473865763, "loss": 11.9296, "step": 31 }, { "epoch": 0.5311203319502075, "grad_norm": 0.027914777398109436, "learning_rate": 0.00027184967850358286, "loss": 11.9302, "step": 32 }, { "epoch": 0.5477178423236515, "grad_norm": 0.028979269787669182, "learning_rate": 0.0002693253097628385, "loss": 11.9297, "step": 33 }, { "epoch": 0.5643153526970954, "grad_norm": 0.03429955616593361, "learning_rate": 0.00026670536314776593, "loss": 11.9294, "step": 34 }, { "epoch": 0.5809128630705395, "grad_norm": 0.026081860065460205, "learning_rate": 0.00026399193719830457, "loss": 11.9296, "step": 35 }, { "epoch": 0.5975103734439834, "grad_norm": 0.026742985472083092, "learning_rate": 0.00026118720533001, "loss": 11.9289, "step": 36 }, { "epoch": 0.6141078838174274, "grad_norm": 0.03144499287009239, "learning_rate": 0.0002582934140931786, "loss": 11.929, "step": 37 }, { "epoch": 0.6307053941908713, "grad_norm": 0.04750339314341545, "learning_rate": 0.0002553128813733934, "loss": 11.9278, "step": 38 }, { "epoch": 0.6473029045643154, "grad_norm": 0.044398147612810135, "learning_rate": 0.0002522479945349299, "loss": 11.9275, "step": 39 }, { "epoch": 0.6639004149377593, "grad_norm": 0.04385744780302048, "learning_rate": 0.00024910120850851216, "loss": 11.9278, "step": 40 }, { "epoch": 0.6804979253112033, "grad_norm": 0.05074305832386017, "learning_rate": 0.00024587504382494774, "loss": 11.9262, "step": 41 }, { "epoch": 0.6970954356846473, "grad_norm": 0.047362346202135086, "learning_rate": 0.00024257208459621828, "loss": 11.9268, "step": 42 }, { "epoch": 0.7136929460580913, "grad_norm": 0.04583980143070221, "learning_rate": 0.00023919497644564298, "loss": 11.9258, "step": 43 }, { "epoch": 0.7302904564315352, "grad_norm": 0.049639176577329636, "learning_rate": 0.0002357464243887718, "loss": 11.9255, "step": 44 }, { "epoch": 0.7468879668049793, "grad_norm": 0.04268966615200043, "learning_rate": 0.00023222919066670647, "loss": 11.9256, "step": 45 }, { "epoch": 0.7634854771784232, "grad_norm": 0.04423046112060547, "learning_rate": 0.00022864609253358474, "loss": 11.9251, "step": 46 }, { "epoch": 0.7800829875518672, "grad_norm": 0.03876250982284546, "learning_rate": 0.000225, "loss": 11.9259, "step": 47 }, { "epoch": 0.7966804979253111, "grad_norm": 0.031338173896074295, "learning_rate": 0.00022129383353416347, "loss": 11.9259, "step": 48 }, { "epoch": 0.8132780082987552, "grad_norm": 0.031210312619805336, "learning_rate": 0.00021753056172265096, "loss": 11.9261, "step": 49 }, { "epoch": 0.8298755186721992, "grad_norm": 0.03140793740749359, "learning_rate": 0.00021371319889260717, "loss": 11.926, "step": 50 }, { "epoch": 0.8298755186721992, "eval_loss": 11.924413681030273, "eval_runtime": 0.0674, "eval_samples_per_second": 742.066, "eval_steps_per_second": 29.683, "step": 50 }, { "epoch": 0.8464730290456431, "grad_norm": 0.043788015842437744, "learning_rate": 0.00020984480269731242, "loss": 11.9232, "step": 51 }, { "epoch": 0.8630705394190872, "grad_norm": 0.03660878911614418, "learning_rate": 0.0002059284716670463, "loss": 11.9244, "step": 52 }, { "epoch": 0.8796680497925311, "grad_norm": 0.03714355081319809, "learning_rate": 0.00020196734272720854, "loss": 11.923, "step": 53 }, { "epoch": 0.8962655601659751, "grad_norm": 0.03057694435119629, "learning_rate": 0.00019796458868568678, "loss": 11.922, "step": 54 }, { "epoch": 0.9128630705394191, "grad_norm": 0.02990574575960636, "learning_rate": 0.00019392341569148252, "loss": 11.9218, "step": 55 }, { "epoch": 0.9294605809128631, "grad_norm": 0.025974757969379425, "learning_rate": 0.00018984706066663143, "loss": 11.9227, "step": 56 }, { "epoch": 0.946058091286307, "grad_norm": 0.02820601500570774, "learning_rate": 0.00018573878871347473, "loss": 11.9225, "step": 57 }, { "epoch": 0.9626556016597511, "grad_norm": 0.028666473925113678, "learning_rate": 0.00018160189049935892, "loss": 11.9233, "step": 58 }, { "epoch": 0.979253112033195, "grad_norm": 0.02322000451385975, "learning_rate": 0.00017743967962085798, "loss": 11.923, "step": 59 }, { "epoch": 0.995850622406639, "grad_norm": 0.028861528262495995, "learning_rate": 0.00017325548994962965, "loss": 11.924, "step": 60 }, { "epoch": 1.012448132780083, "grad_norm": 0.04327556490898132, "learning_rate": 0.0001690526729620318, "loss": 21.3788, "step": 61 }, { "epoch": 1.0290456431535269, "grad_norm": 0.02456553652882576, "learning_rate": 0.00016483459505463747, "loss": 11.6539, "step": 62 }, { "epoch": 1.045643153526971, "grad_norm": 0.02364276722073555, "learning_rate": 0.00016060463484779918, "loss": 11.83, "step": 63 }, { "epoch": 1.062240663900415, "grad_norm": 0.02070525474846363, "learning_rate": 0.00015636618047942222, "loss": 11.9144, "step": 64 }, { "epoch": 1.0788381742738589, "grad_norm": 0.0210164375603199, "learning_rate": 0.00015212262689111433, "loss": 11.9249, "step": 65 }, { "epoch": 1.095435684647303, "grad_norm": 0.02003309689462185, "learning_rate": 0.0001478773731088857, "loss": 11.9061, "step": 66 }, { "epoch": 1.112033195020747, "grad_norm": 0.01854197308421135, "learning_rate": 0.00014363381952057778, "loss": 11.938, "step": 67 }, { "epoch": 1.1286307053941909, "grad_norm": 0.022063063457608223, "learning_rate": 0.0001393953651522008, "loss": 11.9017, "step": 68 }, { "epoch": 1.1452282157676348, "grad_norm": 0.017270218580961227, "learning_rate": 0.00013516540494536253, "loss": 11.978, "step": 69 }, { "epoch": 1.161825726141079, "grad_norm": 0.01767859421670437, "learning_rate": 0.00013094732703796818, "loss": 11.9076, "step": 70 }, { "epoch": 1.1784232365145229, "grad_norm": 0.01846623234450817, "learning_rate": 0.0001267445100503703, "loss": 11.9928, "step": 71 }, { "epoch": 1.1950207468879668, "grad_norm": 0.02277253195643425, "learning_rate": 0.000122560320379142, "loss": 11.9957, "step": 72 }, { "epoch": 1.2116182572614107, "grad_norm": 0.017460819333791733, "learning_rate": 0.00011839810950064109, "loss": 9.1042, "step": 73 }, { "epoch": 1.2282157676348548, "grad_norm": 0.027226807549595833, "learning_rate": 0.00011426121128652526, "loss": 14.9039, "step": 74 }, { "epoch": 1.2448132780082988, "grad_norm": 0.02005860209465027, "learning_rate": 0.00011015293933336857, "loss": 11.642, "step": 75 }, { "epoch": 1.2448132780082988, "eval_loss": 11.92165756225586, "eval_runtime": 0.0678, "eval_samples_per_second": 737.673, "eval_steps_per_second": 29.507, "step": 75 }, { "epoch": 1.2614107883817427, "grad_norm": 0.01866528019309044, "learning_rate": 0.00010607658430851744, "loss": 11.8735, "step": 76 }, { "epoch": 1.2780082987551866, "grad_norm": 0.014573541469871998, "learning_rate": 0.0001020354113143132, "loss": 11.8891, "step": 77 }, { "epoch": 1.2946058091286308, "grad_norm": 0.014832578599452972, "learning_rate": 9.803265727279149e-05, "loss": 11.9601, "step": 78 }, { "epoch": 1.3112033195020747, "grad_norm": 0.010414165444672108, "learning_rate": 9.407152833295372e-05, "loss": 11.9166, "step": 79 }, { "epoch": 1.3278008298755186, "grad_norm": 0.014295806176960468, "learning_rate": 9.015519730268754e-05, "loss": 11.883, "step": 80 }, { "epoch": 1.3443983402489628, "grad_norm": 0.011363668367266655, "learning_rate": 8.62868011073928e-05, "loss": 11.9823, "step": 81 }, { "epoch": 1.3609958506224067, "grad_norm": 0.012058326043188572, "learning_rate": 8.246943827734897e-05, "loss": 11.9326, "step": 82 }, { "epoch": 1.3775933609958506, "grad_norm": 0.013995883986353874, "learning_rate": 7.870616646583648e-05, "loss": 11.9321, "step": 83 }, { "epoch": 1.3941908713692945, "grad_norm": 0.017102347686886787, "learning_rate": 7.500000000000002e-05, "loss": 11.9893, "step": 84 }, { "epoch": 1.4107883817427385, "grad_norm": 0.019754430279135704, "learning_rate": 7.135390746641526e-05, "loss": 12.3581, "step": 85 }, { "epoch": 1.4273858921161826, "grad_norm": 0.014634879305958748, "learning_rate": 6.777080933329354e-05, "loss": 11.5957, "step": 86 }, { "epoch": 1.4439834024896265, "grad_norm": 0.01592666283249855, "learning_rate": 6.425357561122819e-05, "loss": 11.8704, "step": 87 }, { "epoch": 1.4605809128630705, "grad_norm": 0.01606922596693039, "learning_rate": 6.080502355435701e-05, "loss": 11.7446, "step": 88 }, { "epoch": 1.4771784232365146, "grad_norm": 0.017506958916783333, "learning_rate": 5.742791540378175e-05, "loss": 11.9198, "step": 89 }, { "epoch": 1.4937759336099585, "grad_norm": 0.014228662475943565, "learning_rate": 5.4124956175052295e-05, "loss": 11.9523, "step": 90 }, { "epoch": 1.5103734439834025, "grad_norm": 0.01120895054191351, "learning_rate": 5.089879149148781e-05, "loss": 11.8631, "step": 91 }, { "epoch": 1.5269709543568464, "grad_norm": 0.01069362461566925, "learning_rate": 4.7752005465070094e-05, "loss": 11.9319, "step": 92 }, { "epoch": 1.5435684647302903, "grad_norm": 0.011584184132516384, "learning_rate": 4.468711862660662e-05, "loss": 11.9276, "step": 93 }, { "epoch": 1.5601659751037344, "grad_norm": 0.012647481635212898, "learning_rate": 4.1706585906821334e-05, "loss": 11.9491, "step": 94 }, { "epoch": 1.5767634854771784, "grad_norm": 0.012272904627025127, "learning_rate": 3.881279466999001e-05, "loss": 11.9568, "step": 95 }, { "epoch": 1.5933609958506225, "grad_norm": 0.016778666526079178, "learning_rate": 3.600806280169541e-05, "loss": 11.9439, "step": 96 }, { "epoch": 1.6099585062240664, "grad_norm": 0.01830066554248333, "learning_rate": 3.3294636852234105e-05, "loss": 12.0261, "step": 97 }, { "epoch": 1.6265560165975104, "grad_norm": 0.0076575614511966705, "learning_rate": 3.067469023716154e-05, "loss": 9.0989, "step": 98 }, { "epoch": 1.6431535269709543, "grad_norm": 0.020825980231165886, "learning_rate": 2.8150321496417135e-05, "loss": 14.9122, "step": 99 }, { "epoch": 1.6597510373443982, "grad_norm": 0.013375605456531048, "learning_rate": 2.5723552613423687e-05, "loss": 11.6177, "step": 100 }, { "epoch": 1.6597510373443982, "eval_loss": 11.921277046203613, "eval_runtime": 0.0676, "eval_samples_per_second": 739.82, "eval_steps_per_second": 29.593, "step": 100 }, { "epoch": 1.6763485477178424, "grad_norm": 0.01629016175866127, "learning_rate": 2.3396327395507448e-05, "loss": 11.9071, "step": 101 }, { "epoch": 1.6929460580912863, "grad_norm": 0.015467053279280663, "learning_rate": 2.117050991693609e-05, "loss": 11.8865, "step": 102 }, { "epoch": 1.7095435684647304, "grad_norm": 0.01504999864846468, "learning_rate": 1.9047883025821774e-05, "loss": 11.9293, "step": 103 }, { "epoch": 1.7261410788381744, "grad_norm": 0.01039854995906353, "learning_rate": 1.7030146916085185e-05, "loss": 11.8954, "step": 104 }, { "epoch": 1.7427385892116183, "grad_norm": 0.011301021091639996, "learning_rate": 1.5118917765624467e-05, "loss": 11.9575, "step": 105 }, { "epoch": 1.7593360995850622, "grad_norm": 0.009601314552128315, "learning_rate": 1.3315726441779629e-05, "loss": 11.9122, "step": 106 }, { "epoch": 1.7759336099585061, "grad_norm": 0.011440463364124298, "learning_rate": 1.1622017275129708e-05, "loss": 11.9329, "step": 107 }, { "epoch": 1.79253112033195, "grad_norm": 0.013274903409183025, "learning_rate": 1.00391469026044e-05, "loss": 11.9641, "step": 108 }, { "epoch": 1.8091286307053942, "grad_norm": 0.013196711428463459, "learning_rate": 8.568383180837368e-06, "loss": 12.013, "step": 109 }, { "epoch": 1.8257261410788381, "grad_norm": 0.022849783301353455, "learning_rate": 7.210904170631021e-06, "loss": 12.3974, "step": 110 }, { "epoch": 1.8423236514522823, "grad_norm": 0.015286913141608238, "learning_rate": 5.967797193346574e-06, "loss": 11.7548, "step": 111 }, { "epoch": 1.8589211618257262, "grad_norm": 0.012541989795863628, "learning_rate": 4.840057959975169e-06, "loss": 11.5806, "step": 112 }, { "epoch": 1.8755186721991701, "grad_norm": 0.013375689275562763, "learning_rate": 3.828589773587515e-06, "loss": 11.8641, "step": 113 }, { "epoch": 1.892116182572614, "grad_norm": 0.01497753243893385, "learning_rate": 2.934202805800989e-06, "loss": 11.8974, "step": 114 }, { "epoch": 1.908713692946058, "grad_norm": 0.010004348121583462, "learning_rate": 2.1576134478437313e-06, "loss": 11.8974, "step": 115 }, { "epoch": 1.9253112033195021, "grad_norm": 0.009814193472266197, "learning_rate": 1.4994437367354339e-06, "loss": 11.9167, "step": 116 }, { "epoch": 1.941908713692946, "grad_norm": 0.008285530842840672, "learning_rate": 9.602208570445636e-07, "loss": 12.0253, "step": 117 }, { "epoch": 1.9585062240663902, "grad_norm": 0.01668418012559414, "learning_rate": 5.403767186210218e-07, "loss": 11.8426, "step": 118 }, { "epoch": 1.9751037344398341, "grad_norm": 0.014265616424381733, "learning_rate": 2.402476106425466e-07, "loss": 12.0079, "step": 119 }, { "epoch": 1.991701244813278, "grad_norm": 0.016387728974223137, "learning_rate": 6.007393225176404e-08, "loss": 12.1605, "step": 120 }, { "epoch": 2.008298755186722, "grad_norm": 0.04243115335702896, "learning_rate": 0.0, "loss": 20.6424, "step": 121 } ], "logging_steps": 1, "max_steps": 121, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 1, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 774412984320.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }