bbytxt commited on
Commit
690d77b
·
verified ·
1 Parent(s): 262743f

Training in progress, step 100, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e5e6eb8fd4cb6dbbe4ec7981572f13a3c7717419c472d0d9ff6d3b972d5502ea
3
  size 159967880
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:523dc5b3f0972cefe226770c2ad55aa66f71799eaa7dd4e9167c0f26c3edbacc
3
  size 159967880
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:22d19b468ff404a93b1d2619112dd211f586b4688d74c707ad00b920a70d3238
3
  size 320194002
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4f84cf4e3fadd73fb35b714bc232bb82e1f27b24feb9470e5872e26b49f420d6
3
  size 320194002
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:eb5da702253e40d3909105761b192282defbec927cbd5e729a3b3a45c889cd35
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b635fafc36c81ad5baf855e8bb81484c018caf9c35f2f12172539e4089754e6f
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7ad2841b888ce0ae948634757c3fcacf0119c249e0fec8f3ca61ea266369ef92
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c5d2a6c6aafc669cea03b9634666f204de949a3d45ce2f48a07e7e3eaf18c715
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "best_metric": 2.671022653579712,
3
- "best_model_checkpoint": "miner_id_24/checkpoint-50",
4
- "epoch": 0.017776197671318106,
5
  "eval_steps": 25,
6
- "global_step": 50,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -381,6 +381,372 @@
381
  "eval_samples_per_second": 14.655,
382
  "eval_steps_per_second": 2.052,
383
  "step": 50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
384
  }
385
  ],
386
  "logging_steps": 1,
@@ -409,7 +775,7 @@
409
  "attributes": {}
410
  }
411
  },
412
- "total_flos": 6.977708362196582e+16,
413
  "train_batch_size": 8,
414
  "trial_name": null,
415
  "trial_params": null
 
1
  {
2
+ "best_metric": 2.6253039836883545,
3
+ "best_model_checkpoint": "miner_id_24/checkpoint-100",
4
+ "epoch": 0.03555239534263621,
5
  "eval_steps": 25,
6
+ "global_step": 100,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
381
  "eval_samples_per_second": 14.655,
382
  "eval_steps_per_second": 2.052,
383
  "step": 50
384
+ },
385
+ {
386
+ "epoch": 0.018131721624744468,
387
+ "grad_norm": 7.684964179992676,
388
+ "learning_rate": 0.0002668315918143169,
389
+ "loss": 9.6814,
390
+ "step": 51
391
+ },
392
+ {
393
+ "epoch": 0.01848724557817083,
394
+ "grad_norm": 8.576638221740723,
395
+ "learning_rate": 0.00026526016662852886,
396
+ "loss": 9.9166,
397
+ "step": 52
398
+ },
399
+ {
400
+ "epoch": 0.018842769531597192,
401
+ "grad_norm": 6.030575275421143,
402
+ "learning_rate": 0.00026365723046405023,
403
+ "loss": 9.4847,
404
+ "step": 53
405
+ },
406
+ {
407
+ "epoch": 0.019198293485023554,
408
+ "grad_norm": 5.481552600860596,
409
+ "learning_rate": 0.0002620232215476231,
410
+ "loss": 10.3622,
411
+ "step": 54
412
+ },
413
+ {
414
+ "epoch": 0.019553817438449916,
415
+ "grad_norm": 6.241498947143555,
416
+ "learning_rate": 0.0002603585866009697,
417
+ "loss": 10.2548,
418
+ "step": 55
419
+ },
420
+ {
421
+ "epoch": 0.01990934139187628,
422
+ "grad_norm": 6.387746334075928,
423
+ "learning_rate": 0.00025866378071866334,
424
+ "loss": 11.7413,
425
+ "step": 56
426
+ },
427
+ {
428
+ "epoch": 0.02026486534530264,
429
+ "grad_norm": 5.217421054840088,
430
+ "learning_rate": 0.00025693926724370956,
431
+ "loss": 10.0964,
432
+ "step": 57
433
+ },
434
+ {
435
+ "epoch": 0.020620389298729003,
436
+ "grad_norm": 5.437652587890625,
437
+ "learning_rate": 0.00025518551764087326,
438
+ "loss": 10.6227,
439
+ "step": 58
440
+ },
441
+ {
442
+ "epoch": 0.020975913252155365,
443
+ "grad_norm": 5.892682075500488,
444
+ "learning_rate": 0.00025340301136778483,
445
+ "loss": 10.518,
446
+ "step": 59
447
+ },
448
+ {
449
+ "epoch": 0.021331437205581727,
450
+ "grad_norm": 5.065958499908447,
451
+ "learning_rate": 0.00025159223574386114,
452
+ "loss": 10.1033,
453
+ "step": 60
454
+ },
455
+ {
456
+ "epoch": 0.02168696115900809,
457
+ "grad_norm": 4.1559953689575195,
458
+ "learning_rate": 0.0002497536858170772,
459
+ "loss": 10.8079,
460
+ "step": 61
461
+ },
462
+ {
463
+ "epoch": 0.02204248511243445,
464
+ "grad_norm": 4.143509387969971,
465
+ "learning_rate": 0.00024788786422862526,
466
+ "loss": 10.3888,
467
+ "step": 62
468
+ },
469
+ {
470
+ "epoch": 0.022398009065860813,
471
+ "grad_norm": 4.2139363288879395,
472
+ "learning_rate": 0.00024599528107549745,
473
+ "loss": 10.6368,
474
+ "step": 63
475
+ },
476
+ {
477
+ "epoch": 0.022753533019287175,
478
+ "grad_norm": 4.801663398742676,
479
+ "learning_rate": 0.00024407645377103054,
480
+ "loss": 9.8445,
481
+ "step": 64
482
+ },
483
+ {
484
+ "epoch": 0.023109056972713538,
485
+ "grad_norm": 5.6707024574279785,
486
+ "learning_rate": 0.00024213190690345018,
487
+ "loss": 9.8583,
488
+ "step": 65
489
+ },
490
+ {
491
+ "epoch": 0.0234645809261399,
492
+ "grad_norm": 4.797868728637695,
493
+ "learning_rate": 0.00024016217209245374,
494
+ "loss": 10.7598,
495
+ "step": 66
496
+ },
497
+ {
498
+ "epoch": 0.023820104879566262,
499
+ "grad_norm": 4.8325371742248535,
500
+ "learning_rate": 0.00023816778784387094,
501
+ "loss": 9.2898,
502
+ "step": 67
503
+ },
504
+ {
505
+ "epoch": 0.024175628832992624,
506
+ "grad_norm": 4.890848159790039,
507
+ "learning_rate": 0.0002361492994024415,
508
+ "loss": 10.3457,
509
+ "step": 68
510
+ },
511
+ {
512
+ "epoch": 0.024531152786418986,
513
+ "grad_norm": 7.144769191741943,
514
+ "learning_rate": 0.0002341072586027509,
515
+ "loss": 9.929,
516
+ "step": 69
517
+ },
518
+ {
519
+ "epoch": 0.024886676739845348,
520
+ "grad_norm": 5.622339248657227,
521
+ "learning_rate": 0.00023204222371836405,
522
+ "loss": 9.5537,
523
+ "step": 70
524
+ },
525
+ {
526
+ "epoch": 0.02524220069327171,
527
+ "grad_norm": 8.047707557678223,
528
+ "learning_rate": 0.00022995475930919905,
529
+ "loss": 10.8773,
530
+ "step": 71
531
+ },
532
+ {
533
+ "epoch": 0.025597724646698072,
534
+ "grad_norm": 5.305599689483643,
535
+ "learning_rate": 0.00022784543606718227,
536
+ "loss": 10.2004,
537
+ "step": 72
538
+ },
539
+ {
540
+ "epoch": 0.025953248600124434,
541
+ "grad_norm": 5.336524963378906,
542
+ "learning_rate": 0.00022571483066022657,
543
+ "loss": 9.5412,
544
+ "step": 73
545
+ },
546
+ {
547
+ "epoch": 0.026308772553550797,
548
+ "grad_norm": 5.118494987487793,
549
+ "learning_rate": 0.0002235635255745762,
550
+ "loss": 9.6813,
551
+ "step": 74
552
+ },
553
+ {
554
+ "epoch": 0.02666429650697716,
555
+ "grad_norm": 4.906371116638184,
556
+ "learning_rate": 0.00022139210895556104,
557
+ "loss": 10.4721,
558
+ "step": 75
559
+ },
560
+ {
561
+ "epoch": 0.02666429650697716,
562
+ "eval_loss": 2.505891799926758,
563
+ "eval_runtime": 3.4096,
564
+ "eval_samples_per_second": 14.665,
565
+ "eval_steps_per_second": 2.053,
566
+ "step": 75
567
+ },
568
+ {
569
+ "epoch": 0.02701982046040352,
570
+ "grad_norm": 4.981451511383057,
571
+ "learning_rate": 0.00021920117444680317,
572
+ "loss": 8.8563,
573
+ "step": 76
574
+ },
575
+ {
576
+ "epoch": 0.027375344413829883,
577
+ "grad_norm": 4.70904016494751,
578
+ "learning_rate": 0.00021699132102792097,
579
+ "loss": 10.5626,
580
+ "step": 77
581
+ },
582
+ {
583
+ "epoch": 0.027730868367256245,
584
+ "grad_norm": 5.6545233726501465,
585
+ "learning_rate": 0.0002147631528507739,
586
+ "loss": 11.0533,
587
+ "step": 78
588
+ },
589
+ {
590
+ "epoch": 0.028086392320682607,
591
+ "grad_norm": 5.137275218963623,
592
+ "learning_rate": 0.00021251727907429355,
593
+ "loss": 10.8668,
594
+ "step": 79
595
+ },
596
+ {
597
+ "epoch": 0.02844191627410897,
598
+ "grad_norm": 5.4811553955078125,
599
+ "learning_rate": 0.0002102543136979454,
600
+ "loss": 10.9318,
601
+ "step": 80
602
+ },
603
+ {
604
+ "epoch": 0.02879744022753533,
605
+ "grad_norm": 5.466274261474609,
606
+ "learning_rate": 0.0002079748753938678,
607
+ "loss": 10.401,
608
+ "step": 81
609
+ },
610
+ {
611
+ "epoch": 0.029152964180961694,
612
+ "grad_norm": 5.636516094207764,
613
+ "learning_rate": 0.0002056795873377331,
614
+ "loss": 10.0704,
615
+ "step": 82
616
+ },
617
+ {
618
+ "epoch": 0.029508488134388056,
619
+ "grad_norm": 5.315571308135986,
620
+ "learning_rate": 0.00020336907703837748,
621
+ "loss": 11.0001,
622
+ "step": 83
623
+ },
624
+ {
625
+ "epoch": 0.029864012087814418,
626
+ "grad_norm": 6.53507661819458,
627
+ "learning_rate": 0.00020104397616624645,
628
+ "loss": 11.992,
629
+ "step": 84
630
+ },
631
+ {
632
+ "epoch": 0.03021953604124078,
633
+ "grad_norm": 5.709775924682617,
634
+ "learning_rate": 0.00019870492038070252,
635
+ "loss": 10.4158,
636
+ "step": 85
637
+ },
638
+ {
639
+ "epoch": 0.030575059994667142,
640
+ "grad_norm": 5.086690425872803,
641
+ "learning_rate": 0.0001963525491562421,
642
+ "loss": 10.1491,
643
+ "step": 86
644
+ },
645
+ {
646
+ "epoch": 0.030930583948093504,
647
+ "grad_norm": 5.827716827392578,
648
+ "learning_rate": 0.0001939875056076697,
649
+ "loss": 11.3729,
650
+ "step": 87
651
+ },
652
+ {
653
+ "epoch": 0.031286107901519866,
654
+ "grad_norm": 5.376121520996094,
655
+ "learning_rate": 0.00019161043631427666,
656
+ "loss": 10.2509,
657
+ "step": 88
658
+ },
659
+ {
660
+ "epoch": 0.03164163185494623,
661
+ "grad_norm": 5.2902445793151855,
662
+ "learning_rate": 0.00018922199114307294,
663
+ "loss": 9.9306,
664
+ "step": 89
665
+ },
666
+ {
667
+ "epoch": 0.03199715580837259,
668
+ "grad_norm": 6.116251468658447,
669
+ "learning_rate": 0.00018682282307111987,
670
+ "loss": 11.4962,
671
+ "step": 90
672
+ },
673
+ {
674
+ "epoch": 0.03235267976179895,
675
+ "grad_norm": 5.817269325256348,
676
+ "learning_rate": 0.00018441358800701273,
677
+ "loss": 10.5293,
678
+ "step": 91
679
+ },
680
+ {
681
+ "epoch": 0.032708203715225315,
682
+ "grad_norm": 5.698713779449463,
683
+ "learning_rate": 0.00018199494461156203,
684
+ "loss": 9.9072,
685
+ "step": 92
686
+ },
687
+ {
688
+ "epoch": 0.03306372766865168,
689
+ "grad_norm": 7.766204357147217,
690
+ "learning_rate": 0.000179567554117722,
691
+ "loss": 10.4333,
692
+ "step": 93
693
+ },
694
+ {
695
+ "epoch": 0.03341925162207804,
696
+ "grad_norm": 6.0432939529418945,
697
+ "learning_rate": 0.00017713208014981648,
698
+ "loss": 11.2163,
699
+ "step": 94
700
+ },
701
+ {
702
+ "epoch": 0.0337747755755044,
703
+ "grad_norm": 6.411512851715088,
704
+ "learning_rate": 0.00017468918854211007,
705
+ "loss": 11.3715,
706
+ "step": 95
707
+ },
708
+ {
709
+ "epoch": 0.03413029952893076,
710
+ "grad_norm": 6.964004039764404,
711
+ "learning_rate": 0.00017223954715677627,
712
+ "loss": 10.0495,
713
+ "step": 96
714
+ },
715
+ {
716
+ "epoch": 0.034485823482357125,
717
+ "grad_norm": 5.977689743041992,
718
+ "learning_rate": 0.00016978382570131034,
719
+ "loss": 10.9753,
720
+ "step": 97
721
+ },
722
+ {
723
+ "epoch": 0.03484134743578349,
724
+ "grad_norm": 6.7813825607299805,
725
+ "learning_rate": 0.00016732269554543794,
726
+ "loss": 9.5063,
727
+ "step": 98
728
+ },
729
+ {
730
+ "epoch": 0.03519687138920985,
731
+ "grad_norm": 6.9034318923950195,
732
+ "learning_rate": 0.00016485682953756942,
733
+ "loss": 10.1179,
734
+ "step": 99
735
+ },
736
+ {
737
+ "epoch": 0.03555239534263621,
738
+ "grad_norm": 10.123339653015137,
739
+ "learning_rate": 0.00016238690182084986,
740
+ "loss": 9.7543,
741
+ "step": 100
742
+ },
743
+ {
744
+ "epoch": 0.03555239534263621,
745
+ "eval_loss": 2.6253039836883545,
746
+ "eval_runtime": 3.4096,
747
+ "eval_samples_per_second": 14.665,
748
+ "eval_steps_per_second": 2.053,
749
+ "step": 100
750
  }
751
  ],
752
  "logging_steps": 1,
 
775
  "attributes": {}
776
  }
777
  },
778
+ "total_flos": 1.3920701757417062e+17,
779
  "train_batch_size": 8,
780
  "trial_name": null,
781
  "trial_params": null