leixa committed on
Commit 913624c
1 Parent(s): 81707a6

Training in progress, step 375, checkpoint

last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5744af8f22d56944d2d945034e5a025b6de0020efd45cc44c9b8cb27e1e94f71
-size 150486964
+oid sha256:814c0505ae9626a132eee74a5d49746ba88ce25071435a2e3bf44bf9a6955753
+size 150487412
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:50c8f9662444a91e91f01d69a445b49517e278e5e03e01795aa31274466481e0
+oid sha256:8d2e5ea8bbdbe6933b5b2f456e20e6bca2dc98048eecb503cf50ba6989aff775
 size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d90c730646140ec36d7749c40daa51b09c4e3a0b620d5c95eeda7764b46e3d79
+oid sha256:128e0b0294b5389dce5b958620f0aba512ba88459c3fb7de261ee4ac77eb7fa5
 size 1064
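
Each pointer file above follows the Git LFS pointer format: a version line, the object's SHA-256 oid, and its size in bytes; this commit only swaps the optimizer, RNG-state, and scheduler blobs for their step-375 versions. As a minimal sketch (the local file paths below are hypothetical, not part of the commit), a downloaded blob can be checked against its pointer like this:

# Sketch: verify a local checkpoint blob against a Git LFS pointer.
# The paths passed to verify() are assumptions for illustration only.
import hashlib
import os

def read_pointer(pointer_path):
    # Parse "oid sha256:<hex>" and "size <bytes>" from the pointer text.
    fields = {}
    with open(pointer_path) as f:
        for line in f:
            if line.strip():
                key, value = line.strip().split(" ", 1)
                fields[key] = value
    return fields["oid"].split(":", 1)[1], int(fields["size"])

def verify(pointer_path, blob_path):
    expected_oid, expected_size = read_pointer(pointer_path)
    digest = hashlib.sha256()
    with open(blob_path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    return digest.hexdigest() == expected_oid and os.path.getsize(blob_path) == expected_size

# e.g. verify("optimizer.pt.pointer", "last-checkpoint/optimizer.pt")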
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.00808142847329694,
+  "epoch": 0.01212214270994541,
   "eval_steps": 125,
-  "global_step": 250,
+  "global_step": 375,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -612,6 +612,308 @@
       "eval_samples_per_second": 45.252,
       "eval_steps_per_second": 22.627,
       "step": 250
+    },
+    {
+      "epoch": 0.008146079901083316,
+      "grad_norm": NaN,
+      "learning_rate": 5.0961652739384356e-05,
+      "loss": 0.0,
+      "step": 252
+    },
+    {
+      "epoch": 0.008243057042762878,
+      "grad_norm": NaN,
+      "learning_rate": 5e-05,
+      "loss": 0.0,
+      "step": 255
+    },
+    {
+      "epoch": 0.008340034184442442,
+      "grad_norm": NaN,
+      "learning_rate": 4.903834726061565e-05,
+      "loss": 0.0,
+      "step": 258
+    },
+    {
+      "epoch": 0.008437011326122006,
+      "grad_norm": NaN,
+      "learning_rate": 4.807705027948008e-05,
+      "loss": 0.0,
+      "step": 261
+    },
+    {
+      "epoch": 0.00853398846780157,
+      "grad_norm": NaN,
+      "learning_rate": 4.711646468323129e-05,
+      "loss": 0.0,
+      "step": 264
+    },
+    {
+      "epoch": 0.008630965609481131,
+      "grad_norm": NaN,
+      "learning_rate": 4.6156945835334184e-05,
+      "loss": 0.0,
+      "step": 267
+    },
+    {
+      "epoch": 0.008727942751160695,
+      "grad_norm": NaN,
+      "learning_rate": 4.5198848704615914e-05,
+      "loss": 0.0,
+      "step": 270
+    },
+    {
+      "epoch": 0.008824919892840259,
+      "grad_norm": NaN,
+      "learning_rate": 4.424252773394704e-05,
+      "loss": 0.0,
+      "step": 273
+    },
+    {
+      "epoch": 0.008921897034519823,
+      "grad_norm": NaN,
+      "learning_rate": 4.328833670911724e-05,
+      "loss": 0.0,
+      "step": 276
+    },
+    {
+      "epoch": 0.009018874176199385,
+      "grad_norm": NaN,
+      "learning_rate": 4.23366286279542e-05,
+      "loss": 0.0,
+      "step": 279
+    },
+    {
+      "epoch": 0.009115851317878948,
+      "grad_norm": NaN,
+      "learning_rate": 4.138775556973406e-05,
+      "loss": 0.0,
+      "step": 282
+    },
+    {
+      "epoch": 0.009212828459558512,
+      "grad_norm": NaN,
+      "learning_rate": 4.04420685649314e-05,
+      "loss": 0.0,
+      "step": 285
+    },
+    {
+      "epoch": 0.009309805601238074,
+      "grad_norm": NaN,
+      "learning_rate": 3.9499917465357534e-05,
+      "loss": 0.0,
+      "step": 288
+    },
+    {
+      "epoch": 0.009406782742917638,
+      "grad_norm": NaN,
+      "learning_rate": 3.856165081473474e-05,
+      "loss": 0.0,
+      "step": 291
+    },
+    {
+      "epoch": 0.009503759884597202,
+      "grad_norm": NaN,
+      "learning_rate": 3.762761571975429e-05,
+      "loss": 0.0,
+      "step": 294
+    },
+    {
+      "epoch": 0.009600737026276765,
+      "grad_norm": NaN,
+      "learning_rate": 3.6698157721666246e-05,
+      "loss": 0.0,
+      "step": 297
+    },
+    {
+      "epoch": 0.009697714167956327,
+      "grad_norm": NaN,
+      "learning_rate": 3.5773620668448384e-05,
+      "loss": 0.0,
+      "step": 300
+    },
+    {
+      "epoch": 0.009794691309635891,
+      "grad_norm": NaN,
+      "learning_rate": 3.48543465876014e-05,
+      "loss": 0.0,
+      "step": 303
+    },
+    {
+      "epoch": 0.009891668451315455,
+      "grad_norm": NaN,
+      "learning_rate": 3.3940675559617724e-05,
+      "loss": 0.0,
+      "step": 306
+    },
+    {
+      "epoch": 0.009988645592995019,
+      "grad_norm": NaN,
+      "learning_rate": 3.303294559217063e-05,
+      "loss": 0.0,
+      "step": 309
+    },
+    {
+      "epoch": 0.01008562273467458,
+      "grad_norm": NaN,
+      "learning_rate": 3.213149249506997e-05,
+      "loss": 0.0,
+      "step": 312
+    },
+    {
+      "epoch": 0.010182599876354144,
+      "grad_norm": NaN,
+      "learning_rate": 3.12366497560313e-05,
+      "loss": 0.0,
+      "step": 315
+    },
+    {
+      "epoch": 0.010279577018033708,
+      "grad_norm": NaN,
+      "learning_rate": 3.0348748417303823e-05,
+      "loss": 0.0,
+      "step": 318
+    },
+    {
+      "epoch": 0.010376554159713272,
+      "grad_norm": NaN,
+      "learning_rate": 2.9468116953203107e-05,
+      "loss": 0.0,
+      "step": 321
+    },
+    {
+      "epoch": 0.010473531301392834,
+      "grad_norm": NaN,
+      "learning_rate": 2.8595081148593738e-05,
+      "loss": 0.0,
+      "step": 324
+    },
+    {
+      "epoch": 0.010570508443072398,
+      "grad_norm": NaN,
+      "learning_rate": 2.772996397836704e-05,
+      "loss": 0.0,
+      "step": 327
+    },
+    {
+      "epoch": 0.010667485584751961,
+      "grad_norm": NaN,
+      "learning_rate": 2.687308548795825e-05,
+      "loss": 0.0,
+      "step": 330
+    },
+    {
+      "epoch": 0.010764462726431523,
+      "grad_norm": NaN,
+      "learning_rate": 2.6024762674947313e-05,
+      "loss": 0.0,
+      "step": 333
+    },
+    {
+      "epoch": 0.010861439868111087,
+      "grad_norm": NaN,
+      "learning_rate": 2.5185309371787513e-05,
+      "loss": 0.0,
+      "step": 336
+    },
+    {
+      "epoch": 0.01095841700979065,
+      "grad_norm": NaN,
+      "learning_rate": 2.43550361297047e-05,
+      "loss": 0.0,
+      "step": 339
+    },
+    {
+      "epoch": 0.011055394151470214,
+      "grad_norm": NaN,
+      "learning_rate": 2.353425010381063e-05,
+      "loss": 0.0,
+      "step": 342
+    },
+    {
+      "epoch": 0.011152371293149776,
+      "grad_norm": NaN,
+      "learning_rate": 2.272325493947257e-05,
+      "loss": 0.0,
+      "step": 345
+    },
+    {
+      "epoch": 0.01124934843482934,
+      "grad_norm": NaN,
+      "learning_rate": 2.192235065998126e-05,
+      "loss": 0.0,
+      "step": 348
+    },
+    {
+      "epoch": 0.011346325576508904,
+      "grad_norm": NaN,
+      "learning_rate": 2.1131833555559037e-05,
+      "loss": 0.0,
+      "step": 351
+    },
+    {
+      "epoch": 0.011443302718188468,
+      "grad_norm": NaN,
+      "learning_rate": 2.0351996073748713e-05,
+      "loss": 0.0,
+      "step": 354
+    },
+    {
+      "epoch": 0.01154027985986803,
+      "grad_norm": NaN,
+      "learning_rate": 1.9583126711224343e-05,
+      "loss": 0.0,
+      "step": 357
+    },
+    {
+      "epoch": 0.011637257001547593,
+      "grad_norm": NaN,
+      "learning_rate": 1.8825509907063327e-05,
+      "loss": 0.0,
+      "step": 360
+    },
+    {
+      "epoch": 0.011734234143227157,
+      "grad_norm": NaN,
+      "learning_rate": 1.807942593751973e-05,
+      "loss": 0.0,
+      "step": 363
+    },
+    {
+      "epoch": 0.011831211284906721,
+      "grad_norm": NaN,
+      "learning_rate": 1.7345150812337564e-05,
+      "loss": 0.0,
+      "step": 366
+    },
+    {
+      "epoch": 0.011928188426586283,
+      "grad_norm": NaN,
+      "learning_rate": 1.66229561726426e-05,
+      "loss": 0.0,
+      "step": 369
+    },
+    {
+      "epoch": 0.012025165568265847,
+      "grad_norm": NaN,
+      "learning_rate": 1.5913109190450032e-05,
+      "loss": 0.0,
+      "step": 372
+    },
+    {
+      "epoch": 0.01212214270994541,
+      "grad_norm": NaN,
+      "learning_rate": 1.5215872469825682e-05,
+      "loss": 0.0,
+      "step": 375
+    },
+    {
+      "epoch": 0.01212214270994541,
+      "eval_loss": NaN,
+      "eval_runtime": 573.9041,
+      "eval_samples_per_second": 45.393,
+      "eval_steps_per_second": 22.697,
+      "step": 375
     }
   ],
   "logging_steps": 3,
@@ -631,7 +933,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 1.700904566784e+16,
+  "total_flos": 2.551356850176e+16,
   "train_batch_size": 2,
   "trial_name": null,
   "trial_params": null