leixa commited on
Commit
a829bcd
1 Parent(s): 913624c

Training in progress, step 500, checkpoint

Browse files
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:814c0505ae9626a132eee74a5d49746ba88ce25071435a2e3bf44bf9a6955753
3
  size 150487412
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b14558afc2ff84281d8813573705af0c29271e886a5c55ea1716c22df3fa2654
3
  size 150487412
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8d2e5ea8bbdbe6933b5b2f456e20e6bca2dc98048eecb503cf50ba6989aff775
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d7e214f2b29bc7b5cdb3187dd8641f87052b9b8ab7ca01c37e612aeca4c84a0c
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:128e0b0294b5389dce5b958620f0aba512ba88459c3fb7de261ee4ac77eb7fa5
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe1d153de177b356f9e3a70d6e4ec979560b0c300994e71ca4cb89afc74c5b3a
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.01212214270994541,
5
  "eval_steps": 125,
6
- "global_step": 375,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -914,6 +914,301 @@
914
  "eval_samples_per_second": 45.393,
915
  "eval_steps_per_second": 22.697,
916
  "step": 375
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
917
  }
918
  ],
919
  "logging_steps": 3,
@@ -928,12 +1223,12 @@
928
  "should_evaluate": false,
929
  "should_log": false,
930
  "should_save": true,
931
- "should_training_stop": false
932
  },
933
  "attributes": {}
934
  }
935
  },
936
- "total_flos": 2.551356850176e+16,
937
  "train_batch_size": 2,
938
  "trial_name": null,
939
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.01616285694659388,
5
  "eval_steps": 125,
6
+ "global_step": 500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
914
  "eval_samples_per_second": 45.393,
915
  "eval_steps_per_second": 22.697,
916
  "step": 375
917
+ },
918
+ {
919
+ "epoch": 0.012219119851624972,
920
+ "grad_norm": NaN,
921
+ "learning_rate": 1.4531503949737108e-05,
922
+ "loss": 0.0,
923
+ "step": 378
924
+ },
925
+ {
926
+ "epoch": 0.012316096993304536,
927
+ "grad_norm": NaN,
928
+ "learning_rate": 1.3860256808630428e-05,
929
+ "loss": 0.0,
930
+ "step": 381
931
+ },
932
+ {
933
+ "epoch": 0.0124130741349841,
934
+ "grad_norm": NaN,
935
+ "learning_rate": 1.3202379370768252e-05,
936
+ "loss": 0.0,
937
+ "step": 384
938
+ },
939
+ {
940
+ "epoch": 0.012510051276663664,
941
+ "grad_norm": NaN,
942
+ "learning_rate": 1.2558115014363592e-05,
943
+ "loss": 0.0,
944
+ "step": 387
945
+ },
946
+ {
947
+ "epoch": 0.012607028418343226,
948
+ "grad_norm": NaN,
949
+ "learning_rate": 1.1927702081543279e-05,
950
+ "loss": 0.0,
951
+ "step": 390
952
+ },
953
+ {
954
+ "epoch": 0.01270400556002279,
955
+ "grad_norm": NaN,
956
+ "learning_rate": 1.1311373790174657e-05,
957
+ "loss": 0.0,
958
+ "step": 393
959
+ },
960
+ {
961
+ "epoch": 0.012800982701702353,
962
+ "grad_norm": NaN,
963
+ "learning_rate": 1.0709358147587884e-05,
964
+ "loss": 0.0,
965
+ "step": 396
966
+ },
967
+ {
968
+ "epoch": 0.012897959843381917,
969
+ "grad_norm": NaN,
970
+ "learning_rate": 1.0121877866225781e-05,
971
+ "loss": 0.0,
972
+ "step": 399
973
+ },
974
+ {
975
+ "epoch": 0.012994936985061479,
976
+ "grad_norm": NaN,
977
+ "learning_rate": 9.549150281252633e-06,
978
+ "loss": 0.0,
979
+ "step": 402
980
+ },
981
+ {
982
+ "epoch": 0.013091914126741043,
983
+ "grad_norm": NaN,
984
+ "learning_rate": 8.991387270152201e-06,
985
+ "loss": 0.0,
986
+ "step": 405
987
+ },
988
+ {
989
+ "epoch": 0.013188891268420606,
990
+ "grad_norm": NaN,
991
+ "learning_rate": 8.448795174344804e-06,
992
+ "loss": 0.0,
993
+ "step": 408
994
+ },
995
+ {
996
+ "epoch": 0.01328586841010017,
997
+ "grad_norm": NaN,
998
+ "learning_rate": 7.921574722852343e-06,
999
+ "loss": 0.0,
1000
+ "step": 411
1001
+ },
1002
+ {
1003
+ "epoch": 0.013382845551779732,
1004
+ "grad_norm": NaN,
1005
+ "learning_rate": 7.409920958039795e-06,
1006
+ "loss": 0.0,
1007
+ "step": 414
1008
+ },
1009
+ {
1010
+ "epoch": 0.013479822693459296,
1011
+ "grad_norm": NaN,
1012
+ "learning_rate": 6.9140231634602485e-06,
1013
+ "loss": 0.0,
1014
+ "step": 417
1015
+ },
1016
+ {
1017
+ "epoch": 0.01357679983513886,
1018
+ "grad_norm": NaN,
1019
+ "learning_rate": 6.43406479383053e-06,
1020
+ "loss": 0.0,
1021
+ "step": 420
1022
+ },
1023
+ {
1024
+ "epoch": 0.013673776976818422,
1025
+ "grad_norm": NaN,
1026
+ "learning_rate": 5.9702234071631e-06,
1027
+ "loss": 0.0,
1028
+ "step": 423
1029
+ },
1030
+ {
1031
+ "epoch": 0.013770754118497985,
1032
+ "grad_norm": NaN,
1033
+ "learning_rate": 5.5226705990794155e-06,
1034
+ "loss": 0.0,
1035
+ "step": 426
1036
+ },
1037
+ {
1038
+ "epoch": 0.013867731260177549,
1039
+ "grad_norm": NaN,
1040
+ "learning_rate": 5.091571939329048e-06,
1041
+ "loss": 0.0,
1042
+ "step": 429
1043
+ },
1044
+ {
1045
+ "epoch": 0.013964708401857113,
1046
+ "grad_norm": NaN,
1047
+ "learning_rate": 4.677086910538092e-06,
1048
+ "loss": 0.0,
1049
+ "step": 432
1050
+ },
1051
+ {
1052
+ "epoch": 0.014061685543536675,
1053
+ "grad_norm": NaN,
1054
+ "learning_rate": 4.279368849209381e-06,
1055
+ "loss": 0.0,
1056
+ "step": 435
1057
+ },
1058
+ {
1059
+ "epoch": 0.014158662685216239,
1060
+ "grad_norm": NaN,
1061
+ "learning_rate": 3.898564888996476e-06,
1062
+ "loss": 0.0,
1063
+ "step": 438
1064
+ },
1065
+ {
1066
+ "epoch": 0.014255639826895802,
1067
+ "grad_norm": NaN,
1068
+ "learning_rate": 3.534815906272404e-06,
1069
+ "loss": 0.0,
1070
+ "step": 441
1071
+ },
1072
+ {
1073
+ "epoch": 0.014352616968575366,
1074
+ "grad_norm": NaN,
1075
+ "learning_rate": 3.18825646801314e-06,
1076
+ "loss": 0.0,
1077
+ "step": 444
1078
+ },
1079
+ {
1080
+ "epoch": 0.014449594110254928,
1081
+ "grad_norm": NaN,
1082
+ "learning_rate": 2.8590147820153513e-06,
1083
+ "loss": 0.0,
1084
+ "step": 447
1085
+ },
1086
+ {
1087
+ "epoch": 0.014546571251934492,
1088
+ "grad_norm": NaN,
1089
+ "learning_rate": 2.547212649466568e-06,
1090
+ "loss": 0.0,
1091
+ "step": 450
1092
+ },
1093
+ {
1094
+ "epoch": 0.014643548393614056,
1095
+ "grad_norm": NaN,
1096
+ "learning_rate": 2.2529654198854835e-06,
1097
+ "loss": 0.0,
1098
+ "step": 453
1099
+ },
1100
+ {
1101
+ "epoch": 0.01474052553529362,
1102
+ "grad_norm": NaN,
1103
+ "learning_rate": 1.9763819484490355e-06,
1104
+ "loss": 0.0,
1105
+ "step": 456
1106
+ },
1107
+ {
1108
+ "epoch": 0.014837502676973181,
1109
+ "grad_norm": NaN,
1110
+ "learning_rate": 1.7175645557220566e-06,
1111
+ "loss": 0.0,
1112
+ "step": 459
1113
+ },
1114
+ {
1115
+ "epoch": 0.014934479818652745,
1116
+ "grad_norm": NaN,
1117
+ "learning_rate": 1.4766089898042678e-06,
1118
+ "loss": 0.0,
1119
+ "step": 462
1120
+ },
1121
+ {
1122
+ "epoch": 0.015031456960332309,
1123
+ "grad_norm": NaN,
1124
+ "learning_rate": 1.2536043909088191e-06,
1125
+ "loss": 0.0,
1126
+ "step": 465
1127
+ },
1128
+ {
1129
+ "epoch": 0.01512843410201187,
1130
+ "grad_norm": NaN,
1131
+ "learning_rate": 1.0486332583853563e-06,
1132
+ "loss": 0.0,
1133
+ "step": 468
1134
+ },
1135
+ {
1136
+ "epoch": 0.015225411243691435,
1137
+ "grad_norm": NaN,
1138
+ "learning_rate": 8.617714201998084e-07,
1139
+ "loss": 0.0,
1140
+ "step": 471
1141
+ },
1142
+ {
1143
+ "epoch": 0.015322388385370998,
1144
+ "grad_norm": NaN,
1145
+ "learning_rate": 6.93088004882253e-07,
1146
+ "loss": 0.0,
1147
+ "step": 474
1148
+ },
1149
+ {
1150
+ "epoch": 0.015419365527050562,
1151
+ "grad_norm": NaN,
1152
+ "learning_rate": 5.426454159531913e-07,
1153
+ "loss": 0.0,
1154
+ "step": 477
1155
+ },
1156
+ {
1157
+ "epoch": 0.015516342668730124,
1158
+ "grad_norm": NaN,
1159
+ "learning_rate": 4.104993088376974e-07,
1160
+ "loss": 0.0,
1161
+ "step": 480
1162
+ },
1163
+ {
1164
+ "epoch": 0.015613319810409688,
1165
+ "grad_norm": NaN,
1166
+ "learning_rate": 2.966985702759828e-07,
1167
+ "loss": 0.0,
1168
+ "step": 483
1169
+ },
1170
+ {
1171
+ "epoch": 0.01571029695208925,
1172
+ "grad_norm": NaN,
1173
+ "learning_rate": 2.012853002380466e-07,
1174
+ "loss": 0.0,
1175
+ "step": 486
1176
+ },
1177
+ {
1178
+ "epoch": 0.015807274093768815,
1179
+ "grad_norm": NaN,
1180
+ "learning_rate": 1.2429479634897267e-07,
1181
+ "loss": 0.0,
1182
+ "step": 489
1183
+ },
1184
+ {
1185
+ "epoch": 0.015904251235448377,
1186
+ "grad_norm": NaN,
1187
+ "learning_rate": 6.575554083078084e-08,
1188
+ "loss": 0.0,
1189
+ "step": 492
1190
+ },
1191
+ {
1192
+ "epoch": 0.016001228377127943,
1193
+ "grad_norm": NaN,
1194
+ "learning_rate": 2.568918996560532e-08,
1195
+ "loss": 0.0,
1196
+ "step": 495
1197
+ },
1198
+ {
1199
+ "epoch": 0.016098205518807505,
1200
+ "grad_norm": NaN,
1201
+ "learning_rate": 4.110566084036816e-09,
1202
+ "loss": 0.0,
1203
+ "step": 498
1204
+ },
1205
+ {
1206
+ "epoch": 0.01616285694659388,
1207
+ "eval_loss": NaN,
1208
+ "eval_runtime": 660.3671,
1209
+ "eval_samples_per_second": 39.449,
1210
+ "eval_steps_per_second": 19.725,
1211
+ "step": 500
1212
  }
1213
  ],
1214
  "logging_steps": 3,
 
1223
  "should_evaluate": false,
1224
  "should_log": false,
1225
  "should_save": true,
1226
+ "should_training_stop": true
1227
  },
1228
  "attributes": {}
1229
  }
1230
  },
1231
+ "total_flos": 3.401809133568e+16,
1232
  "train_batch_size": 2,
1233
  "trial_name": null,
1234
  "trial_params": null