barbaroo committed on
Commit
a410737
1 Parent(s): df14403

Upload 10 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
config.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "facebook/nllb-200-distilled-600M",
3
+ "activation_dropout": 0.0,
4
+ "activation_function": "relu",
5
+ "architectures": [
6
+ "M2M100ForConditionalGeneration"
7
+ ],
8
+ "attention_dropout": 0.1,
9
+ "bos_token_id": 0,
10
+ "d_model": 1024,
11
+ "decoder_attention_heads": 16,
12
+ "decoder_ffn_dim": 4096,
13
+ "decoder_layerdrop": 0,
14
+ "decoder_layers": 12,
15
+ "decoder_start_token_id": 2,
16
+ "dropout": 0.1,
17
+ "encoder_attention_heads": 16,
18
+ "encoder_ffn_dim": 4096,
19
+ "encoder_layerdrop": 0,
20
+ "encoder_layers": 12,
21
+ "eos_token_id": 2,
22
+ "init_std": 0.02,
23
+ "is_encoder_decoder": true,
24
+ "max_length": 200,
25
+ "max_position_embeddings": 1024,
26
+ "model_type": "m2m_100",
27
+ "num_hidden_layers": 12,
28
+ "pad_token_id": 1,
29
+ "scale_embedding": true,
30
+ "tokenizer_class": "NllbTokenizer",
31
+ "torch_dtype": "float32",
32
+ "transformers_version": "4.32.1",
33
+ "use_cache": true,
34
+ "vocab_size": 256206
35
+ }
generation_config.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 0,
3
+ "decoder_start_token_id": 2,
4
+ "eos_token_id": 2,
5
+ "max_length": 200,
6
+ "pad_token_id": 1,
7
+ "transformers_version": "4.32.1"
8
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:618767c44537ed357cb93f4d8b65026c78c21f184f2caa8fcdcf3069d59c4d4d
3
+ size 2460469182
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:853d3b831a322ab43fece1d3ff86857d88e856bb1f82059c7b110b51dab10187
3
+ size 14244
sentencepiece.bpe.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:14bb8dfb35c0ffdea7bc01e56cea38b9e3d5efcdcb9c251d6b40538e1aab555a
3
+ size 4852054
special_tokens_map.json ADDED
@@ -0,0 +1,219 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "ace_Arab",
4
+ "ace_Latn",
5
+ "acm_Arab",
6
+ "acq_Arab",
7
+ "aeb_Arab",
8
+ "afr_Latn",
9
+ "ajp_Arab",
10
+ "aka_Latn",
11
+ "amh_Ethi",
12
+ "apc_Arab",
13
+ "arb_Arab",
14
+ "ars_Arab",
15
+ "ary_Arab",
16
+ "arz_Arab",
17
+ "asm_Beng",
18
+ "ast_Latn",
19
+ "awa_Deva",
20
+ "ayr_Latn",
21
+ "azb_Arab",
22
+ "azj_Latn",
23
+ "bak_Cyrl",
24
+ "bam_Latn",
25
+ "ban_Latn",
26
+ "bel_Cyrl",
27
+ "bem_Latn",
28
+ "ben_Beng",
29
+ "bho_Deva",
30
+ "bjn_Arab",
31
+ "bjn_Latn",
32
+ "bod_Tibt",
33
+ "bos_Latn",
34
+ "bug_Latn",
35
+ "bul_Cyrl",
36
+ "cat_Latn",
37
+ "ceb_Latn",
38
+ "ces_Latn",
39
+ "cjk_Latn",
40
+ "ckb_Arab",
41
+ "crh_Latn",
42
+ "cym_Latn",
43
+ "dan_Latn",
44
+ "deu_Latn",
45
+ "dik_Latn",
46
+ "dyu_Latn",
47
+ "dzo_Tibt",
48
+ "ell_Grek",
49
+ "eng_Latn",
50
+ "epo_Latn",
51
+ "est_Latn",
52
+ "eus_Latn",
53
+ "ewe_Latn",
54
+ "fao_Latn",
55
+ "pes_Arab",
56
+ "fij_Latn",
57
+ "fin_Latn",
58
+ "fon_Latn",
59
+ "fra_Latn",
60
+ "fur_Latn",
61
+ "fuv_Latn",
62
+ "gla_Latn",
63
+ "gle_Latn",
64
+ "glg_Latn",
65
+ "grn_Latn",
66
+ "guj_Gujr",
67
+ "hat_Latn",
68
+ "hau_Latn",
69
+ "heb_Hebr",
70
+ "hin_Deva",
71
+ "hne_Deva",
72
+ "hrv_Latn",
73
+ "hun_Latn",
74
+ "hye_Armn",
75
+ "ibo_Latn",
76
+ "ilo_Latn",
77
+ "ind_Latn",
78
+ "isl_Latn",
79
+ "ita_Latn",
80
+ "jav_Latn",
81
+ "jpn_Jpan",
82
+ "kab_Latn",
83
+ "kac_Latn",
84
+ "kam_Latn",
85
+ "kan_Knda",
86
+ "kas_Arab",
87
+ "kas_Deva",
88
+ "kat_Geor",
89
+ "knc_Arab",
90
+ "knc_Latn",
91
+ "kaz_Cyrl",
92
+ "kbp_Latn",
93
+ "kea_Latn",
94
+ "khm_Khmr",
95
+ "kik_Latn",
96
+ "kin_Latn",
97
+ "kir_Cyrl",
98
+ "kmb_Latn",
99
+ "kon_Latn",
100
+ "kor_Hang",
101
+ "kmr_Latn",
102
+ "lao_Laoo",
103
+ "lvs_Latn",
104
+ "lij_Latn",
105
+ "lim_Latn",
106
+ "lin_Latn",
107
+ "lit_Latn",
108
+ "lmo_Latn",
109
+ "ltg_Latn",
110
+ "ltz_Latn",
111
+ "lua_Latn",
112
+ "lug_Latn",
113
+ "luo_Latn",
114
+ "lus_Latn",
115
+ "mag_Deva",
116
+ "mai_Deva",
117
+ "mal_Mlym",
118
+ "mar_Deva",
119
+ "min_Latn",
120
+ "mkd_Cyrl",
121
+ "plt_Latn",
122
+ "mlt_Latn",
123
+ "mni_Beng",
124
+ "khk_Cyrl",
125
+ "mos_Latn",
126
+ "mri_Latn",
127
+ "zsm_Latn",
128
+ "mya_Mymr",
129
+ "nld_Latn",
130
+ "nno_Latn",
131
+ "nob_Latn",
132
+ "npi_Deva",
133
+ "nso_Latn",
134
+ "nus_Latn",
135
+ "nya_Latn",
136
+ "oci_Latn",
137
+ "gaz_Latn",
138
+ "ory_Orya",
139
+ "pag_Latn",
140
+ "pan_Guru",
141
+ "pap_Latn",
142
+ "pol_Latn",
143
+ "por_Latn",
144
+ "prs_Arab",
145
+ "pbt_Arab",
146
+ "quy_Latn",
147
+ "ron_Latn",
148
+ "run_Latn",
149
+ "rus_Cyrl",
150
+ "sag_Latn",
151
+ "san_Deva",
152
+ "sat_Beng",
153
+ "scn_Latn",
154
+ "shn_Mymr",
155
+ "sin_Sinh",
156
+ "slk_Latn",
157
+ "slv_Latn",
158
+ "smo_Latn",
159
+ "sna_Latn",
160
+ "snd_Arab",
161
+ "som_Latn",
162
+ "sot_Latn",
163
+ "spa_Latn",
164
+ "als_Latn",
165
+ "srd_Latn",
166
+ "srp_Cyrl",
167
+ "ssw_Latn",
168
+ "sun_Latn",
169
+ "swe_Latn",
170
+ "swh_Latn",
171
+ "szl_Latn",
172
+ "tam_Taml",
173
+ "tat_Cyrl",
174
+ "tel_Telu",
175
+ "tgk_Cyrl",
176
+ "tgl_Latn",
177
+ "tha_Thai",
178
+ "tir_Ethi",
179
+ "taq_Latn",
180
+ "taq_Tfng",
181
+ "tpi_Latn",
182
+ "tsn_Latn",
183
+ "tso_Latn",
184
+ "tuk_Latn",
185
+ "tum_Latn",
186
+ "tur_Latn",
187
+ "twi_Latn",
188
+ "tzm_Tfng",
189
+ "uig_Arab",
190
+ "ukr_Cyrl",
191
+ "umb_Latn",
192
+ "urd_Arab",
193
+ "uzn_Latn",
194
+ "vec_Latn",
195
+ "vie_Latn",
196
+ "war_Latn",
197
+ "wol_Latn",
198
+ "xho_Latn",
199
+ "ydd_Hebr",
200
+ "yor_Latn",
201
+ "yue_Hant",
202
+ "zho_Hans",
203
+ "zho_Hant",
204
+ "zul_Latn"
205
+ ],
206
+ "bos_token": "<s>",
207
+ "cls_token": "<s>",
208
+ "eos_token": "</s>",
209
+ "mask_token": {
210
+ "content": "<mask>",
211
+ "lstrip": true,
212
+ "normalized": true,
213
+ "rstrip": false,
214
+ "single_word": false
215
+ },
216
+ "pad_token": "<pad>",
217
+ "sep_token": "</s>",
218
+ "unk_token": "<unk>"
219
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6ff7aa7fc7583a64154ebf609119145fa22981fe0e212f9d45a95cb7386c81be
3
+ size 17331491
tokenizer_config.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": null,
3
+ "bos_token": "<s>",
4
+ "clean_up_tokenization_spaces": true,
5
+ "cls_token": "<s>",
6
+ "eos_token": "</s>",
7
+ "legacy_behaviour": false,
8
+ "mask_token": {
9
+ "__type": "AddedToken",
10
+ "content": "<mask>",
11
+ "lstrip": true,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "model_max_length": 1024,
17
+ "pad_token": "<pad>",
18
+ "sep_token": "</s>",
19
+ "sp_model_kwargs": {},
20
+ "src_lang": "fao_Latn",
21
+ "tgt_lang": null,
22
+ "tokenizer_class": "NllbTokenizer",
23
+ "unk_token": "<unk>"
24
+ }
trainer_state.json ADDED
@@ -0,0 +1,675 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.5260419845581055,
3
+ "best_model_checkpoint": "nllb_200_distilled_600M_en_fo_bsz_64_epochs_10_no_decay_sprotin+gpt4-fo-en-final/checkpoint-8000",
4
+ "epoch": 2.8116110751117507,
5
+ "eval_steps": 500,
6
+ "global_step": 8000,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.04,
13
+ "learning_rate": 1.4e-05,
14
+ "loss": 1.0309,
15
+ "step": 100
16
+ },
17
+ {
18
+ "epoch": 0.07,
19
+ "learning_rate": 2.8e-05,
20
+ "loss": 0.7947,
21
+ "step": 200
22
+ },
23
+ {
24
+ "epoch": 0.11,
25
+ "learning_rate": 4.2e-05,
26
+ "loss": 0.7532,
27
+ "step": 300
28
+ },
29
+ {
30
+ "epoch": 0.14,
31
+ "learning_rate": 5.6e-05,
32
+ "loss": 0.7018,
33
+ "step": 400
34
+ },
35
+ {
36
+ "epoch": 0.18,
37
+ "learning_rate": 7e-05,
38
+ "loss": 0.6906,
39
+ "step": 500
40
+ },
41
+ {
42
+ "epoch": 0.18,
43
+ "eval_bleu": 47.5534,
44
+ "eval_chrf++": 63.6048,
45
+ "eval_gen_len": 14.9537,
46
+ "eval_loss": 0.6338649392127991,
47
+ "eval_runtime": 1451.1833,
48
+ "eval_samples_per_second": 5.045,
49
+ "eval_steps_per_second": 2.523,
50
+ "step": 500
51
+ },
52
+ {
53
+ "epoch": 0.21,
54
+ "learning_rate": 6.974955277280858e-05,
55
+ "loss": 0.6521,
56
+ "step": 600
57
+ },
58
+ {
59
+ "epoch": 0.25,
60
+ "learning_rate": 6.949910554561716e-05,
61
+ "loss": 0.6456,
62
+ "step": 700
63
+ },
64
+ {
65
+ "epoch": 0.28,
66
+ "learning_rate": 6.924865831842576e-05,
67
+ "loss": 0.6272,
68
+ "step": 800
69
+ },
70
+ {
71
+ "epoch": 0.32,
72
+ "learning_rate": 6.899821109123434e-05,
73
+ "loss": 0.611,
74
+ "step": 900
75
+ },
76
+ {
77
+ "epoch": 0.35,
78
+ "learning_rate": 6.874776386404293e-05,
79
+ "loss": 0.6053,
80
+ "step": 1000
81
+ },
82
+ {
83
+ "epoch": 0.35,
84
+ "eval_bleu": 48.5345,
85
+ "eval_chrf++": 64.5072,
86
+ "eval_gen_len": 14.9738,
87
+ "eval_loss": 0.5941245555877686,
88
+ "eval_runtime": 1563.9824,
89
+ "eval_samples_per_second": 4.681,
90
+ "eval_steps_per_second": 2.341,
91
+ "step": 1000
92
+ },
93
+ {
94
+ "epoch": 0.39,
95
+ "learning_rate": 6.849731663685151e-05,
96
+ "loss": 0.6083,
97
+ "step": 1100
98
+ },
99
+ {
100
+ "epoch": 0.42,
101
+ "learning_rate": 6.824686940966009e-05,
102
+ "loss": 0.5881,
103
+ "step": 1200
104
+ },
105
+ {
106
+ "epoch": 0.46,
107
+ "learning_rate": 6.799642218246869e-05,
108
+ "loss": 0.5836,
109
+ "step": 1300
110
+ },
111
+ {
112
+ "epoch": 0.49,
113
+ "learning_rate": 6.774597495527727e-05,
114
+ "loss": 0.5865,
115
+ "step": 1400
116
+ },
117
+ {
118
+ "epoch": 0.53,
119
+ "learning_rate": 6.749552772808586e-05,
120
+ "loss": 0.5734,
121
+ "step": 1500
122
+ },
123
+ {
124
+ "epoch": 0.53,
125
+ "eval_bleu": 49.7044,
126
+ "eval_chrf++": 65.4589,
127
+ "eval_gen_len": 15.0074,
128
+ "eval_loss": 0.5721458196640015,
129
+ "eval_runtime": 1595.6062,
130
+ "eval_samples_per_second": 4.588,
131
+ "eval_steps_per_second": 2.294,
132
+ "step": 1500
133
+ },
134
+ {
135
+ "epoch": 0.56,
136
+ "learning_rate": 6.724508050089444e-05,
137
+ "loss": 0.5598,
138
+ "step": 1600
139
+ },
140
+ {
141
+ "epoch": 0.6,
142
+ "learning_rate": 6.699463327370304e-05,
143
+ "loss": 0.5565,
144
+ "step": 1700
145
+ },
146
+ {
147
+ "epoch": 0.63,
148
+ "learning_rate": 6.674418604651162e-05,
149
+ "loss": 0.5615,
150
+ "step": 1800
151
+ },
152
+ {
153
+ "epoch": 0.67,
154
+ "learning_rate": 6.64937388193202e-05,
155
+ "loss": 0.5548,
156
+ "step": 1900
157
+ },
158
+ {
159
+ "epoch": 0.7,
160
+ "learning_rate": 6.624329159212879e-05,
161
+ "loss": 0.5498,
162
+ "step": 2000
163
+ },
164
+ {
165
+ "epoch": 0.7,
166
+ "eval_bleu": 50.272,
167
+ "eval_chrf++": 66.0079,
168
+ "eval_gen_len": 15.0224,
169
+ "eval_loss": 0.5572330355644226,
170
+ "eval_runtime": 1598.5782,
171
+ "eval_samples_per_second": 4.58,
172
+ "eval_steps_per_second": 2.29,
173
+ "step": 2000
174
+ },
175
+ {
176
+ "epoch": 0.74,
177
+ "learning_rate": 6.599284436493739e-05,
178
+ "loss": 0.545,
179
+ "step": 2100
180
+ },
181
+ {
182
+ "epoch": 0.77,
183
+ "learning_rate": 6.574239713774597e-05,
184
+ "loss": 0.5348,
185
+ "step": 2200
186
+ },
187
+ {
188
+ "epoch": 0.81,
189
+ "learning_rate": 6.549194991055455e-05,
190
+ "loss": 0.548,
191
+ "step": 2300
192
+ },
193
+ {
194
+ "epoch": 0.84,
195
+ "learning_rate": 6.524150268336314e-05,
196
+ "loss": 0.5346,
197
+ "step": 2400
198
+ },
199
+ {
200
+ "epoch": 0.88,
201
+ "learning_rate": 6.499105545617173e-05,
202
+ "loss": 0.5339,
203
+ "step": 2500
204
+ },
205
+ {
206
+ "epoch": 0.88,
207
+ "eval_bleu": 50.7705,
208
+ "eval_chrf++": 66.3386,
209
+ "eval_gen_len": 15.0134,
210
+ "eval_loss": 0.5522322058677673,
211
+ "eval_runtime": 1606.2109,
212
+ "eval_samples_per_second": 4.558,
213
+ "eval_steps_per_second": 2.279,
214
+ "step": 2500
215
+ },
216
+ {
217
+ "epoch": 0.91,
218
+ "learning_rate": 6.474060822898032e-05,
219
+ "loss": 0.5346,
220
+ "step": 2600
221
+ },
222
+ {
223
+ "epoch": 0.95,
224
+ "learning_rate": 6.44901610017889e-05,
225
+ "loss": 0.5288,
226
+ "step": 2700
227
+ },
228
+ {
229
+ "epoch": 0.98,
230
+ "learning_rate": 6.423971377459748e-05,
231
+ "loss": 0.5356,
232
+ "step": 2800
233
+ },
234
+ {
235
+ "epoch": 1.02,
236
+ "learning_rate": 6.398926654740608e-05,
237
+ "loss": 0.4816,
238
+ "step": 2900
239
+ },
240
+ {
241
+ "epoch": 1.05,
242
+ "learning_rate": 6.373881932021467e-05,
243
+ "loss": 0.46,
244
+ "step": 3000
245
+ },
246
+ {
247
+ "epoch": 1.05,
248
+ "eval_bleu": 51.2211,
249
+ "eval_chrf++": 66.6581,
250
+ "eval_gen_len": 14.9399,
251
+ "eval_loss": 0.5472458004951477,
252
+ "eval_runtime": 1577.1872,
253
+ "eval_samples_per_second": 4.642,
254
+ "eval_steps_per_second": 2.321,
255
+ "step": 3000
256
+ },
257
+ {
258
+ "epoch": 1.09,
259
+ "learning_rate": 6.348837209302325e-05,
260
+ "loss": 0.4624,
261
+ "step": 3100
262
+ },
263
+ {
264
+ "epoch": 1.12,
265
+ "learning_rate": 6.323792486583183e-05,
266
+ "loss": 0.4614,
267
+ "step": 3200
268
+ },
269
+ {
270
+ "epoch": 1.16,
271
+ "learning_rate": 6.298747763864043e-05,
272
+ "loss": 0.4578,
273
+ "step": 3300
274
+ },
275
+ {
276
+ "epoch": 1.19,
277
+ "learning_rate": 6.273703041144901e-05,
278
+ "loss": 0.4639,
279
+ "step": 3400
280
+ },
281
+ {
282
+ "epoch": 1.23,
283
+ "learning_rate": 6.24865831842576e-05,
284
+ "loss": 0.4528,
285
+ "step": 3500
286
+ },
287
+ {
288
+ "epoch": 1.23,
289
+ "eval_bleu": 51.5852,
290
+ "eval_chrf++": 66.9915,
291
+ "eval_gen_len": 15.0586,
292
+ "eval_loss": 0.5428858995437622,
293
+ "eval_runtime": 1593.4651,
294
+ "eval_samples_per_second": 4.594,
295
+ "eval_steps_per_second": 2.298,
296
+ "step": 3500
297
+ },
298
+ {
299
+ "epoch": 1.27,
300
+ "learning_rate": 6.22361359570662e-05,
301
+ "loss": 0.4575,
302
+ "step": 3600
303
+ },
304
+ {
305
+ "epoch": 1.3,
306
+ "learning_rate": 6.198568872987478e-05,
307
+ "loss": 0.4402,
308
+ "step": 3700
309
+ },
310
+ {
311
+ "epoch": 1.34,
312
+ "learning_rate": 6.173524150268336e-05,
313
+ "loss": 0.4491,
314
+ "step": 3800
315
+ },
316
+ {
317
+ "epoch": 1.37,
318
+ "learning_rate": 6.148479427549194e-05,
319
+ "loss": 0.4621,
320
+ "step": 3900
321
+ },
322
+ {
323
+ "epoch": 1.41,
324
+ "learning_rate": 6.123434704830053e-05,
325
+ "loss": 0.4434,
326
+ "step": 4000
327
+ },
328
+ {
329
+ "epoch": 1.41,
330
+ "eval_bleu": 51.6156,
331
+ "eval_chrf++": 67.0015,
332
+ "eval_gen_len": 14.9593,
333
+ "eval_loss": 0.5395042300224304,
334
+ "eval_runtime": 1640.5913,
335
+ "eval_samples_per_second": 4.462,
336
+ "eval_steps_per_second": 2.232,
337
+ "step": 4000
338
+ },
339
+ {
340
+ "epoch": 1.44,
341
+ "learning_rate": 6.098389982110912e-05,
342
+ "loss": 0.4472,
343
+ "step": 4100
344
+ },
345
+ {
346
+ "epoch": 1.48,
347
+ "learning_rate": 6.073345259391771e-05,
348
+ "loss": 0.45,
349
+ "step": 4200
350
+ },
351
+ {
352
+ "epoch": 1.51,
353
+ "learning_rate": 6.048300536672629e-05,
354
+ "loss": 0.438,
355
+ "step": 4300
356
+ },
357
+ {
358
+ "epoch": 1.55,
359
+ "learning_rate": 6.0232558139534877e-05,
360
+ "loss": 0.4506,
361
+ "step": 4400
362
+ },
363
+ {
364
+ "epoch": 1.58,
365
+ "learning_rate": 5.998211091234346e-05,
366
+ "loss": 0.4356,
367
+ "step": 4500
368
+ },
369
+ {
370
+ "epoch": 1.58,
371
+ "eval_bleu": 51.8932,
372
+ "eval_chrf++": 67.156,
373
+ "eval_gen_len": 14.9581,
374
+ "eval_loss": 0.5352627038955688,
375
+ "eval_runtime": 1610.6884,
376
+ "eval_samples_per_second": 4.545,
377
+ "eval_steps_per_second": 2.273,
378
+ "step": 4500
379
+ },
380
+ {
381
+ "epoch": 1.62,
382
+ "learning_rate": 5.973166368515206e-05,
383
+ "loss": 0.4459,
384
+ "step": 4600
385
+ },
386
+ {
387
+ "epoch": 1.65,
388
+ "learning_rate": 5.948121645796064e-05,
389
+ "loss": 0.4354,
390
+ "step": 4700
391
+ },
392
+ {
393
+ "epoch": 1.69,
394
+ "learning_rate": 5.9230769230769225e-05,
395
+ "loss": 0.4498,
396
+ "step": 4800
397
+ },
398
+ {
399
+ "epoch": 1.72,
400
+ "learning_rate": 5.898032200357781e-05,
401
+ "loss": 0.4395,
402
+ "step": 4900
403
+ },
404
+ {
405
+ "epoch": 1.76,
406
+ "learning_rate": 5.87298747763864e-05,
407
+ "loss": 0.4404,
408
+ "step": 5000
409
+ },
410
+ {
411
+ "epoch": 1.76,
412
+ "eval_bleu": 52.2258,
413
+ "eval_chrf++": 67.5549,
414
+ "eval_gen_len": 15.0285,
415
+ "eval_loss": 0.5267295241355896,
416
+ "eval_runtime": 1625.9049,
417
+ "eval_samples_per_second": 4.503,
418
+ "eval_steps_per_second": 2.252,
419
+ "step": 5000
420
+ },
421
+ {
422
+ "epoch": 1.79,
423
+ "learning_rate": 5.847942754919499e-05,
424
+ "loss": 0.4362,
425
+ "step": 5100
426
+ },
427
+ {
428
+ "epoch": 1.83,
429
+ "learning_rate": 5.822898032200357e-05,
430
+ "loss": 0.4351,
431
+ "step": 5200
432
+ },
433
+ {
434
+ "epoch": 1.86,
435
+ "learning_rate": 5.7978533094812156e-05,
436
+ "loss": 0.4386,
437
+ "step": 5300
438
+ },
439
+ {
440
+ "epoch": 1.9,
441
+ "learning_rate": 5.7728085867620747e-05,
442
+ "loss": 0.435,
443
+ "step": 5400
444
+ },
445
+ {
446
+ "epoch": 1.93,
447
+ "learning_rate": 5.747763864042934e-05,
448
+ "loss": 0.434,
449
+ "step": 5500
450
+ },
451
+ {
452
+ "epoch": 1.93,
453
+ "eval_bleu": 52.0504,
454
+ "eval_chrf++": 67.3977,
455
+ "eval_gen_len": 15.0447,
456
+ "eval_loss": 0.5267728567123413,
457
+ "eval_runtime": 1575.9325,
458
+ "eval_samples_per_second": 4.646,
459
+ "eval_steps_per_second": 2.323,
460
+ "step": 5500
461
+ },
462
+ {
463
+ "epoch": 1.97,
464
+ "learning_rate": 5.722719141323792e-05,
465
+ "loss": 0.4442,
466
+ "step": 5600
467
+ },
468
+ {
469
+ "epoch": 2.0,
470
+ "learning_rate": 5.6976744186046504e-05,
471
+ "loss": 0.4287,
472
+ "step": 5700
473
+ },
474
+ {
475
+ "epoch": 2.04,
476
+ "learning_rate": 5.6726296958855094e-05,
477
+ "loss": 0.3744,
478
+ "step": 5800
479
+ },
480
+ {
481
+ "epoch": 2.07,
482
+ "learning_rate": 5.647584973166368e-05,
483
+ "loss": 0.3763,
484
+ "step": 5900
485
+ },
486
+ {
487
+ "epoch": 2.11,
488
+ "learning_rate": 5.622540250447227e-05,
489
+ "loss": 0.385,
490
+ "step": 6000
491
+ },
492
+ {
493
+ "epoch": 2.11,
494
+ "eval_bleu": 52.2895,
495
+ "eval_chrf++": 67.5809,
496
+ "eval_gen_len": 15.0462,
497
+ "eval_loss": 0.5324221849441528,
498
+ "eval_runtime": 1597.5578,
499
+ "eval_samples_per_second": 4.583,
500
+ "eval_steps_per_second": 2.292,
501
+ "step": 6000
502
+ },
503
+ {
504
+ "epoch": 2.14,
505
+ "learning_rate": 5.597495527728085e-05,
506
+ "loss": 0.3732,
507
+ "step": 6100
508
+ },
509
+ {
510
+ "epoch": 2.18,
511
+ "learning_rate": 5.572450805008944e-05,
512
+ "loss": 0.3712,
513
+ "step": 6200
514
+ },
515
+ {
516
+ "epoch": 2.21,
517
+ "learning_rate": 5.5474060822898026e-05,
518
+ "loss": 0.3696,
519
+ "step": 6300
520
+ },
521
+ {
522
+ "epoch": 2.25,
523
+ "learning_rate": 5.522361359570661e-05,
524
+ "loss": 0.3814,
525
+ "step": 6400
526
+ },
527
+ {
528
+ "epoch": 2.28,
529
+ "learning_rate": 5.497316636851521e-05,
530
+ "loss": 0.3782,
531
+ "step": 6500
532
+ },
533
+ {
534
+ "epoch": 2.28,
535
+ "eval_bleu": 52.7032,
536
+ "eval_chrf++": 67.8354,
537
+ "eval_gen_len": 15.0145,
538
+ "eval_loss": 0.5296782851219177,
539
+ "eval_runtime": 1601.7393,
540
+ "eval_samples_per_second": 4.571,
541
+ "eval_steps_per_second": 2.286,
542
+ "step": 6500
543
+ },
544
+ {
545
+ "epoch": 2.32,
546
+ "learning_rate": 5.472271914132379e-05,
547
+ "loss": 0.3769,
548
+ "step": 6600
549
+ },
550
+ {
551
+ "epoch": 2.35,
552
+ "learning_rate": 5.4472271914132374e-05,
553
+ "loss": 0.3743,
554
+ "step": 6700
555
+ },
556
+ {
557
+ "epoch": 2.39,
558
+ "learning_rate": 5.422182468694096e-05,
559
+ "loss": 0.3702,
560
+ "step": 6800
561
+ },
562
+ {
563
+ "epoch": 2.43,
564
+ "learning_rate": 5.3971377459749555e-05,
565
+ "loss": 0.374,
566
+ "step": 6900
567
+ },
568
+ {
569
+ "epoch": 2.46,
570
+ "learning_rate": 5.372093023255814e-05,
571
+ "loss": 0.3701,
572
+ "step": 7000
573
+ },
574
+ {
575
+ "epoch": 2.46,
576
+ "eval_bleu": 52.6804,
577
+ "eval_chrf++": 67.801,
578
+ "eval_gen_len": 15.0228,
579
+ "eval_loss": 0.5314484238624573,
580
+ "eval_runtime": 1555.2346,
581
+ "eval_samples_per_second": 4.707,
582
+ "eval_steps_per_second": 2.354,
583
+ "step": 7000
584
+ },
585
+ {
586
+ "epoch": 2.5,
587
+ "learning_rate": 5.347048300536672e-05,
588
+ "loss": 0.3695,
589
+ "step": 7100
590
+ },
591
+ {
592
+ "epoch": 2.53,
593
+ "learning_rate": 5.3220035778175306e-05,
594
+ "loss": 0.3719,
595
+ "step": 7200
596
+ },
597
+ {
598
+ "epoch": 2.57,
599
+ "learning_rate": 5.296958855098389e-05,
600
+ "loss": 0.3786,
601
+ "step": 7300
602
+ },
603
+ {
604
+ "epoch": 2.6,
605
+ "learning_rate": 5.2719141323792486e-05,
606
+ "loss": 0.3752,
607
+ "step": 7400
608
+ },
609
+ {
610
+ "epoch": 2.64,
611
+ "learning_rate": 5.246869409660107e-05,
612
+ "loss": 0.3669,
613
+ "step": 7500
614
+ },
615
+ {
616
+ "epoch": 2.64,
617
+ "eval_bleu": 52.7255,
618
+ "eval_chrf++": 67.8883,
619
+ "eval_gen_len": 14.9541,
620
+ "eval_loss": 0.5295674204826355,
621
+ "eval_runtime": 1595.3953,
622
+ "eval_samples_per_second": 4.589,
623
+ "eval_steps_per_second": 2.295,
624
+ "step": 7500
625
+ },
626
+ {
627
+ "epoch": 2.67,
628
+ "learning_rate": 5.2218246869409654e-05,
629
+ "loss": 0.3714,
630
+ "step": 7600
631
+ },
632
+ {
633
+ "epoch": 2.71,
634
+ "learning_rate": 5.1967799642218244e-05,
635
+ "loss": 0.3757,
636
+ "step": 7700
637
+ },
638
+ {
639
+ "epoch": 2.74,
640
+ "learning_rate": 5.171735241502683e-05,
641
+ "loss": 0.3723,
642
+ "step": 7800
643
+ },
644
+ {
645
+ "epoch": 2.78,
646
+ "learning_rate": 5.146690518783542e-05,
647
+ "loss": 0.3697,
648
+ "step": 7900
649
+ },
650
+ {
651
+ "epoch": 2.81,
652
+ "learning_rate": 5.1216457960644e-05,
653
+ "loss": 0.377,
654
+ "step": 8000
655
+ },
656
+ {
657
+ "epoch": 2.81,
658
+ "eval_bleu": 52.8794,
659
+ "eval_chrf++": 67.9457,
660
+ "eval_gen_len": 15.0385,
661
+ "eval_loss": 0.5260419845581055,
662
+ "eval_runtime": 1587.5224,
663
+ "eval_samples_per_second": 4.612,
664
+ "eval_steps_per_second": 2.306,
665
+ "step": 8000
666
+ }
667
+ ],
668
+ "logging_steps": 100,
669
+ "max_steps": 28450,
670
+ "num_train_epochs": 10,
671
+ "save_steps": 2000,
672
+ "total_flos": 1.1095532220721398e+18,
673
+ "trial_name": null,
674
+ "trial_params": null
675
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df509bd1aec72fc4723359f565472efcb5b00628b2ac4376e425af130cc7e604
3
+ size 4728