barbaroo committed on
Commit
d0e14cf
1 Parent(s): 9cbab13

Upload 11 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
config.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "facebook/nllb-200-distilled-600M",
3
+ "activation_dropout": 0.0,
4
+ "activation_function": "relu",
5
+ "architectures": [
6
+ "M2M100ForConditionalGeneration"
7
+ ],
8
+ "attention_dropout": 0.1,
9
+ "bos_token_id": 0,
10
+ "d_model": 1024,
11
+ "decoder_attention_heads": 16,
12
+ "decoder_ffn_dim": 4096,
13
+ "decoder_layerdrop": 0,
14
+ "decoder_layers": 12,
15
+ "decoder_start_token_id": 2,
16
+ "dropout": 0.1,
17
+ "encoder_attention_heads": 16,
18
+ "encoder_ffn_dim": 4096,
19
+ "encoder_layerdrop": 0,
20
+ "encoder_layers": 12,
21
+ "eos_token_id": 2,
22
+ "init_std": 0.02,
23
+ "is_encoder_decoder": true,
24
+ "max_length": 200,
25
+ "max_position_embeddings": 1024,
26
+ "model_type": "m2m_100",
27
+ "num_hidden_layers": 12,
28
+ "pad_token_id": 1,
29
+ "scale_embedding": true,
30
+ "tokenizer_class": "NllbTokenizer",
31
+ "torch_dtype": "float32",
32
+ "transformers_version": "4.32.1",
33
+ "use_cache": true,
34
+ "vocab_size": 256206
35
+ }
generation_config.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 0,
3
+ "decoder_start_token_id": 2,
4
+ "eos_token_id": 2,
5
+ "max_length": 200,
6
+ "pad_token_id": 1,
7
+ "transformers_version": "4.32.1"
8
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6898506f3c102a1f6e48cf201f1e3b96dc6d7bbb28a9587b6c3bb3614aa28ab9
3
+ size 2460469182
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:25772a151b7d17fd11a98507401e7c15dfd41da42bcdaa4af0a563fdc30b5799
3
+ size 14244
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3fddbd70ca32e254cd4ea3865389dcbce1c04e82f577c61ae350f154ff4e2dff
3
+ size 1064
sentencepiece.bpe.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:14bb8dfb35c0ffdea7bc01e56cea38b9e3d5efcdcb9c251d6b40538e1aab555a
3
+ size 4852054
special_tokens_map.json ADDED
@@ -0,0 +1,219 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "ace_Arab",
4
+ "ace_Latn",
5
+ "acm_Arab",
6
+ "acq_Arab",
7
+ "aeb_Arab",
8
+ "afr_Latn",
9
+ "ajp_Arab",
10
+ "aka_Latn",
11
+ "amh_Ethi",
12
+ "apc_Arab",
13
+ "arb_Arab",
14
+ "ars_Arab",
15
+ "ary_Arab",
16
+ "arz_Arab",
17
+ "asm_Beng",
18
+ "ast_Latn",
19
+ "awa_Deva",
20
+ "ayr_Latn",
21
+ "azb_Arab",
22
+ "azj_Latn",
23
+ "bak_Cyrl",
24
+ "bam_Latn",
25
+ "ban_Latn",
26
+ "bel_Cyrl",
27
+ "bem_Latn",
28
+ "ben_Beng",
29
+ "bho_Deva",
30
+ "bjn_Arab",
31
+ "bjn_Latn",
32
+ "bod_Tibt",
33
+ "bos_Latn",
34
+ "bug_Latn",
35
+ "bul_Cyrl",
36
+ "cat_Latn",
37
+ "ceb_Latn",
38
+ "ces_Latn",
39
+ "cjk_Latn",
40
+ "ckb_Arab",
41
+ "crh_Latn",
42
+ "cym_Latn",
43
+ "dan_Latn",
44
+ "deu_Latn",
45
+ "dik_Latn",
46
+ "dyu_Latn",
47
+ "dzo_Tibt",
48
+ "ell_Grek",
49
+ "eng_Latn",
50
+ "epo_Latn",
51
+ "est_Latn",
52
+ "eus_Latn",
53
+ "ewe_Latn",
54
+ "fao_Latn",
55
+ "pes_Arab",
56
+ "fij_Latn",
57
+ "fin_Latn",
58
+ "fon_Latn",
59
+ "fra_Latn",
60
+ "fur_Latn",
61
+ "fuv_Latn",
62
+ "gla_Latn",
63
+ "gle_Latn",
64
+ "glg_Latn",
65
+ "grn_Latn",
66
+ "guj_Gujr",
67
+ "hat_Latn",
68
+ "hau_Latn",
69
+ "heb_Hebr",
70
+ "hin_Deva",
71
+ "hne_Deva",
72
+ "hrv_Latn",
73
+ "hun_Latn",
74
+ "hye_Armn",
75
+ "ibo_Latn",
76
+ "ilo_Latn",
77
+ "ind_Latn",
78
+ "isl_Latn",
79
+ "ita_Latn",
80
+ "jav_Latn",
81
+ "jpn_Jpan",
82
+ "kab_Latn",
83
+ "kac_Latn",
84
+ "kam_Latn",
85
+ "kan_Knda",
86
+ "kas_Arab",
87
+ "kas_Deva",
88
+ "kat_Geor",
89
+ "knc_Arab",
90
+ "knc_Latn",
91
+ "kaz_Cyrl",
92
+ "kbp_Latn",
93
+ "kea_Latn",
94
+ "khm_Khmr",
95
+ "kik_Latn",
96
+ "kin_Latn",
97
+ "kir_Cyrl",
98
+ "kmb_Latn",
99
+ "kon_Latn",
100
+ "kor_Hang",
101
+ "kmr_Latn",
102
+ "lao_Laoo",
103
+ "lvs_Latn",
104
+ "lij_Latn",
105
+ "lim_Latn",
106
+ "lin_Latn",
107
+ "lit_Latn",
108
+ "lmo_Latn",
109
+ "ltg_Latn",
110
+ "ltz_Latn",
111
+ "lua_Latn",
112
+ "lug_Latn",
113
+ "luo_Latn",
114
+ "lus_Latn",
115
+ "mag_Deva",
116
+ "mai_Deva",
117
+ "mal_Mlym",
118
+ "mar_Deva",
119
+ "min_Latn",
120
+ "mkd_Cyrl",
121
+ "plt_Latn",
122
+ "mlt_Latn",
123
+ "mni_Beng",
124
+ "khk_Cyrl",
125
+ "mos_Latn",
126
+ "mri_Latn",
127
+ "zsm_Latn",
128
+ "mya_Mymr",
129
+ "nld_Latn",
130
+ "nno_Latn",
131
+ "nob_Latn",
132
+ "npi_Deva",
133
+ "nso_Latn",
134
+ "nus_Latn",
135
+ "nya_Latn",
136
+ "oci_Latn",
137
+ "gaz_Latn",
138
+ "ory_Orya",
139
+ "pag_Latn",
140
+ "pan_Guru",
141
+ "pap_Latn",
142
+ "pol_Latn",
143
+ "por_Latn",
144
+ "prs_Arab",
145
+ "pbt_Arab",
146
+ "quy_Latn",
147
+ "ron_Latn",
148
+ "run_Latn",
149
+ "rus_Cyrl",
150
+ "sag_Latn",
151
+ "san_Deva",
152
+ "sat_Beng",
153
+ "scn_Latn",
154
+ "shn_Mymr",
155
+ "sin_Sinh",
156
+ "slk_Latn",
157
+ "slv_Latn",
158
+ "smo_Latn",
159
+ "sna_Latn",
160
+ "snd_Arab",
161
+ "som_Latn",
162
+ "sot_Latn",
163
+ "spa_Latn",
164
+ "als_Latn",
165
+ "srd_Latn",
166
+ "srp_Cyrl",
167
+ "ssw_Latn",
168
+ "sun_Latn",
169
+ "swe_Latn",
170
+ "swh_Latn",
171
+ "szl_Latn",
172
+ "tam_Taml",
173
+ "tat_Cyrl",
174
+ "tel_Telu",
175
+ "tgk_Cyrl",
176
+ "tgl_Latn",
177
+ "tha_Thai",
178
+ "tir_Ethi",
179
+ "taq_Latn",
180
+ "taq_Tfng",
181
+ "tpi_Latn",
182
+ "tsn_Latn",
183
+ "tso_Latn",
184
+ "tuk_Latn",
185
+ "tum_Latn",
186
+ "tur_Latn",
187
+ "twi_Latn",
188
+ "tzm_Tfng",
189
+ "uig_Arab",
190
+ "ukr_Cyrl",
191
+ "umb_Latn",
192
+ "urd_Arab",
193
+ "uzn_Latn",
194
+ "vec_Latn",
195
+ "vie_Latn",
196
+ "war_Latn",
197
+ "wol_Latn",
198
+ "xho_Latn",
199
+ "ydd_Hebr",
200
+ "yor_Latn",
201
+ "yue_Hant",
202
+ "zho_Hans",
203
+ "zho_Hant",
204
+ "zul_Latn"
205
+ ],
206
+ "bos_token": "<s>",
207
+ "cls_token": "<s>",
208
+ "eos_token": "</s>",
209
+ "mask_token": {
210
+ "content": "<mask>",
211
+ "lstrip": true,
212
+ "normalized": true,
213
+ "rstrip": false,
214
+ "single_word": false
215
+ },
216
+ "pad_token": "<pad>",
217
+ "sep_token": "</s>",
218
+ "unk_token": "<unk>"
219
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:643ad36f91931f94a61d683ee8465d3b96ad53133f13cfa6594335b79ed88b77
3
+ size 17331491
tokenizer_config.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": null,
3
+ "bos_token": "<s>",
4
+ "clean_up_tokenization_spaces": true,
5
+ "cls_token": "<s>",
6
+ "eos_token": "</s>",
7
+ "legacy_behaviour": false,
8
+ "mask_token": {
9
+ "__type": "AddedToken",
10
+ "content": "<mask>",
11
+ "lstrip": true,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "model_max_length": 1024,
17
+ "pad_token": "<pad>",
18
+ "sep_token": "</s>",
19
+ "sp_model_kwargs": {},
20
+ "src_lang": "eng_Latn",
21
+ "tgt_lang": null,
22
+ "tokenizer_class": "NllbTokenizer",
23
+ "unk_token": "<unk>"
24
+ }
trainer_state.json ADDED
@@ -0,0 +1,1372 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.6904399394989014,
3
+ "best_model_checkpoint": "nllb_200_distilled_600M_ENtoFO_bsz_64_epochs_10lr7e-05/checkpoint-16500",
4
+ "epoch": 5.798947842417985,
5
+ "eval_steps": 500,
6
+ "global_step": 16500,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.04,
13
+ "learning_rate": 1.4e-05,
14
+ "loss": 1.6553,
15
+ "step": 100
16
+ },
17
+ {
18
+ "epoch": 0.07,
19
+ "learning_rate": 2.8e-05,
20
+ "loss": 1.3075,
21
+ "step": 200
22
+ },
23
+ {
24
+ "epoch": 0.11,
25
+ "learning_rate": 4.2e-05,
26
+ "loss": 1.2606,
27
+ "step": 300
28
+ },
29
+ {
30
+ "epoch": 0.14,
31
+ "learning_rate": 5.6e-05,
32
+ "loss": 1.2085,
33
+ "step": 400
34
+ },
35
+ {
36
+ "epoch": 0.18,
37
+ "learning_rate": 7e-05,
38
+ "loss": 1.1956,
39
+ "step": 500
40
+ },
41
+ {
42
+ "epoch": 0.18,
43
+ "eval_bleu": 36.3211,
44
+ "eval_chrf++": 55.6095,
45
+ "eval_gen_len": 17.5404,
46
+ "eval_loss": 1.01542067527771,
47
+ "eval_runtime": 1596.0286,
48
+ "eval_samples_per_second": 4.587,
49
+ "eval_steps_per_second": 2.294,
50
+ "step": 500
51
+ },
52
+ {
53
+ "epoch": 0.21,
54
+ "learning_rate": 6.974955277280858e-05,
55
+ "loss": 1.1487,
56
+ "step": 600
57
+ },
58
+ {
59
+ "epoch": 0.25,
60
+ "learning_rate": 6.949910554561716e-05,
61
+ "loss": 1.1406,
62
+ "step": 700
63
+ },
64
+ {
65
+ "epoch": 0.28,
66
+ "learning_rate": 6.924865831842576e-05,
67
+ "loss": 1.1197,
68
+ "step": 800
69
+ },
70
+ {
71
+ "epoch": 0.32,
72
+ "learning_rate": 6.899821109123434e-05,
73
+ "loss": 1.0739,
74
+ "step": 900
75
+ },
76
+ {
77
+ "epoch": 0.35,
78
+ "learning_rate": 6.874776386404293e-05,
79
+ "loss": 1.0817,
80
+ "step": 1000
81
+ },
82
+ {
83
+ "epoch": 0.35,
84
+ "eval_bleu": 37.9118,
85
+ "eval_chrf++": 57.1775,
86
+ "eval_gen_len": 17.6499,
87
+ "eval_loss": 0.9288437962532043,
88
+ "eval_runtime": 1599.8831,
89
+ "eval_samples_per_second": 4.576,
90
+ "eval_steps_per_second": 2.288,
91
+ "step": 1000
92
+ },
93
+ {
94
+ "epoch": 0.39,
95
+ "learning_rate": 6.849731663685151e-05,
96
+ "loss": 1.0784,
97
+ "step": 1100
98
+ },
99
+ {
100
+ "epoch": 0.42,
101
+ "learning_rate": 6.824686940966009e-05,
102
+ "loss": 1.0538,
103
+ "step": 1200
104
+ },
105
+ {
106
+ "epoch": 0.46,
107
+ "learning_rate": 6.799642218246869e-05,
108
+ "loss": 1.0431,
109
+ "step": 1300
110
+ },
111
+ {
112
+ "epoch": 0.49,
113
+ "learning_rate": 6.774597495527727e-05,
114
+ "loss": 1.0347,
115
+ "step": 1400
116
+ },
117
+ {
118
+ "epoch": 0.53,
119
+ "learning_rate": 6.749552772808586e-05,
120
+ "loss": 1.0162,
121
+ "step": 1500
122
+ },
123
+ {
124
+ "epoch": 0.53,
125
+ "eval_bleu": 39.3818,
126
+ "eval_chrf++": 58.3559,
127
+ "eval_gen_len": 17.6375,
128
+ "eval_loss": 0.8730005025863647,
129
+ "eval_runtime": 1591.8771,
130
+ "eval_samples_per_second": 4.599,
131
+ "eval_steps_per_second": 2.3,
132
+ "step": 1500
133
+ },
134
+ {
135
+ "epoch": 0.56,
136
+ "learning_rate": 6.724508050089444e-05,
137
+ "loss": 0.9995,
138
+ "step": 1600
139
+ },
140
+ {
141
+ "epoch": 0.6,
142
+ "learning_rate": 6.699463327370304e-05,
143
+ "loss": 0.987,
144
+ "step": 1700
145
+ },
146
+ {
147
+ "epoch": 0.63,
148
+ "learning_rate": 6.674418604651162e-05,
149
+ "loss": 1.0009,
150
+ "step": 1800
151
+ },
152
+ {
153
+ "epoch": 0.67,
154
+ "learning_rate": 6.64937388193202e-05,
155
+ "loss": 0.9843,
156
+ "step": 1900
157
+ },
158
+ {
159
+ "epoch": 0.7,
160
+ "learning_rate": 6.624329159212879e-05,
161
+ "loss": 0.974,
162
+ "step": 2000
163
+ },
164
+ {
165
+ "epoch": 0.7,
166
+ "eval_bleu": 40.0752,
167
+ "eval_chrf++": 58.9288,
168
+ "eval_gen_len": 17.741,
169
+ "eval_loss": 0.8381767272949219,
170
+ "eval_runtime": 1584.9965,
171
+ "eval_samples_per_second": 4.619,
172
+ "eval_steps_per_second": 2.31,
173
+ "step": 2000
174
+ },
175
+ {
176
+ "epoch": 0.74,
177
+ "learning_rate": 6.599284436493739e-05,
178
+ "loss": 0.9649,
179
+ "step": 2100
180
+ },
181
+ {
182
+ "epoch": 0.77,
183
+ "learning_rate": 6.574239713774597e-05,
184
+ "loss": 0.9511,
185
+ "step": 2200
186
+ },
187
+ {
188
+ "epoch": 0.81,
189
+ "learning_rate": 6.549194991055455e-05,
190
+ "loss": 0.9689,
191
+ "step": 2300
192
+ },
193
+ {
194
+ "epoch": 0.84,
195
+ "learning_rate": 6.524150268336314e-05,
196
+ "loss": 0.9512,
197
+ "step": 2400
198
+ },
199
+ {
200
+ "epoch": 0.88,
201
+ "learning_rate": 6.499105545617173e-05,
202
+ "loss": 0.9513,
203
+ "step": 2500
204
+ },
205
+ {
206
+ "epoch": 0.88,
207
+ "eval_bleu": 40.6133,
208
+ "eval_chrf++": 59.5241,
209
+ "eval_gen_len": 17.7642,
210
+ "eval_loss": 0.8131038546562195,
211
+ "eval_runtime": 1608.2394,
212
+ "eval_samples_per_second": 4.552,
213
+ "eval_steps_per_second": 2.276,
214
+ "step": 2500
215
+ },
216
+ {
217
+ "epoch": 0.91,
218
+ "learning_rate": 6.474060822898032e-05,
219
+ "loss": 0.9476,
220
+ "step": 2600
221
+ },
222
+ {
223
+ "epoch": 0.95,
224
+ "learning_rate": 6.44901610017889e-05,
225
+ "loss": 0.9202,
226
+ "step": 2700
227
+ },
228
+ {
229
+ "epoch": 0.98,
230
+ "learning_rate": 6.423971377459748e-05,
231
+ "loss": 0.9423,
232
+ "step": 2800
233
+ },
234
+ {
235
+ "epoch": 1.02,
236
+ "learning_rate": 6.398926654740608e-05,
237
+ "loss": 0.8949,
238
+ "step": 2900
239
+ },
240
+ {
241
+ "epoch": 1.05,
242
+ "learning_rate": 6.373881932021467e-05,
243
+ "loss": 0.8405,
244
+ "step": 3000
245
+ },
246
+ {
247
+ "epoch": 1.05,
248
+ "eval_bleu": 40.8822,
249
+ "eval_chrf++": 59.9339,
250
+ "eval_gen_len": 17.7189,
251
+ "eval_loss": 0.7998338937759399,
252
+ "eval_runtime": 1592.0637,
253
+ "eval_samples_per_second": 4.598,
254
+ "eval_steps_per_second": 2.3,
255
+ "step": 3000
256
+ },
257
+ {
258
+ "epoch": 1.09,
259
+ "learning_rate": 6.348837209302325e-05,
260
+ "loss": 0.8431,
261
+ "step": 3100
262
+ },
263
+ {
264
+ "epoch": 1.12,
265
+ "learning_rate": 6.323792486583183e-05,
266
+ "loss": 0.8383,
267
+ "step": 3200
268
+ },
269
+ {
270
+ "epoch": 1.16,
271
+ "learning_rate": 6.298747763864043e-05,
272
+ "loss": 0.838,
273
+ "step": 3300
274
+ },
275
+ {
276
+ "epoch": 1.19,
277
+ "learning_rate": 6.273703041144901e-05,
278
+ "loss": 0.8527,
279
+ "step": 3400
280
+ },
281
+ {
282
+ "epoch": 1.23,
283
+ "learning_rate": 6.24865831842576e-05,
284
+ "loss": 0.8252,
285
+ "step": 3500
286
+ },
287
+ {
288
+ "epoch": 1.23,
289
+ "eval_bleu": 41.6082,
290
+ "eval_chrf++": 60.3254,
291
+ "eval_gen_len": 17.7662,
292
+ "eval_loss": 0.7859154939651489,
293
+ "eval_runtime": 1613.9109,
294
+ "eval_samples_per_second": 4.536,
295
+ "eval_steps_per_second": 2.268,
296
+ "step": 3500
297
+ },
298
+ {
299
+ "epoch": 1.27,
300
+ "learning_rate": 6.22361359570662e-05,
301
+ "loss": 0.836,
302
+ "step": 3600
303
+ },
304
+ {
305
+ "epoch": 1.3,
306
+ "learning_rate": 6.198568872987478e-05,
307
+ "loss": 0.8274,
308
+ "step": 3700
309
+ },
310
+ {
311
+ "epoch": 1.34,
312
+ "learning_rate": 6.173524150268336e-05,
313
+ "loss": 0.8257,
314
+ "step": 3800
315
+ },
316
+ {
317
+ "epoch": 1.37,
318
+ "learning_rate": 6.148479427549194e-05,
319
+ "loss": 0.8301,
320
+ "step": 3900
321
+ },
322
+ {
323
+ "epoch": 1.41,
324
+ "learning_rate": 6.123434704830053e-05,
325
+ "loss": 0.8235,
326
+ "step": 4000
327
+ },
328
+ {
329
+ "epoch": 1.41,
330
+ "eval_bleu": 41.9785,
331
+ "eval_chrf++": 60.615,
332
+ "eval_gen_len": 17.6996,
333
+ "eval_loss": 0.7718562483787537,
334
+ "eval_runtime": 1591.3834,
335
+ "eval_samples_per_second": 4.6,
336
+ "eval_steps_per_second": 2.301,
337
+ "step": 4000
338
+ },
339
+ {
340
+ "epoch": 1.44,
341
+ "learning_rate": 6.098389982110912e-05,
342
+ "loss": 0.8335,
343
+ "step": 4100
344
+ },
345
+ {
346
+ "epoch": 1.48,
347
+ "learning_rate": 6.073345259391771e-05,
348
+ "loss": 0.8184,
349
+ "step": 4200
350
+ },
351
+ {
352
+ "epoch": 1.51,
353
+ "learning_rate": 6.048300536672629e-05,
354
+ "loss": 0.8139,
355
+ "step": 4300
356
+ },
357
+ {
358
+ "epoch": 1.55,
359
+ "learning_rate": 6.0232558139534877e-05,
360
+ "loss": 0.8121,
361
+ "step": 4400
362
+ },
363
+ {
364
+ "epoch": 1.58,
365
+ "learning_rate": 5.998211091234346e-05,
366
+ "loss": 0.8174,
367
+ "step": 4500
368
+ },
369
+ {
370
+ "epoch": 1.58,
371
+ "eval_bleu": 41.9942,
372
+ "eval_chrf++": 60.7015,
373
+ "eval_gen_len": 17.7548,
374
+ "eval_loss": 0.7601388692855835,
375
+ "eval_runtime": 1610.8686,
376
+ "eval_samples_per_second": 4.545,
377
+ "eval_steps_per_second": 2.273,
378
+ "step": 4500
379
+ },
380
+ {
381
+ "epoch": 1.62,
382
+ "learning_rate": 5.973166368515206e-05,
383
+ "loss": 0.8232,
384
+ "step": 4600
385
+ },
386
+ {
387
+ "epoch": 1.65,
388
+ "learning_rate": 5.948121645796064e-05,
389
+ "loss": 0.8099,
390
+ "step": 4700
391
+ },
392
+ {
393
+ "epoch": 1.69,
394
+ "learning_rate": 5.9230769230769225e-05,
395
+ "loss": 0.8154,
396
+ "step": 4800
397
+ },
398
+ {
399
+ "epoch": 1.72,
400
+ "learning_rate": 5.898032200357781e-05,
401
+ "loss": 0.818,
402
+ "step": 4900
403
+ },
404
+ {
405
+ "epoch": 1.76,
406
+ "learning_rate": 5.87298747763864e-05,
407
+ "loss": 0.7992,
408
+ "step": 5000
409
+ },
410
+ {
411
+ "epoch": 1.76,
412
+ "eval_bleu": 42.3622,
413
+ "eval_chrf++": 61.0481,
414
+ "eval_gen_len": 17.8145,
415
+ "eval_loss": 0.7486168146133423,
416
+ "eval_runtime": 1597.5591,
417
+ "eval_samples_per_second": 4.583,
418
+ "eval_steps_per_second": 2.292,
419
+ "step": 5000
420
+ },
421
+ {
422
+ "epoch": 1.79,
423
+ "learning_rate": 5.847942754919499e-05,
424
+ "loss": 0.8087,
425
+ "step": 5100
426
+ },
427
+ {
428
+ "epoch": 1.83,
429
+ "learning_rate": 5.822898032200357e-05,
430
+ "loss": 0.7958,
431
+ "step": 5200
432
+ },
433
+ {
434
+ "epoch": 1.86,
435
+ "learning_rate": 5.7978533094812156e-05,
436
+ "loss": 0.8022,
437
+ "step": 5300
438
+ },
439
+ {
440
+ "epoch": 1.9,
441
+ "learning_rate": 5.7728085867620747e-05,
442
+ "loss": 0.7876,
443
+ "step": 5400
444
+ },
445
+ {
446
+ "epoch": 1.93,
447
+ "learning_rate": 5.747763864042934e-05,
448
+ "loss": 0.7915,
449
+ "step": 5500
450
+ },
451
+ {
452
+ "epoch": 1.93,
453
+ "eval_bleu": 42.4851,
454
+ "eval_chrf++": 61.1145,
455
+ "eval_gen_len": 17.7756,
456
+ "eval_loss": 0.7351738810539246,
457
+ "eval_runtime": 1598.7533,
458
+ "eval_samples_per_second": 4.579,
459
+ "eval_steps_per_second": 2.29,
460
+ "step": 5500
461
+ },
462
+ {
463
+ "epoch": 1.97,
464
+ "learning_rate": 5.722719141323792e-05,
465
+ "loss": 0.7795,
466
+ "step": 5600
467
+ },
468
+ {
469
+ "epoch": 2.0,
470
+ "learning_rate": 5.6976744186046504e-05,
471
+ "loss": 0.8015,
472
+ "step": 5700
473
+ },
474
+ {
475
+ "epoch": 2.04,
476
+ "learning_rate": 5.6726296958855094e-05,
477
+ "loss": 0.7219,
478
+ "step": 5800
479
+ },
480
+ {
481
+ "epoch": 2.07,
482
+ "learning_rate": 5.647584973166368e-05,
483
+ "loss": 0.7231,
484
+ "step": 5900
485
+ },
486
+ {
487
+ "epoch": 2.11,
488
+ "learning_rate": 5.622540250447227e-05,
489
+ "loss": 0.718,
490
+ "step": 6000
491
+ },
492
+ {
493
+ "epoch": 2.11,
494
+ "eval_bleu": 42.5957,
495
+ "eval_chrf++": 61.1828,
496
+ "eval_gen_len": 17.7144,
497
+ "eval_loss": 0.7349444627761841,
498
+ "eval_runtime": 1593.2386,
499
+ "eval_samples_per_second": 4.595,
500
+ "eval_steps_per_second": 2.298,
501
+ "step": 6000
502
+ },
503
+ {
504
+ "epoch": 2.14,
505
+ "learning_rate": 5.597495527728085e-05,
506
+ "loss": 0.7155,
507
+ "step": 6100
508
+ },
509
+ {
510
+ "epoch": 2.18,
511
+ "learning_rate": 5.572450805008944e-05,
512
+ "loss": 0.7222,
513
+ "step": 6200
514
+ },
515
+ {
516
+ "epoch": 2.21,
517
+ "learning_rate": 5.5474060822898026e-05,
518
+ "loss": 0.7113,
519
+ "step": 6300
520
+ },
521
+ {
522
+ "epoch": 2.25,
523
+ "learning_rate": 5.522361359570661e-05,
524
+ "loss": 0.7067,
525
+ "step": 6400
526
+ },
527
+ {
528
+ "epoch": 2.28,
529
+ "learning_rate": 5.497316636851521e-05,
530
+ "loss": 0.714,
531
+ "step": 6500
532
+ },
533
+ {
534
+ "epoch": 2.28,
535
+ "eval_bleu": 43.1947,
536
+ "eval_chrf++": 61.6389,
537
+ "eval_gen_len": 17.7485,
538
+ "eval_loss": 0.7279652953147888,
539
+ "eval_runtime": 1602.3425,
540
+ "eval_samples_per_second": 4.569,
541
+ "eval_steps_per_second": 2.285,
542
+ "step": 6500
543
+ },
544
+ {
545
+ "epoch": 2.32,
546
+ "learning_rate": 5.472271914132379e-05,
547
+ "loss": 0.7284,
548
+ "step": 6600
549
+ },
550
+ {
551
+ "epoch": 2.35,
552
+ "learning_rate": 5.4472271914132374e-05,
553
+ "loss": 0.7106,
554
+ "step": 6700
555
+ },
556
+ {
557
+ "epoch": 2.39,
558
+ "learning_rate": 5.422182468694096e-05,
559
+ "loss": 0.7226,
560
+ "step": 6800
561
+ },
562
+ {
563
+ "epoch": 2.43,
564
+ "learning_rate": 5.3971377459749555e-05,
565
+ "loss": 0.7151,
566
+ "step": 6900
567
+ },
568
+ {
569
+ "epoch": 2.46,
570
+ "learning_rate": 5.372093023255814e-05,
571
+ "loss": 0.7242,
572
+ "step": 7000
573
+ },
574
+ {
575
+ "epoch": 2.46,
576
+ "eval_bleu": 43.0217,
577
+ "eval_chrf++": 61.4,
578
+ "eval_gen_len": 17.7472,
579
+ "eval_loss": 0.7255465984344482,
580
+ "eval_runtime": 1596.9259,
581
+ "eval_samples_per_second": 4.584,
582
+ "eval_steps_per_second": 2.293,
583
+ "step": 7000
584
+ },
585
+ {
586
+ "epoch": 2.5,
587
+ "learning_rate": 5.347048300536672e-05,
588
+ "loss": 0.7115,
589
+ "step": 7100
590
+ },
591
+ {
592
+ "epoch": 2.53,
593
+ "learning_rate": 5.3220035778175306e-05,
594
+ "loss": 0.6996,
595
+ "step": 7200
596
+ },
597
+ {
598
+ "epoch": 2.57,
599
+ "learning_rate": 5.296958855098389e-05,
600
+ "loss": 0.7226,
601
+ "step": 7300
602
+ },
603
+ {
604
+ "epoch": 2.6,
605
+ "learning_rate": 5.2719141323792486e-05,
606
+ "loss": 0.7023,
607
+ "step": 7400
608
+ },
609
+ {
610
+ "epoch": 2.64,
611
+ "learning_rate": 5.246869409660107e-05,
612
+ "loss": 0.7035,
613
+ "step": 7500
614
+ },
615
+ {
616
+ "epoch": 2.64,
617
+ "eval_bleu": 42.9886,
618
+ "eval_chrf++": 61.5585,
619
+ "eval_gen_len": 17.7513,
620
+ "eval_loss": 0.7192216515541077,
621
+ "eval_runtime": 1608.427,
622
+ "eval_samples_per_second": 4.552,
623
+ "eval_steps_per_second": 2.276,
624
+ "step": 7500
625
+ },
626
+ {
627
+ "epoch": 2.67,
628
+ "learning_rate": 5.2218246869409654e-05,
629
+ "loss": 0.7175,
630
+ "step": 7600
631
+ },
632
+ {
633
+ "epoch": 2.71,
634
+ "learning_rate": 5.1967799642218244e-05,
635
+ "loss": 0.7164,
636
+ "step": 7700
637
+ },
638
+ {
639
+ "epoch": 2.74,
640
+ "learning_rate": 5.171735241502683e-05,
641
+ "loss": 0.703,
642
+ "step": 7800
643
+ },
644
+ {
645
+ "epoch": 2.78,
646
+ "learning_rate": 5.146690518783542e-05,
647
+ "loss": 0.7067,
648
+ "step": 7900
649
+ },
650
+ {
651
+ "epoch": 2.81,
652
+ "learning_rate": 5.1216457960644e-05,
653
+ "loss": 0.7048,
654
+ "step": 8000
655
+ },
656
+ {
657
+ "epoch": 2.81,
658
+ "eval_bleu": 42.9399,
659
+ "eval_chrf++": 61.4851,
660
+ "eval_gen_len": 17.7067,
661
+ "eval_loss": 0.7168448567390442,
662
+ "eval_runtime": 1552.4929,
663
+ "eval_samples_per_second": 4.716,
664
+ "eval_steps_per_second": 2.358,
665
+ "step": 8000
666
+ },
667
+ {
668
+ "epoch": 2.85,
669
+ "learning_rate": 5.096601073345259e-05,
670
+ "loss": 0.7127,
671
+ "step": 8100
672
+ },
673
+ {
674
+ "epoch": 2.88,
675
+ "learning_rate": 5.0715563506261176e-05,
676
+ "loss": 0.7091,
677
+ "step": 8200
678
+ },
679
+ {
680
+ "epoch": 2.92,
681
+ "learning_rate": 5.046511627906976e-05,
682
+ "loss": 0.7122,
683
+ "step": 8300
684
+ },
685
+ {
686
+ "epoch": 2.95,
687
+ "learning_rate": 5.021466905187835e-05,
688
+ "loss": 0.6949,
689
+ "step": 8400
690
+ },
691
+ {
692
+ "epoch": 2.99,
693
+ "learning_rate": 4.996422182468694e-05,
694
+ "loss": 0.685,
695
+ "step": 8500
696
+ },
697
+ {
698
+ "epoch": 2.99,
699
+ "eval_bleu": 43.114,
700
+ "eval_chrf++": 61.6028,
701
+ "eval_gen_len": 17.844,
702
+ "eval_loss": 0.7094260454177856,
703
+ "eval_runtime": 1415.2591,
704
+ "eval_samples_per_second": 5.173,
705
+ "eval_steps_per_second": 2.587,
706
+ "step": 8500
707
+ },
708
+ {
709
+ "epoch": 3.02,
710
+ "learning_rate": 4.9713774597495524e-05,
711
+ "loss": 0.6618,
712
+ "step": 8600
713
+ },
714
+ {
715
+ "epoch": 3.06,
716
+ "learning_rate": 4.946332737030411e-05,
717
+ "loss": 0.6417,
718
+ "step": 8700
719
+ },
720
+ {
721
+ "epoch": 3.09,
722
+ "learning_rate": 4.92128801431127e-05,
723
+ "loss": 0.65,
724
+ "step": 8800
725
+ },
726
+ {
727
+ "epoch": 3.13,
728
+ "learning_rate": 4.896243291592129e-05,
729
+ "loss": 0.6375,
730
+ "step": 8900
731
+ },
732
+ {
733
+ "epoch": 3.16,
734
+ "learning_rate": 4.871198568872987e-05,
735
+ "loss": 0.632,
736
+ "step": 9000
737
+ },
738
+ {
739
+ "epoch": 3.16,
740
+ "eval_bleu": 43.3779,
741
+ "eval_chrf++": 61.8915,
742
+ "eval_gen_len": 17.7121,
743
+ "eval_loss": 0.7186790108680725,
744
+ "eval_runtime": 1408.2967,
745
+ "eval_samples_per_second": 5.198,
746
+ "eval_steps_per_second": 2.6,
747
+ "step": 9000
748
+ },
749
+ {
750
+ "epoch": 3.2,
751
+ "learning_rate": 4.8461538461538455e-05,
752
+ "loss": 0.6434,
753
+ "step": 9100
754
+ },
755
+ {
756
+ "epoch": 3.23,
757
+ "learning_rate": 4.821109123434704e-05,
758
+ "loss": 0.6354,
759
+ "step": 9200
760
+ },
761
+ {
762
+ "epoch": 3.27,
763
+ "learning_rate": 4.7960644007155636e-05,
764
+ "loss": 0.6374,
765
+ "step": 9300
766
+ },
767
+ {
768
+ "epoch": 3.3,
769
+ "learning_rate": 4.771019677996422e-05,
770
+ "loss": 0.6478,
771
+ "step": 9400
772
+ },
773
+ {
774
+ "epoch": 3.34,
775
+ "learning_rate": 4.74597495527728e-05,
776
+ "loss": 0.6444,
777
+ "step": 9500
778
+ },
779
+ {
780
+ "epoch": 3.34,
781
+ "eval_bleu": 43.0761,
782
+ "eval_chrf++": 61.6092,
783
+ "eval_gen_len": 17.7518,
784
+ "eval_loss": 0.7161450982093811,
785
+ "eval_runtime": 1412.5727,
786
+ "eval_samples_per_second": 5.183,
787
+ "eval_steps_per_second": 2.592,
788
+ "step": 9500
789
+ },
790
+ {
791
+ "epoch": 3.37,
792
+ "learning_rate": 4.720930232558139e-05,
793
+ "loss": 0.6358,
794
+ "step": 9600
795
+ },
796
+ {
797
+ "epoch": 3.41,
798
+ "learning_rate": 4.695885509838998e-05,
799
+ "loss": 0.6502,
800
+ "step": 9700
801
+ },
802
+ {
803
+ "epoch": 3.44,
804
+ "learning_rate": 4.670840787119857e-05,
805
+ "loss": 0.6376,
806
+ "step": 9800
807
+ },
808
+ {
809
+ "epoch": 3.48,
810
+ "learning_rate": 4.645796064400715e-05,
811
+ "loss": 0.638,
812
+ "step": 9900
813
+ },
814
+ {
815
+ "epoch": 3.51,
816
+ "learning_rate": 4.6207513416815735e-05,
817
+ "loss": 0.6302,
818
+ "step": 10000
819
+ },
820
+ {
821
+ "epoch": 3.51,
822
+ "eval_bleu": 43.4763,
823
+ "eval_chrf++": 61.8105,
824
+ "eval_gen_len": 17.7754,
825
+ "eval_loss": 0.7070448398590088,
826
+ "eval_runtime": 1409.243,
827
+ "eval_samples_per_second": 5.195,
828
+ "eval_steps_per_second": 2.598,
829
+ "step": 10000
830
+ },
831
+ {
832
+ "epoch": 3.55,
833
+ "learning_rate": 4.5957066189624325e-05,
834
+ "loss": 0.632,
835
+ "step": 10100
836
+ },
837
+ {
838
+ "epoch": 3.58,
839
+ "learning_rate": 4.570661896243291e-05,
840
+ "loss": 0.6364,
841
+ "step": 10200
842
+ },
843
+ {
844
+ "epoch": 3.62,
845
+ "learning_rate": 4.54561717352415e-05,
846
+ "loss": 0.6466,
847
+ "step": 10300
848
+ },
849
+ {
850
+ "epoch": 3.66,
851
+ "learning_rate": 4.520572450805009e-05,
852
+ "loss": 0.6373,
853
+ "step": 10400
854
+ },
855
+ {
856
+ "epoch": 3.69,
857
+ "learning_rate": 4.495527728085867e-05,
858
+ "loss": 0.6478,
859
+ "step": 10500
860
+ },
861
+ {
862
+ "epoch": 3.69,
863
+ "eval_bleu": 43.725,
864
+ "eval_chrf++": 62.0616,
865
+ "eval_gen_len": 17.788,
866
+ "eval_loss": 0.705007016658783,
867
+ "eval_runtime": 1414.2328,
868
+ "eval_samples_per_second": 5.177,
869
+ "eval_steps_per_second": 2.589,
870
+ "step": 10500
871
+ },
872
+ {
873
+ "epoch": 3.73,
874
+ "learning_rate": 4.470483005366726e-05,
875
+ "loss": 0.6516,
876
+ "step": 10600
877
+ },
878
+ {
879
+ "epoch": 3.76,
880
+ "learning_rate": 4.445438282647585e-05,
881
+ "loss": 0.6334,
882
+ "step": 10700
883
+ },
884
+ {
885
+ "epoch": 3.8,
886
+ "learning_rate": 4.420393559928444e-05,
887
+ "loss": 0.6542,
888
+ "step": 10800
889
+ },
890
+ {
891
+ "epoch": 3.83,
892
+ "learning_rate": 4.395348837209302e-05,
893
+ "loss": 0.646,
894
+ "step": 10900
895
+ },
896
+ {
897
+ "epoch": 3.87,
898
+ "learning_rate": 4.3703041144901605e-05,
899
+ "loss": 0.6374,
900
+ "step": 11000
901
+ },
902
+ {
903
+ "epoch": 3.87,
904
+ "eval_bleu": 43.7206,
905
+ "eval_chrf++": 62.1048,
906
+ "eval_gen_len": 17.7229,
907
+ "eval_loss": 0.6963800191879272,
908
+ "eval_runtime": 1413.4358,
909
+ "eval_samples_per_second": 5.18,
910
+ "eval_steps_per_second": 2.59,
911
+ "step": 11000
912
+ },
913
+ {
914
+ "epoch": 3.9,
915
+ "learning_rate": 4.345259391771019e-05,
916
+ "loss": 0.6416,
917
+ "step": 11100
918
+ },
919
+ {
920
+ "epoch": 3.94,
921
+ "learning_rate": 4.3202146690518786e-05,
922
+ "loss": 0.6249,
923
+ "step": 11200
924
+ },
925
+ {
926
+ "epoch": 3.97,
927
+ "learning_rate": 4.295169946332737e-05,
928
+ "loss": 0.638,
929
+ "step": 11300
930
+ },
931
+ {
932
+ "epoch": 4.01,
933
+ "learning_rate": 4.270125223613595e-05,
934
+ "loss": 0.6352,
935
+ "step": 11400
936
+ },
937
+ {
938
+ "epoch": 4.04,
939
+ "learning_rate": 4.2450805008944536e-05,
940
+ "loss": 0.5804,
941
+ "step": 11500
942
+ },
943
+ {
944
+ "epoch": 4.04,
945
+ "eval_bleu": 43.8669,
946
+ "eval_chrf++": 62.1364,
947
+ "eval_gen_len": 17.7865,
948
+ "eval_loss": 0.7024260759353638,
949
+ "eval_runtime": 1410.9857,
950
+ "eval_samples_per_second": 5.189,
951
+ "eval_steps_per_second": 2.595,
952
+ "step": 11500
953
+ },
954
+ {
955
+ "epoch": 4.08,
956
+ "learning_rate": 4.220035778175313e-05,
957
+ "loss": 0.5799,
958
+ "step": 11600
959
+ },
960
+ {
961
+ "epoch": 4.11,
962
+ "learning_rate": 4.194991055456172e-05,
963
+ "loss": 0.5852,
964
+ "step": 11700
965
+ },
966
+ {
967
+ "epoch": 4.15,
968
+ "learning_rate": 4.16994633273703e-05,
969
+ "loss": 0.5801,
970
+ "step": 11800
971
+ },
972
+ {
973
+ "epoch": 4.18,
974
+ "learning_rate": 4.1449016100178884e-05,
975
+ "loss": 0.5945,
976
+ "step": 11900
977
+ },
978
+ {
979
+ "epoch": 4.22,
980
+ "learning_rate": 4.1198568872987475e-05,
981
+ "loss": 0.5919,
982
+ "step": 12000
983
+ },
984
+ {
985
+ "epoch": 4.22,
986
+ "eval_bleu": 43.6775,
987
+ "eval_chrf++": 61.9586,
988
+ "eval_gen_len": 17.8369,
989
+ "eval_loss": 0.7032192945480347,
990
+ "eval_runtime": 1420.6308,
991
+ "eval_samples_per_second": 5.153,
992
+ "eval_steps_per_second": 2.577,
993
+ "step": 12000
994
+ },
995
+ {
996
+ "epoch": 4.25,
997
+ "learning_rate": 4.094812164579606e-05,
998
+ "loss": 0.5928,
999
+ "step": 12100
1000
+ },
1001
+ {
1002
+ "epoch": 4.29,
1003
+ "learning_rate": 4.069767441860465e-05,
1004
+ "loss": 0.5923,
1005
+ "step": 12200
1006
+ },
1007
+ {
1008
+ "epoch": 4.32,
1009
+ "learning_rate": 4.044722719141323e-05,
1010
+ "loss": 0.5943,
1011
+ "step": 12300
1012
+ },
1013
+ {
1014
+ "epoch": 4.36,
1015
+ "learning_rate": 4.019677996422182e-05,
1016
+ "loss": 0.5905,
1017
+ "step": 12400
1018
+ },
1019
+ {
1020
+ "epoch": 4.39,
1021
+ "learning_rate": 3.9946332737030406e-05,
1022
+ "loss": 0.5879,
1023
+ "step": 12500
1024
+ },
1025
+ {
1026
+ "epoch": 4.39,
1027
+ "eval_bleu": 43.9642,
1028
+ "eval_chrf++": 62.2021,
1029
+ "eval_gen_len": 17.7873,
1030
+ "eval_loss": 0.7024480104446411,
1031
+ "eval_runtime": 1420.746,
1032
+ "eval_samples_per_second": 5.153,
1033
+ "eval_steps_per_second": 2.577,
1034
+ "step": 12500
1035
+ },
1036
+ {
1037
+ "epoch": 4.43,
1038
+ "learning_rate": 3.9695885509839e-05,
1039
+ "loss": 0.5855,
1040
+ "step": 12600
1041
+ },
1042
+ {
1043
+ "epoch": 4.46,
1044
+ "learning_rate": 3.944543828264758e-05,
1045
+ "loss": 0.5855,
1046
+ "step": 12700
1047
+ },
1048
+ {
1049
+ "epoch": 4.5,
1050
+ "learning_rate": 3.919499105545617e-05,
1051
+ "loss": 0.5833,
1052
+ "step": 12800
1053
+ },
1054
+ {
1055
+ "epoch": 4.53,
1056
+ "learning_rate": 3.8944543828264754e-05,
1057
+ "loss": 0.5767,
1058
+ "step": 12900
1059
+ },
1060
+ {
1061
+ "epoch": 4.57,
1062
+ "learning_rate": 3.869409660107334e-05,
1063
+ "loss": 0.5858,
1064
+ "step": 13000
1065
+ },
1066
+ {
1067
+ "epoch": 4.57,
1068
+ "eval_bleu": 44.027,
1069
+ "eval_chrf++": 62.2226,
1070
+ "eval_gen_len": 17.7678,
1071
+ "eval_loss": 0.6992958784103394,
1072
+ "eval_runtime": 1410.8973,
1073
+ "eval_samples_per_second": 5.189,
1074
+ "eval_steps_per_second": 2.595,
1075
+ "step": 13000
1076
+ },
1077
+ {
1078
+ "epoch": 4.6,
1079
+ "learning_rate": 3.8443649373881935e-05,
1080
+ "loss": 0.5775,
1081
+ "step": 13100
1082
+ },
1083
+ {
1084
+ "epoch": 4.64,
1085
+ "learning_rate": 3.819320214669052e-05,
1086
+ "loss": 0.5825,
1087
+ "step": 13200
1088
+ },
1089
+ {
1090
+ "epoch": 4.67,
1091
+ "learning_rate": 3.79427549194991e-05,
1092
+ "loss": 0.5851,
1093
+ "step": 13300
1094
+ },
1095
+ {
1096
+ "epoch": 4.71,
1097
+ "learning_rate": 3.7692307692307686e-05,
1098
+ "loss": 0.5913,
1099
+ "step": 13400
1100
+ },
1101
+ {
1102
+ "epoch": 4.74,
1103
+ "learning_rate": 3.744186046511627e-05,
1104
+ "loss": 0.5877,
1105
+ "step": 13500
1106
+ },
1107
+ {
1108
+ "epoch": 4.74,
1109
+ "eval_bleu": 44.1426,
1110
+ "eval_chrf++": 62.3429,
1111
+ "eval_gen_len": 17.7805,
1112
+ "eval_loss": 0.6957116723060608,
1113
+ "eval_runtime": 1414.8882,
1114
+ "eval_samples_per_second": 5.174,
1115
+ "eval_steps_per_second": 2.587,
1116
+ "step": 13500
1117
+ },
1118
+ {
1119
+ "epoch": 4.78,
1120
+ "learning_rate": 3.719141323792487e-05,
1121
+ "loss": 0.5857,
1122
+ "step": 13600
1123
+ },
1124
+ {
1125
+ "epoch": 4.81,
1126
+ "learning_rate": 3.694096601073345e-05,
1127
+ "loss": 0.5776,
1128
+ "step": 13700
1129
+ },
1130
+ {
1131
+ "epoch": 4.85,
1132
+ "learning_rate": 3.6690518783542034e-05,
1133
+ "loss": 0.5985,
1134
+ "step": 13800
1135
+ },
1136
+ {
1137
+ "epoch": 4.89,
1138
+ "learning_rate": 3.644007155635062e-05,
1139
+ "loss": 0.5855,
1140
+ "step": 13900
1141
+ },
1142
+ {
1143
+ "epoch": 4.92,
1144
+ "learning_rate": 3.618962432915921e-05,
1145
+ "loss": 0.5895,
1146
+ "step": 14000
1147
+ },
1148
+ {
1149
+ "epoch": 4.92,
1150
+ "eval_bleu": 44.2097,
1151
+ "eval_chrf++": 62.4158,
1152
+ "eval_gen_len": 17.7713,
1153
+ "eval_loss": 0.6944009065628052,
1154
+ "eval_runtime": 1417.3776,
1155
+ "eval_samples_per_second": 5.165,
1156
+ "eval_steps_per_second": 2.583,
1157
+ "step": 14000
1158
+ },
1159
+ {
1160
+ "epoch": 4.96,
1161
+ "learning_rate": 3.59391771019678e-05,
1162
+ "loss": 0.581,
1163
+ "step": 14100
1164
+ },
1165
+ {
1166
+ "epoch": 4.99,
1167
+ "learning_rate": 3.568872987477638e-05,
1168
+ "loss": 0.5835,
1169
+ "step": 14200
1170
+ },
1171
+ {
1172
+ "epoch": 5.03,
1173
+ "learning_rate": 3.543828264758497e-05,
1174
+ "loss": 0.5512,
1175
+ "step": 14300
1176
+ },
1177
+ {
1178
+ "epoch": 5.06,
1179
+ "learning_rate": 3.5187835420393556e-05,
1180
+ "loss": 0.5324,
1181
+ "step": 14400
1182
+ },
1183
+ {
1184
+ "epoch": 5.1,
1185
+ "learning_rate": 3.4937388193202146e-05,
1186
+ "loss": 0.5381,
1187
+ "step": 14500
1188
+ },
1189
+ {
1190
+ "epoch": 5.1,
1191
+ "eval_bleu": 43.9778,
1192
+ "eval_chrf++": 62.2087,
1193
+ "eval_gen_len": 17.8153,
1194
+ "eval_loss": 0.7013605833053589,
1195
+ "eval_runtime": 1419.0259,
1196
+ "eval_samples_per_second": 5.159,
1197
+ "eval_steps_per_second": 2.58,
1198
+ "step": 14500
1199
+ },
1200
+ {
1201
+ "epoch": 5.13,
1202
+ "learning_rate": 3.468694096601073e-05,
1203
+ "loss": 0.5364,
1204
+ "step": 14600
1205
+ },
1206
+ {
1207
+ "epoch": 5.17,
1208
+ "learning_rate": 3.443649373881932e-05,
1209
+ "loss": 0.543,
1210
+ "step": 14700
1211
+ },
1212
+ {
1213
+ "epoch": 5.2,
1214
+ "learning_rate": 3.4186046511627904e-05,
1215
+ "loss": 0.5432,
1216
+ "step": 14800
1217
+ },
1218
+ {
1219
+ "epoch": 5.24,
1220
+ "learning_rate": 3.3935599284436494e-05,
1221
+ "loss": 0.542,
1222
+ "step": 14900
1223
+ },
1224
+ {
1225
+ "epoch": 5.27,
1226
+ "learning_rate": 3.368515205724508e-05,
1227
+ "loss": 0.5385,
1228
+ "step": 15000
1229
+ },
1230
+ {
1231
+ "epoch": 5.27,
1232
+ "eval_bleu": 44.1326,
1233
+ "eval_chrf++": 62.3372,
1234
+ "eval_gen_len": 17.8174,
1235
+ "eval_loss": 0.7036887407302856,
1236
+ "eval_runtime": 1418.4921,
1237
+ "eval_samples_per_second": 5.161,
1238
+ "eval_steps_per_second": 2.581,
1239
+ "step": 15000
1240
+ },
1241
+ {
1242
+ "epoch": 5.31,
1243
+ "learning_rate": 3.343470483005367e-05,
1244
+ "loss": 0.5467,
1245
+ "step": 15100
1246
+ },
1247
+ {
1248
+ "epoch": 5.34,
1249
+ "learning_rate": 3.318425760286225e-05,
1250
+ "loss": 0.5466,
1251
+ "step": 15200
1252
+ },
1253
+ {
1254
+ "epoch": 5.38,
1255
+ "learning_rate": 3.2933810375670835e-05,
1256
+ "loss": 0.5439,
1257
+ "step": 15300
1258
+ },
1259
+ {
1260
+ "epoch": 5.41,
1261
+ "learning_rate": 3.2683363148479426e-05,
1262
+ "loss": 0.5403,
1263
+ "step": 15400
1264
+ },
1265
+ {
1266
+ "epoch": 5.45,
1267
+ "learning_rate": 3.243291592128801e-05,
1268
+ "loss": 0.5481,
1269
+ "step": 15500
1270
+ },
1271
+ {
1272
+ "epoch": 5.45,
1273
+ "eval_bleu": 44.053,
1274
+ "eval_chrf++": 62.418,
1275
+ "eval_gen_len": 17.7614,
1276
+ "eval_loss": 0.6976599097251892,
1277
+ "eval_runtime": 1417.644,
1278
+ "eval_samples_per_second": 5.164,
1279
+ "eval_steps_per_second": 2.582,
1280
+ "step": 15500
1281
+ },
1282
+ {
1283
+ "epoch": 5.48,
1284
+ "learning_rate": 3.21824686940966e-05,
1285
+ "loss": 0.5445,
1286
+ "step": 15600
1287
+ },
1288
+ {
1289
+ "epoch": 5.52,
1290
+ "learning_rate": 3.1932021466905183e-05,
1291
+ "loss": 0.5518,
1292
+ "step": 15700
1293
+ },
1294
+ {
1295
+ "epoch": 5.55,
1296
+ "learning_rate": 3.1681574239713774e-05,
1297
+ "loss": 0.5451,
1298
+ "step": 15800
1299
+ },
1300
+ {
1301
+ "epoch": 5.59,
1302
+ "learning_rate": 3.143112701252236e-05,
1303
+ "loss": 0.5387,
1304
+ "step": 15900
1305
+ },
1306
+ {
1307
+ "epoch": 5.62,
1308
+ "learning_rate": 3.118067978533094e-05,
1309
+ "loss": 0.5473,
1310
+ "step": 16000
1311
+ },
1312
+ {
1313
+ "epoch": 5.62,
1314
+ "eval_bleu": 44.2406,
1315
+ "eval_chrf++": 62.4882,
1316
+ "eval_gen_len": 17.8263,
1317
+ "eval_loss": 0.6982511878013611,
1318
+ "eval_runtime": 1415.9788,
1319
+ "eval_samples_per_second": 5.17,
1320
+ "eval_steps_per_second": 2.585,
1321
+ "step": 16000
1322
+ },
1323
+ {
1324
+ "epoch": 5.66,
1325
+ "learning_rate": 3.093023255813953e-05,
1326
+ "loss": 0.5483,
1327
+ "step": 16100
1328
+ },
1329
+ {
1330
+ "epoch": 5.69,
1331
+ "learning_rate": 3.0679785330948115e-05,
1332
+ "loss": 0.537,
1333
+ "step": 16200
1334
+ },
1335
+ {
1336
+ "epoch": 5.73,
1337
+ "learning_rate": 3.0429338103756705e-05,
1338
+ "loss": 0.5479,
1339
+ "step": 16300
1340
+ },
1341
+ {
1342
+ "epoch": 5.76,
1343
+ "learning_rate": 3.0178890876565292e-05,
1344
+ "loss": 0.5445,
1345
+ "step": 16400
1346
+ },
1347
+ {
1348
+ "epoch": 5.8,
1349
+ "learning_rate": 2.992844364937388e-05,
1350
+ "loss": 0.5466,
1351
+ "step": 16500
1352
+ },
1353
+ {
1354
+ "epoch": 5.8,
1355
+ "eval_bleu": 44.2692,
1356
+ "eval_chrf++": 62.4172,
1357
+ "eval_gen_len": 17.7783,
1358
+ "eval_loss": 0.6904399394989014,
1359
+ "eval_runtime": 1415.8621,
1360
+ "eval_samples_per_second": 5.171,
1361
+ "eval_steps_per_second": 2.586,
1362
+ "step": 16500
1363
+ }
1364
+ ],
1365
+ "logging_steps": 100,
1366
+ "max_steps": 28450,
1367
+ "num_train_epochs": 10,
1368
+ "save_steps": 1500,
1369
+ "total_flos": 2.2884516243072614e+18,
1370
+ "trial_name": null,
1371
+ "trial_params": null
1372
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4fb48f2e608fd6bf18e446724f849abe2dd8a18b3cb10cdd4bff2b84d2d209d9
3
+ size 4728