tiedeman committed
Commit 55e1a3a
1 Parent(s): dfe3484

Initial commit

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.spm filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,616 @@
+ ---
+ library_name: transformers
+ language:
+ - de
+ - en
+ - es
+ - fr
+ - lt
+ - lv
+ - prg
+ - pt
+ - sgs
+
+ tags:
+ - translation
+ - opus-mt-tc-bible
+
+ license: apache-2.0
+ model-index:
+ - name: opus-mt-tc-bible-big-deu_eng_fra_por_spa-bat
+   results:
+   - task:
+       name: Translation deu-lit
+       type: translation
+       args: deu-lit
+     dataset:
+       name: flores200-devtest
+       type: flores200-devtest
+       args: deu-lit
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 22.6
+     - name: chr-F
+       type: chrf
+       value: 0.54957
+   - task:
+       name: Translation eng-lit
+       type: translation
+       args: eng-lit
+     dataset:
+       name: flores200-devtest
+       type: flores200-devtest
+       args: eng-lit
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 27.7
+     - name: chr-F
+       type: chrf
+       value: 0.59338
+   - task:
+       name: Translation fra-lit
+       type: translation
+       args: fra-lit
+     dataset:
+       name: flores200-devtest
+       type: flores200-devtest
+       args: fra-lit
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 22.3
+     - name: chr-F
+       type: chrf
+       value: 0.54683
+   - task:
+       name: Translation por-lit
+       type: translation
+       args: por-lit
+     dataset:
+       name: flores200-devtest
+       type: flores200-devtest
+       args: por-lit
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 22.6
+     - name: chr-F
+       type: chrf
+       value: 0.55033
+   - task:
+       name: Translation spa-lit
+       type: translation
+       args: spa-lit
+     dataset:
+       name: flores200-devtest
+       type: flores200-devtest
+       args: spa-lit
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 16.9
+     - name: chr-F
+       type: chrf
+       value: 0.50725
+   - task:
+       name: Translation deu-lav
+       type: translation
+       args: deu-lav
+     dataset:
+       name: flores101-devtest
+       type: flores_101
+       args: deu lav devtest
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 24.4
+     - name: chr-F
+       type: chrf
+       value: 0.54724
+   - task:
+       name: Translation eng-lav
+       type: translation
+       args: eng-lav
+     dataset:
+       name: flores101-devtest
+       type: flores_101
+       args: eng lav devtest
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 31.0
+     - name: chr-F
+       type: chrf
+       value: 0.59955
+   - task:
+       name: Translation eng-lit
+       type: translation
+       args: eng-lit
+     dataset:
+       name: flores101-devtest
+       type: flores_101
+       args: eng lit devtest
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 27.2
+     - name: chr-F
+       type: chrf
+       value: 0.58961
+   - task:
+       name: Translation fra-lav
+       type: translation
+       args: fra-lav
+     dataset:
+       name: flores101-devtest
+       type: flores_101
+       args: fra lav devtest
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 24.2
+     - name: chr-F
+       type: chrf
+       value: 0.54276
+   - task:
+       name: Translation fra-lit
+       type: translation
+       args: fra-lit
+     dataset:
+       name: flores101-devtest
+       type: flores_101
+       args: fra lit devtest
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 22.4
+     - name: chr-F
+       type: chrf
+       value: 0.54665
+   - task:
+       name: Translation spa-lav
+       type: translation
+       args: spa-lav
+     dataset:
+       name: flores101-devtest
+       type: flores_101
+       args: spa lav devtest
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 17.8
+     - name: chr-F
+       type: chrf
+       value: 0.50131
+   - task:
+       name: Translation deu-lav
+       type: translation
+       args: deu-lav
+     dataset:
+       name: ntrex128
+       type: ntrex128
+       args: deu-lav
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 16.8
+     - name: chr-F
+       type: chrf
+       value: 0.47980
+   - task:
+       name: Translation deu-lit
+       type: translation
+       args: deu-lit
+     dataset:
+       name: ntrex128
+       type: ntrex128
+       args: deu-lit
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 17.6
+     - name: chr-F
+       type: chrf
+       value: 0.50645
+   - task:
+       name: Translation eng-lav
+       type: translation
+       args: eng-lav
+     dataset:
+       name: ntrex128
+       type: ntrex128
+       args: eng-lav
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 20.6
+     - name: chr-F
+       type: chrf
+       value: 0.51026
+   - task:
+       name: Translation eng-lit
+       type: translation
+       args: eng-lit
+     dataset:
+       name: ntrex128
+       type: ntrex128
+       args: eng-lit
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 21.5
+     - name: chr-F
+       type: chrf
+       value: 0.54187
+   - task:
+       name: Translation fra-lav
+       type: translation
+       args: fra-lav
+     dataset:
+       name: ntrex128
+       type: ntrex128
+       args: fra-lav
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 15.5
+     - name: chr-F
+       type: chrf
+       value: 0.45346
+   - task:
+       name: Translation fra-lit
+       type: translation
+       args: fra-lit
+     dataset:
+       name: ntrex128
+       type: ntrex128
+       args: fra-lit
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 16.2
+     - name: chr-F
+       type: chrf
+       value: 0.48870
+   - task:
+       name: Translation por-lav
+       type: translation
+       args: por-lav
+     dataset:
+       name: ntrex128
+       type: ntrex128
+       args: por-lav
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 17.3
+     - name: chr-F
+       type: chrf
+       value: 0.47809
+   - task:
+       name: Translation por-lit
+       type: translation
+       args: por-lit
+     dataset:
+       name: ntrex128
+       type: ntrex128
+       args: por-lit
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 17.5
+     - name: chr-F
+       type: chrf
+       value: 0.50653
+   - task:
+       name: Translation spa-lav
+       type: translation
+       args: spa-lav
+     dataset:
+       name: ntrex128
+       type: ntrex128
+       args: spa-lav
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 17.1
+     - name: chr-F
+       type: chrf
+       value: 0.47690
+   - task:
+       name: Translation spa-lit
+       type: translation
+       args: spa-lit
+     dataset:
+       name: ntrex128
+       type: ntrex128
+       args: spa-lit
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 17.1
+     - name: chr-F
+       type: chrf
+       value: 0.50412
+   - task:
+       name: Translation deu-lit
+       type: translation
+       args: deu-lit
+     dataset:
+       name: tatoeba-test-v2021-08-07
+       type: tatoeba_mt
+       args: deu-lit
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 39.8
+     - name: chr-F
+       type: chrf
+       value: 0.65379
+   - task:
+       name: Translation eng-lav
+       type: translation
+       args: eng-lav
+     dataset:
+       name: tatoeba-test-v2021-08-07
+       type: tatoeba_mt
+       args: eng-lav
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 46.4
+     - name: chr-F
+       type: chrf
+       value: 0.68823
+   - task:
+       name: Translation eng-lit
+       type: translation
+       args: eng-lit
+     dataset:
+       name: tatoeba-test-v2021-08-07
+       type: tatoeba_mt
+       args: eng-lit
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 39.8
+     - name: chr-F
+       type: chrf
+       value: 0.67792
+   - task:
+       name: Translation multi-multi
+       type: translation
+       args: multi-multi
+     dataset:
+       name: tatoeba-test-v2020-07-28-v2023-09-26
+       type: tatoeba_mt
+       args: multi-multi
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 43.3
+     - name: chr-F
+       type: chrf
+       value: 0.68018
+   - task:
+       name: Translation spa-lit
+       type: translation
+       args: spa-lit
+     dataset:
+       name: tatoeba-test-v2021-08-07
+       type: tatoeba_mt
+       args: spa-lit
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 43.3
+     - name: chr-F
+       type: chrf
+       value: 0.68133
+   - task:
+       name: Translation eng-lav
+       type: translation
+       args: eng-lav
+     dataset:
+       name: newstest2017
+       type: wmt-2017-news
+       args: eng-lav
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 21.5
+     - name: chr-F
+       type: chrf
+       value: 0.53192
+   - task:
+       name: Translation eng-lit
+       type: translation
+       args: eng-lit
+     dataset:
+       name: newstest2019
+       type: wmt-2019-news
+       args: eng-lit
+     metrics:
+     - name: BLEU
+       type: bleu
+       value: 18.3
+     - name: chr-F
+       type: chrf
+       value: 0.51714
+ ---
+ # opus-mt-tc-bible-big-deu_eng_fra_por_spa-bat
+
+ ## Table of Contents
+ - [Model Details](#model-details)
+ - [Uses](#uses)
+ - [Risks, Limitations and Biases](#risks-limitations-and-biases)
+ - [How to Get Started With the Model](#how-to-get-started-with-the-model)
+ - [Training](#training)
+ - [Evaluation](#evaluation)
+ - [Citation Information](#citation-information)
+ - [Acknowledgements](#acknowledgements)
+
+ ## Model Details
+
+ Neural machine translation model for translating from German, English, French, Portuguese and Spanish (deu+eng+fra+por+spa) to Baltic languages (bat).
+
+ This model is part of the [OPUS-MT project](https://github.com/Helsinki-NLP/Opus-MT), an effort to make neural machine translation models widely available and accessible for many languages in the world. All models are originally trained with [Marian NMT](https://marian-nmt.github.io/), an efficient NMT implementation written in pure C++, and then converted to PyTorch using the Hugging Face transformers library. Training data is taken from [OPUS](https://opus.nlpl.eu/) and the training pipelines follow the procedures of [OPUS-MT-train](https://github.com/Helsinki-NLP/Opus-MT-train).
+
+ **Model Description:**
+ - **Developed by:** Language Technology Research Group at the University of Helsinki
+ - **Model Type:** Translation (transformer-big)
+ - **Release:** 2024-05-30
+ - **License:** Apache-2.0
+ - **Language(s):**
+   - Source Language(s): deu eng fra por spa
+   - Target Language(s): lav lit prg sgs
+   - Valid Target Language Labels: >>lav<< >>lit<< >>ndf<< >>olt<< >>prg<< >>prg_Latn<< >>sgs<< >>svx<< >>sxl<< >>xcu<< >>xgl<< >>xsv<< >>xzm<<
+ - **Original Model:** [opusTCv20230926max50+bt+jhubc_transformer-big_2024-05-30.zip](https://object.pouta.csc.fi/Tatoeba-MT-models/deu+eng+fra+por+spa-bat/opusTCv20230926max50+bt+jhubc_transformer-big_2024-05-30.zip)
+ - **Resources for more information:**
+   - [OPUS-MT dashboard](https://opus.nlpl.eu/dashboard/index.php?pkg=opusmt&test=all&scoreslang=all&chart=standard&model=Tatoeba-MT-models/deu%2Beng%2Bfra%2Bpor%2Bspa-bat/opusTCv20230926max50%2Bbt%2Bjhubc_transformer-big_2024-05-30)
+   - [OPUS-MT-train GitHub Repo](https://github.com/Helsinki-NLP/OPUS-MT-train)
+   - [More information about MarianNMT models in the transformers library](https://huggingface.co/docs/transformers/model_doc/marian)
+   - [Tatoeba Translation Challenge](https://github.com/Helsinki-NLP/Tatoeba-Challenge/)
+   - [HPLT bilingual data v1 (as part of the Tatoeba Translation Challenge dataset)](https://hplt-project.org/datasets/v1)
+   - [A massively parallel Bible corpus](https://aclanthology.org/L14-1215/)
+
+ This is a multilingual translation model with multiple target languages. A sentence-initial language token in the form `>>id<<` (id = a valid target language ID) is required, e.g. `>>lav<<`.
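+
+ A minimal sketch of how such tagged inputs can be built programmatically (the `tag` helper and the `tgt_lang` variable are illustrative, not part of the model's API):
+
+ ```python
+ # Prepend the target-language token that this multilingual model expects.
+ tgt_lang = "lav"  # any valid target language label listed above
+
+ def tag(text: str, lang: str = tgt_lang) -> str:
+     return f">>{lang}<< {text}"
+
+ print(tag("This is a test."))  # >>lav<< This is a test.
+ ```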
+
+ ## Uses
+
+ This model can be used for translation and text-to-text generation.
+
+ ## Risks, Limitations and Biases
+
+ **CONTENT WARNING: Readers should be aware that the model is trained on various public data sets that may contain content that is disturbing, offensive, and can propagate historical and current stereotypes.**
+
+ Significant research has explored bias and fairness issues with language models (see, e.g., [Sheng et al. (2021)](https://aclanthology.org/2021.acl-long.330.pdf) and [Bender et al. (2021)](https://dl.acm.org/doi/pdf/10.1145/3442188.3445922)).
+
+ ## How to Get Started With the Model
+
+ A short code example:
+
+ ```python
+ from transformers import MarianMTModel, MarianTokenizer
+
+ # Each input sentence starts with a target-language token (see above).
+ src_text = [
+     ">>lav<< Replace this with text in an accepted source language.",
+     ">>sgs<< This is the second sentence."
+ ]
+
+ model_name = "Helsinki-NLP/opus-mt-tc-bible-big-deu_eng_fra_por_spa-bat"
+ tokenizer = MarianTokenizer.from_pretrained(model_name)
+ model = MarianMTModel.from_pretrained(model_name)
+
+ # Tokenize with padding and translate the whole batch.
+ translated = model.generate(**tokenizer(src_text, return_tensors="pt", padding=True))
+
+ for t in translated:
+     print(tokenizer.decode(t, skip_special_tokens=True))
+ ```
+
+ You can also use OPUS-MT models with the transformers pipeline, for example:
+
+ ```python
+ from transformers import pipeline
+
+ pipe = pipeline("translation", model="Helsinki-NLP/opus-mt-tc-bible-big-deu_eng_fra_por_spa-bat")
+ print(pipe(">>lav<< Replace this with text in an accepted source language."))
+ ```
+
+ ## Training
+
+ - **Data**: opusTCv20230926max50+bt+jhubc ([source](https://github.com/Helsinki-NLP/Tatoeba-Challenge))
+ - **Pre-processing**: SentencePiece (spm32k,spm32k); see the tokenizer sketch after this list
+ - **Model Type:** transformer-big
+ - **Original MarianNMT Model**: [opusTCv20230926max50+bt+jhubc_transformer-big_2024-05-30.zip](https://object.pouta.csc.fi/Tatoeba-MT-models/deu+eng+fra+por+spa-bat/opusTCv20230926max50+bt+jhubc_transformer-big_2024-05-30.zip)
+ - **Training Scripts**: [GitHub Repo](https://github.com/Helsinki-NLP/OPUS-MT-train)
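+
+ A minimal sketch of what that SentencePiece segmentation looks like, assuming you download `source.spm` from this repository (the `sentencepiece` and `huggingface_hub` calls below are illustrative and not part of the official training pipeline):
+
+ ```python
+ import sentencepiece as spm
+ from huggingface_hub import hf_hub_download
+
+ # Fetch the source-side SentencePiece model shipped with this repository.
+ spm_path = hf_hub_download(
+     repo_id="Helsinki-NLP/opus-mt-tc-bible-big-deu_eng_fra_por_spa-bat",
+     filename="source.spm",
+ )
+ sp = spm.SentencePieceProcessor(model_file=spm_path)
+
+ # Inspect how a sentence is segmented into subword pieces (32k vocabulary).
+ print(sp.encode("This is a test.", out_type=str))
+ ```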
+
+ ## Evaluation
+
+ * [Model scores at the OPUS-MT dashboard](https://opus.nlpl.eu/dashboard/index.php?pkg=opusmt&test=all&scoreslang=all&chart=standard&model=Tatoeba-MT-models/deu%2Beng%2Bfra%2Bpor%2Bspa-bat/opusTCv20230926max50%2Bbt%2Bjhubc_transformer-big_2024-05-30)
+ * test set translations: [opusTCv20230926max50+bt+jhubc_transformer-big_2024-05-29.test.txt](https://object.pouta.csc.fi/Tatoeba-MT-models/deu+eng+fra+por+spa-bat/opusTCv20230926max50+bt+jhubc_transformer-big_2024-05-29.test.txt)
+ * test set scores: [opusTCv20230926max50+bt+jhubc_transformer-big_2024-05-29.eval.txt](https://object.pouta.csc.fi/Tatoeba-MT-models/deu+eng+fra+por+spa-bat/opusTCv20230926max50+bt+jhubc_transformer-big_2024-05-29.eval.txt)
+ * benchmark results: [benchmark_results.txt](benchmark_results.txt)
+ * benchmark output: [benchmark_translations.zip](benchmark_translations.zip)
+
+ | langpair | testset | chr-F | BLEU | #sent | #words |
+ |----------|---------|-------|------|-------|--------|
+ | deu-lit | tatoeba-test-v2021-08-07 | 0.65379 | 39.8 | 1115 | 7091 |
+ | eng-lav | tatoeba-test-v2021-08-07 | 0.68823 | 46.4 | 1631 | 9932 |
+ | eng-lit | tatoeba-test-v2021-08-07 | 0.67792 | 39.8 | 2528 | 14942 |
+ | spa-lit | tatoeba-test-v2021-08-07 | 0.68133 | 43.3 | 454 | 2352 |
+ | deu-lav | flores101-devtest | 0.54724 | 24.4 | 1012 | 22092 |
+ | eng-lav | flores101-devtest | 0.59955 | 31.0 | 1012 | 22092 |
+ | eng-lit | flores101-devtest | 0.58961 | 27.2 | 1012 | 20695 |
+ | fra-lav | flores101-devtest | 0.54276 | 24.2 | 1012 | 22092 |
+ | fra-lit | flores101-devtest | 0.54665 | 22.4 | 1012 | 20695 |
+ | spa-lav | flores101-devtest | 0.50131 | 17.8 | 1012 | 22092 |
+ | deu-lit | flores200-devtest | 0.54957 | 22.6 | 1012 | 20695 |
+ | eng-lit | flores200-devtest | 0.59338 | 27.7 | 1012 | 20695 |
+ | fra-lit | flores200-devtest | 0.54683 | 22.3 | 1012 | 20695 |
+ | por-lit | flores200-devtest | 0.55033 | 22.6 | 1012 | 20695 |
+ | spa-lit | flores200-devtest | 0.50725 | 16.9 | 1012 | 20695 |
+ | eng-lav | newstest2017 | 0.53192 | 21.5 | 2001 | 39392 |
+ | eng-lit | newstest2019 | 0.51714 | 18.3 | 998 | 19711 |
+ | deu-lav | ntrex128 | 0.47980 | 16.8 | 1997 | 44709 |
+ | deu-lit | ntrex128 | 0.50645 | 17.6 | 1997 | 41189 |
+ | eng-lav | ntrex128 | 0.51026 | 20.6 | 1997 | 44709 |
+ | eng-lit | ntrex128 | 0.54187 | 21.5 | 1997 | 41189 |
+ | fra-lav | ntrex128 | 0.45346 | 15.5 | 1997 | 44709 |
+ | fra-lit | ntrex128 | 0.48870 | 16.2 | 1997 | 41189 |
+ | por-lav | ntrex128 | 0.47809 | 17.3 | 1997 | 44709 |
+ | por-lit | ntrex128 | 0.50653 | 17.5 | 1997 | 41189 |
+ | spa-lav | ntrex128 | 0.47690 | 17.1 | 1997 | 44709 |
+ | spa-lit | ntrex128 | 0.50412 | 17.1 | 1997 | 41189 |
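+
+ Both metrics can be computed with `sacrebleu` (a minimal sketch, not the exact evaluation setup behind the table; the two sentence lists are placeholders, and note that sacrebleu reports chrF on a 0-100 scale while the table uses 0-1):
+
+ ```python
+ from sacrebleu.metrics import BLEU, CHRF
+
+ # Placeholder data: system outputs and references, one sentence per entry.
+ hyps = ["Tas ir tests."]
+ refs = ["Tas ir tests."]
+
+ bleu = BLEU()
+ chrf = CHRF()
+ print(f"BLEU:  {bleu.corpus_score(hyps, [refs]).score:.1f}")
+ print(f"chr-F: {chrf.corpus_score(hyps, [refs]).score / 100:.5f}")
+ ```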
564
+
565
+ ## Citation Information
566
+
567
+ * Publications: [Democratizing neural machine translation with OPUS-MT](https://doi.org/10.1007/s10579-023-09704-w) and [OPUS-MT – Building open translation services for the World](https://aclanthology.org/2020.eamt-1.61/) and [The Tatoeba Translation Challenge – Realistic Data Sets for Low Resource and Multilingual MT](https://aclanthology.org/2020.wmt-1.139/) (Please, cite if you use this model.)
568
+
569
+ ```bibtex
570
+ @article{tiedemann2023democratizing,
571
+ title={Democratizing neural machine translation with {OPUS-MT}},
572
+ author={Tiedemann, J{\"o}rg and Aulamo, Mikko and Bakshandaeva, Daria and Boggia, Michele and Gr{\"o}nroos, Stig-Arne and Nieminen, Tommi and Raganato, Alessandro and Scherrer, Yves and Vazquez, Raul and Virpioja, Sami},
573
+ journal={Language Resources and Evaluation},
574
+ number={58},
575
+ pages={713--755},
576
+ year={2023},
577
+ publisher={Springer Nature},
578
+ issn={1574-0218},
579
+ doi={10.1007/s10579-023-09704-w}
580
+ }
581
+
582
+ @inproceedings{tiedemann-thottingal-2020-opus,
583
+ title = "{OPUS}-{MT} {--} Building open translation services for the World",
584
+ author = {Tiedemann, J{\"o}rg and Thottingal, Santhosh},
585
+ booktitle = "Proceedings of the 22nd Annual Conference of the European Association for Machine Translation",
586
+ month = nov,
587
+ year = "2020",
588
+ address = "Lisboa, Portugal",
589
+ publisher = "European Association for Machine Translation",
590
+ url = "https://aclanthology.org/2020.eamt-1.61",
591
+ pages = "479--480",
592
+ }
593
+
594
+ @inproceedings{tiedemann-2020-tatoeba,
595
+ title = "The Tatoeba Translation Challenge {--} Realistic Data Sets for Low Resource and Multilingual {MT}",
596
+ author = {Tiedemann, J{\"o}rg},
597
+ booktitle = "Proceedings of the Fifth Conference on Machine Translation",
598
+ month = nov,
599
+ year = "2020",
600
+ address = "Online",
601
+ publisher = "Association for Computational Linguistics",
602
+ url = "https://aclanthology.org/2020.wmt-1.139",
603
+ pages = "1174--1182",
604
+ }
605
+ ```
606
+
607
+ ## Acknowledgements
608
+
609
+ The work is supported by the [HPLT project](https://hplt-project.org/), funded by the European Union’s Horizon Europe research and innovation programme under grant agreement No 101070350. We are also grateful for the generous computational resources and IT infrastructure provided by [CSC -- IT Center for Science](https://www.csc.fi/), Finland, and the [EuroHPC supercomputer LUMI](https://www.lumi-supercomputer.eu/).
610
+
611
+ ## Model conversion info
612
+
613
+ * transformers version: 4.45.1
614
+ * OPUS-MT git hash: 0882077
615
+ * port time: Tue Oct 8 00:43:04 EEST 2024
616
+ * port machine: LM0-400-22516.local
benchmark_results.txt ADDED
@@ -0,0 +1,32 @@
+ multi-multi tatoeba-test-v2020-07-28-v2023-09-26 0.68018 43.3 6367 38034
+ deu-lav flores101-devtest 0.54724 24.4 1012 22092
+ eng-lav flores101-devtest 0.59955 31.0 1012 22092
+ eng-lit flores101-devtest 0.58961 27.2 1012 20695
+ fra-lav flores101-devtest 0.54276 24.2 1012 22092
+ fra-lit flores101-devtest 0.54665 22.4 1012 20695
+ spa-lav flores101-devtest 0.50131 17.8 1012 22092
+ deu-lit flores200-devtest 0.54957 22.6 1012 20695
+ eng-lit flores200-devtest 0.59338 27.7 1012 20695
+ fra-lit flores200-devtest 0.54683 22.3 1012 20695
+ por-lit flores200-devtest 0.55033 22.6 1012 20695
+ spa-lit flores200-devtest 0.50725 16.9 1012 20695
+ eng-lav newstest2017 0.53192 21.5 2001 39392
+ eng-lit newstest2019 0.51714 18.3 998 19711
+ deu-lav ntrex128 0.47980 16.8 1997 44709
+ deu-lit ntrex128 0.50645 17.6 1997 41189
+ eng-lav ntrex128 0.51026 20.6 1997 44709
+ eng-lit ntrex128 0.54187 21.5 1997 41189
+ fra-lav ntrex128 0.45346 15.5 1997 44709
+ fra-lit ntrex128 0.48870 16.2 1997 41189
+ por-lav ntrex128 0.47809 17.3 1997 44709
+ por-lit ntrex128 0.50653 17.5 1997 41189
+ spa-lav ntrex128 0.47690 17.1 1997 44709
+ spa-lit ntrex128 0.50412 17.1 1997 41189
+ eng-lit tatoeba-test-v2020-07-28 0.67468 39.5 2500 14798
+ spa-lit tatoeba-test-v2020-07-28 0.68015 42.8 452 2341
+ eng-lit tatoeba-test-v2021-03-30 0.67451 39.5 5003 29598
+ spa-lit tatoeba-test-v2021-03-30 0.68064 42.8 457 2364
+ deu-lit tatoeba-test-v2021-08-07 0.65379 39.8 1115 7091
+ eng-lav tatoeba-test-v2021-08-07 0.68823 46.4 1631 9932
+ eng-lit tatoeba-test-v2021-08-07 0.67792 39.8 2528 14942
+ spa-lit tatoeba-test-v2021-08-07 0.68133 43.3 454 2352
benchmark_translations.zip ADDED
File without changes
config.json ADDED
@@ -0,0 +1,41 @@
+ {
+   "_name_or_path": "pytorch-models/opus-mt-tc-bible-big-deu_eng_fra_por_spa-bat",
+   "activation_dropout": 0.0,
+   "activation_function": "relu",
+   "architectures": [
+     "MarianMTModel"
+   ],
+   "attention_dropout": 0.0,
+   "bos_token_id": 0,
+   "classifier_dropout": 0.0,
+   "d_model": 1024,
+   "decoder_attention_heads": 16,
+   "decoder_ffn_dim": 4096,
+   "decoder_layerdrop": 0.0,
+   "decoder_layers": 6,
+   "decoder_start_token_id": 59472,
+   "decoder_vocab_size": 59473,
+   "dropout": 0.1,
+   "encoder_attention_heads": 16,
+   "encoder_ffn_dim": 4096,
+   "encoder_layerdrop": 0.0,
+   "encoder_layers": 6,
+   "eos_token_id": 794,
+   "forced_eos_token_id": null,
+   "init_std": 0.02,
+   "is_encoder_decoder": true,
+   "max_length": null,
+   "max_position_embeddings": 1024,
+   "model_type": "marian",
+   "normalize_embedding": false,
+   "num_beams": null,
+   "num_hidden_layers": 6,
+   "pad_token_id": 59472,
+   "scale_embedding": true,
+   "share_encoder_decoder_embeddings": true,
+   "static_position_embeddings": true,
+   "torch_dtype": "float32",
+   "transformers_version": "4.45.1",
+   "use_cache": true,
+   "vocab_size": 59473
+ }
generation_config.json ADDED
@@ -0,0 +1,16 @@
+ {
+   "_from_model_config": true,
+   "bad_words_ids": [
+     [
+       59472
+     ]
+   ],
+   "bos_token_id": 0,
+   "decoder_start_token_id": 59472,
+   "eos_token_id": 794,
+   "forced_eos_token_id": 794,
+   "max_length": 512,
+   "num_beams": 4,
+   "pad_token_id": 59472,
+   "transformers_version": "4.45.1"
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:616de070605feaf979b012d637b0897383f39ea8eefd8decbd509f3fb417af56
+ size 949298420
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:66cf068395cce6051dc34937761735e3c6d4a5ad1812174d379af42d7e9f1b87
+ size 949349701
source.spm ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cea29a15c91ec7a8ea5ab10c658767ea741783eef15a4ea485c1e38906f49f00
+ size 819310
special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {"eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>"}
target.spm ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cac0e1178e738a1ee0014be2dd4c93f0d79232e895ab2273cce38c61a9bf4b1c
+ size 834052
tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"source_lang": "deu+eng+fra+por+spa", "target_lang": "bat", "unk_token": "<unk>", "eos_token": "</s>", "pad_token": "<pad>", "model_max_length": 512, "sp_model_kwargs": {}, "separate_vocabs": false, "special_tokens_map_file": null, "name_or_path": "marian-models/opusTCv20230926max50+bt+jhubc_transformer-big_2024-05-30/deu+eng+fra+por+spa-bat", "tokenizer_class": "MarianTokenizer"}
vocab.json ADDED
The diff for this file is too large to render. See raw diff