lgrobol committed on
Commit
f1d5efc
1 Parent(s): 3df5dc2

Upload dl_opus.yaml

Browse files
Files changed (1) hide show
  1. dl_opus.yaml +247 -0
dl_opus.yaml ADDED
@@ -0,0 +1,247 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ common:
2
+ output_directory: local/opus
3
+
4
+ steps:
5
+ # The quality of WikiMatrix is very bad for this pair: very poor alignment
6
+ # - type: opus_read
7
+ # parameters:
8
+ # corpus_name: WikiMatrix
9
+ # source_language: br
10
+ # target_language: fr
11
+ # release: latest
12
+ # preprocessing: raw
13
+ # src_output: wiki.br.gz
14
+ # tgt_output: wiki.fr.gz
15
+
16
+ # The quality of MultiCCAligned is very bad for this pair: very few usable Breton sentences
17
+ # - type: opus_read
18
+ # parameters:
19
+ # corpus_name: MultiCCAligned
20
+ # source_language: br
21
+ # target_language: fr
22
+ # release: latest
23
+ # preprocessing: raw
24
+ # src_output: cc.br.gz
25
+ # tgt_output: cc.fr.gz
26
+
27
+ - type: opus_read
28
+ parameters:
29
+ corpus_name: OfisPublik
30
+ source_language: br
31
+ target_language: fr
32
+ release: latest
33
+ preprocessing: raw
34
+ src_output: ofis.br.gz
35
+ tgt_output: ofis.fr.gz
36
+ suppress_prompts: true
37
+
38
+ - type: opus_read
39
+ parameters:
40
+ corpus_name: OpenSubtitles
41
+ source_language: br
42
+ target_language: fr
43
+ release: latest
44
+ preprocessing: raw
45
+ src_output: ost.br.gz
46
+ tgt_output: ost.fr.gz
47
+ suppress_prompts: true
48
+
49
+ - type: opus_read
50
+ parameters:
51
+ corpus_name: Tatoeba
52
+ source_language: br
53
+ target_language: fr
54
+ release: latest
55
+ preprocessing: raw
56
+ src_output: tatoeba.br.gz
57
+ tgt_output: tatoeba.fr.gz
58
+ suppress_prompts: true
59
+
60
+ # - type: opus_read
61
+ # parameters:
62
+ # corpus_name: wikimedia
63
+ # source_language: br
64
+ # target_language: fr
65
+ # release: latest
66
+ # preprocessing: raw
67
+ # src_output: wikimedia.br.gz
68
+ # tgt_output: wikimedia.fr.gz
69
+ # suppress_prompts: true
70
+
71
+ # - type: opus_read
72
+ # parameters:
73
+ # corpus_name: Mozilla-I10n
74
+ # source_language: br
75
+ # target_language: fr
76
+ # release: latest
77
+ # preprocessing: raw
78
+ # src_output: mozilla.br.gz
79
+ # tgt_output: mozilla.fr.gz
80
+ # suppress_prompts: true
81
+
82
+ # - type: opus_read
83
+ # parameters:
84
+ # corpus_name: KDE4
85
+ # source_language: br
86
+ # target_language: fr
87
+ # release: latest
88
+ # preprocessing: raw
89
+ # src_output: kde.br.gz
90
+ # tgt_output: kde.fr.gz
91
+ # suppress_prompts: true
92
+
93
+ # - type: opus_read
94
+ # parameters:
95
+ # corpus_name: GNOME
96
+ # source_language: br
97
+ # target_language: fr
98
+ # release: latest
99
+ # preprocessing: raw
100
+ # src_output: gnome.br.gz
101
+ # tgt_output: gnome.fr.gz
102
+ # suppress_prompts: true
103
+
104
+ - type: concatenate
105
+ parameters:
106
+ inputs:
107
+ - ofis.br.gz
108
+ - tatoeba.br.gz
109
+ output: good.br.gz
110
+
111
+ - type: concatenate
112
+ parameters:
113
+ inputs:
114
+ - ofis.fr.gz
115
+ - tatoeba.fr.gz
116
+ output: good.fr.gz
117
+
118
+ - type: concatenate
119
+ parameters:
120
+ inputs:
121
+ # - wiki.br.gz
122
+ # - cc.br.gz
123
+ # - wikimedia.br.gz
124
+ # - gnome.br.gz
125
+ # - kde.br.gz
126
+ # - mozilla.br.gz
127
+ - ost.br.gz
128
+ output: dubious.br.gz
129
+
130
+ - type: concatenate
131
+ parameters:
132
+ inputs:
133
+ # - wiki.fr.gz
134
+ # - cc.fr.gz
135
+ # - wikimedia.fr.gz
136
+ # - gnome.fr.gz
137
+ # - kde.fr.gz
138
+ # - mozilla.fr.gz
139
+ - ost.fr.gz
140
+ output: dubious.fr.gz
141
+
142
+ - type: concatenate
143
+ parameters:
144
+ inputs:
145
+ - dubious.br.gz
146
+ - good.br.gz
147
+ output: align_train.br.gz
148
+
149
+ - type: concatenate
150
+ parameters:
151
+ inputs:
152
+ - dubious.fr.gz
153
+ - good.fr.gz
154
+ output: align_train.fr.gz
155
+
156
+ - type: filter
157
+ parameters:
158
+ inputs:
159
+ - align_train.br.gz
160
+ - align_train.fr.gz
161
+ outputs:
162
+ - align_train-filtered.br.gz
163
+ - align_train-filtered.fr.gz
164
+ filters:
165
+ - LengthFilter:
166
+ unit: word
167
+ min_length: 1
168
+ max_length: 128
169
+
170
+ - type: train_alignment
171
+ parameters:
172
+ src_data: align_train-filtered.br.gz
173
+ tgt_data: align_train-filtered.fr.gz
174
+ output: alignment.priors
175
+ parameters: {}
176
+
177
+ # TODO: deduplicate and apply more aggressive filtering
178
+ - type: filter
179
+ parameters:
180
+ inputs:
181
+ - dubious.br.gz
182
+ - dubious.fr.gz
183
+ outputs:
184
+ - dubious-filtered.br.gz
185
+ - dubious-filtered.fr.gz
186
+ filters:
187
+ - LengthFilter:
188
+ unit: word
189
+ min_length: 4
190
+ max_length: 128
191
+ - WordAlignFilter:
192
+ priors: alignment.priors
193
+
194
+ - type: concatenate
195
+ parameters:
196
+ inputs:
197
+ - dubious-filtered.br.gz
198
+ - good.br.gz
199
+ output: all.br.gz
200
+
201
+ - type: concatenate
202
+ parameters:
203
+ inputs:
204
+ - dubious-filtered.fr.gz
205
+ - good.fr.gz
206
+ output: all.fr.gz
207
+
208
+ # - type: remove_duplicates
209
+ # parameters:
210
+ # inputs:
211
+ # - all.br.gz
212
+ # outputs:
213
+ # - dedup.br.gz
214
+
215
+ # - type: remove_duplicates
216
+ # parameters:
217
+ # inputs:
218
+ # - all.fr.gz
219
+ # outputs:
220
+ # - dedup.fr.gz
221
+
222
+ - type: filter
223
+ parameters:
224
+ inputs:
225
+ - all.br.gz
226
+ - all.fr.gz
227
+ outputs:
228
+ - filtered.br.gz
229
+ - filtered.fr.gz
230
+ filters: &myfilters
231
+ - LengthFilter:
232
+ unit: word
233
+ min_length: 1
234
+ max_length: 128
235
+
236
+ - LengthRatioFilter:
237
+ unit: word
238
+ threshold: 3
239
+
240
+ - NonZeroNumeralsFilter: {}
241
+ - AlphabetRatioFilter: {}
242
+ - SimilarityFilter: {}
243
+ - RepetitionFilter:
244
+ threshold: 3
245
+ min_length: 5
246
+ max_length: 128
247
+