Upload dl_opus.yaml
Browse files- dl_opus.yaml +247 -0
dl_opus.yaml
ADDED
@@ -0,0 +1,247 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
common:
|
2 |
+
output_directory: local/opus
|
3 |
+
|
4 |
+
steps:
|
5 |
+
# WikiMatrix is disabled: its quality is very poor for this language pair (badly aligned sentences)
|
6 |
+
# - type: opus_read
|
7 |
+
# parameters:
|
8 |
+
# corpus_name: WikiMatrix
|
9 |
+
# source_language: br
|
10 |
+
# target_language: fr
|
11 |
+
# release: latest
|
12 |
+
# preprocessing: raw
|
13 |
+
# src_output: wiki.br.gz
|
14 |
+
# tgt_output: wiki.fr.gz
|
15 |
+
|
16 |
+
# MultiCCAligned is disabled: its quality is very poor for this language pair (very few usable Breton sentences)
|
17 |
+
# - type: opus_read
|
18 |
+
# parameters:
|
19 |
+
# corpus_name: MultiCCAligned
|
20 |
+
# source_language: br
|
21 |
+
# target_language: fr
|
22 |
+
# release: latest
|
23 |
+
# preprocessing: raw
|
24 |
+
# src_output: cc.br.gz
|
25 |
+
# tgt_output: cc.fr.gz
|
26 |
+
|
27 |
+
- type: opus_read
|
28 |
+
parameters:
|
29 |
+
corpus_name: OfisPublik
|
30 |
+
source_language: br
|
31 |
+
target_language: fr
|
32 |
+
release: latest
|
33 |
+
preprocessing: raw
|
34 |
+
src_output: ofis.br.gz
|
35 |
+
tgt_output: ofis.fr.gz
|
36 |
+
suppress_prompts: true
|
37 |
+
|
38 |
+
- type: opus_read
|
39 |
+
parameters:
|
40 |
+
corpus_name: OpenSubtitles
|
41 |
+
source_language: br
|
42 |
+
target_language: fr
|
43 |
+
release: latest
|
44 |
+
preprocessing: raw
|
45 |
+
src_output: ost.br.gz
|
46 |
+
tgt_output: ost.fr.gz
|
47 |
+
suppress_prompts: true
|
48 |
+
|
49 |
+
- type: opus_read
|
50 |
+
parameters:
|
51 |
+
corpus_name: Tatoeba
|
52 |
+
source_language: br
|
53 |
+
target_language: fr
|
54 |
+
release: latest
|
55 |
+
preprocessing: raw
|
56 |
+
src_output: tatoeba.br.gz
|
57 |
+
tgt_output: tatoeba.fr.gz
|
58 |
+
suppress_prompts: true
|
59 |
+
|
60 |
+
# - type: opus_read
|
61 |
+
# parameters:
|
62 |
+
# corpus_name: wikimedia
|
63 |
+
# source_language: br
|
64 |
+
# target_language: fr
|
65 |
+
# release: latest
|
66 |
+
# preprocessing: raw
|
67 |
+
# src_output: wikimedia.br.gz
|
68 |
+
# tgt_output: wikimedia.fr.gz
|
69 |
+
# suppress_prompts: true
|
70 |
+
|
71 |
+
# - type: opus_read
|
72 |
+
# parameters:
|
73 |
+
# corpus_name: Mozilla-I10n
|
74 |
+
# source_language: br
|
75 |
+
# target_language: fr
|
76 |
+
# release: latest
|
77 |
+
# preprocessing: raw
|
78 |
+
# src_output: mozilla.br.gz
|
79 |
+
# tgt_output: mozilla.fr.gz
|
80 |
+
# suppress_prompts: true
|
81 |
+
|
82 |
+
# - type: opus_read
|
83 |
+
# parameters:
|
84 |
+
# corpus_name: KDE4
|
85 |
+
# source_language: br
|
86 |
+
# target_language: fr
|
87 |
+
# release: latest
|
88 |
+
# preprocessing: raw
|
89 |
+
# src_output: kde.br.gz
|
90 |
+
# tgt_output: kde.fr.gz
|
91 |
+
# suppress_prompts: true
|
92 |
+
|
93 |
+
# - type: opus_read
|
94 |
+
# parameters:
|
95 |
+
# corpus_name: GNOME
|
96 |
+
# source_language: br
|
97 |
+
# target_language: fr
|
98 |
+
# release: latest
|
99 |
+
# preprocessing: raw
|
100 |
+
# src_output: gnome.br.gz
|
101 |
+
# tgt_output: gnome.fr.gz
|
102 |
+
# suppress_prompts: true
|
103 |
+
|
104 |
+
- type: concatenate
|
105 |
+
parameters:
|
106 |
+
inputs:
|
107 |
+
- ofis.br.gz
|
108 |
+
- tatoeba.br.gz
|
109 |
+
output: good.br.gz
|
110 |
+
|
111 |
+
- type: concatenate
|
112 |
+
parameters:
|
113 |
+
inputs:
|
114 |
+
- ofis.fr.gz
|
115 |
+
- tatoeba.fr.gz
|
116 |
+
output: good.fr.gz
|
117 |
+
|
118 |
+
- type: concatenate
|
119 |
+
parameters:
|
120 |
+
inputs:
|
121 |
+
# - wiki.br.gz
|
122 |
+
# - cc.br.gz
|
123 |
+
# - wikimedia.br.gz
|
124 |
+
# - gnome.br.gz
|
125 |
+
# - kde.br.gz
|
126 |
+
# - mozilla.br.gz
|
127 |
+
- ost.br.gz
|
128 |
+
output: dubious.br.gz
|
129 |
+
|
130 |
+
- type: concatenate
|
131 |
+
parameters:
|
132 |
+
inputs:
|
133 |
+
# - wiki.fr.gz
|
134 |
+
# - cc.fr.gz
|
135 |
+
# - wikimedia.fr.gz
|
136 |
+
# - gnome.fr.gz
|
137 |
+
# - kde.fr.gz
|
138 |
+
# - mozilla.fr.gz
|
139 |
+
- ost.fr.gz
|
140 |
+
output: dubious.fr.gz
|
141 |
+
|
142 |
+
- type: concatenate
|
143 |
+
parameters:
|
144 |
+
inputs:
|
145 |
+
- dubious.br.gz
|
146 |
+
- good.br.gz
|
147 |
+
output: align_train.br.gz
|
148 |
+
|
149 |
+
- type: concatenate
|
150 |
+
parameters:
|
151 |
+
inputs:
|
152 |
+
- dubious.fr.gz
|
153 |
+
- good.fr.gz
|
154 |
+
output: align_train.fr.gz
|
155 |
+
|
156 |
+
- type: filter
|
157 |
+
parameters:
|
158 |
+
inputs:
|
159 |
+
- align_train.br.gz
|
160 |
+
- align_train.fr.gz
|
161 |
+
outputs:
|
162 |
+
- align_train-filtered.br.gz
|
163 |
+
- align_train-filtered.fr.gz
|
164 |
+
filters:
|
165 |
+
- LengthFilter:
|
166 |
+
unit: word
|
167 |
+
min_length: 1
|
168 |
+
max_length: 128
|
169 |
+
|
170 |
+
- type: train_alignment
|
171 |
+
parameters:
|
172 |
+
src_data: align_train-filtered.br.gz
|
173 |
+
tgt_data: align_train-filtered.fr.gz
|
174 |
+
output: alignment.priors
|
175 |
+
parameters: {}
|
176 |
+
|
177 |
+
# TODO: deduplicate and apply more aggressive filtering
|
178 |
+
- type: filter
|
179 |
+
parameters:
|
180 |
+
inputs:
|
181 |
+
- dubious.br.gz
|
182 |
+
- dubious.fr.gz
|
183 |
+
outputs:
|
184 |
+
- dubious-filtered.br.gz
|
185 |
+
- dubious-filtered.fr.gz
|
186 |
+
filters:
|
187 |
+
- LengthFilter:
|
188 |
+
unit: word
|
189 |
+
min_length: 4
|
190 |
+
max_length: 128
|
191 |
+
- WordAlignFilter:
|
192 |
+
priors: alignment.priors
|
193 |
+
|
194 |
+
- type: concatenate
|
195 |
+
parameters:
|
196 |
+
inputs:
|
197 |
+
- dubious-filtered.br.gz
|
198 |
+
- good.br.gz
|
199 |
+
output: all.br.gz
|
200 |
+
|
201 |
+
- type: concatenate
|
202 |
+
parameters:
|
203 |
+
inputs:
|
204 |
+
- dubious-filtered.fr.gz
|
205 |
+
- good.fr.gz
|
206 |
+
output: all.fr.gz
|
207 |
+
|
208 |
+
# - type: remove_duplicates
|
209 |
+
# parameters:
|
210 |
+
# inputs:
|
211 |
+
# - all.br.gz
|
212 |
+
# outputs:
|
213 |
+
# - dedup.br.gz
|
214 |
+
|
215 |
+
# - type: remove_duplicates
|
216 |
+
# parameters:
|
217 |
+
# inputs:
|
218 |
+
# - all.fr.gz
|
219 |
+
# outputs:
|
220 |
+
# - dedup.fr.gz
|
221 |
+
|
222 |
+
- type: filter
|
223 |
+
parameters:
|
224 |
+
inputs:
|
225 |
+
- all.br.gz
|
226 |
+
- all.fr.gz
|
227 |
+
outputs:
|
228 |
+
- filtered.br.gz
|
229 |
+
- filtered.fr.gz
|
230 |
+
filters: &myfilters
|
231 |
+
- LengthFilter:
|
232 |
+
unit: word
|
233 |
+
min_length: 1
|
234 |
+
max_length: 128
|
235 |
+
|
236 |
+
- LengthRatioFilter:
|
237 |
+
unit: word
|
238 |
+
threshold: 3
|
239 |
+
|
240 |
+
- NonZeroNumeralsFilter: {}
|
241 |
+
- AlphabetRatioFilter: {}
|
242 |
+
- SimilarityFilter: {}
|
243 |
+
- RepetitionFilter:
|
244 |
+
threshold: 3
|
245 |
+
min_length: 5
|
246 |
+
max_length: 128
|
247 |
+
|