---
license: cc-by-nc-4.0
language_bcp47: # TODO: convert them to traditional formats
- ace_Arab
- ace_Latn
- acm_Arab
- acq_Arab
- aeb_Arab
- afr_Latn
- ajp_Arab
- aka_Latn
- amh_Ethi
- apc_Arab
- arb_Arab
- ars_Arab
- ary_Arab
- arz_Arab
- asm_Beng
- ast_Latn
- awa_Deva
- ayr_Latn
- azb_Arab
- azj_Latn
- bak_Cyrl
- bam_Latn
- ban_Latn
- bel_Cyrl
- bem_Latn
- ben_Beng
- bho_Deva
- bjn_Arab
- bjn_Latn
- bod_Tibt
- bos_Latn
- bug_Latn
- bul_Cyrl
- cat_Latn
- ceb_Latn
- ces_Latn
- cjk_Latn
- ckb_Arab
- crh_Latn
- cym_Latn
- dan_Latn
- deu_Latn
- dik_Latn
- dyu_Latn
- dzo_Tibt
- ell_Grek
- eng_Latn
- epo_Latn
- est_Latn
- eus_Latn
- ewe_Latn
- fao_Latn
- pes_Arab
- fij_Latn
- fin_Latn
- fon_Latn
- fra_Latn
- fur_Latn
- fuv_Latn
- gla_Latn
- gle_Latn
- glg_Latn
- grn_Latn
- guj_Gujr
- hat_Latn
- hau_Latn
- heb_Hebr
- hin_Deva
- hne_Deva
- hrv_Latn
- hun_Latn
- hye_Armn
- ibo_Latn
- ilo_Latn
- ind_Latn
- isl_Latn
- ita_Latn
- jav_Latn
- jpn_Jpan
- kab_Latn
- kac_Latn
- kam_Latn
- kan_Knda
- kas_Arab
- kas_Deva
- kat_Geor
- knc_Arab
- knc_Latn
- kaz_Cyrl
- kbp_Latn
- kea_Latn
- khm_Khmr
- kik_Latn
- kin_Latn
- kir_Cyrl
- kmb_Latn
- kon_Latn
- kor_Hang
- kmr_Latn
- lao_Laoo
- lvs_Latn
- lij_Latn
- lim_Latn
- lin_Latn
- lit_Latn
- lmo_Latn
- ltg_Latn
- ltz_Latn
- lua_Latn
- lug_Latn
- luo_Latn
- lus_Latn
- mag_Deva
- mai_Deva
- mal_Mlym
- mar_Deva
- min_Latn
- mkd_Cyrl
- plt_Latn
- mlt_Latn
- mni_Beng
- khk_Cyrl
- mos_Latn
- mri_Latn
- zsm_Latn
- mya_Mymr
- nld_Latn
- nno_Latn
- nob_Latn
- npi_Deva
- nso_Latn
- nus_Latn
- nya_Latn
- oci_Latn
- gaz_Latn
- ory_Orya
- pag_Latn
- pan_Guru
- pap_Latn
- pol_Latn
- por_Latn
- prs_Arab
- pbt_Arab
- quy_Latn
- ron_Latn
- run_Latn
- rus_Cyrl
- sag_Latn
- san_Deva
- sat_Beng
- scn_Latn
- shn_Mymr
- sin_Sinh
- slk_Latn
- slv_Latn
- smo_Latn
- sna_Latn
- snd_Arab
- som_Latn
- sot_Latn
- spa_Latn
- als_Latn
- srd_Latn
- srp_Cyrl
- ssw_Latn
- sun_Latn
- swe_Latn
- swh_Latn
- szl_Latn
- tam_Taml
- tat_Cyrl
- tel_Telu
- tgk_Cyrl
- tgl_Latn
- tha_Thai
- tir_Ethi
- taq_Latn
- taq_Tfng
- tpi_Latn
- tsn_Latn
- tso_Latn
- tuk_Latn
- tum_Latn
- tur_Latn
- twi_Latn
- tzm_Tfng
- uig_Arab
- ukr_Cyrl
- umb_Latn
- urd_Arab
- uzn_Latn
- vec_Latn
- vie_Latn
- war_Latn
- wol_Latn
- xho_Latn
- ydd_Hebr
- yor_Latn
- yue_Hant
- zho_Hans
- zho_Hant
- zul_Latn
---
This is a port of the multilingual text encoder from https://huggingface.co/facebook/SONAR, converted from the `fairseq2` format to `transformers`.

For more advanced usage examples, please take a look at https://github.com/facebookresearch/SONAR.

How to use:
```Python
!pip install transformers sentencepiece -q

import torch
from transformers import AutoTokenizer
from transformers.models.m2m_100.modeling_m2m_100 import M2M100Encoder

model_name = "cointegrated/SONAR_200_text_encoder"
encoder = M2M100Encoder.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

def encode_mean_pool(texts, tokenizer, encoder, lang='eng_Latn', norm=False):
    """Mean-pool the encoder outputs over non-padding tokens."""
    tokenizer.src_lang = lang
    with torch.inference_mode():
        batch = tokenizer(texts, return_tensors='pt', padding=True)
        seq_embs = encoder(**batch).last_hidden_state
        mask = batch.attention_mask
        # average only over real tokens, excluding padding
        mean_emb = (seq_embs * mask.unsqueeze(-1)).sum(1) / mask.unsqueeze(-1).sum(1)
        if norm:
            # optionally L2-normalize, so dot products become cosine similarities
            mean_emb = torch.nn.functional.normalize(mean_emb)
    return mean_emb

sentences = ['My name is SONAR.', 'I can embed the sentences into vectorial space.']
embs = encode_mean_pool(sentences, tokenizer, encoder, lang="eng_Latn")
print(embs.shape)
# torch.Size([2, 1024])
print(embs)
# tensor([[-0.0053,  0.0020, -0.0006,  ...,  0.0094, -0.0009,  0.0070],
#         [-0.0003, -0.0071,  0.0076,  ...,  0.0055,  0.0022, -0.0083]])
```
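
Because SONAR embeddings share one multilingual space, translations of the same sentence should end up close to each other. The snippet below is a minimal sketch of how one might check this with cosine similarity; it reuses the `encode_mean_pool` helper defined above, and the example sentences and the `rus_Cyrl` language code are purely illustrative.

```Python
import torch

# Illustrative sketch: embed the same sentence in two languages and compare.
en_emb = encode_mean_pool(['My name is SONAR.'], tokenizer, encoder, lang='eng_Latn')
ru_emb = encode_mean_pool(['Меня зовут SONAR.'], tokenizer, encoder, lang='rus_Cyrl')

# Cosine similarity between the two embeddings; it should be high for translations.
print(torch.nn.functional.cosine_similarity(en_emb, ru_emb))
```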