cgr71ii committed on
Commit
0f358b0
1 Parent(s): c28be7d

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +47 -0
README.md CHANGED
@@ -1,3 +1,50 @@
1
  ---
2
  license: cc-by-sa-4.0
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  license: cc-by-sa-4.0
3
  ---
4
+ # Usage
5
+
6
+ ```python
7
+ import re
8
+ import urllib.parse
9
+
10
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
11
+ import nltk.tokenize
12
+ import torch
13
+
14
+ preprocess_tokenizer_regex = r'[^\W_0-9]+|[^\w\s]+|_+|\s+|[0-9]+' # Similar to wordpunct_tokenize
15
+ preprocess_tokenizer = nltk.tokenize.RegexpTokenizer(preprocess_tokenizer_regex).tokenize
16
+
17
+ def preprocess_url(url):
18
+ protocol_idx = url.find("://")
19
+ protocol_idx = (protocol_idx + 3) if protocol_idx != -1 else 0
20
+ url = url.rstrip('/')[protocol_idx:]
21
+ url = urllib.parse.unquote(url, errors="backslashreplace")
22
+
23
+ # Collapse runs of whitespace into a single space and trim both ends
24
+ url = re.sub(r'\s+', ' ', url)
25
+ url = re.sub(r'^\s+|\s+$', '', url)
26
+
27
+ # Tokenize
28
+ url = ' '.join(preprocess_tokenizer(url))
29
+
30
+ return url
31
+
32
+ tokenizer = AutoTokenizer.from_pretrained("Transducens/xlm-roberta-base-url2lang")
33
+ model = AutoModelForSequenceClassification.from_pretrained("Transducens/xlm-roberta-base-url2lang")
34
+
35
+ # prepare input
36
+ url = preprocess_url("https://es.wikipedia.org/wiki/Halo_3#Matchmaking")
37
+ encoded_input = tokenizer(url, add_special_tokens=True, truncation=True, padding="longest",
38
+ return_attention_mask=True, return_tensors="pt", max_length=256)
39
+
40
+ # forward pass
41
+ output = model(encoded_input["input_ids"], encoded_input["attention_mask"])
42
+
43
+ # obtain lang
44
+ probabilities = torch.softmax(output["logits"], dim=1).cpu().squeeze(0)
45
+ lang_idx = torch.argmax(probabilities, dim=0).item()
46
+ probability = probabilities[lang_idx].item()
47
+ lang = model.config.id2lang[str(lang_idx)]
48
+
49
+ print(f"Language (probability): {lang} ({probability})")
50
+ ```