Edit model card

Usage

import re
import urllib.parse

from transformers import AutoTokenizer, AutoModelForSequenceClassification
import nltk.tokenize
import torch

# Pattern used to split a preprocessed URL into tokens. Its alternatives are,
# in order: a run of word characters other than '_' and the ASCII digits 0-9,
# a run of characters that are neither word characters nor whitespace, a run
# of underscores, a run of whitespace, and a run of ASCII digits.
preprocess_tokenizer_regex = r'[^\W_0-9]+|[^\w\s]+|_+|\s+|[0-9]+' # Similar to wordpunct_tokenize
# Bound `tokenize` method of an NLTK RegexpTokenizer built from the pattern
# above; preprocess_url() calls it on a string and joins the returned tokens.
preprocess_tokenizer = nltk.tokenize.RegexpTokenizer(preprocess_tokenizer_regex).tokenize

def preprocess_url(url):
    """Turn a raw URL into the space-separated token string given to the model.

    The URL loses its trailing slashes and everything up to and including the
    first ``://`` (if present), is percent-decoded, has its whitespace
    normalised, and is finally split with ``preprocess_tokenizer`` and
    re-joined with single spaces.

    Args:
        url: The URL to preprocess, as a string.

    Returns:
        The tokens of the cleaned URL joined by single spaces.
    """
    # The separator is searched in the untouched URL, before trailing slashes
    # are removed, so the offset refers to the original string.
    scheme_sep = url.find("://")
    start = 0 if scheme_sep == -1 else scheme_sep + 3

    without_scheme = url.rstrip('/')[start:]
    decoded = urllib.parse.unquote(without_scheme, errors="backslashreplace")

    # Collapse every whitespace run to one space, then trim both ends.
    collapsed = re.sub(r'\s+', ' ', decoded)
    trimmed = re.sub(r'^\s+|\s+$', '', collapsed)

    tokens = preprocess_tokenizer(trimmed)
    return ' '.join(tokens)

# Load the tokenizer and the sequence-classification model by their Hub identifier.
tokenizer = AutoTokenizer.from_pretrained("Transducens/xlm-roberta-base-url2lang")
model = AutoModelForSequenceClassification.from_pretrained("Transducens/xlm-roberta-base-url2lang")

# prepare input: clean and pre-tokenize the URL, then encode it as PyTorch tensors
url = preprocess_url("https://es.wikipedia.org/wiki/Halo_3#Matchmaking")
encoded_input = tokenizer(url, add_special_tokens=True, truncation=True, padding="longest",
                          return_attention_mask=True, return_tensors="pt", max_length=256)

# forward pass; this is inference only, so disable gradient tracking to avoid
# building an autograd graph that is never used
with torch.no_grad():
    output = model(input_ids=encoded_input["input_ids"],
                   attention_mask=encoded_input["attention_mask"])

# obtain lang: softmax over the class dimension, then take the most probable class
probabilities = torch.softmax(output["logits"], dim=1).cpu().squeeze(0)
lang_idx = torch.argmax(probabilities, dim=0).item()
probability = probabilities[lang_idx].item()
# id2lang is keyed by the class index as a string
lang = model.config.id2lang[str(lang_idx)]

print(f"Language (probability): {lang} ({probability})")
Downloads last month
16
Inference Examples
This model does not yet have enough activity to be deployed to the Inference API (serverless). Increase its social visibility and check back later, or deploy to Inference Endpoints (dedicated) instead.