metadata

inference: false
language: pt
datasets:
  - lener_br
license: mit
pipeline_tag: token-classification

DeBERTinha XSmall for NER

Full Token Classification Example

from transformers import AutoModelForTokenClassification, AutoTokenizer, AutoConfig
import torch

# Load the fine-tuned checkpoint and its tokenizer from the Hugging Face Hub.
model_name = "sagui-nlp/debertinha-ptbr-xsmall-lenerbr"
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=13)
tokenizer = AutoTokenizer.from_pretrained(model_name)

input_text = "Acrescento que não há de se falar em violação do artigo 114, § 3º, da Constituição Federal, posto que referido dispositivo revela-se impertinente, tratando da possibilidade de ajuizamento de dissídio coletivo pelo Ministério Público do Trabalho nos casos de greve em atividade essencial."

inputs = tokenizer(input_text, max_length=512, truncation=True, return_tensors="pt")
tokens = inputs.tokens()

# Inference only: no_grad avoids building an autograd graph for the forward pass.
with torch.no_grad():
    outputs = model(**inputs).logits
predictions = torch.argmax(outputs, dim=2)

# The [1:-1] slices skip the first and last positions of the sequence
# (presumably the special tokens added by the tokenizer -- confirm for other
# tokenizers). tolist() yields plain Python ints for the id2label lookup.
word_pieces = tokens[1:-1]
label_ids = predictions[0, 1:-1].tolist()

# Merge sub-tokens back into words. A token starting with "▁" opens a new
# word; every other token continues the current one. Each word takes the
# label predicted for its first token.
entities = []
current_entity = []
current_label = None
for token, prediction in zip(word_pieces, label_ids):
    # Uncomment to inspect the raw per-token predictions:
    # print((token, model.config.id2label[prediction]))
    if current_entity and not token.startswith("▁"):
        current_entity.append(token)
        continue
    if current_entity:
        entities.append(("".join(current_entity), current_label))
    current_entity = [token]
    current_label = model.config.id2label[prediction]

# Flush the last word; the guard avoids emitting ("", None) on empty input.
if current_entity:
    entities.append(("".join(current_entity), current_label))

# Keep only the words tagged with something other than "O" and show them
# (print works both in a notebook and when run as a script).
named_entities = [(text, label) for text, label in entities if label != "O"]
print(named_entities)

Training notes

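Training used only the label of the first sub-token of each word; the remaining sub-tokens of a word were assigned the label -100 so that they are ignored in the loss function.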

# Read by tokenize_and_align_labels: when False, only the first sub-token of
# each word keeps the word's label and the other sub-tokens get -100.
label_all_tokens = False
# Prefix of the label column read from each batch (examples[f"{task}_tags"]).
task="ner"

def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and align the word-level tags to sub-tokens.

    Reads ``examples["tokens"]`` and ``examples[f"{task}_tags"]``, tokenizes
    the words with the module-level ``tokenizer`` and stores one label list
    per example under the ``"labels"`` key of the tokenizer output, which is
    then returned.

    Positions whose word id is ``None`` (special tokens) get ``-100`` so they
    are ignored in the loss function. The first sub-token of each word gets
    the word's tag; the remaining sub-tokens get the same tag when
    ``label_all_tokens`` is true and ``-100`` otherwise.
    """
    encoded = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True,
        max_length=512,
    )

    aligned = []
    for row, word_tags in enumerate(examples[f"{task}_tags"]):
        row_labels = []
        last_word = None
        for word in encoded.word_ids(batch_index=row):
            if word is None:
                # Special token: masked out of the loss.
                target = -100
            elif word != last_word or label_all_tokens:
                # First sub-token of a word, or every sub-token when
                # label_all_tokens is enabled.
                target = word_tags[word]
            else:
                # Continuation sub-token that must not contribute to the loss.
                target = -100
            row_labels.append(target)
            last_word = word
        aligned.append(row_labels)

    encoded["labels"] = aligned
    return encoded

# Apply the alignment to the dataset. batched=True is required because
# tokenize_and_align_labels indexes its input by batch position
# (word_ids(batch_index=i)), i.e. it expects a batch of examples.
# NOTE(review): `dataset` is not defined in this snippet -- presumably the
# lener_br dataset loaded beforehand; confirm.
dataset = dataset.map(tokenize_and_align_labels, batched=True)

Citation

@misc{campiotti2023debertinha,
      title={DeBERTinha: A Multistep Approach to Adapt DebertaV3 XSmall for Brazilian Portuguese Natural Language Processing Task}, 
      author={Israel Campiotti and Matheus Rodrigues and Yuri Albuquerque and Rafael Azevedo and Alyson Andrade},
      year={2023},
      eprint={2309.16844},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}