---
language:
- 'no'
- nb
- nn
- en
inference: false
tags:
- Norwegian
- English
- translation
license: cc-by-4.0
pipeline_tag: translation
---

# NorT5 base finetuned for English ↔ Norwegian (Bokmål or Nynorsk, all 6 directions) translation

<img src="https://huggingface.co/ltg/norbert3-base/resolve/main/norbert.png" width=12.5%>


## Example usage

This model is specifically finetuned for translating documents in any direction between Norwegian Bokmål, Norwegian Nynorsk and English.
Unlike traditional NMT models, it is trained on paragraph-to-paragraph translation – the translation quality is thus better if you feed it whole paragraphs instead of segmented sentences.

A simple example of how to use this model can be found in the `translate.py` file:

```python
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers.generation import LogitsProcessor


class RepetitionPenaltyLogitsProcessor(LogitsProcessor):
    """Penalizes repeated tokens, weighting the penalty by each token's unigram prior."""

    def __init__(self, penalty: float, model):
        # Use the bias of the final classifier layer as an estimate of the unigram
        # log-probabilities: frequent tokens are penalized less than rare ones.
        last_bias = model.classifier.nonlinearity[-1].bias.data
        last_bias = torch.nn.functional.log_softmax(last_bias, dim=-1)
        self.penalty = penalty * (last_bias - last_bias.max())

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        # Lower the scores of all previously generated tokens by their token-dependent penalty.
        penalized_score = torch.gather(scores + self.penalty.unsqueeze(0).to(input_ids.device), 1, input_ids).to(scores.dtype)
        scores.scatter_(1, input_ids, penalized_score)
        return scores


class Translator:
    def __init__(self, model_path="ltg/nort5-base-en-no-translation", device="cpu"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.cls_index = self.tokenizer.convert_tokens_to_ids("[CLS]")
        self.sep_index = self.tokenizer.convert_tokens_to_ids("[SEP]")
        self.eos_index = self.tokenizer.convert_tokens_to_ids("[EOS]")
        self.pad_index = self.tokenizer.convert_tokens_to_ids("[PAD]")
        self.eng_index = self.tokenizer.convert_tokens_to_ids(">>eng<<")
        self.nob_index = self.tokenizer.convert_tokens_to_ids(">>nob<<")
        self.nno_index = self.tokenizer.convert_tokens_to_ids(">>nno<<")

        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_path, trust_remote_code=True)

        self.device = device
        print(f"SYSTEM: Running on {self.device}", flush=True)

        self.model = self.model.to(device)
        self.model.eval()

        print("Successfully loaded the model to the memory")

        self.LANGUAGE_IDS = {
            "en": self.eng_index,
            "nb": self.nob_index,
            "nn": self.nno_index
        }

    def __call__(self, source, source_language, target_language):
        # The model is trained on whole paragraphs; each newline-separated
        # paragraph is translated independently.
        source = [s.strip() for s in source.split('\n')]
        source_subwords = self.tokenizer(source).input_ids
        # Prepend [CLS] and the target/source language tags, append [SEP].
        source_subwords = [[self.cls_index, self.LANGUAGE_IDS[target_language], self.LANGUAGE_IDS[source_language]] + s + [self.sep_index] for s in source_subwords]
        source_subwords = [torch.tensor(s) for s in source_subwords]
        source_subwords = torch.nn.utils.rnn.pad_sequence(source_subwords, batch_first=True, padding_value=self.pad_index)
        source_subwords = source_subwords[:, :512].to(self.device)

        def generate(model, **kwargs):
            with torch.inference_mode():
                with torch.autocast(enabled=self.device != "cpu", device_type="cuda", dtype=torch.bfloat16):
                    return model.generate(**kwargs)

        generate_kwargs = dict(
            input_ids=source_subwords,
            attention_mask=(source_subwords != self.pad_index).long(),
            max_new_tokens=512 - 1,
            num_beams=8,
            length_penalty=1.6,
            early_stopping=True,
            do_sample=False,
            use_cache=True,
            logits_processor=[RepetitionPenaltyLogitsProcessor(0.5, self.model), transformers.LogitNormalization()]
        )
        output = generate(self.model, **generate_kwargs).tolist()
        paragraphs = [self.tokenizer.decode(c, skip_special_tokens=True).strip() for c in output]
        translation = '\n'.join(paragraphs)

        return translation


if __name__ == "__main__":
    translator = Translator()

    en_text = "How are you feeling right now? Better?"
    no_text = translator(en_text, "en", "nb")

    print(en_text)
    print(no_text)
```
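
Because `__call__` splits its input on newline characters and translates each paragraph independently, multi-paragraph documents can be passed in directly. Below is a minimal sketch of the other translation directions, reusing the `Translator` class from the example above; the sample sentences are illustrative inputs, not model outputs.

```python
# Minimal sketch, assuming the Translator class from the example above is in scope.
translator = Translator(device="cuda" if torch.cuda.is_available() else "cpu")

# Paragraphs are separated by newlines and translated independently,
# so a whole document can be passed in one call.
document_en = "The meeting has been moved to Thursday.\nPlease bring the updated report."
print(translator(document_en, "en", "nn"))  # English -> Nynorsk

# All six directions use the same call; the language codes are "en", "nb" and "nn".
text_nb = "Hvordan har du det i dag?"  # illustrative Bokmål input
print(translator(text_nb, "nb", "en"))  # Bokmål -> English
print(translator(text_nb, "nb", "nn"))  # Bokmål -> Nynorsk
```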


## The NorT5 and NorBERT family

This is the official release of a new generation of NorT5 language models described in the paper [**NorBench — A Benchmark for Norwegian Language Models**](https://arxiv.org/abs/2305.03880). Please read the paper to learn more details about the model.


## Other sizes:
- [NorT5 xs (32M)](https://huggingface.co/ltg/nort5-xs)
- [NorT5 small (88M)](https://huggingface.co/ltg/nort5-small)
- [NorT5 base (228M)](https://huggingface.co/ltg/nort5-base)
- [NorT5 large (808M)](https://huggingface.co/ltg/nort5-large)


## Encoder-only NorBERT siblings:
- [NorBERT 3 xs (15M)](https://huggingface.co/ltg/norbert3-xs)
- [NorBERT 3 small (40M)](https://huggingface.co/ltg/norbert3-small)
- [NorBERT 3 base (123M)](https://huggingface.co/ltg/norbert3-base)
- [NorBERT 3 large (323M)](https://huggingface.co/ltg/norbert3-large)


## Cite us

```bibtex
@inproceedings{samuel-etal-2023-norbench,
    title = "{N}or{B}ench {--} A Benchmark for {N}orwegian Language Models",
    author = "Samuel, David  and
      Kutuzov, Andrey  and
      Touileb, Samia  and
      Velldal, Erik  and
      {\O}vrelid, Lilja  and
      R{\o}nningstad, Egil  and
      Sigdel, Elina  and
      Palatkina, Anna",
    booktitle = "Proceedings of the 24th Nordic Conference on Computational Linguistics (NoDaLiDa)",
    month = may,
    year = "2023",
    address = "T{\'o}rshavn, Faroe Islands",
    publisher = "University of Tartu Library",
    url = "https://aclanthology.org/2023.nodalida-1.61",
    pages = "618--633",
    abstract = "We present NorBench: a streamlined suite of NLP tasks and probes for evaluating Norwegian language models (LMs) on standardized data splits and evaluation metrics. We also introduce a range of new Norwegian language models (both encoder and encoder-decoder based). Finally, we compare and analyze their performance, along with other existing LMs, across the different benchmark tests of NorBench.",
}
```