## Model Details

This is a fine-tuned version of the multilingual RoBERTa (XLM-RoBERTa) model, trained on medieval charters. The model recognizes locations and persons in medieval texts, in both a flat and a nested manner (e.g., in "Radulfus de Francorvilla", the place name Francorvilla is nested inside the person name). The training dataset comprises 8k annotated texts in Medieval Latin, French, and Spanish, covering the 11th to the 15th centuries.

### How to Get Started with the Model

The model can be used directly through the `transformers` pipeline:

```python
from transformers import pipeline

pipe = pipeline("token-classification", model="magistermilitum/roberta-multilingual-medieval-ner")

# Replace with your own sentences
list_of_sentences = ["Ego Radulfus de Francorvilla miles, notum facio tam presentibus quam futuris"]

results = list(map(pipe, list_of_sentences))
results = [[[y["entity"], y["word"], y["start"], y["end"]] for y in x] for x in results]
print(results)
```
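
After the list comprehension, `results` holds one `[entity, word, start, end]` list per sub-token (the pipeline omits "O" predictions by default). Purely as an illustration of the shape (assumed offsets and labels, not actual model output), the start of the example sentence could come back as:

```python
# Illustrative only: SentencePiece sub-tokens, with "▁" marking word starts;
# "L-PERS" is the nested tag for a location inside a person name.
[[['B-PERS', '▁Radulfus', 3, 12],
  ['I-PERS', '▁de', 12, 15],
  ['L-PERS', '▁Francor', 15, 23],
  ['L-PERS', 'villa', 23, 28]]]
```

The `TextProcessor` snippet below merges these sub-tokens back into whole words before writing CoNLL-style columns.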

### Model Description

The following snippet converts the model inferences to CoNLL-style columns using the BIO tagging format:

```python
import nltk

class TextProcessor:
    def __init__(self, filename):
        self.filename = filename
        self.sent_detector = nltk.data.load("tokenizers/punkt/english.pickle")  # sentence tokenizer
        self.sentences = []
        self.new_sentences = []
        self.results = []
        self.new_sentences_token_info = []
        self.new_sentences_bio = []
        self.BIO_TAGS = []
        self.stripped_BIO_TAGS = []

    def read_file(self):
        with open(self.filename, 'r') as f:
            text = f.read()
        self.sentences = self.sent_detector.tokenize(text.strip())

    def process_sentences(self):
        # Merge short sentences (fewer than 40 words) into the previous one,
        # since the encoder has a 256-token maximum length.
        for sentence in self.sentences:
            if len(sentence.split()) < 40 and self.new_sentences:
                self.new_sentences[-1] += " " + sentence
            else:
                self.new_sentences.append(sentence)

    def apply_model(self, pipe):
        self.results = list(map(pipe, self.new_sentences))
        self.results = [[[y["entity"], y["word"], y["start"], y["end"]] for y in x] for x in self.results]

    def tokenize_sentences(self):
        for n_s in self.new_sentences:
            tokens = n_s.split()  # basic whitespace tokenization
            token_info = []
            char_index = 0  # running character offset
            # Record the start and end offset of every token
            for token in tokens:
                start = char_index
                end = char_index + len(token)
                token_info.append((token, start, end))
                char_index += len(token) + 1  # add 1 for the whitespace
            self.new_sentences_token_info.append(token_info)

    def process_results(self):
        # Merge subwords back into words together with their BIO tags
        for result in self.results:
            merged_bio_result = []
            current_word = ""
            current_label = None
            current_start = None
            current_end = None
            for entity, subword, start, end in result:
                if subword.startswith("▁"):  # "▁" marks the start of a new word
                    subword = subword[1:]
                    merged_bio_result.append([current_word, current_label, current_start, current_end])
                    current_word, current_label, current_start, current_end = "", None, None, None
                if current_start is None:
                    current_word, current_label = subword, entity
                    current_start, current_end = start + 1, end
                else:
                    current_word += subword
                    current_end = end
            if current_word:
                merged_bio_result.append([current_word, current_label, current_start, current_end])
            self.new_sentences_bio.append(merged_bio_result[1:])  # drop the empty leading entry

    def match_tokens_with_entities(self):
        # Align the merged BIO tags with the whitespace tokens via start offsets
        for i, ss in enumerate(self.new_sentences_token_info):
            for word in ss:
                for ent in self.new_sentences_bio[i]:
                    if word[1] == ent[2]:
                        if ent[1] == "L-PERS":  # location nested inside a person name
                            self.BIO_TAGS.append([word[0], "I-PERS", "B-LOC"])
                        elif "LOC" in ent[1]:
                            self.BIO_TAGS.append([word[0], "O", ent[1]])
                        else:
                            self.BIO_TAGS.append([word[0], ent[1], "O"])
                        break
                else:
                    self.BIO_TAGS.append([word[0], "O", "O"])

    def separate_dots_and_comma(self):  # optional
        # Detach trailing punctuation into its own "O" row
        signs = [",", ";", ":", "."]
        for bio in self.BIO_TAGS:
            if any(bio[0][-1] == sign for sign in signs) and len(bio[0]) > 1:
                self.stripped_BIO_TAGS.append([bio[0][:-1], bio[1], bio[2]])
                self.stripped_BIO_TAGS.append([bio[0][-1], "O", "O"])
            else:
                self.stripped_BIO_TAGS.append(bio)

    def save_BIO(self):
        with open('output_BIO_a.txt', 'w', encoding='utf-8') as output_file:
            output_file.write("TOKEN\tPERS\tLOCS\n" + "\n".join(["\t".join(x) for x in self.stripped_BIO_TAGS]))

# Usage:
processor = TextProcessor('sentence.txt')
processor.read_file()
processor.process_sentences()
processor.apply_model(pipe)  # the pipeline defined above
processor.tokenize_sentences()
processor.process_results()
processor.match_tokens_with_entities()
processor.separate_dots_and_comma()
processor.save_BIO()
```
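
The Punkt sentence tokenizer loaded in `__init__` ships as a separate NLTK resource, so a one-time download is needed before the class can run:

```python
import nltk
nltk.download("punkt")  # one-time download of the Punkt sentence tokenizer
```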

- **Developed by:** Sergio Torres Aguilar
- **Model type:** XLM-RoBERTa for token classification (named entity recognition)
- **Language(s) (NLP):** Medieval Latin, Spanish, French
- **Finetuned from model:** XLM-RoBERTa-Large

### Direct Use

A sentence such as: "Ego Radulfus de Francorvilla miles, notum facio tam presentibus quam futuris quod, cum Guillelmo Bateste militi de Miliaco"

will be annotated in BIO format as:

```python
('Ego', 'O', 'O')
('Radulfus', 'B-PERS', 'O')
('de', 'I-PERS', 'O')
('Francorvilla', 'I-PERS', 'B-LOC')
('miles', 'O', 'O')
(',', 'O', 'O')
('notum', 'O', 'O')
('facio', 'O', 'O')
('tam', 'O', 'O')
('presentibus', 'O', 'O')
('quam', 'O', 'O')
('futuris', 'O', 'O')
('quod', 'O', 'O')
(',', 'O', 'O')
('cum', 'O', 'O')
('Guillelmo', 'B-PERS', 'O')
('Bateste', 'I-PERS', 'O')
('militi', 'O', 'O')
('de', 'O', 'O')
('Miliaco', 'O', 'B-LOC')
```
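
The two tag columns (PERS and LOCS, as written by `save_BIO`) can be consumed independently. As a minimal sketch, assuming the `output_BIO_a.txt` file produced above, this regroups consecutive B-/I- tags in the PERS column into person mentions:

```python
# Minimal sketch: rebuild person mentions from the PERS column of output_BIO_a.txt
persons, current = [], []
with open("output_BIO_a.txt", encoding="utf-8") as f:
    next(f)  # skip the "TOKEN\tPERS\tLOCS" header
    for line in f:
        token, pers, locs = line.rstrip("\n").split("\t")
        if pers == "B-PERS":       # a new person mention starts
            if current:
                persons.append(" ".join(current))
            current = [token]
        elif pers == "I-PERS" and current:
            current.append(token)  # continue the open mention
        else:
            if current:
                persons.append(" ".join(current))
            current = []
    if current:
        persons.append(" ".join(current))

print(persons)  # e.g. ['Radulfus de Francorvilla', 'Guillelmo Bateste']
```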

### Training Procedure

The model was fine-tuned for 5 epochs on XLM-RoBERTa-Large with a learning rate of 5e-5 and a batch size of 16.
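
The original training script is not included in this card; the following is only a minimal sketch of a comparable setup with the `transformers` Trainer, using the hyperparameters stated above. The label list and the dataset preparation are placeholders, not the actual training data:

```python
from transformers import (AutoModelForTokenClassification, AutoTokenizer,
                          Trainer, TrainingArguments)

# Hypothetical label set, inferred from the tags shown in this card
labels = ["O", "B-PERS", "I-PERS", "L-PERS", "B-LOC", "I-LOC"]

tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-large")
model = AutoModelForTokenClassification.from_pretrained(
    "xlm-roberta-large", num_labels=len(labels)
)

args = TrainingArguments(
    output_dir="roberta-multilingual-medieval-ner",
    num_train_epochs=5,              # as stated above
    learning_rate=5e-5,              # as stated above
    per_device_train_batch_size=16,  # as stated above
)

# Dataset preparation (subword tokenization + label alignment) is omitted;
# with a ready train_dataset you would run:
# trainer = Trainer(model=model, args=args, train_dataset=train_dataset)
# trainer.train()
```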

**BibTeX:**

```bibtex
@inproceedings{aguilar2022multilingual,
  title={Multilingual Named Entity Recognition for Medieval Charters Using Stacked Embeddings and Bert-based Models.},
  author={Aguilar, Sergio Torres},
  booktitle={Proceedings of the Second Workshop on Language Technologies for Historical and Ancient Languages},
  pages={119--128},
  year={2022}
}
```

## Model Card Contact

sergio.torres@uni.lu