from evaluate_model import compute_metrics
from datasets import load_from_disk, Dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification
# from transformers import DataCollatorForTokenClassification
from utils import tokenize_and_align_labels
from rich import print
import huggingface_hub
import torch
import json
import os
import pickle
from tqdm import tqdm

# _ = load_dotenv(find_dotenv())  # read local .env file
hf_token = os.environ['HF_TOKEN']
huggingface_hub.login(hf_token)
checkpoint = 'elshehawy/finer-ord-transformers'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

data_path = './data/merged_dataset/'
test = load_from_disk(data_path)['test']
test = Dataset.from_dict(test[:16])  # keep a small 16-example slice for quick evaluation

# ner_feature is used below to map integer ner_tags back to string labels via int2str
feature_path = './data/ner_feature.pickle'
with open(feature_path, 'rb') as f:
    ner_feature = pickle.load(f)

# data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
ner_model = AutoModelForTokenClassification.from_pretrained(checkpoint)
# tokenized_dataset.set_format('torch')


def collate_fn(data):
    # Keep each field as a plain list of per-example lists; tensors are built
    # later, inside the evaluation loop.
    input_ids = [element['input_ids'] for element in data]
    attention_mask = [element['attention_mask'] for element in data]
    token_type_ids = [element['token_type_ids'] for element in data]
    labels = [element['labels'] for element in data]
    return input_ids, token_type_ids, attention_mask, labels


ner_model = ner_model.eval()


def get_metrics_trf(data):
    data = Dataset.from_dict(data)
    tokenized_data = data.map(
        tokenize_and_align_labels,
        batched=True,
        batch_size=None,
        remove_columns=data.column_names[2:],
        fn_kwargs={'tokenizer': tokenizer}
    )
    loader = torch.utils.data.DataLoader(tokenized_data, batch_size=16, collate_fn=collate_fn)

    device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
    ner_model.to(device)  # move the model once, before the loop, not on every batch

    y_true, logits = [], []
    for input_ids, token_type_ids, attention_mask, labels in tqdm(loader):
        with torch.no_grad():
            logits.extend(
                ner_model(
                    input_ids=torch.tensor(input_ids).to(device),
                    token_type_ids=torch.tensor(token_type_ids).to(device),
                    attention_mask=torch.tensor(attention_mask).to(device)
                ).logits.cpu().numpy()
            )
        y_true.extend(labels)

    all_metrics = compute_metrics((logits, y_true))
    return all_metrics

# with open('./metrics/trf/metrics.json', 'w') as f:
#     json.dump(all_metrics, f)
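
# Note (assumption about evaluate_model's contract): compute_metrics is expected
# to receive (logits, labels), with logits of shape
# [num_examples, seq_len, num_labels] and integer label ids, mirroring the
# eval_preds tuple a transformers Trainer passes to its compute_metrics hook.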


def find_orgs_in_data(tokens, labels):
    orgs = []
    prev_tok_id = -2  # sentinel: no ORG token seen yet (avoids a false match at i == 1)
    for i, (token, label) in enumerate(zip(tokens, labels)):
        if label == 'B-ORG':
            orgs.append([token])
            prev_tok_id = i
        elif label == 'I-ORG' and (i - 1) == prev_tok_id:
            orgs[-1].append(token)
            prev_tok_id = i
    return [tokenizer.convert_tokens_to_string(org) for org in orgs]
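
# Worked example of find_orgs_in_data on hypothetical inputs: tokens
# ['Shares', 'of', 'Morgan', 'Stanley', 'rose'] with labels
# ['O', 'O', 'B-ORG', 'I-ORG', 'O'] yield ['Morgan Stanley']; an I-ORG that
# does not directly follow another ORG token is ignored rather than starting
# a new entity.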


def store_sample_data(data):
    data = Dataset.from_dict(data)
    test_data = []
    for sent in data:
        labels = [ner_feature.feature.int2str(l) for l in sent['ner_tags']]
        sent_orgs = find_orgs_in_data(sent['tokens'], labels)
        sent_text = tokenizer.convert_tokens_to_string(sent['tokens'])
        test_data.append({
            'id': sent['id'],
            'text': sent_text,
            'orgs': sent_orgs
        })
    return test_data

# with open('./data/sample_data.json', 'w') as f:
#     json.dump(test_data, f)
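
# Minimal entry point: a sketch, not part of the original Space, wiring the two
# helpers to the output paths from the commented-out dumps above. test[:] yields
# the column dict that Dataset.from_dict expects inside each helper.
if __name__ == '__main__':
    all_metrics = get_metrics_trf(test[:])
    with open('./metrics/trf/metrics.json', 'w') as f:
        json.dump(all_metrics, f)

    sample_data = store_sample_data(test[:])
    with open('./data/sample_data.json', 'w') as f:
        json.dump(sample_data, f)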