|
from datasets import Dataset, DatasetDict |
|
import pandas as pd |
|
from config import max_length, label2id |
|
from model import tokenizer |
|
import os |
|
import torch |
|
|
|
|
|
def convert_to_stsb_features(example_batch):
    """Tokenize a batch of examples for the STS-B-style task.

    Args:
        example_batch: a batched example dict (as supplied by
            ``datasets.Dataset.map(..., batched=True)``) with a
            ``'content'`` key holding a list of raw text strings.

    Returns:
        A features dict with ``input_ids``/``attention_mask`` (padded to
        ``max_length``) plus a ``labels`` list of the same batch size.
    """
    inputs = example_batch['content']
    # Calling the tokenizer directly is the supported API;
    # `batch_encode_plus` is deprecated in transformers.
    features = tokenizer(
        inputs, truncation=True, max_length=max_length, padding='max_length')
    # Every example gets label 0 — presumably this split is unlabeled /
    # used for inference only; TODO(review): confirm labels are unused downstream.
    features["labels"] = [0] * len(inputs)
    return features
|
|
|
|
|
|
|
|
|
def convert_to_features(dataset_dict, convert_func_dict, columns_dict=None):
    """Tokenize every phase of every task and set torch formatting.

    Args:
        dataset_dict: mapping ``task_name -> {phase: Dataset}`` (e.g.
            phases ``"train"``/``"validation"``).
        convert_func_dict: mapping ``task_name -> batched map function``
            used to featurize that task's examples.
        columns_dict: optional mapping ``task_name -> list of column
            names`` to expose as torch tensors. Defaults to the original
            hard-coded ``"document"`` mapping, so existing callers are
            unaffected; pass your own mapping to support other tasks.

    Returns:
        Mapping ``task_name -> {phase: mapped Dataset}`` with torch
        formatting applied to the requested columns.
    """
    if columns_dict is None:
        # Original behavior: only the "document" task is configured.
        columns_dict = {
            "document": ['input_ids', 'attention_mask', 'labels'],
        }
    features_dict = {}
    for task_name, dataset in dataset_dict.items():
        features_dict[task_name] = {}
        print(task_name)
        for phase, phase_dataset in dataset.items():
            # load_from_cache_file=False forces re-tokenization so stale
            # caches never leak into a new run.
            features_dict[task_name][phase] = phase_dataset.map(
                convert_func_dict[task_name],
                batched=True,
                load_from_cache_file=False,
            )
            print(task_name, phase, len(phase_dataset),
                  len(features_dict[task_name][phase]))
            # Expose only the model-facing columns as torch tensors.
            features_dict[task_name][phase].set_format(
                type="torch",
                columns=columns_dict[task_name],
            )
            print("=>", task_name, phase, len(phase_dataset),
                  len(features_dict[task_name][phase]))
    return features_dict
|
|
|
|