"""Wrapper classes for supported summarization datasets.

Huggingface-hosted datasets are loaded through the hub; datasets that are
not on the hub use the custom builder scripts referenced below.
"""
from os import path | |
from tqdm import tqdm | |
from typing import List, Generator, Optional, Union | |
from datasets import Dataset | |
from dataset.st_dataset import SummInstance, SummDataset | |
# Set directory to load non_huggingface dataset scripts
# Absolute directory containing this file, with symlinks resolved.
FILE_DIRECTORY_PATH = path.dirname(path.realpath(__file__))
# Folder with custom builder scripts for datasets not hosted on the
# Huggingface hub; the non-huggingface *Dataset classes below join their
# lowercased dataset_name onto this path to find their builder script.
BASE_NONHUGGINGFACE_DATASETS_PATH = path.join(
    FILE_DIRECTORY_PATH, "non_huggingface_datasets_builders"
)
# Huggingface Datasets | |
class CnndmDataset(SummDataset):
    """
    The CNN/DM dataset
    """

    dataset_name = "CNN/DailyMail"

    is_query_based = False
    is_dialogue_based = False
    is_multi_document = False

    huggingface_dataset = True
    huggingface_page = "https://huggingface.co/datasets/cnn_dailymail"

    def __init__(self):
        # Load version 3.0.0 of cnn_dailymail from the Huggingface hub.
        super().__init__(
            dataset_args=(
                "cnn_dailymail",
                "3.0.0",
            )
        )

    def _process_data(self, data: Dataset) -> Generator[SummInstance, None, None]:
        """
        Overrides the SummDataset '_process_data()' method.
        Wraps each (article, highlights) pair of the given split in a
        SummInstance and yields it.

        :param data: a train/validation/test dataset split
        :rtype: a generator yielding SummInstance objects
        """
        for record in tqdm(data):
            yield SummInstance(
                source=record["article"], summary=record["highlights"]
            )
class MultinewsDataset(SummDataset):
    """
    The Multi News dataset
    """

    dataset_name = "Multinews"

    is_query_based = False
    is_dialogue_based = False
    is_multi_document = True

    huggingface_dataset = True
    huggingface_page = "https://huggingface.co/datasets/multi_news"

    def __init__(self):
        super().__init__(dataset_args=("multi_news",))

    def _process_data(self, data: Dataset) -> Generator[SummInstance, None, None]:
        """
        Overrides the SummDataset '_process_data()' method.
        Splits each record's concatenated documents into a list of
        individual documents and yields a SummInstance per record.

        :param data: a train/validation/test dataset split
        :rtype: a generator yielding SummInstance objects
        """
        for record in tqdm(data):
            # Documents are delimited by '|||||'; every document ends with
            # the delimiter, so the final split piece is an empty string
            # that must be filtered out.
            source_docs: list = [
                doc for doc in record["document"].split("|||||") if doc
            ]
            yield SummInstance(source=source_docs, summary=record["summary"])
class SamsumDataset(SummDataset):
    """
    The SAMsum Dataset
    """

    dataset_name = "Samsum"

    is_query_based = False
    is_dialogue_based = True
    is_multi_document = False

    huggingface_dataset = True
    huggingface_page = "https://huggingface.co/datasets/samsum"

    def __init__(self):
        super().__init__(dataset_args=("samsum",))

    def _process_data(self, data: Dataset) -> Generator[SummInstance, None, None]:
        """
        Overrides the SummDataset '_process_data()' method.
        Turns each dialogue string into a list of utterances and yields a
        SummInstance per record.

        :param data: a train/validation/test dataset split
        :rtype: a generator yielding SummInstance objects
        """
        for record in tqdm(data):
            # Utterances are separated by CRLF, producing a list such as
            # ["speaker1 : utter..", "speaker2 : utter..."].
            turns: List = record["dialogue"].split("\r\n")
            yield SummInstance(source=turns, summary=record["summary"])
class XsumDataset(SummDataset):
    """
    The Xsum Dataset
    """

    dataset_name = "Xsum"

    huggingface_dataset = True
    huggingface_page = "https://huggingface.co/datasets/xsum"

    is_query_based = False
    is_dialogue_based = False
    is_multi_document = False

    def __init__(self):
        super().__init__(dataset_args=("xsum",))

    def _process_data(self, data: Dataset) -> Generator[SummInstance, None, None]:
        """
        Overrides the SummDataset '_process_data()' method
        This method processes the data contained in the dataset
        and puts each data instance into a SummInstance object

        :param dataset: a train/validation/test dataset
        :rtype: a generator yielding SummInstance objects
        """
        for instance in tqdm(data):
            # 'document' is a single article string (it is not split here),
            # so the annotation is str, not List.
            document: str = instance["document"]
            summary: str = instance["summary"]
            summ_instance = SummInstance(source=document, summary=summary)

            yield summ_instance
class PubmedqaDataset(SummDataset):
    """
    The Pubmed QA dataset
    """

    dataset_name = "Pubmedqa"

    is_query_based = True
    is_dialogue_based = False
    is_multi_document = False

    huggingface_dataset = True
    huggingface_page = "https://huggingface.co/datasets/pubmed_qa"

    def __init__(self, seed=None):
        # Load the 'pqa_artificial' configuration of pubmed_qa.
        super().__init__(
            dataset_args=(
                "pubmed_qa",
                "pqa_artificial",
            )
        )

    def _process_data(self, data: Dataset) -> Generator[SummInstance, None, None]:
        """
        Overrides the SummDataset '_process_data()' method.
        Joins each record's context passages into one source string and
        yields a query-based SummInstance (question as query, long answer
        as summary).

        :param data: a train/validation/test dataset split
        :rtype: a generator yielding SummInstance objects
        """
        for record in tqdm(data):
            yield SummInstance(
                source=" ".join(record["context"]["contexts"]),
                summary=record["long_answer"],
                query=record["question"],
            )
class MlsumDataset(SummDataset):
    """
    The MLsum Dataset - A multi-lingual dataset featuring 5 languages
    Includes 1.5 million news articles and their corresponding summaries

    "de" - German
    "es" - Spanish
    "fr" - French
    "ru" - Russian
    "tu" - Turkish
    """

    dataset_name = "MlSum"

    is_query_based = False
    is_dialogue_based = False
    is_multi_document = False

    huggingface_dataset = True
    huggingface_page = "https://huggingface.co/datasets/mlsum"
    supported_languages = ["de", "es", "fr", "ru", "tu"]

    mlsum_instantiation_guide = """The languages supported for the Mlsum Dataset are:
                de - German
                es - Spanish
                fr - French
                ru - Russian
                tu - Turkish

                Examples to instantiate the dataset:
                1. Dataset with only one language
                   dataset = MlsumDataset({language_token})
                   dataset = MlsumDataset("es")
                   dataset = MlsumDataset("tu")...

                2. Dataset with a multiple languages
                   dataset = MlsumDataset({list of language_token})
                   dataset = MlsumDataset(["es","de"])
                   dataset = MlsumDataset(["es","de", "tu"])...

                3. Dataset with all supported languages (default)
                   dataset = MlsumDataset(all)
                   dataset = MlsumDataset()
                """

    def __init__(self, languages: Optional[Union[str, List[str]]] = "all"):
        super().__init__(dataset_args=(languages,))

    def _load_dataset_safe(self, languages: Optional[Union[str, List[str]]]):
        """
        Overrides the parent class method
        Method loads multiple datasets of different languages provided in :param languages:
        It then concatenates these datasets into one combined dataset

        :param languages: Optional, either a string or list of strings specifying the languages
            to load
        :rtype: datasetDict containing the combined dataset
        :raises ValueError: if any requested language code is not supported
        """
        print(MlsumDataset.mlsum_instantiation_guide)

        # Resolve the requested languages to a concrete list of subsets.
        # NOTE: is_supported() raises ValueError on an unknown language code;
        # it is called directly (not via `assert`) so the validation is not
        # stripped when Python runs with the -O flag.
        if languages == "all":
            selected_languages = MlsumDataset.supported_languages
        elif isinstance(languages, list):
            for language in languages:
                self.is_supported(language)
            selected_languages = languages
        else:
            self.is_supported(languages)
            selected_languages = [languages]

        # Concatenate the selected languages into one dataset
        language_datasets = []
        for language in selected_languages:
            dataset = super()._load_dataset_safe(
                "mlsum",
                language,
            )
            language_datasets.append(dataset)

        mlsum_dataset = self._concatenate_dataset_dicts(language_datasets)

        return mlsum_dataset

    def _process_data(self, data: Dataset) -> Generator[SummInstance, None, None]:
        """
        Overrides the SummDataset '_process_data()' method
        This method processes the data contained in the dataset
        and puts each data instance into a SummInstance object

        :param dataset: a train/validation/test dataset
        :rtype: a generator yielding SummInstance objects
        """
        for instance in tqdm(data):
            # 'text' holds a single article string, so the annotation is
            # str rather than List.
            article: str = instance["text"]
            summary: str = instance["summary"]
            summ_instance = SummInstance(source=article, summary=summary)

            yield summ_instance

    def is_supported(self, language: str):
        """
        Checks whether the requested language is supported

        :param language: string containing the requested language
        :rtype bool:
        :raises ValueError: if the language code is not supported
        """
        if language not in MlsumDataset.supported_languages:
            print(MlsumDataset.mlsum_instantiation_guide)
            raise ValueError(
                f"The language(s): '{language}' entered is not supported. See above message for usage info"
            )
        else:
            return True
# Non-huggingface datasets | |
class ScisummnetDataset(SummDataset):
    """
    The SciSummNet dataset. As a dataset not included by huggingface, we need to do manually download, set basic
    information for the dataset
    """

    dataset_name = "ScisummNet"
    version = "1.1.0"
    description = (
        "A summary of scientific papers should ideally incorporate the impact of the papers on the "
        "research community reflected by citations. To facilitate research in citation-aware scientific "
        "paper summarization (Scisumm), the CL-Scisumm shared task has been organized since 2014 for "
        "papers in the computational linguistics and NLP domain."
    )

    is_dialogue_based = False
    is_multi_document = False
    is_query_based = False

    huggingface_dataset = False
    # Custom builder script used in place of a Huggingface hub dataset.
    builder_script_path = path.join(
        BASE_NONHUGGINGFACE_DATASETS_PATH, dataset_name.lower() + ".py"
    )

    def __init__(self, seed=None):
        super().__init__()

    def _process_data(self, data: Dataset) -> Generator[SummInstance, None, None]:
        """
        Overrides the SummDataset '_process_data()' method.
        Yields a SummInstance per record whose source combines the paper
        XML with its annotated citing sentences.

        :param data: a train/validation/test dataset split
        :rtype: a generator yielding SummInstance objects
        """
        for record in tqdm(data):
            yield SummInstance(
                source=[
                    record["document_xml"],
                    record["citing_sentences_annotated.json"],
                ],
                summary=record["summary"],
            )
class SummscreenDataset(SummDataset):
    """
    The SummScreen dataset. As a dataset not included by huggingface, we need to do manually download, set basic
    information for the dataset
    """

    dataset_name = "Summscreen"
    version = "1.1.0"

    is_dialogue_based = True
    is_multi_document = False
    is_query_based = False

    huggingface_dataset = False
    # Custom builder script used in place of a Huggingface hub dataset.
    builder_script_path = path.join(
        BASE_NONHUGGINGFACE_DATASETS_PATH, dataset_name.lower() + ".py"
    )

    def __init__(self, seed=None):
        super().__init__()

    def _process_data(self, data: Dataset) -> Generator[SummInstance, None, None]:
        """
        Overrides the SummDataset '_process_data()' method.
        Yields a SummInstance per record, pairing the episode transcript
        with its recap as the reference summary.

        :param data: a train/validation/test dataset split
        :rtype: a generator yielding SummInstance objects
        """
        for record in tqdm(data):
            # NOTE(review): 'transcript' is passed through unmodified here;
            # presumably the builder script already yields it as a list of
            # dialogue strings — confirm against the builder.
            yield SummInstance(
                source=record["transcript"], summary=record["recap"]
            )
class QMsumDataset(SummDataset):
    """
    QMSum Dataset
    """

    dataset_name = "QMsum"
    description = """
    QMSum is a new human-annotated benchmark for query-based multi-domain meeting summarization task,
    which consists of 1,808 query-summary pairs over 232 meetings in multiple domains.
    """

    is_dialogue_based = True
    is_multi_document = False
    is_query_based = True

    huggingface_dataset = False
    builder_script_path = path.join(
        BASE_NONHUGGINGFACE_DATASETS_PATH, dataset_name.lower() + ".py"
    )

    def __init__(self):
        super().__init__()

    def _process_data(self, data: Dataset) -> Generator[SummInstance, None, None]:
        """
        Overrides the SummDataset '_process_data()' method
        This method processes the data contained in the dataset
        and puts each data instance into a SummInstance object

        :param dataset: a train/validation/test dataset
        :rtype: a generator yielding SummInstance objects
        """
        for instance in tqdm(data):
            # The meeting transcript is identical for every query over the
            # same instance, so build the utterance list once per instance
            # instead of once per query (it was previously rebuilt inside
            # the query loop).
            meeting: List = [
                utterance["speaker"] + " : " + utterance["content"]
                for utterance in instance["meeting_transcripts"]
            ]
            for query_set in (
                instance["general_query_list"] + instance["specific_query_list"]
            ):
                query: str = query_set["query"]
                summary: str = query_set["answer"]
                summ_instance = SummInstance(
                    source=meeting, summary=summary, query=query
                )
                yield summ_instance
class ArxivDataset(SummDataset):
    """
    The Arxiv Dataset
    """

    dataset_name = "Arxiv_longsummarization"
    description = """
    A summarization dataset comprised of pairs of scientific papers.
    The dataset provides a challenging testbed for abstractive summarization.
    It contains papers and their abstracts.
    """

    is_dialogue_based = False
    is_multi_document = False
    is_query_based = False

    huggingface_dataset = False
    # Custom builder script used in place of a Huggingface hub dataset.
    builder_script_path = path.join(
        BASE_NONHUGGINGFACE_DATASETS_PATH, dataset_name.lower() + ".py"
    )

    def __init__(self):
        # Warn the user before the parent init triggers the very large
        # download and extraction.
        print(
            "*****************",
            "***Attention***",
            "This dataset is quite large (approx 5Gb and will need about 15 Gb for the extraction process",
            "Cancel/interrupt the download if size and time constraints will not be met",
            "*****************",
            sep="\n",
        )
        super().__init__()

    def _process_data(self, data: Dataset) -> Generator[SummInstance, None, None]:
        """
        Overrides the SummDataset '_process_data()' method.
        Yields a SummInstance per record; the abstract sentences are joined
        into a single summary string.

        :param data: a train/validation/test dataset split
        :rtype: a generator yielding SummInstance objects
        """
        for record in tqdm(data):
            yield SummInstance(
                source=record["article_text"],
                summary=" ".join(record["abstract_text"]),
            )