from os import path
from typing import List, Generator, Optional, Union

from tqdm import tqdm
from datasets import Dataset

from dataset.st_dataset import SummInstance, SummDataset

# Set directory to load non_huggingface dataset scripts
FILE_DIRECTORY_PATH = path.dirname(path.realpath(__file__))
BASE_NONHUGGINGFACE_DATASETS_PATH = path.join(
    FILE_DIRECTORY_PATH, "non_huggingface_datasets_builders"
)


# Huggingface Datasets


class CnndmDataset(SummDataset):
    """
    The CNN/DM dataset
    """

    dataset_name = "CNN/DailyMail"

    is_query_based = False
    is_dialogue_based = False
    is_multi_document = False

    huggingface_dataset = True
    huggingface_page = "https://huggingface.co/datasets/cnn_dailymail"

    def __init__(self):
        # Pin the "3.0.0" config of cnn_dailymail on the Huggingface hub
        super().__init__(
            dataset_args=(
                "cnn_dailymail",
                "3.0.0",
            )
        )

    def _process_data(self, data: Dataset) -> Generator[SummInstance, None, None]:
        """
        Overrides the SummDataset '_process_data()' method
        This method processes the data contained in the dataset
        and puts each data instance into a SummInstance object

        :param data: a train/validation/test dataset
        :rtype: a generator yielding SummInstance objects
        """
        for instance in tqdm(data):
            article: str = instance["article"]
            highlights: str = instance["highlights"]
            summ_instance = SummInstance(source=article, summary=highlights)

            yield summ_instance


class MultinewsDataset(SummDataset):
    """
    The Multi News dataset
    """

    dataset_name = "Multinews"

    is_query_based = False
    is_dialogue_based = False
    is_multi_document = True

    huggingface_dataset = True
    huggingface_page = "https://huggingface.co/datasets/multi_news"

    def __init__(self):
        super().__init__(dataset_args=("multi_news",))

    def _process_data(self, data: Dataset) -> Generator[SummInstance, None, None]:
        """
        Overrides the SummDataset '_process_data()' method
        This method processes the data contained in the dataset
        and puts each data instance into a SummInstance object

        :param data: a train/validation/test dataset
        :rtype: a generator yielding SummInstance objects
        """
        for instance in tqdm(data):
            document: list = [
                doc for doc in instance["document"].split("|||||") if doc
            ]  # removes the empty string generated
            # since each doc ends with the delimiting token '|||||'
            # the final doc creates an empty string
            summary: str = instance["summary"]
            summ_instance = SummInstance(source=document, summary=summary)

            yield summ_instance


class SamsumDataset(SummDataset):
    """
    The SAMsum Dataset
    """

    dataset_name = "Samsum"

    is_query_based = False
    is_dialogue_based = True
    is_multi_document = False

    huggingface_dataset = True
    huggingface_page = "https://huggingface.co/datasets/samsum"

    def __init__(self):
        super().__init__(dataset_args=("samsum",))

    def _process_data(self, data: Dataset) -> Generator[SummInstance, None, None]:
        """
        Overrides the SummDataset '_process_data()' method
        This method processes the data contained in the dataset
        and puts each data instance into a SummInstance object

        :param data: a train/validation/test dataset
        :rtype: a generator yielding SummInstance objects
        """
        for instance in tqdm(data):
            dialogue: List = instance["dialogue"].split(
                "\r\n"
            )  # split each dialogue into a list of strings such as
            # ["speaker1 : utter..", "speaker2 : utter..."]
            summary: str = instance["summary"]
            summ_instance = SummInstance(source=dialogue, summary=summary)

            yield summ_instance


class XsumDataset(SummDataset):
    """
    The Xsum Dataset
    """

    dataset_name = "Xsum"

    huggingface_dataset = True
    huggingface_page = "https://huggingface.co/datasets/xsum"

    is_query_based = False
    is_dialogue_based = False
    is_multi_document = False

    def __init__(self):
        super().__init__(dataset_args=("xsum",))

    def _process_data(self, data: Dataset) -> Generator[SummInstance, None, None]:
        """
        Overrides the SummDataset '_process_data()' method
        This method processes the data contained in the dataset
        and puts each data instance into a SummInstance object

        :param data: a train/validation/test dataset
        :rtype: a generator yielding SummInstance objects
        """
        for instance in tqdm(data):
            # NOTE: the xsum "document" field is a single string, not a list
            document: str = instance["document"]
            summary: str = instance["summary"]
            summ_instance = SummInstance(source=document, summary=summary)

            yield summ_instance


class PubmedqaDataset(SummDataset):
    """
    The Pubmed QA dataset
    """

    dataset_name = "Pubmedqa"

    is_query_based = True
    is_dialogue_based = False
    is_multi_document = False

    huggingface_dataset = True
    huggingface_page = "https://huggingface.co/datasets/pubmed_qa"

    def __init__(self, seed=None):
        # `seed` is accepted for interface compatibility but currently unused
        super().__init__(
            dataset_args=(
                "pubmed_qa",
                "pqa_artificial",
            )
        )

    def _process_data(self, data: Dataset) -> Generator[SummInstance, None, None]:
        """
        Overrides the SummDataset '_process_data()' method
        This method processes the data contained in the dataset
        and puts each data instance into a SummInstance object

        :param data: a train/validation/test dataset
        :rtype: a generator yielding SummInstance objects
        """
        for instance in tqdm(data):
            context: str = " ".join(instance["context"]["contexts"])
            answer: str = instance["long_answer"]
            query: str = instance["question"]
            summ_instance = SummInstance(source=context, summary=answer, query=query)

            yield summ_instance


class MlsumDataset(SummDataset):
    """
    The MLsum Dataset - A multi-lingual dataset featuring 5 languages
    Includes 1.5 million news articles and their corresponding summaries

    "de" - German
    "es" - Spanish
    "fr" - French
    "ru" - Russian
    "tu" - Turkish
    """

    dataset_name = "MlSum"

    is_query_based = False
    is_dialogue_based = False
    is_multi_document = False

    huggingface_dataset = True
    huggingface_page = "https://huggingface.co/datasets/mlsum"
    supported_languages = ["de", "es", "fr", "ru", "tu"]

    mlsum_instantiation_guide = """The languages supported for the Mlsum Dataset are:
                de - German
                es - Spanish
                fr - French
                ru - Russian
                tu - Turkish

                Examples to instantiate the dataset:
                1. Dataset with only one language
                   dataset = MlsumDataset({language_token})
                   dataset = MlsumDataset("es")
                   dataset = MlsumDataset("tu")...

                2. Dataset with a multiple languages
                   dataset = MlsumDataset({list of language_token})
                   dataset = MlsumDataset(["es","de"])
                   dataset = MlsumDataset(["es","de", "tu"])...

                3. Dataset with all supported languages (default)
                   dataset = MlsumDataset(all)
                   dataset = MlsumDataset()
                """

    def __init__(self, languages: Optional[Union[str, List[str]]] = "all"):
        super().__init__(dataset_args=(languages,))

    def _load_dataset_safe(self, languages: Optional[Union[str, List[str]]]):
        """
        Overrides the parent class method
        Method loads multiple datasets of different languages provided in
        :param languages:
        It then concatenates these datasets into one combined dataset

        :param languages: Optional, either a string or list of strings
            specifying the languages to load
        :rtype: datasetDict containing the combined dataset
        """
        print(MlsumDataset.mlsum_instantiation_guide)

        # Choose languages to download articles
        if languages == "all":
            selected_languages = MlsumDataset.supported_languages
        elif isinstance(languages, list):
            # is_supported() raises ValueError on an unsupported language;
            # called directly (not via `assert`) so validation survives `-O`
            for language in languages:
                self.is_supported(language)
            selected_languages = languages
        else:
            self.is_supported(languages)
            selected_languages = [languages]

        # Concatenate selected languages into one dataset
        language_datasets = []
        for language in selected_languages:
            dataset = super()._load_dataset_safe(
                "mlsum",
                language,
            )

            language_datasets.append(dataset)

        mlsum_dataset = self._concatenate_dataset_dicts(language_datasets)

        return mlsum_dataset

    def _process_data(self, data: Dataset) -> Generator[SummInstance, None, None]:
        """
        Overrides the SummDataset '_process_data()' method
        This method processes the data contained in the dataset
        and puts each data instance into a SummInstance object

        :param data: a train/validation/test dataset
        :rtype: a generator yielding SummInstance objects
        """
        for instance in tqdm(data):
            # NOTE: the mlsum "text" field is a single string per article
            article: str = instance["text"]
            summary: str = instance["summary"]
            summ_instance = SummInstance(source=article, summary=summary)

            yield summ_instance

    def is_supported(self, language: str):
        """
        Checks whether the requested language is supported

        :param language: string containing the requested language
        :raises ValueError: if the language is not in `supported_languages`
        :rtype bool:
        """
        if language not in MlsumDataset.supported_languages:
            print(MlsumDataset.mlsum_instantiation_guide)
            raise ValueError(
                f"The language(s): '{language}' entered is not supported. See above message for usage info"
            )
        else:
            return True


# Non-huggingface datasets


class ScisummnetDataset(SummDataset):
    """
    The SciSummNet dataset. As a dataset not included by huggingface, we need to
    manually download and set basic information for the dataset
    """

    dataset_name = "ScisummNet"

    version = "1.1.0"
    description = (
        "A summary of scientific papers should ideally incorporate the impact of the papers on the "
        "research community reflected by citations. To facilitate research in citation-aware scientific "
        "paper summarization (Scisumm), the CL-Scisumm shared task has been organized since 2014 for "
        "papers in the computational linguistics and NLP domain."
    )

    is_dialogue_based = False
    is_multi_document = False
    is_query_based = False

    huggingface_dataset = False
    builder_script_path = path.join(
        BASE_NONHUGGINGFACE_DATASETS_PATH, dataset_name.lower() + ".py"
    )

    def __init__(self, seed=None):
        # `seed` is accepted for interface compatibility but currently unused
        super().__init__()

    def _process_data(self, data: Dataset) -> Generator[SummInstance, None, None]:
        """
        Overrides the SummDataset '_process_data()' method
        This method processes the data contained in the dataset
        and puts each data instance into a SummInstance object

        :param data: a train/validation/test dataset
        :rtype: a generator yielding SummInstance objects
        """
        for instance in tqdm(data):
            # source is the paper body plus its annotated citing sentences
            docs: List = [
                instance["document_xml"],
                instance["citing_sentences_annotated.json"],
            ]
            summary: str = instance["summary"]
            summ_instance = SummInstance(source=docs, summary=summary)

            yield summ_instance


class SummscreenDataset(SummDataset):
    """
    The SummScreen dataset. As a dataset not included by huggingface, we need to
    manually download and set basic information for the dataset
    """

    dataset_name = "Summscreen"

    version = "1.1.0"

    is_dialogue_based = True
    is_multi_document = False
    is_query_based = False

    huggingface_dataset = False
    builder_script_path = path.join(
        BASE_NONHUGGINGFACE_DATASETS_PATH, dataset_name.lower() + ".py"
    )

    def __init__(self, seed=None):
        # `seed` is accepted for interface compatibility but currently unused
        super().__init__()

    def _process_data(self, data: Dataset) -> Generator[SummInstance, None, None]:
        """
        Overrides the SummDataset '_process_data()' method
        This method processes the data contained in the dataset
        and puts each data instance into a SummInstance object

        :param data: a train/validation/test dataset
        :rtype: a generator yielding SummInstance objects
        """
        for instance in tqdm(data):
            transcript: List = instance[
                "transcript"
            ]  # convert string into a list of string dialogues
            recap: str = instance["recap"]
            summ_instance = SummInstance(source=transcript, summary=recap)

            yield summ_instance


class QMsumDataset(SummDataset):
    """
    QMSum Dataset
    """

    dataset_name = "QMsum"
    description = """
    QMSum is a new human-annotated benchmark for query-based multi-domain
    meeting summarization task, which consists of 1,808 query-summary pairs
    over 232 meetings in multiple domains.
    """

    is_dialogue_based = True
    is_multi_document = False
    is_query_based = True

    huggingface_dataset = False
    builder_script_path = path.join(
        BASE_NONHUGGINGFACE_DATASETS_PATH, dataset_name.lower() + ".py"
    )

    def __init__(self):
        super().__init__()

    def _process_data(self, data: Dataset) -> Generator[SummInstance, None, None]:
        """
        Overrides the SummDataset '_process_data()' method
        This method processes the data contained in the dataset
        and puts each data instance into a SummInstance object

        :param data: a train/validation/test dataset
        :rtype: a generator yielding SummInstance objects
        """
        for instance in tqdm(data):
            # each meeting yields one SummInstance per (general or specific) query
            for query_set in (
                instance["general_query_list"] + instance["specific_query_list"]
            ):
                meeting: List = [
                    utterance["speaker"] + " : " + utterance["content"]
                    for utterance in instance["meeting_transcripts"]
                ]
                query: str = query_set["query"]
                summary: str = query_set["answer"]
                summ_instance = SummInstance(
                    source=meeting, summary=summary, query=query
                )

                yield summ_instance


class ArxivDataset(SummDataset):
    """
    The Arxiv Dataset
    """

    dataset_name = "Arxiv_longsummarization"
    description = """
    A summarization dataset comprised of pairs of scientific papers.
    The dataset provides a challenging testbed for abstractive summarization.
    It contains papers and their abstracts.
    """

    is_dialogue_based = False
    is_multi_document = False
    is_query_based = False

    huggingface_dataset = False
    builder_script_path = path.join(
        BASE_NONHUGGINGFACE_DATASETS_PATH, dataset_name.lower() + ".py"
    )

    def __init__(self):
        # Warn the user before kicking off a very large download
        print(
            "*****************",
            "***Attention***",
            "This dataset is quite large (approx 5Gb and will need about 15 Gb for the extraction process",
            "Cancel/interrupt the download if size and time constraints will not be met",
            "*****************",
            sep="\n",
        )
        super().__init__()

    def _process_data(self, data: Dataset) -> Generator[SummInstance, None, None]:
        """
        Overrides the SummDataset '_process_data()' method
        This method processes the data contained in the dataset
        and puts each data instance into a SummInstance object

        :param data: a train/validation/test dataset
        :rtype: a generator yielding SummInstance objects
        """
        for instance in tqdm(data):
            article: List = instance["article_text"]
            abstract: str = " ".join(instance["abstract_text"])
            summ_instance = SummInstance(source=article, summary=abstract)

            yield summ_instance