"""Build a balanced language-identification dataset from the Tatoeba sentence dump.

Downloads sentences.tar.bz2 (if sentences.csv is absent), keeps languages with
more than 500 sentences, balances each language to ~11k rows, shuffles, and
writes stratified train/validation/test splits both as CSV and as
fastText-formatted text files ("__label__<lang> <text>" per line).
"""

import tarfile
from pathlib import Path

import pandas as pd
import requests
from sklearn.model_selection import train_test_split


def _fasttext_lines(frame):
    """Render a (lang, text) dataframe as fastText training lines.

    Each row becomes "__label__<lang> <text>"; embedded newlines are flattened
    to spaces so each sentence stays on one line.
    """
    rows = frame.apply(lambda row: f"__label__{row['lang']} {row['text']}".replace('\n', ' '), axis=1)
    return "\n".join(rows.values)


# Fetch and extract the Tatoeba dump only when the CSV is not already on disk.
if not Path("sentences.csv").exists():
    print("Downloading data")
    link = "http://downloads.tatoeba.org/exports/sentences.tar.bz2"
    with requests.get(link, stream=True) as rx:
        # Fail fast on HTTP errors instead of feeding an error page to the bz2 decoder.
        rx.raise_for_status()
        # Undo any transport-level Content-Encoding so tarfile sees the raw bz2 bytes.
        rx.raw.decode_content = True
        # NOTE(review): extractall on a downloaded archive is vulnerable to path
        # traversal; pass filter="data" once Python >= 3.12 can be assumed.
        with tarfile.open(fileobj=rx.raw, mode="r|bz2") as tarobj:
            tarobj.extractall("./")

print("Preparing sentences")
# Dump columns: sentence id (dropped), ISO-639-3 language code, sentence text.
sents = pd.read_csv("sentences.csv", sep="\t", names=["index", "lang", "text"]).dropna().drop("index", axis=1)
# Drop rows whose language field is the literal "\N" null marker.
sents = sents[sents.lang != "\\N"]
# Keep only languages represented by more than 500 sentences.
sents = sents[sents.groupby("lang")["lang"].transform("size") > 500]
# Sample every language up/down to 11k rows, then dedupe the oversampled ones
# so small languages keep at most their distinct sentences.
sents = sents.groupby("lang").apply(lambda group: group.sample(11_000, replace=True)).droplevel("lang").drop_duplicates()
# Shuffle; reset_index(drop=True) replaces the create-then-drop "index" column dance.
sents = sents.sample(frac=1).reset_index(drop=True)

lang_count = sents.lang.nunique()
print(f"Splitting sentences in {lang_count} languages")
# 90/5/5 split, stratified so every language appears in each partition.
train, validation_test = train_test_split(sents, stratify=sents.lang, test_size=0.1)
validation, test = train_test_split(validation_test, stratify=validation_test.lang, test_size=0.5)

print("Writing files")
train.to_csv("train_tatoeba.csv", index=False)
validation.to_csv("validation_tatoeba.csv", index=False)
test.to_csv("test_tatoeba.csv", index=False)

# Explicit encoding keeps output byte-identical across platforms (the Windows
# default codec is not UTF-8 and would mangle non-ASCII sentences).
Path("train_tatoeba.txt").write_text(_fasttext_lines(train), encoding="utf-8")
Path("validation_tatoeba.txt").write_text(_fasttext_lines(validation), encoding="utf-8")
Path("test_tatoeba.txt").write_text(_fasttext_lines(test), encoding="utf-8")
print("Done")