|
import csv
import tarfile
from pathlib import Path

import pandas as pd
import requests
from sklearn.model_selection import train_test_split
|
|
|
|
|
# Fetch and unpack the sentence export, unless sentences.csv is already
# present in the working directory (a previous run's copy is reused).
if not Path("sentences.csv").exists():
    print("Downloading data")
    link = "http://downloads.tatoeba.org/exports/sentences.tar.bz2"
    # A timeout keeps a stalled connection from blocking the script forever.
    with requests.get(link, stream=True, timeout=60) as rx:
        # Fail with a clear HTTP error instead of feeding an error page
        # to tarfile and getting an opaque ReadError.
        rx.raise_for_status()
        # "r|bz2" reads the archive as a forward-only stream, so it is
        # decompressed straight from the response without a temp file.
        with tarfile.open(fileobj=rx.raw, mode="r|bz2") as tarobj:
            if hasattr(tarfile, "data_filter"):
                # The archive arrives over plain HTTP; the "data" filter
                # rejects members that would escape the target directory.
                tarobj.extractall("./", filter="data")
            else:
                # Older Python releases have no extraction filters.
                tarobj.extractall("./")
|
|
|
print("Preparing sentences")

# Load the language and text columns of the tab-separated export.
#  * keep_default_na=False / na_values=[""]: only empty fields count as
#    missing. With pandas' default NA tokens, a language code or sentence
#    that literally reads "nan", "NA", "null", ... would become NaN and be
#    silently removed by dropna().
#  * quoting=csv.QUOTE_NONE: a '"' inside a sentence is kept literally
#    instead of opening a quoted field that can swallow following lines.
#    NOTE(review): assumes the export does not use CSV quoting - confirm.
#  * dtype=str: prevents numeric-looking sentences from being parsed as
#    numbers.
sents = pd.read_csv(
    "sentences.csv",
    sep="\t",
    names=["index", "lang", "text"],
    usecols=["lang", "text"],
    dtype=str,
    quoting=csv.QUOTE_NONE,
    keep_default_na=False,
    na_values=[""],
).dropna()

# Drop rows whose language field is the literal "\N"
# (presumably the export's marker for an unset language - verify).
sents = sents[sents["lang"] != "\\N"]

# Keep only languages that have more than 500 sentences.
sents = sents[sents.groupby("lang")["lang"].transform("size") > 500]

# Draw 11,000 rows per language with replacement, then drop repeated rows.
# Each language therefore ends up with at most 11,000 distinct sentences,
# and small languages are not rejected for having fewer rows than that.
# GroupBy.sample keeps the "lang" column in the result; the earlier
# groupby.apply form relied on apply receiving the grouping column,
# which pandas has deprecated.
sents = sents.groupby("lang").sample(n=11_000, replace=True).drop_duplicates()

# Shuffle all rows and renumber them 0..n-1.
sents = sents.sample(frac=1).reset_index(drop=True)

lang_count = sents["lang"].nunique()
|
|
|
# Report how many languages remain, then split the sentences into
# 90% train / 5% validation / 5% test, stratified by language.
print(f"Splitting sentences in {lang_count} languages")

# First cut: hold out 10% of the rows.
train, validation_test = train_test_split(
    sents,
    test_size=0.1,
    stratify=sents["lang"],
)

# Second cut: halve the held-out rows into validation and test.
validation, test = train_test_split(
    validation_test,
    test_size=0.5,
    stratify=validation_test["lang"],
)
|
|
|
print("Writing files")


def _labelled_lines(frame):
    """Render *frame* as one ``__label__<lang> <text>`` line per row.

    Newlines inside a row are replaced by spaces so every sample occupies
    exactly one line. Lines are joined with "\\n" and there is no trailing
    newline.
    """
    return "\n".join(
        f"__label__{lang} {text}".replace("\n", " ")
        for lang, text in zip(frame["lang"], frame["text"])
    )


splits = {"train": train, "validation": validation, "test": test}

# CSV copy of each split, without the row index.
for split_name, frame in splits.items():
    frame.to_csv(f"{split_name}_tatoeba.csv", index=False)

# Label-prefixed text copy of each split. The encoding is explicit because
# the sentences span many scripts; the locale default codec may be unable
# to encode them.
for split_name, frame in splits.items():
    Path(f"{split_name}_tatoeba.txt").write_text(
        _labelled_lines(frame), encoding="utf-8"
    )

print("Done")
|
|