File size: 1,756 Bytes

486585a

import csv
import tarfile
from pathlib import Path

import pandas as pd
import requests
from sklearn.model_selection import train_test_split


# Fetch and unpack the Tatoeba sentence export, unless sentences.csv is
# already present in the working directory.
if not Path("sentences.csv").exists():
    print("Downloading data")
    link = "https://downloads.tatoeba.org/exports/sentences.tar.bz2"
    # timeout bounds the connect and per-read waits (not the total download
    # time), so a stalled connection cannot hang the script forever.
    with requests.get(link, stream=True, timeout=60) as rx:
        # Fail on HTTP errors here rather than handing an error page to
        # tarfile, which would surface as a confusing ReadError.
        rx.raise_for_status()
        # "r|bz2" reads the archive as a forward-only stream, so it is
        # decompressed straight from the response without a temporary file.
        with tarfile.open(fileobj=rx.raw, mode="r|bz2") as tarobj:
            # Use the "data" extraction filter where the running Python
            # supports it, so members of a downloaded archive cannot be
            # written outside the target directory.
            extract_kwargs = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}
            tarobj.extractall("./", **extract_kwargs)

print("Preparing sentences")
# A language is kept only if it has more than this many sentences.
MIN_SENTENCES_PER_LANG = 500
# Rows drawn per language (with replacement) before duplicates are removed,
# so small languages end up with fewer than this many unique rows.
SAMPLES_PER_LANG = 11_000

sents = pd.read_csv(
    "sentences.csv",
    sep="\t",
    names=["index", "lang", "text"],
    usecols=["lang", "text"],  # the numeric id column is not needed
    dtype=str,
    # Sentences may contain literal quote characters; with default quoting a
    # field starting with '"' would swallow following tabs and newlines.
    quoting=csv.QUOTE_NONE,
    # Only treat empty fields as missing. The default NA list would turn the
    # language code "nan" and texts such as "NA" or "null" into NaN, and the
    # dropna() below would then silently discard them.
    keep_default_na=False,
    na_values=[""],
).dropna()
# "\N" is the marker used in the export for a sentence with no language.
sents = sents[sents.lang != "\\N"]
sents = sents[sents.groupby("lang")["lang"].transform("size") > MIN_SENTENCES_PER_LANG]
# Per-language sampling through GroupBy.sample keeps the "lang" column and the
# original index, avoiding groupby.apply's deprecated handling of the
# grouping column.
sents = sents.groupby("lang").sample(n=SAMPLES_PER_LANG, replace=True).drop_duplicates()
# Shuffle all rows and renumber them from zero.
sents = sents.sample(frac=1).reset_index(drop=True)
lang_count = sents.lang.nunique()

print(f"Splitting sentences in {lang_count} languages")
# First carve off 10% of the rows as a hold-out set, keeping the language
# proportions equal on both sides of the split.
train, holdout = train_test_split(
    sents,
    test_size=0.1,
    stratify=sents["lang"],
)
# Then halve the hold-out set into validation and test, again stratified by
# language.
validation, test = train_test_split(
    holdout,
    test_size=0.5,
    stratify=holdout["lang"],
)

print("Writing files")


def _label_lines(frame):
    """Return one ``__label__<lang> <text>`` string per row of *frame*.

    Newlines inside a formatted row are replaced by spaces so that every
    example occupies exactly one line of the output file.
    """
    return [
        f"__label__{lang} {text}".replace("\n", " ")
        for lang, text in zip(frame["lang"], frame["text"])
    ]


# (file-name prefix, data frame) for each split, in the order they are written.
_splits = (("train", train), ("validation", validation), ("test", test))

for _prefix, _frame in _splits:
    _frame.to_csv(f"{_prefix}_tatoeba.csv", index=False)

for _prefix, _frame in _splits:
    # Explicit UTF-8: the sentences are multilingual, and the platform default
    # encoding (e.g. cp1252 on Windows) cannot represent most of them.
    Path(f"{_prefix}_tatoeba.txt").write_text("\n".join(_label_lines(_frame)), encoding="utf-8")

print("Done")