# Source: nb-nordic-lid / code / create_tatoeba_data.py
# Commit 486585a (versae): "First full version of the models"
import csv
import tarfile
from pathlib import Path

import pandas as pd
import requests
from sklearn.model_selection import train_test_split
# Fetch the Tatoeba sentence export once; later runs reuse the local copy.
if not Path("sentences.csv").exists():
    print("Downloading data")
    link = "http://downloads.tatoeba.org/exports/sentences.tar.bz2"
    # Write to a temporary name first so an interrupted download never leaves
    # a truncated "sentences.csv" that the existence check above would accept.
    partial = Path("sentences.csv.part")
    with requests.get(link, stream=True) as rx:
        # Fail loudly on HTTP errors instead of feeding an error page to tarfile.
        rx.raise_for_status()
        # "r|bz2" reads the archive as a forward-only stream straight from the
        # HTTP response, so members must be consumed in the order they appear.
        with tarfile.open(fileobj=rx.raw, mode="r|bz2") as tarobj:
            for member in tarobj:
                # Only the sentences file is needed. Copying that single member
                # to a fixed local path (rather than calling extractall) means
                # member names in the archive cannot write outside this directory.
                if not (member.isfile() and Path(member.name).name == "sentences.csv"):
                    continue
                with tarobj.extractfile(member) as src, partial.open("wb") as dst:
                    for chunk in iter(lambda: src.read(1 << 20), b""):
                        dst.write(chunk)
                partial.replace("sentences.csv")
                break
            else:
                raise FileNotFoundError(
                    "sentences.csv was not found inside the downloaded archive"
                )
print("Preparing sentences")
# The export is read as a tab-separated file with three columns per row.
sents = pd.read_csv(
    "sentences.csv",
    sep="\t",
    names=["index", "lang", "text"],
    dtype={"lang": str, "text": str},
    # Treat only empty fields as missing. pandas' default NA markers include
    # "nan", "NA", "None" and "null", which would otherwise turn the language
    # code "nan" and sentences consisting of those words into NaN and get
    # them dropped below.
    keep_default_na=False,
    na_values=[""],
    # Sentences may contain double quotes; disable quote handling so a leading
    # quote cannot swallow the following rows into one field.
    quoting=csv.QUOTE_NONE,
)
sents = sents.dropna().drop(columns="index")

# "\N" marks sentences without an assigned language.
sents = sents[sents["lang"] != "\\N"]

# Keep only languages with more than 500 sentences.
sents = sents[sents.groupby("lang")["lang"].transform("size") > 500]

# Draw 11,000 rows per language with replacement (so smaller languages do not
# raise), then drop the repeated rows: each language ends up with at most
# 11,000 distinct sentences.
sents = sents.groupby("lang").sample(n=11_000, replace=True).drop_duplicates()

# Shuffle all rows and discard the old row labels.
sents = sents.sample(frac=1).reset_index(drop=True)
# Report how many distinct languages are about to be split.
lang_count = sents["lang"].unique().size
print(f"Splitting sentences in {lang_count} languages")

# First cut: hold out 10% of the rows, keeping per-language proportions.
train, validation_test = train_test_split(
    sents,
    test_size=0.1,
    stratify=sents["lang"],
)

# Second cut: divide the held-out rows evenly into validation and test,
# again stratified by language.
validation, test = train_test_split(
    validation_test,
    test_size=0.5,
    stratify=validation_test["lang"],
)
print("Writing files")
# Each split is written twice: as a CSV with "lang" and "text" columns, and as
# a text file with one "__label__<lang> <text>" line per sentence.
for split_name, frame in (("train", train), ("validation", validation), ("test", test)):
    frame.to_csv(f"{split_name}_tatoeba.csv", index=False)

    labelled = "__label__" + frame["lang"].astype(str) + " " + frame["text"].astype(str)
    # One sentence per line: embedded newlines are flattened to spaces.
    labelled = labelled.str.replace("\n", " ", regex=False)

    # Encode explicitly as UTF-8; the platform default encoding may be unable
    # to represent multilingual text.
    Path(f"{split_name}_tatoeba.txt").write_text("\n".join(labelled), encoding="utf-8")
print("Done")