#wget http://downloads.tatoeba.org/exports/sentences.tar.bz2
#bunzip2 sentences.tar.bz2
#tar xvf sentences.tar
#awk -F"\t" '{print"__label__"$2" "$3}' < sentences.csv | shuf > all.txt
#head -3 all.txt
#head -n 10000 all.txt > validation_tatoeba.txt
#tail -n +10001 all.txt > train_tatoeba.txt
# Run the two per-source data preparation scripts. They presumably write the
# train*/validation*/test* .txt files merged below -- TODO confirm against
# the scripts themselves.
python create_fasttext_data.py
python create_tatoeba_data.py

# Merge and shuffle every per-source file of each split into <split>_all.txt.
for split in train validation test; do
    # The glob below also matches <split>_all.txt, so a merged file left over
    # from an earlier run would be folded back into the new merge. Remove it
    # first, and write through a temporary name that the glob cannot match.
    rm -f "${split}_all.txt"
    cat "${split}"*.txt | shuf > "${split}_all.txt.tmp"
    mv "${split}_all.txt.tmp" "${split}_all.txt"
done
# Convert each merged file (one "__label__<lang> <text>" entry per line) into
# a two-column CSV with the header lang,text. The heredoc delimiter is quoted
# so the shell passes the Python source through verbatim, without expanding
# $ or backticks.
python <<'EOF'
from pathlib import Path

import pandas as pd

LABEL_PREFIX = "__label__"

for split in ("train", "validation", "test"):
    # Decode explicitly instead of relying on the locale default encoding
    # (the inputs are presumably UTF-8 -- TODO confirm). Split on "\n" only,
    # so other Unicode line separators inside a sentence are left intact.
    lines = Path(split + "_all.txt").read_text(encoding="utf-8").split("\n")

    records = []
    for line in lines:
        if not line:
            continue
        # Drop the fixed-width label prefix, then cut at the first space.
        # partition() always yields both fields, so a label with no text
        # gives an empty text cell rather than a short row.
        lang, _, text = line[len(LABEL_PREFIX):].partition(" ")
        records.append((lang, text))

    frame = pd.DataFrame.from_records(records, columns=["lang", "text"])
    frame.to_csv(split + "_all.csv", index=False)
EOF