File size: 1,093 Bytes
486585a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
#wget http://downloads.tatoeba.org/exports/sentences.tar.bz2
#bunzip2 sentences.tar.bz2
#tar xvf sentences.tar
#awk -F"\t" '{print"__label__"$2" "$3}' < sentences.csv | shuf > all.txt
#head -3 all.txt
#head -n 10000 all.txt > validation_tatoeba.txt
#tail -n +10001 all.txt > train_tatoeba.txt

# Remove merged files left over from a previous run. Their names match the
# train*/validation*/test* globs used below, so without this a re-run would
# feed the old merged file back into `cat` while `shuf` is truncating it
# (racy, and it can duplicate data).
rm -f train_all.txt validation_all.txt test_all.txt

# Build the per-source data files (presumably train*/validation*/test*.txt
# in the current directory -- confirm against the two scripts).
python create_fasttext_data.py
python create_tatoeba_data.py

# Concatenate every per-source file of each split and shuffle its lines.
cat train*.txt | shuf > train_all.txt
cat validation*.txt | shuf > validation_all.txt
cat test*.txt | shuf > test_all.txt
# Convert each merged fastText-style text file into a CSV with "lang" and
# "text" columns. The heredoc delimiter is quoted so the shell passes the
# Python source through verbatim (no parameter or command substitution).
python <<'EOF'
from pathlib import Path

import pandas as pd

# Number of leading characters dropped from every line; lines are presumably
# of the form "__label__<lang> <text>".
LABEL_PREFIX_LEN = len("__label__")


def convert_split(split):
    """Read <split>_all.txt and write <split>_all.csv with lang/text columns."""
    # Decode explicitly as UTF-8 so the result does not depend on the locale.
    raw = Path(f"{split}_all.txt").read_text(encoding="utf-8")
    # Split on "\n" only: str.splitlines() would also break on characters
    # such as U+2028 that can occur inside a sentence. Empty lines (including
    # the one after the trailing newline) are skipped.
    records = [
        line[LABEL_PREFIX_LEN:].split(" ", 1)
        for line in raw.split("\n")
        if line
    ]
    frame = pd.DataFrame.from_records(records, columns=["lang", "text"])
    frame.to_csv(f"{split}_all.csv", index=False)


for split in ("train", "validation", "test"):
    convert_split(split)

EOF