#!/usr/bin/env bash
#
# Runs the two dataset generator scripts, then merges every train*.txt,
# validation*.txt and test*.txt file in the current directory into shuffled
# train_all.txt, validation_all.txt and test_all.txt files.
#
# Usage: run from the directory that contains create_fasttext_data.py and
# create_tatoeba_data.py.

# Abort on the first failing command, unset variable, or failing pipeline
# stage so that a broken generator step cannot silently yield empty outputs.
set -euo pipefail

# Disabled manual recipe for building the Tatoeba split by hand (kept
# commented out, exactly as in the original script):
# wget http://downloads.tatoeba.org/exports/sentences.tar.bz2
# bunzip2 sentences.tar.bz2
# tar xvf sentences.tar
# awk -F"\t" '{print"__label__"$2" "$3}' < sentences.csv | shuf > all.txt
# head -3 all.txt
# head -n 10000 all.txt > validation_tatoeba.txt
# tail -n +10001 all.txt > train_tatoeba.txt

# NOTE(review): presumably these write the per-source train*/validation*/
# test* .txt files consumed below -- confirm against the Python scripts.
python create_fasttext_data.py
python create_tatoeba_data.py

# The globs below also match the *_all.txt outputs of a previous run; remove
# them first so stale merged data is never fed back into the new merge.
rm -f -- train_all.txt validation_all.txt test_all.txt

# cat is needed here because shuf accepts only a single input file.
cat train*.txt | shuf > train_all.txt
cat validation*.txt | shuf > validation_all.txt
cat test*.txt | shuf > test_all.txt

# NOTE(review): the final command was truncated in the reviewed source; only
# the fragment below was visible. Restore the complete invocation before use.
# python <