xls-r-uzbek-cv8 / kenlm /lm /interpolate /merge_vocab_test.cc
lucio's picture
Training in progress, step 5000
8652957
raw
history blame
5.31 kB
#define BOOST_TEST_MODULE InterpolateMergeVocabTest
#include <boost/test/unit_test.hpp>
#include "lm/enumerate_vocab.hh"
#include "lm/interpolate/merge_vocab.hh"
#include "lm/interpolate/universal_vocab.hh"
#include "lm/lm_exception.hh"
#include "lm/vocab.hh"
#include "lm/word_index.hh"
#include "util/file.hh"
#include "util/file_piece.hh"
#include "util/file_stream.hh"
#include "util/tokenize_piece.hh"
#include <algorithm>
#include <cstring>
#include <vector>
namespace lm {
namespace interpolate {
namespace {
struct VocabEntry {
explicit VocabEntry(StringPiece value) :
str(value), hash(util::MurmurHash64A(value.data(), value.size())) {}
StringPiece str;
uint64_t hash;
bool operator<(const VocabEntry &other) const {
return hash < other.hash;
}
};
int WriteVocabFile(const std::vector<VocabEntry> &vocab, util::scoped_fd &file) {
file.reset(util::MakeTemp(util::DefaultTempDirectory()));
{
util::FileStream out(file.get(), 128);
for (std::vector<VocabEntry>::const_iterator i = vocab.begin(); i != vocab.end(); ++i) {
out << i->str << '\0';
}
}
util::SeekOrThrow(file.get(), 0);
return file.get();
}
std::vector<VocabEntry> ParseVocab(StringPiece words) {
std::vector<VocabEntry> entries;
entries.push_back(VocabEntry("<unk>"));
for (util::TokenIter<util::SingleCharacter> i(words, '\t'); i; ++i) {
entries.push_back(VocabEntry(*i));
}
std::sort(entries.begin() + 1, entries.end());
return entries;
}
int WriteVocabFile(StringPiece words, util::scoped_fd &file) {
return WriteVocabFile(ParseVocab(words), file);
}
class TestFiles {
public:
TestFiles() {}
int Test0() {
return WriteVocabFile("this\tis\ta\tfirst\tcut", test[0]);
}
int Test1() {
return WriteVocabFile("is this\tthis a\tfirst cut\ta first", test[1]);
}
int Test2() {
return WriteVocabFile("is\tsecd\ti", test[2]);
}
int NoUNK() {
std::vector<VocabEntry> no_unk_vec;
no_unk_vec.push_back(VocabEntry("toto"));
return WriteVocabFile(no_unk_vec, no_unk);
}
int BadOrder() {
std::vector<VocabEntry> bad_order_vec;
bad_order_vec.push_back(VocabEntry("<unk>"));
bad_order_vec.push_back(VocabEntry("0"));
bad_order_vec.push_back(VocabEntry("1"));
bad_order_vec.push_back(VocabEntry("2"));
bad_order_vec.push_back(VocabEntry("a"));
return WriteVocabFile(bad_order_vec, bad_order);
}
private:
util::scoped_fd test[3], no_unk, bad_order;
};
class DoNothingEnumerate : public EnumerateVocab {
public:
void Add(WordIndex, const StringPiece &) {}
};
BOOST_AUTO_TEST_CASE(MergeVocabTest) {
TestFiles files;
util::FixedArray<int> used_files(3);
used_files.push_back(files.Test0());
used_files.push_back(files.Test1());
used_files.push_back(files.Test2());
std::vector<lm::WordIndex> model_max_idx;
model_max_idx.push_back(10);
model_max_idx.push_back(10);
model_max_idx.push_back(10);
util::scoped_fd combined(util::MakeTemp(util::DefaultTempDirectory()));
UniversalVocab universal_vocab(model_max_idx);
{
ngram::ImmediateWriteWordsWrapper writer(NULL, combined.get(), 0);
MergeVocab(used_files, universal_vocab, writer);
}
BOOST_CHECK_EQUAL(universal_vocab.GetUniversalIdx(0, 0), 0);
BOOST_CHECK_EQUAL(universal_vocab.GetUniversalIdx(1, 0), 0);
BOOST_CHECK_EQUAL(universal_vocab.GetUniversalIdx(2, 0), 0);
BOOST_CHECK_EQUAL(universal_vocab.GetUniversalIdx(0, 1), 1);
BOOST_CHECK_EQUAL(universal_vocab.GetUniversalIdx(1, 1), 2);
BOOST_CHECK_EQUAL(universal_vocab.GetUniversalIdx(2, 1), 8);
BOOST_CHECK_EQUAL(universal_vocab.GetUniversalIdx(0, 5), 11);
#if BYTE_ORDER == LITTLE_ENDIAN
BOOST_CHECK_EQUAL(universal_vocab.GetUniversalIdx(1, 3), 4);
#elif BYTE_ORDER == BIG_ENDIAN
// MurmurHash has a different ordering of the vocabulary.
BOOST_CHECK_EQUAL(universal_vocab.GetUniversalIdx(1, 3), 5);
#endif
BOOST_CHECK_EQUAL(universal_vocab.GetUniversalIdx(2, 3), 10);
util::SeekOrThrow(combined.get(), 0);
util::FilePiece f(combined.release());
std::vector<VocabEntry> expected = ParseVocab("a\tis this\tthis a\tfirst cut\tthis\ta first\tcut\tis\ti\tsecd\tfirst");
for (std::vector<VocabEntry>::const_iterator i = expected.begin(); i != expected.end(); ++i) {
BOOST_CHECK_EQUAL(i->str, f.ReadLine('\0'));
}
BOOST_CHECK_THROW(f.ReadLine('\0'), util::EndOfFileException);
}
BOOST_AUTO_TEST_CASE(MergeVocabNoUnkTest) {
TestFiles files;
util::FixedArray<int> used_files(1);
used_files.push_back(files.NoUNK());
std::vector<lm::WordIndex> model_max_idx;
model_max_idx.push_back(10);
UniversalVocab universal_vocab(model_max_idx);
DoNothingEnumerate nothing;
BOOST_CHECK_THROW(MergeVocab(used_files, universal_vocab, nothing), FormatLoadException);
}
BOOST_AUTO_TEST_CASE(MergeVocabWrongOrderTest) {
TestFiles files;
util::FixedArray<int> used_files(2);
used_files.push_back(files.Test0());
used_files.push_back(files.BadOrder());
std::vector<lm::WordIndex> model_max_idx;
model_max_idx.push_back(10);
model_max_idx.push_back(10);
lm::interpolate::UniversalVocab universal_vocab(model_max_idx);
DoNothingEnumerate nothing;
BOOST_CHECK_THROW(MergeVocab(used_files, universal_vocab, nothing), FormatLoadException);
}
}}} // namespaces