### auxiliary functions ############################################

def logsumexp(x, dim):
    """Sum up log-scale values along one dimension.

    Computes ``log(sum(exp(x)))`` over ``dim``. The work is delegated to
    ``torch.logsumexp``, which is numerically stabilised and, unlike a
    hand-written max-offset version, yields ``-inf`` instead of NaN when
    every reduced entry is ``-inf``.

    Args:
        x: tensor of log-scale values.
        dim: the dimension to reduce.

    Returns:
        A tensor with dimension ``dim`` removed.
    """
    return torch.logsumexp(x, dim=dim)


def lookup(T, indices):
    """Look up the entries of ``T`` selected by ``indices`` along the last axis.

    Args:
        T: a vector, matrix, or 3D tensor of scores.
        indices: integer tensor with one index per vector (i.e. with the
            shape of ``T`` minus its last dimension); for a 1D ``T`` any
            index tensor accepted by ``T[indices]``.

    Returns:
        The selected entries, shaped like ``indices``.

    Raises:
        ValueError: if ``T`` has 0 or more than 3 dimensions.
    """
    if T.dim() == 3:
        return T.gather(2, indices.unsqueeze(2)).squeeze(2)
    elif T.dim() == 2:
        return T.gather(1, indices.unsqueeze(1)).squeeze(1)
    elif T.dim() == 1:
        return T[indices]
    else:
        # ValueError is a subclass of Exception, so callers that caught the
        # former generic Exception keep working.
        raise ValueError('unexpected tensor size in function "lookup"')


### tagger class ###############################################

class CRFTagger(nn.Module):
    """CRF tagger: an RNN tagger whose tag scores are combined with
    tag-pair weights inside a beam of the best tags per word."""

    def __init__(self, num_chars, num_tags, char_emb_size,
                 char_rec_size, word_rec_size, word_rnn_depth,
                 dropout_rate, word_emb_size, beam_size):
        """Build the base tagger and the tag-pair weight matrix.

        All arguments except ``beam_size`` are passed on to ``RNNTagger``.
        A ``beam_size`` outside ``(0, num_tags)`` selects the full tag set.
        """
        super().__init__()

        # simple LSTMTagger which computes tag scores
        self.base_tagger = RNNTagger(num_chars, num_tags, char_emb_size,
                                     char_rec_size, word_rec_size,
                                     word_rnn_depth, dropout_rate, word_emb_size)
        self.beam_size = beam_size if 0 < beam_size < num_tags else num_tags
        # weights[p, t] is the score of tag t following tag p
        self.weights = nn.Parameter(torch.zeros(num_tags, num_tags))
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, fwd_charIDs, bwd_charIDs, tags=None):
        """Tag one sentence and, if gold tags are given, compute the loss.

        Args:
            fwd_charIDs: character IDs of the words (passed to the base tagger).
            bwd_charIDs: reversed character IDs of the words.
            tags: optional tensor with one gold-standard tag ID per word.

        Returns:
            The predicted tag IDs if ``tags`` is None, otherwise the pair
            ``(predicted_tags, negative_log_probability)``.
        """
        annotation_mode = (tags is None)

        scores = self.base_tagger(fwd_charIDs, bwd_charIDs)

        # extract the highest-scoring tags for each word and their scores
        best_scores, best_tags = scores.topk(self.beam_size, dim=-1)

        # Not done during dev evaluation. The gold tags are required here, so
        # the step is skipped when the module is in training mode but no tags
        # were supplied (this used to fail on ``tags.unsqueeze``).
        if self.training and not annotation_mode:
            # check whether the goldstandard tags are among the best tags
            gs_contained = (best_tags == tags.unsqueeze(1)).sum(dim=-1)

            # replace the tag with the lowest score at each position
            # by the gs tag if the gs tag is not in the list
            last_column = gs_contained * best_tags[:, -1] + (1 - gs_contained) * tags
            s = lookup(scores, last_column)
            best_tags = torch.cat((best_tags[:, :-1], last_column.unsqueeze(1)), dim=1)
            best_scores = torch.cat((best_scores[:, :-1], s.unsqueeze(1)), dim=1)

        best_previous = []  # stores the backpointers of the Viterbi algorithm
        viterbi_scores = best_scores[0]
        if not annotation_mode:
            forward_scores = best_scores[0]
        for i in range(1, scores.size(0)):  # for all word positions except the first
            # lookup of the tag-pair weights (rows: previous tag, columns: current tag)
            w = self.weights[best_tags[i-1]][:, best_tags[i]]

            # Viterbi algorithm
            values = viterbi_scores.unsqueeze(1) + best_scores[i].unsqueeze(0) + w
            viterbi_scores, best_prev = torch.max(values, dim=0)
            best_previous.append(best_prev)

            # Forward algorithm
            if not annotation_mode:
                values = forward_scores.unsqueeze(1) + best_scores[i].unsqueeze(0) + w
                forward_scores = logsumexp(values, dim=0)

        # Viterbi algorithm: follow the backpointers from the best final state
        _, index = torch.max(viterbi_scores, dim=0)
        best_indices = [index]
        for best_prev in reversed(best_previous):
            index = best_prev[index]
            best_indices.append(index)

        # reverse the indices and map them to tag IDs
        best_indices = torch.stack(best_indices[::-1])
        predicted_tags = lookup(best_tags, best_indices)

        if annotation_mode:
            return predicted_tags

        # loss computation
        basetagger_scores = lookup(scores, tags).sum()
        CRFweights = self.weights[tags[:-1], tags[1:]].sum() if tags.size(0) > 1 else 0
        logZ = logsumexp(forward_scores, dim=0)  # log partition function
        logprob = basetagger_scores + CRFweights - logZ

        return predicted_tags, -logprob
### MHGTagger/Data.py ###########################################################

# NOTE(review): both literals appear as empty strings in the reviewed text,
# which makes the unknown and the padding symbol collide in make_dict (the
# make_dict docstring expects them to get different IDs). They are restored
# to the conventional markers here -- confirm against the keys stored in the
# pickled ".io" parameter file before relying on this.
unk_string = '<UNK>'
pad_string = '<PAD>'


def read_tagged_sentences(path, max_sent_len):
    """Read a tagged dataset.

    Each non-empty line consists of a token and a tag separated by a tab
    character (further columns are ignored); an empty line ends a sentence.

    Args:
        path: name of the data file (read as UTF-8).
        max_sent_len: only sentences with fewer words than this are kept.

    Returns:
        A list of ``(words, tags)`` pairs.

    Raises:
        ValueError: if a non-empty line contains no tab character.
    """
    sentences, words, tags = [], [], []
    with open(path, encoding="utf-8") as file:
        for line in file:
            line = line.rstrip()
            if line:
                word, tag, *_ = line.split("\t")
                words.append(word)
                tags.append(tag)
            else:
                # empty line marking the end of a sentence
                if 0 < len(words) < max_sent_len:
                    sentences.append((words, tags))
                words, tags = [], []
    # keep the last sentence if the file does not end with an empty line
    if 0 < len(words) < max_sent_len:
        sentences.append((words, tags))
    return sentences


def read_word_embeddings(filename):
    """Read word embeddings from a text file.

    Each line holds a word followed by the space-separated vector components.
    An entry for the unknown-word symbol is skipped.

    Args:
        filename: name of the embedding file, or None for "no embeddings".

    Returns:
        ``(word_embeddings, word_emb_size)`` where ``word_embeddings`` is a
        list of ``(word, float32 vector)`` pairs and ``word_emb_size`` is the
        length of the first vector (0 if there are no embeddings).
    """
    word_embeddings = []
    if filename is not None:
        print("reading word embeddings ...", file=sys.stderr)
        with open(filename, encoding="utf-8") as file:
            for line in file:
                word, *vec = line.rstrip().split(' ')
                if word != unk_string:
                    word_embeddings.append((word, numpy.array(vec, dtype=numpy.float32)))
        print("done", file=sys.stderr)
    word_emb_size = len(word_embeddings[0][1]) if word_embeddings else 0
    return word_embeddings, word_emb_size


def make_dict(counter, min_freq=0, add_pad_symbol=False):
    """Create a dictionary which maps strings with some minimal frequency to numbers.

    The unknown symbol gets ID 0, the optional padding symbol ID 1, and the
    remaining symbols follow in order of decreasing frequency.
    We don't use pack_padded sequence, so it is OK to assign ID 1 to the
    padding symbol.

    Returns:
        ``(string2ID, symlist)``: the mapping and the list of symbols by ID.
    """
    symlist = [unk_string] + ([pad_string] if add_pad_symbol else []) + \
              [elem for elem, freq in counter.most_common() if freq >= min_freq]
    string2ID = {elem: i for i, elem in enumerate(symlist)}
    return string2ID, symlist


class Data(object):
    """Class for reading a tagged training and development corpus or a test corpus.

    Called with a single argument (a parameter file name) the object is set
    up for tagging; with more arguments it is set up for training
    (see ``init_test`` and ``init_train``).
    """

    # ID assigned to the tag that is to be ignored during training
    IGNORE_INDEX = -100

    def __init__(self, *args):
        if len(args) == 1:
            self.init_test(*args)
        else:
            self.init_train(*args)

    ### functions needed during training ###############################################

    def init_train(self, path_train, path_dev, word_trunc_len,
                   min_char_freq, max_sent_len, word_embeddings, ignore_tag):
        """Read the training/development data and build the symbol tables."""

        self.word_trunc_len = word_trunc_len  # length to which words are truncated or filled up

        # reading the datasets
        self.train_sentences = read_tagged_sentences(path_train, max_sent_len)
        self.dev_sentences = read_tagged_sentences(path_dev, max_sent_len)

        ### create dictionaries which map characters or tags to IDs
        char_counter = Counter()
        tag_counter = Counter()
        for words, tags in self.train_sentences:
            tag_counter.update(tags)
            for word in words:
                char_counter.update(word)
        self.char2ID, _ = make_dict(char_counter, min_char_freq, add_pad_symbol=True)

        if ignore_tag is not None:
            tag_counter.pop(ignore_tag, None)  # remove this special tag if present
            self.tag2ID, self.ID2tag = make_dict(tag_counter)
            self.tag2ID[ignore_tag] = self.IGNORE_INDEX  # empty tags will not be trained
        else:
            self.tag2ID, self.ID2tag = make_dict(tag_counter)

        ### sizes of the symbol inventories
        self.num_char_types = len(self.char2ID)
        self.num_tag_types = len(self.ID2tag)

        self.word_embeddings, self.word_emb_size = read_word_embeddings(word_embeddings)

    def get_charIDs(self, word):
        """Map a word to two character-ID sequences of length ``word_trunc_len``.

        Returns:
            ``(fwd_charIDs, bwd_charIDs)``: the IDs of the word's characters
            and of its reversed characters, each left-padded with the padding
            ID and cut to the last ``word_trunc_len`` entries.
        """
        unkID = self.char2ID[unk_string]
        padID = self.char2ID[pad_string]

        charIDs = [self.char2ID.get(c, unkID) for c in word]

        # add enough padding symbols
        fwd_charIDs = [padID] * self.word_trunc_len + charIDs
        bwd_charIDs = [padID] * self.word_trunc_len + charIDs[::-1]

        # truncate
        fwd_charIDs = fwd_charIDs[-self.word_trunc_len:]
        bwd_charIDs = bwd_charIDs[-self.word_trunc_len:]

        return fwd_charIDs, bwd_charIDs

    def words2charIDvec(self, words):
        """Convert words to two int32 arrays of character IDs (one row per word)."""
        fwd_charID_seqs = []
        bwd_charID_seqs = []
        for word in words:
            fwd_charIDs, bwd_charIDs = self.get_charIDs(word)
            fwd_charID_seqs.append(fwd_charIDs)
            bwd_charID_seqs.append(bwd_charIDs)

        fwd_charID_seqs = numpy.asarray(fwd_charID_seqs, dtype='int32')
        bwd_charID_seqs = numpy.asarray(bwd_charID_seqs, dtype='int32')

        return fwd_charID_seqs, bwd_charID_seqs

    def tags2IDs(self, tags):
        """Convert a list of tags to an int32 array of IDs using ``tag2ID``.

        Tags missing from the dictionary are mapped to the unknown-symbol ID.
        """
        unkID = self.tag2ID[unk_string]
        IDs = [self.tag2ID.get(tag, unkID) for tag in tags]
        return numpy.asarray(IDs, dtype='int32')

    def save_parameters(self, filename):
        """Save the truncation length and the symbol tables to a pickle file."""
        all_params = (self.word_trunc_len, self.char2ID, self.ID2tag)
        with open(filename, "wb") as file:
            pickle.dump(all_params, file)

    ### functions needed during tagging ###############################################

    def init_test(self, filename):
        """Load the parameters written by ``save_parameters``."""
        # NOTE: unpickling executes code stored in the file; only load
        # parameter files from a trusted source.
        with open(filename, "rb") as file:
            self.word_trunc_len, self.char2ID, self.ID2tag = pickle.load(file)

    def sentences(self, filename):
        """Read data to be tagged and yield one list of tokens per sentence.

        One token per line. Empty line follows a sentence.
        """
        with open(filename, encoding="utf-8") as f:
            words = []
            for line in f:
                line = line.rstrip()
                if line != '':
                    words.append(line)
                elif len(words) > 0:
                    # empty line indicates the end of a sentence
                    yield words
                    words = []
            # do not lose the last sentence if the final empty line is missing
            if words:
                yield words

    def single_sentences(self, sentence):
        """Yield the given, already tokenized sentence as the only item."""
        yield sentence

    def IDs2tags(self, IDs):
        """Convert a list of IDs to tags using the ``ID2tag`` table."""
        return [self.ID2tag[int(ID)] for ID in IDs]


### MHGTagger/RNNTagger.py ######################################################

class WordRepresentation(nn.Module):
    """RNN for computing character-based word representations."""

    def __init__(self, num_chars, emb_size, rec_size, dropout_rate):
        super().__init__()

        # character embedding lookup table
        self.embeddings = nn.Embedding(num_chars, emb_size)

        # character-based LSTMs
        self.fwd_rnn = nn.LSTM(emb_size, rec_size)
        self.bwd_rnn = nn.LSTM(emb_size, rec_size)

        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, fwd_charIDs, bwd_charIDs):
        """Return one vector of size ``2*rec_size`` per row of the ID matrices."""
        # swap the 2 dimensions and lookup the embeddings
        fwd_embs = self.embeddings(fwd_charIDs.t())
        bwd_embs = self.embeddings(bwd_charIDs.t())

        # run the biLSTM over characters
        fwd_outputs, _ = self.fwd_rnn(fwd_embs)
        bwd_outputs, _ = self.bwd_rnn(bwd_embs)

        # concatenate the forward and backward final states to form
        # word representations
        word_reprs = torch.cat((fwd_outputs[-1], bwd_outputs[-1]), -1)

        return word_reprs


class ResidualLSTM(nn.Module):
    """Deep BiRNN with residual connections."""

    def __init__(self, input_size, rec_size, num_rnns, dropout_rate):
        super().__init__()
        self.rnn = nn.LSTM(input_size, rec_size,
                           bidirectional=True, batch_first=True)

        self.deep_rnns = nn.ModuleList([
            nn.LSTM(2*rec_size, rec_size, bidirectional=True, batch_first=True)
            for _ in range(num_rnns-1)])

        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, state):
        """Run the first BiLSTM and add the output of each further BiLSTM."""
        state, _ = self.rnn(state)
        for rnn in self.deep_rnns:
            hidden, _ = rnn(self.dropout(state))
            state = state + hidden  # residual connection
        return state


class RNNTagger(nn.Module):
    """Main tagger module."""

    def __init__(self, num_chars, num_tags, char_emb_size, char_rec_size,
                 word_rec_size, word_rnn_depth, dropout_rate, word_emb_size):

        super().__init__()

        # character-based BiLSTMs
        self.word_representations = WordRepresentation(num_chars, char_emb_size,
                                                       char_rec_size, dropout_rate)
        # word-based BiLSTM
        self.word_rnn = ResidualLSTM(char_rec_size*2, word_rec_size, word_rnn_depth,
                                     dropout_rate)
        # output feed-forward network
        self.output_layer = nn.Linear(2*word_rec_size, num_tags)

        # dropout layers
        self.dropout = nn.Dropout(dropout_rate)

        # word embedding projection layer for finetuning on word embeddings
        if word_emb_size > 0:
            self.projection_layer = nn.Linear(2*char_rec_size, word_emb_size)

    def forward(self, fwd_charIDs, bwd_charIDs, word_embedding_training=False):
        """Compute tag scores for the words of one sentence.

        Args:
            fwd_charIDs: matrix of character IDs, one row per word.
            bwd_charIDs: the same for the reversed words.
            word_embedding_training: if true, return the projected word
                representations (auxiliary embedding task) instead of tag
                scores. The process is terminated via ``sys.exit`` when the
                model was built without a projection layer.

        Returns:
            One row of tag scores (or of projected embeddings) per word.
        """
        # compute the character-based word representations
        word_reprs = self.word_representations(fwd_charIDs, bwd_charIDs)

        if word_embedding_training:
            if not hasattr(self, 'projection_layer'):
                sys.exit("Error: The embedding projection layer is undefined!")
            # Project the word representations to word embedding vectors
            # for finetuning on word embeddings as an auxiliary task
            word_embs = self.projection_layer(word_reprs)
            return word_embs

        # apply dropout
        word_reprs = self.dropout(word_reprs)

        # run the BiLSTM over words (the sentence is treated as a batch of size 1)
        reprs = self.word_rnn(word_reprs.unsqueeze(0)).squeeze(0)
        reprs = self.dropout(reprs)  # and apply dropout

        # apply the output layers
        scores = self.output_layer(reprs)

        return scores
a/MHGTagger/__pycache__/CRFTagger.cpython-310.pyc b/MHGTagger/__pycache__/CRFTagger.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..59c5064bbb75ae5ba1bb67a97026989e4817485d Binary files /dev/null and b/MHGTagger/__pycache__/CRFTagger.cpython-310.pyc differ diff --git a/MHGTagger/__pycache__/CRFTagger.cpython-37.pyc b/MHGTagger/__pycache__/CRFTagger.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d4def17fb7a1ed805b7bc9d9f430babe54410069 Binary files /dev/null and b/MHGTagger/__pycache__/CRFTagger.cpython-37.pyc differ diff --git a/MHGTagger/__pycache__/CRFTagger.cpython-38.pyc b/MHGTagger/__pycache__/CRFTagger.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f8b1f5d2e1c5aaf40bd84cc63818da0d83f186b3 Binary files /dev/null and b/MHGTagger/__pycache__/CRFTagger.cpython-38.pyc differ diff --git a/MHGTagger/__pycache__/Data.cpython-37.pyc b/MHGTagger/__pycache__/Data.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..60a78608bfb458782ab91d0c6b1dfd8da9b70daa Binary files /dev/null and b/MHGTagger/__pycache__/Data.cpython-37.pyc differ diff --git a/MHGTagger/__pycache__/Data.cpython-38.pyc b/MHGTagger/__pycache__/Data.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a508b228249e607f8e124e2cfcb681cb65cbaf3c Binary files /dev/null and b/MHGTagger/__pycache__/Data.cpython-38.pyc differ diff --git a/MHGTagger/__pycache__/NMT.cpython-310.pyc b/MHGTagger/__pycache__/NMT.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8ea23c210be29715e86ed5a4792caa8d204b0e94 Binary files /dev/null and b/MHGTagger/__pycache__/NMT.cpython-310.pyc differ diff --git a/MHGTagger/__pycache__/NMTData.cpython-310.pyc b/MHGTagger/__pycache__/NMTData.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..95d9e53ddb77087e5d7d9eb71a17f09daf748592 Binary files /dev/null 
and b/MHGTagger/__pycache__/NMTData.cpython-310.pyc differ diff --git a/MHGTagger/__pycache__/RNNData.cpython-310.pyc b/MHGTagger/__pycache__/RNNData.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ecfc30fa485f3b10778524c98fb46af8ec497416 Binary files /dev/null and b/MHGTagger/__pycache__/RNNData.cpython-310.pyc differ diff --git a/MHGTagger/__pycache__/RNNData.cpython-37.pyc b/MHGTagger/__pycache__/RNNData.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b1753831519d35b4f67b321130e996b221e6c017 Binary files /dev/null and b/MHGTagger/__pycache__/RNNData.cpython-37.pyc differ diff --git a/MHGTagger/__pycache__/RNNData.cpython-38.pyc b/MHGTagger/__pycache__/RNNData.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c4391bd788654ab131a86dc8a789576a92c6d7bc Binary files /dev/null and b/MHGTagger/__pycache__/RNNData.cpython-38.pyc differ diff --git a/MHGTagger/__pycache__/RNNTagger.cpython-310.pyc b/MHGTagger/__pycache__/RNNTagger.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..380d43089f35bd30141a22fd8c2c3e94da43f426 Binary files /dev/null and b/MHGTagger/__pycache__/RNNTagger.cpython-310.pyc differ diff --git a/MHGTagger/__pycache__/RNNTagger.cpython-37.pyc b/MHGTagger/__pycache__/RNNTagger.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..36260cf254193ba40980a4dbcb21ff8b89863e14 Binary files /dev/null and b/MHGTagger/__pycache__/RNNTagger.cpython-37.pyc differ diff --git a/MHGTagger/__pycache__/RNNTagger.cpython-38.pyc b/MHGTagger/__pycache__/RNNTagger.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c0fd8c4cb7f236f1e8fb4eee812ca3eb7638061a Binary files /dev/null and b/MHGTagger/__pycache__/RNNTagger.cpython-38.pyc differ diff --git a/MHGTagger/__pycache__/rnn_annotate.cpython-38.pyc b/MHGTagger/__pycache__/rnn_annotate.cpython-38.pyc new file mode 100644 
###########################################################################
# main function
###########################################################################

class Args:
    """Plain attribute container for the options of :func:`annotate`.

    It mirrors the option object of the former command-line program so that
    the body of ``annotate`` can keep using ``args.<option>``.
    """

    def __init__(self, path_param, model_id, path_data, crf_beam_size, gpu, min_prob, print_probs) -> None:
        self.path_param = path_param
        self.model_id = model_id
        self.path_data = path_data
        self.crf_beam_size = crf_beam_size
        self.gpu = gpu
        self.min_prob = min_prob
        self.print_probs = print_probs


def annotate(tokens, path_param='MHGTagger/tagger', model_id='nielklug/rnn_tagger', path_data='', crf_beam_size=10, gpu=-1, min_prob=-1.0, print_probs=True):
    """Tag one tokenized sentence with the RNN tagger.

    Args:
        tokens: the words of the sentence.
        path_param: prefix of the parameter files (``.io`` and ``.hyper``).
        model_id: Hugging Face Hub repository holding ``tagger.rnn``.
        path_data: name of a file with input data (stored, not used here).
        crf_beam_size: size of the CRF beam (stored, not used here).
        gpu: selection of the GPU; -1 selects the CPU.
        min_prob: if not -1.0, all tags whose probability exceeds the
            probability of the best tag times this threshold are printed.
        print_probs: also report the tag probabilities.

    Returns:
        ``(words, tags, probs)``: three lists. ``tags`` holds the best tag of
        each word. ``probs`` holds the probability of that tag rounded to 4
        digits; it stays empty when ``print_probs`` is false or when the
        model has a CRF output layer (which yields no probabilities).
    """
    # The result lists are created once, before the sentence loop, so the
    # return statement never depends on the loop body having run.
    words_all = []
    tagged = []
    probs_all = []

    # nothing to tag: avoid loading the model and feeding it an empty batch
    if len(tokens) == 0:
        return (words_all, tagged, probs_all)

    args = Args(path_param, model_id, path_data, crf_beam_size, gpu, min_prob, print_probs)

    # Select the processing device
    if args.gpu >= 0:
        if not torch.cuda.is_available():
            print('No gpu available. Using cpu instead.', file=sys.stderr)
            args.gpu = -1
        else:
            if args.gpu >= torch.cuda.device_count():
                print('gpu '+str(args.gpu)+' not available. Using gpu 0 instead.', file=sys.stderr)
                args.gpu = 0
            torch.cuda.set_device(args.gpu)
    device = torch.device('cuda' if args.gpu >= 0 else 'cpu')

    # load parameters
    data = Data(args.path_param+'.io')  # read the symbol mapping tables

    # NOTE: pickle.load and torch.load execute code stored in the files;
    # the parameter files and the downloaded model must come from a trusted source.
    with open(args.path_param+'.hyper', 'rb') as file:
        hyper_params = pickle.load(file)
    # NOTE(review): a CRF model is recognised by 10 hyper-parameters --
    # confirm that this matches the number of arguments CRFTagger accepts.
    model = CRFTagger(*hyper_params) if len(hyper_params)==10 \
        else RNNTagger(*hyper_params)

    model_file = hf_hub_download(repo_id=args.model_id, filename='tagger.rnn')
    model.load_state_dict(torch.load(model_file,
                                     map_location=torch.device('cpu')))

    model = model.to(device)

    if isinstance(model, CRFTagger):
        # -1.0 means "min_prob not set"; it is truthy, so test it explicitly
        # instead of warning about an option the caller never used.
        ignored = (("min_prob", args.min_prob != -1.0),
                   ("print_probs", bool(args.print_probs)))
        for option, is_set in ignored:
            if is_set:
                print(f"Warning: Option --{option} is ignored because the model has a CRF output layer", file=sys.stderr)

    model.eval()
    with torch.no_grad():
        for words in data.single_sentences(tokens):

            # map words to numbers and create Torch variables
            fwd_charIDs, bwd_charIDs = data.words2charIDvec(words)
            fwd_charIDs = torch.LongTensor(fwd_charIDs).to(device)
            bwd_charIDs = torch.LongTensor(bwd_charIDs).to(device)

            # run the model
            if isinstance(model, RNNTagger):
                tagscores = model(fwd_charIDs, bwd_charIDs)
                if args.min_prob == -1.0:
                    # only report the word and tag with the highest score
                    tagIDs = tagscores.argmax(-1)
                    tags = data.IDs2tags(tagIDs.to("cpu"))
                    for word, tag in zip(words, tags):
                        words_all.append(word)
                        tagged.append(tag)
                    if args.print_probs:
                        # report probabilities as well
                        tagprobs = torch.nn.functional.softmax(tagscores, dim=-1)
                        # get the probabilities of the highest-scoring tags
                        probs = tagprobs[range(len(tagIDs)), tagIDs].to("cpu").tolist()
                        for _, prob in zip(words, probs):
                            probs_all.append(round(float(prob), 4))
                else:
                    # print the best tags for each word
                    tagprobs = torch.nn.functional.softmax(tagscores, dim=-1)
                    # get the most probable tag and its probability
                    best_probs, _ = tagprobs.max(-1)
                    # get all tags with a probability above best_prob * min_prob
                    thresholds = best_probs * args.min_prob
                    greaterflags = (tagprobs > thresholds.unsqueeze(1))
                    for word, flags, probs in zip(words, greaterflags, tagprobs):
                        # get the IDs of the best tags
                        IDs = flags.nonzero().flatten().to("cpu").tolist()
                        # pair the tags with their probabilities and
                        # sort them by decreasing probability
                        ranked = sorted(
                            zip(data.IDs2tags(IDs), [float(probs[ID]) for ID in IDs]),
                            key=lambda pair: -pair[1])
                        if not ranked:
                            # no tag exceeds the threshold (e.g. min_prob >= 1):
                            # fall back to the single most probable tag
                            best = int(probs.argmax())
                            ranked = [(data.IDs2tags([best])[0], float(probs[best]))]
                        # generate the output
                        if args.print_probs:
                            # append the probabilities to the tags
                            shown = [f"{t} {p:.4f}" for t, p in ranked]
                        else:
                            shown = [t for t, _ in ranked]
                        print(word, ' '.join(shown), sep="\t")
                        # the returned lists carry the most probable tag only
                        words_all.append(word)
                        tagged.append(ranked[0][0])
                        if args.print_probs:
                            probs_all.append(round(ranked[0][1], 4))
            elif isinstance(model, CRFTagger):
                tagIDs = model(fwd_charIDs, bwd_charIDs)
                tags = data.IDs2tags(tagIDs)
                for word, tag in zip(words, tags):
                    print(word, tag, sep='\t')
                    # also hand the result back to the caller
                    words_all.append(word)
                    tagged.append(tag)
            else:
                sys.exit('Error')

    return (words_all, tagged, probs_all)
def _load_mapping(filename):
    """Read a two-column, whitespace-separated mapping file into a dict.

    The file is looked up next to this module, so loading no longer depends
    on the current working directory. Empty lines are skipped.
    """
    import os
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)), filename)
    with open(path, encoding="utf-8") as file:
        return dict(line.split() for line in file if line.strip())


# source POS tag -> target POS tag (optionally with fixed features, e.g. "ADJD.Pos")
pos_map = _load_mapping("POS-mapping.txt")

# source feature -> target feature
feature_map = _load_mapping("feature-mapping.txt")


def map_tags(tags):
    """Map every tag of a list with :func:`map_tag`."""
    return [map_tag(tag) for tag in tags]


def map_tag(tag):
    """Map one source tag (``POS.feat1.feat2...``) to the target tagset.

    The POS part (its first ``|`` alternative) is translated with
    ``pos_map``, the features with ``feature_map``; the features are then
    reordered, padded with ``*`` or dropped depending on the target POS.

    Args:
        tag: the source tag.

    Returns:
        The target POS and its features joined with ``.``.

    Raises:
        KeyError: if the POS part is not listed in ``pos_map``.
    """
    # The former statement
    #     tag.replace('AVD.Comp', 'AVD').replace('AVD.Sup', 'AVD')
    # discarded its result (strings are immutable) and has been dropped:
    # AVD is mapped to ADV below, and the ADV branch removes all features.
    pos, *features = tag.split(".")
    pos = pos.split('|')[0]
    pos = pos_map[pos]
    # the mapped POS may carry features of its own; they come first
    pos, *features2 = pos.split(".")
    features = features2 + features
    features = [feature_map.get(f, f) for f in features]
    n = len(features)

    if pos == 'ADJA':
        if n == 5:
            features = [features[0], features[2], features[3], features[1]]
        elif n in [3, 4]:
            features = [features[0], features[2], '*', features[1]]
        elif n == 2:
            features = [features[0], '*', '*', features[1]]
        elif n == 1:
            features = [features[0], '*', '*', '*']
    elif pos in ['ADV', 'CARD']:
        features = []
    elif pos in ['ART', 'APPRART']:
        if n == 4:
            features = [features[1], features[2], features[0]]
        elif n in [0, 1]:
            features = ['*', '*', '*']
    elif pos == 'NN':
        if n == 4:
            features = [features[1], features[2], features[0]]
        elif n == 0:
            features = ['*', '*', '*']
    elif pos == 'NE':
        if n == 2:
            features.append('*')
        elif n == 1:
            features.extend(['*', '*'])
    elif pos == 'PDAT':
        if n == 4:
            features = [features[1], features[2], features[0]]
        elif n == 0:
            features = ['*', '*', '*']
    elif pos == 'PIAT':
        if n == 4:
            features = [features[1], features[2], features[0]]
        elif n == 2:
            features = [features[1], '*', features[0]]
        elif n == 0:
            features = ['*', '*', '*']
    elif pos == 'PPOSAT':
        if n in [3, 4]:
            features = [features[1], features[2], features[0]]
        elif n == 0:
            features = ['*', '*', '*']
    elif pos == 'PWAT' and n == 4:
        features = [features[1], features[2], features[0]]
    elif pos == 'PPOSS':
        features = ['*.*.*']
    elif pos == 'PDS':
        if n == 4:
            features = [features[1], features[2], features[0]]
        elif n == 1:
            features.extend(['*', '*'])
        elif n == 2:
            features = [features[1], '*', '*']
    elif pos == 'PIS':
        if n == 4:
            features = [features[1], features[2], features[0]]
        elif n == 0:
            features = ['*', '*', '*']
    elif pos == 'PWS':
        if n == 4:
            features = [features[1], features[2], features[0]]
        elif n == 0:
            features = ['*', '*', '*']
    elif pos == 'PRELS' and n == 3:
        features = [features[1], features[2], features[0]]
    elif pos == 'PPER' and n == 4:
        features = [features[3], features[1], features[2], features[0]]
    elif pos == 'PRF' and n == 3:
        features = ['*', features[0], features[1]]
    elif pos in ['VAFIN', 'VMFIN', 'VVFIN'] and n == 4:
        features = [features[3], features[2], features[1], features[0]]
    elif pos in ['VAIMP', 'VMIMP', 'VVIMP'] and n == 2:
        features = [features[1], features[0], 'Imp']
    elif pos in ['VAINF', 'VMINF', 'VVINF'] and n == 0:
        features = ['Inf']
    elif pos in ['VAPP', 'VMPP', 'VVPP'] and n == 0:
        features = ['Psp']
    return '.'.join([pos]+features)
def parse_text(text):
    """Tokenize, tag and parse a text.

    Returns the tokens, their mapped tags, the tag probabilities and the
    first parse tree returned by ``run_parse``.
    """
    words, raw_tags, probs = annotate(tokenize(text))
    mapped_tags = map_tags(raw_tags)
    trees = run_parse(words, mapped_tags)
    return words, mapped_tags, probs, trees[0]


def tokenize(text: str):
    """Surround punctuation marks with spaces and split the text into tokens."""
    # two passes: the second one catches a punctuation mark that directly
    # precedes another one and was therefore skipped by the first pass
    for pattern in (r'\s*([.,;:?!"])\s', r'\s*([.,;:?!"]) '):
        text = re.sub(pattern, r' \1 ', text)
    return word_tokenize(text)
+##------------------------------------------## +## Debug mode ## +## 0: No debugging ## +## 1: print data for individual sentence ## +##------------------------------------------## +DEBUG 0 + +##------------------------------------------## +## MAX error ## +## Number of errors to stop the process. ## +## This is useful if there could be ## +## tokenization errors. ## +## The process will stop when this number## +## of errors are accumulated. ## +##------------------------------------------## +MAX_ERROR 10 + +##------------------------------------------## +## Cut-off length for statistics ## +## At the end of evaluation, the ## +## statistics for the sentences of length## +## less than or equal to this number will## +## be shown, on top of the statistics ## +## for all the sentences ## +##------------------------------------------## +CUTOFF_LEN 40 + +##------------------------------------------## +## unlabeled or labeled bracketing ## +## 0: unlabeled bracketing ## +## 1: labeled bracketing ## +##------------------------------------------## +LABELED 1 + +##------------------------------------------## +## Delete labels ## +## list of labels to be ignored. ## +## If it is a pre-terminal label, delete ## +## the word along with the brackets. ## +## If it is a non-terminal label, just ## +## delete the brackets (don't delete ## +## children). ## +##------------------------------------------## +DELETE_LABEL TOP +DELETE_LABEL -NONE- +DELETE_LABEL , +DELETE_LABEL : +DELETE_LABEL `` +DELETE_LABEL '' +DELETE_LABEL . + +##------------------------------------------## +## Delete labels for length calculation ## +## list of labels to be ignored for ## +## length calculation purpose ## +##------------------------------------------## +DELETE_LABEL_FOR_LENGTH -NONE- + +##------------------------------------------## +## Equivalent labels, words ## +## the pairs are considered equivalent ## +## This is non-directional. 
## +##------------------------------------------## +EQ_LABEL ADVP PRT + +# EQ_WORD Example example diff --git a/parsing/EVALB/LICENSE b/parsing/EVALB/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..68a49daad8ff7e35068f2b7a97d643aab440eaec --- /dev/null +++ b/parsing/EVALB/LICENSE @@ -0,0 +1,24 @@ +This is free and unencumbered software released into the public domain. + +Anyone is free to copy, modify, publish, use, compile, sell, or +distribute this software, either in source code form or as a compiled +binary, for any purpose, commercial or non-commercial, and by any +means. + +In jurisdictions that recognize copyright laws, the author or authors +of this software dedicate any and all copyright interest in the +software to the public domain. We make this dedication for the benefit +of the public at large and to the detriment of our heirs and +successors. We intend this dedication to be an overt act of +relinquishment in perpetuity of all present and future rights to this +software under copyright law. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +OTHER DEALINGS IN THE SOFTWARE. 
+ +For more information, please refer to <https://unlicense.org> diff --git a/parsing/EVALB/Makefile b/parsing/EVALB/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..0fe4ada5512816f87c8769547ce9992a857636a0 --- /dev/null +++ b/parsing/EVALB/Makefile @@ -0,0 +1,4 @@ +all: evalb + +evalb: evalb.c + gcc -Wall -g -o evalb evalb.c diff --git a/parsing/EVALB/README b/parsing/EVALB/README new file mode 100644 index 0000000000000000000000000000000000000000..106e927eaf73e1d9aebf5c7dd7e8f4f47748bdf1 --- /dev/null +++ b/parsing/EVALB/README @@ -0,0 +1,300 @@ +################################################################# +# # +# Bug fix and additional functionality for evalb # +# # +# This updated version of evalb fixes a bug in which sentences # +# were incorrectly categorized as "length mismatch" when # +# the parse output had certain mislabeled parts-of-speech. # +# # +# The bug was the result of evalb treating one of the tags (in # +# gold or test) as a label to be deleted (see sections [6],[7] # +# for details), but not the corresponding tag in the other. # +# This most often occurs with punctuation. See the subdir # +# "bug" for an example gld and tst file demonstrating the bug, # +# as well as output of evalb with and without the bug fix. # +# # +# For the present version in case of length mismatch, the nodes # +# causing the imbalance are reinserted to resolve the miscount. # +# If the lengths of gold and test truly differ, the error is # +# still reported. The parameter file "new.prm" (derived from # +# COLLINS.prm) shows how to add new potential mislabelings for # +# quotes (",``,',`). # +# # +# I have preserved DJB's revision for modern compilers except # +# for the declaration of "exit" which is provided by stdlib. 
# +# Other changes: # +# # +# * output of F-Measure in addition to precision and recall # +# (I did not update the documentation in section [4] for this) # +# # +# * more comprehensive DEBUG output that includes bracketing # +# information as evalb is processing each sentence # +# (useful in working through this, and perhaps other bugs). # +# Use either the "-D" run-time switch or set DEBUG to 2 in # +# the parameter file. # +# # +# * added DELETE_LABEL lines in new.prm for S1 nodes produced # +# by the Charniak parser and "?", "!" punctuation produced by # +# the Bikel parser. # +# # +# # +# David Ellis (Brown) # +# # +# January.2006 # +################################################################# + +################################################################# +# # +# Update of evalb for modern compilers # +# # +# This is an updated version of evalb, for use with modern C # +# compilers. There are a few updates, each marked in the code: # +# # +# /* DJB: explanation of comment */ # +# # +# The updates are purely to help compilation with recent # +# versions of GCC (and other C compilers). There are *NO* other # +# changes to the algorithm itself. # +# # +# I have made these changes following recommendations from # +# users of the Corpora Mailing List, especially Peet Morris and # +# Ramon Ziai. 
# +# # +# David Brooks (Birmingham) # +# # +# September.2005 # +################################################################# + +################################################################# +# # +# README file for evalb # +# # +# Satoshi Sekine (NYU) # +# Mike Collins (UPenn) # +# # +# October.1997 # +################################################################# + +Contents of this README: + + [0] COPYRIGHT + [1] INTRODUCTION + [2] INSTALLATION AND RUN + [3] OPTIONS + [4] OUTPUT FORMAT FROM THE SCORER + [5] HOW TO CREATE A GOLDFILE FROM THE TREEBANK + [6] THE PARAMETER FILE + [7] MORE DETAILS ABOUT THE SCORING ALGORITHM + + +[0] COPYRIGHT + +The authors abandon the copyright of this program. Everyone is +permitted to copy and distribute the program or a portion of the program +with no charge and no restrictions unless it is harmful to someone. + +However, the authors would be grateful for the user's kindness in proper +usage and in letting the authors know of bugs or problems. + +This software is provided "AS IS", and the authors make no warranties, +express or implied. + +To legally enforce the abandonment of copyright, this package is released +under the Unlicense (see LICENSE). + +[1] INTRODUCTION + +Evaluation of bracketing looks simple, but in fact, there are minor +differences from system to system. This is a program to parameterize +such minor differences and to give an informative result. + +"evalb" evaluates bracketing accuracy in a test-file against a gold-file. +It returns recall, precision, and tagging accuracy. It uses an identical +algorithm to that used in (Collins ACL97). + + +[2] Installation and Run + +To compile the scorer, type + +> make + + +To run the scorer: + +> evalb -p Parameter_file Gold_file Test_file + + +For example to use the sample files: + +> evalb -p sample.prm sample.gld sample.tst + + + +[3] OPTIONS + +You can specify system parameters in the command line options. 
+Other options concerning evaluation metrics should be specified +in the parameter file, described later. + + -p param_file parameter file + -d debug mode + -e n number of error to kill (default=10) + -h help + + + +[4] OUTPUT FORMAT FROM THE SCORER + +The scorer gives individual scores for each sentence, for +example: + + Sent. Matched Bracket Cross Correct Tag + ID Len. Stat. Recal Prec. Bracket gold test Bracket Words Tags Accracy +============================================================================ + 1 8 0 100.00 100.00 5 5 5 0 6 5 83.33 + +At the end of the output the === Summary === section gives statistics +for all sentences, and for sentences <=40 words in length. The summary +contains the following information: + +i) Number of sentences -- total number of sentences. + +ii) Number of Error/Skip sentences -- should both be 0 if there is no + problem with the parsed/gold files. + +iii) Number of valid sentences = Number of sentences - Number of Error/Skip + sentences + +iv) Bracketing recall = (number of correct constituents) + ---------------------------------------- + (number of constituents in the goldfile) + +v) Bracketing precision = (number of correct constituents) + ---------------------------------------- + (number of constituents in the parsed file) + +vi) Complete match = percentage of sentences where recall and precision are + both 100%. + +vii) Average crossing = (number of constituents crossing a goldfile constituent) + ---------------------------------------------------- + (number of sentences) + +viii) No crossing = percentage of sentences which have 0 crossing brackets. + +ix) 2 or less crossing = percentage of sentences which have <=2 crossing brackets. + +x) Tagging accuracy = percentage of correct POS tags (but see [7].3 for exact + details of what is counted). 
+ + + +[5] HOW TO CREATE A GOLDFILE FROM THE PENN TREEBANK + + +The gold and parsed files are in a format similar to this: + +(TOP (S (INTJ (RB No)) (, ,) (NP (PRP it)) (VP (VBD was) (RB n't) (NP (NNP Black) (NNP Monday))) (. .))) + +To create a gold file from the treebank: + +tgrep -wn '/.*/' | tgrep_proc.prl + +will produce a goldfile in the required format. ("tgrep -wn '/.*/'" prints +parse trees, "tgrep_process.prl" just skips blank lines). + +For example, to produce a goldfile for section 23 of the treebank: + +tgrep -wn '/.*/' | tail +90895 | tgrep_process.prl | sed 2416q > sec23.gold + + + +[6] THE PARAMETER (.prm) FILE + + +The .prm file sets options regarding the scoring method. COLLINS.prm gives +the same scoring behaviour as the scorer used in (Collins 97). The options +chosen were: + +1) LABELED 1 + +to give labelled precision/recall figures, i.e. a constituent must have the +same span *and* label as a constituent in the goldfile. + +2) DELETE_LABEL TOP + +Don't count the "TOP" label (which is always given in the output of tgrep) +when scoring. + +3) DELETE_LABEL -NONE- + +Remove traces (and all constituents which dominate nothing but traces) when +scoring. For example + +.... (VP (VBD reported) (SBAR (-NONE- 0) (S (-NONE- *T*-1)))) (. .))) + +would be processed to give + +.... (VP (VBD reported)) (. .))) + + +4) +DELETE_LABEL , -- for the purposes of scoring remove punctuation +DELETE_LABEL : +DELETE_LABEL `` +DELETE_LABEL '' +DELETE_LABEL . + +5) DELETE_LABEL_FOR_LENGTH -NONE- -- don't include traces when calculating + the length of a sentence (important + when classifying a sentence as <=40 + words or >40 words) + +6) EQ_LABEL ADVP PRT + +Count ADVP and PRT as being the same label when scoring. + + + + +[7] MORE DETAILS ABOUT THE SCORING ALGORITHM + + +1) The scorer initially processes the files to remove all nodes specified +by DELETE_LABEL in the .prm file. 
It also recursively removes nodes which +dominate nothing due to all their children being removed. For example, if +-NONE- is specified as a label to be deleted, + +.... (VP (VBD reported) (SBAR (-NONE- 0) (S (-NONE- *T*-1)))) (. .))) + +would be processed to give + +.... (VP (VBD reported)) (. .))) + +2) The scorer also removes all functional tags attached to non-terminals +(functional tags are prefixed with "-" or "=" in the treebank). For example +"NP-SBJ" is processed to give "NP", "NP=2" is changed to "NP". + + +3) Tagging accuracy counts tags for all words *except* any tags which are +deleted by a DELETE_LABEL specification in the .prm file. (For example, for +COLLINS.prm, punctuation tagged as "," ":" etc. would not be included). + +4) When calculating the length of a sentence, all words with POS tags not +included in the "DELETE_LABEL_FOR_LENGTH" list in the .prm file are +counted. (For COLLINS.prm, only "-NONE-" is specified in this list, so +traces are removed before calculating the length of the sentence). + +5) There are some subtleties in scoring when either the goldfile or parsed +file contains multiple constituents for the same span which have the same +non-terminal label. e.g. (NP (NP the man)) If the goldfile contains n +constituents for the same span, and the parsed file contains m constituents +with that nonterminal, the scorer works as follows: + +i) If m>n, then the precision is n/m, recall is 100% + +ii) If n>m, then the precision is 100%, recall is m/n. + +iii) If n==m, recall and precision are both 100%. 
diff --git a/parsing/EVALB/bug/bug.gld b/parsing/EVALB/bug/bug.gld new file mode 100644 index 0000000000000000000000000000000000000000..288a25400f7939d2de2379c46f69ec1e91f0df04 --- /dev/null +++ b/parsing/EVALB/bug/bug.gld @@ -0,0 +1,5 @@ +(TOP (S (NP-SBJ (DT The) (NN Thy-1) (NN gene) (NN promoter) ) (VP (VBZ resembles) (NP (DT a) (`` ") (JJ housekeeping) ('' ") (NN promoter) ) (PP (IN in) (SBAR (IN that) (S (NP-SBJ-68 (PRP it) ) (VP-COOD (VP (VBZ is) (ADJP-PRD (JJ located) (PP (IN within) (NP (DT a) (JJ methylation-free) (NN island) )))) (, ,) (VP (VBZ lacks) (NP (DT a) (JJ canonical) (NN TATA) (NN box) )) (, ,) (CC and) (VP (VBZ displays) (NP (NN heterogeneity) ) (PP (IN in) (NP (NP (DT the) (JJ 5'-end) (NNS termini) ) (PP (IN of) (NP (DT the) (NN mRNA) )))))))))) (. .) ) ) +(TOP (S (NP-SBJ (DT The) (JJ latter) (`` ") (NP (NP (JJ nuclear) (NN factor) ) (PP (IN for) (NP (VBN activated) (NN T) (NNS cells) ))) ('' ") ) (ADVP (RB likely) ) (VP (VBZ contributes) (PP (TO to) (NP (NP (DT the) (NN tissue) (NN specificity) ) (PP (IN of) (NP (NN IL-2) (NN gene) (NN expression) ))))) (. .) ) ) +(TOP (S (ADVP (RB Thus) ) (, ,) (NP-SBJ (PRP we) ) (VP (VBD postulated) (SBAR-COOD (SBAR (IN that) (S (NP-SBJ (NP (DT the) (JJ circadian) (NN modification) ) (PP (IN of) (NP (NN GR) ))) (VP (VBD was) (ADJP-PRD (JJ independent) (PP (IN of) (NP-COOD (NP (NP (DT the) (JJ diurnal) (NNS fluctuations) ) (PP (IN in) (NP (NN plasma) (NN cortisol) (NN level) ))) (CC or) (NP (NP (DT the) (JJ circadian) (NNS variations) ) (PP (IN in) (NP (JJ environmental) (NN lighting) ))))))))) (CC and) (SBAR (IN that) (S (NP-SBJ-79 (DT the) (NN rhythmicity) ) (VP (MD might) (VP (VB be) (VP (VBN regulated) (NP (-NONE- *-79) ) (PP (IN by) (NP-LGS (NP (DT the) (`` ') (JJ circadian) (NN pacemaker) ('' ') ) (ADJP (JJ located) (PP (IN in) (NP (DT the) (JJ human) (JJ basal) (NN brain) )))))))))))) (. .) 
) ) +(TOP (S (NP-SBJ-70 (JJ Such) (NN transcription) (NNS factors) ) (VP (VBP play) (NP (DT a) (JJ key) (NN role) ) (PP (IN in) (NP (NP (DT the) (NN development) ) (PP (IN of) (NP (DT the) (JJ mature) (NN T-cell) (NN phenotype) )))) (PP (IN by) (S (NP-SBJ (-NONE- *-70) ) (VP (VBG functioning) (PP (IN as) (`` ') (NP (NP (JJ master) (NNS regulators) ) (PP (IN of) (NP (NN T-cell) (NN differentiation) ))) ('' ') ))))) (. .) ) ) +(TOP (S (NP-SBJ (NP (DT The) (NN conversion) ) (PP (IN of) (NP (DT the) (NN TCEd) )) (PP (TO to) (NP (DT a) (`` ') (JJ perfect) ('' ') (NN NF-kB) (NN binding) (NN site) ))) (VP-COOD (VP (VBZ leads) (PP (TO to) (NP-19 (NP (DT a) (JJR tighter) (NN binding) ) (PP (IN of) (NP (NN NF-kB) )) (PP (TO to) (NP (NN TCEd) (NN DNA) ))))) (CC and) (, ,) (VP (PP (IN as) (NP (DT a) (JJ functional) (NN consequence) )) (, ,) (PP (TO to) (NP=19 (NP (DT the) (NN activity) ) (PP (IN of) (NP (DT the) (`` ') (VBN converted) ('' ') (NN TCEd) (NNS motifs) )) (PP (IN in) (NP (NN HeLa) (NNS cells) )))))) (. .) ) ) diff --git a/parsing/EVALB/bug/bug.rsl-new b/parsing/EVALB/bug/bug.rsl-new new file mode 100644 index 0000000000000000000000000000000000000000..4b143a2a283e278ec844cbf1a5b37269cb2d7387 --- /dev/null +++ b/parsing/EVALB/bug/bug.rsl-new @@ -0,0 +1,39 @@ +Sent. Matched Bracket Cross Correct Tag +ID Len. Stat. Recal Prec. 
Bracket gold test Bracket Words Tags Accracy +============================================================================ +1 37 0 77.27 65.38 17 22 26 5 34 27 79.41 +2 21 0 69.23 64.29 9 13 14 2 20 16 80.00 +3 47 0 80.00 82.35 28 35 34 4 44 40 90.91 +4 26 0 35.29 37.50 6 17 16 8 25 18 72.00 +5 44 0 42.31 33.33 11 26 33 17 38 28 73.68 +============================================================================ + 62.83 57.72 71 113 123 0 161 129 80.12 +=== Summary === + +-- All -- +Number of sentence = 5 +Number of Error sentence = 0 +Number of Skip sentence = 0 +Number of Valid sentence = 5 +Bracketing Recall = 62.83 +Bracketing Precision = 57.72 +Bracketing FMeasure = 60.17 +Complete match = 0.00 +Average crossing = 7.20 +No crossing = 0.00 +2 or less crossing = 20.00 +Tagging accuracy = 80.12 + +-- len<=40 -- +Number of sentence = 3 +Number of Error sentence = 0 +Number of Skip sentence = 0 +Number of Valid sentence = 3 +Bracketing Recall = 61.54 +Bracketing Precision = 57.14 +Bracketing FMeasure = 59.26 +Complete match = 0.00 +Average crossing = 5.00 +No crossing = 0.00 +2 or less crossing = 33.33 +Tagging accuracy = 77.22 diff --git a/parsing/EVALB/bug/bug.rsl-old b/parsing/EVALB/bug/bug.rsl-old new file mode 100644 index 0000000000000000000000000000000000000000..3f10bc04a014f090ac80690e96daebb576c61931 --- /dev/null +++ b/parsing/EVALB/bug/bug.rsl-old @@ -0,0 +1,45 @@ +Sent. Matched Bracket Cross Correct Tag +ID Len. Stat. Recal Prec. 
Bracket gold test Bracket Words Tags Accracy +============================================================================ +1 : Length unmatch (33|35) + 1 37 1 0.00 0.00 0 0 0 0 0 0 0.00 +2 : Length unmatch (19|21) + 2 21 1 0.00 0.00 0 0 0 0 0 0 0.00 +3 : Length unmatch (44|45) + 3 47 1 0.00 0.00 0 0 0 0 0 0 0.00 +4 : Length unmatch (24|26) + 4 26 1 0.00 0.00 0 0 0 0 0 0 0.00 +5 : Length unmatch (38|39) + 5 44 1 0.00 0.00 0 0 0 0 0 0 0.00 +============================================================================ + 0 0 0.00 + +=== Summary === + +-- All -- +Number of sentence = 5 +Number of Error sentence = 5 +Number of Skip sentence = 0 +Number of Valid sentence = 0 +Bracketing Recall = 0.00 +Bracketing Precision = 0.00 +Bracketing FMeasure = nan +Complete match = 0.00 +Average crossing = 0.00 +No crossing = 0.00 +2 or less crossing = 0.00 +Tagging accuracy = 0.00 + +-- len<=40 -- +Number of sentence = 3 +Number of Error sentence = 3 +Number of Skip sentence = 0 +Number of Valid sentence = 0 +Bracketing Recall = 0.00 +Bracketing Precision = 0.00 +Bracketing FMeasure = nan +Complete match = 0.00 +Average crossing = 0.00 +No crossing = 0.00 +2 or less crossing = 0.00 +Tagging accuracy = 0.00 diff --git a/parsing/EVALB/bug/bug.tst b/parsing/EVALB/bug/bug.tst new file mode 100644 index 0000000000000000000000000000000000000000..d6b51942b4bbc45c2ec029b713cfcbc5c0117e38 --- /dev/null +++ b/parsing/EVALB/bug/bug.tst @@ -0,0 +1,5 @@ +(S1 (S (NP (DT The) (JJ Thy-1) (NN gene) (NN promoter)) (VP (VP (VBZ resembles) (NP (NP (DT a) (ADJP (CD ") (NN housekeeping)) (NN ") (NN promoter)) (SBAR (WHPP (IN in) (WHNP (WDT that))) (S (NP (PRP it)) (VP (VBZ is) (VP (VBN located) (PP (IN within) (NP (DT a) (JJ methylation-free) (NN island))))))))) (, ,) (VP (VBZ lacks) (NP (DT a) (JJ canonical) (NNP TATA) (NN box))) (, ,) (CC and) (VP (VBZ displays) (NP (NP (NN heterogeneity)) (PP (IN in) (NP (NP (DT the) (JJ 5'-end) (NNS termini)) (PP (IN of) (NP (DT the) (NN mRNA)))))))) (. 
.))) +(S1 (S (NP (NP (DT The) (JJ latter) (CD ") (JJ nuclear) (NN factor)) (PP (IN for) (NP (VBN activated) (NN T) (NNS cells)))) (VP (VBZ ") (ADJP (JJ likely) (S (VP (VBZ contributes) (PP (TO to) (NP (NP (DT the) (NN tissue) (NN specificity)) (PP (IN of) (NP (JJ IL-2) (NN gene) (NN expression))))))))) (. .))) +(S1 (S (ADVP (RB Thus)) (, ,) (NP (PRP we)) (VP (VBD postulated) (SBAR (SBAR (IN that) (S (NP (NP (DT the) (JJ circadian) (NN modification)) (PP (IN of) (NP (NNP GR)))) (VP (VBD was) (ADJP (JJ independent) (PP (IN of) (NP (DT the) (JJ diurnal) (NNS fluctuations)))) (PP (IN in) (NP (NP (NN plasma) (JJ cortisol) (NN level)) (CC or) (NP (NP (DT the) (JJ circadian) (NNS variations)) (PP (IN in) (NP (JJ environmental) (NN lighting))))))))) (CC and) (SBAR (IN that) (S (NP (DT the) (NN rhythmicity)) (VP (MD might) (VP (VB be) (VP (VBN regulated) (PP (IN by) (NP (DT the) ('' ') (NP (JJ circadian) (NN pacemaker) (POS ')) (VP (VBN located) (PP (IN in) (NP (DT the) (JJ human) (JJ basal) (NN brain))))))))))))) (. .))) +(S1 (S (NP (JJ Such) (NN transcription) (NNS factors)) (VP (VBP play) (NP (NP (DT a) (JJ key) (NN role)) (PP (IN in) (NP (NP (DT the) (NN development)) (PP (IN of) (NP (NP (DT the) (JJ mature) (JJ T-cell) (NN phenotype)) (PP (IN by) (NP (NP (NN functioning) (RB as) (POS ')) (NN master) (NNS regulators))))) (PP (IN of) (NP (JJ T-cell) (NN differentiation) (POS '))))))) (. 
.))) +(S1 (S (NP (NP (DT The) (NN conversion)) (PP (IN of) (NP (DT the)))) (VP (VBD TCEd) (PP (TO to) (NP (NP (DT a) ('' ') (JJ perfect) ('' ') (NN NF-kB)) (SBAR (S (NP (JJ binding) (NN site)) (VP (VBZ leads) (PP (TO to) (NP (NP (NP (DT a) (ADJP (RBR tighter) (JJ binding)) (PP (IN of) (NP (NP (NNS NF-kB)) (PP (PP (TO to) (NP (JJ TCEd) (NN DNA))) (CC and) (PP (, ,) (PP (IN as) (NP (DT a) (JJ functional) (NN consequence))) (, ,) (TO to) (NP (NP (DT the) (NN activity)) (PP (IN of) (NP (DT the)))))))) (POS ')) (JJ converted) ('' ') (JJ TCEd) (NNS motifs)) (PP (IN in) (NP (NNP HeLa) (NNS cells))))))))))) (. .))) diff --git a/parsing/EVALB/evalb b/parsing/EVALB/evalb new file mode 100755 index 0000000000000000000000000000000000000000..908d298243a797a7c666252ac79fd646fd5b34c5 Binary files /dev/null and b/parsing/EVALB/evalb differ diff --git a/parsing/EVALB/evalb.c b/parsing/EVALB/evalb.c new file mode 100644 index 0000000000000000000000000000000000000000..9a3be2de2df01868fc1bbb6993bcee94766825b1 --- /dev/null +++ b/parsing/EVALB/evalb.c @@ -0,0 +1,1537 @@ +/*****************************************************************/ +/* evalb [-p param_file] [-dh] [-e n] gold-file test-file */ +/* */ +/* Evaluate bracketing in test-file against gold-file. */ +/* Return recall, precision, tagging accuracy. */ +/* */ +/*