Spaces:
Sleeping
Sleeping
init
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- MHGTagger/CRFTagger.py +110 -0
- MHGTagger/Data.py +186 -0
- MHGTagger/RNNTagger.py +111 -0
- MHGTagger/__pycache__/CRFTagger.cpython-310.pyc +0 -0
- MHGTagger/__pycache__/CRFTagger.cpython-37.pyc +0 -0
- MHGTagger/__pycache__/CRFTagger.cpython-38.pyc +0 -0
- MHGTagger/__pycache__/Data.cpython-37.pyc +0 -0
- MHGTagger/__pycache__/Data.cpython-38.pyc +0 -0
- MHGTagger/__pycache__/NMT.cpython-310.pyc +0 -0
- MHGTagger/__pycache__/NMTData.cpython-310.pyc +0 -0
- MHGTagger/__pycache__/RNNData.cpython-310.pyc +0 -0
- MHGTagger/__pycache__/RNNData.cpython-37.pyc +0 -0
- MHGTagger/__pycache__/RNNData.cpython-38.pyc +0 -0
- MHGTagger/__pycache__/RNNTagger.cpython-310.pyc +0 -0
- MHGTagger/__pycache__/RNNTagger.cpython-37.pyc +0 -0
- MHGTagger/__pycache__/RNNTagger.cpython-38.pyc +0 -0
- MHGTagger/__pycache__/rnn_annotate.cpython-38.pyc +0 -0
- MHGTagger/rnn_annotate.py +145 -0
- MHGTagger/tagger.hyper +0 -0
- MHGTagger/tagger.io +0 -0
- README.md +1 -1
- Tagset_Mappings/POS-mapping.txt +73 -0
- Tagset_Mappings/__pycache__/tag_mapping.cpython-38.pyc +0 -0
- Tagset_Mappings/feature-mapping.txt +11 -0
- Tagset_Mappings/tag_mapping.py +129 -0
- app.py +47 -0
- parse.py +19 -0
- parsing/EVALB/COLLINS.prm +66 -0
- parsing/EVALB/LICENSE +24 -0
- parsing/EVALB/Makefile +4 -0
- parsing/EVALB/README +300 -0
- parsing/EVALB/bug/bug.gld +5 -0
- parsing/EVALB/bug/bug.rsl-new +39 -0
- parsing/EVALB/bug/bug.rsl-old +45 -0
- parsing/EVALB/bug/bug.tst +5 -0
- parsing/EVALB/evalb +0 -0
- parsing/EVALB/evalb.c +1537 -0
- parsing/EVALB/new.prm +87 -0
- parsing/EVALB/nk.prm +92 -0
- parsing/EVALB/sample/sample.gld +24 -0
- parsing/EVALB/sample/sample.prm +65 -0
- parsing/EVALB/sample/sample.rsl +56 -0
- parsing/EVALB/sample/sample.tst +24 -0
- parsing/EVALB/tgrep_proc.prl +9 -0
- parsing/EVALB_SPMRL/Makefile +65 -0
- parsing/EVALB_SPMRL/README +76 -0
- parsing/EVALB_SPMRL/README.orig +230 -0
- parsing/EVALB_SPMRL/evalb.c +1724 -0
- parsing/EVALB_SPMRL/spmrl.prm +91 -0
- parsing/EVALB_SPMRL/spmrl_hebrew.prm +118 -0
MHGTagger/CRFTagger.py
ADDED
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import sys
|
3 |
+
import torch
|
4 |
+
from torch import nn
|
5 |
+
from torch.nn import functional as F
|
6 |
+
|
7 |
+
from .RNNTagger import RNNTagger
|
8 |
+
|
9 |
+
|
10 |
+
### auxiliary functions ############################################
|
11 |
+
|
12 |
+
def logsumexp(x, dim):
    """Sum up log-scale values along ``dim`` and return the log of the sum.

    Computes ``log(sum(exp(x), dim))`` in a numerically stable way. The
    reduced dimension is removed from the result.

    Delegates to ``torch.logsumexp`` instead of the manual max-shift
    formulation: when every entry along ``dim`` is ``-inf`` the manual
    version computes ``-inf - (-inf) = nan``, whereas the library routine
    returns ``-inf``.

    Args:
        x: tensor of log-scale values.
        dim: dimension to reduce over.

    Returns:
        Tensor with dimension ``dim`` removed.
    """
    return torch.logsumexp(x, dim=dim)
|
18 |
+
|
19 |
+
def lookup(T, indices):
    """Pick one entry per position from a vector, matrix, or 3D tensor.

    For a 2D or 3D tensor the selection runs along the last dimension:
    ``indices`` has the shape of ``T`` without its last dimension and the
    result has that same shape. For a 1D tensor this is plain indexing.

    Raises:
        Exception: if ``T`` has fewer than 1 or more than 3 dimensions.
    """
    rank = T.dim()
    if rank == 1:
        return T[indices]
    if rank in (2, 3):
        last = rank - 1  # gather along the final (tag) dimension
        picked = T.gather(last, indices.unsqueeze(last))
        return picked.squeeze(last)
    raise Exception('unexpected tensor size in function "lookup"')
|
29 |
+
|
30 |
+
|
31 |
+
### tagger class ###############################################
|
32 |
+
|
33 |
+
class CRFTagger(nn.Module):
    """Implements a CRF tagger on top of an ``RNNTagger``.

    The base tagger produces one score per (word position, tag). Only the
    ``beam_size`` highest-scoring tags per position take part in the
    Viterbi search and in the forward algorithm; tag-pair scores come from
    the learned ``weights`` matrix (indexed as [previous tag, current tag]).
    """

    def __init__(self, num_chars, num_tags, char_emb_size,
                 char_rec_size, word_rec_size, word_rnn_depth,
                 dropout_rate, word_emb_size, beam_size):
        """Build the base tagger and the tag-pair weight matrix.

        All arguments except ``beam_size`` are passed through to
        ``RNNTagger``. ``beam_size`` is clipped to ``num_tags``; a value
        that is not in ``(0, num_tags)`` means "use all tags".
        """
        super(CRFTagger, self).__init__()

        # simple LSTMTagger which computes tag scores
        self.base_tagger = RNNTagger(num_chars, num_tags, char_emb_size,
                                     char_rec_size, word_rec_size,
                                     word_rnn_depth, dropout_rate, word_emb_size)
        self.beam_size = beam_size if 0 < beam_size < num_tags else num_tags
        self.weights = nn.Parameter(torch.zeros(num_tags, num_tags))
        # NOTE(review): this dropout layer is not used in forward(); kept so
        # the module's attributes stay unchanged.
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, fwd_charIDs, bwd_charIDs, tags=None):
        """Tag one sentence and, if gold tags are given, compute the loss.

        Args:
            fwd_charIDs, bwd_charIDs: character-ID inputs, passed unchanged
                to the base tagger.
            tags: 1D tensor of gold tag IDs (one per word), or ``None`` for
                annotation mode.

        Returns:
            ``predicted_tags`` in annotation mode, otherwise the pair
            ``(predicted_tags, negative_log_probability)``.
        """
        annotation_mode = (tags is None)

        scores = self.base_tagger(fwd_charIDs, bwd_charIDs)

        # extract the highest-scoring tags for each word and their scores
        best_scores, best_tags = scores.topk(self.beam_size, dim=-1)

        # Not done during dev evaluation. The annotation_mode guard avoids
        # dereferencing tags=None when the module is left in train() mode.
        if self.training and not annotation_mode:
            # check whether the goldstandard tags are among the best tags
            gs_contained = (best_tags == tags.unsqueeze(1)).sum(dim=-1)

            # replace the tag with the lowest score at each position
            # by the gs tag if the gs tag is not in the list
            last_column = gs_contained * best_tags[:, -1] + (1 - gs_contained) * tags
            s = lookup(scores, last_column)
            best_tags = torch.cat((best_tags[:, :-1], last_column.unsqueeze(1)), dim=1)
            best_scores = torch.cat((best_scores[:, :-1], s.unsqueeze(1)), dim=1)

        best_previous = []  # stores the backpointers of the Viterbi algorithm
        viterbi_scores = best_scores[0]
        if not annotation_mode:
            forward_scores = best_scores[0]
        for i in range(1, scores.size(0)):  # for all word positions except the first
            # lookup of the tag-pair weights (rows: previous tag, columns: current tag)
            w = self.weights[best_tags[i-1]][:, best_tags[i]]

            # Viterbi algorithm
            values = viterbi_scores.unsqueeze(1) + best_scores[i].unsqueeze(0) + w
            viterbi_scores, best_prev = torch.max(values, dim=0)
            best_previous.append(best_prev)

            # Forward algorithm
            if not annotation_mode:
                values = forward_scores.unsqueeze(1) + best_scores[i].unsqueeze(0) + w
                forward_scores = logsumexp(values, dim=0)

        # Viterbi algorithm: follow the backpointers from the best final state
        _, index = torch.max(viterbi_scores, dim=0)
        best_indices = [index]
        for i in range(len(best_previous)-1, -1, -1):
            index = best_previous[i][index]
            best_indices.append(index)

        # reverse the indices and map them to tag IDs
        best_indices = torch.stack(best_indices[::-1])
        predicted_tags = lookup(best_tags, best_indices)

        if annotation_mode:
            return predicted_tags

        # loss computation
        basetagger_scores = lookup(scores, tags).sum()
        CRFweights = self.weights[tags[:-1], tags[1:]].sum() if tags.size(0) > 1 else 0
        logZ = logsumexp(forward_scores, dim=0)  # log partition function
        logprob = basetagger_scores + CRFweights - logZ

        return predicted_tags, -logprob
|
110 |
+
|
MHGTagger/Data.py
ADDED
@@ -0,0 +1,186 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import sys
|
3 |
+
from collections import Counter, OrderedDict
|
4 |
+
import pickle
|
5 |
+
import numpy
|
6 |
+
|
7 |
+
unk_string = '<UNK>'
|
8 |
+
pad_string = '<PAD>'
|
9 |
+
|
10 |
+
def read_tagged_sentences(path, max_sent_len):
    """
    Read a dataset.

    Each non-empty line consists of a token and a tag separated by a tab
    character (further tab-separated columns are ignored). An empty line
    marks the end of a sentence.

    Sentences with ``max_sent_len`` or more tokens are discarded. A final
    sentence that is not followed by an empty line is kept as well
    (previously it was silently dropped).

    Args:
        path: name of the UTF-8 encoded input file.
        max_sent_len: exclusive upper bound on the sentence length.

    Returns:
        A list of ``(words, tags)`` pairs, each a pair of equally long lists.

    Raises:
        ValueError: if a non-empty line contains no tab-separated tag.
    """
    sentences, words, tags = [], [], []
    with open(path, encoding="utf-8") as file:
        for line in file:
            line = line.rstrip()
            if line:
                word, tag, *_ = line.split("\t")
                words.append(word)
                tags.append(tag)
                continue
            # empty line marking the end of a sentence
            if 0 < len(words) < max_sent_len:
                sentences.append((words, tags))
            words, tags = [], []
    # the file may end without a closing empty line
    if 0 < len(words) < max_sent_len:
        sentences.append((words, tags))
    return sentences
|
29 |
+
|
30 |
+
|
31 |
+
def read_word_embeddings(filename):
    """Read word embeddings from a text file.

    Each line holds a word followed by its vector components, separated by
    single spaces. The entry for the unknown-word symbol is skipped, and so
    are blank lines (which would otherwise yield an entry with an empty
    word and an empty vector).

    Args:
        filename: name of the UTF-8 encoded embedding file, or ``None``.

    Returns:
        ``(word_embeddings, word_emb_size)`` where ``word_embeddings`` is a
        list of ``(word, float32 vector)`` pairs and ``word_emb_size`` is
        the length of the first vector (0 if nothing was read).
    """
    word_embeddings = []
    if filename is not None:
        print("reading word embeddings ...", file=sys.stderr)
        with open(filename, encoding="utf-8") as file:
            for line in file:
                line = line.rstrip()
                if not line:
                    continue  # nothing to parse on a blank line
                word, *vec = line.split(' ')
                if word != unk_string:
                    word_embeddings.append((word, numpy.array(vec, dtype=numpy.float32)))
        print("done", file=sys.stderr)
    word_emb_size = len(word_embeddings[0][1]) if word_embeddings else 0
    return word_embeddings, word_emb_size
|
44 |
+
|
45 |
+
|
46 |
+
def make_dict(counter, min_freq=0, add_pad_symbol=False):
    """
    Build a string-to-ID mapping for all strings with some minimal frequency.

    The unknown symbol always gets ID 0. If requested, the padding symbol
    follows with ID 1 (the original author notes that this is OK because
    pack_padded sequence is not used). The remaining symbols are numbered
    in the order of ``counter.most_common()``.

    Returns the pair ``(string2ID, symlist)``; ``symlist`` maps IDs back to
    strings.
    """
    symlist = [unk_string]
    if add_pad_symbol:
        symlist.append(pad_string)
    symlist.extend(elem for elem, freq in counter.most_common() if freq >= min_freq)

    string2ID = dict(zip(symlist, range(len(symlist))))
    return string2ID, symlist
|
56 |
+
|
57 |
+
|
58 |
+
class Data(object):
    """
    Class for reading a tagged training and development corpus or a test corpus.

    Constructed with one argument (a parameter file written by
    ``save_parameters``) it is set up for tagging; constructed with the
    arguments of ``init_train`` it reads the corpora and builds the symbol
    tables for training.
    """

    # ID assigned to the ``ignore_tag`` so that such tags will not be trained
    IGNORE_INDEX = -100

    def __init__(self, *args):
        """Dispatch to ``init_test`` (one argument) or ``init_train``."""
        if len(args) == 1:
            self.init_test(*args)
        else:
            self.init_train(*args)

    ### functions needed during training ###############################################

    def init_train(self, path_train, path_dev, word_trunc_len,
                   min_char_freq, max_sent_len, word_embeddings, ignore_tag):
        """Read the training/development data and build the symbol tables.

        Args:
            path_train, path_dev: tagged corpus files.
            word_trunc_len: length to which words are truncated or filled up.
            min_char_freq: minimal frequency of a character to get its own ID.
            max_sent_len: sentences of this length or longer are skipped.
            word_embeddings: name of an embedding file, or ``None``.
            ignore_tag: tag that is mapped to ``IGNORE_INDEX``, or ``None``.
        """
        self.word_trunc_len = word_trunc_len  # length to which words are truncated or filled up

        # reading the datasets
        self.train_sentences = read_tagged_sentences(path_train, max_sent_len)
        self.dev_sentences = read_tagged_sentences(path_dev, max_sent_len)

        ### create dictionaries which map characters or tags to IDs
        char_counter = Counter()
        tag_counter = Counter()
        for words, tags in self.train_sentences:
            tag_counter.update(tags)
            for word in words:
                char_counter.update(word)
        self.char2ID, _ = make_dict(char_counter, min_char_freq, add_pad_symbol=True)

        if ignore_tag is not None:
            tag_counter.pop(ignore_tag, None)  # remove this special tag if present
            self.tag2ID, self.ID2tag = make_dict(tag_counter)
            self.tag2ID[ignore_tag] = self.IGNORE_INDEX  # empty tags will not be trained
        else:
            self.tag2ID, self.ID2tag = make_dict(tag_counter)

        ### sizes of the symbol inventories
        self.num_char_types = len(self.char2ID)
        self.num_tag_types = len(self.ID2tag)

        self.word_embeddings, self.word_emb_size = read_word_embeddings(word_embeddings)

    def get_charIDs(self, word):
        """
        Map a word to two fixed-length sequences of character IDs.

        The forward sequence holds the last ``word_trunc_len`` characters of
        the word, the backward sequence the last ``word_trunc_len``
        characters of the reversed word; both are left-padded with the
        padding ID. Unknown characters get the ID of the unknown symbol.
        """
        unkID = self.char2ID[unk_string]
        padID = self.char2ID[pad_string]

        charIDs = [self.char2ID.get(c, unkID) for c in word]

        # add enough padding symbols
        fwd_charIDs = [padID] * self.word_trunc_len + charIDs
        bwd_charIDs = [padID] * self.word_trunc_len + charIDs[::-1]

        # truncate
        fwd_charIDs = fwd_charIDs[-self.word_trunc_len:]
        bwd_charIDs = bwd_charIDs[-self.word_trunc_len:]

        return fwd_charIDs, bwd_charIDs

    def words2charIDvec(self, words):
        """
        Convert words to char-ID vectors.

        Returns two int32 numpy arrays (forward and backward) with one row
        per word, as produced by ``get_charIDs``.
        """
        ### convert words to character ID sequences
        fwd_charID_seqs = []
        bwd_charID_seqs = []
        for word in words:
            fwd_charIDs, bwd_charIDs = self.get_charIDs(word)
            fwd_charID_seqs.append(fwd_charIDs)
            bwd_charID_seqs.append(bwd_charIDs)

        fwd_charID_seqs = numpy.asarray(fwd_charID_seqs, dtype='int32')
        bwd_charID_seqs = numpy.asarray(bwd_charID_seqs, dtype='int32')

        return fwd_charID_seqs, bwd_charID_seqs

    def tags2IDs(self, tags):
        """
        Take a list of tags and convert them to IDs using the tag2ID
        dictionary; unknown tags get the ID of the unknown symbol.
        """
        unkID = self.tag2ID[unk_string]
        IDs = [self.tag2ID.get(tag, unkID) for tag in tags]
        return numpy.asarray(IDs, dtype='int32')

    def save_parameters(self, filename):
        """ Save the parameters needed for tagging to a pickle file. """
        all_params = (self.word_trunc_len, self.char2ID, self.ID2tag)
        with open(filename, "wb") as file:
            pickle.dump(all_params, file)

    ### functions needed during tagging ###############################################

    def init_test(self, filename):
        """ Load the parameters written by ``save_parameters`` from a file. """
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only load parameter files from a trusted source.
        with open(filename, "rb") as file:
            self.word_trunc_len, self.char2ID, self.ID2tag = pickle.load(file)

    def sentences(self, filename):
        """ Read data to be tagged. One token per line. Empty line follows a sentence.

        Yields one list of tokens per sentence. A final sentence that is
        not followed by an empty line is yielded as well (previously it was
        silently dropped).
        """
        with open(filename, encoding="utf-8") as f:
            words = []
            for line in f:
                line = line.rstrip()
                if line != '':
                    words.append(line)
                elif len(words) > 0:
                    # empty line indicates the end of a sentence
                    yield words
                    words = []
            if words:
                # the file ended without a closing empty line
                yield words

    def single_sentences(self, sentence):
        """ Yield the given sentence as the only element (generator form of a single input). """
        yield sentence

    def IDs2tags(self, IDs):
        """ Take a list of IDs and convert them to tags using the ID2tag list. """
        return [self.ID2tag[int(ID)] for ID in IDs]
|
MHGTagger/RNNTagger.py
ADDED
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import sys
|
3 |
+
import torch
|
4 |
+
from torch import nn
|
5 |
+
|
6 |
+
|
7 |
+
class WordRepresentation(nn.Module):
    '''
    Character-level RNNs which turn each word into a fixed-size vector
    '''

    def __init__(self, num_chars, emb_size, rec_size, dropout_rate):
        super().__init__()

        # lookup table for the character embeddings
        self.embeddings = nn.Embedding(num_chars, emb_size)

        # one LSTM reads the characters left-to-right, the other one reads
        # the reversed character sequence
        self.fwd_rnn = nn.LSTM(emb_size, rec_size)
        self.bwd_rnn = nn.LSTM(emb_size, rec_size)

        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, fwd_charIDs, bwd_charIDs):
        final_states = []
        for char_ids, rnn in ((fwd_charIDs, self.fwd_rnn),
                              (bwd_charIDs, self.bwd_rnn)):
            # transpose so that the character position comes first,
            # embed the characters and run the LSTM over them
            outputs, _ = rnn(self.embeddings(char_ids.t()))
            # keep the output at the last character position
            final_states.append(outputs[-1])

        # the word representation is the concatenation of both final states
        return torch.cat(final_states, -1)
|
38 |
+
|
39 |
+
|
40 |
+
class ResidualLSTM(nn.Module):
    ''' Stack of bidirectional LSTMs; all layers above the first are residual '''

    def __init__(self, input_size, rec_size, num_rnns, dropout_rate):
        super().__init__()

        # first layer: maps the input size to 2*rec_size
        self.rnn = nn.LSTM(input_size, rec_size,
                           bidirectional=True, batch_first=True)

        # remaining num_rnns-1 layers keep the size 2*rec_size
        upper_layers = []
        for _ in range(num_rnns-1):
            upper_layers.append(
                nn.LSTM(2*rec_size, rec_size, bidirectional=True, batch_first=True))
        self.deep_rnns = nn.ModuleList(upper_layers)

        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, state):
        state = self.rnn(state)[0]
        for layer in self.deep_rnns:
            # dropout on the layer input, then add the layer output
            # to its (undropped) input
            state = state + layer(self.dropout(state))[0]
        return state
|
60 |
+
|
61 |
+
|
62 |
+
class RNNTagger(nn.Module):
    ''' tagger network: character-based word vectors -> word BiLSTM -> tag scores '''

    def __init__(self, num_chars, num_tags, char_emb_size, char_rec_size,
                 word_rec_size, word_rnn_depth, dropout_rate, word_emb_size):

        super().__init__()

        # BiLSTMs over the characters of each word
        self.word_representations = WordRepresentation(
            num_chars, char_emb_size, char_rec_size, dropout_rate)

        # BiLSTM over the words of the sentence
        self.word_rnn = ResidualLSTM(
            char_rec_size*2, word_rec_size, word_rnn_depth, dropout_rate)

        # linear layer producing one score per tag
        self.output_layer = nn.Linear(2*word_rec_size, num_tags)

        self.dropout = nn.Dropout(dropout_rate)

        # The projection onto word embeddings (used for finetuning on word
        # embeddings) only exists if an embedding size was given.
        if word_emb_size > 0:
            self.projection_layer = nn.Linear(2*char_rec_size, word_emb_size)

    def forward(self, fwd_charIDs, bwd_charIDs, word_embedding_training=False):

        # character-based representation of each word
        word_reprs = self.word_representations(fwd_charIDs, bwd_charIDs)

        if word_embedding_training:
            # auxiliary task: map the word representations to word
            # embedding vectors and return those instead of tag scores
            if not hasattr(self, 'projection_layer'):
                sys.exit("Error: The embedding projection layer is undefined!")
            return self.projection_layer(word_reprs)

        # The word BiLSTM is batch-first, so the sentence is wrapped into a
        # batch of size 1 and unwrapped again afterwards. Dropout is applied
        # before and after the BiLSTM.
        sentence = self.dropout(word_reprs).unsqueeze(0)
        hidden = self.dropout(self.word_rnn(sentence).squeeze(0))

        # one score per word and tag
        return self.output_layer(hidden)
|
111 |
+
|
MHGTagger/__pycache__/CRFTagger.cpython-310.pyc
ADDED
Binary file (2.85 kB). View file
|
|
MHGTagger/__pycache__/CRFTagger.cpython-37.pyc
ADDED
Binary file (2.81 kB). View file
|
|
MHGTagger/__pycache__/CRFTagger.cpython-38.pyc
ADDED
Binary file (2.84 kB). View file
|
|
MHGTagger/__pycache__/Data.cpython-37.pyc
ADDED
Binary file (5.64 kB). View file
|
|
MHGTagger/__pycache__/Data.cpython-38.pyc
ADDED
Binary file (5.83 kB). View file
|
|
MHGTagger/__pycache__/NMT.cpython-310.pyc
ADDED
Binary file (9.98 kB). View file
|
|
MHGTagger/__pycache__/NMTData.cpython-310.pyc
ADDED
Binary file (6.78 kB). View file
|
|
MHGTagger/__pycache__/RNNData.cpython-310.pyc
ADDED
Binary file (6.2 kB). View file
|
|
MHGTagger/__pycache__/RNNData.cpython-37.pyc
ADDED
Binary file (6.04 kB). View file
|
|
MHGTagger/__pycache__/RNNData.cpython-38.pyc
ADDED
Binary file (6.08 kB). View file
|
|
MHGTagger/__pycache__/RNNTagger.cpython-310.pyc
ADDED
Binary file (2.94 kB). View file
|
|
MHGTagger/__pycache__/RNNTagger.cpython-37.pyc
ADDED
Binary file (3.13 kB). View file
|
|
MHGTagger/__pycache__/RNNTagger.cpython-38.pyc
ADDED
Binary file (3.11 kB). View file
|
|
MHGTagger/__pycache__/rnn_annotate.cpython-38.pyc
ADDED
Binary file (3.28 kB). View file
|
|
MHGTagger/rnn_annotate.py
ADDED
@@ -0,0 +1,145 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/python3
|
2 |
+
|
3 |
+
import sys
|
4 |
+
import pickle
|
5 |
+
import torch
|
6 |
+
from huggingface_hub import hf_hub_download
|
7 |
+
|
8 |
+
from .Data import Data
|
9 |
+
from .RNNTagger import RNNTagger
|
10 |
+
from .CRFTagger import CRFTagger
|
11 |
+
|
12 |
+
|
13 |
+
###########################################################################
|
14 |
+
# main function
|
15 |
+
###########################################################################
|
16 |
+
|
17 |
+
class Args:
    """Plain attribute container standing in for the former argparse namespace."""

    def __init__(self, path_param, model_id, path_data, crf_beam_size, gpu, min_prob, print_probs) -> None:
        # store every argument under its own name, in signature order
        settings = (
            ("path_param", path_param),
            ("model_id", model_id),
            ("path_data", path_data),
            ("crf_beam_size", crf_beam_size),
            ("gpu", gpu),
            ("min_prob", min_prob),
            ("print_probs", print_probs),
        )
        for name, value in settings:
            setattr(self, name, value)
|
26 |
+
|
27 |
+
# if __name__ == "__main__":
|
28 |
+
def annotate(tokens, path_param='MHGTagger/tagger', model_id='nielklug/rnn_tagger', path_data='', crf_beam_size=10, gpu=-1, min_prob=-1.0, print_probs=True):
    """Tag one tokenized sentence with the RNN tagger.

    Args:
        tokens: list of word strings forming one sentence.
        path_param: prefix of the parameter files; ``<prefix>.io`` holds the
            symbol tables and ``<prefix>.hyper`` the model hyper-parameters.
        model_id: Hugging Face Hub repository from which the weight file
            ``tagger.rnn`` is downloaded.
        path_data: stored in the argument object but not used here.
        crf_beam_size: stored in the argument object but not used here.
        gpu: index of the GPU to use, or -1 for the CPU.
        min_prob: if not -1.0, all tags whose probability exceeds the
            probability of the best tag times this threshold are reported.
        print_probs: whether tag probabilities are reported.

    Returns:
        ``(words, tags, probs)``:

        * default mode (``min_prob == -1.0``, plain RNN model): one tag per
          word; ``probs`` holds one rounded probability per word if
          ``print_probs`` is set, otherwise it is empty.
        * ``min_prob`` mode: per word a list of tags sorted by decreasing
          probability and the matching list of rounded probabilities. The
          result is also printed, as before.
        * CRF model: one tag per word, ``probs`` is empty. The result is
          also printed, as before.

        Previously the last two modes only printed and returned empty lists.
    """
    args = Args(path_param, model_id, path_data, crf_beam_size, gpu, min_prob, print_probs)

    # Select the processing device
    if args.gpu >= 0:
        if not torch.cuda.is_available():
            print('No gpu available. Using cpu instead.', file=sys.stderr)
            args.gpu = -1
        else:
            if args.gpu >= torch.cuda.device_count():
                print('gpu '+str(args.gpu)+' not available. Using gpu 0 instead.', file=sys.stderr)
                args.gpu = 0
            torch.cuda.set_device(args.gpu)
    device = torch.device('cuda' if args.gpu >= 0 else 'cpu')

    # load parameters
    data = Data(args.path_param+'.io')  # read the symbol mapping tables

    with open(args.path_param+'.hyper', 'rb') as file:
        hyper_params = pickle.load(file)
    # CRFTagger.__init__ takes 9 hyper-parameters (those of RNNTagger plus
    # the beam size). The former test for 10 could never build a CRF model.
    model = CRFTagger(*hyper_params) if len(hyper_params) == 9 \
        else RNNTagger(*hyper_params)

    model_file = hf_hub_download(repo_id=args.model_id, filename='tagger.rnn')
    # NOTE(review): torch.load unpickles the downloaded file; only use
    # model repositories you trust.
    model.load_state_dict(torch.load(model_file,
                                     map_location=torch.device('cpu')))

    model = model.to(device)

    if isinstance(model, CRFTagger):
        # min_prob counts as "set" only if it differs from its default -1.0
        # (the default itself is truthy and used to trigger the warning)
        for is_set, option in zip((args.min_prob != -1.0, args.print_probs),
                                  ("min_prob", "print_probs")):
            if is_set:
                print(f"Warning: Option --{option} is ignored because the model has a CRF output layer", file=sys.stderr)

    words_all = []
    tagged = []
    probs_all = []

    model.eval()
    with torch.no_grad():
        for i, words in enumerate(data.single_sentences(tokens)):
            if len(words) == 0:
                continue  # nothing to tag

            # map words to numbers and create Torch variables
            fwd_charIDs, bwd_charIDs = data.words2charIDvec(words)
            fwd_charIDs = torch.LongTensor(fwd_charIDs).to(device)
            bwd_charIDs = torch.LongTensor(bwd_charIDs).to(device)

            # run the model
            if isinstance(model, RNNTagger):
                tagscores = model(fwd_charIDs, bwd_charIDs)
                if args.min_prob == -1.0:
                    # only report the word and tag with the highest score
                    tagIDs = tagscores.argmax(-1)
                    tags = data.IDs2tags(tagIDs.to("cpu"))
                    words_all.extend(words)
                    tagged.extend(tags)
                    if args.print_probs:
                        # report probabilities as well
                        tagprobs = torch.nn.functional.softmax(tagscores, dim=-1)
                        # get the probabilities of the highest-scoring tags
                        probs = tagprobs[range(len(tagIDs)), tagIDs].to("cpu").tolist()
                        probs_all.extend(round(float(prob), 4) for prob in probs)
                else:
                    # report the best tags for each word
                    tagprobs = torch.nn.functional.softmax(tagscores, dim=-1)
                    # get the probability of the most probable tag
                    top_probs, _ = tagprobs.max(-1)
                    # get all tags with a probability above best_prob * min_prob
                    thresholds = top_probs * args.min_prob
                    greaterflags = (tagprobs > thresholds.unsqueeze(1))
                    for word, flags, probs in zip(words, greaterflags, tagprobs):
                        # get the IDs of the best tags
                        IDs = flags.nonzero()
                        # get the best tags and their probabilities
                        cand_probs = probs[IDs].to("cpu")
                        cand_tags = data.IDs2tags(IDs.to("cpu"))
                        # sort the tags by decreasing probability; the list
                        # is empty if no tag exceeds the threshold
                        ranked = sorted(zip(cand_tags, cand_probs),
                                        key=lambda x: -float(x[1]))
                        ranked_tags = [t for t, _ in ranked]
                        words_all.append(word)
                        tagged.append(ranked_tags)
                        probs_all.append([round(float(p), 4) for _, p in ranked])
                        # generate the output
                        if args.print_probs:
                            # append the probabilities to the tags
                            shown = [f"{t} {float(p):.4f}" for t, p in ranked]
                        else:
                            shown = ranked_tags
                        print(word, ' '.join(shown), sep="\t")
            elif isinstance(model, CRFTagger):
                tagIDs = model(fwd_charIDs, bwd_charIDs)
                tags = data.IDs2tags(tagIDs)
                for word, tag in zip(words, tags):
                    print(word, tag, sep='\t')
                    words_all.append(word)
                    tagged.append(tag)
            else:
                sys.exit('Error')

    return (words_all, tagged, probs_all)
|
MHGTagger/tagger.hyper
ADDED
Binary file (41 Bytes). View file
|
|
MHGTagger/tagger.io
ADDED
Binary file (229 kB). View file
|
|
README.md
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
---
|
2 |
-
title:
|
3 |
emoji: 🌍
|
4 |
colorFrom: gray
|
5 |
colorTo: red
|
|
|
1 |
---
|
2 |
+
title: MHG Parsing
|
3 |
emoji: 🌍
|
4 |
colorFrom: gray
|
5 |
colorTo: red
|
Tagset_Mappings/POS-mapping.txt
ADDED
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
$_ $_
|
2 |
+
ADJA ADJA
|
3 |
+
ADJD ADJD
|
4 |
+
ADJN ADJA.Pos
|
5 |
+
ADJS ADJA
|
6 |
+
APPR APPR
|
7 |
+
APPRART APPRART
|
8 |
+
AVD ADV
|
9 |
+
AVD-KO* ADV
|
10 |
+
AVG PWAV
|
11 |
+
AVW PWAV
|
12 |
+
CARDA CARD
|
13 |
+
CARDD CARD
|
14 |
+
CARDN CARD
|
15 |
+
CARDS CARD
|
16 |
+
DDA PDAT
|
17 |
+
DDART ART
|
18 |
+
DDD PDAT
|
19 |
+
DDN PDAT
|
20 |
+
DDS PDS
|
21 |
+
DGA PWAT
|
22 |
+
DGS PWS
|
23 |
+
DIA PIAT
|
24 |
+
DIART ART
|
25 |
+
DID PDAT
|
26 |
+
DIN PDAT
|
27 |
+
DIS PIS
|
28 |
+
DPOSA PPOSAT
|
29 |
+
DPOSD PPOSS
|
30 |
+
DPOSN PPOSAT
|
31 |
+
DPOSS NN
|
32 |
+
DRELS PRELS
|
33 |
+
DWA PWAT
|
34 |
+
DWD PWS
|
35 |
+
DWS PWS
|
36 |
+
FM FM
|
37 |
+
ITJ ITJ
|
38 |
+
KO* KOUS
|
39 |
+
KOKOM KOKOM
|
40 |
+
KON KON
|
41 |
+
KOUS KOUS
|
42 |
+
NA NN
|
43 |
+
NE NE
|
44 |
+
PART PART
|
45 |
+
PAVAP PROAV
|
46 |
+
PAVD PROAV
|
47 |
+
PAVG PROAV
|
48 |
+
PAVW PWAV
|
49 |
+
PG PWS
|
50 |
+
PI PIS
|
51 |
+
PPER PPER
|
52 |
+
PRF PRF
|
53 |
+
PTK ADV
|
54 |
+
PTKA PTKA
|
55 |
+
PTKANT PTKANT
|
56 |
+
PTKNEG PTKNEG
|
57 |
+
PTKVZ PTKVZ
|
58 |
+
PW PWS
|
59 |
+
VAFIN VAFIN
|
60 |
+
VAIMP VAIMP
|
61 |
+
VAINF VAINF
|
62 |
+
VAPP VAPP
|
63 |
+
VAPS ADJD.Pos
|
64 |
+
VMFIN VMFIN
|
65 |
+
VMIMP VMIMP
|
66 |
+
VMINF VMINF
|
67 |
+
VMPP VMPP
|
68 |
+
VMPS ADJD.Pos
|
69 |
+
VVFIN VVFIN
|
70 |
+
VVIMP VVIMP
|
71 |
+
VVINF VVINF
|
72 |
+
VVPP VVPP
|
73 |
+
VVPS ADJD.Pos
|
Tagset_Mappings/__pycache__/tag_mapping.cpython-38.pyc
ADDED
Binary file (3.28 kB). View file
|
|
Tagset_Mappings/feature-mapping.txt
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Masc,Fem *
|
2 |
+
Fem,Masc *
|
3 |
+
Masc,Neut *
|
4 |
+
Neut,Masc *
|
5 |
+
Fem,Neut *
|
6 |
+
Neut,Fem *
|
7 |
+
Abl Dat
|
8 |
+
Instr Dat
|
9 |
+
Akk Acc
|
10 |
+
Voc Nom
|
11 |
+
bSg Sg
|
Tagset_Mappings/tag_mapping.py
ADDED
@@ -0,0 +1,129 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/python3
|
2 |
+
|
3 |
+
"""
|
4 |
+
cd schmid/MHG-Parser/Tagset-Mappings
|
5 |
+
python tag-mapping.py ../self-attentive-parser-master/data/mhg/MHG.tagged > ../self-attentive-parser-master/data/mhg/MHG_new.mapped
|
6 |
+
"""
|
7 |
+
|
8 |
+
import sys
|
9 |
+
import fileinput
|
10 |
+
|
11 |
+
# Mapping tables, loaded once at import time. The paths are relative to the
# current working directory. Every non-blank line must consist of exactly
# two whitespace-separated fields: the source label and its replacement.

# source POS tag -> target POS tag (the target may carry ".Feature" suffixes)
with open("Tagset_Mappings/POS-mapping.txt") as file:
    pos_map = dict(line.split() for line in file if line.strip())

# source morphological feature -> target feature
with open("Tagset_Mappings/feature-mapping.txt") as file:
    feature_map = dict(line.split() for line in file if line.strip())
|
16 |
+
|
17 |
+
def map_tags(tags):
    """Apply ``map_tag`` to every tag and return the results as a list."""
    return list(map(map_tag, tags))
|
19 |
+
|
20 |
+
|
21 |
+
def map_tag(tag):
    """Map a single source tag onto the target tagset.

    The tag is a dot-separated string ``POS.feat1.feat2...``.  The POS part
    (only its first ``|``-alternative) is looked up in ``pos_map``; any
    features carried by the mapped POS are placed in front of the tag's own
    features.  Each feature is then translated through ``feature_map``
    (unknown features are kept unchanged) and the feature list is reordered,
    padded with ``'*'`` or replaced depending on the mapped POS and the
    number of features present.  Combinations not covered by a rule are
    passed through unchanged.

    Args:
        tag: Source tag string.

    Returns:
        The mapped POS and the resulting features joined with ``'.'``.

    Raises:
        KeyError: If the POS part of ``tag`` has no entry in ``pos_map``.
    """
    # str.replace returns a new string; the result used to be discarded, so
    # the degree suffix of AVD tags was never actually removed.
    tag = tag.replace('AVD.Comp', 'AVD').replace('AVD.Sup', 'AVD')

    pos, *features = tag.split(".")
    pos = pos.split('|')[0]
    pos = pos_map[pos]
    # The mapped POS may itself carry features (e.g. "ADJD.Pos"); they go first.
    pos, *features2 = pos.split(".")
    features = features2 + features
    features = [feature_map.get(f, f) for f in features]

    # Number of features before any per-POS rewriting; every rule below is
    # selected on this original count.
    n = len(features)

    if pos == 'ADJA':
        if n == 5:
            features = [features[0], features[2], features[3], features[1]]
        elif n in (3, 4):
            features = [features[0], features[2], '*', features[1]]
        elif n == 2:
            features = [features[0], '*', '*', features[1]]
        elif n == 1:
            features = [features[0], '*', '*', '*']
    elif pos in ('ADV', 'CARD'):
        features = []
    elif pos in ('ART', 'APPRART'):
        if n == 4:
            features = [features[1], features[2], features[0]]
        elif n in (0, 1):
            features = ['*', '*', '*']
    elif pos in ('NN', 'PDAT', 'PIS', 'PWS'):
        # These four POS tags share the same rule set.
        if n == 4:
            features = [features[1], features[2], features[0]]
        elif n == 0:
            features = ['*', '*', '*']
    elif pos == 'NE':
        if n == 2:
            features.append('*')
        elif n == 1:
            features.extend(['*', '*'])
    elif pos == 'PIAT':
        if n == 4:
            features = [features[1], features[2], features[0]]
        elif n == 2:
            features = [features[1], '*', features[0]]
        elif n == 0:
            features = ['*', '*', '*']
    elif pos == 'PPOSAT':
        if n in (3, 4):
            features = [features[1], features[2], features[0]]
        elif n == 0:
            features = ['*', '*', '*']
    elif pos == 'PWAT' and n == 4:
        features = [features[1], features[2], features[0]]
    elif pos == 'PPOSS':
        features = ['*.*.*']
    elif pos == 'PDS':
        if n == 4:
            features = [features[1], features[2], features[0]]
        elif n == 1:
            features.extend(['*', '*'])
        elif n == 2:
            features = [features[1], '*', '*']
    elif pos == 'PRELS' and n == 3:
        features = [features[1], features[2], features[0]]
    elif pos == 'PPER' and n == 4:
        features = [features[3], features[1], features[2], features[0]]
    elif pos == 'PRF' and n == 3:
        features = ['*', features[0], features[1]]
    elif pos in ('VAFIN', 'VMFIN', 'VVFIN') and n == 4:
        features = [features[3], features[2], features[1], features[0]]
    elif pos in ('VAIMP', 'VMIMP', 'VVIMP') and n == 2:
        features = [features[1], features[0], 'Imp']
    elif pos in ('VAINF', 'VMINF', 'VVINF') and n == 0:
        features = ['Inf']
    elif pos in ('VAPP', 'VMPP', 'VVPP') and n == 0:
        features = ['Psp']

    return '.'.join([pos] + features)
|
108 |
+
|
109 |
+
# for i, line in enumerate(fileinput.input()):
|
110 |
+
# print(i, end="\r", file=sys.stderr)
|
111 |
+
# line = line.strip()
|
112 |
+
# if line:
|
113 |
+
# word, tag, *_ = line.split("\t")
|
114 |
+
# tag = tag.replace('APPR|DDART', 'APPRART')
|
115 |
+
# for t in tag.split("|"):
|
116 |
+
# print(word, map_tag(t), sep="\t")
|
117 |
+
# else:
|
118 |
+
# print()
|
119 |
+
|
120 |
+
|
121 |
+
# for i, line in enumerate(fileinput.input()):
|
122 |
+
# print(i, end="\r", file=sys.stderr)
|
123 |
+
# line = line.strip()
|
124 |
+
# if line:
|
125 |
+
# word, tag, *_ = line.split("\t")
|
126 |
+
# tag = tag.replace('APPR|DDART', 'APPRART')
|
127 |
+
# print(word, map_tag(tag.split('|')[0]), sep="\t")
|
128 |
+
# else:
|
129 |
+
# print()
|
app.py
ADDED
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
from parse import parse_text
|
3 |
+
import nltk
|
4 |
+
from nltk import Tree
|
5 |
+
import pandas as pd
|
6 |
+
import re
|
7 |
+
from nltk.tree.prettyprinter import TreePrettyPrinter
|
8 |
+
|
9 |
+
|
10 |
+
# Streamlit front end: read MHG text, run the tagger + parser on it, and show
# the tagging table, the bracketed parse and a rendered tree.
st.title("MHG parsing system (demo)")
text = st.text_area("""This is a simple demo of a Middle High German (MHG) parsing system using delexicalization method.\n\n
Enter some MHG text below!""")

st.text("""Example MHG sentences:
1. Swer an rehte güete wendet sîn gemüete, dem volget sælde und êre, des gît gewisse
lêre künec Artûs der guote, der mit rîters muote nâch lobe kunde strîten.
2. Uns ist in alten mæren wunders vil geseitvon helden lobebæren, von grôzer arebeit,
von freuden, hôchgezîten, von weinen und von klagen, von küener recken strîten muget
ir nu wunder hœren sagen.""")

# Tokenizer data required by nltk.word_tokenize (used in parse.tokenize).
nltk.download('punkt')


if text:
    tokens, tags, probs, parse_tree = parse_text(text)

    # create a table to show the tagged results:
    zipped = list(zip(tokens, tags, probs))

    df = pd.DataFrame(zipped, columns=['Token', 'Tag', 'Prob.'])

    # Convert the bracket parse tree into an NLTK Tree.  The regex drops
    # every dotted suffix (".xxx" runs containing no space or ")") so that
    # only the bare labels remain in the drawn tree.
    t = Tree.fromstring(re.sub(r'(\.[^ )]+)+', '', parse_tree))

    tree_svg = TreePrettyPrinter(t).svg(nodecolor='black', leafcolor='black', funccolor='black')

    col1 = st.columns(1)[0]
    col1.header("POS tagging result:")
    col1.table(df)

    col2 = st.columns(1)[0]
    col2.header("Parsing result:")
    # Backslash-escape the characters Markdown would otherwise interpret.
    # The backslashes are written as '\\' because '\_', '\$' and '\*' are
    # invalid escape sequences in a normal string literal.
    col2.write(parse_tree.replace('_', '\\_').replace('$', '\\$').replace('*', '\\*'))

    # Display the graph in the Streamlit app
    col2.image(tree_svg, use_column_width=True)
|
47 |
+
|
parse.py
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
from MHGTagger.rnn_annotate import annotate
|
3 |
+
from Tagset_Mappings.tag_mapping import map_tags
|
4 |
+
from parsing.src.parse import run_parse
|
5 |
+
from nltk import word_tokenize
|
6 |
+
|
7 |
+
def parse_text(text):
    """Tokenize, tag and parse ``text``.

    Returns a 4-tuple ``(tokens, tags, probs, parse_tree)``: the tokens and
    probabilities as returned by ``annotate``, the tags after ``map_tags``,
    and the first element of the ``run_parse`` result.
    """
    words, raw_tags, probs = annotate(tokenize(text))
    mapped_tags = map_tags(raw_tags)
    parses = run_parse(words, mapped_tags)
    return words, mapped_tags, probs, parses[0]
|
13 |
+
|
14 |
+
def tokenize(text: str):
    """Split ``text`` into tokens.

    Punctuation marks (``. , ; : ? ! "``) that are followed by whitespace are
    first surrounded by single spaces, in two successive passes, and the
    result is handed to NLTK's ``word_tokenize``.
    """
    spacing_patterns = (
        r'\s*([.,;:?!"])\s',
        r'\s*([.,;:?!"]) ',
    )
    for pattern in spacing_patterns:
        text = re.sub(pattern, r' \1 ', text)
    return word_tokenize(text)
|
19 |
+
|
parsing/EVALB/COLLINS.prm
ADDED
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
##------------------------------------------##
|
2 |
+
## Debug mode ##
|
3 |
+
## 0: No debugging ##
|
4 |
+
## 1: print data for individual sentence ##
|
5 |
+
##------------------------------------------##
|
6 |
+
DEBUG 0
|
7 |
+
|
8 |
+
##------------------------------------------##
|
9 |
+
## MAX error ##
|
10 |
+
## Number of error to stop the process. ##
|
11 |
+
## This is useful if there could be ##
|
12 |
+
## tokenization error. ##
|
13 |
+
## The process will stop when this number##
|
14 |
+
## of errors are accumulated. ##
|
15 |
+
##------------------------------------------##
|
16 |
+
MAX_ERROR 10
|
17 |
+
|
18 |
+
##------------------------------------------##
|
19 |
+
## Cut-off length for statistics ##
|
20 |
+
## At the end of evaluation, the ##
|
21 |
+
## statistics for the sentences of length##
|
22 |
+
## less than or equal to this number will##
|
23 |
+
## be shown, on top of the statistics ##
|
24 |
+
## for all the sentences ##
|
25 |
+
##------------------------------------------##
|
26 |
+
CUTOFF_LEN 40
|
27 |
+
|
28 |
+
##------------------------------------------##
|
29 |
+
## unlabeled or labeled bracketing ##
|
30 |
+
## 0: unlabeled bracketing ##
|
31 |
+
## 1: labeled bracketing ##
|
32 |
+
##------------------------------------------##
|
33 |
+
LABELED 1
|
34 |
+
|
35 |
+
##------------------------------------------##
|
36 |
+
## Delete labels ##
|
37 |
+
## list of labels to be ignored. ##
|
38 |
+
## If it is a pre-terminal label, delete ##
|
39 |
+
## the word along with the brackets. ##
|
40 |
+
## If it is a non-terminal label, just ##
|
41 |
+
## delete the brackets (don't delete ##
|
42 |
+
## children). ##
|
43 |
+
##------------------------------------------##
|
44 |
+
DELETE_LABEL TOP
|
45 |
+
DELETE_LABEL -NONE-
|
46 |
+
DELETE_LABEL ,
|
47 |
+
DELETE_LABEL :
|
48 |
+
DELETE_LABEL ``
|
49 |
+
DELETE_LABEL ''
|
50 |
+
DELETE_LABEL .
|
51 |
+
|
52 |
+
##------------------------------------------##
|
53 |
+
## Delete labels for length calculation ##
|
54 |
+
## list of labels to be ignored for ##
|
55 |
+
## length calculation purpose ##
|
56 |
+
##------------------------------------------##
|
57 |
+
DELETE_LABEL_FOR_LENGTH -NONE-
|
58 |
+
|
59 |
+
##------------------------------------------##
|
60 |
+
## Equivalent labels, words ##
|
61 |
+
## the pairs are considered equivalent ##
|
62 |
+
## This is non-directional. ##
|
63 |
+
##------------------------------------------##
|
64 |
+
EQ_LABEL ADVP PRT
|
65 |
+
|
66 |
+
# EQ_WORD Example example
|
parsing/EVALB/LICENSE
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
This is free and unencumbered software released into the public domain.
|
2 |
+
|
3 |
+
Anyone is free to copy, modify, publish, use, compile, sell, or
|
4 |
+
distribute this software, either in source code form or as a compiled
|
5 |
+
binary, for any purpose, commercial or non-commercial, and by any
|
6 |
+
means.
|
7 |
+
|
8 |
+
In jurisdictions that recognize copyright laws, the author or authors
|
9 |
+
of this software dedicate any and all copyright interest in the
|
10 |
+
software to the public domain. We make this dedication for the benefit
|
11 |
+
of the public at large and to the detriment of our heirs and
|
12 |
+
successors. We intend this dedication to be an overt act of
|
13 |
+
relinquishment in perpetuity of all present and future rights to this
|
14 |
+
software under copyright law.
|
15 |
+
|
16 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17 |
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18 |
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
19 |
+
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
20 |
+
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
21 |
+
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
22 |
+
OTHER DEALINGS IN THE SOFTWARE.
|
23 |
+
|
24 |
+
For more information, please refer to <http://unlicense.org/>
|
parsing/EVALB/Makefile
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
all: evalb
|
2 |
+
|
3 |
+
evalb: evalb.c
|
4 |
+
gcc -Wall -g -o evalb evalb.c
|
parsing/EVALB/README
ADDED
@@ -0,0 +1,300 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#################################################################
|
2 |
+
# #
|
3 |
+
# Bug fix and additional functionality for evalb #
|
4 |
+
# #
|
5 |
+
# This updated version of evalb fixes a bug in which sentences #
|
6 |
+
# were incorrectly categorized as "length mismatch" when the #
|
7 |
+
# parse output had certain mislabeled parts-of-speech. #
|
8 |
+
# #
|
9 |
+
# The bug was the result of evalb treating one of the tags (in #
|
10 |
+
# gold or test) as a label to be deleted (see sections [6],[7] #
|
11 |
+
# for details), but not the corresponding tag in the other. #
|
12 |
+
# This most often occurs with punctuation. See the subdir #
|
13 |
+
# "bug" for an example gld and tst file demonstrating the bug, #
|
14 |
+
# as well as output of evalb with and without the bug fix. #
|
15 |
+
# #
|
16 |
+
# For the present version in case of length mismatch, the nodes #
|
17 |
+
# causing the imbalance are reinserted to resolve the miscount. #
|
18 |
+
# If the lengths of gold and test truly differ, the error is #
|
19 |
+
# still reported. The parameter file "new.prm" (derived from #
|
20 |
+
# COLLINS.prm) shows how to add new potential mislabelings for #
|
21 |
+
# quotes (",``,',`). #
|
22 |
+
# #
|
23 |
+
# I have preserved DJB's revision for modern compilers except #
|
24 |
+
# for the declaration of "exit" which is provided by stdlib. #
|
25 |
+
# #
|
26 |
+
# Other changes: #
|
27 |
+
# #
|
28 |
+
# * output of F-Measure in addition to precision and recall #
|
29 |
+
# (I did not update the documentation in section [4] for this) #
|
30 |
+
# #
|
31 |
+
# * more comprehensive DEBUG output that includes bracketing #
|
32 |
+
# information as evalb is processing each sentence #
|
33 |
+
# (useful in working through this, and perhaps other bugs). #
|
34 |
+
# Use either the "-D" run-time switch or set DEBUG to 2 in #
|
35 |
+
# the parameter file. #
|
36 |
+
# #
|
37 |
+
# * added DELETE_LABEL lines in new.prm for S1 nodes produced #
|
38 |
+
# by the Charniak parser and "?", "!" punctuation produced by #
|
39 |
+
# the Bikel parser. #
|
40 |
+
# #
|
41 |
+
# #
|
42 |
+
# David Ellis (Brown) #
|
43 |
+
# #
|
44 |
+
# January.2006 #
|
45 |
+
#################################################################
|
46 |
+
|
47 |
+
#################################################################
|
48 |
+
# #
|
49 |
+
# Update of evalb for modern compilers #
|
50 |
+
# #
|
51 |
+
# This is an updated version of evalb, for use with modern C #
|
52 |
+
# compilers. There are a few updates, each marked in the code: #
|
53 |
+
# #
|
54 |
+
# /* DJB: explanation of comment */ #
|
55 |
+
# #
|
56 |
+
# The updates are purely to help compilation with recent #
|
57 |
+
# versions of GCC (and other C compilers). There are *NO* other #
|
58 |
+
# changes to the algorithm itself. #
|
59 |
+
# #
|
60 |
+
# I have made these changes following recommendations from #
|
61 |
+
# users of the Corpora Mailing List, especially Peet Morris and #
|
62 |
+
# Ramon Ziai. #
|
63 |
+
# #
|
64 |
+
# David Brooks (Birmingham) #
|
65 |
+
# #
|
66 |
+
# September.2005 #
|
67 |
+
#################################################################
|
68 |
+
|
69 |
+
#################################################################
|
70 |
+
# #
|
71 |
+
# README file for evalb #
|
72 |
+
# #
|
73 |
+
# Satoshi Sekine (NYU) #
|
74 |
+
# Mike Collins (UPenn) #
|
75 |
+
# #
|
76 |
+
# October.1997 #
|
77 |
+
#################################################################
|
78 |
+
|
79 |
+
Contents of this README:
|
80 |
+
|
81 |
+
[0] COPYRIGHT
|
82 |
+
[1] INTRODUCTION
|
83 |
+
[2] INSTALLATION AND RUN
|
84 |
+
[3] OPTIONS
|
85 |
+
[4] OUTPUT FORMAT FROM THE SCORER
|
86 |
+
[5] HOW TO CREATE A GOLDFILE FROM THE TREEBANK
|
87 |
+
[6] THE PARAMETER FILE
|
88 |
+
[7] MORE DETAILS ABOUT THE SCORING ALGORITHM
|
89 |
+
|
90 |
+
|
91 |
+
[0] COPYRIGHT
|
92 |
+
|
93 |
+
The authors abandon the copyright of this program. Everyone is
|
94 |
+
permitted to copy and distribute the program or a portion of the program
|
95 |
+
with no charge and no restrictions unless it is harmful to someone.
|
96 |
+
|
97 |
+
However, the authors are delightful for the user's kindness of proper
|
98 |
+
usage and letting the authors know bugs or problems.
|
99 |
+
|
100 |
+
This software is provided "AS IS", and the authors make no warranties,
|
101 |
+
express or implied.
|
102 |
+
|
103 |
+
To legally enforce the abandonment of copyright, this package is released
|
104 |
+
under the Unlicense (see LICENSE).
|
105 |
+
|
106 |
+
[1] INTRODUCTION
|
107 |
+
|
108 |
+
Evaluation of bracketing looks simple, but in fact, there are minor
|
109 |
+
differences from system to system. This is a program to parameterize
|
110 |
+
such minor differences and to give an informative result.
|
111 |
+
|
112 |
+
"evalb" evaluates bracketing accuracy in a test-file against a gold-file.
|
113 |
+
It returns recall, precision, tagging accuracy. It uses an identical
|
114 |
+
algorithm to that used in (Collins ACL97).
|
115 |
+
|
116 |
+
|
117 |
+
[2] Installation and Run
|
118 |
+
|
119 |
+
To compile the scorer, type
|
120 |
+
|
121 |
+
> make
|
122 |
+
|
123 |
+
|
124 |
+
To run the scorer:
|
125 |
+
|
126 |
+
> evalb -p Parameter_file Gold_file Test_file
|
127 |
+
|
128 |
+
|
129 |
+
For example to use the sample files:
|
130 |
+
|
131 |
+
> evalb -p sample.prm sample.gld sample.tst
|
132 |
+
|
133 |
+
|
134 |
+
|
135 |
+
[3] OPTIONS
|
136 |
+
|
137 |
+
You can specify system parameters in the command line options.
|
138 |
+
Other options concerning evaluation metrics should be specified
|
139 |
+
in parameter file, described later.
|
140 |
+
|
141 |
+
-p param_file parameter file
|
142 |
+
-d debug mode
|
143 |
+
-e n number of error to kill (default=10)
|
144 |
+
-h help
|
145 |
+
|
146 |
+
|
147 |
+
|
148 |
+
[4] OUTPUT FORMAT FROM THE SCORER
|
149 |
+
|
150 |
+
The scorer gives individual scores for each sentence, for
|
151 |
+
example:
|
152 |
+
|
153 |
+
Sent. Matched Bracket Cross Correct Tag
|
154 |
+
ID Len. Stat. Recal Prec. Bracket gold test Bracket Words Tags Accracy
|
155 |
+
============================================================================
|
156 |
+
1 8 0 100.00 100.00 5 5 5 0 6 5 83.33
|
157 |
+
|
158 |
+
At the end of the output the === Summary === section gives statistics
|
159 |
+
for all sentences, and for sentences <=40 words in length. The summary
|
160 |
+
contains the following information:
|
161 |
+
|
162 |
+
i) Number of sentences -- total number of sentences.
|
163 |
+
|
164 |
+
ii) Number of Error/Skip sentences -- should both be 0 if there is no
|
165 |
+
problem with the parsed/gold files.
|
166 |
+
|
167 |
+
iii) Number of valid sentences = Number of sentences - Number of Error/Skip
|
168 |
+
sentences
|
169 |
+
|
170 |
+
iv) Bracketing recall = (number of correct constituents)
|
171 |
+
----------------------------------------
|
172 |
+
(number of constituents in the goldfile)
|
173 |
+
|
174 |
+
v) Bracketing precision = (number of correct constituents)
|
175 |
+
----------------------------------------
|
176 |
+
(number of constituents in the parsed file)
|
177 |
+
|
178 |
+
vi) Complete match = percentage of sentences where recall and precision are
|
179 |
+
both 100%.
|
180 |
+
|
181 |
+
vii) Average crossing = (number of constituents crossing a goldfile constituent)
|
182 |
+
----------------------------------------------------
|
183 |
+
(number of sentences)
|
184 |
+
|
185 |
+
viii) No crossing = percentage of sentences which have 0 crossing brackets.
|
186 |
+
|
187 |
+
ix) 2 or less crossing = percentage of sentences which have <=2 crossing brackets.
|
188 |
+
|
189 |
+
x) Tagging accuracy = percentage of correct POS tags (but see [7].3 for exact
|
190 |
+
details of what is counted).
|
191 |
+
|
192 |
+
|
193 |
+
|
194 |
+
[5] HOW TO CREATE A GOLDFILE FROM THE PENN TREEBANK
|
195 |
+
|
196 |
+
|
197 |
+
The gold and parsed files are in a format similar to this:
|
198 |
+
|
199 |
+
(TOP (S (INTJ (RB No)) (, ,) (NP (PRP it)) (VP (VBD was) (RB n't) (NP (NNP Black) (NNP Monday))) (. .)))
|
200 |
+
|
201 |
+
To create a gold file from the treebank:
|
202 |
+
|
203 |
+
tgrep -wn '/.*/' | tgrep_proc.prl
|
204 |
+
|
205 |
+
will produce a goldfile in the required format. ("tgrep -wn '/.*/'" prints
|
206 |
+
parse trees, "tgrep_proc.prl" just skips blank lines).
|
207 |
+
|
208 |
+
For example, to produce a goldfile for section 23 of the treebank:
|
209 |
+
|
210 |
+
tgrep -wn '/.*/' | tail +90895 | tgrep_proc.prl | sed 2416q > sec23.gold
|
211 |
+
|
212 |
+
|
213 |
+
|
214 |
+
[6] THE PARAMETER (.prm) FILE
|
215 |
+
|
216 |
+
|
217 |
+
The .prm file sets options regarding the scoring method. COLLINS.prm gives
|
218 |
+
the same scoring behaviour as the scorer used in (Collins 97). The options
|
219 |
+
chosen were:
|
220 |
+
|
221 |
+
1) LABELED 1
|
222 |
+
|
223 |
+
to give labelled precision/recall figures, i.e. a constituent must have the
|
224 |
+
same span *and* label as a constituent in the goldfile.
|
225 |
+
|
226 |
+
2) DELETE_LABEL TOP
|
227 |
+
|
228 |
+
Don't count the "TOP" label (which is always given in the output of tgrep)
|
229 |
+
when scoring.
|
230 |
+
|
231 |
+
3) DELETE_LABEL -NONE-
|
232 |
+
|
233 |
+
Remove traces (and all constituents which dominate nothing but traces) when
|
234 |
+
scoring. For example
|
235 |
+
|
236 |
+
.... (VP (VBD reported) (SBAR (-NONE- 0) (S (-NONE- *T*-1)))) (. .)))
|
237 |
+
|
238 |
+
would be processed to give
|
239 |
+
|
240 |
+
.... (VP (VBD reported)) (. .)))
|
241 |
+
|
242 |
+
|
243 |
+
4)
|
244 |
+
DELETE_LABEL , -- for the purposes of scoring remove punctuation
|
245 |
+
DELETE_LABEL :
|
246 |
+
DELETE_LABEL ``
|
247 |
+
DELETE_LABEL ''
|
248 |
+
DELETE_LABEL .
|
249 |
+
|
250 |
+
5) DELETE_LABEL_FOR_LENGTH -NONE- -- don't include traces when calculating
|
251 |
+
the length of a sentence (important
|
252 |
+
when classifying a sentence as <=40
|
253 |
+
words or >40 words)
|
254 |
+
|
255 |
+
6) EQ_LABEL ADVP PRT
|
256 |
+
|
257 |
+
Count ADVP and PRT as being the same label when scoring.
|
258 |
+
|
259 |
+
|
260 |
+
|
261 |
+
|
262 |
+
[7] MORE DETAILS ABOUT THE SCORING ALGORITHM
|
263 |
+
|
264 |
+
|
265 |
+
1) The scorer initially processes the files to remove all nodes specified
|
266 |
+
by DELETE_LABEL in the .prm file. It also recursively removes nodes which
|
267 |
+
dominate nothing due to all their children being removed. For example, if
|
268 |
+
-NONE- is specified as a label to be deleted,
|
269 |
+
|
270 |
+
.... (VP (VBD reported) (SBAR (-NONE- 0) (S (-NONE- *T*-1)))) (. .)))
|
271 |
+
|
272 |
+
would be processed to give
|
273 |
+
|
274 |
+
.... (VP (VBD reported)) (. .)))
|
275 |
+
|
276 |
+
2) The scorer also removes all functional tags attached to non-terminals
|
277 |
+
(functional tags are prefixed with "-" or "=" in the treebank). For example
|
278 |
+
"NP-SBJ" is processed to give "NP", "NP=2" is changed to "NP".
|
279 |
+
|
280 |
+
|
281 |
+
3) Tagging accuracy counts tags for all words *except* any tags which are
|
282 |
+
deleted by a DELETE_LABEL specification in the .prm file. (For example, for
|
283 |
+
COLLINS.prm, punctuation tagged as "," ":" etc. would not be included).
|
284 |
+
|
285 |
+
4) When calculating the length of a sentence, all words with POS tags not
|
286 |
+
included in the "DELETE_LABEL_FOR_LENGTH" list in the .prm file are
|
287 |
+
counted. (For COLLINS.prm, only "-NONE-" is specified in this list, so
|
288 |
+
traces are removed before calculating the length of the sentence).
|
289 |
+
|
290 |
+
5) There are some subtleties in scoring when either the goldfile or parsed
|
291 |
+
file contains multiple constituents for the same span which have the same
|
292 |
+
non-terminal label. e.g. (NP (NP the man)) If the goldfile contains n
|
293 |
+
constituents for the same span, and the parsed file contains m constituents
|
294 |
+
with that nonterminal, the scorer works as follows:
|
295 |
+
|
296 |
+
i) If m>n, then the precision is n/m, recall is 100%
|
297 |
+
|
298 |
+
ii) If n>m, then the precision is 100%, recall is m/n.
|
299 |
+
|
300 |
+
iii) If n==m, recall and precision are both 100%.
|
parsing/EVALB/bug/bug.gld
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
(TOP (S (NP-SBJ (DT The) (NN Thy-1) (NN gene) (NN promoter) ) (VP (VBZ resembles) (NP (DT a) (`` ") (JJ housekeeping) ('' ") (NN promoter) ) (PP (IN in) (SBAR (IN that) (S (NP-SBJ-68 (PRP it) ) (VP-COOD (VP (VBZ is) (ADJP-PRD (JJ located) (PP (IN within) (NP (DT a) (JJ methylation-free) (NN island) )))) (, ,) (VP (VBZ lacks) (NP (DT a) (JJ canonical) (NN TATA) (NN box) )) (, ,) (CC and) (VP (VBZ displays) (NP (NN heterogeneity) ) (PP (IN in) (NP (NP (DT the) (JJ 5'-end) (NNS termini) ) (PP (IN of) (NP (DT the) (NN mRNA) )))))))))) (. .) ) )
|
2 |
+
(TOP (S (NP-SBJ (DT The) (JJ latter) (`` ") (NP (NP (JJ nuclear) (NN factor) ) (PP (IN for) (NP (VBN activated) (NN T) (NNS cells) ))) ('' ") ) (ADVP (RB likely) ) (VP (VBZ contributes) (PP (TO to) (NP (NP (DT the) (NN tissue) (NN specificity) ) (PP (IN of) (NP (NN IL-2) (NN gene) (NN expression) ))))) (. .) ) )
|
3 |
+
(TOP (S (ADVP (RB Thus) ) (, ,) (NP-SBJ (PRP we) ) (VP (VBD postulated) (SBAR-COOD (SBAR (IN that) (S (NP-SBJ (NP (DT the) (JJ circadian) (NN modification) ) (PP (IN of) (NP (NN GR) ))) (VP (VBD was) (ADJP-PRD (JJ independent) (PP (IN of) (NP-COOD (NP (NP (DT the) (JJ diurnal) (NNS fluctuations) ) (PP (IN in) (NP (NN plasma) (NN cortisol) (NN level) ))) (CC or) (NP (NP (DT the) (JJ circadian) (NNS variations) ) (PP (IN in) (NP (JJ environmental) (NN lighting) ))))))))) (CC and) (SBAR (IN that) (S (NP-SBJ-79 (DT the) (NN rhythmicity) ) (VP (MD might) (VP (VB be) (VP (VBN regulated) (NP (-NONE- *-79) ) (PP (IN by) (NP-LGS (NP (DT the) (`` ') (JJ circadian) (NN pacemaker) ('' ') ) (ADJP (JJ located) (PP (IN in) (NP (DT the) (JJ human) (JJ basal) (NN brain) )))))))))))) (. .) ) )
|
4 |
+
(TOP (S (NP-SBJ-70 (JJ Such) (NN transcription) (NNS factors) ) (VP (VBP play) (NP (DT a) (JJ key) (NN role) ) (PP (IN in) (NP (NP (DT the) (NN development) ) (PP (IN of) (NP (DT the) (JJ mature) (NN T-cell) (NN phenotype) )))) (PP (IN by) (S (NP-SBJ (-NONE- *-70) ) (VP (VBG functioning) (PP (IN as) (`` ') (NP (NP (JJ master) (NNS regulators) ) (PP (IN of) (NP (NN T-cell) (NN differentiation) ))) ('' ') ))))) (. .) ) )
|
5 |
+
(TOP (S (NP-SBJ (NP (DT The) (NN conversion) ) (PP (IN of) (NP (DT the) (NN TCEd) )) (PP (TO to) (NP (DT a) (`` ') (JJ perfect) ('' ') (NN NF-kB) (NN binding) (NN site) ))) (VP-COOD (VP (VBZ leads) (PP (TO to) (NP-19 (NP (DT a) (JJR tighter) (NN binding) ) (PP (IN of) (NP (NN NF-kB) )) (PP (TO to) (NP (NN TCEd) (NN DNA) ))))) (CC and) (, ,) (VP (PP (IN as) (NP (DT a) (JJ functional) (NN consequence) )) (, ,) (PP (TO to) (NP=19 (NP (DT the) (NN activity) ) (PP (IN of) (NP (DT the) (`` ') (VBN converted) ('' ') (NN TCEd) (NNS motifs) )) (PP (IN in) (NP (NN HeLa) (NNS cells) )))))) (. .) ) )
|
parsing/EVALB/bug/bug.rsl-new
ADDED
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Sent. Matched Bracket Cross Correct Tag
|
2 |
+
ID Len. Stat. Recal Prec. Bracket gold test Bracket Words Tags Accracy
|
3 |
+
============================================================================
|
4 |
+
1 37 0 77.27 65.38 17 22 26 5 34 27 79.41
|
5 |
+
2 21 0 69.23 64.29 9 13 14 2 20 16 80.00
|
6 |
+
3 47 0 80.00 82.35 28 35 34 4 44 40 90.91
|
7 |
+
4 26 0 35.29 37.50 6 17 16 8 25 18 72.00
|
8 |
+
5 44 0 42.31 33.33 11 26 33 17 38 28 73.68
|
9 |
+
============================================================================
|
10 |
+
62.83 57.72 71 113 123 0 161 129 80.12
|
11 |
+
=== Summary ===
|
12 |
+
|
13 |
+
-- All --
|
14 |
+
Number of sentence = 5
|
15 |
+
Number of Error sentence = 0
|
16 |
+
Number of Skip sentence = 0
|
17 |
+
Number of Valid sentence = 5
|
18 |
+
Bracketing Recall = 62.83
|
19 |
+
Bracketing Precision = 57.72
|
20 |
+
Bracketing FMeasure = 60.17
|
21 |
+
Complete match = 0.00
|
22 |
+
Average crossing = 7.20
|
23 |
+
No crossing = 0.00
|
24 |
+
2 or less crossing = 20.00
|
25 |
+
Tagging accuracy = 80.12
|
26 |
+
|
27 |
+
-- len<=40 --
|
28 |
+
Number of sentence = 3
|
29 |
+
Number of Error sentence = 0
|
30 |
+
Number of Skip sentence = 0
|
31 |
+
Number of Valid sentence = 3
|
32 |
+
Bracketing Recall = 61.54
|
33 |
+
Bracketing Precision = 57.14
|
34 |
+
Bracketing FMeasure = 59.26
|
35 |
+
Complete match = 0.00
|
36 |
+
Average crossing = 5.00
|
37 |
+
No crossing = 0.00
|
38 |
+
2 or less crossing = 33.33
|
39 |
+
Tagging accuracy = 77.22
|
parsing/EVALB/bug/bug.rsl-old
ADDED
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Sent. Matched Bracket Cross Correct Tag
|
2 |
+
ID Len. Stat. Recal Prec. Bracket gold test Bracket Words Tags Accracy
|
3 |
+
============================================================================
|
4 |
+
1 : Length unmatch (33|35)
|
5 |
+
1 37 1 0.00 0.00 0 0 0 0 0 0 0.00
|
6 |
+
2 : Length unmatch (19|21)
|
7 |
+
2 21 1 0.00 0.00 0 0 0 0 0 0 0.00
|
8 |
+
3 : Length unmatch (44|45)
|
9 |
+
3 47 1 0.00 0.00 0 0 0 0 0 0 0.00
|
10 |
+
4 : Length unmatch (24|26)
|
11 |
+
4 26 1 0.00 0.00 0 0 0 0 0 0 0.00
|
12 |
+
5 : Length unmatch (38|39)
|
13 |
+
5 44 1 0.00 0.00 0 0 0 0 0 0 0.00
|
14 |
+
============================================================================
|
15 |
+
0 0 0.00
|
16 |
+
|
17 |
+
=== Summary ===
|
18 |
+
|
19 |
+
-- All --
|
20 |
+
Number of sentence = 5
|
21 |
+
Number of Error sentence = 5
|
22 |
+
Number of Skip sentence = 0
|
23 |
+
Number of Valid sentence = 0
|
24 |
+
Bracketing Recall = 0.00
|
25 |
+
Bracketing Precision = 0.00
|
26 |
+
Bracketing FMeasure = nan
|
27 |
+
Complete match = 0.00
|
28 |
+
Average crossing = 0.00
|
29 |
+
No crossing = 0.00
|
30 |
+
2 or less crossing = 0.00
|
31 |
+
Tagging accuracy = 0.00
|
32 |
+
|
33 |
+
-- len<=40 --
|
34 |
+
Number of sentence = 3
|
35 |
+
Number of Error sentence = 3
|
36 |
+
Number of Skip sentence = 0
|
37 |
+
Number of Valid sentence = 0
|
38 |
+
Bracketing Recall = 0.00
|
39 |
+
Bracketing Precision = 0.00
|
40 |
+
Bracketing FMeasure = nan
|
41 |
+
Complete match = 0.00
|
42 |
+
Average crossing = 0.00
|
43 |
+
No crossing = 0.00
|
44 |
+
2 or less crossing = 0.00
|
45 |
+
Tagging accuracy = 0.00
|
parsing/EVALB/bug/bug.tst
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
(S1 (S (NP (DT The) (JJ Thy-1) (NN gene) (NN promoter)) (VP (VP (VBZ resembles) (NP (NP (DT a) (ADJP (CD ") (NN housekeeping)) (NN ") (NN promoter)) (SBAR (WHPP (IN in) (WHNP (WDT that))) (S (NP (PRP it)) (VP (VBZ is) (VP (VBN located) (PP (IN within) (NP (DT a) (JJ methylation-free) (NN island))))))))) (, ,) (VP (VBZ lacks) (NP (DT a) (JJ canonical) (NNP TATA) (NN box))) (, ,) (CC and) (VP (VBZ displays) (NP (NP (NN heterogeneity)) (PP (IN in) (NP (NP (DT the) (JJ 5'-end) (NNS termini)) (PP (IN of) (NP (DT the) (NN mRNA)))))))) (. .)))
|
2 |
+
(S1 (S (NP (NP (DT The) (JJ latter) (CD ") (JJ nuclear) (NN factor)) (PP (IN for) (NP (VBN activated) (NN T) (NNS cells)))) (VP (VBZ ") (ADJP (JJ likely) (S (VP (VBZ contributes) (PP (TO to) (NP (NP (DT the) (NN tissue) (NN specificity)) (PP (IN of) (NP (JJ IL-2) (NN gene) (NN expression))))))))) (. .)))
|
3 |
+
(S1 (S (ADVP (RB Thus)) (, ,) (NP (PRP we)) (VP (VBD postulated) (SBAR (SBAR (IN that) (S (NP (NP (DT the) (JJ circadian) (NN modification)) (PP (IN of) (NP (NNP GR)))) (VP (VBD was) (ADJP (JJ independent) (PP (IN of) (NP (DT the) (JJ diurnal) (NNS fluctuations)))) (PP (IN in) (NP (NP (NN plasma) (JJ cortisol) (NN level)) (CC or) (NP (NP (DT the) (JJ circadian) (NNS variations)) (PP (IN in) (NP (JJ environmental) (NN lighting))))))))) (CC and) (SBAR (IN that) (S (NP (DT the) (NN rhythmicity)) (VP (MD might) (VP (VB be) (VP (VBN regulated) (PP (IN by) (NP (DT the) ('' ') (NP (JJ circadian) (NN pacemaker) (POS ')) (VP (VBN located) (PP (IN in) (NP (DT the) (JJ human) (JJ basal) (NN brain))))))))))))) (. .)))
|
4 |
+
(S1 (S (NP (JJ Such) (NN transcription) (NNS factors)) (VP (VBP play) (NP (NP (DT a) (JJ key) (NN role)) (PP (IN in) (NP (NP (DT the) (NN development)) (PP (IN of) (NP (NP (DT the) (JJ mature) (JJ T-cell) (NN phenotype)) (PP (IN by) (NP (NP (NN functioning) (RB as) (POS ')) (NN master) (NNS regulators))))) (PP (IN of) (NP (JJ T-cell) (NN differentiation) (POS '))))))) (. .)))
|
5 |
+
(S1 (S (NP (NP (DT The) (NN conversion)) (PP (IN of) (NP (DT the)))) (VP (VBD TCEd) (PP (TO to) (NP (NP (DT a) ('' ') (JJ perfect) ('' ') (NN NF-kB)) (SBAR (S (NP (JJ binding) (NN site)) (VP (VBZ leads) (PP (TO to) (NP (NP (NP (DT a) (ADJP (RBR tighter) (JJ binding)) (PP (IN of) (NP (NP (NNS NF-kB)) (PP (PP (TO to) (NP (JJ TCEd) (NN DNA))) (CC and) (PP (, ,) (PP (IN as) (NP (DT a) (JJ functional) (NN consequence))) (, ,) (TO to) (NP (NP (DT the) (NN activity)) (PP (IN of) (NP (DT the)))))))) (POS ')) (JJ converted) ('' ') (JJ TCEd) (NNS motifs)) (PP (IN in) (NP (NNP HeLa) (NNS cells))))))))))) (. .)))
|
parsing/EVALB/evalb
ADDED
Binary file (59.6 kB). View file
|
|
parsing/EVALB/evalb.c
ADDED
@@ -0,0 +1,1537 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
/*****************************************************************/
|
2 |
+
/* evalb [-p param_file] [-dh] [-e n] gold-file test-file */
|
3 |
+
/* */
|
4 |
+
/* Evaluate bracketing in test-file against gold-file. */
|
5 |
+
/* Return recall, precision, tagging accuracy. */
|
6 |
+
/* */
|
7 |
+
/* <option> */
|
8 |
+
/* -p param_file parameter file */
|
9 |
+
/* -d debug mode */
|
10 |
+
/* -e n number of error to kill (default=10) */
|
11 |
+
/* -h help */
|
12 |
+
/* */
|
13 |
+
/* Satoshi Sekine (NYU) */
|
14 |
+
/* Mike Collins (UPenn) */
|
15 |
+
/* */
|
16 |
+
/* October.1997 */
|
17 |
+
/* */
|
18 |
+
/* Please refer to the README for update information */
|
19 |
+
/*****************************************************************/
|
20 |
+
|
21 |
+
#include <stdio.h>
|
22 |
+
#include <stdlib.h> //### added for exit, atoi decls
|
23 |
+
#include <ctype.h>
|
24 |
+
#include <string.h>
|
25 |
+
|
26 |
+
|
27 |
+
/* Internal Data format -------------------------------------------*/
|
28 |
+
/* */
|
29 |
+
/* (S (NP (NNX this)) (VP (VBX is) (NP (DT a) (NNX pen))) (SYM .)) */
|
30 |
+
/* */
|
31 |
+
/* wn=5 */
|
32 |
+
/* word label */
|
33 |
+
/* terminal[0] = this NNX */
|
34 |
+
/* terminal[1] = is VBX */
|
35 |
+
/* terminal[2] = a DT */
|
36 |
+
/* terminal[3] = pen NNX */
|
37 |
+
/* terminal[4] = . SYM */
|
38 |
+
/* */
|
39 |
+
/* bn=4 */
|
40 |
+
/* start end label */
|
41 |
+
/* bracket[0] = 0 5 S */
|
42 |
+
/* bracket[1] = 0 0 NP */
|
43 |
+
/* bracket[2] = 1 4 VP */
|
44 |
+
/* bracket[3] = 2 4 NP */
|
45 |
+
/* */
|
46 |
+
/* matched bracketing */
|
47 |
+
/* Recall = --------------------------- */
|
48 |
+
/* # of bracket in ref-data */
|
49 |
+
/* */
|
50 |
+
/* matched bracketing */
|
51 |
+
/* Precision = ------------------------ */
|
52 |
+
/* # of bracket in test-data */
|
53 |
+
/* */
|
54 |
+
/*-----------------------------------------------------------------*/
|
55 |
+
|
56 |
+
/******************/
|
57 |
+
/* constant macro */
|
58 |
+
/******************/
|
59 |
+
|
60 |
+
#define MAX_SENT_LEN 5000
|
61 |
+
#define MAX_WORD_IN_SENT 200
|
62 |
+
#define MAX_BRACKET_IN_SENT 200
|
63 |
+
#define MAX_WORD_LEN 100
|
64 |
+
#define MAX_LABEL_LEN 30
|
65 |
+
#define MAX_QUOTE_TERM 20
|
66 |
+
|
67 |
+
#define MAX_DELETE_LABEL 100
|
68 |
+
#define MAX_EQ_LABEL 100
|
69 |
+
#define MAX_EQ_WORD 100
|
70 |
+
|
71 |
+
#define MAX_LINE_LEN 500
|
72 |
+
|
73 |
+
#define DEFAULT_MAX_ERROR 10
|
74 |
+
#define DEFAULT_CUT_LEN 40
|
75 |
+
|
76 |
+
/*************/
|
77 |
+
/* structure */
|
78 |
+
/*************/
|
79 |
+
|
80 |
+
typedef struct ss_terminal {
|
81 |
+
char word[MAX_WORD_LEN];
|
82 |
+
char label[MAX_LABEL_LEN];
|
83 |
+
int result; /* 0:unmatch, 1:match, 9:undef */
|
84 |
+
} s_terminal;
|
85 |
+
|
86 |
+
typedef struct ss_term_ind {
|
87 |
+
s_terminal term;
|
88 |
+
int index;
|
89 |
+
int bracket;
|
90 |
+
int endslen;
|
91 |
+
int ends[MAX_BRACKET_IN_SENT];
|
92 |
+
} s_term_ind;
|
93 |
+
|
94 |
+
typedef struct ss_bracket {
|
95 |
+
int start;
|
96 |
+
int end;
|
97 |
+
unsigned int buf_start;
|
98 |
+
unsigned int buf_end;
|
99 |
+
char label[MAX_LABEL_LEN];
|
100 |
+
int result; /* 0: unmatch, 1:match, 5:delete 9:undef */
|
101 |
+
} s_bracket;
|
102 |
+
|
103 |
+
|
104 |
+
typedef struct ss_equiv {
|
105 |
+
char *s1;
|
106 |
+
char *s2;
|
107 |
+
} s_equiv;
|
108 |
+
|
109 |
+
|
110 |
+
/****************************/
|
111 |
+
/* global variables */
|
112 |
+
/* gold-data: suffix = 1 */
|
113 |
+
/* test-data: suffix = 2 */
|
114 |
+
/****************************/
|
115 |
+
|
116 |
+
/*---------------*/
|
117 |
+
/* Sentence data */
|
118 |
+
/*---------------*/
|
119 |
+
int wn1, wn2; /* number of words in sentence */
|
120 |
+
int r_wn1; /* number of words in sentence */
|
121 |
+
/* which only ignores labels in */
|
122 |
+
/* DELETE_LABEL_FOR_LENGTH */
|
123 |
+
|
124 |
+
s_terminal terminal1[MAX_WORD_IN_SENT]; /* terminal information */
|
125 |
+
s_terminal terminal2[MAX_WORD_IN_SENT];
|
126 |
+
|
127 |
+
s_term_ind quotterm1[MAX_QUOTE_TERM]; /* special terminals ("'","POS") */
|
128 |
+
s_term_ind quotterm2[MAX_QUOTE_TERM];
|
129 |
+
|
130 |
+
int bn1, bn2; /* number of brackets */
|
131 |
+
|
132 |
+
int r_bn1, r_bn2; /* number of brackets */
|
133 |
+
/* after deletion */
|
134 |
+
|
135 |
+
s_bracket bracket1[MAX_BRACKET_IN_SENT]; /* bracket information */
|
136 |
+
s_bracket bracket2[MAX_BRACKET_IN_SENT];
|
137 |
+
|
138 |
+
|
139 |
+
/*------------*/
|
140 |
+
/* Total data */
|
141 |
+
/*------------*/
|
142 |
+
int TOTAL_bn1, TOTAL_bn2, TOTAL_match; /* total number of brackets */
|
143 |
+
int TOTAL_sent; /* No. of sentence */
|
144 |
+
int TOTAL_error_sent; /* No. of error sentence */
|
145 |
+
int TOTAL_skip_sent; /* No. of skip sentence */
|
146 |
+
int TOTAL_comp_sent; /* No. of complete match sent */
|
147 |
+
int TOTAL_word; /* total number of word */
|
148 |
+
int TOTAL_crossing; /* total crossing */
|
149 |
+
int TOTAL_no_crossing; /* no crossing sentence */
|
150 |
+
int TOTAL_2L_crossing; /* 2 or less crossing sentence */
|
151 |
+
int TOTAL_correct_tag; /* total correct tagging */
|
152 |
+
|
153 |
+
int TOT_cut_len = DEFAULT_CUT_LEN; /* Cut-off length in statistics */
|
154 |
+
|
155 |
+
/* data for sentences with len <= CUT_LEN */
|
156 |
+
/* Historically it was 40. */
|
157 |
+
int TOT40_bn1, TOT40_bn2, TOT40_match; /* total number of brackets */
|
158 |
+
int TOT40_sent; /* No. of sentence */
|
159 |
+
int TOT40_error_sent; /* No. of error sentence */
|
160 |
+
int TOT40_skip_sent; /* No. of skip sentence */
|
161 |
+
int TOT40_comp_sent; /* No. of complete match sent */
|
162 |
+
int TOT40_word; /* total number of word */
|
163 |
+
int TOT40_crossing; /* total crossing */
|
164 |
+
int TOT40_no_crossing; /* no crossing sentence */
|
165 |
+
int TOT40_2L_crossing; /* 2 or less crossing sentence */
|
166 |
+
int TOT40_correct_tag; /* total correct tagging */
|
167 |
+
|
168 |
+
/*------------*/
|
169 |
+
/* miscellaneous */
|
170 |
+
/*------------*/
|
171 |
+
int Line; /* line number */
|
172 |
+
int Error_count = 0; /* Error count */
|
173 |
+
int Status; /* Result status for each sent */
|
174 |
+
/* 0: OK, 1: skip, 2: error */
|
175 |
+
|
176 |
+
/*-------------------*/
|
177 |
+
/* stack manipulation */
|
178 |
+
/*-------------------*/
|
179 |
+
int stack_top;
|
180 |
+
int stack[MAX_BRACKET_IN_SENT];
|
181 |
+
|
182 |
+
/************************************************************/
|
183 |
+
/* User parameters which can be specified in parameter file */
|
184 |
+
/************************************************************/
|
185 |
+
|
186 |
+
/*------------------------------------------*/
|
187 |
+
/* Debug mode */
|
188 |
+
/* print out data for individual sentence */
|
189 |
+
/*------------------------------------------*/
|
190 |
+
int DEBUG=0;
|
191 |
+
|
192 |
+
/*------------------------------------------*/
|
193 |
+
/* MAX error */
|
194 |
+
/* Number of error to stop the process. */
|
195 |
+
/* This is useful if there could be */
|
196 |
+
/* tokenization error. */
|
197 |
+
/* The process will stop when this number*/
|
198 |
+
/* of errors are accumulated. */
|
199 |
+
/*------------------------------------------*/
|
200 |
+
int Max_error = DEFAULT_MAX_ERROR;
|
201 |
+
|
202 |
+
/*------------------------------------------*/
|
203 |
+
/* Cut-off length for statistics */
|
204 |
+
/* int TOT_cut_len = DEFAULT_CUT_LEN; */
|
205 |
+
/* (Defined above) */
|
206 |
+
/*------------------------------------------*/
|
207 |
+
|
208 |
+
|
209 |
+
/*------------------------------------------*/
|
210 |
+
/* unlabeled or labeled bracketing */
|
211 |
+
/* 0: unlabeled bracketing */
|
212 |
+
/* 1: labeled bracketing */
|
213 |
+
/*------------------------------------------*/
|
214 |
+
int F_label = 1;
|
215 |
+
|
216 |
+
/*------------------------------------------*/
|
217 |
+
/* Delete labels */
|
218 |
+
/* list of labels to be ignored. */
|
219 |
+
/* If it is a pre-terminal label, delete */
|
220 |
+
/* the word along with the brackets. */
|
221 |
+
/* If it is a non-terminal label, just */
|
222 |
+
/* delete the brackets (don't delete */
|
223 |
+
/* children). */
|
224 |
+
/*------------------------------------------*/
|
225 |
+
char *Delete_label[MAX_DELETE_LABEL];
|
226 |
+
int Delete_label_n = 0;
|
227 |
+
|
228 |
+
/*------------------------------------------*/
|
229 |
+
/* Delete labels for length calculation */
|
230 |
+
/* list of labels to be ignored for */
|
231 |
+
/* length calculation purpose */
|
232 |
+
/*------------------------------------------*/
|
233 |
+
char *Delete_label_for_length[MAX_DELETE_LABEL];
|
234 |
+
int Delete_label_for_length_n = 0;
|
235 |
+
|
236 |
+
/*------------------------------------------*/
|
237 |
+
/* Labels to be considered for misquote */
|
238 |
+
/* (could be possessive or quote) */
|
239 |
+
/*------------------------------------------*/
|
240 |
+
char *Quote_term[MAX_QUOTE_TERM];
|
241 |
+
int Quote_term_n = 0;
|
242 |
+
|
243 |
+
/*------------------------------------------*/
|
244 |
+
/* Equivalent labels, words */
|
245 |
+
/* the pairs are considered equivalent */
|
246 |
+
/* This is non-directional. */
|
247 |
+
/*------------------------------------------*/
|
248 |
+
s_equiv EQ_label[MAX_EQ_LABEL];
|
249 |
+
int EQ_label_n = 0;
|
250 |
+
|
251 |
+
s_equiv EQ_word[MAX_EQ_WORD];
|
252 |
+
int EQ_word_n = 0;
|
253 |
+
|
254 |
+
|
255 |
+
|
256 |
+
/************************/
|
257 |
+
/* Function return-type */
|
258 |
+
/************************/
|
259 |
+
int main();
|
260 |
+
void init_global();
|
261 |
+
void print_head();
|
262 |
+
void init();
|
263 |
+
void read_parameter_file();
|
264 |
+
void set_param();
|
265 |
+
int narg();
|
266 |
+
int read_line();
|
267 |
+
|
268 |
+
void pushb();
|
269 |
+
int popb();
|
270 |
+
int stackempty();
|
271 |
+
|
272 |
+
void calc_result(unsigned char *buf1,unsigned char *buf);
|
273 |
+
void fix_quote();
|
274 |
+
void reinsert_term();
|
275 |
+
void massage_data();
|
276 |
+
void modify_label();
|
277 |
+
void individual_result();
|
278 |
+
void print_total();
|
279 |
+
void dsp_info();
|
280 |
+
int is_terminator();
|
281 |
+
int is_deletelabel();
|
282 |
+
int is_deletelabel_for_length();
|
283 |
+
int is_quote_term();
|
284 |
+
int word_comp();
|
285 |
+
int label_comp();
|
286 |
+
|
287 |
+
void Error();
|
288 |
+
void Fatal();
|
289 |
+
void Usage();
|
290 |
+
|
291 |
+
/* ### provided by std headers
|
292 |
+
int fprintf();
|
293 |
+
int printf();
|
294 |
+
int atoi();
|
295 |
+
int fclose();
|
296 |
+
int sscanf();
|
297 |
+
*/
|
298 |
+
|
299 |
+
/***********/
|
300 |
+
/* program */
|
301 |
+
/***********/
|
302 |
+
#define ARG_CHECK(st) if(!(*++(*argv) || (--argc && *++argv))){ \
|
303 |
+
fprintf(stderr,"Missing argument: %s\n",st); \
|
304 |
+
}
|
305 |
+
|
306 |
+
int
|
307 |
+
main(argc,argv)
|
308 |
+
int argc;
|
309 |
+
char *argv[];
|
310 |
+
{
|
311 |
+
char *filename1, *filename2;
|
312 |
+
FILE *fd1, *fd2;
|
313 |
+
unsigned char buff[5000];
|
314 |
+
unsigned char buff1[5000];
|
315 |
+
|
316 |
+
filename1=NULL;
|
317 |
+
filename2=NULL;
|
318 |
+
|
319 |
+
for(argc--,argv++;argc>0;argc--,argv++){
|
320 |
+
if(**argv == '-'){
|
321 |
+
while(*++(*argv)){
|
322 |
+
switch(**argv){
|
323 |
+
|
324 |
+
case 'h': /* help */
|
325 |
+
Usage();
|
326 |
+
exit(1);
|
327 |
+
|
328 |
+
case 'd': /* debug mode */
|
329 |
+
DEBUG = 1;
|
330 |
+
goto nextarg;
|
331 |
+
|
332 |
+
case 'D': /* debug mode */
|
333 |
+
DEBUG = 2;
|
334 |
+
goto nextarg;
|
335 |
+
|
336 |
+
case 'c': /* cut-off length */
|
337 |
+
ARG_CHECK("cut-off length for statistices");
|
338 |
+
TOT_cut_len = atoi(*argv);
|
339 |
+
goto nextarg;
|
340 |
+
|
341 |
+
case 'e': /* max error */
|
342 |
+
ARG_CHECK("number of error to kill");
|
343 |
+
Max_error = atoi(*argv);
|
344 |
+
goto nextarg;
|
345 |
+
|
346 |
+
case 'p': /* parameter file */
|
347 |
+
ARG_CHECK("parameter file");
|
348 |
+
read_parameter_file(*argv);
|
349 |
+
goto nextarg;
|
350 |
+
|
351 |
+
default:
|
352 |
+
Usage();
|
353 |
+
exit(0);
|
354 |
+
}
|
355 |
+
}
|
356 |
+
} else {
|
357 |
+
if(filename1==NULL){
|
358 |
+
filename1 = *argv;
|
359 |
+
}else if(filename2==NULL){
|
360 |
+
filename2 = *argv;
|
361 |
+
}
|
362 |
+
}
|
363 |
+
nextarg: continue;
|
364 |
+
}
|
365 |
+
|
366 |
+
init_global();
|
367 |
+
|
368 |
+
|
369 |
+
if((fd1 = fopen(filename1,"r"))==NULL){
|
370 |
+
Fatal("Can't open gold file (%s)\n",filename1);
|
371 |
+
}
|
372 |
+
if((fd2 = fopen(filename2,"r"))==NULL){
|
373 |
+
Fatal("Can't open test file (%s)\n",filename2);
|
374 |
+
}
|
375 |
+
|
376 |
+
print_head();
|
377 |
+
|
378 |
+
for(Line=1;fgets(buff,5000,fd1)!=NULL;Line++){
|
379 |
+
|
380 |
+
init();
|
381 |
+
|
382 |
+
/* READ 1 */
|
383 |
+
r_wn1 = read_line(buff,terminal1,quotterm1,&wn1,bracket1,&bn1);
|
384 |
+
|
385 |
+
strcpy(buff1,buff);
|
386 |
+
|
387 |
+
/* READ 2 */
|
388 |
+
if(fgets(buff,5000,fd2)==NULL){
|
389 |
+
Error("Number of lines unmatch (too many lines in gold file)\n");
|
390 |
+
break;
|
391 |
+
}
|
392 |
+
|
393 |
+
read_line(buff,terminal2,quotterm2,&wn2,bracket2,&bn2);
|
394 |
+
|
395 |
+
/* Calculate result and print it */
|
396 |
+
calc_result(buff1,buff);
|
397 |
+
|
398 |
+
if(DEBUG>=1){
|
399 |
+
dsp_info();
|
400 |
+
}
|
401 |
+
}
|
402 |
+
|
403 |
+
if(fgets(buff,5000,fd2)!=NULL){
|
404 |
+
Error("Number of lines unmatch (too many lines in test file)\n");
|
405 |
+
}
|
406 |
+
|
407 |
+
print_total();
|
408 |
+
|
409 |
+
return (0);
|
410 |
+
}
|
411 |
+
|
412 |
+
|
413 |
+
/*-----------------------------*/
|
414 |
+
/* initialize global variables */
|
415 |
+
/*-----------------------------*/
|
416 |
+
void
|
417 |
+
init_global()
|
418 |
+
{
|
419 |
+
TOTAL_bn1 = TOTAL_bn2 = TOTAL_match = 0;
|
420 |
+
TOTAL_sent = TOTAL_error_sent = TOTAL_skip_sent = TOTAL_comp_sent = 0;
|
421 |
+
TOTAL_word = TOTAL_correct_tag = 0;
|
422 |
+
TOTAL_crossing = 0;
|
423 |
+
TOTAL_no_crossing = TOTAL_2L_crossing = 0;
|
424 |
+
|
425 |
+
TOT40_bn1 = TOT40_bn2 = TOT40_match = 0;
|
426 |
+
TOT40_sent = TOT40_error_sent = TOT40_skip_sent = TOT40_comp_sent = 0;
|
427 |
+
TOT40_word = TOT40_correct_tag = 0;
|
428 |
+
TOT40_crossing = 0;
|
429 |
+
TOT40_no_crossing = TOT40_2L_crossing = 0;
|
430 |
+
|
431 |
+
}
|
432 |
+
|
433 |
+
|
434 |
+
/*------------------*/
|
435 |
+
/* print head title */
|
436 |
+
/*------------------*/
|
437 |
+
/*
 * Write the three-line header of the per-sentence result table
 * (two lines of column titles and a separator rule) to standard output.
 */
void
print_head()
{
    static const char *const header[] = {
        " Sent. Matched Bracket Cross Correct Tag\n",
        " ID Len. Stat. Recal Prec. Bracket gold test Bracket Words Tags Accracy\n",
        "============================================================================\n"
    };
    size_t row;

    for(row=0; row<sizeof(header)/sizeof(header[0]); row++){
        fputs(header[row],stdout);
    }
}
|
444 |
+
|
445 |
+
|
446 |
+
/*-----------------------------------------------*/
|
447 |
+
/* initialization at each individual computation */
|
448 |
+
/*-----------------------------------------------*/
|
449 |
+
void
|
450 |
+
init()
|
451 |
+
{
|
452 |
+
int i;
|
453 |
+
|
454 |
+
wn1 = 0;
|
455 |
+
wn2 = 0;
|
456 |
+
bn1 = 0;
|
457 |
+
bn2 = 0;
|
458 |
+
r_bn1 = 0;
|
459 |
+
r_bn2 = 0;
|
460 |
+
|
461 |
+
for(i=0;i<MAX_WORD_IN_SENT;i++){
|
462 |
+
terminal1[i].word[0] = '\0';
|
463 |
+
terminal1[i].label[0] = '\0';
|
464 |
+
terminal1[i].result = 9;
|
465 |
+
terminal2[i].word[0] = '\0';
|
466 |
+
terminal2[i].label[0] = '\0';
|
467 |
+
terminal2[i].result = 9;
|
468 |
+
}
|
469 |
+
|
470 |
+
for(i=0;i<MAX_QUOTE_TERM;i++){
|
471 |
+
quotterm1[i].term.word[0] = '\0';
|
472 |
+
quotterm1[i].term.label[0] = '\0';
|
473 |
+
quotterm1[i].term.result = 9;
|
474 |
+
quotterm1[i].index = -1;
|
475 |
+
quotterm1[i].bracket = -1;
|
476 |
+
quotterm2[i].term.word[0] = '\0';
|
477 |
+
quotterm2[i].term.label[0] = '\0';
|
478 |
+
quotterm2[i].term.result = 9;
|
479 |
+
quotterm2[i].index = -1;
|
480 |
+
quotterm2[i].bracket = -1;
|
481 |
+
}
|
482 |
+
|
483 |
+
for(i=0;i<MAX_BRACKET_IN_SENT;i++){
|
484 |
+
bracket1[i].start = -1;
|
485 |
+
bracket1[i].end = -1;
|
486 |
+
bracket1[i].label[0] = '\0';
|
487 |
+
bracket1[i].result = 9;
|
488 |
+
bracket2[i].start = -1;
|
489 |
+
bracket2[i].end = -1;
|
490 |
+
bracket2[i].label[0] = '\0';
|
491 |
+
bracket2[i].result = 9;
|
492 |
+
}
|
493 |
+
|
494 |
+
Status = 0;
|
495 |
+
}
|
496 |
+
|
497 |
+
/*----------------*/
|
498 |
+
/* parameter file */
|
499 |
+
/*----------------*/
|
500 |
+
void
|
501 |
+
read_parameter_file(filename)
|
502 |
+
char *filename;
|
503 |
+
{
|
504 |
+
char buff[MAX_LINE_LEN];
|
505 |
+
FILE *fd;
|
506 |
+
int line;
|
507 |
+
int i;
|
508 |
+
|
509 |
+
if((fd=fopen(filename,"r"))==NULL){
|
510 |
+
Fatal("Can't open parameter file (%s)\n",filename);
|
511 |
+
}
|
512 |
+
|
513 |
+
for(line=1;fgets(buff,MAX_LINE_LEN,fd)!=NULL;line++){
|
514 |
+
|
515 |
+
/* clean up the tail and find unvalid line */
|
516 |
+
/*-----------------------------------------*/
|
517 |
+
for(i=strlen(buff)-1;i>0 && (isspace(buff[i]) || buff[i]=='\n');i--){
|
518 |
+
buff[i]='\0';
|
519 |
+
}
|
520 |
+
if(buff[0]=='#' || /* comment-line */
|
521 |
+
strlen(buff)<3){ /* too short, just ignore */
|
522 |
+
continue;
|
523 |
+
}
|
524 |
+
|
525 |
+
/* place the parameter and value */
|
526 |
+
/*-------------------------------*/
|
527 |
+
for(i=0;!isspace(buff[i]);i++);
|
528 |
+
for(;isspace(buff[i]) && buff[i]!='\0';i++);
|
529 |
+
if(buff[i]=='\0'){
|
530 |
+
fprintf(stderr,"Empty value in parameter file (%d)\n",line);
|
531 |
+
}
|
532 |
+
|
533 |
+
/* set parameter and value */
|
534 |
+
/*-------------------------*/
|
535 |
+
set_param(buff,buff+i);
|
536 |
+
}
|
537 |
+
|
538 |
+
fclose(fd);
|
539 |
+
}
|
540 |
+
|
541 |
+
|
542 |
+
#define STRNCMP(s) (strncmp(param,s,strlen(s))==0 && \
|
543 |
+
(param[strlen(s)]=='\0' || isspace(param[strlen(s)])))
|
544 |
+
|
545 |
+
|
546 |
+
void
|
547 |
+
set_param(param,value)
|
548 |
+
char *param, *value;
|
549 |
+
{
|
550 |
+
char l1[MAX_LABEL_LEN], l2[MAX_LABEL_LEN];
|
551 |
+
|
552 |
+
if(STRNCMP("DEBUG")){
|
553 |
+
|
554 |
+
DEBUG = atoi(value);
|
555 |
+
|
556 |
+
}else if(STRNCMP("MAX_ERROR")){
|
557 |
+
|
558 |
+
Max_error = atoi(value);
|
559 |
+
|
560 |
+
}else if(STRNCMP("CUTOFF_LEN")){
|
561 |
+
|
562 |
+
TOT_cut_len = atoi(value);
|
563 |
+
|
564 |
+
}else if(STRNCMP("LABELED")){
|
565 |
+
|
566 |
+
F_label = atoi(value);
|
567 |
+
|
568 |
+
}else if(STRNCMP("DELETE_LABEL")){
|
569 |
+
|
570 |
+
Delete_label[Delete_label_n] = (char *)malloc(strlen(value)+1);
|
571 |
+
strcpy(Delete_label[Delete_label_n],value);
|
572 |
+
Delete_label_n++;
|
573 |
+
|
574 |
+
}else if(STRNCMP("DELETE_LABEL_FOR_LENGTH")){
|
575 |
+
|
576 |
+
Delete_label_for_length[Delete_label_for_length_n] = (char *)malloc(strlen(value)+1);
|
577 |
+
strcpy(Delete_label_for_length[Delete_label_for_length_n],value);
|
578 |
+
Delete_label_for_length_n++;
|
579 |
+
|
580 |
+
}else if(STRNCMP("QUOTE_LABEL")){
|
581 |
+
|
582 |
+
Quote_term[Quote_term_n] = (char *)malloc(strlen(value)+1);
|
583 |
+
strcpy(Quote_term[Quote_term_n],value);
|
584 |
+
Quote_term_n++;
|
585 |
+
|
586 |
+
}else if(STRNCMP("EQ_LABEL")){
|
587 |
+
|
588 |
+
if(narg(value)!=2){
|
589 |
+
fprintf(stderr,"EQ_LABEL requires two values\n");
|
590 |
+
return;
|
591 |
+
}
|
592 |
+
sscanf(value,"%s %s",l1,l2);
|
593 |
+
EQ_label[EQ_label_n].s1 = (char *)malloc(strlen(l1)+1);
|
594 |
+
strcpy(EQ_label[EQ_label_n].s1,l1);
|
595 |
+
EQ_label[EQ_label_n].s2 = (char *)malloc(strlen(l2)+1);
|
596 |
+
strcpy(EQ_label[EQ_label_n].s2,l2);
|
597 |
+
EQ_label_n++;
|
598 |
+
|
599 |
+
}else if(STRNCMP("EQ_WORD")){
|
600 |
+
|
601 |
+
if(narg(value)!=2){
|
602 |
+
fprintf(stderr,"EQ_WORD requires two values\n");
|
603 |
+
return;
|
604 |
+
}
|
605 |
+
sscanf(value,"%s %s",l1,l2);
|
606 |
+
EQ_word[EQ_word_n].s1 = (char *)malloc(strlen(l1)+1);
|
607 |
+
strcpy(EQ_word[EQ_word_n].s1,l1);
|
608 |
+
EQ_word[EQ_word_n].s2 = (char *)malloc(strlen(l2)+1);
|
609 |
+
strcpy(EQ_word[EQ_word_n].s2,l2);
|
610 |
+
EQ_word_n++;
|
611 |
+
|
612 |
+
}else{
|
613 |
+
|
614 |
+
fprintf(stderr,"Unknown keyword (%s) in parameter file\n",param);
|
615 |
+
|
616 |
+
}
|
617 |
+
}
|
618 |
+
|
619 |
+
|
620 |
+
/*
 * Count the white-space separated tokens in the string `s`.
 * Leading, trailing and repeated white space is ignored; an empty or
 * all-blank string yields 0.
 *
 * Characters are converted to unsigned char before being passed to
 * isspace(): calling it with a negative plain-char value (any byte
 * >= 0x80 where char is signed) is undefined behaviour.
 */
int
narg(char *s)
{
    int n = 0;

    while(*s!='\0'){
        /* skip the blanks in front of the next token */
        while(isspace((unsigned char)*s)){
            s++;
        }
        if(*s=='\0'){
            break;
        }
        n++;
        /* skip the token itself */
        while(*s!='\0' && !isspace((unsigned char)*s)){
            s++;
        }
    }

    return(n);
}
|
641 |
+
|
642 |
+
/*-----------------------------*/
|
643 |
+
/* Read line and gather data. */
|
644 |
+
/* Return length of sentence. */
|
645 |
+
/*-----------------------------*/
|
646 |
+
int
|
647 |
+
read_line(buff, terminal, quotterm, wn, bracket, bn)
|
648 |
+
char *buff;
|
649 |
+
s_terminal terminal[];
|
650 |
+
s_term_ind quotterm[];
|
651 |
+
int *wn;
|
652 |
+
s_bracket bracket[];
|
653 |
+
int *bn;
|
654 |
+
{
|
655 |
+
char *p, *q, label[MAX_LABEL_LEN], word[MAX_WORD_LEN];
|
656 |
+
int qt; /* quote term counter */
|
657 |
+
int wid, bid; /* word ID, bracket ID */
|
658 |
+
int n; /* temporary remembering the position */
|
659 |
+
int b; /* temporary remembering bid */
|
660 |
+
int i;
|
661 |
+
int len; /* length of the sentence */
|
662 |
+
|
663 |
+
len = 0;
|
664 |
+
stack_top=0;
|
665 |
+
|
666 |
+
for(p=buff,qt=0,wid=0,bid=0;*p!='\0';){
|
667 |
+
|
668 |
+
if(isspace(*p)){
|
669 |
+
p++;
|
670 |
+
continue;
|
671 |
+
|
672 |
+
/* open bracket */
|
673 |
+
/*--------------*/
|
674 |
+
}else if(*p=='('){
|
675 |
+
|
676 |
+
n=wid;
|
677 |
+
for(p++,i=0;!is_terminator(*p);p++,i++){
|
678 |
+
label[i]=*p;
|
679 |
+
}
|
680 |
+
label[i]='\0';
|
681 |
+
|
682 |
+
/* Find terminals */
|
683 |
+
q = p;
|
684 |
+
if(isspace(*q)){
|
685 |
+
for(q++;isspace(*q);q++);
|
686 |
+
for(i=0;!is_terminator(*q);q++,i++){
|
687 |
+
word[i]=*q;
|
688 |
+
}
|
689 |
+
word[i]='\0';
|
690 |
+
|
691 |
+
/* compute length */
|
692 |
+
if(*q==')' && !is_deletelabel_for_length(label)==1){
|
693 |
+
len++;
|
694 |
+
}
|
695 |
+
if (DEBUG>1)
|
696 |
+
printf("label=%s, word=%s, wid=%d\n",label,word,wid);
|
697 |
+
/* quote terminal */
|
698 |
+
if(*q==')' && is_quote_term(label,word)==1){
|
699 |
+
strcpy(quotterm[qt].term.word,word);
|
700 |
+
strcpy(quotterm[qt].term.label,label);
|
701 |
+
quotterm[qt].index = wid;
|
702 |
+
quotterm[qt].bracket = bid;
|
703 |
+
quotterm[qt].endslen = stack_top;
|
704 |
+
//quotterm[qt].ends = (int*)malloc(stack_top*sizeof(int));
|
705 |
+
memcpy(quotterm[qt].ends,stack,stack_top*sizeof(int));
|
706 |
+
qt++;
|
707 |
+
}
|
708 |
+
|
709 |
+
/* delete terminal */
|
710 |
+
if(*q==')' && is_deletelabel(label)==1){
|
711 |
+
p = q+1;
|
712 |
+
continue;
|
713 |
+
|
714 |
+
/* valid terminal */
|
715 |
+
}else if(*q==')'){
|
716 |
+
strcpy(terminal[wid].word,word);
|
717 |
+
strcpy(terminal[wid].label,label);
|
718 |
+
wid++;
|
719 |
+
p = q+1;
|
720 |
+
continue;
|
721 |
+
|
722 |
+
/* error */
|
723 |
+
}else if(*q!='('){
|
724 |
+
Error("More than two elements in a bracket\n");
|
725 |
+
}
|
726 |
+
}
|
727 |
+
|
728 |
+
/* otherwise non-terminal label */
|
729 |
+
bracket[bid].start = wid;
|
730 |
+
bracket[bid].buf_start = p-buff;
|
731 |
+
strcpy(bracket[bid].label,label);
|
732 |
+
pushb(bid);
|
733 |
+
bid++;
|
734 |
+
|
735 |
+
/* close bracket */
|
736 |
+
/*---------------*/
|
737 |
+
}else if(*p==')'){
|
738 |
+
|
739 |
+
b = popb();
|
740 |
+
bracket[b].end = wid;
|
741 |
+
bracket[b].buf_end = p-buff;
|
742 |
+
p++;
|
743 |
+
|
744 |
+
/* error */
|
745 |
+
/*-------*/
|
746 |
+
}else{
|
747 |
+
|
748 |
+
Error("Reading sentence\n");
|
749 |
+
}
|
750 |
+
}
|
751 |
+
|
752 |
+
if(!stackempty()){
|
753 |
+
Error("Bracketing is unbalanced (too many open bracket)\n");
|
754 |
+
}
|
755 |
+
|
756 |
+
*wn = wid;
|
757 |
+
*bn = bid;
|
758 |
+
|
759 |
+
return(len);
|
760 |
+
}
|
761 |
+
|
762 |
+
|
763 |
+
/*----------------------*/
|
764 |
+
/* stack operation */
|
765 |
+
/* for bracketing pairs */
|
766 |
+
/*----------------------*/
|
767 |
+
void
|
768 |
+
pushb(item)
|
769 |
+
int item;
|
770 |
+
{
|
771 |
+
stack[stack_top++]=item;
|
772 |
+
}
|
773 |
+
|
774 |
+
int
|
775 |
+
popb()
|
776 |
+
{
|
777 |
+
int item;
|
778 |
+
|
779 |
+
item = stack[stack_top-1];
|
780 |
+
|
781 |
+
if(stack_top-- < 0){
|
782 |
+
Error("Bracketing unbalance (too many close bracket)\n");
|
783 |
+
}
|
784 |
+
return(item);
|
785 |
+
}
|
786 |
+
|
787 |
+
int
|
788 |
+
stackempty()
|
789 |
+
{
|
790 |
+
if(stack_top==0){
|
791 |
+
return(1);
|
792 |
+
}else{
|
793 |
+
return(0);
|
794 |
+
}
|
795 |
+
}
|
796 |
+
|
797 |
+
|
798 |
+
/*------------------*/
|
799 |
+
/* calculate result */
|
800 |
+
/*------------------*/
|
801 |
+
void
|
802 |
+
calc_result(unsigned char *buf1,unsigned char *buf)
|
803 |
+
{
|
804 |
+
int i, j, l;
|
805 |
+
int match, crossing, correct_tag;
|
806 |
+
|
807 |
+
int last_i = -1;
|
808 |
+
|
809 |
+
char my_buf[1000];
|
810 |
+
int match_found = 0;
|
811 |
+
|
812 |
+
char match_j[200];
|
813 |
+
for (j = 0; j < bn2; ++j) {
|
814 |
+
match_j[j] = 0;
|
815 |
+
}
|
816 |
+
|
817 |
+
/* ML */
|
818 |
+
if (DEBUG>1)
|
819 |
+
printf("\n");
|
820 |
+
|
821 |
+
|
822 |
+
/* Find skip and error */
|
823 |
+
/*---------------------*/
|
824 |
+
if(wn2==0){
|
825 |
+
Status = 2;
|
826 |
+
individual_result(0,0,0,0,0,0);
|
827 |
+
return;
|
828 |
+
}
|
829 |
+
|
830 |
+
if(wn1 != wn2){
|
831 |
+
//if (DEBUG>1)
|
832 |
+
//Error("Length unmatch (%d|%d)\n",wn1,wn2);
|
833 |
+
fix_quote();
|
834 |
+
if(wn1 != wn2){
|
835 |
+
Error("Length unmatch (%d|%d)\n",wn1,wn2);
|
836 |
+
individual_result(0,0,0,0,0,0);
|
837 |
+
return;
|
838 |
+
}
|
839 |
+
}
|
840 |
+
|
841 |
+
for(i=0;i<wn1;i++){
|
842 |
+
if(word_comp(terminal1[i].word,terminal2[i].word)==0){
|
843 |
+
Error("Words unmatch (%s|%s)\n",terminal1[i].word,
|
844 |
+
terminal2[i].word);
|
845 |
+
individual_result(0,0,0,0,0,0);
|
846 |
+
return;
|
847 |
+
}
|
848 |
+
}
|
849 |
+
|
850 |
+
/* massage the data */
|
851 |
+
/*------------------*/
|
852 |
+
massage_data();
|
853 |
+
|
854 |
+
/* matching brackets */
|
855 |
+
/*-------------------*/
|
856 |
+
match = 0;
|
857 |
+
for(i=0;i<bn1;i++){
|
858 |
+
for(j=0;j<bn2;j++){
|
859 |
+
|
860 |
+
if (DEBUG>1)
|
861 |
+
printf("1.res=%d, 2.res=%d, 1.start=%d, 2.start=%d, 1.end=%d, 2.end=%d\n",bracket1[i].result,bracket2[j].result,bracket1[i].start,bracket2[j].start,bracket1[i].end,bracket2[j].end);
|
862 |
+
|
863 |
+
// does bracket match?
|
864 |
+
if(bracket1[i].result != 5 &&
|
865 |
+
bracket2[j].result == 0 &&
|
866 |
+
bracket1[i].start == bracket2[j].start && bracket1[i].end == bracket2[j].end) {
|
867 |
+
|
868 |
+
// (1) do we not care about the label or (2) does the label match?
|
869 |
+
if (F_label==0 || label_comp(bracket1[i].label,bracket2[j].label)==1) {
|
870 |
+
bracket1[i].result = bracket2[j].result = 1;
|
871 |
+
match++;
|
872 |
+
match_found = 1;
|
873 |
+
break;
|
874 |
+
} else {
|
875 |
+
if (DEBUG>1) {
|
876 |
+
printf(" LABEL[%d-%d]: ",bracket1[i].start,bracket1[i].end-1);
|
877 |
+
l = bracket1[i].buf_end-bracket1[i].buf_start;
|
878 |
+
strncpy(my_buf,buf1+bracket1[i].buf_start,l);
|
879 |
+
my_buf[l] = '\0';
|
880 |
+
printf("%s\n",my_buf);
|
881 |
+
}
|
882 |
+
match_found = 1;
|
883 |
+
match_j[j] = 1;
|
884 |
+
}
|
885 |
+
}
|
886 |
+
}
|
887 |
+
|
888 |
+
if (!match_found && bracket1[i].result != 5 && DEBUG>1) {
|
889 |
+
/* ### ML 09/28/03: gold bracket with no corresponding test bracket */
|
890 |
+
printf(" BRACKET[%d-%d]: ",bracket1[i].start,bracket1[i].end-1);
|
891 |
+
l = bracket1[i].buf_end-bracket1[i].buf_start;
|
892 |
+
strncpy(my_buf,buf1+bracket1[i].buf_start,l);
|
893 |
+
my_buf[l] = '\0';
|
894 |
+
printf("%s\n",my_buf);
|
895 |
+
}
|
896 |
+
match_found = 0;
|
897 |
+
}
|
898 |
+
|
899 |
+
for(j=0;j<bn2;j++){
|
900 |
+
if (bracket2[j].result==0 && !match_j[j] && DEBUG>1) {
|
901 |
+
/* test bracket with no corresponding gold bracket */
|
902 |
+
printf(" EXTRA[%d-%d]: ",bracket2[j].start,bracket2[j].end-1);
|
903 |
+
l = bracket2[j].buf_end-bracket2[j].buf_start;
|
904 |
+
strncpy(my_buf,buf+bracket2[j].buf_start,l);
|
905 |
+
my_buf[l] = '\0';
|
906 |
+
printf("%s\n",my_buf);
|
907 |
+
}
|
908 |
+
}
|
909 |
+
|
910 |
+
/* crossing */
|
911 |
+
/*----------*/
|
912 |
+
crossing = 0;
|
913 |
+
|
914 |
+
/* crossing is counted based on the brackets */
|
915 |
+
/* in test rather than gold file (by Mike) */
|
916 |
+
for(j=0;j<bn2;j++){
|
917 |
+
for(i=0;i<bn1;i++){
|
918 |
+
if(bracket1[i].result != 5 &&
|
919 |
+
bracket2[j].result != 5 &&
|
920 |
+
((bracket1[i].start < bracket2[j].start &&
|
921 |
+
bracket1[i].end > bracket2[j].start &&
|
922 |
+
bracket1[i].end < bracket2[j].end) ||
|
923 |
+
(bracket1[i].start > bracket2[j].start &&
|
924 |
+
bracket1[i].start < bracket2[j].end &&
|
925 |
+
bracket1[i].end > bracket2[j].end))){
|
926 |
+
|
927 |
+
/* ### ML 09/01/03: get details on cross-brackettings */
|
928 |
+
if (i != last_i) {
|
929 |
+
if (DEBUG>1) {
|
930 |
+
printf(" CROSSING[%d-%d]: ",bracket1[i].start,bracket1[i].end-1);
|
931 |
+
l = bracket1[i].buf_end-bracket1[i].buf_start;
|
932 |
+
strncpy(my_buf,buf1+bracket1[i].buf_start,l);
|
933 |
+
my_buf[l] = '\0';
|
934 |
+
printf("%s\n",my_buf);
|
935 |
+
|
936 |
+
/* ML
|
937 |
+
printf("\n CROSSING at bracket %d:\n",i-1);
|
938 |
+
printf(" GOLD (tokens %d-%d): ",bracket1[i].start,bracket1[i].end-1);
|
939 |
+
l = bracket1[i].buf_end-bracket1[i].buf_start;
|
940 |
+
strncpy(my_buf,buf1+bracket1[i].buf_start,l);
|
941 |
+
my_buf[l] = '\0';
|
942 |
+
printf("%s\n",my_buf);
|
943 |
+
*/
|
944 |
+
}
|
945 |
+
last_i = i;
|
946 |
+
}
|
947 |
+
|
948 |
+
/* ML
|
949 |
+
printf(" TEST (tokens %d-%d): ",bracket2[j].start,bracket2[j].end-1);
|
950 |
+
l = bracket2[j].buf_end-bracket2[j].buf_start;
|
951 |
+
strncpy(my_buf,buf+bracket2[j].buf_start,l);
|
952 |
+
my_buf[l] = '\0';
|
953 |
+
printf("%s\n",my_buf);
|
954 |
+
*/
|
955 |
+
|
956 |
+
crossing++;
|
957 |
+
break;
|
958 |
+
}
|
959 |
+
}
|
960 |
+
}
|
961 |
+
|
962 |
+
/* Tagging accuracy */
|
963 |
+
/*------------------*/
|
964 |
+
correct_tag=0;
|
965 |
+
for(i=0;i<wn1;i++){
|
966 |
+
if(label_comp(terminal1[i].label,terminal2[i].label)==1){
|
967 |
+
terminal1[i].result = terminal2[i].result = 1;
|
968 |
+
correct_tag++;
|
969 |
+
} else {
|
970 |
+
terminal1[i].result = terminal2[i].result = 0;
|
971 |
+
}
|
972 |
+
}
|
973 |
+
|
974 |
+
individual_result(wn1,r_bn1,r_bn2,match,crossing,correct_tag);
|
975 |
+
}
|
976 |
+
|
977 |
+
void
|
978 |
+
fix_quote()
|
979 |
+
{
|
980 |
+
int i,j,k;
|
981 |
+
if (DEBUG>1) {
|
982 |
+
for(i=0;i<MAX_QUOTE_TERM;i++){
|
983 |
+
if (quotterm1[i].index!=-1)
|
984 |
+
printf("%d: %s - %s\n",quotterm1[i].index,
|
985 |
+
quotterm1[i].term.label,
|
986 |
+
quotterm1[i].term.word);
|
987 |
+
if (quotterm2[i].index!=-1)
|
988 |
+
printf("%d: %s - %s\n",quotterm2[i].index,
|
989 |
+
quotterm2[i].term.label,
|
990 |
+
quotterm2[i].term.word);
|
991 |
+
}
|
992 |
+
}
|
993 |
+
for(i=0;i<MAX_QUOTE_TERM;i++) {
|
994 |
+
int ind = quotterm2[i].index;
|
995 |
+
if (ind!=-1) {
|
996 |
+
for(j=0;j<MAX_QUOTE_TERM;j++){
|
997 |
+
if (quotterm1[j].index==ind &&
|
998 |
+
strcmp(quotterm1[j].term.label,
|
999 |
+
quotterm2[i].term.label)!=0) {
|
1000 |
+
if (is_deletelabel(quotterm1[j].term.label) && !is_deletelabel(quotterm2[i].term.label)) {
|
1001 |
+
reinsert_term("term1[j],terminal1,bracket1,&wn1);
|
1002 |
+
for (k=j;k<MAX_QUOTE_TERM;k++)
|
1003 |
+
if (quotterm1[k].index!=-1)
|
1004 |
+
quotterm1[k].index++;
|
1005 |
+
} else if (is_deletelabel(quotterm2[i].term.label) && !is_deletelabel(quotterm1[j].term.label)) {
|
1006 |
+
reinsert_term("term2[i],terminal2,bracket2,&wn2);
|
1007 |
+
for (k=i;k<MAX_QUOTE_TERM;k++)
|
1008 |
+
if (quotterm2[k].index!=-1)
|
1009 |
+
quotterm2[k].index++;
|
1010 |
+
}
|
1011 |
+
}
|
1012 |
+
}
|
1013 |
+
} else break;
|
1014 |
+
}
|
1015 |
+
}
|
1016 |
+
|
1017 |
+
void
|
1018 |
+
reinsert_term(quot,terminal,bracket,wn)
|
1019 |
+
s_term_ind* quot;
|
1020 |
+
s_terminal terminal[];
|
1021 |
+
s_bracket bracket[];
|
1022 |
+
int* wn;
|
1023 |
+
{
|
1024 |
+
int ind = quot->index;
|
1025 |
+
int bra = quot->bracket;
|
1026 |
+
s_terminal* term = "->term;
|
1027 |
+
int k;
|
1028 |
+
memmove(&terminal[ind+1],
|
1029 |
+
&terminal[ind],
|
1030 |
+
sizeof(s_terminal)*(MAX_WORD_IN_SENT-ind-1));
|
1031 |
+
strcpy(terminal[ind].label,term->label);
|
1032 |
+
strcpy(terminal[ind].word,term->word);
|
1033 |
+
(*wn)++;
|
1034 |
+
if (DEBUG>1)
|
1035 |
+
printf("bra=%d, ind=%d\n",bra,ind);
|
1036 |
+
for(k=0;k<MAX_BRACKET_IN_SENT;k++) {
|
1037 |
+
if (bracket[k].start==-1)
|
1038 |
+
break;
|
1039 |
+
if (DEBUG>1)
|
1040 |
+
printf("bracket[%d]={%d,%d}\n",k,bracket[k].start,bracket[k].end);
|
1041 |
+
if (k>=bra) {
|
1042 |
+
bracket[k].start++;
|
1043 |
+
bracket[k].end++;
|
1044 |
+
}
|
1045 |
+
//if (bracket[k].start<=ind && bracket[k].end>=ind)
|
1046 |
+
//bracket[k].end++;
|
1047 |
+
}
|
1048 |
+
if (DEBUG>1)
|
1049 |
+
printf("endslen=%d\n",quot->endslen);
|
1050 |
+
for(k=0;k<quot->endslen;k++) {
|
1051 |
+
//printf("ends[%d]=%d",k,quot->ends[k]);
|
1052 |
+
bracket[quot->ends[k]].end++;
|
1053 |
+
}
|
1054 |
+
//free(quot->ends);
|
1055 |
+
}
|
1056 |
+
/*
|
1057 |
+
void
|
1058 |
+
adjust_end(ind,bra)
|
1059 |
+
int ind;
|
1060 |
+
int bra;
|
1061 |
+
{
|
1062 |
+
for(k=0;k<MAX_BRACKET_IN_SENT;k++) {
|
1063 |
+
if (bracket[k].start==-1)
|
1064 |
+
break;
|
1065 |
+
printf("bracket[%d]={%d,%d}\n",k,bracket[k].start,bracket[k].end);
|
1066 |
+
if (k>=bra)
|
1067 |
+
bracket[k].end++;
|
1068 |
+
}
|
1069 |
+
}
|
1070 |
+
*/
|
1071 |
+
void
|
1072 |
+
massage_data()
|
1073 |
+
{
|
1074 |
+
int i, j;
|
1075 |
+
|
1076 |
+
/* for GOLD */
|
1077 |
+
/*----------*/
|
1078 |
+
for(i=0;i<bn1;i++){
|
1079 |
+
|
1080 |
+
bracket1[i].result = 0;
|
1081 |
+
|
1082 |
+
/* Zero element */
|
1083 |
+
if(bracket1[i].start == bracket1[i].end){
|
1084 |
+
bracket1[i].result = 5;
|
1085 |
+
continue;
|
1086 |
+
}
|
1087 |
+
|
1088 |
+
/* Modify label */
|
1089 |
+
modify_label(bracket1[i].label);
|
1090 |
+
|
1091 |
+
/* Delete label */
|
1092 |
+
for(j=0;j<Delete_label_n;j++){
|
1093 |
+
if(label_comp(bracket1[i].label,Delete_label[j])==1){
|
1094 |
+
bracket1[i].result = 5;
|
1095 |
+
}
|
1096 |
+
}
|
1097 |
+
}
|
1098 |
+
|
1099 |
+
/* for TEST */
|
1100 |
+
/*----------*/
|
1101 |
+
for(i=0;i<bn2;i++){
|
1102 |
+
|
1103 |
+
bracket2[i].result = 0;
|
1104 |
+
|
1105 |
+
/* Zero element */
|
1106 |
+
if(bracket2[i].start == bracket2[i].end){
|
1107 |
+
bracket2[i].result = 5;
|
1108 |
+
continue;
|
1109 |
+
}
|
1110 |
+
|
1111 |
+
/* Modify label */
|
1112 |
+
modify_label(bracket2[i].label);
|
1113 |
+
|
1114 |
+
/* Delete label */
|
1115 |
+
for(j=0;j<Delete_label_n;j++){
|
1116 |
+
if(label_comp(bracket2[i].label,Delete_label[j])==1){
|
1117 |
+
bracket2[i].result = 5;
|
1118 |
+
}
|
1119 |
+
}
|
1120 |
+
}
|
1121 |
+
|
1122 |
+
|
1123 |
+
/* count up real number of brackets (exclude deleted ones) */
|
1124 |
+
/*---------------------------------------------------------*/
|
1125 |
+
r_bn1 = r_bn2 = 0;
|
1126 |
+
|
1127 |
+
for(i=0;i<bn1;i++){
|
1128 |
+
if(bracket1[i].result != 5){
|
1129 |
+
r_bn1++;
|
1130 |
+
}
|
1131 |
+
}
|
1132 |
+
|
1133 |
+
for(i=0;i<bn2;i++){
|
1134 |
+
if(bracket2[i].result != 5){
|
1135 |
+
r_bn2++;
|
1136 |
+
}
|
1137 |
+
}
|
1138 |
+
}
|
1139 |
+
|
1140 |
+
|
1141 |
+
/*------------------------*/
|
1142 |
+
/* trim the tail of label */
|
1143 |
+
/*------------------------*/
|
1144 |
+
void
modify_label(label)
char *label;
{
    /* Cut the label, in place, at its first '-' or '='. */
    char *cut = strpbrk(label,"-=");

    if(cut != NULL){
        *cut = '\0';
    }
}
|
1157 |
+
|
1158 |
+
|
1159 |
+
/*-----------------------------------------------*/
|
1160 |
+
/* add individual statistics to TOTAL statistics */
|
1161 |
+
/*-----------------------------------------------*/
|
1162 |
+
void
|
1163 |
+
individual_result(wn1,bn1,bn2,match,crossing,correct_tag)
|
1164 |
+
int wn1,bn1,bn2,match,crossing,correct_tag;
|
1165 |
+
{
|
1166 |
+
|
1167 |
+
/* Statistics for ALL */
|
1168 |
+
/*--------------------*/
|
1169 |
+
TOTAL_sent++;
|
1170 |
+
if(Status==1){
|
1171 |
+
TOTAL_error_sent++;
|
1172 |
+
}else if(Status==2){
|
1173 |
+
TOTAL_skip_sent++;
|
1174 |
+
}else{
|
1175 |
+
TOTAL_bn1 += bn1;
|
1176 |
+
TOTAL_bn2 += bn2;
|
1177 |
+
TOTAL_match += match;
|
1178 |
+
if(bn1==bn2 && bn2==match){
|
1179 |
+
TOTAL_comp_sent++;
|
1180 |
+
}
|
1181 |
+
TOTAL_word += wn1;
|
1182 |
+
TOTAL_crossing += crossing;
|
1183 |
+
if(crossing==0){
|
1184 |
+
TOTAL_no_crossing++;
|
1185 |
+
}
|
1186 |
+
if(crossing <= 2){
|
1187 |
+
TOTAL_2L_crossing++;
|
1188 |
+
}
|
1189 |
+
TOTAL_correct_tag += correct_tag;
|
1190 |
+
}
|
1191 |
+
|
1192 |
+
|
1193 |
+
/* Statistics for sent length <= TOT_cut_len */
|
1194 |
+
/*-------------------------------------------*/
|
1195 |
+
if(r_wn1<=TOT_cut_len){
|
1196 |
+
TOT40_sent++;
|
1197 |
+
if(Status==1){
|
1198 |
+
TOT40_error_sent++;
|
1199 |
+
}else if(Status==2){
|
1200 |
+
TOT40_skip_sent++;
|
1201 |
+
}else{
|
1202 |
+
TOT40_bn1 += bn1;
|
1203 |
+
TOT40_bn2 += bn2;
|
1204 |
+
TOT40_match += match;
|
1205 |
+
if(bn1==bn2 && bn2==match){
|
1206 |
+
TOT40_comp_sent++;
|
1207 |
+
}
|
1208 |
+
TOT40_word += wn1;
|
1209 |
+
TOT40_crossing += crossing;
|
1210 |
+
if(crossing==0){
|
1211 |
+
TOT40_no_crossing++;
|
1212 |
+
}
|
1213 |
+
if(crossing <= 2){
|
1214 |
+
TOT40_2L_crossing++;
|
1215 |
+
}
|
1216 |
+
TOT40_correct_tag += correct_tag;
|
1217 |
+
}
|
1218 |
+
}
|
1219 |
+
|
1220 |
+
/* Print individual result */
|
1221 |
+
/*-------------------------*/
|
1222 |
+
printf("%4d %3d %d ",Line,r_wn1,Status);
|
1223 |
+
printf("%6.2f %6.2f %3d %3d %3d %3d",
|
1224 |
+
(r_bn1==0?0.0:100.0*match/r_bn1),
|
1225 |
+
(r_bn2==0?0.0:100.0*match/r_bn2),
|
1226 |
+
match, r_bn1, r_bn2, crossing);
|
1227 |
+
|
1228 |
+
printf(" %4d %4d %6.2f\n",wn1,correct_tag,
|
1229 |
+
(wn1==0?0.0:100.0*correct_tag/wn1));
|
1230 |
+
}
|
1231 |
+
|
1232 |
+
|
1233 |
+
/*------------------------*/
|
1234 |
+
/* print total statistics */
|
1235 |
+
/*------------------------*/
|
1236 |
+
void
|
1237 |
+
print_total()
|
1238 |
+
{
|
1239 |
+
int sentn;
|
1240 |
+
double r,p,f;
|
1241 |
+
|
1242 |
+
printf("============================================================================\n");
|
1243 |
+
|
1244 |
+
if(TOTAL_bn1>0 && TOTAL_bn2>0){
|
1245 |
+
printf(" %6.2f %6.2f %6d %5d %5d %5d",
|
1246 |
+
(TOTAL_bn1>0?100.0*TOTAL_match/TOTAL_bn1:0.0),
|
1247 |
+
(TOTAL_bn2>0?100.0*TOTAL_match/TOTAL_bn2:0.0),
|
1248 |
+
TOTAL_match,
|
1249 |
+
TOTAL_bn1,
|
1250 |
+
TOTAL_bn2,
|
1251 |
+
TOTAL_crossing);
|
1252 |
+
}
|
1253 |
+
|
1254 |
+
printf(" %5d %5d %6.2f",
|
1255 |
+
TOTAL_word,
|
1256 |
+
TOTAL_correct_tag,
|
1257 |
+
(TOTAL_word>0?100.0*TOTAL_correct_tag/TOTAL_word:0.0));
|
1258 |
+
|
1259 |
+
printf("\n");
|
1260 |
+
printf("=== Summary ===\n");
|
1261 |
+
|
1262 |
+
sentn = TOTAL_sent - TOTAL_error_sent - TOTAL_skip_sent;
|
1263 |
+
|
1264 |
+
printf("\n-- All --\n");
|
1265 |
+
printf("Number of sentence = %6d\n",TOTAL_sent);
|
1266 |
+
printf("Number of Error sentence = %6d\n",TOTAL_error_sent);
|
1267 |
+
printf("Number of Skip sentence = %6d\n",TOTAL_skip_sent);
|
1268 |
+
printf("Number of Valid sentence = %6d\n",sentn);
|
1269 |
+
|
1270 |
+
r = TOTAL_bn1>0 ? 100.0*TOTAL_match/TOTAL_bn1 : 0.0;
|
1271 |
+
printf("Bracketing Recall = %6.2f\n",r);
|
1272 |
+
|
1273 |
+
p = TOTAL_bn2>0 ? 100.0*TOTAL_match/TOTAL_bn2 : 0.0;
|
1274 |
+
printf("Bracketing Precision = %6.2f\n",p);
|
1275 |
+
|
1276 |
+
f = 2*p*r/(p+r);
|
1277 |
+
printf("Bracketing FMeasure = %6.2f\n",f);
|
1278 |
+
|
1279 |
+
printf("Complete match = %6.2f\n",
|
1280 |
+
(sentn>0?100.0*TOTAL_comp_sent/sentn:0.0));
|
1281 |
+
printf("Average crossing = %6.2f\n",
|
1282 |
+
(sentn>0?1.0*TOTAL_crossing/sentn:0.0));
|
1283 |
+
printf("No crossing = %6.2f\n",
|
1284 |
+
(sentn>0?100.0*TOTAL_no_crossing/sentn:0.0));
|
1285 |
+
printf("2 or less crossing = %6.2f\n",
|
1286 |
+
(sentn>0?100.0*TOTAL_2L_crossing/sentn:0.0));
|
1287 |
+
printf("Tagging accuracy = %6.2f\n",
|
1288 |
+
(TOTAL_word>0?100.0*TOTAL_correct_tag/TOTAL_word:0.0));
|
1289 |
+
|
1290 |
+
sentn = TOT40_sent - TOT40_error_sent - TOT40_skip_sent;
|
1291 |
+
|
1292 |
+
printf("\n-- len<=%d --\n",TOT_cut_len);
|
1293 |
+
printf("Number of sentence = %6d\n",TOT40_sent);
|
1294 |
+
printf("Number of Error sentence = %6d\n",TOT40_error_sent);
|
1295 |
+
printf("Number of Skip sentence = %6d\n",TOT40_skip_sent);
|
1296 |
+
printf("Number of Valid sentence = %6d\n",sentn);
|
1297 |
+
|
1298 |
+
|
1299 |
+
r = TOT40_bn1>0 ? 100.0*TOT40_match/TOT40_bn1 : 0.0;
|
1300 |
+
printf("Bracketing Recall = %6.2f\n",r);
|
1301 |
+
|
1302 |
+
p = TOT40_bn2>0 ? 100.0*TOT40_match/TOT40_bn2 : 0.0;
|
1303 |
+
printf("Bracketing Precision = %6.2f\n",p);
|
1304 |
+
|
1305 |
+
f = 2*p*r/(p+r);
|
1306 |
+
printf("Bracketing FMeasure = %6.2f\n",f);
|
1307 |
+
|
1308 |
+
printf("Complete match = %6.2f\n",
|
1309 |
+
(sentn>0?100.0*TOT40_comp_sent/sentn:0.0));
|
1310 |
+
printf("Average crossing = %6.2f\n",
|
1311 |
+
(sentn>0?1.0*TOT40_crossing/sentn:0.0));
|
1312 |
+
printf("No crossing = %6.2f\n",
|
1313 |
+
(sentn>0?100.0*TOT40_no_crossing/sentn:0.0));
|
1314 |
+
printf("2 or less crossing = %6.2f\n",
|
1315 |
+
(sentn>0?100.0*TOT40_2L_crossing/sentn:0.0));
|
1316 |
+
printf("Tagging accuracy = %6.2f\n",
|
1317 |
+
(TOT40_word>0?100.0*TOT40_correct_tag/TOT40_word:0.0));
|
1318 |
+
|
1319 |
+
}
|
1320 |
+
|
1321 |
+
|
1322 |
+
/*--------------------------------*/
|
1323 |
+
/* display individual information */
|
1324 |
+
/*--------------------------------*/
|
1325 |
+
void
|
1326 |
+
dsp_info()
|
1327 |
+
{
|
1328 |
+
int i, n;
|
1329 |
+
|
1330 |
+
printf("-<1>---(wn1=%3d, bn1=%3d)- ",wn1,bn1);
|
1331 |
+
printf("-<2>---(wn2=%3d, bn2=%3d)-\n",wn2,bn2);
|
1332 |
+
|
1333 |
+
n = (wn1>wn2?wn1:wn2);
|
1334 |
+
|
1335 |
+
for(i=0;i<n;i++){
|
1336 |
+
if(terminal1[i].word[0]!='\0'){
|
1337 |
+
printf("%3d : %d : %-6s %-16s ",i,terminal1[i].result,
|
1338 |
+
terminal1[i].label,terminal1[i].word);
|
1339 |
+
}else{
|
1340 |
+
printf(" ");
|
1341 |
+
}
|
1342 |
+
|
1343 |
+
if(terminal2[i].word[0]!='\0'){
|
1344 |
+
printf("%3d : %d : %-6s %-16s\n",i,terminal2[i].result,
|
1345 |
+
terminal2[i].label,terminal2[i].word);
|
1346 |
+
}else{
|
1347 |
+
printf("\n");
|
1348 |
+
}
|
1349 |
+
}
|
1350 |
+
printf("\n");
|
1351 |
+
|
1352 |
+
n = (bn1>bn2?bn1:bn2);
|
1353 |
+
|
1354 |
+
for(i=0;i<n;i++){
|
1355 |
+
if(bracket1[i].start != -1){
|
1356 |
+
printf("%3d : %d : %3d %3d %-6s ",i,bracket1[i].result,
|
1357 |
+
bracket1[i].start,bracket1[i].end,
|
1358 |
+
bracket1[i].label);
|
1359 |
+
} else {
|
1360 |
+
printf(" ");
|
1361 |
+
}
|
1362 |
+
|
1363 |
+
if(bracket2[i].start != -1){
|
1364 |
+
printf("%3d : %d : %3d %3d %-6s\n",i,bracket2[i].result,
|
1365 |
+
bracket2[i].start,bracket2[i].end,
|
1366 |
+
bracket2[i].label);
|
1367 |
+
} else {
|
1368 |
+
printf("\n");
|
1369 |
+
}
|
1370 |
+
}
|
1371 |
+
printf("\n");
|
1372 |
+
|
1373 |
+
printf("========\n");
|
1374 |
+
|
1375 |
+
}
|
1376 |
+
|
1377 |
+
|
1378 |
+
/*-----------------*/
|
1379 |
+
/* some predicates */
|
1380 |
+
/*-----------------*/
|
1381 |
+
int
is_terminator(c)
char c;
{
    /*
     * Return 1 when `c` ends a label or word: white space, '(' or ')'.
     * Return 0 otherwise.
     *
     * isspace() is only defined for values representable as unsigned
     * char (or EOF).  Where plain char is signed, bytes above 0x7f are
     * negative, so the cast is required for non-ASCII input.
     */
    if(isspace((unsigned char)c) || c=='(' || c==')'){
        return(1);
    }else{
        return(0);
    }
}
|
1391 |
+
|
1392 |
+
int
|
1393 |
+
is_deletelabel(s)
|
1394 |
+
char *s;
|
1395 |
+
{
|
1396 |
+
int i;
|
1397 |
+
|
1398 |
+
for(i=0;i<Delete_label_n;i++){
|
1399 |
+
if(strcmp(s,Delete_label[i])==0){
|
1400 |
+
return(1);
|
1401 |
+
}
|
1402 |
+
}
|
1403 |
+
|
1404 |
+
return(0);
|
1405 |
+
}
|
1406 |
+
|
1407 |
+
int
|
1408 |
+
is_deletelabel_for_length(s)
|
1409 |
+
char *s;
|
1410 |
+
{
|
1411 |
+
int i;
|
1412 |
+
|
1413 |
+
for(i=0;i<Delete_label_for_length_n;i++){
|
1414 |
+
if(strcmp(s,Delete_label_for_length[i])==0){
|
1415 |
+
return(1);
|
1416 |
+
}
|
1417 |
+
}
|
1418 |
+
|
1419 |
+
return(0);
|
1420 |
+
}
|
1421 |
+
|
1422 |
+
int
|
1423 |
+
is_quote_term(s,w)
|
1424 |
+
char *s;
|
1425 |
+
char *w;
|
1426 |
+
{
|
1427 |
+
int i;
|
1428 |
+
|
1429 |
+
for(i=0;i<Quote_term_n;i++){
|
1430 |
+
if(strcmp(s,Quote_term[i])==0){
|
1431 |
+
if (strcmp(w,"'")==0 || strcmp(w,"\"")==0 || strcmp(w,"/")==0)
|
1432 |
+
return(1);
|
1433 |
+
}
|
1434 |
+
}
|
1435 |
+
|
1436 |
+
return(0);
|
1437 |
+
}
|
1438 |
+
|
1439 |
+
|
1440 |
+
/*---------------*/
|
1441 |
+
/* compare words */
|
1442 |
+
/*---------------*/
|
1443 |
+
int
|
1444 |
+
word_comp(s1,s2)
|
1445 |
+
char *s1,*s2;
|
1446 |
+
{
|
1447 |
+
int i;
|
1448 |
+
|
1449 |
+
if(strcmp(s1,s2)==0){
|
1450 |
+
return(1);
|
1451 |
+
}
|
1452 |
+
|
1453 |
+
for(i=0;i<EQ_word_n;i++){
|
1454 |
+
if((strcmp(s1,EQ_word[i].s1)==0 &&
|
1455 |
+
strcmp(s2,EQ_word[i].s2)==0) ||
|
1456 |
+
(strcmp(s1,EQ_word[i].s2)==0 &&
|
1457 |
+
strcmp(s2,EQ_word[i].s1)==0)){
|
1458 |
+
return(1);
|
1459 |
+
}
|
1460 |
+
}
|
1461 |
+
|
1462 |
+
return(0);
|
1463 |
+
}
|
1464 |
+
|
1465 |
+
/*----------------*/
|
1466 |
+
/* compare labels */
|
1467 |
+
/*----------------*/
|
1468 |
+
int
|
1469 |
+
label_comp(s1,s2)
|
1470 |
+
char *s1,*s2;
|
1471 |
+
{
|
1472 |
+
int i;
|
1473 |
+
|
1474 |
+
if(strcmp(s1,s2)==0){
|
1475 |
+
return(1);
|
1476 |
+
}
|
1477 |
+
|
1478 |
+
for(i=0;i<EQ_label_n;i++){
|
1479 |
+
if((strcmp(s1,EQ_label[i].s1)==0 &&
|
1480 |
+
strcmp(s2,EQ_label[i].s2)==0) ||
|
1481 |
+
(strcmp(s1,EQ_label[i].s2)==0 &&
|
1482 |
+
strcmp(s2,EQ_label[i].s1)==0)){
|
1483 |
+
return(1);
|
1484 |
+
}
|
1485 |
+
}
|
1486 |
+
|
1487 |
+
return(0);
|
1488 |
+
}
|
1489 |
+
|
1490 |
+
|
1491 |
+
/*--------*/
|
1492 |
+
/* errors */
|
1493 |
+
/*--------*/
|
1494 |
+
void
|
1495 |
+
Error(s,arg1,arg2,arg3)
|
1496 |
+
char *s, *arg1, *arg2, *arg3;
|
1497 |
+
{
|
1498 |
+
Status = 1;
|
1499 |
+
fprintf(stderr,"%d : ",Line);
|
1500 |
+
fprintf(stderr,s,arg1,arg2,arg3);
|
1501 |
+
if(Error_count++>Max_error){
|
1502 |
+
exit(1);
|
1503 |
+
}
|
1504 |
+
}
|
1505 |
+
|
1506 |
+
|
1507 |
+
/*---------------------*/
|
1508 |
+
/* fatal error to exit */
|
1509 |
+
/*---------------------*/
|
1510 |
+
void
Fatal(s,arg1,arg2,arg3)
char *s;
char *arg1;
char *arg2;
char *arg3;
{
    /* Write the message (`s` is the fprintf format for arg1..arg3) to
       stderr and terminate the process with exit status 1. */
    fprintf(stderr,s,arg1,arg2,arg3);
    exit(1);
}
|
1517 |
+
|
1518 |
+
|
1519 |
+
/*-------*/
|
1520 |
+
/* Usage */
|
1521 |
+
/*-------*/
|
1522 |
+
void
Usage()
{
    /* Write the command-line help text to stderr, one table entry per
       line.  None of the lines contains a conversion specifier, so
       fputs() emits exactly what fprintf() would. */
    static const char *help[] = {
        " evalb [-dDh][-c n][-e n][-p param_file] gold-file test-file \n",
        " \n",
        " Evaluate bracketing in test-file against gold-file. \n",
        " Return recall, precision, F-Measure, tag accuracy. \n",
        " \n",
        " <option> \n",
        " -d debug mode \n",
        " -D debug mode plus bracketing info \n",
        " -c n cut-off length forstatistics (def.=40)\n",
        " -e n number of error to kill (default=10) \n",
        " -p param_file parameter file \n",
        " -h help \n"
    };
    int k;
    int count = (int)(sizeof(help)/sizeof(help[0]));

    for(k=0;k<count;k++){
        fputs(help[k],stderr);
    }
}
|
parsing/EVALB/new.prm
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
##------------------------------------------##
|
2 |
+
## Debug mode ##
|
3 |
+
## 0: No debugging ##
|
4 |
+
## 1: print data for individual sentence ##
|
5 |
+
## 2: print detailed bracketing info ##
|
6 |
+
##------------------------------------------##
|
7 |
+
DEBUG 0
|
8 |
+
|
9 |
+
##------------------------------------------##
|
10 |
+
## MAX error ##
|
11 |
+
## Number of error to stop the process. ##
|
12 |
+
## This is useful if there could be ##
|
13 |
+
## tokenization error. ##
|
14 |
+
## The process will stop when this number##
|
15 |
+
## of errors are accumulated. ##
|
16 |
+
##------------------------------------------##
|
17 |
+
MAX_ERROR 10
|
18 |
+
|
19 |
+
##------------------------------------------##
|
20 |
+
## Cut-off length for statistics ##
|
21 |
+
## At the end of evaluation, the ##
|
22 |
+
## statistics for the sentences of length##
|
23 |
+
## less than or equal to this number will##
|
24 |
+
## be shown, on top of the statistics ##
|
25 |
+
## for all the sentences ##
|
26 |
+
##------------------------------------------##
|
27 |
+
CUTOFF_LEN 40
|
28 |
+
|
29 |
+
##------------------------------------------##
|
30 |
+
## unlabeled or labeled bracketing ##
|
31 |
+
## 0: unlabeled bracketing ##
|
32 |
+
## 1: labeled bracketing ##
|
33 |
+
##------------------------------------------##
|
34 |
+
LABELED 1
|
35 |
+
|
36 |
+
##------------------------------------------##
|
37 |
+
## Delete labels ##
|
38 |
+
## list of labels to be ignored. ##
|
39 |
+
## If it is a pre-terminal label, delete ##
|
40 |
+
## the word along with the brackets. ##
|
41 |
+
## If it is a non-terminal label, just ##
|
42 |
+
## delete the brackets (don't delete ##
|
43 |
+
## children). ##
|
44 |
+
##------------------------------------------##
|
45 |
+
DELETE_LABEL TOP
|
46 |
+
DELETE_LABEL S1
|
47 |
+
DELETE_LABEL -NONE-
|
48 |
+
DELETE_LABEL ,
|
49 |
+
DELETE_LABEL :
|
50 |
+
DELETE_LABEL ``
|
51 |
+
DELETE_LABEL ''
|
52 |
+
DELETE_LABEL .
|
53 |
+
DELETE_LABEL ?
|
54 |
+
DELETE_LABEL !
|
55 |
+
|
56 |
+
##------------------------------------------##
|
57 |
+
## Delete labels for length calculation ##
|
58 |
+
## list of labels to be ignored for ##
|
59 |
+
## length calculation purpose ##
|
60 |
+
##------------------------------------------##
|
61 |
+
DELETE_LABEL_FOR_LENGTH -NONE-
|
62 |
+
|
63 |
+
##------------------------------------------##
|
64 |
+
## Labels to be considered for misquote ##
|
65 |
+
## (could be possessive or quote) ##
|
66 |
+
##------------------------------------------##
|
67 |
+
QUOTE_LABEL ``
|
68 |
+
QUOTE_LABEL ''
|
69 |
+
QUOTE_LABEL POS
|
70 |
+
|
71 |
+
##------------------------------------------##
|
72 |
+
## These ones are less common, but ##
|
73 |
+
## are on occasion output by parsers: ##
|
74 |
+
##------------------------------------------##
|
75 |
+
QUOTE_LABEL NN
|
76 |
+
QUOTE_LABEL CD
|
77 |
+
QUOTE_LABEL VBZ
|
78 |
+
QUOTE_LABEL :
|
79 |
+
|
80 |
+
##------------------------------------------##
|
81 |
+
## Equivalent labels, words ##
|
82 |
+
## the pairs are considered equivalent ##
|
83 |
+
## This is non-directional. ##
|
84 |
+
##------------------------------------------##
|
85 |
+
EQ_LABEL ADVP PRT
|
86 |
+
|
87 |
+
# EQ_WORD Example example
|
parsing/EVALB/nk.prm
ADDED
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Based on new.prm (and by extension COLLINS.prm)
|
2 |
+
# The only change from new.prm is increasing MAX_ERROR. The evaluation should be
|
3 |
+
# identical to the standard setup, except that evalb won't give up early for a
|
4 |
+
# parser that has just started training and does not yet produce good results.
|
5 |
+
|
6 |
+
##------------------------------------------##
|
7 |
+
## Debug mode ##
|
8 |
+
## 0: No debugging ##
|
9 |
+
## 1: print data for individual sentence ##
|
10 |
+
## 2: print detailed bracketing info ##
|
11 |
+
##------------------------------------------##
|
12 |
+
DEBUG 0
|
13 |
+
|
14 |
+
##------------------------------------------##
|
15 |
+
## MAX error ##
|
16 |
+
## Number of error to stop the process. ##
|
17 |
+
## This is useful if there could be ##
|
18 |
+
## tokenization error. ##
|
19 |
+
## The process will stop when this number##
|
20 |
+
## of errors are accumulated. ##
|
21 |
+
##------------------------------------------##
|
22 |
+
MAX_ERROR 10000
|
23 |
+
|
24 |
+
##------------------------------------------##
|
25 |
+
## Cut-off length for statistics ##
|
26 |
+
## At the end of evaluation, the ##
|
27 |
+
## statistics for the sentences of length##
|
28 |
+
## less than or equal to this number will##
|
29 |
+
## be shown, on top of the statistics ##
|
30 |
+
## for all the sentences ##
|
31 |
+
##------------------------------------------##
|
32 |
+
CUTOFF_LEN 40
|
33 |
+
|
34 |
+
##------------------------------------------##
|
35 |
+
## unlabeled or labeled bracketing ##
|
36 |
+
## 0: unlabeled bracketing ##
|
37 |
+
## 1: labeled bracketing ##
|
38 |
+
##------------------------------------------##
|
39 |
+
LABELED 1
|
40 |
+
|
41 |
+
##------------------------------------------##
|
42 |
+
## Delete labels ##
|
43 |
+
## list of labels to be ignored. ##
|
44 |
+
## If it is a pre-terminal label, delete ##
|
45 |
+
## the word along with the brackets. ##
|
46 |
+
## If it is a non-terminal label, just ##
|
47 |
+
## delete the brackets (don't delete ##
|
48 |
+
## children). ##
|
49 |
+
##------------------------------------------##
|
50 |
+
DELETE_LABEL TOP
|
51 |
+
DELETE_LABEL S1
|
52 |
+
DELETE_LABEL -NONE-
|
53 |
+
DELETE_LABEL ,
|
54 |
+
DELETE_LABEL :
|
55 |
+
DELETE_LABEL ``
|
56 |
+
DELETE_LABEL ''
|
57 |
+
DELETE_LABEL .
|
58 |
+
DELETE_LABEL ?
|
59 |
+
DELETE_LABEL !
|
60 |
+
|
61 |
+
##------------------------------------------##
|
62 |
+
## Delete labels for length calculation ##
|
63 |
+
## list of labels to be ignored for ##
|
64 |
+
## length calculation purpose ##
|
65 |
+
##------------------------------------------##
|
66 |
+
DELETE_LABEL_FOR_LENGTH -NONE-
|
67 |
+
|
68 |
+
##------------------------------------------##
|
69 |
+
## Labels to be considered for misquote ##
|
70 |
+
## (could be possessive or quote) ##
|
71 |
+
##------------------------------------------##
|
72 |
+
QUOTE_LABEL ``
|
73 |
+
QUOTE_LABEL ''
|
74 |
+
QUOTE_LABEL POS
|
75 |
+
|
76 |
+
##------------------------------------------##
|
77 |
+
## These ones are less common, but ##
|
78 |
+
## are on occasion output by parsers: ##
|
79 |
+
##------------------------------------------##
|
80 |
+
QUOTE_LABEL NN
|
81 |
+
QUOTE_LABEL CD
|
82 |
+
QUOTE_LABEL VBZ
|
83 |
+
QUOTE_LABEL :
|
84 |
+
|
85 |
+
##------------------------------------------##
|
86 |
+
## Equivalent labels, words ##
|
87 |
+
## the pairs are considered equivalent ##
|
88 |
+
## This is non-directional. ##
|
89 |
+
##------------------------------------------##
|
90 |
+
EQ_LABEL ADVP PRT
|
91 |
+
|
92 |
+
# EQ_WORD Example example
|
parsing/EVALB/sample/sample.gld
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
(S (A (P this)) (B (Q is) (A (R a) (T test))))
|
2 |
+
(S (A (P this)) (B (Q is) (A (R a) (T test))))
|
3 |
+
(S (A (P this)) (B (Q is) (A (R a) (T test))))
|
4 |
+
(S (A (P this)) (B (Q is) (A (R a) (T test))))
|
5 |
+
(S (A (P this)) (B (Q is) (A (R a) (T test))))
|
6 |
+
(S (A (P this)) (B (Q is) (A (R a) (T test))))
|
7 |
+
(S (A (P this)) (B (Q is) (A (R a) (T test))))
|
8 |
+
(S (A (P this)) (B (Q is) (A (R a) (T test))))
|
9 |
+
(S (A (P this)) (B (Q is) (A (R a) (T test))))
|
10 |
+
(S (A (P this)) (B (Q is) (A (R a) (T test))))
|
11 |
+
(S (A (P this)) (B (Q is) (A (R a) (T test))))
|
12 |
+
(S (A (P this)) (B (Q is) (A (R a) (T test))))
|
13 |
+
(S (A (P this)) (B (Q is) (A (R a) (T test))))
|
14 |
+
(S (A (P this)) (B (Q is) (A (R a) (T test))))
|
15 |
+
(S (A (P this)) (B (Q is) (A (R a) (T test))))
|
16 |
+
(S (A (P this)) (B (Q is) (A (R a) (T test))))
|
17 |
+
(S (A (P this)) (B (Q is) (A (R a) (T test))))
|
18 |
+
(S (A (P this)) (B (Q is) (A (R a) (T test))))
|
19 |
+
(S (A (P this)) (B (Q is) (A (R a) (T test))))
|
20 |
+
(S (A (P this)) (B (Q is) (A (R a) (T test))))
|
21 |
+
(S (A-SBJ-1 (P this)) (B-WHATEVER (Q is) (A (R a) (T test))))
|
22 |
+
(S (A (P this)) (B (Q is) (A (R a) (T test))) (A (P this)) (B (Q is) (A (R a) (T test))) (A (P this)) (B (Q is) (A (R a) (T test))) (A (P this)) (B (Q is) (A (R a) (T test))) (A (P this)) (B (Q is) (A (R a) (T test))) (A (P this)) (B (Q is) (A (R a) (T test))) (A (P this)) (B (Q is) (A (R a) (T test))) (A (P this)) (B (Q is) (A (R a) (T test))) (A (P this)) (B (Q is) (A (R a) (T test))) (A (P this)) (B (Q is) (A (R a) (T test))) (A (P this)) (B (Q is) (A (R a) (T test))))
|
23 |
+
(S (A (P this)) (B (Q is) (A (R a) (T test))) (-NONE- *))
|
24 |
+
(S (A (P this)) (B (Q is) (A (R a) (T test))) (: *))
|
parsing/EVALB/sample/sample.prm
ADDED
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
##------------------------------------------##
|
2 |
+
## Debug mode ##
|
3 |
+
## print out data for individual sentence ##
|
4 |
+
##------------------------------------------##
|
5 |
+
DEBUG 0
|
6 |
+
|
7 |
+
##------------------------------------------##
|
8 |
+
## MAX error ##
|
9 |
+
## Number of error to stop the process. ##
|
10 |
+
## This is useful if there could be ##
|
11 |
+
## tokenization error. ##
|
12 |
+
## The process will stop when this number##
|
13 |
+
## of errors are accumulated. ##
|
14 |
+
##------------------------------------------##
|
15 |
+
MAX_ERROR 10
|
16 |
+
|
17 |
+
##------------------------------------------##
|
18 |
+
## Cut-off length for statistics ##
|
19 |
+
## At the end of evaluation, the ##
|
20 |
+
## statistics for the sentences of length##
|
21 |
+
## less than or equal to this number will##
|
22 |
+
## be shown, on top of the statistics ##
|
23 |
+
## for all the sentences ##
|
24 |
+
##------------------------------------------##
|
25 |
+
CUTOFF_LEN 40
|
26 |
+
|
27 |
+
##------------------------------------------##
|
28 |
+
## unlabeled or labeled bracketing ##
|
29 |
+
## 0: unlabeled bracketing ##
|
30 |
+
## 1: labeled bracketing ##
|
31 |
+
##------------------------------------------##
|
32 |
+
LABELED 1
|
33 |
+
|
34 |
+
##------------------------------------------##
|
35 |
+
## Delete labels ##
|
36 |
+
## list of labels to be ignored. ##
|
37 |
+
## If it is a pre-terminal label, delete ##
|
38 |
+
## the word along with the brackets. ##
|
39 |
+
## If it is a non-terminal label, just ##
|
40 |
+
## delete the brackets (don't delete ##
|
41 |
+
## children). ##
|
42 |
+
##------------------------------------------##
|
43 |
+
DELETE_LABEL TOP
|
44 |
+
DELETE_LABEL -NONE-
|
45 |
+
DELETE_LABEL ,
|
46 |
+
DELETE_LABEL :
|
47 |
+
DELETE_LABEL ``
|
48 |
+
DELETE_LABEL ''
|
49 |
+
|
50 |
+
##------------------------------------------##
|
51 |
+
## Delete labels for length calculation ##
|
52 |
+
## list of labels to be ignored for ##
|
53 |
+
## length calculation purpose ##
|
54 |
+
##------------------------------------------##
|
55 |
+
DELETE_LABEL_FOR_LENGTH -NONE-
|
56 |
+
|
57 |
+
|
58 |
+
##------------------------------------------##
|
59 |
+
## Equivalent labels, words ##
|
60 |
+
## the pairs are considered equivalent ##
|
61 |
+
## This is non-directional. ##
|
62 |
+
##------------------------------------------##
|
63 |
+
EQ_LABEL T TT
|
64 |
+
|
65 |
+
EQ_WORD This this
|
parsing/EVALB/sample/sample.rsl
ADDED
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Sent. Matched Bracket Cross Correct Tag
|
2 |
+
ID Len. Stat. Recal Prec. Bracket gold test Bracket Words Tags Accracy
|
3 |
+
============================================================================
|
4 |
+
1 4 0 100.00 100.00 4 4 4 0 4 4 100.00
|
5 |
+
2 4 0 75.00 75.00 3 4 4 0 4 4 100.00
|
6 |
+
3 4 0 100.00 100.00 4 4 4 0 4 3 75.00
|
7 |
+
4 4 0 75.00 75.00 3 4 4 0 4 3 75.00
|
8 |
+
5 4 0 75.00 75.00 3 4 4 0 4 4 100.00
|
9 |
+
6 4 0 50.00 66.67 2 4 3 1 4 4 100.00
|
10 |
+
7 4 0 25.00 100.00 1 4 1 0 4 4 100.00
|
11 |
+
8 4 0 0.00 0.00 0 4 0 0 4 4 100.00
|
12 |
+
9 4 0 100.00 80.00 4 4 5 0 4 4 100.00
|
13 |
+
10 4 0 100.00 50.00 4 4 8 0 4 4 100.00
|
14 |
+
11 4 2 0.00 0.00 0 0 0 0 4 0 0.00
|
15 |
+
12 4 1 0.00 0.00 0 0 0 0 4 0 0.00
|
16 |
+
13 4 1 0.00 0.00 0 0 0 0 4 0 0.00
|
17 |
+
14 4 2 0.00 0.00 0 0 0 0 4 0 0.00
|
18 |
+
15 4 0 100.00 100.00 4 4 4 0 4 4 100.00
|
19 |
+
16 4 1 0.00 0.00 0 0 0 0 4 0 0.00
|
20 |
+
17 4 1 0.00 0.00 0 0 0 0 4 0 0.00
|
21 |
+
18 4 0 100.00 100.00 4 4 4 0 4 4 100.00
|
22 |
+
19 4 0 100.00 100.00 4 4 4 0 4 4 100.00
|
23 |
+
20 4 1 0.00 0.00 0 0 0 0 4 0 0.00
|
24 |
+
21 4 0 100.00 100.00 4 4 4 0 4 4 100.00
|
25 |
+
22 44 0 100.00 100.00 34 34 34 0 44 44 100.00
|
26 |
+
23 4 0 100.00 100.00 4 4 4 0 4 4 100.00
|
27 |
+
24 5 0 100.00 100.00 4 4 4 0 4 4 100.00
|
28 |
+
============================================================================
|
29 |
+
87.76 90.53 86 98 95 16 108 106 98.15
|
30 |
+
=== Summary ===
|
31 |
+
|
32 |
+
-- All --
|
33 |
+
Number of sentence = 24
|
34 |
+
Number of Error sentence = 5
|
35 |
+
Number of Skip sentence = 2
|
36 |
+
Number of Valid sentence = 17
|
37 |
+
Bracketing Recall = 87.76
|
38 |
+
Bracketing Precision = 90.53
|
39 |
+
Complete match = 52.94
|
40 |
+
Average crossing = 0.06
|
41 |
+
No crossing = 94.12
|
42 |
+
2 or less crossing = 100.00
|
43 |
+
Tagging accuracy = 98.15
|
44 |
+
|
45 |
+
-- len<=40 --
|
46 |
+
Number of sentence = 23
|
47 |
+
Number of Error sentence = 5
|
48 |
+
Number of Skip sentence = 2
|
49 |
+
Number of Valid sentence = 16
|
50 |
+
Bracketing Recall = 81.25
|
51 |
+
Bracketing Precision = 85.25
|
52 |
+
Complete match = 50.00
|
53 |
+
Average crossing = 0.06
|
54 |
+
No crossing = 93.75
|
55 |
+
2 or less crossing = 100.00
|
56 |
+
Tagging accuracy = 96.88
|
parsing/EVALB/sample/sample.tst
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
(S (A (P this)) (B (Q is) (A (R a) (T test))))
|
2 |
+
(S (A (P this)) (B (Q is) (C (R a) (T test))))
|
3 |
+
(S (A (P this)) (B (Q is) (A (R a) (U test))))
|
4 |
+
(S (C (P this)) (B (Q is) (A (R a) (U test))))
|
5 |
+
(S (A (P this)) (B (Q is) (R a) (A (T test))))
|
6 |
+
(S (A (P this) (Q is)) (A (R a) (T test)))
|
7 |
+
(S (P this) (Q is) (R a) (T test))
|
8 |
+
(P this) (Q is) (R a) (T test)
|
9 |
+
(S (A (P this)) (B (Q is) (A (A (R a) (T test)))))
|
10 |
+
(S (A (P this)) (B (Q is) (A (A (A (A (A (R a) (T test))))))))
|
11 |
+
|
12 |
+
(S (A (P this)) (B (Q was) (A (A (R a) (T test)))))
|
13 |
+
(S (A (P this)) (B (Q is) (U not) (A (A (R a) (T test)))))
|
14 |
+
|
15 |
+
(TOP (S (A (P this)) (B (Q is) (A (R a) (T test)))))
|
16 |
+
(S (A (P this)) (NONE *) (B (Q is) (A (R a) (T test))))
|
17 |
+
(S (A (P this)) (S (NONE abc) (A (NONE *))) (B (Q is) (A (R a) (T test))))
|
18 |
+
(S (A (P this)) (B (Q is) (A (R a) (TT test))))
|
19 |
+
(S (A (P This)) (B (Q is) (A (R a) (T test))))
|
20 |
+
(S (A (P That)) (B (Q is) (A (R a) (T test))))
|
21 |
+
(S (A (P this)) (B (Q is) (A (R a) (T test))))
|
22 |
+
(S (A (P this)) (B (Q is) (A (R a) (T test))) (A (P this)) (B (Q is) (A (R a) (T test))) (A (P this)) (B (Q is) (A (R a) (T test))) (A (P this)) (B (Q is) (A (R a) (T test))) (A (P this)) (B (Q is) (A (R a) (T test))) (A (P this)) (B (Q is) (A (R a) (T test))) (A (P this)) (B (Q is) (A (R a) (T test))) (A (P this)) (B (Q is) (A (R a) (T test))) (A (P this)) (B (Q is) (A (R a) (T test))) (A (P this)) (B (Q is) (A (R a) (T test))) (A (P this)) (B (Q is) (A (R a) (T test))))
|
23 |
+
(S (A (P this)) (B (Q is) (A (R a) (T test))) (-NONE- *))
|
24 |
+
(S (A (P this)) (B (Q is) (A (R a) (T test))) (: *))
|
parsing/EVALB/tgrep_proc.prl
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/local/bin/perl
|
2 |
+
|
3 |
+
while(<>)
|
4 |
+
{
|
5 |
+
if(m/TOP/) #keep only lines containing TOP (this also drops blank lines)
|
6 |
+
{
|
7 |
+
print;
|
8 |
+
}
|
9 |
+
}
|
parsing/EVALB_SPMRL/Makefile
ADDED
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
TESTFILE=dev.Arabic.gold.ptb.-feat
|
2 |
+
JAVA=java
|
3 |
+
|
4 |
+
|
5 |
+
all: evalb
|
6 |
+
|
7 |
+
|
8 |
+
clean:
|
9 |
+
rm -f evalb_spmrl
|
10 |
+
|
11 |
+
install: evalb
|
12 |
+
cp evalb_spmrl /usr/local/bin
|
13 |
+
|
14 |
+
|
15 |
+
|
16 |
+
evalb: evalb.c
|
17 |
+
gcc -Wall -O3 -g -o evalb_spmrl evalb.c
|
18 |
+
|
19 |
+
evalb_linux: evalb.c
|
20 |
+
gcc -Wall -fPIC -O3 -g -o evalb_spmrl evalb.c
|
21 |
+
#to compile on linux
|
22 |
+
|
23 |
+
|
24 |
+
|
25 |
+
# note: in the original makefile, the -funsigned-char option was applied
|
26 |
+
|
27 |
+
home: install_home
|
28 |
+
|
29 |
+
install_home: all
|
30 |
+
cp evalb_spmrl ${PREFIX}/bin
|
31 |
+
|
32 |
+
up:
|
33 |
+
tar zcvf ../evalb_spmrl2013.tar.gz ../evalb_spmrl2013/
|
34 |
+
putW ../evalb_spmrl2013.tar.gz
|
35 |
+
|
36 |
+
|
37 |
+
|
38 |
+
#################################
|
39 |
+
# stuff to debug some treebanks #
|
40 |
+
#################################
|
41 |
+
test_full: all
|
42 |
+
./evalb dev.Arabic.gold.ptb dev.Arabic.gold.ptb
|
43 |
+
|
44 |
+
|
45 |
+
test: all
|
46 |
+
./evalb -p ./new.prm ${TESTFILE} ${TESTFILE}
|
47 |
+
|
48 |
+
debug: all
|
49 |
+
./evalb -D ${TESTFILE} ${TESTFILE}
|
50 |
+
echo "./evalb -D ${TESTFILE} ${TESTFILE}"
|
51 |
+
|
52 |
+
debug_one: all
|
53 |
+
lines 616 < ${TESTFILE} > ${TESTFILE}.616
|
54 |
+
./evalb -D ${TESTFILE}.616 ${TESTFILE}.616
|
55 |
+
echo "./evalb -D ${TESTFILE}.616 ${TESTFILE}.616"
|
56 |
+
|
57 |
+
releaf:
|
58 |
+
./evalb -D dev.Arabic.gold.ptb.-feat.616.bug dev.Arabic.gold.ptb.-feat.616.bug
|
59 |
+
echo "./evalb -D dev.Arabic.gold.ptb.-feat.616.bug dev.Arabic.gold.ptb.-feat.616.bug" > /dev/stderr
|
60 |
+
|
61 |
+
java:
|
62 |
+
${JAVA} -jar ./evalC/evalC.jar ${TESTFILE} ${TESTFILE} /dev/stdout
|
63 |
+
|
64 |
+
|
65 |
+
|
parsing/EVALB_SPMRL/README
ADDED
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
// Djamé: version record added for history's sake.
|
3 |
+
// note to future updater: please add your changelog below
|
4 |
+
|
5 |
+
(copied from http://nlp.cs.nyu.edu/evalb/ )
|
6 |
+
EVALB20080701.tgz (July 1, 2008 version) modified by Don Blaheta (Knox College)
|
7 |
+
EVALB20060307.tgz (March 3, 2006 version; debugged of Jan. 17, 2006 version) modified by David Ellis (Brown University)
|
8 |
+
EVALB20060117.tgz (Jan. 17, 2006 version) modified by David Ellis (Brown University)
|
9 |
+
EVALB20050908.tgz (Sept. 8, 2005 version) modified by David Brooks (Birmingham)
|
10 |
+
EVALB.tgz (original version).
|
11 |
+
Authors
|
12 |
+
|
13 |
+
Satoshi Sekine (New York University) : e-mail: his last name (at) cs.nyu.edu
|
14 |
+
Michael John Collins (University of Pennsylvania)
|
15 |
+
Note: the authors are not responsible for the newer versions. We put these versions even without checking the program. Please be responsible for yourself.
|
16 |
+
|
17 |
+
*************************************************************************
|
18 |
+
|
19 |
+
Modification
|
20 |
+
|
21 |
+
David Brooks (Birmingham): fixed the code so that the program can be compiled by the latest gcc (September 2005). Help was given by Peet Morris and Ramon Ziai through the Corpora Mailing list.
|
22 |
+
David Ellis (Brown University) : fixes a bug in which sentences were incorrectly categorized as "length mismatch" when the parse output had certain mislabeled parts-of-speech.
|
23 |
+
Don Blaheta (KNOX) : fixes a bug on the output of last number of the total information was not TOTAL_crossing, but it was TOTAL_non_crossing.
|
24 |
+
|
25 |
+
|
26 |
+
|
27 |
+
April 2012
|
28 |
+
// Modified by Slav Petrov and Ryan McDonald (Google Inc., for the sancl 2012 shared task)
|
29 |
+
// ===> making it less sensitive to punct POS errors leading to
|
30 |
+
// mismatch of length
|
31 |
+
|
32 |
+
|
33 |
+
August 2013, 10
|
34 |
+
// Modified by Djamé Seddah (Univ. Paris Sorbonne, for the spmrl 2013 shared task)
|
35 |
+
// ===> making it able to cope with Arabic very long lines (byte wise)
|
36 |
+
// ===> now limit is 50000 bytes, was 5000 (tricky bug, if you ask me)
|
37 |
+
// please check the constant macro section if you encounter weird bugs not present in other
|
38 |
+
implementations (check evalC by Federico Sangati for example, http://homepages.inf.ed.ac.uk/fsangati/evalC_25_5_10.zip )
|
39 |
+
|
40 |
+
|
41 |
+
August 2013, 23
|
42 |
+
// Modif from Thomas Müller (IMS Stuttgart)
|
43 |
+
// ===> adding of # in the stop word modify_label function (so that the
|
44 |
+
// lexer will read NPP instead of NPP##feat:...### as in the SPMRL Data set)
|
45 |
+
// Modif from Djamé Seddah
|
46 |
+
// ===> Application of modify_label to all labels (including the POS label
|
47 |
+
// which were left untouched for some reason)
|
48 |
+
// That should btw be an option. (whether to evaluate full labels or not,
|
49 |
+
// only stripping of Non Terminal, POS tag and so on)
|
50 |
+
|
51 |
+
|
52 |
+
August 2013, 27
|
53 |
+
// Modif from Djamé
|
54 |
+
// --> adding of an option to include the non parsed sentences in the
|
55 |
+
// --> evaluation (-X option)
|
56 |
+
// --> adding an option to evaluate only the first N sentences (-K n)
|
57 |
+
// --> adding an option to provide a compact results view (-L) so one can do
|
58 |
+
// --> find ./ -name "*parsed.run?" -exec evalb_spmrl -L GOLD {} \; -print |
|
59 |
+
// --> grep -v '=====' | grep '='
|
60 |
+
|
61 |
+
September 2013, 6
|
62 |
+
// Modif from DJame
|
63 |
+
// fixing the infinite slowness bug (shame on me)
|
64 |
+
// now speed is similar to what it was before
|
65 |
+
|
66 |
+
|
67 |
+
October 2013, 13
|
68 |
+
// Addition from Djame
|
69 |
+
// Adding the spmrl_hebrew.prm if one wants to evaluate hebrew parsing within the
|
70 |
+
// same conditions as the state-of-the-art
|
71 |
+
// namely without counting the additional SYNpos layer which inflates evalb
|
72 |
+
// scores by almost 2 points.
|
73 |
+
// Note: for the spmrl shared task, we used the spmrl.prm file (so with
|
74 |
+
// these labels. It was too late to modify the rules once again when we
|
75 |
+
// realized this)
|
76 |
+
|
parsing/EVALB_SPMRL/README.orig
ADDED
@@ -0,0 +1,230 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#################################################################
|
2 |
+
# #
|
3 |
+
# README file for evalb #
|
4 |
+
# #
|
5 |
+
# Satoshi Sekine (NYU) #
|
6 |
+
# Mike Collins (UPenn) #
|
7 |
+
# #
|
8 |
+
# October.1997 #
|
9 |
+
#################################################################
|
10 |
+
|
11 |
+
Contents of this README:
|
12 |
+
|
13 |
+
[0] COPYRIGHT
|
14 |
+
[1] INTRODUCTION
|
15 |
+
[2] INSTALLATION AND RUN
|
16 |
+
[3] OPTIONS
|
17 |
+
[4] OUTPUT FORMAT FROM THE SCORER
|
18 |
+
[5] HOW TO CREATE A GOLDFILE FROM THE TREEBANK
|
19 |
+
[6] THE PARAMETER FILE
|
20 |
+
[7] MORE DETAILS ABOUT THE SCORING ALGORITHM
|
21 |
+
|
22 |
+
|
23 |
+
[0] COPYRIGHT
|
24 |
+
|
25 |
+
The authors abandon the copyright of this program. Everyone is
|
26 |
+
permitted to copy and distribute the program or a portion of the program
|
27 |
+
with no charge and no restrictions unless it is harmful to someone.
|
28 |
+
|
29 |
+
However, the authors are delightful for the user's kindness of proper
|
30 |
+
usage and letting the authors know bugs or problems.
|
31 |
+
|
32 |
+
This software is provided "AS IS", and the authors make no warranties,
|
33 |
+
express or implied.
|
34 |
+
|
35 |
+
|
36 |
+
[1] INTRODUCTION
|
37 |
+
|
38 |
+
Evaluation of bracketing looks simple, but in fact, there are minor
|
39 |
+
differences from system to system. This is a program to parameterize
|
40 |
+
such minor differences and to give an informative result.
|
41 |
+
|
42 |
+
"evalb" evaluates bracketing accuracy in a test-file against a gold-file.
|
43 |
+
It returns recall, precision, tagging accuracy. It uses an identical
|
44 |
+
algorithm to that used in (Collins ACL97).
|
45 |
+
|
46 |
+
|
47 |
+
[2] Installation and Run
|
48 |
+
|
49 |
+
To compile the scorer, type
|
50 |
+
|
51 |
+
> make
|
52 |
+
|
53 |
+
|
54 |
+
To run the scorer:
|
55 |
+
|
56 |
+
> evalb -p Parameter_file Gold_file Test_file
|
57 |
+
|
58 |
+
|
59 |
+
For example to use the sample files:
|
60 |
+
|
61 |
+
> evalb -p sample.prm sample.gld sample.tst
|
62 |
+
|
63 |
+
|
64 |
+
|
65 |
+
[3] OPTIONS
|
66 |
+
|
67 |
+
You can specify system parameters in the command line options.
|
68 |
+
Other options concerning the evaluation metrics should be specified
|
69 |
+
in parameter file, described later.
|
70 |
+
|
71 |
+
-p param_file parameter file
|
72 |
+
-d debug mode
|
73 |
+
-e n number of error to kill (default=10)
|
74 |
+
-h help
|
75 |
+
|
76 |
+
|
77 |
+
|
78 |
+
[4] OUTPUT FORMAT FROM THE SCORER
|
79 |
+
|
80 |
+
The scorer gives individual scores for each sentence, for
|
81 |
+
example:
|
82 |
+
|
83 |
+
Sent. Matched Bracket Cross Correct Tag
|
84 |
+
ID Len. Stat. Recal Prec. Bracket gold test Bracket Words Tags Accracy
|
85 |
+
============================================================================
|
86 |
+
1 8 0 100.00 100.00 5 5 5 0 6 5 83.33
|
87 |
+
|
88 |
+
At the end of the output the === Summary === section gives statistics
|
89 |
+
for all sentences, and for sentences <=40 words in length. The summary
|
90 |
+
contains the following information:
|
91 |
+
|
92 |
+
i) Number of sentences -- total number of sentences.
|
93 |
+
|
94 |
+
ii) Number of Error/Skip sentences -- should both be 0 if there is no
|
95 |
+
problem with the parsed/gold files.
|
96 |
+
|
97 |
+
iii) Number of valid sentences = Number of sentences - Number of Error/Skip
|
98 |
+
sentences
|
99 |
+
|
100 |
+
iv) Bracketing recall = (number of correct constituents)
|
101 |
+
----------------------------------------
|
102 |
+
(number of constituents in the goldfile)
|
103 |
+
|
104 |
+
v) Bracketing precision = (number of correct constituents)
|
105 |
+
----------------------------------------
|
106 |
+
(number of constituents in the parsed file)
|
107 |
+
|
108 |
+
vi) Complete match = percentage of sentences where recall and precision are
|
109 |
+
both 100%.
|
110 |
+
|
111 |
+
vii) Average crossing = (number of constituents crossing a goldfile constituent)
|
112 |
+
----------------------------------------------------
|
113 |
+
(number of sentences)
|
114 |
+
|
115 |
+
viii) No crossing = percentage of sentences which have 0 crossing brackets.
|
116 |
+
|
117 |
+
ix) 2 or less crossing = percentage of sentences which have <=2 crossing brackets.
|
118 |
+
|
119 |
+
x) Tagging accuracy = percentage of correct POS tags (but see [7].3 for exact
|
120 |
+
details of what is counted).
|
121 |
+
|
122 |
+
|
123 |
+
|
124 |
+
[5] HOW TO CREATE A GOLDFILE FROM THE PENN TREEBANK
|
125 |
+
|
126 |
+
|
127 |
+
The gold and parsed files are in a format similar to this:
|
128 |
+
|
129 |
+
(TOP (S (INTJ (RB No)) (, ,) (NP (PRP it)) (VP (VBD was) (RB n't) (NP (NNP Black) (NNP Monday))) (. .)))
|
130 |
+
|
131 |
+
To create a gold file from the treebank:
|
132 |
+
|
133 |
+
tgrep -wn '/.*/' | tgrep_proc.prl
|
134 |
+
|
135 |
+
will produce a goldfile in the required format. ("tgrep -wn '/.*/'" prints
|
136 |
+
parse trees, "tgrep_proc.prl" just skips blank lines).
|
137 |
+
|
138 |
+
For example, to produce a goldfile for section 23 of the treebank:
|
139 |
+
|
140 |
+
tgrep -wn '/.*/' | tail +90895 | tgrep_proc.prl | sed 2416q > sec23.gold
|
141 |
+
|
142 |
+
|
143 |
+
|
144 |
+
[6] THE PARAMETER (.prm) FILE
|
145 |
+
|
146 |
+
|
147 |
+
The .prm file sets options regarding the scoring method. COLLINS.prm gives
|
148 |
+
the same scoring behaviour as the scorer used in (Collins 97). The options
|
149 |
+
chosen were:
|
150 |
+
|
151 |
+
1) LABELED 1
|
152 |
+
|
153 |
+
to give labelled precision/recall figures, i.e. a constituent must have the
|
154 |
+
same span *and* label as a constituent in the goldfile.
|
155 |
+
|
156 |
+
2) DELETE_LABEL TOP
|
157 |
+
|
158 |
+
Don't count the "TOP" label (which is always given in the output of tgrep)
|
159 |
+
when scoring.
|
160 |
+
|
161 |
+
3) DELETE_LABEL -NONE-
|
162 |
+
|
163 |
+
Remove traces (and all constituents which dominate nothing but traces) when
|
164 |
+
scoring. For example
|
165 |
+
|
166 |
+
.... (VP (VBD reported) (SBAR (-NONE- 0) (S (-NONE- *T*-1)))) (. .)))
|
167 |
+
|
168 |
+
would be processed to give
|
169 |
+
|
170 |
+
.... (VP (VBD reported)) (. .)))
|
171 |
+
|
172 |
+
|
173 |
+
4)
|
174 |
+
DELETE_LABEL , -- for the purposes of scoring remove punctuation
|
175 |
+
DELETE_LABEL :
|
176 |
+
DELETE_LABEL ``
|
177 |
+
DELETE_LABEL ''
|
178 |
+
DELETE_LABEL .
|
179 |
+
|
180 |
+
5) DELETE_LABEL_FOR_LENGTH -NONE- -- don't include traces when calculating
|
181 |
+
the length of a sentence (important
|
182 |
+
when classifying a sentence as <=40
|
183 |
+
words or >40 words)
|
184 |
+
|
185 |
+
6) EQ_LABEL ADVP PRT
|
186 |
+
|
187 |
+
Count ADVP and PRT as being the same label when scoring.
|
188 |
+
|
189 |
+
|
190 |
+
|
191 |
+
|
192 |
+
[7] MORE DETAILS ABOUT THE SCORING ALGORITHM
|
193 |
+
|
194 |
+
|
195 |
+
1) The scorer initially processes the files to remove all nodes specified
|
196 |
+
by DELETE_LABEL in the .prm file. It also recursively removes nodes which
|
197 |
+
dominate nothing due to all their children being removed. For example, if
|
198 |
+
-NONE- is specified as a label to be deleted,
|
199 |
+
|
200 |
+
.... (VP (VBD reported) (SBAR (-NONE- 0) (S (-NONE- *T*-1)))) (. .)))
|
201 |
+
|
202 |
+
would be processed to give
|
203 |
+
|
204 |
+
.... (VP (VBD reported)) (. .)))
|
205 |
+
|
206 |
+
2) The scorer also removes all functional tags attached to non-terminals
|
207 |
+
(functional tags are prefixed with "-" or "=" in the treebank). For example
|
208 |
+
"NP-SBJ" is processed to give "NP", "NP=2" is changed to "NP".
|
209 |
+
|
210 |
+
|
211 |
+
3) Tagging accuracy counts tags for all words *except* any tags which are
|
212 |
+
deleted by a DELETE_LABEL specification in the .prm file. (For example, for
|
213 |
+
COLLINS.prm, punctuation tagged as "," ":" etc. would not be included).
|
214 |
+
|
215 |
+
4) When calculating the length of a sentence, all words with POS tags not
|
216 |
+
included in the "DELETE_LABEL_FOR_LENGTH" list in the .prm file are
|
217 |
+
counted. (For COLLINS.prm, only "-NONE-" is specified in this list, so
|
218 |
+
traces are removed before calculating the length of the sentence).
|
219 |
+
|
220 |
+
5) There are some subtleties in scoring when either the goldfile or parsed
|
221 |
+
file contains multiple constituents for the same span which have the same
|
222 |
+
non-terminal label. e.g. (NP (NP the man)) If the goldfile contains n
|
223 |
+
constituents for the same span, and the parsed file contains m constituents
|
224 |
+
with that nonterminal, the scorer works as follows:
|
225 |
+
|
226 |
+
i) If m>n, then the precision is n/m, recall is 100%
|
227 |
+
|
228 |
+
ii) If n>m, then the precision is 100%, recall is m/n.
|
229 |
+
|
230 |
+
iii) If n==m, recall and precision are both 100%.
|
parsing/EVALB_SPMRL/evalb.c
ADDED
@@ -0,0 +1,1724 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
/*****************************************************************/
|
2 |
+
/* evalb [-p param_file] [-dh] [-e n] gold-file test-file */
|
3 |
+
/* */
|
4 |
+
/* Evaluate bracketing in test-file against gold-file. */
|
5 |
+
/* Return recall, precision, tagging accuracy. */
|
6 |
+
/* */
|
7 |
+
/* <option> */
|
8 |
+
/* -p param_file parameter file */
|
9 |
+
/* -d debug mode */
|
10 |
+
/* -e n number of error to kill (default=10) */
|
11 |
+
/* -h help */
|
12 |
+
/* */
|
13 |
+
/* Satoshi Sekine (NYU) */
|
14 |
+
/* Mike Collins (UPenn) */
|
15 |
+
/* */
|
16 |
+
/* October.1997 */
|
17 |
+
/* */
|
18 |
+
/* Please refer README for the update information */
|
19 |
+
/*****************************************************************/
|
20 |
+
|
21 |
+
// Djamé: version record added for history's sake.
|
22 |
+
// note to future updater: please add your changelog below
|
23 |
+
|
24 |
+
// Modified by Slav Petrov and Ryanc Mc Donald (for the sancl 2012 shared task)
|
25 |
+
// ===> making it less sensitive to punct POS errors leading to
|
26 |
+
// ===> mismatch of length
|
27 |
+
|
28 |
+
// Modified by Djamé Seddah (for the spmrl shared 2013 task)
|
29 |
+
// ===> making it able to cope with Arabic very long lines (byte wise)
|
30 |
+
// ===> now limit is 50000 bytes, was 5000 (damn bug, if you ask me)
|
31 |
+
// ===> modified to cope with spmrl 2013 morph features (suggested by Thomas Muller from IMS)
|
32 |
+
// please check the constant macro section
|
33 |
+
// Correction of bug causing hard slowdown (due to max_word_in_sent set too high)
|
34 |
+
// former version was 78x slower than regular evalb.
|
35 |
+
|
36 |
+
|
37 |
+
|
38 |
+
|
39 |
+
|
40 |
+
|
41 |
+
#include <stdio.h>
|
42 |
+
#include <stdlib.h> //### added for exit, atoi decls
|
43 |
+
#include <ctype.h>
|
44 |
+
#include <string.h>
|
45 |
+
#ifndef __APPLE__ # dj: added to compile on mac os x
|
46 |
+
#include <malloc.h>
|
47 |
+
#endif
|
48 |
+
|
49 |
+
/* Internal Data format -------------------------------------------*/
|
50 |
+
/* */
|
51 |
+
/* (S (NP (NNX this)) (VP (VBX is) (NP (DT a) (NNX pen))) (SYM .)) */
|
52 |
+
/* */
|
53 |
+
/* wn=5 */
|
54 |
+
/* word label */
|
55 |
+
/* terminal[0] = this NNX */
|
56 |
+
/* terminal[1] = is VBX */
|
57 |
+
/* terminal[2] = a DT */
|
58 |
+
/* terminal[3] = pen NNX */
|
59 |
+
/* terminal[4] = . SYM */
|
60 |
+
/* */
|
61 |
+
/* bn=4 */
|
62 |
+
/* start end label */
|
63 |
+
/* bracket[0] = 0 5 S */
|
64 |
+
/* bracket[1] = 0 0 NP */
|
65 |
+
/* bracket[2] = 1 4 VP */
|
66 |
+
/* bracket[3] = 2 4 NP */
|
67 |
+
/* */
|
68 |
+
/* matched bracketing */
|
69 |
+
/* Recall = --------------------------- */
|
70 |
+
/* # of bracket in ref-data */
|
71 |
+
/* */
|
72 |
+
/* matched bracketing */
|
73 |
+
/* Precision = --------------------------- */
|
74 |
+
/* # of bracket in test-data */
|
75 |
+
/* */
|
76 |
+
/*-----------------------------------------------------------------*/
|
77 |
+
|
78 |
+
/******************/
|
79 |
+
/* constant macro */
|
80 |
+
/******************/
|
81 |
+
|
82 |
+
#define MAX_SENT_LEN 50000 //Djamé : was not used
|
83 |
+
#define MAX_WORD_IN_SENT 1000
|
84 |
+
#define MAX_BRACKET_IN_SENT 2000
|
85 |
+
#define MAX_WORD_LEN 100
|
86 |
+
#define MAX_LABEL_LEN 300
|
87 |
+
#define MAX_QUOTE_TERM 20
|
88 |
+
|
89 |
+
#define MAX_DELETE_LABEL 1000
|
90 |
+
#define MAX_EQ_LABEL 1000
|
91 |
+
#define MAX_EQ_WORD 1000
|
92 |
+
|
93 |
+
#define MAX_LINE_LEN 500
|
94 |
+
|
95 |
+
#define DEFAULT_MAX_ERROR 10
|
96 |
+
#define DEFAULT_CUT_LEN 40
|
97 |
+
|
98 |
+
/*************/
|
99 |
+
/* structure */
|
100 |
+
/*************/
|
101 |
+
|
102 |
+
typedef struct ss_terminal {
|
103 |
+
char word[MAX_WORD_LEN];
|
104 |
+
char label[MAX_LABEL_LEN];
|
105 |
+
int result; /* 0:unmatch, 1:match, 9:undef */
|
106 |
+
} s_terminal;
|
107 |
+
|
108 |
+
typedef struct ss_term_ind {
|
109 |
+
s_terminal term;
|
110 |
+
int index;
|
111 |
+
int bracket;
|
112 |
+
int endslen;
|
113 |
+
int ends[MAX_BRACKET_IN_SENT];
|
114 |
+
} s_term_ind;
|
115 |
+
|
116 |
+
typedef struct ss_bracket {
|
117 |
+
int start;
|
118 |
+
int end;
|
119 |
+
unsigned int buf_start;
|
120 |
+
unsigned int buf_end;
|
121 |
+
char label[MAX_LABEL_LEN];
|
122 |
+
int result; /* 0: unmatch, 1:match, 5:delete 9:undef */
|
123 |
+
} s_bracket;
|
124 |
+
|
125 |
+
|
126 |
+
/* A pair of strings (labels or words) declared equivalent in the parameter file. */
struct ss_equiv {
    char *s1;    /* first member of the pair  */
    char *s2;    /* second member of the pair */
};
typedef struct ss_equiv s_equiv;
|
130 |
+
|
131 |
+
|
132 |
+
/****************************/
|
133 |
+
/* global variables */
|
134 |
+
/* gold-data: suffix = 1 */
|
135 |
+
/* test-data: suffix = 2 */
|
136 |
+
/****************************/
|
137 |
+
|
138 |
+
/*---------------*/
|
139 |
+
/* Sentence data */
|
140 |
+
/*---------------*/
|
141 |
+
int wn1, wn2; /* number of words in sentence */
|
142 |
+
int r_wn1; /* number of words in sentence */
|
143 |
+
/* which only ignores labels in */
|
144 |
+
/* DELETE_LABEL_FOR_LENGTH */
|
145 |
+
|
146 |
+
s_terminal terminal1[MAX_WORD_IN_SENT]; /* terminal information */
|
147 |
+
s_terminal terminal2[MAX_WORD_IN_SENT];
|
148 |
+
|
149 |
+
s_term_ind quotterm1[MAX_QUOTE_TERM]; /* special terminals ("'","POS") */
|
150 |
+
s_term_ind quotterm2[MAX_QUOTE_TERM];
|
151 |
+
|
152 |
+
int bn1, bn2; /* number of brackets */
|
153 |
+
|
154 |
+
int r_bn1, r_bn2; /* number of brackets */
|
155 |
+
/* after deletion */
|
156 |
+
|
157 |
+
s_bracket bracket1[MAX_BRACKET_IN_SENT]; /* bracket information */
|
158 |
+
s_bracket bracket2[MAX_BRACKET_IN_SENT];
|
159 |
+
|
160 |
+
|
161 |
+
/*------------*/
|
162 |
+
/* Total data */
|
163 |
+
/*------------*/
|
164 |
+
int TOTAL_bn1, TOTAL_bn2, TOTAL_match; /* total number of brackets */
|
165 |
+
int TOTAL_sent; /* No. of sentence */
|
166 |
+
int TOTAL_error_sent; /* No. of error sentence */
|
167 |
+
int TOTAL_skip_sent; /* No. of skip sentence */
|
168 |
+
int TOTAL_comp_sent; /* No. of complete match sent */
|
169 |
+
int TOTAL_word; /* total number of word */
|
170 |
+
int TOTAL_crossing; /* total crossing */
|
171 |
+
int TOTAL_no_crossing; /* no crossing sentence */
|
172 |
+
int TOTAL_2L_crossing; /* 2 or less crossing sentence */
|
173 |
+
int TOTAL_correct_tag; /* total correct tagging */
|
174 |
+
|
175 |
+
int TOT_cut_len = DEFAULT_CUT_LEN; /* Cut-off length in statistics */
|
176 |
+
|
177 |
+
/* data for sentences with len <= CUT_LEN */
|
178 |
+
/* Historically it was 40. */
|
179 |
+
int TOT40_bn1, TOT40_bn2, TOT40_match; /* total number of brackets */
|
180 |
+
int TOT40_sent; /* No. of sentence */
|
181 |
+
int TOT40_error_sent; /* No. of error sentence */
|
182 |
+
int TOT40_skip_sent; /* No. of skip sentence */
|
183 |
+
int TOT40_comp_sent; /* No. of complete match sent */
|
184 |
+
int TOT40_word; /* total number of word */
|
185 |
+
int TOT40_crossing; /* total crossing */
|
186 |
+
int TOT40_no_crossing; /* no crossing sentence */
|
187 |
+
int TOT40_2L_crossing; /* 2 or less crossing sentence */
|
188 |
+
int TOT40_correct_tag; /* total correct tagging */
|
189 |
+
|
190 |
+
/*------------*/
|
191 |
+
/* miscellaneous */
|
192 |
+
/*------------*/
|
193 |
+
int Line; /* line number */
|
194 |
+
int Error_count = 0; /* Error count */
|
195 |
+
int Status; /* Result status for each sent */
|
196 |
+
/* 0: OK, 1: skip, 2: error */
|
197 |
+
|
198 |
+
/*-------------------*/
|
199 |
+
/* stack manipulation */
|
200 |
+
/*-------------------*/
|
201 |
+
int stack_top;
|
202 |
+
int stack[MAX_BRACKET_IN_SENT];
|
203 |
+
|
204 |
+
/************************************************************/
|
205 |
+
/* User parameters which can be specified in parameter file */
|
206 |
+
/************************************************************/
|
207 |
+
|
208 |
+
/*------------------------------------------*/
|
209 |
+
/* Debug mode */
|
210 |
+
/* print out data for individual sentence */
|
211 |
+
/*------------------------------------------*/
|
212 |
+
int DEBUG=0;
|
213 |
+
|
214 |
+
/*------------------------------------------*/
|
215 |
+
/* MAX error */
|
216 |
+
/* Number of error to stop the process. */
|
217 |
+
/* This is useful if there could be */
|
218 |
+
/* tokenization error. */
|
219 |
+
/* The process will stop when this number*/
|
220 |
+
/* of errors are accumulated. */
|
221 |
+
/*------------------------------------------*/
|
222 |
+
int Max_error = DEFAULT_MAX_ERROR;
|
223 |
+
|
224 |
+
/*------------------------------------------*/
|
225 |
+
/* Cut-off length for statistics */
|
226 |
+
/* int TOT_cut_len = DEFAULT_CUT_LEN; */
|
227 |
+
/* (Defined above) */
|
228 |
+
/*------------------------------------------*/
|
229 |
+
|
230 |
+
|
231 |
+
/*------------------------------------------*/
|
232 |
+
/* unlabeled or labeled bracketing */
|
233 |
+
/* 0: unlabeled bracketing */
|
234 |
+
/* 1: labeled bracketing */
|
235 |
+
/*------------------------------------------*/
|
236 |
+
int F_label = 1;
|
237 |
+
|
238 |
+
/*------------------------------------------*/
|
239 |
+
/* Delete labels */
|
240 |
+
/* list of labels to be ignored. */
|
241 |
+
/* If it is a pre-terminal label, delete */
|
242 |
+
/* the word along with the brackets. */
|
243 |
+
/* If it is a non-terminal label, just */
|
244 |
+
/* delete the brackets (don't delete */
|
245 |
+
/* children). */
|
246 |
+
/*------------------------------------------*/
|
247 |
+
char *Delete_label[MAX_DELETE_LABEL];
|
248 |
+
int Delete_label_n = 0;
|
249 |
+
|
250 |
+
/*------------------------------------------*/
|
251 |
+
/* Delete labels for length calculation */
|
252 |
+
/* list of labels to be ignored for */
|
253 |
+
/* length calculation purpose */
|
254 |
+
/*------------------------------------------*/
|
255 |
+
char *Delete_label_for_length[MAX_DELETE_LABEL];
|
256 |
+
int Delete_label_for_length_n = 0;
|
257 |
+
|
258 |
+
/*------------------------------------------*/
|
259 |
+
/* Labels to be considered for misquote */
|
260 |
+
/* (could be possessive or quote) */
|
261 |
+
/*------------------------------------------*/
|
262 |
+
char *Quote_term[MAX_QUOTE_TERM];
|
263 |
+
int Quote_term_n = 0;
|
264 |
+
|
265 |
+
/*------------------------------------------*/
|
266 |
+
/* Equivalent labels, words */
|
267 |
+
/* the pairs are considered equivalent */
|
268 |
+
/* This is non-directional. */
|
269 |
+
/*------------------------------------------*/
|
270 |
+
s_equiv EQ_label[MAX_EQ_LABEL];
|
271 |
+
int EQ_label_n = 0;
|
272 |
+
|
273 |
+
s_equiv EQ_word[MAX_EQ_WORD];
|
274 |
+
int EQ_word_n = 0;
|
275 |
+
|
276 |
+
|
277 |
+
// added by djame
|
278 |
+
int spmrl_max_line_to_read=-1 ;
|
279 |
+
int spmrl_compact_view=0; // default : classic view
|
280 |
+
int spmrl_compact_view40=0; // if one, prints <40 sentence in compact view
|
281 |
+
int spmrl_count_bad_sent=0; // default no count
|
282 |
+
int spmrl_print_filename=0; // default not to print name
|
283 |
+
|
284 |
+
/************************/
|
285 |
+
/* Function return-type */
|
286 |
+
/************************/
|
287 |
+
int main();
|
288 |
+
void init_global();
|
289 |
+
void print_head();
|
290 |
+
void init();
|
291 |
+
void read_parameter_file();
|
292 |
+
void set_param();
|
293 |
+
int narg();
|
294 |
+
int read_line();
|
295 |
+
|
296 |
+
void pushb();
|
297 |
+
int popb();
|
298 |
+
int stackempty();
|
299 |
+
|
300 |
+
void calc_result(unsigned char *buf1,unsigned char *buf);
|
301 |
+
void fix_quote();
|
302 |
+
void reinsert_term();
|
303 |
+
void massage_data();
|
304 |
+
int massage_data_gold_only(); // djame: non destructive
|
305 |
+
void modify_label();
|
306 |
+
void individual_result();
|
307 |
+
void print_total();
|
308 |
+
void dsp_info();
|
309 |
+
int my_isspace(char c); // Djamé: added for debugging' sake
|
310 |
+
|
311 |
+
int is_terminator();
|
312 |
+
int is_deletelabel();
|
313 |
+
int is_deletelabel_for_length();
|
314 |
+
int is_quote_term();
|
315 |
+
int word_comp();
|
316 |
+
int label_comp();
|
317 |
+
|
318 |
+
void Error();
|
319 |
+
void Fatal();
|
320 |
+
void Usage();
|
321 |
+
|
322 |
+
/* ### provided by std headers
|
323 |
+
int fprintf();
|
324 |
+
int printf();
|
325 |
+
int atoi();
|
326 |
+
int fclose();
|
327 |
+
int sscanf();
|
328 |
+
*/
|
329 |
+
|
330 |
+
/***********/
|
331 |
+
/* program */
|
332 |
+
/***********/
|
333 |
+
#define ARG_CHECK(st) if(!(*++(*argv) || (--argc && *++argv))){ \
|
334 |
+
fprintf(stderr,"Missing argument: %s\n",st); \
|
335 |
+
}
|
336 |
+
|
337 |
+
|
338 |
+
char *filename1, *filename2;
|
339 |
+
int
|
340 |
+
main(argc,argv)
|
341 |
+
int argc;
|
342 |
+
char *argv[];
|
343 |
+
{
|
344 |
+
|
345 |
+
FILE *fd1, *fd2;
|
346 |
+
unsigned char buff[MAX_SENT_LEN];
|
347 |
+
unsigned char buff1[MAX_SENT_LEN];
|
348 |
+
int quiet=0; // Djame
|
349 |
+
filename1=NULL;
|
350 |
+
filename2=NULL;
|
351 |
+
|
352 |
+
|
353 |
+
for(argc--,argv++;argc>0;argc--,argv++){
|
354 |
+
if(**argv == '-'){
|
355 |
+
while(*++(*argv)){
|
356 |
+
switch(**argv){
|
357 |
+
|
358 |
+
case 'h': /* help */
|
359 |
+
Usage();
|
360 |
+
exit(1);
|
361 |
+
|
362 |
+
case 'd': /* debug mode */
|
363 |
+
DEBUG = 1;
|
364 |
+
goto nextarg;
|
365 |
+
|
366 |
+
case 'D': /* debug mode */
|
367 |
+
DEBUG = 2;
|
368 |
+
goto nextarg;
|
369 |
+
|
370 |
+
case 'c': /* cut-off length */
|
371 |
+
ARG_CHECK("cut-off length for statistices");
|
372 |
+
TOT_cut_len = atoi(*argv);
|
373 |
+
fprintf(stderr,"cutoff %d\n",TOT_cut_len);
|
374 |
+
//exit(0);
|
375 |
+
goto nextarg;
|
376 |
+
|
377 |
+
|
378 |
+
case 'e': /* max error */
|
379 |
+
ARG_CHECK("number of error to kill");
|
380 |
+
Max_error = atoi(*argv);
|
381 |
+
goto nextarg;
|
382 |
+
|
383 |
+
case 'p': /* parameter file */
|
384 |
+
ARG_CHECK("parameter file");
|
385 |
+
read_parameter_file(*argv);
|
386 |
+
goto nextarg;
|
387 |
+
case 'K':
|
388 |
+
ARG_CHECK("Max nb of sentences to read");
|
389 |
+
spmrl_max_line_to_read=atoi(*argv);
|
390 |
+
goto nextarg;
|
391 |
+
case 'L': // added by djame to maintain compatibility with spmrl 2013 shared task's results extraction rules.
|
392 |
+
spmrl_compact_view=1;
|
393 |
+
goto nextarg;
|
394 |
+
case 'l': // added by djame to maintain compatibility with spmrl 2013 shared task's results extraction rules.
|
395 |
+
spmrl_compact_view=1;
|
396 |
+
spmrl_compact_view40=1;
|
397 |
+
goto nextarg;
|
398 |
+
case 'X': // added by djame : count skipping sentences (()) as bad sentence
|
399 |
+
spmrl_count_bad_sent=1;
|
400 |
+
goto nextarg;
|
401 |
+
case 'V': // added by djame to add gold_name vs test_file in the outpu
|
402 |
+
spmrl_print_filename=1;
|
403 |
+
goto nextarg;
|
404 |
+
default:
|
405 |
+
Usage();
|
406 |
+
exit(0);
|
407 |
+
}
|
408 |
+
}
|
409 |
+
} else {
|
410 |
+
if(filename1==NULL){
|
411 |
+
filename1 = *argv;
|
412 |
+
}else if(filename2==NULL){
|
413 |
+
filename2 = *argv;
|
414 |
+
}
|
415 |
+
}
|
416 |
+
nextarg: continue;
|
417 |
+
}
|
418 |
+
|
419 |
+
init_global();
|
420 |
+
|
421 |
+
|
422 |
+
if((fd1 = fopen(filename1,"r"))==NULL){
|
423 |
+
Fatal("Can't open gold file (%s)\n",filename1);
|
424 |
+
}
|
425 |
+
if((fd2 = fopen(filename2,"r"))==NULL){
|
426 |
+
Fatal("Can't open test file (%s)\n",filename2);
|
427 |
+
}
|
428 |
+
|
429 |
+
print_head();
|
430 |
+
|
431 |
+
for(Line=1;fgets(buff,MAX_SENT_LEN,fd1)!=NULL;Line++){
|
432 |
+
|
433 |
+
init();
|
434 |
+
|
435 |
+
/* READ 1 */
|
436 |
+
r_wn1 = read_line(buff,terminal1,quotterm1,&wn1,bracket1,&bn1);
|
437 |
+
|
438 |
+
strcpy(buff1,buff);
|
439 |
+
|
440 |
+
/* READ 2 */
|
441 |
+
if(fgets(buff,MAX_SENT_LEN,fd2)==NULL){
|
442 |
+
Error("Number of lines unmatch (too many lines in gold file)\n");
|
443 |
+
break;
|
444 |
+
}
|
445 |
+
|
446 |
+
read_line(buff,terminal2,quotterm2,&wn2,bracket2,&bn2);
|
447 |
+
|
448 |
+
/* Calculate result and print it */
|
449 |
+
calc_result(buff1,buff);
|
450 |
+
|
451 |
+
if(DEBUG>=1){
|
452 |
+
dsp_info();
|
453 |
+
}
|
454 |
+
// Added by djame
|
455 |
+
if (spmrl_max_line_to_read!=-1){
|
456 |
+
if ((Line+1) > spmrl_max_line_to_read ){
|
457 |
+
quiet=1;
|
458 |
+
break; // evaluate only spmrl_max_line_to_read -1 (to keep compatibility with lines )
|
459 |
+
}
|
460 |
+
}
|
461 |
+
|
462 |
+
}
|
463 |
+
|
464 |
+
if( (quiet==0) && (fgets(buff,MAX_SENT_LEN,fd2)!=NULL)){
|
465 |
+
Error("Number of lines unmatch (too many lines in test file)\n");
|
466 |
+
}
|
467 |
+
|
468 |
+
print_total();
|
469 |
+
|
470 |
+
return (0);
|
471 |
+
}
|
472 |
+
|
473 |
+
|
474 |
+
/*-----------------------------*/
|
475 |
+
/* initialize global variables */
|
476 |
+
/*-----------------------------*/
|
477 |
+
void
|
478 |
+
init_global()
|
479 |
+
{
|
480 |
+
TOTAL_bn1 = TOTAL_bn2 = TOTAL_match = 0;
|
481 |
+
TOTAL_sent = TOTAL_error_sent = TOTAL_skip_sent = TOTAL_comp_sent = 0;
|
482 |
+
TOTAL_word = TOTAL_correct_tag = 0;
|
483 |
+
TOTAL_crossing = 0;
|
484 |
+
TOTAL_no_crossing = TOTAL_2L_crossing = 0;
|
485 |
+
|
486 |
+
TOT40_bn1 = TOT40_bn2 = TOT40_match = 0;
|
487 |
+
TOT40_sent = TOT40_error_sent = TOT40_skip_sent = TOT40_comp_sent = 0;
|
488 |
+
TOT40_word = TOT40_correct_tag = 0;
|
489 |
+
TOT40_crossing = 0;
|
490 |
+
TOT40_no_crossing = TOT40_2L_crossing = 0;
|
491 |
+
|
492 |
+
}
|
493 |
+
|
494 |
+
|
495 |
+
/*------------------*/
|
496 |
+
/* print head title */
|
497 |
+
/*------------------*/
|
498 |
+
void
print_head()
{
    /* the three lines of the per-sentence table header, written verbatim */
    static const char *const banner[] = {
	" Sent. Matched Bracket Cross Correct Tag\n",
	" ID Len. Stat. Recal Prec. Bracket gold test Bracket Words Tags Accracy\n",
	"============================================================================\n"
    };
    size_t row;

    for(row=0; row < sizeof banner / sizeof banner[0]; row++){
	fputs(banner[row],stdout);
    }
}
|
505 |
+
|
506 |
+
|
507 |
+
/*-----------------------------------------------*/
|
508 |
+
/* initialization at each individual computation */
|
509 |
+
/*-----------------------------------------------*/
|
510 |
+
void
|
511 |
+
init()
|
512 |
+
{
|
513 |
+
int i;
|
514 |
+
|
515 |
+
wn1 = 0;
|
516 |
+
wn2 = 0;
|
517 |
+
bn1 = 0;
|
518 |
+
bn2 = 0;
|
519 |
+
r_bn1 = 0;
|
520 |
+
r_bn2 = 0;
|
521 |
+
|
522 |
+
for(i=0;i<MAX_WORD_IN_SENT;i++){
|
523 |
+
terminal1[i].word[0] = '\0';
|
524 |
+
terminal1[i].label[0] = '\0';
|
525 |
+
terminal1[i].result = 9;
|
526 |
+
terminal2[i].word[0] = '\0';
|
527 |
+
terminal2[i].label[0] = '\0';
|
528 |
+
terminal2[i].result = 9;
|
529 |
+
}
|
530 |
+
|
531 |
+
for(i=0;i<MAX_QUOTE_TERM;i++){
|
532 |
+
quotterm1[i].term.word[0] = '\0';
|
533 |
+
quotterm1[i].term.label[0] = '\0';
|
534 |
+
quotterm1[i].term.result = 9;
|
535 |
+
quotterm1[i].index = -1;
|
536 |
+
quotterm1[i].bracket = -1;
|
537 |
+
quotterm2[i].term.word[0] = '\0';
|
538 |
+
quotterm2[i].term.label[0] = '\0';
|
539 |
+
quotterm2[i].term.result = 9;
|
540 |
+
quotterm2[i].index = -1;
|
541 |
+
quotterm2[i].bracket = -1;
|
542 |
+
}
|
543 |
+
|
544 |
+
for(i=0;i<MAX_BRACKET_IN_SENT;i++){
|
545 |
+
bracket1[i].start = -1;
|
546 |
+
bracket1[i].end = -1;
|
547 |
+
bracket1[i].label[0] = '\0';
|
548 |
+
bracket1[i].result = 9;
|
549 |
+
bracket2[i].start = -1;
|
550 |
+
bracket2[i].end = -1;
|
551 |
+
bracket2[i].label[0] = '\0';
|
552 |
+
bracket2[i].result = 9;
|
553 |
+
}
|
554 |
+
|
555 |
+
Status = 0;
|
556 |
+
}
|
557 |
+
|
558 |
+
/*----------------*/
|
559 |
+
/* parameter file */
|
560 |
+
/*----------------*/
|
561 |
+
void
|
562 |
+
read_parameter_file(filename)
|
563 |
+
char *filename;
|
564 |
+
{
|
565 |
+
char buff[MAX_LINE_LEN];
|
566 |
+
FILE *fd;
|
567 |
+
int line;
|
568 |
+
int i;
|
569 |
+
|
570 |
+
if((fd=fopen(filename,"r"))==NULL){
|
571 |
+
Fatal("Can't open parameter file (%s)\n",filename);
|
572 |
+
}
|
573 |
+
|
574 |
+
for(line=1;fgets(buff,MAX_LINE_LEN,fd)!=NULL;line++){
|
575 |
+
|
576 |
+
/* clean up the tail and find unvalid line */
|
577 |
+
/*-----------------------------------------*/
|
578 |
+
for(i=strlen(buff)-1;i>0 && (isspace(buff[i]) || buff[i]=='\n');i--){
|
579 |
+
buff[i]='\0';
|
580 |
+
}
|
581 |
+
if(buff[0]=='#' || /* comment-line */
|
582 |
+
strlen(buff)<3){ /* too short, just ignore */
|
583 |
+
continue;
|
584 |
+
}
|
585 |
+
|
586 |
+
/* place the parameter and value */
|
587 |
+
/*-------------------------------*/
|
588 |
+
for(i=0;!isspace(buff[i]);i++);
|
589 |
+
for(;isspace(buff[i]) && buff[i]!='\0';i++);
|
590 |
+
if(buff[i]=='\0'){
|
591 |
+
fprintf(stderr,"Empty value in parameter file (%d)\n",line);
|
592 |
+
}
|
593 |
+
|
594 |
+
/* set parameter and value */
|
595 |
+
/*-------------------------*/
|
596 |
+
set_param(buff,buff+i);
|
597 |
+
}
|
598 |
+
|
599 |
+
fclose(fd);
|
600 |
+
}
|
601 |
+
|
602 |
+
|
603 |
+
#define STRNCMP(s) (strncmp(param,s,strlen(s))==0 && \
|
604 |
+
(param[strlen(s)]=='\0' || isspace(param[strlen(s)])))
|
605 |
+
|
606 |
+
|
607 |
+
void
|
608 |
+
set_param(param,value)
|
609 |
+
char *param, *value;
|
610 |
+
{
|
611 |
+
char l1[MAX_LABEL_LEN], l2[MAX_LABEL_LEN];
|
612 |
+
|
613 |
+
if(STRNCMP("DEBUG")){
|
614 |
+
|
615 |
+
DEBUG = atoi(value);
|
616 |
+
|
617 |
+
}else if(STRNCMP("MAX_ERROR")){
|
618 |
+
|
619 |
+
Max_error = atoi(value);
|
620 |
+
|
621 |
+
}else if(STRNCMP("CUTOFF_LEN")){
|
622 |
+
|
623 |
+
TOT_cut_len = atoi(value);
|
624 |
+
|
625 |
+
}else if(STRNCMP("LABELED")){
|
626 |
+
|
627 |
+
F_label = atoi(value);
|
628 |
+
|
629 |
+
}else if(STRNCMP("DELETE_LABEL")){
|
630 |
+
|
631 |
+
Delete_label[Delete_label_n] = (char *)malloc(strlen(value)+1);
|
632 |
+
strcpy(Delete_label[Delete_label_n],value);
|
633 |
+
Delete_label_n++;
|
634 |
+
|
635 |
+
}else if(STRNCMP("DELETE_LABEL_FOR_LENGTH")){
|
636 |
+
|
637 |
+
Delete_label_for_length[Delete_label_for_length_n] = (char *)malloc(strlen(value)+1);
|
638 |
+
strcpy(Delete_label_for_length[Delete_label_for_length_n],value);
|
639 |
+
Delete_label_for_length_n++;
|
640 |
+
|
641 |
+
}else if(STRNCMP("QUOTE_LABEL")){
|
642 |
+
|
643 |
+
Quote_term[Quote_term_n] = (char *)malloc(strlen(value)+1);
|
644 |
+
strcpy(Quote_term[Quote_term_n],value);
|
645 |
+
Quote_term_n++;
|
646 |
+
|
647 |
+
}else if(STRNCMP("EQ_LABEL")){
|
648 |
+
|
649 |
+
if(narg(value)!=2){
|
650 |
+
fprintf(stderr,"EQ_LABEL requires two values\n");
|
651 |
+
return;
|
652 |
+
}
|
653 |
+
sscanf(value,"%s %s",l1,l2);
|
654 |
+
EQ_label[EQ_label_n].s1 = (char *)malloc(strlen(l1)+1);
|
655 |
+
strcpy(EQ_label[EQ_label_n].s1,l1);
|
656 |
+
EQ_label[EQ_label_n].s2 = (char *)malloc(strlen(l2)+1);
|
657 |
+
strcpy(EQ_label[EQ_label_n].s2,l2);
|
658 |
+
EQ_label_n++;
|
659 |
+
|
660 |
+
}else if(STRNCMP("EQ_WORD")){
|
661 |
+
|
662 |
+
if(narg(value)!=2){
|
663 |
+
fprintf(stderr,"EQ_WORD requires two values\n");
|
664 |
+
return;
|
665 |
+
}
|
666 |
+
sscanf(value,"%s %s",l1,l2);
|
667 |
+
EQ_word[EQ_word_n].s1 = (char *)malloc(strlen(l1)+1);
|
668 |
+
strcpy(EQ_word[EQ_word_n].s1,l1);
|
669 |
+
EQ_word[EQ_word_n].s2 = (char *)malloc(strlen(l2)+1);
|
670 |
+
strcpy(EQ_word[EQ_word_n].s2,l2);
|
671 |
+
EQ_word_n++;
|
672 |
+
|
673 |
+
}else{
|
674 |
+
|
675 |
+
fprintf(stderr,"Unknown keyword (%s) in parameter file\n",param);
|
676 |
+
|
677 |
+
}
|
678 |
+
}
|
679 |
+
|
680 |
+
|
681 |
+
/*
 * Count the whitespace-separated tokens in s.
 *
 * s: NUL-terminated string; leading, trailing and repeated whitespace is
 *    ignored.
 * Returns the number of tokens (0 for an empty or all-blank string).
 */
int
narg(char *s)
{
    int n = 0;

    for(;;){
	/* The casts keep isspace() defined for bytes >= 0x80 on platforms
	   where plain char is signed. */
	while(isspace((unsigned char)*s)){
	    s++;
	}
	if(*s=='\0'){
	    break;
	}
	n++;
	while(*s!='\0' && !isspace((unsigned char)*s)){
	    s++;
	}
    }

    return(n);
}
|
702 |
+
|
703 |
+
/*-----------------------------*/
|
704 |
+
/* Read line and gather data. */
|
705 |
+
/* Return length of sentence. */
|
706 |
+
/*-----------------------------*/
|
707 |
+
int
|
708 |
+
read_line(buff, terminal, quotterm, wn, bracket, bn)
|
709 |
+
char *buff;
|
710 |
+
s_terminal terminal[];
|
711 |
+
s_term_ind quotterm[];
|
712 |
+
int *wn;
|
713 |
+
s_bracket bracket[];
|
714 |
+
int *bn;
|
715 |
+
{
|
716 |
+
char *p, *q, label[MAX_LABEL_LEN], word[MAX_WORD_LEN];
|
717 |
+
int qt; /* quote term counter */
|
718 |
+
int wid, bid; /* word ID, bracket ID */
|
719 |
+
int n; /* temporary remembering the position */
|
720 |
+
int b; /* temporary remembering bid */
|
721 |
+
int i;
|
722 |
+
int len; /* length of the sentence */
|
723 |
+
|
724 |
+
len = 0;
|
725 |
+
stack_top=0;
|
726 |
+
|
727 |
+
for(p=buff,qt=0,wid=0,bid=0;*p!='\0';){
|
728 |
+
|
729 |
+
if(isspace(*p)){
|
730 |
+
p++;
|
731 |
+
continue;
|
732 |
+
|
733 |
+
/* open bracket */
|
734 |
+
/*--------------*/
|
735 |
+
}else if(*p=='('){
|
736 |
+
|
737 |
+
n=wid;
|
738 |
+
for(p++,i=0;!is_terminator(*p);p++,i++){
|
739 |
+
label[i]=*p;
|
740 |
+
}
|
741 |
+
label[i]='\0';
|
742 |
+
|
743 |
+
/* Find terminals */
|
744 |
+
q = p;
|
745 |
+
if(isspace(*q)){
|
746 |
+
for(q++;isspace(*q);q++);
|
747 |
+
for(i=0;!is_terminator(*q);q++,i++){
|
748 |
+
word[i]=*q;
|
749 |
+
}
|
750 |
+
word[i]='\0';
|
751 |
+
|
752 |
+
/* compute length */
|
753 |
+
if(*q==')' && !is_deletelabel_for_length(label)==1){
|
754 |
+
len++;
|
755 |
+
}
|
756 |
+
if (DEBUG>1)
|
757 |
+
printf("label=%s, word=%s, wid=%d\n",label,word,wid);
|
758 |
+
/* quote terminal */
|
759 |
+
if(*q==')' && is_quote_term(label,word)==1){
|
760 |
+
strcpy(quotterm[qt].term.word,word);
|
761 |
+
strcpy(quotterm[qt].term.label,label);
|
762 |
+
quotterm[qt].index = wid;
|
763 |
+
quotterm[qt].bracket = bid;
|
764 |
+
quotterm[qt].endslen = stack_top;
|
765 |
+
//quotterm[qt].ends = (int*)malloc(stack_top*sizeof(int));
|
766 |
+
memcpy(quotterm[qt].ends,stack,stack_top*sizeof(int));
|
767 |
+
qt++;
|
768 |
+
}
|
769 |
+
|
770 |
+
/* Slav: do not delete terminals */
|
771 |
+
/* delete terminal */
|
772 |
+
//if(*q==')' && is_deletelabel(label)==1){
|
773 |
+
// p = q+1;
|
774 |
+
// continue;
|
775 |
+
|
776 |
+
/* valid terminal */
|
777 |
+
//}else
|
778 |
+
if(*q==')'){
|
779 |
+
strcpy(terminal[wid].word,word);
|
780 |
+
strcpy(terminal[wid].label,label);
|
781 |
+
wid++;
|
782 |
+
p = q+1;
|
783 |
+
continue;
|
784 |
+
|
785 |
+
/* error */
|
786 |
+
}else if(*q!='('){
|
787 |
+
fprintf(stderr,"debug djam: q= %s\n",q);
|
788 |
+
Error("More than two elements in a bracket\n");
|
789 |
+
}
|
790 |
+
}
|
791 |
+
|
792 |
+
/* otherwise non-terminal label */
|
793 |
+
bracket[bid].start = wid;
|
794 |
+
bracket[bid].buf_start = p-buff;
|
795 |
+
strcpy(bracket[bid].label,label);
|
796 |
+
pushb(bid);
|
797 |
+
bid++;
|
798 |
+
|
799 |
+
/* close bracket */
|
800 |
+
/*---------------*/
|
801 |
+
}else if(*p==')'){
|
802 |
+
|
803 |
+
b = popb();
|
804 |
+
bracket[b].end = wid;
|
805 |
+
bracket[b].buf_end = p-buff;
|
806 |
+
p++;
|
807 |
+
|
808 |
+
/* error */
|
809 |
+
/*-------*/
|
810 |
+
}else{
|
811 |
+
|
812 |
+
Error("Reading sentence\n");
|
813 |
+
}
|
814 |
+
}
|
815 |
+
|
816 |
+
if(!stackempty()){
|
817 |
+
Error("Bracketing is unbalanced (too many open bracket)\n");
|
818 |
+
}
|
819 |
+
|
820 |
+
*wn = wid;
|
821 |
+
*bn = bid;
|
822 |
+
|
823 |
+
return(len);
|
824 |
+
}
|
825 |
+
|
826 |
+
|
827 |
+
/*----------------------*/
|
828 |
+
/* stack operation */
|
829 |
+
/* for bracketing pairs */
|
830 |
+
/*----------------------*/
|
831 |
+
void
|
832 |
+
pushb(item)
|
833 |
+
int item;
|
834 |
+
{
|
835 |
+
stack[stack_top++]=item;
|
836 |
+
}
|
837 |
+
|
838 |
+
int
|
839 |
+
popb()
|
840 |
+
{
|
841 |
+
int item;
|
842 |
+
|
843 |
+
item = stack[stack_top-1];
|
844 |
+
|
845 |
+
if(stack_top-- < 0){
|
846 |
+
Error("Bracketing unbalance (too many close bracket)\n");
|
847 |
+
}
|
848 |
+
return(item);
|
849 |
+
}
|
850 |
+
|
851 |
+
int
|
852 |
+
stackempty()
|
853 |
+
{
|
854 |
+
if(stack_top==0){
|
855 |
+
return(1);
|
856 |
+
}else{
|
857 |
+
return(0);
|
858 |
+
}
|
859 |
+
}
|
860 |
+
|
861 |
+
|
862 |
+
/*------------------*/
|
863 |
+
/* calculate result */
|
864 |
+
/*------------------*/
|
865 |
+
void
|
866 |
+
calc_result(unsigned char *buf1,unsigned char *buf)
|
867 |
+
{
|
868 |
+
int i, j, l;
|
869 |
+
int match, crossing, correct_tag;
|
870 |
+
|
871 |
+
int last_i = -1;
|
872 |
+
|
873 |
+
char my_buf[10000]; //djame: was 1000
|
874 |
+
int match_found = 0;
|
875 |
+
|
876 |
+
char match_j[2000]; //djame was : 200
|
877 |
+
for (j = 0; j < bn2; ++j) {
|
878 |
+
match_j[j] = 0;
|
879 |
+
}
|
880 |
+
|
881 |
+
/* ML */
|
882 |
+
if (DEBUG>1)
|
883 |
+
printf("\n");
|
884 |
+
|
885 |
+
|
886 |
+
/* Find skip and error */
|
887 |
+
/*---------------------*/
|
888 |
+
if(wn2==0){ // Djame: case of empty lines
|
889 |
+
if (spmrl_count_bad_sent==1){
|
890 |
+
Status = 3;
|
891 |
+
//individual_result(wn1,r_bn1,r_bn2,match,crossing,correct_tag);
|
892 |
+
int n_bracket_gold=massage_data_gold_only();
|
893 |
+
r_bn1=n_bracket_gold;
|
894 |
+
individual_result(wn1,n_bracket_gold,0,0,0,0); // testing the case of missing analysis was 0,0
|
895 |
+
}else {
|
896 |
+
Status=2;
|
897 |
+
individual_result(0,0,0,0,0,0);
|
898 |
+
}
|
899 |
+
|
900 |
+
return;
|
901 |
+
}
|
902 |
+
|
903 |
+
if(wn1 != wn2){
|
904 |
+
//if (DEBUG>1)
|
905 |
+
//Error("Length unmatch (%d|%d)\n",wn1,wn2);
|
906 |
+
fix_quote();
|
907 |
+
if(wn1 != wn2){
|
908 |
+
individual_result(0,0,0,0,0,0);
|
909 |
+
/* Slav: ignore 1 word sentences */
|
910 |
+
if (wn1 > 1) {
|
911 |
+
Error("Length unmatch (%d|%d)\n",wn1,wn2);
|
912 |
+
return;
|
913 |
+
}
|
914 |
+
}
|
915 |
+
}
|
916 |
+
|
917 |
+
for(i=0;i<wn1;i++){
|
918 |
+
if(word_comp(terminal1[i].word,terminal2[i].word)==0){
|
919 |
+
Error("Words unmatch (%s|%s)\n",terminal1[i].word,
|
920 |
+
terminal2[i].word);
|
921 |
+
individual_result(0,0,0,0,0,0);
|
922 |
+
return;
|
923 |
+
}
|
924 |
+
}
|
925 |
+
|
926 |
+
/* massage the data */
|
927 |
+
/*------------------*/
|
928 |
+
massage_data();
|
929 |
+
|
930 |
+
/* matching brackets */
|
931 |
+
/*-------------------*/
|
932 |
+
match = 0;
|
933 |
+
for(i=0;i<bn1;i++){
|
934 |
+
for(j=0;j<bn2;j++){
|
935 |
+
|
936 |
+
if (DEBUG>1)
|
937 |
+
printf("1.res=%d, 2.res=%d, 1.start=%d, 2.start=%d, 1.end=%d, 2.end=%d\n",bracket1[i].result,bracket2[j].result,bracket1[i].start,bracket2[j].start,bracket1[i].end,bracket2[j].end);
|
938 |
+
|
939 |
+
// does bracket match?
|
940 |
+
if(bracket1[i].result != 5 &&
|
941 |
+
bracket2[j].result == 0 &&
|
942 |
+
bracket1[i].start == bracket2[j].start && bracket1[i].end == bracket2[j].end) {
|
943 |
+
|
944 |
+
// (1) do we not care about the label or (2) does the label match?
|
945 |
+
if (F_label==0 || label_comp(bracket1[i].label,bracket2[j].label)==1) {
|
946 |
+
bracket1[i].result = bracket2[j].result = 1;
|
947 |
+
match++;
|
948 |
+
match_found = 1;
|
949 |
+
break;
|
950 |
+
} else {
|
951 |
+
if (DEBUG>1) {
|
952 |
+
printf(" LABEL[%d-%d]: ",bracket1[i].start,bracket1[i].end-1);
|
953 |
+
l = bracket1[i].buf_end-bracket1[i].buf_start;
|
954 |
+
strncpy(my_buf,buf1+bracket1[i].buf_start,l);
|
955 |
+
my_buf[l] = '\0';
|
956 |
+
printf("%s\n",my_buf);
|
957 |
+
}
|
958 |
+
match_found = 1;
|
959 |
+
match_j[j] = 1;
|
960 |
+
}
|
961 |
+
}
|
962 |
+
}
|
963 |
+
|
964 |
+
if (!match_found && bracket1[i].result != 5 && DEBUG>1) {
|
965 |
+
/* ### ML 09/28/03: gold bracket with no corresponding test bracket */
|
966 |
+
printf(" BRACKET[%d-%d]: ",bracket1[i].start,bracket1[i].end-1);
|
967 |
+
l = bracket1[i].buf_end-bracket1[i].buf_start;
|
968 |
+
strncpy(my_buf,buf1+bracket1[i].buf_start,l);
|
969 |
+
my_buf[l] = '\0';
|
970 |
+
printf("%s\n",my_buf);
|
971 |
+
}
|
972 |
+
match_found = 0;
|
973 |
+
}
|
974 |
+
|
975 |
+
for(j=0;j<bn2;j++){
|
976 |
+
if (bracket2[j].result==0 && !match_j[j] && DEBUG>1) {
|
977 |
+
/* test bracket with no corresponding gold bracket */
|
978 |
+
printf(" EXTRA[%d-%d]: ",bracket2[j].start,bracket2[j].end-1);
|
979 |
+
l = bracket2[j].buf_end-bracket2[j].buf_start;
|
980 |
+
strncpy(my_buf,buf+bracket2[j].buf_start,l);
|
981 |
+
my_buf[l] = '\0';
|
982 |
+
printf("%s\n",my_buf);
|
983 |
+
}
|
984 |
+
}
|
985 |
+
|
986 |
+
/* crossing */
|
987 |
+
/*----------*/
|
988 |
+
crossing = 0;
|
989 |
+
|
990 |
+
/* crossing is counted based on the brackets */
|
991 |
+
/* in test rather than gold file (by Mike) */
|
992 |
+
for(j=0;j<bn2;j++){
|
993 |
+
for(i=0;i<bn1;i++){
|
994 |
+
if(bracket1[i].result != 5 &&
|
995 |
+
bracket2[j].result != 5 &&
|
996 |
+
((bracket1[i].start < bracket2[j].start &&
|
997 |
+
bracket1[i].end > bracket2[j].start &&
|
998 |
+
bracket1[i].end < bracket2[j].end) ||
|
999 |
+
(bracket1[i].start > bracket2[j].start &&
|
1000 |
+
bracket1[i].start < bracket2[j].end &&
|
1001 |
+
bracket1[i].end > bracket2[j].end))){
|
1002 |
+
|
1003 |
+
/* ### ML 09/01/03: get details on cross-brackettings */
|
1004 |
+
if (i != last_i) {
|
1005 |
+
if (DEBUG>1) {
|
1006 |
+
printf(" CROSSING[%d-%d]: ",bracket1[i].start,bracket1[i].end-1);
|
1007 |
+
l = bracket1[i].buf_end-bracket1[i].buf_start;
|
1008 |
+
strncpy(my_buf,buf1+bracket1[i].buf_start,l);
|
1009 |
+
my_buf[l] = '\0';
|
1010 |
+
printf("%s\n",my_buf);
|
1011 |
+
|
1012 |
+
/* ML
|
1013 |
+
printf("\n CROSSING at bracket %d:\n",i-1);
|
1014 |
+
printf(" GOLD (tokens %d-%d): ",bracket1[i].start,bracket1[i].end-1);
|
1015 |
+
l = bracket1[i].buf_end-bracket1[i].buf_start;
|
1016 |
+
strncpy(my_buf,buf1+bracket1[i].buf_start,l);
|
1017 |
+
my_buf[l] = '\0';
|
1018 |
+
printf("%s\n",my_buf);
|
1019 |
+
*/
|
1020 |
+
}
|
1021 |
+
last_i = i;
|
1022 |
+
}
|
1023 |
+
|
1024 |
+
/* ML
|
1025 |
+
printf(" TEST (tokens %d-%d): ",bracket2[j].start,bracket2[j].end-1);
|
1026 |
+
l = bracket2[j].buf_end-bracket2[j].buf_start;
|
1027 |
+
strncpy(my_buf,buf+bracket2[j].buf_start,l);
|
1028 |
+
my_buf[l] = '\0';
|
1029 |
+
printf("%s\n",my_buf);
|
1030 |
+
*/
|
1031 |
+
|
1032 |
+
crossing++;
|
1033 |
+
break;
|
1034 |
+
}
|
1035 |
+
}
|
1036 |
+
}
|
1037 |
+
|
1038 |
+
/* Tagging accuracy */
|
1039 |
+
/*------------------*/
|
1040 |
+
correct_tag=0;
|
1041 |
+
for(i=0;i<wn1;i++){
|
1042 |
+
if(label_comp(terminal1[i].label,terminal2[i].label)==1){
|
1043 |
+
terminal1[i].result = terminal2[i].result = 1;
|
1044 |
+
correct_tag++;
|
1045 |
+
} else {
|
1046 |
+
terminal1[i].result = terminal2[i].result = 0;
|
1047 |
+
}
|
1048 |
+
}
|
1049 |
+
|
1050 |
+
individual_result(wn1,r_bn1,r_bn2,match,crossing,correct_tag);
|
1051 |
+
}
|
1052 |
+
|
1053 |
+
void
|
1054 |
+
fix_quote()
|
1055 |
+
{
|
1056 |
+
int i,j,k;
|
1057 |
+
if (DEBUG>1) {
|
1058 |
+
for(i=0;i<MAX_QUOTE_TERM;i++){
|
1059 |
+
if (quotterm1[i].index!=-1)
|
1060 |
+
printf("%d: %s - %s\n",quotterm1[i].index,
|
1061 |
+
quotterm1[i].term.label,
|
1062 |
+
quotterm1[i].term.word);
|
1063 |
+
if (quotterm2[i].index!=-1)
|
1064 |
+
printf("%d: %s - %s\n",quotterm2[i].index,
|
1065 |
+
quotterm2[i].term.label,
|
1066 |
+
quotterm2[i].term.word);
|
1067 |
+
}
|
1068 |
+
}
|
1069 |
+
for(i=0;i<MAX_QUOTE_TERM;i++) {
|
1070 |
+
int ind = quotterm2[i].index;
|
1071 |
+
if (ind!=-1) {
|
1072 |
+
for(j=0;j<MAX_QUOTE_TERM;j++){
|
1073 |
+
if (quotterm1[j].index==ind &&
|
1074 |
+
strcmp(quotterm1[j].term.label,
|
1075 |
+
quotterm2[i].term.label)!=0) {
|
1076 |
+
if (is_deletelabel(quotterm1[j].term.label) && !is_deletelabel(quotterm2[i].term.label)) {
|
1077 |
+
reinsert_term("term1[j],terminal1,bracket1,&wn1);
|
1078 |
+
for (k=j;k<MAX_QUOTE_TERM;k++)
|
1079 |
+
if (quotterm1[k].index!=-1)
|
1080 |
+
quotterm1[k].index++;
|
1081 |
+
} else if (is_deletelabel(quotterm2[i].term.label) && !is_deletelabel(quotterm1[j].term.label)) {
|
1082 |
+
reinsert_term("term2[i],terminal2,bracket2,&wn2);
|
1083 |
+
for (k=i;k<MAX_QUOTE_TERM;k++)
|
1084 |
+
if (quotterm2[k].index!=-1)
|
1085 |
+
quotterm2[k].index++;
|
1086 |
+
}
|
1087 |
+
}
|
1088 |
+
}
|
1089 |
+
} else break;
|
1090 |
+
}
|
1091 |
+
}
|
1092 |
+
|
1093 |
+
void
|
1094 |
+
reinsert_term(quot,terminal,bracket,wn)
|
1095 |
+
s_term_ind* quot;
|
1096 |
+
s_terminal terminal[];
|
1097 |
+
s_bracket bracket[];
|
1098 |
+
int* wn;
|
1099 |
+
{
|
1100 |
+
int ind = quot->index;
|
1101 |
+
int bra = quot->bracket;
|
1102 |
+
s_terminal* term = "->term;
|
1103 |
+
int k;
|
1104 |
+
memmove(&terminal[ind+1],
|
1105 |
+
&terminal[ind],
|
1106 |
+
sizeof(s_terminal)*(MAX_WORD_IN_SENT-ind-1));
|
1107 |
+
strcpy(terminal[ind].label,term->label);
|
1108 |
+
strcpy(terminal[ind].word,term->word);
|
1109 |
+
(*wn)++;
|
1110 |
+
if (DEBUG>1)
|
1111 |
+
printf("bra=%d, ind=%d\n",bra,ind);
|
1112 |
+
for(k=0;k<MAX_BRACKET_IN_SENT;k++) {
|
1113 |
+
if (bracket[k].start==-1)
|
1114 |
+
break;
|
1115 |
+
if (DEBUG>1)
|
1116 |
+
printf("bracket[%d]={%d,%d}\n",k,bracket[k].start,bracket[k].end);
|
1117 |
+
if (k>=bra) {
|
1118 |
+
bracket[k].start++;
|
1119 |
+
bracket[k].end++;
|
1120 |
+
}
|
1121 |
+
//if (bracket[k].start<=ind && bracket[k].end>=ind)
|
1122 |
+
//bracket[k].end++;
|
1123 |
+
}
|
1124 |
+
if (DEBUG>1)
|
1125 |
+
printf("endslen=%d\n",quot->endslen);
|
1126 |
+
for(k=0;k<quot->endslen;k++) {
|
1127 |
+
//printf("ends[%d]=%d",k,quot->ends[k]);
|
1128 |
+
bracket[quot->ends[k]].end++;
|
1129 |
+
}
|
1130 |
+
//free(quot->ends);
|
1131 |
+
}
|
1132 |
+
/*
|
1133 |
+
void
|
1134 |
+
adjust_end(ind,bra)
|
1135 |
+
int ind;
|
1136 |
+
int bra;
|
1137 |
+
{
|
1138 |
+
for(k=0;k<MAX_BRACKET_IN_SENT;k++) {
|
1139 |
+
if (bracket[k].start==-1)
|
1140 |
+
break;
|
1141 |
+
printf("bracket[%d]={%d,%d}\n",k,bracket[k].start,bracket[k].end);
|
1142 |
+
if (k>=bra)
|
1143 |
+
bracket[k].end++;
|
1144 |
+
}
|
1145 |
+
}
|
1146 |
+
*/
|
1147 |
+
|
1148 |
+
|
1149 |
+
|
1150 |
+
int massage_data_gold_only(){
|
1151 |
+
int i, j;
|
1152 |
+
int gold_valid_bracket=0;
|
1153 |
+
char buflabel[MAX_LABEL_LEN]; // djame
|
1154 |
+
/* for GOLD */
|
1155 |
+
/*----------*/
|
1156 |
+
for(i=0;i<bn1;i++){
|
1157 |
+
|
1158 |
+
bracket1[i].result = 0;
|
1159 |
+
|
1160 |
+
/* Zero element */
|
1161 |
+
if(bracket1[i].start == bracket1[i].end){
|
1162 |
+
//bracket1[i].result = bracket1[i].result; // was 5
|
1163 |
+
continue;
|
1164 |
+
}else {
|
1165 |
+
gold_valid_bracket++;
|
1166 |
+
}
|
1167 |
+
|
1168 |
+
|
1169 |
+
/* Modify label */
|
1170 |
+
strcpy(buflabel,bracket1[i].label); //djame
|
1171 |
+
modify_label(buflabel); // Djamé will be called twice
|
1172 |
+
|
1173 |
+
/* Delete label */
|
1174 |
+
for(j=0;j<Delete_label_n;j++){
|
1175 |
+
if(label_comp(buflabel,Delete_label[j])!=1){
|
1176 |
+
gold_valid_bracket++;
|
1177 |
+
}
|
1178 |
+
}
|
1179 |
+
}
|
1180 |
+
|
1181 |
+
return gold_valid_bracket;
|
1182 |
+
}
|
1183 |
+
|
1184 |
+
|
1185 |
+
|
1186 |
+
|
1187 |
+
|
1188 |
+
void
|
1189 |
+
massage_data()
|
1190 |
+
{
|
1191 |
+
int i, j;
|
1192 |
+
|
1193 |
+
/* for GOLD */
|
1194 |
+
/*----------*/
|
1195 |
+
for(i=0;i<bn1;i++){
|
1196 |
+
|
1197 |
+
bracket1[i].result = 0;
|
1198 |
+
|
1199 |
+
/* Zero element */
|
1200 |
+
if(bracket1[i].start == bracket1[i].end){
|
1201 |
+
bracket1[i].result = 5;
|
1202 |
+
continue;
|
1203 |
+
}
|
1204 |
+
|
1205 |
+
/* Modify label */
|
1206 |
+
modify_label(bracket1[i].label);
|
1207 |
+
|
1208 |
+
/* Delete label */
|
1209 |
+
for(j=0;j<Delete_label_n;j++){
|
1210 |
+
if(label_comp(bracket1[i].label,Delete_label[j])==1){
|
1211 |
+
bracket1[i].result = 5;
|
1212 |
+
}
|
1213 |
+
}
|
1214 |
+
}
|
1215 |
+
|
1216 |
+
/* for TEST */
|
1217 |
+
/*----------*/
|
1218 |
+
for(i=0;i<bn2;i++){
|
1219 |
+
|
1220 |
+
bracket2[i].result = 0;
|
1221 |
+
|
1222 |
+
/* Zero element */
|
1223 |
+
if(bracket2[i].start == bracket2[i].end){
|
1224 |
+
bracket2[i].result = 5;
|
1225 |
+
continue;
|
1226 |
+
}
|
1227 |
+
|
1228 |
+
/* Modify label */
|
1229 |
+
modify_label(bracket2[i].label);
|
1230 |
+
|
1231 |
+
/* Delete label */
|
1232 |
+
for(j=0;j<Delete_label_n;j++){
|
1233 |
+
if(label_comp(bracket2[i].label,Delete_label[j])==1){
|
1234 |
+
bracket2[i].result = 5;
|
1235 |
+
}
|
1236 |
+
}
|
1237 |
+
}
|
1238 |
+
|
1239 |
+
|
1240 |
+
/* count up real number of brackets (exclude deleted ones) */
|
1241 |
+
/*---------------------------------------------------------*/
|
1242 |
+
r_bn1 = r_bn2 = 0;
|
1243 |
+
|
1244 |
+
for(i=0;i<bn1;i++){
|
1245 |
+
if(bracket1[i].result != 5){
|
1246 |
+
r_bn1++;
|
1247 |
+
}
|
1248 |
+
}
|
1249 |
+
|
1250 |
+
for(i=0;i<bn2;i++){
|
1251 |
+
if(bracket2[i].result != 5){
|
1252 |
+
r_bn2++;
|
1253 |
+
}
|
1254 |
+
}
|
1255 |
+
}
|
1256 |
+
|
1257 |
+
|
1258 |
+
/*------------------------*/
|
1259 |
+
/* trim the tail of label */
|
1260 |
+
/*------------------------*/
|
1261 |
+
/*
 * Cut a label at the first '-', '=' or '#' (function tags, indices and
 * morphological features).  The string is modified in place; a label
 * containing none of these characters is left untouched.
 */
void
modify_label(label)
char *label;
{
    char *cut = label;

    /* advance to the first separator or to the end of the string */
    while(*cut!='\0' && *cut!='-' && *cut!='=' && *cut!='#'){
        cut++;
    }

    /* write only when a separator was actually found */
    if(*cut!='\0'){
        *cut='\0';
    }
}
|
1274 |
+
|
1275 |
+
|
1276 |
+
/*-----------------------------------------------*/
|
1277 |
+
/* add individual statistics to TOTAL statistics */
|
1278 |
+
/*-----------------------------------------------*/
|
1279 |
+
void
|
1280 |
+
individual_result(wn1,bn1,bn2,match,crossing,correct_tag)
|
1281 |
+
int wn1,bn1,bn2,match,crossing,correct_tag;
|
1282 |
+
{
|
1283 |
+
|
1284 |
+
/* Statistics for ALL */
|
1285 |
+
/*--------------------*/
|
1286 |
+
TOTAL_sent++;
|
1287 |
+
if(Status==1){
|
1288 |
+
TOTAL_error_sent++;
|
1289 |
+
}else if(Status==2){
|
1290 |
+
TOTAL_skip_sent++;
|
1291 |
+
}else{
|
1292 |
+
TOTAL_bn1 += bn1;
|
1293 |
+
TOTAL_bn2 += bn2;
|
1294 |
+
TOTAL_match += match;
|
1295 |
+
if(bn1==bn2 && bn2==match){
|
1296 |
+
TOTAL_comp_sent++;
|
1297 |
+
}
|
1298 |
+
TOTAL_word += wn1;
|
1299 |
+
TOTAL_crossing += crossing;
|
1300 |
+
if(crossing==0){
|
1301 |
+
TOTAL_no_crossing++;
|
1302 |
+
}
|
1303 |
+
if(crossing <= 2){
|
1304 |
+
TOTAL_2L_crossing++;
|
1305 |
+
}
|
1306 |
+
TOTAL_correct_tag += correct_tag;
|
1307 |
+
}
|
1308 |
+
|
1309 |
+
|
1310 |
+
/* Statistics for sent length <= TOT_cut_len */
|
1311 |
+
/*-------------------------------------------*/
|
1312 |
+
//fprintf(stderr,"cut-off %d\n",TOT_cut_len);
|
1313 |
+
//exit(0);
|
1314 |
+
if(r_wn1<=TOT_cut_len){
|
1315 |
+
TOT40_sent++;
|
1316 |
+
if(Status==1){
|
1317 |
+
TOT40_error_sent++;
|
1318 |
+
}else if(Status==2){
|
1319 |
+
TOT40_skip_sent++;
|
1320 |
+
}else{
|
1321 |
+
TOT40_bn1 += bn1;
|
1322 |
+
TOT40_bn2 += bn2;
|
1323 |
+
TOT40_match += match;
|
1324 |
+
if(bn1==bn2 && bn2==match){
|
1325 |
+
TOT40_comp_sent++;
|
1326 |
+
}
|
1327 |
+
TOT40_word += wn1;
|
1328 |
+
TOT40_crossing += crossing;
|
1329 |
+
if(crossing==0){
|
1330 |
+
TOT40_no_crossing++;
|
1331 |
+
}
|
1332 |
+
if(crossing <= 2){
|
1333 |
+
TOT40_2L_crossing++;
|
1334 |
+
}
|
1335 |
+
TOT40_correct_tag += correct_tag;
|
1336 |
+
}
|
1337 |
+
}
|
1338 |
+
|
1339 |
+
/* Print individual result */
|
1340 |
+
/*-------------------------*/
|
1341 |
+
printf("%4d %3d %d ",Line,r_wn1,Status);
|
1342 |
+
printf("%6.2f %6.2f %3d %3d %3d %3d",
|
1343 |
+
(r_bn1==0?0.0:100.0*match/r_bn1),
|
1344 |
+
(r_bn2==0?0.0:100.0*match/r_bn2),
|
1345 |
+
match, r_bn1, r_bn2, crossing);
|
1346 |
+
|
1347 |
+
printf(" %4d %4d %6.2f\n",wn1,correct_tag,
|
1348 |
+
(wn1==0?0.0:100.0*correct_tag/wn1));
|
1349 |
+
}
|
1350 |
+
|
1351 |
+
|
1352 |
+
/*------------------------*/
|
1353 |
+
/* print total statistics */
|
1354 |
+
/*------------------------*/
|
1355 |
+
void
|
1356 |
+
print_total()
|
1357 |
+
{
|
1358 |
+
int sentn;
|
1359 |
+
double r,p,f;
|
1360 |
+
FILE *file;
|
1361 |
+
|
1362 |
+
|
1363 |
+
|
1364 |
+
r = TOTAL_bn1>0 ? 100.0*TOTAL_match/TOTAL_bn1 : 0.0;
|
1365 |
+
p = TOTAL_bn2>0 ? 100.0*TOTAL_match/TOTAL_bn2 : 0.0;
|
1366 |
+
f = 2*p*r/(p+r);
|
1367 |
+
|
1368 |
+
if (spmrl_compact_view == 0){
|
1369 |
+
|
1370 |
+
printf("============================================================================\n");
|
1371 |
+
|
1372 |
+
if(TOTAL_bn1>0 && TOTAL_bn2>0){
|
1373 |
+
printf(" %6.2f %6.2f %6d %5d %5d %5d",
|
1374 |
+
(TOTAL_bn1>0?100.0*TOTAL_match/TOTAL_bn1:0.0),
|
1375 |
+
(TOTAL_bn2>0?100.0*TOTAL_match/TOTAL_bn2:0.0),
|
1376 |
+
TOTAL_match,
|
1377 |
+
TOTAL_bn1,
|
1378 |
+
TOTAL_bn2,
|
1379 |
+
TOTAL_crossing);
|
1380 |
+
}
|
1381 |
+
|
1382 |
+
printf(" %5d %5d %6.2f",
|
1383 |
+
TOTAL_word,
|
1384 |
+
TOTAL_correct_tag,
|
1385 |
+
(TOTAL_word>0?100.0*TOTAL_correct_tag/TOTAL_word:0.0));
|
1386 |
+
|
1387 |
+
printf("\n");
|
1388 |
+
if (spmrl_print_filename==0){
|
1389 |
+
printf("=== Summary ===\n");
|
1390 |
+
}else {
|
1391 |
+
printf("=== Summary: %s\tvs\t%s ===\n",filename1,filename2);
|
1392 |
+
}
|
1393 |
+
|
1394 |
+
|
1395 |
+
sentn = TOTAL_sent - TOTAL_error_sent - TOTAL_skip_sent;
|
1396 |
+
|
1397 |
+
printf("\n-- All --\n");
|
1398 |
+
printf("Number of sentence = %6d\n",TOTAL_sent);
|
1399 |
+
printf("Number of Error sentence = %6d\n",TOTAL_error_sent);
|
1400 |
+
printf("Number of Skip sentence = %6d\n",TOTAL_skip_sent);
|
1401 |
+
printf("Number of Valid sentence = %6d\n",sentn);
|
1402 |
+
|
1403 |
+
//r = TOTAL_bn1>0 ? 100.0*TOTAL_match/TOTAL_bn1 : 0.0;
|
1404 |
+
printf("Bracketing Recall = %6.2f\n",r);
|
1405 |
+
|
1406 |
+
// p = TOTAL_bn2>0 ? 100.0*TOTAL_match/TOTAL_bn2 : 0.0;
|
1407 |
+
printf("Bracketing Precision = %6.2f\n",p);
|
1408 |
+
|
1409 |
+
// f = 2*p*r/(p+r);
|
1410 |
+
printf("Bracketing FMeasure = %6.2f\n",f);
|
1411 |
+
|
1412 |
+
printf("Complete match = %6.2f\n",
|
1413 |
+
(sentn>0?100.0*TOTAL_comp_sent/sentn:0.0));
|
1414 |
+
printf("Average crossing = %6.2f\n",
|
1415 |
+
(sentn>0?1.0*TOTAL_crossing/sentn:0.0));
|
1416 |
+
printf("No crossing = %6.2f\n",
|
1417 |
+
(sentn>0?100.0*TOTAL_no_crossing/sentn:0.0));
|
1418 |
+
printf("2 or less crossing = %6.2f\n",
|
1419 |
+
(sentn>0?100.0*TOTAL_2L_crossing/sentn:0.0));
|
1420 |
+
printf("Tagging accuracy = %6.2f\n",
|
1421 |
+
(TOTAL_word>0?100.0*TOTAL_correct_tag/TOTAL_word:0.0));
|
1422 |
+
|
1423 |
+
// Write stats also to a file.
|
1424 |
+
file = fopen("status", "w");
|
1425 |
+
fprintf(file, "---\n");
|
1426 |
+
fprintf(file, "F1: %.2f\n", f);
|
1427 |
+
fprintf(file, "LP: %.2f\n", p);
|
1428 |
+
fprintf(file, "LR: %.2f\n", r);
|
1429 |
+
fprintf(file, "POS: %.2f\n",
|
1430 |
+
(TOTAL_word>0?100.0*TOTAL_correct_tag/TOTAL_word:0.0));
|
1431 |
+
fprintf(file, "errorRate: %.2f\n", 100-f);
|
1432 |
+
fclose(file);
|
1433 |
+
|
1434 |
+
sentn = TOT40_sent - TOT40_error_sent - TOT40_skip_sent;
|
1435 |
+
|
1436 |
+
printf("\n-- len<=%d --\n",TOT_cut_len);
|
1437 |
+
printf("Number of sentence = %6d\n",TOT40_sent);
|
1438 |
+
printf("Number of Error sentence = %6d\n",TOT40_error_sent);
|
1439 |
+
printf("Number of Skip sentence = %6d\n",TOT40_skip_sent);
|
1440 |
+
printf("Number of Valid sentence = %6d\n",sentn);
|
1441 |
+
|
1442 |
+
|
1443 |
+
r = TOT40_bn1>0 ? 100.0*TOT40_match/TOT40_bn1 : 0.0;
|
1444 |
+
printf("Bracketing Recall = %6.2f\n",r);
|
1445 |
+
|
1446 |
+
p = TOT40_bn2>0 ? 100.0*TOT40_match/TOT40_bn2 : 0.0;
|
1447 |
+
printf("Bracketing Precision = %6.2f\n",p);
|
1448 |
+
|
1449 |
+
f = 2*p*r/(p+r);
|
1450 |
+
printf("Bracketing FMeasure = %6.2f\n",f);
|
1451 |
+
|
1452 |
+
printf("Complete match = %6.2f\n",
|
1453 |
+
(sentn>0?100.0*TOT40_comp_sent/sentn:0.0));
|
1454 |
+
printf("Average crossing = %6.2f\n",
|
1455 |
+
(sentn>0?1.0*TOT40_crossing/sentn:0.0));
|
1456 |
+
printf("No crossing = %6.2f\n",
|
1457 |
+
(sentn>0?100.0*TOT40_no_crossing/sentn:0.0));
|
1458 |
+
printf("2 or less crossing = %6.2f\n",
|
1459 |
+
(sentn>0?100.0*TOT40_2L_crossing/sentn:0.0));
|
1460 |
+
printf("Tagging accuracy = %6.2f\n",
|
1461 |
+
(TOT40_word>0?100.0*TOT40_correct_tag/TOT40_word:0.0));
|
1462 |
+
}else { // else spmrl_compact_view
|
1463 |
+
if (spmrl_compact_view40 ==0){
|
1464 |
+
double pos=(TOTAL_word>0?100.0*TOTAL_correct_tag/TOTAL_word:0.0);
|
1465 |
+
sentn = TOTAL_sent - TOTAL_error_sent - TOTAL_skip_sent;
|
1466 |
+
|
1467 |
+
double EX=(sentn>0?100.0*TOTAL_comp_sent/sentn:0.0);
|
1468 |
+
|
1469 |
+
printf("F1: %6.2f %%\tPrec: %6.2f %%\tRec: %6.2f %%\t",f,r,p);
|
1470 |
+
printf("POS: %6.2f %%\tEX: %6.2f %%\tUnparsed: %6d\tSent: %6d\tfile: %s\n",pos,EX,TOTAL_skip_sent+TOTAL_error_sent,TOTAL_sent,filename2);// ICI
|
1471 |
+
}else {
|
1472 |
+
|
1473 |
+
r = TOT40_bn1>0 ? 100.0*TOT40_match/TOT40_bn1 : 0.0;
|
1474 |
+
p = TOT40_bn2>0 ? 100.0*TOT40_match/TOT40_bn2 : 0.0;
|
1475 |
+
f = 2*p*r/(p+r);
|
1476 |
+
double pos=(TOT40_word>0?100.0*TOT40_correct_tag/TOT40_word:0.0);
|
1477 |
+
sentn = TOT40_sent - TOT40_error_sent - TOT40_skip_sent;
|
1478 |
+
double EX=(sentn>0?100.0*TOT40_comp_sent/sentn:0.0);
|
1479 |
+
|
1480 |
+
printf("F1: %6.2f %%\tPrec: %6.2f %%\tRec: %6.2f %%\t",f,r,p);
|
1481 |
+
printf("POS: %6.2f %%\tEX: %6.2f %%\tUnparsed: %6d\tSent: %6d\tfile: %s\n",pos,EX,TOT40_skip_sent+TOT40_error_sent,TOT40_sent,filename2);// ICI<#statements#>
|
1482 |
+
}
|
1483 |
+
|
1484 |
+
}
|
1485 |
+
|
1486 |
+
}
|
1487 |
+
|
1488 |
+
|
1489 |
+
/*--------------------------------*/
|
1490 |
+
/* display individual information */
|
1491 |
+
/*--------------------------------*/
|
1492 |
+
void
|
1493 |
+
dsp_info()
|
1494 |
+
{
|
1495 |
+
int i, n;
|
1496 |
+
|
1497 |
+
printf("-<1>---(wn1=%3d, bn1=%3d)- ",wn1,bn1);
|
1498 |
+
printf("-<2>---(wn2=%3d, bn2=%3d)-\n",wn2,bn2);
|
1499 |
+
|
1500 |
+
n = (wn1>wn2?wn1:wn2);
|
1501 |
+
|
1502 |
+
for(i=0;i<n;i++){
|
1503 |
+
if(terminal1[i].word[0]!='\0'){
|
1504 |
+
printf("%3d : %d : %-6s %-16s ",i,terminal1[i].result,
|
1505 |
+
terminal1[i].label,terminal1[i].word);
|
1506 |
+
}else{
|
1507 |
+
printf(" ");
|
1508 |
+
}
|
1509 |
+
|
1510 |
+
if(terminal2[i].word[0]!='\0'){
|
1511 |
+
printf("%3d : %d : %-6s %-16s\n",i,terminal2[i].result,
|
1512 |
+
terminal2[i].label,terminal2[i].word);
|
1513 |
+
}else{
|
1514 |
+
printf("\n");
|
1515 |
+
}
|
1516 |
+
}
|
1517 |
+
printf("\n");
|
1518 |
+
|
1519 |
+
n = (bn1>bn2?bn1:bn2);
|
1520 |
+
|
1521 |
+
for(i=0;i<n;i++){
|
1522 |
+
if(bracket1[i].start != -1){
|
1523 |
+
printf("%3d : %d : %3d %3d %-6s ",i,bracket1[i].result,
|
1524 |
+
bracket1[i].start,bracket1[i].end,
|
1525 |
+
bracket1[i].label);
|
1526 |
+
} else {
|
1527 |
+
printf(" ");
|
1528 |
+
}
|
1529 |
+
|
1530 |
+
if(bracket2[i].start != -1){
|
1531 |
+
printf("%3d : %d : %3d %3d %-6s\n",i,bracket2[i].result,
|
1532 |
+
bracket2[i].start,bracket2[i].end,
|
1533 |
+
bracket2[i].label);
|
1534 |
+
} else {
|
1535 |
+
printf("\n");
|
1536 |
+
}
|
1537 |
+
}
|
1538 |
+
printf("\n");
|
1539 |
+
|
1540 |
+
printf("========\n");
|
1541 |
+
|
1542 |
+
}
|
1543 |
+
|
1544 |
+
|
1545 |
+
/*-----------------*/
|
1546 |
+
/* some predicates */
|
1547 |
+
/*-----------------*/
|
1548 |
+
|
1549 |
+
|
1550 |
+
// Djamé: reimplementing isspace (while digging bug in spmrl 2013 arabic gold dev line 616)
|
1551 |
+
/*
 * Locale-independent whitespace test.
 * Returns 1 for space, tab, carriage return, newline, vertical tab and
 * form feed; 0 for every other character.
 */
int my_isspace(char c){
    switch(c){
    case ' ':
    case '\t':
    case '\r':
    case '\n':
    case '\v':
    case '\f':
        return 1;
    default:
        return 0;
    }
}
|
1556 |
+
|
1557 |
+
|
1558 |
+
|
1559 |
+
|
1560 |
+
/*
 * Tell whether a character ends a token.
 * Returns 1 for whitespace, '(' and ')'; 0 otherwise.
 *
 * The argument is converted to unsigned char before isspace() is called:
 * passing a negative char value (any byte >= 0x80 where char is signed)
 * to the <ctype.h> functions is undefined behaviour.
 */
int
is_terminator(c)
char c;
{
    if(isspace((unsigned char)c) || c=='(' || c==')'){
        return(1);
    }else{
        return(0);
    }
}
|
1570 |
+
|
1571 |
+
int
|
1572 |
+
is_deletelabel(s)
|
1573 |
+
char *s;
|
1574 |
+
{
|
1575 |
+
int i;
|
1576 |
+
|
1577 |
+
for(i=0;i<Delete_label_n;i++){
|
1578 |
+
if(strcmp(s,Delete_label[i])==0){
|
1579 |
+
return(1);
|
1580 |
+
}
|
1581 |
+
}
|
1582 |
+
|
1583 |
+
return(0);
|
1584 |
+
}
|
1585 |
+
|
1586 |
+
int
|
1587 |
+
is_deletelabel_for_length(s)
|
1588 |
+
char *s;
|
1589 |
+
{
|
1590 |
+
int i;
|
1591 |
+
|
1592 |
+
for(i=0;i<Delete_label_for_length_n;i++){
|
1593 |
+
if(strcmp(s,Delete_label_for_length[i])==0){
|
1594 |
+
return(1);
|
1595 |
+
}
|
1596 |
+
}
|
1597 |
+
|
1598 |
+
return(0);
|
1599 |
+
}
|
1600 |
+
|
1601 |
+
int
|
1602 |
+
is_quote_term(s,w)
|
1603 |
+
char *s;
|
1604 |
+
char *w;
|
1605 |
+
{
|
1606 |
+
int i;
|
1607 |
+
|
1608 |
+
for(i=0;i<Quote_term_n;i++){
|
1609 |
+
if(strcmp(s,Quote_term[i])==0){
|
1610 |
+
// Djame : Arabic word contain quote
|
1611 |
+
if (strcmp(w,"'")==0 || strcmp(w,"\"")==0 || strcmp(w,"/")==0)
|
1612 |
+
//if (strcmp(w,"\"")==0 || strcmp(w,"/")==0)
|
1613 |
+
return(1);
|
1614 |
+
}
|
1615 |
+
}
|
1616 |
+
|
1617 |
+
return(0);
|
1618 |
+
}
|
1619 |
+
|
1620 |
+
|
1621 |
+
/*---------------*/
|
1622 |
+
/* compare words */
|
1623 |
+
/*---------------*/
|
1624 |
+
int
|
1625 |
+
word_comp(s1,s2)
|
1626 |
+
char *s1,*s2;
|
1627 |
+
{
|
1628 |
+
int i;
|
1629 |
+
|
1630 |
+
if(strcmp(s1,s2)==0){
|
1631 |
+
return(1);
|
1632 |
+
}
|
1633 |
+
|
1634 |
+
for(i=0;i<EQ_word_n;i++){
|
1635 |
+
if((strcmp(s1,EQ_word[i].s1)==0 &&
|
1636 |
+
strcmp(s2,EQ_word[i].s2)==0) ||
|
1637 |
+
(strcmp(s1,EQ_word[i].s2)==0 &&
|
1638 |
+
strcmp(s2,EQ_word[i].s1)==0)){
|
1639 |
+
return(1);
|
1640 |
+
}
|
1641 |
+
}
|
1642 |
+
|
1643 |
+
return(0);
|
1644 |
+
}
|
1645 |
+
|
1646 |
+
/*----------------*/
|
1647 |
+
/* compare labels */
|
1648 |
+
/*----------------*/
|
1649 |
+
int
|
1650 |
+
label_comp(s1,s2)
|
1651 |
+
char *s1,*s2;
|
1652 |
+
{
|
1653 |
+
int i;
|
1654 |
+
// Added by djame for spmrl 2013 so pos tag got filtered too
|
1655 |
+
|
1656 |
+
modify_label(s1); // djame
|
1657 |
+
modify_label(s2); // djame
|
1658 |
+
if(strcmp(s1,s2)==0){
|
1659 |
+
return(1);
|
1660 |
+
}
|
1661 |
+
|
1662 |
+
for(i=0;i<EQ_label_n;i++){
|
1663 |
+
if((strcmp(s1,EQ_label[i].s1)==0 &&
|
1664 |
+
strcmp(s2,EQ_label[i].s2)==0) ||
|
1665 |
+
(strcmp(s1,EQ_label[i].s2)==0 &&
|
1666 |
+
strcmp(s2,EQ_label[i].s1)==0)){
|
1667 |
+
return(1);
|
1668 |
+
}
|
1669 |
+
}
|
1670 |
+
|
1671 |
+
return(0);
|
1672 |
+
}
|
1673 |
+
|
1674 |
+
|
1675 |
+
/*--------*/
|
1676 |
+
/* errors */
|
1677 |
+
/*--------*/
|
1678 |
+
void
|
1679 |
+
Error(s,arg1,arg2,arg3)
|
1680 |
+
char *s, *arg1, *arg2, *arg3;
|
1681 |
+
{
|
1682 |
+
Status = 1;
|
1683 |
+
fprintf(stderr,"%d : ",Line);
|
1684 |
+
fprintf(stderr,s,arg1,arg2,arg3);
|
1685 |
+
if(Error_count++>Max_error){
|
1686 |
+
exit(1);
|
1687 |
+
}
|
1688 |
+
}
|
1689 |
+
|
1690 |
+
|
1691 |
+
/*---------------------*/
|
1692 |
+
/* fatal error to exit */
|
1693 |
+
/*---------------------*/
|
1694 |
+
/*
 * Report an unrecoverable error and stop.
 * Prints the message built from the printf-style format s and up to
 * three string arguments to stderr, then exits with code 1.
 */
void
Fatal(s,arg1,arg2,arg3)
char *s, *arg1, *arg2, *arg3;
{
    FILE *out = stderr;

    fprintf(out,s,arg1,arg2,arg3);
    exit(1);
}
|
1701 |
+
|
1702 |
+
|
1703 |
+
/*-------*/
|
1704 |
+
/* Usage */
|
1705 |
+
/*-------*/
|
1706 |
+
/*
 * Print the command-line help text to stderr.
 */
void
Usage()
{
    /* one entry per output line; none contains a '%' conversion */
    static const char *const help_text[] = {
        " evalb [-dDh][-c n][-e n][-p param_file] gold-file test-file \n",
        " \n",
        " Evaluate bracketing in test-file against gold-file. \n",
        " Return recall, precision, F-Measure, tag accuracy. \n",
        " \n",
        " <option> \n",
        " -d debug mode \n",
        " -D debug mode plus bracketing info \n",
        " -c n cut-off length forstatistics (def.=40)\n",
        " -e n number of error to kill (default=10) \n",
        " -p param_file parameter file \n",
        " -K n Evaluate up to n sentences \n",
        " -X Count skipped sentences brackets as not parsed \n",
        " -L Compact view (for use in batch mode, all sentences \n",
        " -h help \n"
    };
    unsigned int line;

    for(line=0;line<sizeof(help_text)/sizeof(help_text[0]);line++){
        fputs(help_text[line],stderr);
    }
}
|
parsing/EVALB_SPMRL/spmrl.prm
ADDED
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
##------------------------------------------##
|
2 |
+
## Debug mode ##
|
3 |
+
## 0: No debugging ##
|
4 |
+
## 1: print data for individual sentence ##
|
5 |
+
## 2: print detailed bracketing info ##
|
6 |
+
##------------------------------------------##
|
7 |
+
DEBUG 0
|
8 |
+
|
9 |
+
##------------------------------------------##
|
10 |
+
## MAX error ##
|
11 |
+
## Number of error to stop the process. ##
|
12 |
+
## This is useful if there could be ##
|
13 |
+
## tokenization error. ##
|
14 |
+
## The process will stop when this number##
|
15 |
+
## of errors are accumulated. ##
|
16 |
+
##------------------------------------------##
|
17 |
+
MAX_ERROR 10000
|
18 |
+
|
19 |
+
##------------------------------------------##
|
20 |
+
## Cut-off length for statistics ##
|
21 |
+
## At the end of evaluation, the ##
|
22 |
+
## statistics for the sentences of length##
|
23 |
+
## less than or equal to this number will##
|
24 |
+
## be shown, on top of the statistics ##
|
25 |
+
## for all the sentences ##
|
26 |
+
##------------------------------------------##
|
27 |
+
CUTOFF_LEN 70
|
28 |
+
|
29 |
+
##------------------------------------------##
|
30 |
+
## unlabeled or labeled bracketing ##
|
31 |
+
## 0: unlabeled bracketing ##
|
32 |
+
## 1: labeled bracketing ##
|
33 |
+
##------------------------------------------##
|
34 |
+
LABELED 1
|
35 |
+
|
36 |
+
##------------------------------------------##
|
37 |
+
## Delete labels ##
|
38 |
+
## list of labels to be ignored. ##
|
39 |
+
## If it is a pre-terminal label, delete ##
|
40 |
+
## the word along with the brackets. ##
|
41 |
+
## If it is a non-terminal label, just ##
|
42 |
+
## delete the brackets (don't delete ##
|
43 |
+
## children). ##
|
44 |
+
##------------------------------------------##
|
45 |
+
DELETE_LABEL TOP
|
46 |
+
DELETE_LABEL ROOT
|
47 |
+
DELETE_LABEL S1
|
48 |
+
DELETE_LABEL -NONE-
|
49 |
+
DELETE_LABEL VROOT
|
50 |
+
|
51 |
+
#DELETE_LABEL ,
|
52 |
+
#DELETE_LABEL :
|
53 |
+
#DELETE_LABEL ``
|
54 |
+
#DELETE_LABEL ''
|
55 |
+
#DELETE_LABEL .
|
56 |
+
#DELETE_LABEL ?
|
57 |
+
#DELETE_LABEL !
|
58 |
+
#DELETE_LABEL PONCT
|
59 |
+
|
60 |
+
##------------------------------------------##
|
61 |
+
## Delete labels for length calculation ##
|
62 |
+
## list of labels to be ignored for ##
|
63 |
+
## length calculation purpose ##
|
64 |
+
##------------------------------------------##
|
65 |
+
DELETE_LABEL_FOR_LENGTH -NONE-
|
66 |
+
|
67 |
+
##------------------------------------------##
|
68 |
+
## Labels to be considered for misquote ##
|
69 |
+
## (could be possessive or quote) ##
|
70 |
+
##------------------------------------------##
|
71 |
+
#QUOTE_LABEL ``
|
72 |
+
#QUOTE_LABEL ''
|
73 |
+
#QUOTE_LABEL POS
|
74 |
+
|
75 |
+
##------------------------------------------##
|
76 |
+
## These ones are less common, but ##
|
77 |
+
## are on occasion output by parsers: ##
|
78 |
+
##------------------------------------------##
|
79 |
+
#QUOTE_LABEL NN
|
80 |
+
#QUOTE_LABEL CD
|
81 |
+
#QUOTE_LABEL VBZ
|
82 |
+
#QUOTE_LABEL :
|
83 |
+
|
84 |
+
##------------------------------------------##
|
85 |
+
## Equivalent labels, words ##
|
86 |
+
## the pairs are considered equivalent ##
|
87 |
+
## This is non-directional. ##
|
88 |
+
##------------------------------------------##
|
89 |
+
#EQ_LABEL ADVP PRT
|
90 |
+
|
91 |
+
# EQ_WORD Example example
|
parsing/EVALB_SPMRL/spmrl_hebrew.prm
ADDED
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
##------------------------------------------##
|
2 |
+
## Debug mode ##
|
3 |
+
## 0: No debugging ##
|
4 |
+
## 1: print data for individual sentence ##
|
5 |
+
## 2: print detailed bracketing info ##
|
6 |
+
##------------------------------------------##
|
7 |
+
DEBUG 0
|
8 |
+
|
9 |
+
##------------------------------------------##
|
10 |
+
## MAX error ##
|
11 |
+
## Number of error to stop the process. ##
|
12 |
+
## This is useful if there could be ##
|
13 |
+
## tokenization error. ##
|
14 |
+
## The process will stop when this number##
|
15 |
+
## of errors are accumulated. ##
|
16 |
+
##------------------------------------------##
|
17 |
+
MAX_ERROR 10000
|
18 |
+
|
19 |
+
##------------------------------------------##
|
20 |
+
## Cut-off length for statistics ##
|
21 |
+
## At the end of evaluation, the ##
|
22 |
+
## statistics for the sentences of length##
|
23 |
+
## less than or equal to this number will##
|
24 |
+
## be shown, on top of the statistics ##
|
25 |
+
## for all the sentences ##
|
26 |
+
##------------------------------------------##
|
27 |
+
CUTOFF_LEN 40
|
28 |
+
|
29 |
+
##------------------------------------------##
|
30 |
+
## unlabeled or labeled bracketing ##
|
31 |
+
## 0: unlabeled bracketing ##
|
32 |
+
## 1: labeled bracketing ##
|
33 |
+
##------------------------------------------##
|
34 |
+
LABELED 1
|
35 |
+
|
36 |
+
##------------------------------------------##
|
37 |
+
## Delete labels ##
|
38 |
+
## list of labels to be ignored. ##
|
39 |
+
## If it is a pre-terminal label, delete ##
|
40 |
+
## the word along with the brackets. ##
|
41 |
+
## If it is a non-terminal label, just ##
|
42 |
+
## delete the brackets (don't delete ##
|
43 |
+
## children). ##
|
44 |
+
##------------------------------------------##
|
45 |
+
DELETE_LABEL TOP
|
46 |
+
DELETE_LABEL ROOT
|
47 |
+
DELETE_LABEL S1
|
48 |
+
DELETE_LABEL -NONE-
|
49 |
+
DELETE_LABEL VROOT
|
50 |
+
#DELETE_LABEL SENT
|
51 |
+
|
52 |
+
#DELETE_LABEL ,
|
53 |
+
#DELETE_LABEL :
|
54 |
+
#DELETE_LABEL ``
|
55 |
+
#DELETE_LABEL ''
|
56 |
+
#DELETE_LABEL .
|
57 |
+
#DELETE_LABEL ?
|
58 |
+
#DELETE_LABEL !
|
59 |
+
#DELETE_LABEL PONCT
|
60 |
+
|
61 |
+
##------------------------------------------##
|
62 |
+
## Delete labels for length calculation ##
|
63 |
+
## list of labels to be ignored for ##
|
64 |
+
## length calculation purpose ##
|
65 |
+
##------------------------------------------##
|
66 |
+
DELETE_LABEL_FOR_LENGTH -NONE-
|
67 |
+
|
68 |
+
##------------------------------------------##
|
69 |
+
## Labels to be considered for misquote ##
|
70 |
+
## (could be possessive or quote) ##
|
71 |
+
##------------------------------------------##
|
72 |
+
#QUOTE_LABEL ``
|
73 |
+
#QUOTE_LABEL ''
|
74 |
+
#QUOTE_LABEL POS
|
75 |
+
|
76 |
+
##------------------------------------------##
|
77 |
+
## These ones are less common, but ##
|
78 |
+
## are on occasion output by parsers: ##
|
79 |
+
##------------------------------------------##
|
80 |
+
#QUOTE_LABEL NN
|
81 |
+
#QUOTE_LABEL CD
|
82 |
+
#QUOTE_LABEL VBZ
|
83 |
+
#QUOTE_LABEL :
|
84 |
+
|
85 |
+
##------------------------------------------##
|
86 |
+
## Equivalent labels, words ##
|
87 |
+
## the pairs are considered equivalent ##
|
88 |
+
## This is non-directional. ##
|
89 |
+
##------------------------------------------##
|
90 |
+
#EQ_LABEL ADVP PRT
|
91 |
+
|
92 |
+
# EQ_WORD Example example
|
93 |
+
DELETE_LABEL SYN_NN
|
94 |
+
DELETE_LABEL SYN_NNP
|
95 |
+
DELETE_LABEL SYN_NNT
|
96 |
+
DELETE_LABEL SYN_PRP
|
97 |
+
DELETE_LABEL SYN_JJ
|
98 |
+
DELETE_LABEL SYN_JJT
|
99 |
+
DELETE_LABEL SYN_RB
|
100 |
+
DELETE_LABEL SYN_RBR
|
101 |
+
DELETE_LABEL SYN_MOD
|
102 |
+
DELETE_LABEL SYN_VB
|
103 |
+
DELETE_LABEL SYN_AUX
|
104 |
+
DELETE_LABEL SYN_AGR
|
105 |
+
DELETE_LABEL SYN_IN
|
106 |
+
DELETE_LABEL SYN_COM
|
107 |
+
DELETE_LABEL SYN_REL
|
108 |
+
DELETE_LABEL SYN_CC
|
109 |
+
DELETE_LABEL SYN_QW
|
110 |
+
DELETE_LABEL SYN_HAM
|
111 |
+
DELETE_LABEL SYN_WDT
|
112 |
+
DELETE_LABEL SYN_DT
|
113 |
+
DELETE_LABEL SYN_CD
|
114 |
+
DELETE_LABEL SYN_CDT
|
115 |
+
DELETE_LABEL SYN_AT
|
116 |
+
DELETE_LABEL SYN_H
|
117 |
+
DELETE_LABEL SYN_FL
|
118 |
+
DELETE_LABEL SYN_ZVL
|