sociolome / tools /framenet /concrete_fn.py
Gosse Minnema
Initial commit
05922fb
raw
history blame
3.69 kB
import argparse
import os
from collections import defaultdict
from typing import Dict, Any
from concrete.util import CommunicationWriterTGZ
from nltk.corpus import framenet, framenet15
from tqdm import tqdm
from sftp.data_reader.concrete_srl import concrete_doc
from tools.framenet.fn_util import framenet_split, Sentence as TokSentence
def process_sentence(sent) -> Dict[str, Any]:
    """Convert one FrameNet sentence-like object into a plain dictionary.

    Args:
        sent: Object exposing ``.text`` and supporting ``'key' in sent``.
            It may carry an ``annotationSet`` sequence and/or its own
            ``Target`` / ``FE`` / ``frame`` attributes (presumably an NLTK
            FrameNet record -- confirm against the callers).

    Returns:
        A dict with three keys, in this order:

        * ``'sentence'``: ``sent.text``.
        * ``'tokenization'``: one ``(token.idx, token.idx_end - 1)`` pair per
          token produced by ``TokSentence(sent.text)``.
        * ``'annotations'``: one dict per processed annotation set with keys
          ``'label'`` (frame name), ``'children'`` (list of
          ``{'span', 'label'}`` dicts, one per frame element) and ``'span'``
          (first and last token index covered by the target).

    Raises:
        IndexError: if an annotation set's ``Target`` yields no token indices.
    """
    tok_sent = TokSentence(sent.text)
    ret = {
        'sentence': sent.text,
        'tokenization': [(token.idx, token.idx_end - 1) for token in tok_sent.tokens],
        'annotations': [],
    }

    def process_one_ann_set(ann_set):
        """Append the event described by ``ann_set`` to ``ret['annotations']``."""
        arg_list = []
        event = {'label': ann_set.frame.name, 'children': arg_list}
        ret['annotations'].append(event)

        # Collect every token index covered by any target range; the target
        # may consist of several ranges, so the event span is taken from the
        # smallest to the largest covered index.
        target_list = []
        for tar_start, tar_end in ann_set.Target:
            # Map the range once instead of calling span() twice.
            tok_start, tok_end = tok_sent.span(tar_start, tar_end)
            target_list.extend(range(tok_start, tok_end + 1))
        target_list.sort()
        event['span'] = (target_list[0], target_list[-1])

        # Only the first element of ``FE`` is read here.
        for fe_start, fe_end, fe_name in ann_set.FE[0]:
            arg_list.append({
                'span': tuple(tok_sent.span(fe_start, fe_end)),
                'label': fe_name,
            })

    if 'annotationSet' in sent:
        for ann_item in sent.annotationSet:
            # Annotation sets without a target carry no event to emit.
            if 'Target' in ann_item:
                process_one_ann_set(ann_item)
    if 'Target' in sent:
        # The object itself may also be an annotated target.
        process_one_ann_set(sent)

    return ret
def process_doc(docs, dst_path: str):
    """Write each document in ``docs`` as a communication into one archive.

    For every document, all of its sentences are converted with
    ``process_sentence``, combined via ``concrete_doc(sentences,
    doc.filename)`` and written to the archive under the name
    ``<comm.id>.concrete``.

    Args:
        docs: Iterable of documents exposing ``.sentence`` (iterable of
            sentences) and ``.filename``.
        dst_path: Path handed to ``CommunicationWriterTGZ``.
    """
    writer = CommunicationWriterTGZ(dst_path)
    try:
        for doc in tqdm(docs):
            sentences = [process_sentence(sent) for sent in doc.sentence]
            comm = concrete_doc(sentences, doc.filename)
            writer.write(comm, comm.id + '.concrete')
    finally:
        # Close the archive even when a document fails to convert or write,
        # so the file handle is not leaked.
        writer.close()
def process_exemplar(dst_path, fn):
    """Write all exemplar annotations of ``fn`` as a single communication.

    Annotations are read from ``fn.annotations(full_text=False)``; entries
    without a ``'Target'`` key are skipped. The remaining ones are converted
    with ``process_sentence``, combined via ``concrete_doc(sentences,
    'exemplar')`` and stored as ``exemplar.concrete`` in the archive.

    Args:
        dst_path: Path handed to ``CommunicationWriterTGZ``.
        fn: Corpus object providing ``annotations(full_text=False)``.
    """
    bar = tqdm()
    raw_annotations = []
    print('Loading exemplars...')
    try:
        for ann_sent in fn.annotations(full_text=False):
            if 'Target' not in ann_sent:
                continue
            bar.update()
            raw_annotations.append(ann_sent)
    except RuntimeError as err:
        # Deliberately best-effort: keep whatever was loaded before the
        # iterator failed, but report it instead of hiding the truncation.
        print(f'Stopped loading exemplars after {len(raw_annotations)} annotations: {err!r}')
    finally:
        bar.close()

    sentences = [process_sentence(sent) for sent in raw_annotations]
    comm = concrete_doc(sentences, 'exemplar')

    writer = CommunicationWriterTGZ(dst_path)
    try:
        writer.write(comm, 'exemplar.concrete')
    finally:
        # The writer must be closed explicitly (as process_doc does) so the
        # archive is finalised and the handle released.
        writer.close()
def run():
    """Command-line entry point: export FrameNet data into a folder.

    Parses the destination folder and the FrameNet version from the command
    line, groups the corpus documents into splits using ``framenet_split``
    (documents listed under neither ``'dev'`` nor ``'test'`` go to
    ``'train'``), writes one ``<split>.tar.gz`` per key of
    ``framenet_split`` and finally writes ``exemplar.tar.gz``.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        'dst',
        metavar='DESTINATION',
        type=str,
        help='Destination folder path.',
    )
    arg_parser.add_argument(
        '-v',
        metavar='VERSION',
        default='1.7',
        type=str,
        choices=['1.5', '1.7'],
        help='Version of FrameNet. Either 1.5 or 1.7.',
    )
    options = arg_parser.parse_args()

    # Pick the corpus reader matching the requested version.
    if options.v == '1.7':
        fn = framenet
    else:
        fn = framenet15
    os.makedirs(options.dst, exist_ok=True)

    docs_by_split = defaultdict(list)
    for document in fn.docs():
        if document.filename in framenet_split['dev']:
            split_name = 'dev'
        elif document.filename in framenet_split['test']:
            split_name = 'test'
        else:
            split_name = 'train'
        docs_by_split[split_name].append(document)

    # Report all split sizes before any archive is written.
    for split_name in framenet_split:
        print(f'Loaded {len(docs_by_split[split_name])} docs for {split_name}.')

    for split_name in framenet_split:
        archive_path = os.path.join(options.dst, f'{split_name}.tar.gz')
        process_doc(docs_by_split[split_name], dst_path=archive_path)

    exemplar_path = os.path.join(options.dst, 'exemplar.tar.gz')
    process_exemplar(exemplar_path, fn)


if __name__ == '__main__':
    run()