"""Map token sequences to FrameNet frame names via lexical-unit lookup."""
from collections import defaultdict
from itertools import product
from typing import *

import nltk
from nltk.corpus import framenet, framenet15
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# Hand-curated normalizations that WordNet lemmatization misses:
# contractions, irregular forms, spelling variants, and multi-word LUs.
# Lookups are applied transitively (e.g. "'ve" -> "have" -> "have to").
manual = {
    '\'s': 'be',
    '\'re': 'be',
    '\'ve': 'have',
    've': 'have',
    'men': 'man',
    'saw': 'see',
    'could': 'can',
    'neighbour': 'neighbor',
    'felt': 'feel',
    'fell': 'fall',
    'little': 'a little',
    'have': 'have to',
    'raping': 'rape',
    'flavor': 'flavour',
    'ca': 'can',
    'bit': 'a bit',
}


def load_framenet_corpus(version):
    """Return the NLTK corpus reader for the given FrameNet version.

    Downloads the corpus data on first use (no-op if already present).

    :param version: version string; any string containing '1.5' or '1.7'.
    :raises NotImplementedError: for any other version string.
    """
    if '1.5' in version:
        nltk.download('framenet_v15')
        return framenet15
    if '1.7' in version:
        nltk.download('framenet_v17')
        return framenet
    raise NotImplementedError


def is_word(s: str) -> bool:
    """True iff *s* contains only letters, spaces, hyphens, apostrophes."""
    return all(c.isalpha() or c in ' -\'' for c in s)


def lu_to_frame(version: str):
    """Build a map from normalized lexical-unit text to evoking frames.

    Each FrameNet lexical unit ("run.v") is lower-cased, has its POS
    suffix stripped, and is filtered down to word-like tokens; the
    result maps that text to the set of frame names that list it.

    :param version: FrameNet version string, passed to load_framenet_corpus.
    :returns: defaultdict(set) of LU text -> set of frame names.
    """
    fn = load_framenet_corpus(version)
    # NOTE(review): clearing the reader's private _bad_statuses silences
    # warnings about LUs with problematic status — confirm still needed.
    fn._bad_statuses = []
    map_no_pos = defaultdict(set)
    for frame in fn.frames():
        for lu in frame.lexUnit:
            # Corpus sanity check: LU names are always "<lexeme>.<pos>".
            assert lu.count('.') == 1
            lexicon, _pos = lu.split('.')
            lexicon = lexicon.lower()
            # Drop tokens with characters other than letters / - / '.
            lexicon = ' '.join(filter(is_word, lexicon.split()))
            if lexicon == '':
                continue
            map_no_pos[lexicon].add(frame.name)
    fn._bad_statuses = []
    return map_no_pos


class FrameIdentifier:
    """Identify candidate FrameNet frames evoked by a token sequence."""

    # BUGFIX: the original only recognized the 'th' suffix, missing
    # '1st', '2nd', '3rd', '21st', etc.  All suffixes are length 2.
    _ORDINAL_SUFFIXES = ('st', 'nd', 'rd', 'th')

    def __init__(self):
        lf_map = lu_to_frame('1.7')
        # Patch in existential constructions absent from the LU inventory.
        lf_map['there have'].add('Existence')
        lf_map['there must'].add('Existence')
        lf_map['be there'].add('Existence')
        self.lf_map = dict(lf_map)

    def __call__(self, tokens: List[str]) -> List[str]:
        """Return the deduplicated list of frames evoked by *tokens*.

        Single numeric tokens short-circuit to Cardinal_numbers /
        Ordinal_numbers; otherwise unigram and adjacent-bigram lookups
        are performed over each token's expanded candidate forms.
        """
        if len(tokens) == 1:
            tok = tokens[0]
            if tok.isnumeric():
                return ['Cardinal_numbers']
            if tok.endswith(self._ORDINAL_SUFFIXES) and tok[:-2].isnumeric():
                return ['Ordinal_numbers']
        tokens = [t.lower() for t in tokens]
        if not all(is_word(t) for t in tokens):
            return []
        # Expand each token into candidate forms: the token itself, its
        # WordNet lemma under every POS tag, and manual normalizations.
        expanded = []
        for token in tokens:
            cands = [token]
            for pos in 'asrnv':
                cands.append(lemmatizer.lemmatize(token, pos))
            # Appending while iterating is deliberate: it chains manual
            # lookups transitively ("'ve" -> "have" -> "have to").
            # The `manual` table is acyclic, so this terminates.
            for cand in cands:
                if cand in manual:
                    cands.append(manual[cand])
            expanded.append(list(set(cands)))
        frames = []
        # Unigram matches.
        for cands in expanded:
            for cand in cands:
                frames.extend(self.lf_map.get(cand, ()))
        # Bigram matches over adjacent tokens' candidate sets.
        for left, right in zip(expanded, expanded[1:]):
            for pair in product(left, right):
                frames.extend(self.lf_map.get(' '.join(pair), ()))
        return list(set(frames))