from allennlp.data.tokenizers.spacy_tokenizer import SpacyTokenizer

# Document-level train/dev/test split over the FrameNet full-text annotation files.
framenet_split = {
    "train": [
        "LUCorpus-v0.3__CNN_AARONBROWN_ENG_20051101_215800.partial-NEW.xml",
        "NTI__Iran_Chemical.xml",
        "NTI__Taiwan_Introduction.xml",
        "LUCorpus-v0.3__20000416_xin_eng-NEW.xml",
        "NTI__NorthKorea_ChemicalOverview.xml",
        "NTI__workAdvances.xml",
        "C-4__C-4Text.xml",
        "ANC__IntroOfDublin.xml",
        "LUCorpus-v0.3__20000420_xin_eng-NEW.xml",
        "NTI__BWTutorial_chapter1.xml",
        "ANC__110CYL068.xml",
        "LUCorpus-v0.3__artb_004_A1_E1_NEW.xml",
        "NTI__Iran_Missile.xml",
        "LUCorpus-v0.3__20000424_nyt-NEW.xml",
        "LUCorpus-v0.3__wsj_1640.mrg-NEW.xml",
        "ANC__110CYL070.xml",
        "NTI__Iran_Introduction.xml",
        "KBEval__lcch.xml",
        "ANC__HistoryOfLasVegas.xml",
        "LUCorpus-v0.3__wsj_2465.xml",
        "KBEval__LCC-M.xml",
        "LUCorpus-v0.3__artb_004_A1_E2_NEW.xml",
        "LUCorpus-v0.3__AFGP-2002-600002-Trans.xml",
        "LUCorpus-v0.3__602CZL285-1.xml",
        "PropBank__LomaPrieta.xml",
        "NTI__Iran_Biological.xml",
        "NTI__Kazakhstan.xml",
        "LUCorpus-v0.3__AFGP-2002-600045-Trans.xml",
        "NTI__Iran_Nuclear.xml",
        "ANC__EntrepreneurAsMadonna.xml",
        "SemAnno__Text1.xml",
        "ANC__HistoryOfJerusalem.xml",
        "NTI__ChinaOverview.xml",
        "PropBank__ElectionVictory.xml",
        "NTI__Russia_Introduction.xml",
        "NTI__SouthAfrica_Introduction.xml",
        "LUCorpus-v0.3__20000419_apw_eng-NEW.xml",
        "NTI__LibyaCountry1.xml",
        "ANC__IntroJamaica.xml",
        "QA__IranRelatedQuestions.xml",
        "ANC__HistoryOfGreece.xml",
        "NTI__NorthKorea_NuclearCapabilities.xml",
        "PropBank__BellRinging.xml",
        "PropBank__PolemicProgressiveEducation.xml",
        "NTI__WMDNews_042106.xml",
        "ANC__110CYL200.xml",
        "LUCorpus-v0.3__CNN_ENG_20030614_173123.4-NEW-1.xml",
    ],
    "dev": [
        "NTI__WMDNews_062606.xml",
        "LUCorpus-v0.3__ENRON-pearson-email-25jul02.xml",
        "KBEval__MIT.xml",
        "ANC__110CYL072.xml",
        "LUCorpus-v0.3__20000415_apw_eng-NEW.xml",
        "Miscellaneous__Hijack.xml",
        "PropBank__TicketSplitting.xml",
        "NTI__NorthKorea_NuclearOverview.xml",
    ],
    "test": [
        "NTI__NorthKorea_Introduction.xml",
        "LUCorpus-v0.3__enron-thread-159550.xml",
        "ANC__WhereToHongKong.xml",
        "KBEval__atm.xml",
        "ANC__112C-L013.xml",
        "LUCorpus-v0.3__IZ-060316-01-Trans-1.xml",
        "LUCorpus-v0.3__AFGP-2002-602187-Trans.xml",
        "ANC__StephanopoulosCrimes.xml",
        "ANC__110CYL069.xml",
        "ANC__110CYL067.xml",
        "ANC__IntroHongKong.xml",
        "LUCorpus-v0.3__20000410_nyt-NEW.xml",
        "KBEval__Brandeis.xml",
        "KBEval__Stanford.xml",
        "LUCorpus-v0.3__SNO-525.xml",
        "PropBank__AetnaLifeAndCasualty.xml",
        "Miscellaneous__Hound-Ch14.xml",
        "NTI__Syria_NuclearOverview.xml",
        "KBEval__cycorp.xml",
        "KBEval__utd-icsi.xml",
        "LUCorpus-v0.3__sw2025-ms98-a-trans.ascii-1-NEW.xml",
        "Miscellaneous__SadatAssassination.xml",
        "KBEval__parc.xml",
    ],
}

# Shared tokenizer instance; pos_tags=True populates each token's POS fields.
_spacy_tokenizer = SpacyTokenizer(language='en_core_web_sm', pos_tags=True)


class Sentence:
    def __init__(self, text):
        """
        Re-tokenize the sentence and record the information needed to map
        character indices to token indices. Token span indices are left
        inclusive and right inclusive; see `span` for the character-span
        convention.
        """
        self.tokens = _spacy_tokenizer.tokenize(text)

    @property
    def pos(self):
        return [t.pos_ for t in self.tokens]

    @property
    def tag(self):
        return [t.tag_ for t in self.tokens]

    @property
    def starts(self):
        # Start character offset of each token.
        return [t.idx for t in self.tokens]

    @property
    def ends(self):
        # End character offset of each token (AllenNLP's `Token.idx_end`).
        return [t.idx_end for t in self.tokens]

    def char2token(self, char_idx):
        """
        If char_idx falls inside a token, return the index of that token.
        If char_idx falls in the gap between two tokens, return the index of
        the preceding token. If char_idx precedes the first token, return 0.
        Otherwise, return the index of the last token.
""" if char_idx < self.starts[0]: return 0 if char_idx >= self.starts[-1]: return len(self.tokens)-1 for i_tok, start_idx in enumerate(self.starts): if start_idx == char_idx: return i_tok if start_idx > char_idx: return i_tok-1 def span(self, start, end): # Left inclusive, right inclusive assert end > start start, end = self.char2token(start), self.char2token(end-1) assert end >= start return start, end def __repr__(self): return self.tokens.__repr__()