from allennlp.data.tokenizers.spacy_tokenizer import SpacyTokenizer

framenet_split = {
    "train": [
        "LUCorpus-v0.3__CNN_AARONBROWN_ENG_20051101_215800.partial-NEW.xml",
        "NTI__Iran_Chemical.xml",
        "NTI__Taiwan_Introduction.xml",
        "LUCorpus-v0.3__20000416_xin_eng-NEW.xml",
        "NTI__NorthKorea_ChemicalOverview.xml",
        "NTI__workAdvances.xml",
        "C-4__C-4Text.xml",
        "ANC__IntroOfDublin.xml",
        "LUCorpus-v0.3__20000420_xin_eng-NEW.xml",
        "NTI__BWTutorial_chapter1.xml",
        "ANC__110CYL068.xml",
        "LUCorpus-v0.3__artb_004_A1_E1_NEW.xml",
        "NTI__Iran_Missile.xml",
        "LUCorpus-v0.3__20000424_nyt-NEW.xml",
        "LUCorpus-v0.3__wsj_1640.mrg-NEW.xml",
        "ANC__110CYL070.xml",
        "NTI__Iran_Introduction.xml",
        "KBEval__lcch.xml",
        "ANC__HistoryOfLasVegas.xml",
        "LUCorpus-v0.3__wsj_2465.xml",
        "KBEval__LCC-M.xml",
        "LUCorpus-v0.3__artb_004_A1_E2_NEW.xml",
        "LUCorpus-v0.3__AFGP-2002-600002-Trans.xml",
        "LUCorpus-v0.3__602CZL285-1.xml",
        "PropBank__LomaPrieta.xml",
        "NTI__Iran_Biological.xml",
        "NTI__Kazakhstan.xml",
        "LUCorpus-v0.3__AFGP-2002-600045-Trans.xml",
        "NTI__Iran_Nuclear.xml",
        "ANC__EntrepreneurAsMadonna.xml",
        "SemAnno__Text1.xml",
        "ANC__HistoryOfJerusalem.xml",
        "NTI__ChinaOverview.xml",
        "PropBank__ElectionVictory.xml",
        "NTI__Russia_Introduction.xml",
        "NTI__SouthAfrica_Introduction.xml",
        "LUCorpus-v0.3__20000419_apw_eng-NEW.xml",
        "NTI__LibyaCountry1.xml",
        "ANC__IntroJamaica.xml",
        "QA__IranRelatedQuestions.xml",
        "ANC__HistoryOfGreece.xml",
        "NTI__NorthKorea_NuclearCapabilities.xml",
        "PropBank__BellRinging.xml",
        "PropBank__PolemicProgressiveEducation.xml",
        "NTI__WMDNews_042106.xml",
        "ANC__110CYL200.xml",
        "LUCorpus-v0.3__CNN_ENG_20030614_173123.4-NEW-1.xml"
    ],
    "dev": [
        "NTI__WMDNews_062606.xml",
        "LUCorpus-v0.3__ENRON-pearson-email-25jul02.xml",
        "KBEval__MIT.xml",
        "ANC__110CYL072.xml",
        "LUCorpus-v0.3__20000415_apw_eng-NEW.xml",
        "Miscellaneous__Hijack.xml",
        "PropBank__TicketSplitting.xml",
        "NTI__NorthKorea_NuclearOverview.xml"
    ],
    "test": [
        "NTI__NorthKorea_Introduction.xml",
        "LUCorpus-v0.3__enron-thread-159550.xml",
        "ANC__WhereToHongKong.xml",
        "KBEval__atm.xml",
        "ANC__112C-L013.xml",
        "LUCorpus-v0.3__IZ-060316-01-Trans-1.xml",
        "LUCorpus-v0.3__AFGP-2002-602187-Trans.xml",
        "ANC__StephanopoulosCrimes.xml",
        "ANC__110CYL069.xml",
        "ANC__110CYL067.xml",
        "ANC__IntroHongKong.xml",
        "LUCorpus-v0.3__20000410_nyt-NEW.xml",
        "KBEval__Brandeis.xml",
        "KBEval__Stanford.xml",
        "LUCorpus-v0.3__SNO-525.xml",
        "PropBank__AetnaLifeAndCasualty.xml",
        "Miscellaneous__Hound-Ch14.xml",
        "NTI__Syria_NuclearOverview.xml",
        "KBEval__cycorp.xml",
        "KBEval__utd-icsi.xml",
        "LUCorpus-v0.3__sw2025-ms98-a-trans.ascii-1-NEW.xml",
        "Miscellaneous__SadatAssassination.xml",
        "KBEval__parc.xml"
    ]
}
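
# Illustrative helper (an assumption, not part of the original file): given a
# FrameNet fulltext filename, look up which split it belongs to using the
# framenet_split dict above.
def lookup_split(filename):
    for split, files in framenet_split.items():
        if filename in files:
            return split
    raise KeyError(f"{filename} is not assigned to any split")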

# Shared tokenizer instance; pos_tags=True so POS/tag information is attached
# to the returned tokens.
_spacy_tokenizer = SpacyTokenizer(language='en_core_web_sm', pos_tags=True)

class Sentence:
    def __init__(self, text):
        """
        Re-tokenize a sentence and map character indices to token indices.
        Char spans are treated as left-inclusive and right-exclusive; the
        token spans returned by ``span`` are inclusive on both ends.
        """
        self.tokens = _spacy_tokenizer.tokenize(text)

    @property
    def pos(self):
        return [t.pos_ for t in self.tokens]

    @property
    def tag(self):
        return [t.tag_ for t in self.tokens]

    @property
    def starts(self):
        return [t.idx for t in self.tokens]

    @property
    def ends(self):
        return [t.idx_end for t in self.tokens]

    def char2token(self, char_idx):
        """
        If char_idx falls within a token, return the index of that token.
        If char_idx falls in the gap between two tokens, return the index
        of the preceding token.
        If char_idx precedes the first token, return 0.
        Otherwise, return the index of the last token.
        """
        starts = self.starts
        if char_idx < starts[0]:
            return 0
        if char_idx >= starts[-1]:
            return len(self.tokens) - 1
        for i_tok, start_idx in enumerate(starts):
            if start_idx == char_idx:
                return i_tok
            if start_idx > char_idx:
                return i_tok - 1

    def span(self, start, end):
        # Char span is left-inclusive, right-exclusive; the returned token
        # span is inclusive on both ends.
        assert end > start
        start, end = self.char2token(start), self.char2token(end - 1)
        assert end >= start
        return start, end

    def __repr__(self):
        return repr(self.tokens)
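
# Usage sketch (illustrative, not part of the original file). Assumes the
# en_core_web_sm spaCy model is installed.
if __name__ == "__main__":
    sent = Sentence("The quick brown fox jumps over the lazy dog.")
    print(sent.pos)          # coarse POS tags, one per token
    # Char span covering "quick brown" (start inclusive, end exclusive)
    # maps to the token span (1, 2), inclusive on both ends.
    print(sent.span(4, 15))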