# sociolome/tools/framenet/fn_util.py
from allennlp.data.tokenizers.spacy_tokenizer import SpacyTokenizer


# Document-level train/dev/test split of the FrameNet fulltext annotations,
# keyed by split name; values are fulltext XML file names.
framenet_split = {
"train": [
"LUCorpus-v0.3__CNN_AARONBROWN_ENG_20051101_215800.partial-NEW.xml",
"NTI__Iran_Chemical.xml",
"NTI__Taiwan_Introduction.xml",
"LUCorpus-v0.3__20000416_xin_eng-NEW.xml",
"NTI__NorthKorea_ChemicalOverview.xml",
"NTI__workAdvances.xml",
"C-4__C-4Text.xml",
"ANC__IntroOfDublin.xml",
"LUCorpus-v0.3__20000420_xin_eng-NEW.xml",
"NTI__BWTutorial_chapter1.xml",
"ANC__110CYL068.xml",
"LUCorpus-v0.3__artb_004_A1_E1_NEW.xml",
"NTI__Iran_Missile.xml",
"LUCorpus-v0.3__20000424_nyt-NEW.xml",
"LUCorpus-v0.3__wsj_1640.mrg-NEW.xml",
"ANC__110CYL070.xml",
"NTI__Iran_Introduction.xml",
"KBEval__lcch.xml",
"ANC__HistoryOfLasVegas.xml",
"LUCorpus-v0.3__wsj_2465.xml",
"KBEval__LCC-M.xml",
"LUCorpus-v0.3__artb_004_A1_E2_NEW.xml",
"LUCorpus-v0.3__AFGP-2002-600002-Trans.xml",
"LUCorpus-v0.3__602CZL285-1.xml",
"PropBank__LomaPrieta.xml",
"NTI__Iran_Biological.xml",
"NTI__Kazakhstan.xml",
"LUCorpus-v0.3__AFGP-2002-600045-Trans.xml",
"NTI__Iran_Nuclear.xml",
"ANC__EntrepreneurAsMadonna.xml",
"SemAnno__Text1.xml",
"ANC__HistoryOfJerusalem.xml",
"NTI__ChinaOverview.xml",
"PropBank__ElectionVictory.xml",
"NTI__Russia_Introduction.xml",
"NTI__SouthAfrica_Introduction.xml",
"LUCorpus-v0.3__20000419_apw_eng-NEW.xml",
"NTI__LibyaCountry1.xml",
"ANC__IntroJamaica.xml",
"QA__IranRelatedQuestions.xml",
"ANC__HistoryOfGreece.xml",
"NTI__NorthKorea_NuclearCapabilities.xml",
"PropBank__BellRinging.xml",
"PropBank__PolemicProgressiveEducation.xml",
"NTI__WMDNews_042106.xml",
"ANC__110CYL200.xml",
"LUCorpus-v0.3__CNN_ENG_20030614_173123.4-NEW-1.xml"
],
"dev": [
"NTI__WMDNews_062606.xml",
"LUCorpus-v0.3__ENRON-pearson-email-25jul02.xml",
"KBEval__MIT.xml",
"ANC__110CYL072.xml",
"LUCorpus-v0.3__20000415_apw_eng-NEW.xml",
"Miscellaneous__Hijack.xml",
"PropBank__TicketSplitting.xml",
"NTI__NorthKorea_NuclearOverview.xml"
],
"test": [
"NTI__NorthKorea_Introduction.xml",
"LUCorpus-v0.3__enron-thread-159550.xml",
"ANC__WhereToHongKong.xml",
"KBEval__atm.xml",
"ANC__112C-L013.xml",
"LUCorpus-v0.3__IZ-060316-01-Trans-1.xml",
"LUCorpus-v0.3__AFGP-2002-602187-Trans.xml",
"ANC__StephanopoulosCrimes.xml",
"ANC__110CYL069.xml",
"ANC__110CYL067.xml",
"ANC__IntroHongKong.xml",
"LUCorpus-v0.3__20000410_nyt-NEW.xml",
"KBEval__Brandeis.xml",
"KBEval__Stanford.xml",
"LUCorpus-v0.3__SNO-525.xml",
"PropBank__AetnaLifeAndCasualty.xml",
"Miscellaneous__Hound-Ch14.xml",
"NTI__Syria_NuclearOverview.xml",
"KBEval__cycorp.xml",
"KBEval__utd-icsi.xml",
"LUCorpus-v0.3__sw2025-ms98-a-trans.ascii-1-NEW.xml",
"Miscellaneous__SadatAssassination.xml",
"KBEval__parc.xml"
]
}
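

# Illustrative helper (not part of the original module): look up which split a
# FrameNet fulltext document belongs to. The name `get_split` is hypothetical.
def get_split(doc_name):
    for split_name, docs in framenet_split.items():
        if doc_name in docs:
            return split_name
    return None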


# Shared spaCy tokenizer, loaded once at module level; POS tagging is enabled
# so that the `pos` and `tag` properties of `Sentence` are populated.
_spacy_tokenizer = SpacyTokenizer(language='en_core_web_sm', pos_tags=True)


class Sentence:
    def __init__(self, text):
        """
        Re-tokenize a sentence and map character indices to token indices.
        Character spans passed to `span` are treated as having an exclusive
        end, while the returned token spans are left- and right-inclusive.
        """
        self.tokens = _spacy_tokenizer.tokenize(text)

    @property
    def pos(self):
        return [t.pos_ for t in self.tokens]

    @property
    def tag(self):
        return [t.tag_ for t in self.tokens]

    @property
    def starts(self):
        return [t.idx for t in self.tokens]

    @property
    def ends(self):
        return [t.idx_end for t in self.tokens]

    def char2token(self, char_idx):
        """
        If char_idx falls inside a token, return the index of that token.
        If char_idx falls in the gap between two tokens, return the index of
        the preceding token.
        If char_idx lies before the first token, return 0.
        Otherwise (at or past the start of the last token), return the index
        of the last token.
        """
        if char_idx < self.starts[0]:
            return 0
        if char_idx >= self.starts[-1]:
            return len(self.tokens) - 1
        for i_tok, start_idx in enumerate(self.starts):
            if start_idx == char_idx:
                return i_tok
            if start_idx > char_idx:
                return i_tok - 1

    def span(self, start, end):
        # Map a character span (inclusive start, exclusive end) to a token
        # span; the returned token indices are left- and right-inclusive.
        assert end > start
        start, end = self.char2token(start), self.char2token(end - 1)
        assert end >= start
        return start, end

    def __repr__(self):
        return self.tokens.__repr__()
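

# Minimal usage sketch (assumes the spaCy model `en_core_web_sm` is installed):
if __name__ == '__main__':
    sent = Sentence("The quick brown fox jumps over the lazy dog.")
    print(sent.pos)
    # The character span of "quick" (start 4, exclusive end 9) maps to the
    # inclusive token span (1, 1).
    print(sent.span(4, 9))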