# FrameNet document-split constants and sentence tokenization helpers.
import bisect

from allennlp.data.tokenizers.spacy_tokenizer import SpacyTokenizer
# Train/dev/test split of FrameNet full-text annotation documents, keyed by
# XML filename.  Presumably the standard FrameNet 1.5 full-text split used in
# frame-semantic parsing work — confirm against the dataset reader that
# consumes this mapping.
framenet_split = {
"train": [
"LUCorpus-v0.3__CNN_AARONBROWN_ENG_20051101_215800.partial-NEW.xml",
"NTI__Iran_Chemical.xml",
"NTI__Taiwan_Introduction.xml",
"LUCorpus-v0.3__20000416_xin_eng-NEW.xml",
"NTI__NorthKorea_ChemicalOverview.xml",
"NTI__workAdvances.xml",
"C-4__C-4Text.xml",
"ANC__IntroOfDublin.xml",
"LUCorpus-v0.3__20000420_xin_eng-NEW.xml",
"NTI__BWTutorial_chapter1.xml",
"ANC__110CYL068.xml",
"LUCorpus-v0.3__artb_004_A1_E1_NEW.xml",
"NTI__Iran_Missile.xml",
"LUCorpus-v0.3__20000424_nyt-NEW.xml",
"LUCorpus-v0.3__wsj_1640.mrg-NEW.xml",
"ANC__110CYL070.xml",
"NTI__Iran_Introduction.xml",
"KBEval__lcch.xml",
"ANC__HistoryOfLasVegas.xml",
"LUCorpus-v0.3__wsj_2465.xml",
"KBEval__LCC-M.xml",
"LUCorpus-v0.3__artb_004_A1_E2_NEW.xml",
"LUCorpus-v0.3__AFGP-2002-600002-Trans.xml",
"LUCorpus-v0.3__602CZL285-1.xml",
"PropBank__LomaPrieta.xml",
"NTI__Iran_Biological.xml",
"NTI__Kazakhstan.xml",
"LUCorpus-v0.3__AFGP-2002-600045-Trans.xml",
"NTI__Iran_Nuclear.xml",
"ANC__EntrepreneurAsMadonna.xml",
"SemAnno__Text1.xml",
"ANC__HistoryOfJerusalem.xml",
"NTI__ChinaOverview.xml",
"PropBank__ElectionVictory.xml",
"NTI__Russia_Introduction.xml",
"NTI__SouthAfrica_Introduction.xml",
"LUCorpus-v0.3__20000419_apw_eng-NEW.xml",
"NTI__LibyaCountry1.xml",
"ANC__IntroJamaica.xml",
"QA__IranRelatedQuestions.xml",
"ANC__HistoryOfGreece.xml",
"NTI__NorthKorea_NuclearCapabilities.xml",
"PropBank__BellRinging.xml",
"PropBank__PolemicProgressiveEducation.xml",
"NTI__WMDNews_042106.xml",
"ANC__110CYL200.xml",
"LUCorpus-v0.3__CNN_ENG_20030614_173123.4-NEW-1.xml"
],
"dev": [
"NTI__WMDNews_062606.xml",
"LUCorpus-v0.3__ENRON-pearson-email-25jul02.xml",
"KBEval__MIT.xml",
"ANC__110CYL072.xml",
"LUCorpus-v0.3__20000415_apw_eng-NEW.xml",
"Miscellaneous__Hijack.xml",
"PropBank__TicketSplitting.xml",
"NTI__NorthKorea_NuclearOverview.xml"
],
"test": [
"NTI__NorthKorea_Introduction.xml",
"LUCorpus-v0.3__enron-thread-159550.xml",
"ANC__WhereToHongKong.xml",
"KBEval__atm.xml",
"ANC__112C-L013.xml",
"LUCorpus-v0.3__IZ-060316-01-Trans-1.xml",
"LUCorpus-v0.3__AFGP-2002-602187-Trans.xml",
"ANC__StephanopoulosCrimes.xml",
"ANC__110CYL069.xml",
"ANC__110CYL067.xml",
"ANC__IntroHongKong.xml",
"LUCorpus-v0.3__20000410_nyt-NEW.xml",
"KBEval__Brandeis.xml",
"KBEval__Stanford.xml",
"LUCorpus-v0.3__SNO-525.xml",
"PropBank__AetnaLifeAndCasualty.xml",
"Miscellaneous__Hound-Ch14.xml",
"NTI__Syria_NuclearOverview.xml",
"KBEval__cycorp.xml",
"KBEval__utd-icsi.xml",
"LUCorpus-v0.3__sw2025-ms98-a-trans.ascii-1-NEW.xml",
"Miscellaneous__SadatAssassination.xml",
"KBEval__parc.xml"
]
}
# Shared module-level tokenizer; instantiating the spaCy pipeline is expensive,
# so every Sentence reuses this one instance.  pos_tags=True enables the POS
# tagger so Token.pos_/tag_ are populated.
_spacy_tokenizer = SpacyTokenizer(language='en_core_web_sm', pos_tags=True)
class Sentence:
    """A spaCy-tokenized sentence with character-to-token index mapping.

    The original comments state that char and token span indices are
    left inclusive and right inclusive; see the NOTE on :meth:`span`
    about the character-side ``end`` actually behaving exclusively.
    """

    def __init__(self, text):
        """Re-tokenize ``text`` with the module-level spaCy tokenizer."""
        self.tokens = _spacy_tokenizer.tokenize(text)

    @property
    def pos(self):
        """Coarse-grained POS tag (``pos_``) for each token."""
        return [t.pos_ for t in self.tokens]

    @property
    def tag(self):
        """Fine-grained POS tag (``tag_``) for each token."""
        return [t.tag_ for t in self.tokens]

    @property
    def starts(self):
        """Character offset at which each token starts."""
        return [t.idx for t in self.tokens]

    @property
    def ends(self):
        """Character end offset (``idx_end``) for each token — presumably
        exclusive, per allennlp's Token; confirm against callers."""
        return [t.idx_end for t in self.tokens]

    def char2token(self, char_idx):
        """Map a character index to a token index.

        If ``char_idx`` falls inside a token, return that token's index.
        If it falls in the gap between two tokens, return the index of the
        previous token.  Indices before the first token map to 0; indices
        at or after the start of the last token map to the last token.
        """
        # Index of the last token whose start offset is <= char_idx.
        # starts is sorted ascending, so binary search replaces the
        # original linear scan with identical results in O(log n).
        i_tok = bisect.bisect_right(self.starts, char_idx) - 1
        # Clamp out-of-range character indices to the valid token range
        # (i_tok is -1 when char_idx precedes the first token).
        return min(max(i_tok, 0), len(self.tokens) - 1)

    def span(self, start, end):
        """Convert a character span to an inclusive (start, end) token span.

        NOTE(review): the ``end - 1`` below implies the incoming character
        span is right-EXCLUSIVE, despite the "right inclusive" wording in
        the class docstring — confirm with callers before relying on it.
        """
        assert end > start
        start, end = self.char2token(start), self.char2token(end - 1)
        assert end >= start
        return start, end

    def __repr__(self):
        return self.tokens.__repr__()