from allennlp.data.tokenizers.spacy_tokenizer import SpacyTokenizer

framenet_split = {
    "train": [
        "LUCorpus-v0.3__CNN_AARONBROWN_ENG_20051101_215800.partial-NEW.xml",
        "NTI__Iran_Chemical.xml",
        "NTI__Taiwan_Introduction.xml",
        "LUCorpus-v0.3__20000416_xin_eng-NEW.xml",
        "NTI__NorthKorea_ChemicalOverview.xml",
        "NTI__workAdvances.xml",
        "C-4__C-4Text.xml",
        "ANC__IntroOfDublin.xml",
        "LUCorpus-v0.3__20000420_xin_eng-NEW.xml",
        "NTI__BWTutorial_chapter1.xml",
        "ANC__110CYL068.xml",
        "LUCorpus-v0.3__artb_004_A1_E1_NEW.xml",
        "NTI__Iran_Missile.xml",
        "LUCorpus-v0.3__20000424_nyt-NEW.xml",
        "LUCorpus-v0.3__wsj_1640.mrg-NEW.xml",
        "ANC__110CYL070.xml",
        "NTI__Iran_Introduction.xml",
        "KBEval__lcch.xml",
        "ANC__HistoryOfLasVegas.xml",
        "LUCorpus-v0.3__wsj_2465.xml",
        "KBEval__LCC-M.xml",
        "LUCorpus-v0.3__artb_004_A1_E2_NEW.xml",
        "LUCorpus-v0.3__AFGP-2002-600002-Trans.xml",
        "LUCorpus-v0.3__602CZL285-1.xml",
        "PropBank__LomaPrieta.xml",
        "NTI__Iran_Biological.xml",
        "NTI__Kazakhstan.xml",
        "LUCorpus-v0.3__AFGP-2002-600045-Trans.xml",
        "NTI__Iran_Nuclear.xml",
        "ANC__EntrepreneurAsMadonna.xml",
        "SemAnno__Text1.xml",
        "ANC__HistoryOfJerusalem.xml",
        "NTI__ChinaOverview.xml",
        "PropBank__ElectionVictory.xml",
        "NTI__Russia_Introduction.xml",
        "NTI__SouthAfrica_Introduction.xml",
        "LUCorpus-v0.3__20000419_apw_eng-NEW.xml",
        "NTI__LibyaCountry1.xml",
        "ANC__IntroJamaica.xml",
        "QA__IranRelatedQuestions.xml",
        "ANC__HistoryOfGreece.xml",
        "NTI__NorthKorea_NuclearCapabilities.xml",
        "PropBank__BellRinging.xml",
        "PropBank__PolemicProgressiveEducation.xml",
        "NTI__WMDNews_042106.xml",
        "ANC__110CYL200.xml",
        "LUCorpus-v0.3__CNN_ENG_20030614_173123.4-NEW-1.xml"
    ],

    "dev": [
        "NTI__WMDNews_062606.xml",
        "LUCorpus-v0.3__ENRON-pearson-email-25jul02.xml",
        "KBEval__MIT.xml",
        "ANC__110CYL072.xml",
        "LUCorpus-v0.3__20000415_apw_eng-NEW.xml",
        "Miscellaneous__Hijack.xml",
        "PropBank__TicketSplitting.xml",
        "NTI__NorthKorea_NuclearOverview.xml"
    ],

    "test": [
        "NTI__NorthKorea_Introduction.xml",
        "LUCorpus-v0.3__enron-thread-159550.xml",
        "ANC__WhereToHongKong.xml",
        "KBEval__atm.xml",
        "ANC__112C-L013.xml",
        "LUCorpus-v0.3__IZ-060316-01-Trans-1.xml",
        "LUCorpus-v0.3__AFGP-2002-602187-Trans.xml",
        "ANC__StephanopoulosCrimes.xml",
        "ANC__110CYL069.xml",
        "ANC__110CYL067.xml",
        "ANC__IntroHongKong.xml",
        "LUCorpus-v0.3__20000410_nyt-NEW.xml",
        "KBEval__Brandeis.xml",
        "KBEval__Stanford.xml",
        "LUCorpus-v0.3__SNO-525.xml",
        "PropBank__AetnaLifeAndCasualty.xml",
        "Miscellaneous__Hound-Ch14.xml",
        "NTI__Syria_NuclearOverview.xml",
        "KBEval__cycorp.xml",
        "KBEval__utd-icsi.xml",
        "LUCorpus-v0.3__sw2025-ms98-a-trans.ascii-1-NEW.xml",
        "Miscellaneous__SadatAssassination.xml",
        "KBEval__parc.xml"
    ]
}
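
# Illustrative usage sketch (not part of the original module): these names are
# XML files in the FrameNet 1.x fulltext release, so a local directory path
# such as `fulltext_dir` below is an assumption about your setup.
#
#     import os
#     train_paths = [os.path.join(fulltext_dir, name)
#                    for name in framenet_split["train"]]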

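# Requires the spaCy model to be downloaded first, e.g.:
#     python -m spacy download en_core_web_sm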
_spacy_tokenizer = SpacyTokenizer(language='en_core_web_sm', pos_tags=True)


class Sentence:
    def __init__(self, text):
        """
        Re-tokenize sentence. Map character indices to token indices.
        We assume the char and token span indices are left inclusive and right inclusive.
        """
        self.tokens = _spacy_tokenizer.tokenize(text)

    @property
    def pos(self):
        return [t.pos_ for t in self.tokens]

    @property
    def tag(self):
        return [t.tag_ for t in self.tokens]

    @property
    def starts(self):
        return [t.idx for t in self.tokens]

    @property
    def ends(self):
        return [t.idx_end for t in self.tokens]

    def char2token(self, char_idx):
        """
        If char_idx falls into the a token, return the index of this token.
        Elif char_idx falls into the gap between 2 tokens, return the index of the previous token.
        Elif char_idx is lower than the first token, return 0.
        Elif return the index of the last token.
        """
        if char_idx < self.starts[0]:
            return 0
        if char_idx >= self.starts[-1]:
            return len(self.tokens)-1
        for i_tok, start_idx in enumerate(self.starts):
            if start_idx == char_idx:
                return i_tok
            if start_idx > char_idx:
                return i_tok-1

    def span(self, start, end):
        # The input char span [start, end) is right-exclusive; the returned
        # token span is inclusive on both ends.
        assert end > start
        start, end = self.char2token(start), self.char2token(end - 1)
        assert end >= start
        return start, end

    def __repr__(self):
        return repr(self.tokens)
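

if __name__ == "__main__":
    # Minimal smoke test (illustrative only; assumes en_core_web_sm is
    # installed, and the sentence below is arbitrary).
    sent = Sentence("The quick brown fox jumps over the lazy dog.")
    print(sent)                # tokenized sentence
    print(sent.pos)            # coarse-grained POS tags
    print(sent.starts)         # character offset where each token begins
    # "quick" starts at char 4, so char2token maps 4 to token index 1.
    print(sent.char2token(4))  # -> 1
    # The char span [4, 15) covers "quick brown"; span() returns the
    # inclusive token span (1, 2).
    print(sent.span(4, 15))    # -> (1, 2)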