# FrameNet document-split constants and sentence tokenization helpers.
import bisect

from allennlp.data.tokenizers.spacy_tokenizer import SpacyTokenizer
# Train/dev/test split of FrameNet full-text annotation documents, keyed by
# XML filename.  Presumably the standard FrameNet 1.5 full-text split used in
# frame-semantic parsing work — confirm against the dataset reader that
# consumes this mapping.
framenet_split = {
"train": [
"LUCorpus-v0.3__CNN_AARONBROWN_ENG_20051101_215800.partial-NEW.xml",
"NTI__Iran_Chemical.xml",
"NTI__Taiwan_Introduction.xml",
"LUCorpus-v0.3__20000416_xin_eng-NEW.xml",
"NTI__NorthKorea_ChemicalOverview.xml",
"NTI__workAdvances.xml",
"C-4__C-4Text.xml",
"ANC__IntroOfDublin.xml",
"LUCorpus-v0.3__20000420_xin_eng-NEW.xml",
"NTI__BWTutorial_chapter1.xml",
"ANC__110CYL068.xml",
"LUCorpus-v0.3__artb_004_A1_E1_NEW.xml",
"NTI__Iran_Missile.xml",
"LUCorpus-v0.3__20000424_nyt-NEW.xml",
"LUCorpus-v0.3__wsj_1640.mrg-NEW.xml",
"ANC__110CYL070.xml",
"NTI__Iran_Introduction.xml",
"KBEval__lcch.xml",
"ANC__HistoryOfLasVegas.xml",
"LUCorpus-v0.3__wsj_2465.xml",
"KBEval__LCC-M.xml",
"LUCorpus-v0.3__artb_004_A1_E2_NEW.xml",
"LUCorpus-v0.3__AFGP-2002-600002-Trans.xml",
"LUCorpus-v0.3__602CZL285-1.xml",
"PropBank__LomaPrieta.xml",
"NTI__Iran_Biological.xml",
"NTI__Kazakhstan.xml",
"LUCorpus-v0.3__AFGP-2002-600045-Trans.xml",
"NTI__Iran_Nuclear.xml",
"ANC__EntrepreneurAsMadonna.xml",
"SemAnno__Text1.xml",
"ANC__HistoryOfJerusalem.xml",
"NTI__ChinaOverview.xml",
"PropBank__ElectionVictory.xml",
"NTI__Russia_Introduction.xml",
"NTI__SouthAfrica_Introduction.xml",
"LUCorpus-v0.3__20000419_apw_eng-NEW.xml",
"NTI__LibyaCountry1.xml",
"ANC__IntroJamaica.xml",
"QA__IranRelatedQuestions.xml",
"ANC__HistoryOfGreece.xml",
"NTI__NorthKorea_NuclearCapabilities.xml",
"PropBank__BellRinging.xml",
"PropBank__PolemicProgressiveEducation.xml",
"NTI__WMDNews_042106.xml",
"ANC__110CYL200.xml",
"LUCorpus-v0.3__CNN_ENG_20030614_173123.4-NEW-1.xml"
],
"dev": [
"NTI__WMDNews_062606.xml",
"LUCorpus-v0.3__ENRON-pearson-email-25jul02.xml",
"KBEval__MIT.xml",
"ANC__110CYL072.xml",
"LUCorpus-v0.3__20000415_apw_eng-NEW.xml",
"Miscellaneous__Hijack.xml",
"PropBank__TicketSplitting.xml",
"NTI__NorthKorea_NuclearOverview.xml"
],
"test": [
"NTI__NorthKorea_Introduction.xml",
"LUCorpus-v0.3__enron-thread-159550.xml",
"ANC__WhereToHongKong.xml",
"KBEval__atm.xml",
"ANC__112C-L013.xml",
"LUCorpus-v0.3__IZ-060316-01-Trans-1.xml",
"LUCorpus-v0.3__AFGP-2002-602187-Trans.xml",
"ANC__StephanopoulosCrimes.xml",
"ANC__110CYL069.xml",
"ANC__110CYL067.xml",
"ANC__IntroHongKong.xml",
"LUCorpus-v0.3__20000410_nyt-NEW.xml",
"KBEval__Brandeis.xml",
"KBEval__Stanford.xml",
"LUCorpus-v0.3__SNO-525.xml",
"PropBank__AetnaLifeAndCasualty.xml",
"Miscellaneous__Hound-Ch14.xml",
"NTI__Syria_NuclearOverview.xml",
"KBEval__cycorp.xml",
"KBEval__utd-icsi.xml",
"LUCorpus-v0.3__sw2025-ms98-a-trans.ascii-1-NEW.xml",
"Miscellaneous__SadatAssassination.xml",
"KBEval__parc.xml"
]
}
# Shared module-level tokenizer; instantiating the spaCy pipeline is expensive,
# so every Sentence reuses this one instance.  pos_tags=True enables the POS
# tagger so Token.pos_/tag_ are populated.
_spacy_tokenizer = SpacyTokenizer(language='en_core_web_sm', pos_tags=True)
class Sentence:
    """A spaCy-tokenized sentence with character-to-token index mapping.

    The original comments state that char and token span indices are
    left inclusive and right inclusive; see the NOTE on :meth:`span`
    about the character-side ``end`` actually behaving exclusively.
    """

    def __init__(self, text):
        """Re-tokenize ``text`` with the module-level spaCy tokenizer."""
        self.tokens = _spacy_tokenizer.tokenize(text)

    @property
    def pos(self):
        """Coarse-grained POS tag (``pos_``) for each token."""
        return [t.pos_ for t in self.tokens]

    @property
    def tag(self):
        """Fine-grained POS tag (``tag_``) for each token."""
        return [t.tag_ for t in self.tokens]

    @property
    def starts(self):
        """Character offset at which each token starts."""
        return [t.idx for t in self.tokens]

    @property
    def ends(self):
        """Character end offset (``idx_end``) for each token — presumably
        exclusive, per allennlp's Token; confirm against callers."""
        return [t.idx_end for t in self.tokens]

    def char2token(self, char_idx):
        """Map a character index to a token index.

        If ``char_idx`` falls inside a token, return that token's index.
        If it falls in the gap between two tokens, return the index of the
        previous token.  Indices before the first token map to 0; indices
        at or after the start of the last token map to the last token.
        """
        # Index of the last token whose start offset is <= char_idx.
        # starts is sorted ascending, so binary search replaces the
        # original linear scan with identical results in O(log n).
        i_tok = bisect.bisect_right(self.starts, char_idx) - 1
        # Clamp out-of-range character indices to the valid token range
        # (i_tok is -1 when char_idx precedes the first token).
        return min(max(i_tok, 0), len(self.tokens) - 1)

    def span(self, start, end):
        """Convert a character span to an inclusive (start, end) token span.

        NOTE(review): the ``end - 1`` below implies the incoming character
        span is right-EXCLUSIVE, despite the "right inclusive" wording in
        the class docstring — confirm with callers before relying on it.
        """
        assert end > start
        start, end = self.char2token(start), self.char2token(end - 1)
        assert end >= start
        return start, end

    def __repr__(self):
        return self.tokens.__repr__()