# sociolome/tools/framenet/fn_util.py
from allennlp.data.tokenizers.spacy_tokenizer import SpacyTokenizer


# Document-level train/dev/test split of the FrameNet fulltext annotations,
# keyed by split name; values are fulltext XML file names.
framenet_split = {
"train": [
"LUCorpus-v0.3__CNN_AARONBROWN_ENG_20051101_215800.partial-NEW.xml",
"NTI__Iran_Chemical.xml",
"NTI__Taiwan_Introduction.xml",
"LUCorpus-v0.3__20000416_xin_eng-NEW.xml",
"NTI__NorthKorea_ChemicalOverview.xml",
"NTI__workAdvances.xml",
"C-4__C-4Text.xml",
"ANC__IntroOfDublin.xml",
"LUCorpus-v0.3__20000420_xin_eng-NEW.xml",
"NTI__BWTutorial_chapter1.xml",
"ANC__110CYL068.xml",
"LUCorpus-v0.3__artb_004_A1_E1_NEW.xml",
"NTI__Iran_Missile.xml",
"LUCorpus-v0.3__20000424_nyt-NEW.xml",
"LUCorpus-v0.3__wsj_1640.mrg-NEW.xml",
"ANC__110CYL070.xml",
"NTI__Iran_Introduction.xml",
"KBEval__lcch.xml",
"ANC__HistoryOfLasVegas.xml",
"LUCorpus-v0.3__wsj_2465.xml",
"KBEval__LCC-M.xml",
"LUCorpus-v0.3__artb_004_A1_E2_NEW.xml",
"LUCorpus-v0.3__AFGP-2002-600002-Trans.xml",
"LUCorpus-v0.3__602CZL285-1.xml",
"PropBank__LomaPrieta.xml",
"NTI__Iran_Biological.xml",
"NTI__Kazakhstan.xml",
"LUCorpus-v0.3__AFGP-2002-600045-Trans.xml",
"NTI__Iran_Nuclear.xml",
"ANC__EntrepreneurAsMadonna.xml",
"SemAnno__Text1.xml",
"ANC__HistoryOfJerusalem.xml",
"NTI__ChinaOverview.xml",
"PropBank__ElectionVictory.xml",
"NTI__Russia_Introduction.xml",
"NTI__SouthAfrica_Introduction.xml",
"LUCorpus-v0.3__20000419_apw_eng-NEW.xml",
"NTI__LibyaCountry1.xml",
"ANC__IntroJamaica.xml",
"QA__IranRelatedQuestions.xml",
"ANC__HistoryOfGreece.xml",
"NTI__NorthKorea_NuclearCapabilities.xml",
"PropBank__BellRinging.xml",
"PropBank__PolemicProgressiveEducation.xml",
"NTI__WMDNews_042106.xml",
"ANC__110CYL200.xml",
"LUCorpus-v0.3__CNN_ENG_20030614_173123.4-NEW-1.xml"
],
"dev": [
"NTI__WMDNews_062606.xml",
"LUCorpus-v0.3__ENRON-pearson-email-25jul02.xml",
"KBEval__MIT.xml",
"ANC__110CYL072.xml",
"LUCorpus-v0.3__20000415_apw_eng-NEW.xml",
"Miscellaneous__Hijack.xml",
"PropBank__TicketSplitting.xml",
"NTI__NorthKorea_NuclearOverview.xml"
],
"test": [
"NTI__NorthKorea_Introduction.xml",
"LUCorpus-v0.3__enron-thread-159550.xml",
"ANC__WhereToHongKong.xml",
"KBEval__atm.xml",
"ANC__112C-L013.xml",
"LUCorpus-v0.3__IZ-060316-01-Trans-1.xml",
"LUCorpus-v0.3__AFGP-2002-602187-Trans.xml",
"ANC__StephanopoulosCrimes.xml",
"ANC__110CYL069.xml",
"ANC__110CYL067.xml",
"ANC__IntroHongKong.xml",
"LUCorpus-v0.3__20000410_nyt-NEW.xml",
"KBEval__Brandeis.xml",
"KBEval__Stanford.xml",
"LUCorpus-v0.3__SNO-525.xml",
"PropBank__AetnaLifeAndCasualty.xml",
"Miscellaneous__Hound-Ch14.xml",
"NTI__Syria_NuclearOverview.xml",
"KBEval__cycorp.xml",
"KBEval__utd-icsi.xml",
"LUCorpus-v0.3__sw2025-ms98-a-trans.ascii-1-NEW.xml",
"Miscellaneous__SadatAssassination.xml",
"KBEval__parc.xml"
]
}
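

# Illustrative helper (not part of the original module): look up which split a
# FrameNet fulltext document belongs to. The name `get_split` is hypothetical.
def get_split(doc_name):
    for split_name, docs in framenet_split.items():
        if doc_name in docs:
            return split_name
    return None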


# Shared spaCy tokenizer, loaded once at module level; POS tagging is enabled
# so that the `pos` and `tag` properties of `Sentence` are populated.
_spacy_tokenizer = SpacyTokenizer(language='en_core_web_sm', pos_tags=True)


class Sentence:
    def __init__(self, text):
        """
        Re-tokenize a sentence and map character indices to token indices.
        Character spans passed to `span` are treated as having an exclusive
        end, while the returned token spans are left- and right-inclusive.
        """
        self.tokens = _spacy_tokenizer.tokenize(text)

    @property
    def pos(self):
        return [t.pos_ for t in self.tokens]

    @property
    def tag(self):
        return [t.tag_ for t in self.tokens]

    @property
    def starts(self):
        return [t.idx for t in self.tokens]

    @property
    def ends(self):
        return [t.idx_end for t in self.tokens]

    def char2token(self, char_idx):
        """
        If char_idx falls inside a token, return the index of that token.
        If char_idx falls in the gap between two tokens, return the index of
        the preceding token.
        If char_idx lies before the first token, return 0.
        Otherwise (at or past the start of the last token), return the index
        of the last token.
        """
        if char_idx < self.starts[0]:
            return 0
        if char_idx >= self.starts[-1]:
            return len(self.tokens) - 1
        for i_tok, start_idx in enumerate(self.starts):
            if start_idx == char_idx:
                return i_tok
            if start_idx > char_idx:
                return i_tok - 1

    def span(self, start, end):
        # Map a character span (inclusive start, exclusive end) to a token
        # span; the returned token indices are left- and right-inclusive.
        assert end > start
        start, end = self.char2token(start), self.char2token(end - 1)
        assert end >= start
        return start, end

    def __repr__(self):
        return self.tokens.__repr__()
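

# Minimal usage sketch (assumes the spaCy model `en_core_web_sm` is installed):
if __name__ == '__main__':
    sent = Sentence("The quick brown fox jumps over the lazy dog.")
    print(sent.pos)
    # The character span of "quick" (start 4, exclusive end 9) maps to the
    # inclusive token span (1, 1).
    print(sent.span(4, 9))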