nobu-g nesv042 committed on
Commit
89991d1
1 Parent(s): 5fbcdbd

Fix jumanpp.apply_to_sentence returning an empty list for sentences longer than ~1700 characters (#2)

Browse files

- Fix jumanpp.apply_to_sentence returning an empty list for sentences longer than ~1700 characters (71e8994479e42c5e0afab380bc4b41941ce1dac0)
- Fix jumanpp.apply_to_sentence returning an empty list (072cc83b373db1749b80d92a4d4fd5c7fbb0ba63)
- Adding rhoknp package reference (3d4a29abd2f3ba3abd94da87127d5115655fbe00)
- Adding rhoknp package reference (911013a3b7b60de980d55fa69287d9da7300c61f)


Co-authored-by: Loc Nguyen <nesv042@users.noreply.huggingface.co>

tokenization_deberta_v2_jumanpp.py CHANGED
@@ -24,7 +24,13 @@ class JumanppTokenizer:
24
  "You need to install rhoknp to use JumanppPreTokenizer. "
25
  "See https://github.com/ku-nlp/rhoknp for installation."
26
  )
 
27
  self.jumanpp = rhoknp.Jumanpp()
28
 
29
  def tokenize(self, text: str) -> str:
30
- return " ".join([morpheme.surf for morpheme in self.jumanpp.apply_to_sentence(text).morphemes])
 
 
 
 
 
 
24
  "You need to install rhoknp to use JumanppPreTokenizer. "
25
  "See https://github.com/ku-nlp/rhoknp for installation."
26
  )
27
+ self.rhoknp = rhoknp
28
  self.jumanpp = rhoknp.Jumanpp()
29
 
30
  def tokenize(self, text: str) -> str:
31
+ morphemes = self.jumanpp.apply_to_sentence(text).morphemes
32
+ if not morphemes:
33
+ doc = self.rhoknp.Document.from_raw_text(text)
34
+ morphemes = self.jumanpp.apply_to_document(doc).morphemes
35
+ return " ".join([morpheme.surf for morpheme in morphemes])
36
+
tokenization_deberta_v2_jumanpp_fast.py CHANGED
@@ -55,6 +55,7 @@ class JumanppPreTokenizer:
55
  "You need to install rhoknp to use JumanppPreTokenizer. "
56
  "See https://github.com/ku-nlp/rhoknp for installation."
57
  )
 
58
  self.jumanpp = rhoknp.Jumanpp()
59
 
60
  def pre_tokenize(self, pretok: PreTokenizedString):
@@ -62,4 +63,7 @@ class JumanppPreTokenizer:
62
 
63
  def jumanpp_split(self, i: int, normalized_string: NormalizedString) -> List[NormalizedString]:
64
  offsets = [morpheme.span for morpheme in self.jumanpp.apply_to_sentence(str(normalized_string)).morphemes]
 
 
 
65
  return [normalized_string[offset[0]:offset[1]] for offset in offsets]
 
55
  "You need to install rhoknp to use JumanppPreTokenizer. "
56
  "See https://github.com/ku-nlp/rhoknp for installation."
57
  )
58
+ self.rhoknp = rhoknp
59
  self.jumanpp = rhoknp.Jumanpp()
60
 
61
  def pre_tokenize(self, pretok: PreTokenizedString):
 
63
 
64
  def jumanpp_split(self, i: int, normalized_string: NormalizedString) -> List[NormalizedString]:
65
  offsets = [morpheme.span for morpheme in self.jumanpp.apply_to_sentence(str(normalized_string)).morphemes]
66
+ if not offsets:
67
+ doc = self.rhoknp.Document.from_raw_text(str(normalized_string))
68
+ offsets = [morpheme.span for morpheme in self.jumanpp.apply_to_document(doc).morphemes]
69
  return [normalized_string[offset[0]:offset[1]] for offset in offsets]