Fix jumanpp.apply_to_sentence returning an empty list for sentences larger than ~1700 characters (#2)
- Fix jumanpp.apply_to_sentence returning an empty list for sentences larger than ~1700 characters (71e8994479e42c5e0afab380bc4b41941ce1dac0)
- Fix jumanpp.apply_to_sentence returning an empty list (072cc83b373db1749b80d92a4d4fd5c7fbb0ba63)
- Adding rhoknp package reference (3d4a29abd2f3ba3abd94da87127d5115655fbe00)
- Adding rhoknp package reference (911013a3b7b60de980d55fa69287d9da7300c61f)
Co-authored-by: Loc Nguyen <nesv042@users.noreply.huggingface.co>
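For context: per this fix, rhoknp's Jumanpp.apply_to_sentence comes back with no morphemes once the input grows past roughly 1700 characters, while the document-level path still works, presumably because it splits the raw text into sentences before analysis. A rough reproduction sketch, not part of this commit (assumes rhoknp and a local Juman++ binary are installed; the sample string is only illustrative):

import rhoknp

jumanpp = rhoknp.Jumanpp()
long_text = "これはとても長い文です" * 200  # illustrative input, well over ~1700 characters

# Sentence-level analysis: reported to return no morphemes for inputs this long.
print(len(jumanpp.apply_to_sentence(long_text).morphemes))   # 0 on affected inputs

# Document-level analysis splits the raw text into sentences before running Juman++.
document = rhoknp.Document.from_raw_text(long_text)
print(len(jumanpp.apply_to_document(document).morphemes))    # > 0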
tokenization_deberta_v2_jumanpp.py
CHANGED
@@ -24,7 +24,13 @@ class JumanppTokenizer:
                 "You need to install rhoknp to use JumanppPreTokenizer. "
                 "See https://github.com/ku-nlp/rhoknp for installation."
             )
+        self.rhoknp = rhoknp
         self.jumanpp = rhoknp.Jumanpp()
 
     def tokenize(self, text: str) -> str:
-        return " ".join([morpheme.surf for morpheme in self.jumanpp.apply_to_sentence(text).morphemes])
+        morphemes = self.jumanpp.apply_to_sentence(text).morphemes
+        if not morphemes:
+            doc = self.rhoknp.Document.from_raw_text(text)
+            morphemes = self.jumanpp.apply_to_document(doc).morphemes
+        return " ".join([morpheme.surf for morpheme in morphemes])
+
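With the patch above, tokenize falls back to document-level analysis whenever the sentence-level call yields nothing, so very long inputs no longer tokenize to an empty string. A hypothetical spot check, not part of the commit:

tokenizer = JumanppTokenizer()  # class defined in tokenization_deberta_v2_jumanpp.py above

print(tokenizer.tokenize("外国人参政権"))                      # normal path via apply_to_sentence
long_result = tokenizer.tokenize("これは長い文です。" * 300)   # triggers the apply_to_document fallback
assert long_result != ""  # previously this came back empty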
tokenization_deberta_v2_jumanpp_fast.py
CHANGED
@@ -55,6 +55,7 @@ class JumanppPreTokenizer:
                 "You need to install rhoknp to use JumanppPreTokenizer. "
                 "See https://github.com/ku-nlp/rhoknp for installation."
             )
+        self.rhoknp = rhoknp
         self.jumanpp = rhoknp.Jumanpp()
 
     def pre_tokenize(self, pretok: PreTokenizedString):
@@ -62,4 +63,7 @@ class JumanppPreTokenizer:
 
     def jumanpp_split(self, i: int, normalized_string: NormalizedString) -> List[NormalizedString]:
         offsets = [morpheme.span for morpheme in self.jumanpp.apply_to_sentence(str(normalized_string)).morphemes]
+        if not offsets:
+            doc = self.rhoknp.Document.from_raw_text(str(normalized_string))
+            offsets = [morpheme.span for morpheme in self.jumanpp.apply_to_document(doc).morphemes]
         return [normalized_string[offset[0]:offset[1]] for offset in offsets]
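The same fallback is mirrored in jumanpp_split, which the tokenizers library invokes on each normalized chunk through the custom pre-tokenizer interface. A hypothetical spot check, not part of the commit (assumes the tokenizers package is installed):

from tokenizers import NormalizedString

pre_tok = JumanppPreTokenizer()
# Long chunk that previously produced no offsets and therefore no splits.
pieces = pre_tok.jumanpp_split(0, NormalizedString("これは長い文です。" * 300))
assert len(pieces) > 0

In actual use an instance like this is attached to a fast tokenizer backend via tokenizers.pre_tokenizers.PreTokenizer.custom(JumanppPreTokenizer()); note that such custom components cannot be serialized with the rest of the tokenizer.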