|
from transformers import DebertaV2Tokenizer |
|
|
|
|
|
class DebertaV2JumanppTokenizer(DebertaV2Tokenizer):
    """`DebertaV2Tokenizer` variant that segments raw text with Juman++ first.

    The input text is passed through `JumanppTokenizer.tokenize`, which
    returns the morpheme surface forms joined by single spaces, before the
    rest of the parent tokenizer's processing sees it.
    """

    def __init__(self, *args, **kwargs):
        """Forward all arguments to the parent class and create the segmenter."""
        super().__init__(*args, **kwargs)
        # Constructing JumanppTokenizer imports rhoknp and raises ImportError
        # when that package is missing.
        self.juman_tokenizer = JumanppTokenizer()

    def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs) -> tuple[str, dict]:
        """Segment `text` and optionally prepend a single space.

        Args:
            text: Raw input string.
            is_split_into_words: When truthy, a leading space is prepended.
            **kwargs: Extra options. `add_prefix_space` is consumed here
                (default `False`); everything else is returned untouched.

        Returns:
            A `(text, kwargs)` tuple holding the segmented text and the
            keyword arguments that were not consumed.
        """
        segmented = self.juman_tokenizer.tokenize(text)

        # Always pop the option so it never leaks into the returned kwargs.
        wants_prefix = kwargs.pop("add_prefix_space", False)
        if not (is_split_into_words or wants_prefix):
            return segmented, kwargs
        return " " + segmented, kwargs
|
|
|
|
|
class JumanppTokenizer:
    """Thin wrapper that segments text into morphemes with Juman++ via rhoknp.

    Attributes are set in `__init__`:
        rhoknp: the imported `rhoknp` module (kept so `tokenize` can reach
            `rhoknp.Document` without a module-level import).
        jumanpp: the `rhoknp.Jumanpp()` processor instance.
    """

    def __init__(self):
        """Import rhoknp lazily and create the Juman++ processor.

        Raises:
            ImportError: If `rhoknp` is not installed. The original import
                failure is attached as `__cause__`.
        """
        # Imported here rather than at module level so this file can be
        # imported even when rhoknp is not installed.
        try:
            import rhoknp
        except ImportError as err:
            raise ImportError(
                "You need to install rhoknp to use JumanppTokenizer. "
                "See https://github.com/ku-nlp/rhoknp for installation."
            ) from err
        self.rhoknp = rhoknp
        self.jumanpp = rhoknp.Jumanpp()

    def tokenize(self, text: str) -> str:
        """Return the morpheme surface forms of `text` joined by single spaces.

        Args:
            text: Raw input string.

        Returns:
            The `surf` attribute of every morpheme, separated by `" "`.
            An empty morpheme sequence yields an empty string.
        """
        try:
            morphemes = self.jumanpp.apply_to_sentence(text).morphemes
        except RuntimeError:
            # Sentence-level analysis failed; retry by treating the input as
            # a whole document.
            # NOTE(review): presumably this covers inputs rhoknp cannot
            # handle as a single sentence - confirm which RuntimeError cases
            # are expected here.
            doc = self.rhoknp.Document.from_raw_text(text)
            morphemes = self.jumanpp.apply_to_document(doc).morphemes
        return " ".join(morpheme.surf for morpheme in morphemes)
|
|