Update README.md
#1 by helonky · opened

README.md CHANGED
```diff
@@ -16,11 +16,11 @@ class JiebaTokenizer(BertTokenizer):
         self.pre_tokenizer = pre_tokenizer
     def _tokenize(self, text, *arg, **kwargs):
         split_tokens = []
-        for
-            if
-                split_tokens.append(
+        for word in self.pre_tokenizer(text):
+            if word in self.vocab:
+                split_tokens.append(word)
             else:
-                split_tokens.extend(super()._tokenize(
+                split_tokens.extend(super()._tokenize(word))
         return split_tokens
 model = BigBirdModel.from_pretrained('Lowin/chinese-bigbird-base-4096')
 tokenizer = JiebaTokenizer.from_pretrained('Lowin/chinese-bigbird-base-4096')
```
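For context, here is the snippet as it reads after this change, as a self-contained, runnable sketch. Everything inside `_tokenize` follows the added lines of the hunk; the `__init__` signature and the jieba-based default `pre_tokenizer` are assumptions, since those lines fall outside the diff shown above. The idea is to pre-segment Chinese text with jieba and keep whole words that already exist in the vocabulary, falling back to BERT's WordPiece tokenization only for out-of-vocabulary words.

```python
# Sketch of the tokenizer after this PR. The __init__ and the jieba default
# pre_tokenizer are assumptions; only _tokenize's body is shown in the hunk.
import jieba
from transformers import BertTokenizer, BigBirdModel


class JiebaTokenizer(BertTokenizer):
    def __init__(self, pre_tokenizer=lambda x: jieba.cut(x, HMM=False),
                 *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.pre_tokenizer = pre_tokenizer

    def _tokenize(self, text, *arg, **kwargs):
        split_tokens = []
        # Pre-segment with jieba; keep a segment as one token if it is in the
        # vocab, otherwise fall back to BERT's WordPiece sub-tokenization.
        for word in self.pre_tokenizer(text):
            if word in self.vocab:
                split_tokens.append(word)
            else:
                split_tokens.extend(super()._tokenize(word))
        return split_tokens


model = BigBirdModel.from_pretrained('Lowin/chinese-bigbird-base-4096')
tokenizer = JiebaTokenizer.from_pretrained('Lowin/chinese-bigbird-base-4096')
```

Renaming the loop variable to `word` also stops it from shadowing the `text` parameter, which makes the vocab-lookup-then-fallback flow easier to follow.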