if001 committed
Commit: de00515
Parent: e254bed
Files changed (2)
  1. README.md +7 -0
  2. sentencepiece_ja.py +1 -7
README.md CHANGED
@@ -31,3 +31,10 @@ https://huggingface.co/datasets/izumi-lab/wikinews-ja-20230728
 https://huggingface.co/datasets/izumi-lab/wikinews-en-20230728
 https://huggingface.co/datasets/if001/aozorabunko-clean-sin
 
+
+
+## settings
+```
+all_special_ids: [1, 2, 3, 0, 4]
+all_special_tokens: ['<BOS>', '<EOS>', '<UNK>', '<PAD>', '<MASK>']
+```
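
The new `## settings` section records the tokenizer's special-token layout. As a quick sanity check, a minimal sketch (the Hub repo id `if001/sentencepiece_ja` is an assumption, as is loading the custom tokenizer class via `trust_remote_code`):

```
# Minimal sketch: confirm the special-token settings documented above.
# "if001/sentencepiece_ja" is an assumed repo id; trust_remote_code is
# needed because the tokenizer class lives in sentencepiece_ja.py.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("if001/sentencepiece_ja", trust_remote_code=True)
print(tok.all_special_tokens)  # expected: ['<BOS>', '<EOS>', '<UNK>', '<PAD>', '<MASK>']
print(tok.all_special_ids)     # expected: [1, 2, 3, 0, 4]
```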
sentencepiece_ja.py CHANGED
@@ -14,13 +14,7 @@ class SentencePieceJA(PreTrainedTokenizer):
                  **kwargs):
         from tokenizers import Tokenizer
         self._tokenizer = Tokenizer.from_file(model_path)
-        super().__init__(
-            pad_token=pad,
-            bos_token=bos,
-            eos_token=eos,
-            unk_token=unk,
-            mask_token=mask,
-            **kwargs)
+        super().__init__(**kwargs)
         self.add_special_tokens({
             'pad_token': pad,
             'bos_token': bos,
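
Net effect of the change: the special tokens are no longer passed through `PreTrainedTokenizer.__init__`; they are registered only via `add_special_tokens`. A sketch of how the constructor reads after this commit (the signature defaults and the dict entries past `'bos_token'` are inferred from the removed lines, since the diff context cuts off there; this is not the complete tokenizer implementation):

```
# Sketch of SentencePieceJA.__init__ after this commit (assumptions noted).
from transformers import PreTrainedTokenizer

class SentencePieceJA(PreTrainedTokenizer):
    def __init__(self, model_path,
                 pad='<PAD>', bos='<BOS>', eos='<EOS>',
                 unk='<UNK>', mask='<MASK>',  # defaults assumed from the README settings
                 **kwargs):
        from tokenizers import Tokenizer
        self._tokenizer = Tokenizer.from_file(model_path)
        # The base constructor no longer receives the special tokens...
        super().__init__(**kwargs)
        # ...they are registered in one place, here.
        self.add_special_tokens({
            'pad_token': pad,
            'bos_token': bos,
            'eos_token': eos,   # entries below 'bos_token' inferred,
            'unk_token': unk,   # not visible in the truncated diff context
            'mask_token': mask,
        })
```

Presumably this avoids declaring the same tokens twice (once as constructor kwargs, once in `add_special_tokens`), leaving a single source of truth for them.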