import json
import os

from transformers import LlamaTokenizer
CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
TOKENIZER_DIR = os.path.join(CURRENT_DIR, "tokenizer")
tokenizer = LlamaTokenizer.from_pretrained(TOKENIZER_DIR)
# Token ids of a Llama-2 chat prompt, "[INST] <<SYS>> ... <</SYS>> ... [/INST]",
# wrapping the default Llama-2 system prompt and a short user turn.
tokens = [ 1, 29961, 25580, 29962, 3532, 14816, 29903, 6778, 13, 3492,
526, 263, 8444, 29892, 3390, 1319, 322, 15993, 20255, 29889,
29849, 1234, 408, 1371, 3730, 408, 1950, 29892, 1550, 1641,
9109, 29889, 29871, 3575, 6089, 881, 451, 3160, 738, 10311,
1319, 29892, 443, 621, 936, 29892, 11021, 391, 29892, 7916,
391, 29892, 304, 27375, 29892, 18215, 29892, 470, 27302, 2793,
29889, 3529, 9801, 393, 596, 20890, 526, 5374, 635, 443,
5365, 1463, 322, 6374, 297, 5469, 29889, 13, 13, 3644,
263, 1139, 947, 451, 1207, 738, 4060, 29892, 470, 338,
451, 2114, 1474, 16165, 261, 296, 29892, 5649, 2020, 2012,
310, 22862, 1554, 451, 1959, 29889, 960, 366, 1016, 29915,
29873, 1073, 278, 1234, 304, 263, 1139, 29892, 3113, 1016,
29915, 29873, 6232, 2089, 2472, 29889, 13, 29966, 829, 14816,
29903, 6778, 13, 13, 15970, 526, 366, 518, 29914, 25580,
29962]
text = tokenizer.decode(tokens)
print(text)
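# Note (not part of the original script): the decoded text above keeps special
# tokens such as the leading "<s>". decode() also accepts skip_special_tokens
# if you only want the plain prompt text:
# print(tokenizer.decode(tokens, skip_special_tokens=True))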
# Dump each token id together with its decoded text and its raw vocab token.
for token_id in tokens:
    print(json.dumps({
        "token_id": token_id,
        "decode_str": tokenizer.decode([token_id]),
        "token": tokenizer.convert_ids_to_tokens(token_id),
    }, ensure_ascii=False))
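# A minimal sketch (not part of the original script) contrasting the two calls
# used above: convert_ids_to_tokens() returns the raw SentencePiece piece
# (possibly with a leading "▁" word-boundary marker or a "<0xNN>" byte token),
# while decode() returns the detokenized surface text.
def piece_vs_decode(token_id):
    piece = tokenizer.convert_ids_to_tokens(token_id)
    surface = tokenizer.decode([token_id])
    print(f"id={token_id} piece={piece!r} surface={surface!r}")

piece_vs_decode(tokens[5])  # a token id taken from the prompt above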
def byte_token():
    """
    Why does "\n" decode to the byte token "<0x0A>"?
    Example ids: 8 11 145
    :return:
    """
    for token_id in [8, 11, 145]:
        token_str = tokenizer.decode([token_id])
        print(token_str)


byte_token()
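# A minimal sketch (not part of the original script), assuming the standard
# Llama SentencePiece vocab layout: ids 0-2 are <unk>/<s>/</s>, and ids 3-258
# are the byte-fallback tokens <0x00>-<0xFF>. Under that layout "\n"
# (byte 0x0A) maps to id 3 + 0x0A = 13, which is why it shows up as "<0x0A>".
def byte_fallback_demo():
    newline_id = 3 + 0x0A  # assumed byte-token offset; verify against your vocab
    token_str = tokenizer.convert_ids_to_tokens(newline_id)
    print(token_str)                             # expected: <0x0A>
    print(repr(tokenizer.decode([newline_id])))  # expected: '\n'
    # Recover the raw byte encoded by a "<0xNN>" token string.
    raw_byte = bytes([int(token_str[1:-1], 16)])
    print(raw_byte)                              # expected: b'\n'


byte_fallback_demo()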