import json
import os

from transformers import LlamaTokenizer

CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
TOKENIZER_DIR = os.path.join(CURRENT_DIR, "tokenizer")
tokenizer = LlamaTokenizer.from_pretrained(TOKENIZER_DIR)

# Token ids of a Llama-2 chat prompt (default system prompt plus a user question).
tokens = [1, 29961, 25580, 29962, 3532, 14816, 29903, 6778, 13, 3492,
          526, 263, 8444, 29892, 3390, 1319, 322, 15993, 20255, 29889,
          29849, 1234, 408, 1371, 3730, 408, 1950, 29892, 1550, 1641,
          9109, 29889, 29871, 3575, 6089, 881, 451, 3160, 738, 10311,
          1319, 29892, 443, 621, 936, 29892, 11021, 391, 29892, 7916,
          391, 29892, 304, 27375, 29892, 18215, 29892, 470, 27302, 2793,
          29889, 3529, 9801, 393, 596, 20890, 526, 5374, 635, 443,
          5365, 1463, 322, 6374, 297, 5469, 29889, 13, 13, 3644,
          263, 1139, 947, 451, 1207, 738, 4060, 29892, 470, 338,
          451, 2114, 1474, 16165, 261, 296, 29892, 5649, 2020, 2012,
          310, 22862, 1554, 451, 1959, 29889, 960, 366, 1016, 29915,
          29873, 1073, 278, 1234, 304, 263, 1139, 29892, 3113, 1016,
          29915, 29873, 6232, 2089, 2472, 29889, 13, 29966, 829, 14816,
          29903, 6778, 13, 13, 15970, 526, 366, 518, 29914, 25580,
          29962]

# Decode the whole sequence back to text.
text = tokenizer.decode(tokens)
print(text)
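# The decoded text printed above should be the Llama-2 chat template,
# "[INST] <<SYS>>\n...\n<</SYS>>\n\nWho are you [/INST]", i.e. the default
# system prompt followed by the user question (assuming the standard Llama-2
# tokenizer files are present in TOKENIZER_DIR).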
# Dump each token id together with its decoded string and its vocabulary piece.
for token_id in tokens:
    print(json.dumps({
        "token_id": token_id,
        "decode_str": tokenizer.decode([token_id]),
        "token": tokenizer.convert_ids_to_tokens(token_id),
    }, ensure_ascii=False))
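# A sample line from the loop above (the exact mapping depends on the
# vocabulary in TOKENIZER_DIR; with the standard Llama-2 vocabulary the
# newline id 13 maps to the byte-fallback piece):
# {"token_id": 13, "decode_str": "\n", "token": "<0x0A>"}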
def byte_token():
    """
    Why is "\n" the byte token "<0x0A>"?
    Decode ids 8, 11 and 145 one by one to inspect byte-level tokens.
    :return:
    """
    for token_id in [8, 11, 145]:
        token_str = tokenizer.decode([token_id])
        print(token_str)


byte_token()
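
# A minimal follow-up sketch (assumption: the standard Llama-2 SentencePiece
# vocabulary, where ids 3..258 are the byte-fallback pieces <0x00>..<0xFF>).
# "\n" has no piece of its own, so it falls back to the single byte 0x0A,
# whose id is 3 + 0x0A = 13 and which round-trips back to a real newline.
def newline_byte_fallback():
    newline_id = tokenizer.convert_tokens_to_ids("<0x0A>")
    print(newline_id)                                   # expected: 13
    print(tokenizer.convert_ids_to_tokens(newline_id))  # expected: <0x0A>
    print(repr(tokenizer.decode([newline_id])))         # expected: '\n'


newline_byte_fallback()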