TokenBender
committed on
Commit
•
96b9b67
1
Parent(s):
eb180e7
Upload 4 files
Browse files- added_tokens.json +4 -0
- special_tokens_map.json +17 -52
- tokenizer.json +18 -0
- tokenizer_config.json +22 -40
added_tokens.json
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"<|im_end|>": 49153,
|
3 |
+
"<|im_start|>": 49152
|
4 |
+
}
|
special_tokens_map.json
CHANGED
@@ -1,58 +1,23 @@
|
|
1 |
{
|
2 |
"additional_special_tokens": [
|
3 |
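-
"<|endoftext|>",
|
4 |
-
"<fim_prefix>",
|
5 |
-
"<fim_middle>",
|
6 |
-
"<fim_suffix>",
|
7 |
-
"<fim_pad>",
|
8 |
-
"<repo_name>",
|
9 |
-
"<file_sep>",
|
10 |
-
"<issue_start>",
|
11 |
-
"<issue_comment>",
|
12 |
-
"<issue_closed>",
|
13 |
-
"<jupyter_start>",
|
14 |
-
"<jupyter_text>",
|
15 |
-
"<jupyter_code>",
|
16 |
-
"<jupyter_output>",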
|
17 |
-
"<jupyter_script>",
|
18 |
-
"<empty_output>",
|
19 |
-
"<code_to_intermediate>",
|
20 |
-
"<intermediate_to_code>",
|
21 |
-
"<pr>",
|
22 |
-
"<pr_status>",
|
23 |
-
"<pr_is_merged>",
|
24 |
-
"<pr_base>",
|
25 |
-
"<pr_file>",
|
26 |
-
"<pr_base_code>",
|
27 |
-
"<pr_diff>",
|
28 |
-
"<pr_diff_hunk>",
|
29 |
-
"<pr_comment>",
|
30 |
-
"<pr_event_id>",
|
31 |
-
"<pr_review>",
|
32 |
-
"<pr_review_state>",
|
33 |
-
"<pr_review_comment>",
|
34 |
-
"<pr_in_reply_to_review_id>",
|
35 |
-
"<pr_in_reply_to_comment_id>",
|
36 |
-
"<pr_diff_hunk_comment_line>",
|
37 |
-
"<NAME>",
|
38 |
-
"<EMAIL>",
|
39 |
-
"<KEY>",
|
40 |
-
"<PASSWORD>"
|
41 |
],
|
42 |
-
"bos_token": {
|
43 |
-
"content": "<|endoftext|>",
|
44 |
-
"lstrip": false,
|
45 |
-
"normalized": false,
|
46 |
-
"rstrip": false,
|
47 |
-
"single_word": false
|
48 |
-
},
|
49 |
-
"eos_token": {
|
50 |
-
"content": "<|endoftext|>",
|
51 |
-
"lstrip": false,
|
52 |
-
"normalized": false,
|
53 |
-
"rstrip": false,
|
54 |
-
"single_word": false
|
55 |
-
},
|
56 |
"unk_token": {
|
57 |
"content": "<|endoftext|>",
|
58 |
"lstrip": false,
|
|
|
1 |
{
|
2 |
"additional_special_tokens": [
|
3 |
+
{
|
4 |
+
"content": "<|im_start|>",
|
5 |
+
"lstrip": false,
|
6 |
+
"normalized": false,
|
7 |
+
"rstrip": false,
|
8 |
+
"single_word": false
|
9 |
+
},
|
10 |
+
{
|
11 |
+
"content": "<|im_end|>",
|
12 |
+
"lstrip": false,
|
13 |
+
"normalized": false,
|
14 |
+
"rstrip": false,
|
15 |
+
"single_word": false
|
16 |
+
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
],
|
18 |
+
"bos_token": "<|im_start|>",
|
19 |
+
"eos_token": "<|im_end|>",
|
20 |
+
"pad_token": "<|im_end|>",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
"unk_token": {
|
22 |
"content": "<|endoftext|>",
|
23 |
"lstrip": false,
|
tokenizer.json
CHANGED
@@ -344,6 +344,24 @@
|
|
344 |
"rstrip": false,
|
345 |
"normalized": false,
|
346 |
"special": true
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
347 |
}
|
348 |
],
|
349 |
"normalizer": null,
|
|
|
344 |
"rstrip": false,
|
345 |
"normalized": false,
|
346 |
"special": true
|
347 |
+
},
|
348 |
+
{
|
349 |
+
"id": 49152,
|
350 |
+
"content": "<|im_start|>",
|
351 |
+
"single_word": false,
|
352 |
+
"lstrip": false,
|
353 |
+
"rstrip": false,
|
354 |
+
"normalized": false,
|
355 |
+
"special": true
|
356 |
+
},
|
357 |
+
{
|
358 |
+
"id": 49153,
|
359 |
+
"content": "<|im_end|>",
|
360 |
+
"single_word": false,
|
361 |
+
"lstrip": false,
|
362 |
+
"rstrip": false,
|
363 |
+
"normalized": false,
|
364 |
+
"special": true
|
365 |
}
|
366 |
],
|
367 |
"normalizer": null,
|
tokenizer_config.json
CHANGED
@@ -304,52 +304,34 @@
|
|
304 |
"rstrip": false,
|
305 |
"single_word": false,
|
306 |
"special": true
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
307 |
}
|
308 |
},
|
309 |
"additional_special_tokens": [
|
310 |
-
"<|endoftext|>",
|
311 |
-
"<fim_prefix>",
|
312 |
-
"<fim_middle>",
|
313 |
-
"<fim_suffix>",
|
314 |
-
"<fim_pad>",
|
315 |
-
"<repo_name>",
|
316 |
-
"<file_sep>",
|
317 |
-
"<issue_start>",
|
318 |
-
"<issue_comment>",
|
319 |
-
"<issue_closed>",
|
320 |
-
"<jupyter_start>",
|
321 |
-
"<jupyter_text>",
|
322 |
-
"<jupyter_code>",
|
323 |
-
"<jupyter_output>",
|
324 |
-
"<jupyter_script>",
|
325 |
-
"<empty_output>",
|
326 |
-
"<code_to_intermediate>",
|
327 |
-
"<intermediate_to_code>",
|
328 |
-
"<pr>",
|
329 |
-
"<pr_status>",
|
330 |
-
"<pr_is_merged>",
|
331 |
-
"<pr_base>",
|
332 |
-
"<pr_file>",
|
333 |
-
"<pr_base_code>",
|
334 |
-
"<pr_diff>",
|
335 |
-
"<pr_diff_hunk>",
|
336 |
-
"<pr_comment>",
|
337 |
-
"<pr_event_id>",
|
338 |
-
"<pr_review>",
|
339 |
-
"<pr_review_state>",
|
340 |
-
"<pr_review_comment>",
|
341 |
-
"<pr_in_reply_to_review_id>",
|
342 |
-
"<pr_in_reply_to_comment_id>",
|
343 |
-
"<pr_diff_hunk_comment_line>",
|
344 |
-
"<NAME>",
|
345 |
-
"<EMAIL>",
|
346 |
-
"<KEY>",
|
347 |
-
"<PASSWORD>"
|
348 |
],
|
349 |
-
"bos_token": "<|endoftext|>",
|
|
|
350 |
"clean_up_tokenization_spaces": true,
|
351 |
-
"eos_token": "<|endoftext|>",
|
352 |
"model_max_length": 1000000000000000019884624838656,
|
|
|
353 |
"tokenizer_class": "GPT2Tokenizer",
|
354 |
"unk_token": "<|endoftext|>",
|
355 |
"vocab_size": 49152
|
|
|
304 |
"rstrip": false,
|
305 |
"single_word": false,
|
306 |
"special": true
|
307 |
+
},
|
308 |
+
"49152": {
|
309 |
+
"content": "<|im_start|>",
|
310 |
+
"lstrip": false,
|
311 |
+
"normalized": false,
|
312 |
+
"rstrip": false,
|
313 |
+
"single_word": false,
|
314 |
+
"special": true
|
315 |
+
},
|
316 |
+
"49153": {
|
317 |
+
"content": "<|im_end|>",
|
318 |
+
"lstrip": false,
|
319 |
+
"normalized": false,
|
320 |
+
"rstrip": false,
|
321 |
+
"single_word": false,
|
322 |
+
"special": true
|
323 |
}
|
324 |
},
|
325 |
"additional_special_tokens": [
|
326 |
+
"<|im_start|>",
|
327 |
+
"<|im_end|>"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
328 |
],
|
329 |
+
"bos_token": "<|im_start|>",
|
330 |
+
"chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
|
331 |
"clean_up_tokenization_spaces": true,
|
332 |
+
"eos_token": "<|im_end|>",
|
333 |
"model_max_length": 1000000000000000019884624838656,
|
334 |
+
"pad_token": "<|im_end|>",
|
335 |
"tokenizer_class": "GPT2Tokenizer",
|
336 |
"unk_token": "<|endoftext|>",
|
337 |
"vocab_size": 49152
|