hadiqa123 committed on
Commit
daa41e6
1 Parent(s): 2ae91f5

add tokenizer

Browse files
Files changed (4) hide show
  1. added_tokens.json +2 -106
  2. special_tokens_map.json +18 -129
  3. tokenizer_config.json +9 -32
  4. vocab.json +0 -0
added_tokens.json CHANGED
@@ -1,108 +1,4 @@
1
  {
2
- "<|af|>": 50327,
3
- "<|am|>": 50334,
4
- "<|ar|>": 50272,
5
- "<|as|>": 50350,
6
- "<|az|>": 50304,
7
- "<|ba|>": 50355,
8
- "<|be|>": 50330,
9
- "<|bg|>": 50292,
10
- "<|bn|>": 50302,
11
- "<|bo|>": 50347,
12
- "<|br|>": 50309,
13
- "<|bs|>": 50315,
14
- "<|ca|>": 50270,
15
- "<|cs|>": 50283,
16
- "<|cy|>": 50297,
17
- "<|da|>": 50285,
18
- "<|de|>": 50261,
19
- "<|el|>": 50281,
20
- "<|en|>": 50259,
21
- "<|es|>": 50262,
22
- "<|et|>": 50307,
23
- "<|eu|>": 50310,
24
- "<|fa|>": 50300,
25
- "<|fi|>": 50277,
26
- "<|fo|>": 50338,
27
- "<|fr|>": 50265,
28
- "<|gl|>": 50319,
29
- "<|gu|>": 50333,
30
- "<|haw|>": 50352,
31
- "<|ha|>": 50354,
32
- "<|he|>": 50279,
33
- "<|hi|>": 50276,
34
- "<|hr|>": 50291,
35
- "<|ht|>": 50339,
36
- "<|hu|>": 50286,
37
- "<|hy|>": 50312,
38
- "<|id|>": 50275,
39
- "<|is|>": 50311,
40
- "<|it|>": 50274,
41
- "<|ja|>": 50266,
42
- "<|jw|>": 50356,
43
- "<|ka|>": 50329,
44
- "<|kk|>": 50316,
45
- "<|km|>": 50323,
46
- "<|kn|>": 50306,
47
- "<|ko|>": 50264,
48
- "<|la|>": 50294,
49
- "<|lb|>": 50345,
50
- "<|ln|>": 50353,
51
- "<|lo|>": 50336,
52
- "<|lt|>": 50293,
53
- "<|lv|>": 50301,
54
- "<|mg|>": 50349,
55
- "<|mi|>": 50295,
56
- "<|mk|>": 50308,
57
- "<|ml|>": 50296,
58
- "<|mn|>": 50314,
59
- "<|mr|>": 50320,
60
- "<|ms|>": 50282,
61
- "<|mt|>": 50343,
62
- "<|my|>": 50346,
63
- "<|ne|>": 50313,
64
- "<|nl|>": 50271,
65
- "<|nn|>": 50342,
66
- "<|nocaptions|>": 50362,
67
- "<|notimestamps|>": 50363,
68
- "<|no|>": 50288,
69
- "<|oc|>": 50328,
70
- "<|pa|>": 50321,
71
- "<|pl|>": 50269,
72
- "<|ps|>": 50340,
73
- "<|pt|>": 50267,
74
- "<|ro|>": 50284,
75
- "<|ru|>": 50263,
76
- "<|sa|>": 50344,
77
- "<|sd|>": 50332,
78
- "<|si|>": 50322,
79
- "<|sk|>": 50298,
80
- "<|sl|>": 50305,
81
- "<|sn|>": 50324,
82
- "<|so|>": 50326,
83
- "<|sq|>": 50317,
84
- "<|sr|>": 50303,
85
- "<|startoflm|>": 50360,
86
- "<|startofprev|>": 50361,
87
- "<|startoftranscript|>": 50258,
88
- "<|su|>": 50357,
89
- "<|sv|>": 50273,
90
- "<|sw|>": 50318,
91
- "<|ta|>": 50287,
92
- "<|te|>": 50299,
93
- "<|tg|>": 50331,
94
- "<|th|>": 50289,
95
- "<|tk|>": 50341,
96
- "<|tl|>": 50348,
97
- "<|transcribe|>": 50359,
98
- "<|translate|>": 50358,
99
- "<|tr|>": 50268,
100
- "<|tt|>": 50351,
101
- "<|uk|>": 50280,
102
- "<|ur|>": 50290,
103
- "<|uz|>": 50337,
104
- "<|vi|>": 50278,
105
- "<|yi|>": 50335,
106
- "<|yo|>": 50325,
107
- "<|zh|>": 50260
108
  }
 
1
  {
2
+ "</s>": 47,
3
+ "<s>": 46
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  }
special_tokens_map.json CHANGED
@@ -1,133 +1,22 @@
1
  {
2
  "additional_special_tokens": [
3
- "<|endoftext|>",
4
- "<|startoftranscript|>",
5
- "<|en|>",
6
- "<|zh|>",
7
- "<|de|>",
8
- "<|es|>",
9
- "<|ru|>",
10
- "<|ko|>",
11
- "<|fr|>",
12
- "<|ja|>",
13
- "<|pt|>",
14
- "<|tr|>",
15
- "<|pl|>",
16
- "<|ca|>",
17
- "<|nl|>",
18
- "<|ar|>",
19
- "<|sv|>",
20
- "<|it|>",
21
- "<|id|>",
22
- "<|hi|>",
23
- "<|fi|>",
24
- "<|vi|>",
25
- "<|he|>",
26
- "<|uk|>",
27
- "<|el|>",
28
- "<|ms|>",
29
- "<|cs|>",
30
- "<|ro|>",
31
- "<|da|>",
32
- "<|hu|>",
33
- "<|ta|>",
34
- "<|no|>",
35
- "<|th|>",
36
- "<|ur|>",
37
- "<|hr|>",
38
- "<|bg|>",
39
- "<|lt|>",
40
- "<|la|>",
41
- "<|mi|>",
42
- "<|ml|>",
43
- "<|cy|>",
44
- "<|sk|>",
45
- "<|te|>",
46
- "<|fa|>",
47
- "<|lv|>",
48
- "<|bn|>",
49
- "<|sr|>",
50
- "<|az|>",
51
- "<|sl|>",
52
- "<|kn|>",
53
- "<|et|>",
54
- "<|mk|>",
55
- "<|br|>",
56
- "<|eu|>",
57
- "<|is|>",
58
- "<|hy|>",
59
- "<|ne|>",
60
- "<|mn|>",
61
- "<|bs|>",
62
- "<|kk|>",
63
- "<|sq|>",
64
- "<|sw|>",
65
- "<|gl|>",
66
- "<|mr|>",
67
- "<|pa|>",
68
- "<|si|>",
69
- "<|km|>",
70
- "<|sn|>",
71
- "<|yo|>",
72
- "<|so|>",
73
- "<|af|>",
74
- "<|oc|>",
75
- "<|ka|>",
76
- "<|be|>",
77
- "<|tg|>",
78
- "<|sd|>",
79
- "<|gu|>",
80
- "<|am|>",
81
- "<|yi|>",
82
- "<|lo|>",
83
- "<|uz|>",
84
- "<|fo|>",
85
- "<|ht|>",
86
- "<|ps|>",
87
- "<|tk|>",
88
- "<|nn|>",
89
- "<|mt|>",
90
- "<|sa|>",
91
- "<|lb|>",
92
- "<|my|>",
93
- "<|bo|>",
94
- "<|tl|>",
95
- "<|mg|>",
96
- "<|as|>",
97
- "<|tt|>",
98
- "<|haw|>",
99
- "<|ln|>",
100
- "<|ha|>",
101
- "<|ba|>",
102
- "<|jw|>",
103
- "<|su|>",
104
- "<|translate|>",
105
- "<|transcribe|>",
106
- "<|startoflm|>",
107
- "<|startofprev|>",
108
- "<|nocaptions|>",
109
- "<|notimestamps|>"
110
  ],
111
- "bos_token": {
112
- "content": "<|endoftext|>",
113
- "lstrip": false,
114
- "normalized": true,
115
- "rstrip": false,
116
- "single_word": false
117
- },
118
- "eos_token": {
119
- "content": "<|endoftext|>",
120
- "lstrip": false,
121
- "normalized": true,
122
- "rstrip": false,
123
- "single_word": false
124
- },
125
- "pad_token": "<|endoftext|>",
126
- "unk_token": {
127
- "content": "<|endoftext|>",
128
- "lstrip": false,
129
- "normalized": true,
130
- "rstrip": false,
131
- "single_word": false
132
- }
133
  }
 
1
  {
2
  "additional_special_tokens": [
3
+ {
4
+ "content": "<s>",
5
+ "lstrip": false,
6
+ "normalized": true,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ {
11
+ "content": "</s>",
12
+ "lstrip": false,
13
+ "normalized": true,
14
+ "rstrip": false,
15
+ "single_word": false
16
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  ],
18
+ "bos_token": "<s>",
19
+ "eos_token": "</s>",
20
+ "pad_token": "[PAD]",
21
+ "unk_token": "[UNK]"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  }
tokenizer_config.json CHANGED
@@ -1,35 +1,12 @@
1
  {
2
- "add_bos_token": false,
3
- "add_prefix_space": false,
4
- "bos_token": {
5
- "__type": "AddedToken",
6
- "content": "<|endoftext|>",
7
- "lstrip": false,
8
- "normalized": true,
9
- "rstrip": false,
10
- "single_word": false
11
- },
12
- "eos_token": {
13
- "__type": "AddedToken",
14
- "content": "<|endoftext|>",
15
- "lstrip": false,
16
- "normalized": true,
17
- "rstrip": false,
18
- "single_word": false
19
- },
20
- "errors": "replace",
21
- "model_max_length": 1024,
22
- "pad_token": null,
23
- "processor_class": "WhisperProcessor",
24
- "return_attention_mask": false,
25
  "special_tokens_map_file": null,
26
- "tokenizer_class": "WhisperTokenizer",
27
- "unk_token": {
28
- "__type": "AddedToken",
29
- "content": "<|endoftext|>",
30
- "lstrip": false,
31
- "normalized": true,
32
- "rstrip": false,
33
- "single_word": false
34
- }
35
  }
 
1
  {
2
+ "bos_token": "<s>",
3
+ "do_lower_case": false,
4
+ "eos_token": "</s>",
5
+ "name_or_path": "./",
6
+ "pad_token": "[PAD]",
7
+ "replace_word_delimiter_char": " ",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  "special_tokens_map_file": null,
9
+ "tokenizer_class": "Wav2Vec2CTCTokenizer",
10
+ "unk_token": "[UNK]",
11
+ "word_delimiter_token": "|"
 
 
 
 
 
 
12
  }
vocab.json CHANGED
The diff for this file is too large to render. See raw diff