ndeclarke committed on
Commit
e41a29f
1 Parent(s): b196010

Upload tokenizer

Browse files
Files changed (2) hide show
  1. tokenizer_config.json +1 -2
  2. vocab.json +0 -113
tokenizer_config.json CHANGED
@@ -39,9 +39,8 @@
39
  "eos_token": "</s>",
40
  "model_max_length": 1000000000000000019884624838656,
41
  "pad_token": "[PAD]",
42
- "processor_class": "Wav2Vec2Processor",
43
  "replace_word_delimiter_char": " ",
44
- "target_lang": "tam-512",
45
  "tokenizer_class": "Wav2Vec2CTCTokenizer",
46
  "unk_token": "[UNK]",
47
  "word_delimiter_token": "|"
 
39
  "eos_token": "</s>",
40
  "model_max_length": 1000000000000000019884624838656,
41
  "pad_token": "[PAD]",
 
42
  "replace_word_delimiter_char": " ",
43
+ "target_lang": "tam-32",
44
  "tokenizer_class": "Wav2Vec2CTCTokenizer",
45
  "unk_token": "[UNK]",
46
  "word_delimiter_token": "|"
vocab.json CHANGED
@@ -1,118 +1,5 @@
1
  {
2
- "tam-128": {
3
- "&": 1,
4
- "[PAD]": 53,
5
- "[UNK]": 52,
6
- "_": 2,
7
- "|": 0,
8
- "ஃ": 3,
9
- "அ": 4,
10
- "ஆ": 5,
11
- "இ": 6,
12
- "ஈ": 7,
13
- "உ": 8,
14
- "ஊ": 9,
15
- "எ": 10,
16
- "ஏ": 11,
17
- "ஐ": 12,
18
- "ஒ": 13,
19
- "ஓ": 14,
20
- "ஔ": 15,
21
- "க": 16,
22
- "ங": 17,
23
- "ச": 18,
24
- "ஜ": 19,
25
- "ஞ": 20,
26
- "ட": 21,
27
- "ண": 22,
28
- "த": 23,
29
- "ந": 24,
30
- "ன": 25,
31
- "ப": 26,
32
- "ம": 27,
33
- "ய": 28,
34
- "ர": 29,
35
- "ற": 30,
36
- "ல": 31,
37
- "ள": 32,
38
- "ழ": 33,
39
- "வ": 34,
40
- "ஷ": 35,
41
- "ஸ": 36,
42
- "ஹ": 37,
43
- "ா": 38,
44
- "ி": 39,
45
- "ீ": 40,
46
- "ு": 41,
47
- "ூ": 42,
48
- "ெ": 43,
49
- "ே": 44,
50
- "ை": 45,
51
- "ொ": 46,
52
- "ோ": 47,
53
- "ௌ": 48,
54
- "்": 49,
55
- "ௗ": 50,
56
- "ഥ": 51
57
- },
58
  "tam-32": {
59
- "&": 1,
60
- "[PAD]": 54,
61
- "[UNK]": 53,
62
- "_": 2,
63
- "|": 0,
64
- "¾": 3,
65
- "ஃ": 4,
66
- "அ": 5,
67
- "ஆ": 6,
68
- "இ": 7,
69
- "ஈ": 8,
70
- "உ": 9,
71
- "ஊ": 10,
72
- "எ": 11,
73
- "ஏ": 12,
74
- "ஐ": 13,
75
- "ஒ": 14,
76
- "ஓ": 15,
77
- "ஔ": 16,
78
- "க": 17,
79
- "ங": 18,
80
- "ச": 19,
81
- "ஜ": 20,
82
- "ஞ": 21,
83
- "ட": 22,
84
- "ண": 23,
85
- "த": 24,
86
- "ந": 25,
87
- "ன": 26,
88
- "ப": 27,
89
- "ம": 28,
90
- "ய": 29,
91
- "ர": 30,
92
- "ற": 31,
93
- "ல": 32,
94
- "ள": 33,
95
- "ழ": 34,
96
- "வ": 35,
97
- "ஷ": 36,
98
- "ஸ": 37,
99
- "ஹ": 38,
100
- "ா": 39,
101
- "ி": 40,
102
- "ீ": 41,
103
- "ு": 42,
104
- "ூ": 43,
105
- "ெ": 44,
106
- "ே": 45,
107
- "ை": 46,
108
- "ொ": 47,
109
- "ோ": 48,
110
- "ௌ": 49,
111
- "்": 50,
112
- "ௗ": 51,
113
- "ഥ": 52
114
- },
115
- "tam-512": {
116
  "&": 1,
117
  "[PAD]": 53,
118
  "[UNK]": 52,
 
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "tam-32": {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  "&": 1,
4
  "[PAD]": 53,
5
  "[UNK]": 52,