Update from $USER
- README.md +125 -0
- config.json +76 -0
- preprocessor_config.json +8 -0
- pytorch_model.bin +3 -0
- scheduler.pt +3 -0
- special_tokens_map.json +1 -0
- template.README.md +54 -0
- tokenizer_config.json +1 -0
- trainer_state.json +128 -0
- training_args.bin +3 -0
- vocab.json +1 -0
README.md
ADDED
@@ -0,0 +1,125 @@
---
language: ta
datasets:
- common_voice
tags:
- audio
- automatic-speech-recognition
- speech
- xlsr-fine-tuning-week
license: apache-2.0
model-index:
- name: XLSR Wav2Vec2 Tamil by Amrrs
  results:
  - task:
      name: Speech Recognition
      type: automatic-speech-recognition
    dataset:
      name: Common Voice ta
      type: common_voice
      args: ta
    metrics:
    - name: Test WER
      type: wer
      value: 82.94
---

# Wav2Vec2-Large-XLSR-53-Tamil

Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on Tamil using the [Common Voice](https://huggingface.co/datasets/common_voice) dataset.
When using this model, make sure that your speech input is sampled at 16 kHz.

## Usage

The model can be used directly (without a language model) as follows:

```python
import torch
import torchaudio
from datasets import load_dataset
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

test_dataset = load_dataset("common_voice", "ta", split="test[:2%]")

processor = Wav2Vec2Processor.from_pretrained("Amrrs/wav2vec2-large-xlsr-53-tamil")
model = Wav2Vec2ForCTC.from_pretrained("Amrrs/wav2vec2-large-xlsr-53-tamil")

resampler = torchaudio.transforms.Resample(48_000, 16_000)

# Preprocessing the datasets.
# We need to read the audio files as arrays.
def speech_file_to_array_fn(batch):
    speech_array, sampling_rate = torchaudio.load(batch["path"])
    batch["speech"] = resampler(speech_array).squeeze().numpy()
    return batch

test_dataset = test_dataset.map(speech_file_to_array_fn)
inputs = processor(test_dataset["speech"][:2], sampling_rate=16_000, return_tensors="pt", padding=True)

with torch.no_grad():
    logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits

predicted_ids = torch.argmax(logits, dim=-1)

print("Prediction:", processor.batch_decode(predicted_ids))
print("Reference:", test_dataset["sentence"][:2])
```
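
If your own recordings are not 48 kHz Common Voice clips, the same pipeline applies to a local file; here is a minimal sketch (assuming a hypothetical local file `sample.wav` at an arbitrary sample rate):

```python
import torch
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

processor = Wav2Vec2Processor.from_pretrained("Amrrs/wav2vec2-large-xlsr-53-tamil")
model = Wav2Vec2ForCTC.from_pretrained("Amrrs/wav2vec2-large-xlsr-53-tamil")

# Load the file and resample from its native rate to the 16 kHz the model expects.
speech_array, sampling_rate = torchaudio.load("sample.wav")  # hypothetical file name
speech = torchaudio.transforms.Resample(sampling_rate, 16_000)(speech_array).squeeze().numpy()

inputs = processor(speech, sampling_rate=16_000, return_tensors="pt", padding=True)
with torch.no_grad():
    logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits

print(processor.batch_decode(torch.argmax(logits, dim=-1))[0])
```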


## Evaluation

The model can be evaluated as follows on the Tamil test data of Common Voice.


```python
import torch
import torchaudio
from datasets import load_dataset, load_metric
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import re

test_dataset = load_dataset("common_voice", "ta", split="test")
wer = load_metric("wer")

processor = Wav2Vec2Processor.from_pretrained("Amrrs/wav2vec2-large-xlsr-53-tamil")
model = Wav2Vec2ForCTC.from_pretrained("Amrrs/wav2vec2-large-xlsr-53-tamil")
model.to("cuda")

chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“]'
resampler = torchaudio.transforms.Resample(48_000, 16_000)

# Preprocessing the datasets.
# We need to read the audio files as arrays and normalize the reference sentences.
def speech_file_to_array_fn(batch):
    batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
    speech_array, sampling_rate = torchaudio.load(batch["path"])
    batch["speech"] = resampler(speech_array).squeeze().numpy()
    return batch

test_dataset = test_dataset.map(speech_file_to_array_fn)

# Run batched inference and decode the predictions.
def evaluate(batch):
    inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)

    with torch.no_grad():
        logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits

    pred_ids = torch.argmax(logits, dim=-1)
    batch["pred_strings"] = processor.batch_decode(pred_ids)
    return batch

result = test_dataset.map(evaluate, batched=True, batch_size=8)

print("WER: {:.2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
```

**Test Result**: 82.94 %


## Training

The Common Voice `train` and `validation` splits were used for training.

The script used for training can be found [here](https://colab.research.google.com/drive/1-Klkgr4f-C9SanHfVC5RhP0ELUH6TYlN?usp=sharing).
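
A minimal sketch of how those two splits can be combined with the `datasets` library (an illustration of the data selection only, not a reproduction of the linked Colab script):

```python
from datasets import load_dataset, concatenate_datasets

# Combine the Common Voice Tamil train and validation splits into one training set.
common_voice_train = load_dataset("common_voice", "ta", split="train")
common_voice_valid = load_dataset("common_voice", "ta", split="validation")
train_dataset = concatenate_datasets([common_voice_train, common_voice_valid])

print(train_dataset)
```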
config.json
ADDED
@@ -0,0 +1,76 @@
{
  "_name_or_path": "facebook/wav2vec2-large-xlsr-53",
  "activation_dropout": 0.0,
  "apply_spec_augment": true,
  "architectures": [
    "Wav2Vec2ForCTC"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 1,
  "conv_bias": true,
  "conv_dim": [
    512,
    512,
    512,
    512,
    512,
    512,
    512
  ],
  "conv_kernel": [
    10,
    3,
    3,
    3,
    3,
    2,
    2
  ],
  "conv_stride": [
    5,
    2,
    2,
    2,
    2,
    2,
    2
  ],
  "ctc_loss_reduction": "mean",
  "ctc_zero_infinity": false,
  "do_stable_layer_norm": true,
  "eos_token_id": 2,
  "feat_extract_activation": "gelu",
  "feat_extract_dropout": 0.0,
  "feat_extract_norm": "layer",
  "feat_proj_dropout": 0.0,
  "final_dropout": 0.0,
  "gradient_checkpointing": true,
  "hidden_act": "gelu",
  "hidden_dropout": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "layerdrop": 0.1,
  "mask_channel_length": 10,
  "mask_channel_min_space": 1,
  "mask_channel_other": 0.0,
  "mask_channel_prob": 0.0,
  "mask_channel_selection": "static",
  "mask_feature_length": 10,
  "mask_feature_prob": 0.0,
  "mask_time_length": 10,
  "mask_time_min_space": 1,
  "mask_time_other": 0.0,
  "mask_time_prob": 0.05,
  "mask_time_selection": "static",
  "model_type": "wav2vec2",
  "num_attention_heads": 16,
  "num_conv_pos_embedding_groups": 16,
  "num_conv_pos_embeddings": 128,
  "num_feat_extract_layers": 7,
  "num_hidden_layers": 24,
  "pad_token_id": 86,
  "transformers_version": "4.5.0.dev0",
  "vocab_size": 87
}
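
The seven `conv_kernel`/`conv_stride` pairs above determine how raw 16 kHz samples are downsampled into frames before the transformer layers. The sketch below is an illustration derived only from the values in this config (it is not part of the repository) and computes the resulting hop and receptive field:

```python
# Derive the frame rate of the wav2vec2 feature encoder from config.json.
conv_kernel = [10, 3, 3, 3, 3, 2, 2]
conv_stride = [5, 2, 2, 2, 2, 2, 2]

hop = 1              # samples between consecutive output frames
receptive_field = 1  # samples seen by a single output frame
for k, s in zip(conv_kernel, conv_stride):
    receptive_field += (k - 1) * hop
    hop *= s

sampling_rate = 16_000
print(f"hop: {hop} samples ({1000 * hop / sampling_rate:.0f} ms)")                                      # 320 samples, 20 ms
print(f"receptive field: {receptive_field} samples ({1000 * receptive_field / sampling_rate:.0f} ms)")  # 400 samples, 25 ms
```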
preprocessor_config.json
ADDED
@@ -0,0 +1,8 @@
{
  "do_normalize": true,
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": false,
  "sampling_rate": 16000
}
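
These are the settings the feature extractor is loaded with when the processor is created from this repository. A minimal, purely illustrative sketch of building an equivalent extractor by hand from the values above:

```python
from transformers import Wav2Vec2FeatureExtractor

# Mirror preprocessor_config.json explicitly instead of loading it from the Hub.
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=False,
)
print(feature_extractor)
```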
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7feadc845b185cd454fa4e1b4f02a3c377274c74f2548e8a36f7dcf96e30bd9e
size 1262290519
scheduler.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c3c46c269182a38cbc663d3f3625cc130c984c5be30970f17f9b4047c1fff9d4
size 623
special_tokens_map.json
ADDED
@@ -0,0 +1 @@
{"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]"}
template.README.md
ADDED
@@ -0,0 +1,54 @@
---
language:
-
-
thumbnail:
tags:
-
-
-
license:
datasets:
-
-
metrics:
-
-
---

# MyModelName

## Model description

You can embed local or remote images using `![](...)`

## Intended uses & limitations

#### How to use

```python
# You can include sample code which will be formatted
```

#### Limitations and bias

Provide examples of latent issues and potential remediations.

## Training data

Describe the data you used to train the model.
If you initialized it with pre-trained weights, add a link to the pre-trained model card or repository with description of the pre-training data.

## Training procedure

Preprocessing, hardware used, hyperparameters...

## Eval results

### BibTeX entry and citation info

```bibtex
@inproceedings{...,
  year={2020}
}
```
tokenizer_config.json
ADDED
@@ -0,0 +1 @@
{"unk_token": "[UNK]", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "|"}
trainer_state.json
ADDED
@@ -0,0 +1,128 @@
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 29.906542056074766,
  "global_step": 3200,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 3.74,
      "learning_rate": 0.000285,
      "loss": 3.7926,
      "step": 400
    },
    {
      "epoch": 3.74,
      "eval_loss": 2.7348811626434326,
      "eval_runtime": 77.58,
      "eval_samples_per_second": 5.955,
      "eval_wer": 1.0,
      "step": 400
    },
    {
      "epoch": 7.48,
      "learning_rate": 0.0002584870848708487,
      "loss": 0.6512,
      "step": 800
    },
    {
      "epoch": 7.48,
      "eval_loss": 0.3463097810745239,
      "eval_runtime": 78.064,
      "eval_samples_per_second": 5.918,
      "eval_wer": 0.44352893890675243,
      "step": 800
    },
    {
      "epoch": 11.21,
      "learning_rate": 0.00021420664206642064,
      "loss": 0.2406,
      "step": 1200
    },
    {
      "epoch": 11.21,
      "eval_loss": 0.2929766774177551,
      "eval_runtime": 77.4439,
      "eval_samples_per_second": 5.966,
      "eval_wer": 0.38183279742765275,
      "step": 1200
    },
    {
      "epoch": 14.95,
      "learning_rate": 0.0001699261992619926,
      "loss": 0.153,
      "step": 1600
    },
    {
      "epoch": 14.95,
      "eval_loss": 0.29108402132987976,
      "eval_runtime": 77.5593,
      "eval_samples_per_second": 5.957,
      "eval_wer": 0.3659565916398714,
      "step": 1600
    },
    {
      "epoch": 18.69,
      "learning_rate": 0.00012564575645756455,
      "loss": 0.1189,
      "step": 2000
    },
    {
      "epoch": 18.69,
      "eval_loss": 0.3000461161136627,
      "eval_runtime": 77.8803,
      "eval_samples_per_second": 5.932,
      "eval_wer": 0.3516881028938907,
      "step": 2000
    },
    {
      "epoch": 22.43,
      "learning_rate": 8.136531365313652e-05,
      "loss": 0.0902,
      "step": 2400
    },
    {
      "epoch": 22.43,
      "eval_loss": 0.31765106320381165,
      "eval_runtime": 77.559,
      "eval_samples_per_second": 5.957,
      "eval_wer": 0.3432475884244373,
      "step": 2400
    },
    {
      "epoch": 26.17,
      "learning_rate": 3.7084870848708486e-05,
      "loss": 0.0748,
      "step": 2800
    },
    {
      "epoch": 26.17,
      "eval_loss": 0.32380491495132446,
      "eval_runtime": 77.8712,
      "eval_samples_per_second": 5.933,
      "eval_wer": 0.33641479099678456,
      "step": 2800
    },
    {
      "epoch": 29.91,
      "learning_rate": 0.0,
      "loss": 0.0659,
      "step": 3200
    },
    {
      "epoch": 29.91,
      "eval_loss": 0.3231419026851654,
      "eval_runtime": 77.4758,
      "eval_samples_per_second": 5.963,
      "eval_wer": 0.3307877813504823,
      "step": 3200
    }
  ],
  "max_steps": 3210,
  "num_train_epochs": 30,
  "total_flos": 2.032238891438037e+19,
  "trial_name": null,
  "trial_params": null
}
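
The `log_history` above already records the evaluation WER trajectory during training (1.00 at step 400 down to 0.33 at step 3200). A small illustrative snippet (not part of the repository, assuming the file is saved locally as trainer_state.json) for extracting that curve:

```python
import json

# Read the training log and print how the evaluation WER evolved over training.
with open("trainer_state.json") as f:
    state = json.load(f)

for entry in state["log_history"]:
    if "eval_wer" in entry:
        print(f"step {entry['step']:>4}  epoch {entry['epoch']:>5}  eval WER {entry['eval_wer']:.3f}")
```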
training_args.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d1337e36acc2013a93019f879828c2ee996e2e62e14e384176dc91b9aecb2c6e
size 2287
vocab.json
ADDED
@@ -0,0 +1 @@
{"\u0ac5": 0, "\u0aa8": 1, "\u0a85": 2, "\u0ac1": 3, "\u200c": 4, "\u0a90": 5, "\u0ae6": 6, "\u0a8f": 7, "\u0a86": 8, "\u0a87": 9, "\u0ab8": 10, "\u0aa7": 11, "\u0ab6": 12, "\u0ae8": 13, "\u0a94": 14, "\u0abc": 15, "\u0aad": 16, "\u0aaf": 17, "\u0aa0": 18, "\u0aa1": 19, "2": 20, "\u0a95": 21, "u": 22, "\u0aa6": 23, "\u0a89": 24, "\u0ac2": 25, "\u0a9c": 26, "\u0a88": 27, "\u0a9b": 28, "\u0aa3": 29, "0": 30, "\u0ab3": 31, "\u0ac9": 32, "\u0ab0": 33, "\u0a82": 34, "\u0ab2": 35, "\u0aae": 36, "\u0acc": 37, "\u0aac": 38, "\u0aee": 39, "\u0a91": 40, "\u0ae9": 41, "\u0aec": 42, "g": 43, "\u0ac0": 44, "\u0a96": 45, "\u0a9a": 46, "\u0a8a": 47, "e": 48, "\u0a97": 49, "\u0a98": 50, "\u0ac8": 51, "\u0ae0": 52, "\u0a8b": 54, "\u0a83": 55, "\u0aa4": 56, "t": 57, "\u200d": 58, "\u0aab": 59, "\u0ae7": 60, "\u0aef": 61, "\u0acb": 62, "_": 63, "\u0abe": 64, "r": 65, "\u0acd": 66, "\u0aa5": 67, "\u0ab5": 68, "\u0ab9": 69, "\u0ab7": 70, "\u0a9d": 71, "\u0aa2": 72, "\u0aed": 73, "\u0aaa": 74, "\u0a9e": 75, "\u0a93": 76, "\u0ac7": 77, "\u0ac3": 78, "\u0abf": 79, "\u0aeb": 80, "\u0a9f": 81, "\u0ae2": 82, "\u0a81": 83, "l": 84, "|": 53, "[UNK]": 85, "[PAD]": 86}