vitouphy committed on
Commit
60b1816
1 Parent(s): bb8c2ed

add lm info

Browse files
build_lm_processor.ipynb CHANGED
@@ -2,8 +2,8 @@
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
- "execution_count": 1,
6
- "id": "5393aa33",
7
  "metadata": {},
8
  "outputs": [],
9
  "source": [
@@ -24,8 +24,8 @@
24
  },
25
  {
26
  "cell_type": "code",
27
- "execution_count": 2,
28
- "id": "2d34d3b8",
29
  "metadata": {},
30
  "outputs": [],
31
  "source": [
@@ -35,30 +35,18 @@
35
  },
36
  {
37
  "cell_type": "code",
38
- "execution_count": 3,
39
- "id": "f0354cb2",
40
  "metadata": {},
41
- "outputs": [
42
- {
43
- "name": "stderr",
44
- "output_type": "stream",
45
- "text": [
46
- "Loading the LM will be faster if you build a binary file.\n",
47
- "Reading /workspace/xls-r-300m-km/vitouphy/xls-r-300m-km/language_model/km_text.arpa\n",
48
- "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
49
- "Only 81 unigrams passed as vocabulary. Is this small or artificial data?\n",
50
- "****************************************************************************************************\n"
51
- ]
52
- }
53
- ],
54
  "source": [
55
- "processor = AutoProcessor.from_pretrained(\"vitouphy/xls-r-300m-km\")"
56
  ]
57
  },
58
  {
59
  "cell_type": "code",
60
- "execution_count": 4,
61
- "id": "109f28e9",
62
  "metadata": {},
63
  "outputs": [
64
  {
@@ -77,8 +65,8 @@
77
  },
78
  {
79
  "cell_type": "code",
80
- "execution_count": 5,
81
- "id": "300cec39",
82
  "metadata": {},
83
  "outputs": [
84
  {
@@ -88,8 +76,8 @@
88
  "Loading the LM will be faster if you build a binary file.\n",
89
  "Reading /workspace/xls-r-300m-km/data/km_wiki_ngram.arpa\n",
90
  "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
91
- "Found entries of length > 1 in alphabet. This is unusual unless style is BPE, but the alphabet was not recognized as BPE type. Is this correct?\n",
92
- "****************************************************************************************************\n"
93
  ]
94
  }
95
  ],
@@ -102,8 +90,8 @@
102
  },
103
  {
104
  "cell_type": "code",
105
- "execution_count": 8,
106
- "id": "27dd8427",
107
  "metadata": {},
108
  "outputs": [],
109
  "source": [
@@ -116,8 +104,8 @@
116
  },
117
  {
118
  "cell_type": "code",
119
- "execution_count": 9,
120
- "id": "94eb248e",
121
  "metadata": {},
122
  "outputs": [],
123
  "source": [
@@ -126,7 +114,7 @@
126
  },
127
  {
128
  "cell_type": "markdown",
129
- "id": "8f9b3dcc",
130
  "metadata": {},
131
  "source": [
132
  "## Save Model"
@@ -135,7 +123,7 @@
135
  {
136
  "cell_type": "code",
137
  "execution_count": 9,
138
- "id": "8b584690",
139
  "metadata": {},
140
  "outputs": [
141
  {
@@ -160,7 +148,7 @@
160
  {
161
  "cell_type": "code",
162
  "execution_count": 12,
163
- "id": "3712c030",
164
  "metadata": {},
165
  "outputs": [],
166
  "source": [
@@ -170,7 +158,7 @@
170
  {
171
  "cell_type": "code",
172
  "execution_count": null,
173
- "id": "b5d8de20",
174
  "metadata": {},
175
  "outputs": [],
176
  "source": []
 
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
+ "execution_count": 8,
6
+ "id": "4ceb07da",
7
  "metadata": {},
8
  "outputs": [],
9
  "source": [
 
24
  },
25
  {
26
  "cell_type": "code",
27
+ "execution_count": 9,
28
+ "id": "adaa2f36",
29
  "metadata": {},
30
  "outputs": [],
31
  "source": [
 
35
  },
36
  {
37
  "cell_type": "code",
38
+ "execution_count": 10,
39
+ "id": "4f07fc9d",
40
  "metadata": {},
41
+ "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
42
  "source": [
43
+ "processor = AutoProcessor.from_pretrained(\"vitouphy/wav2vec2-xls-r-1b-km\")"
44
  ]
45
  },
46
  {
47
  "cell_type": "code",
48
+ "execution_count": 11,
49
+ "id": "17473aee",
50
  "metadata": {},
51
  "outputs": [
52
  {
 
65
  },
66
  {
67
  "cell_type": "code",
68
+ "execution_count": 12,
69
+ "id": "33fa6838",
70
  "metadata": {},
71
  "outputs": [
72
  {
 
76
  "Loading the LM will be faster if you build a binary file.\n",
77
  "Reading /workspace/xls-r-300m-km/data/km_wiki_ngram.arpa\n",
78
  "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
79
+ "****************************************************************************************************\n",
80
+ "Found entries of length > 1 in alphabet. This is unusual unless style is BPE, but the alphabet was not recognized as BPE type. Is this correct?\n"
81
  ]
82
  }
83
  ],
 
90
  },
91
  {
92
  "cell_type": "code",
93
+ "execution_count": 15,
94
+ "id": "ae0d32e9",
95
  "metadata": {},
96
  "outputs": [],
97
  "source": [
 
104
  },
105
  {
106
  "cell_type": "code",
107
+ "execution_count": 16,
108
+ "id": "d1acffc0",
109
  "metadata": {},
110
  "outputs": [],
111
  "source": [
 
114
  },
115
  {
116
  "cell_type": "markdown",
117
+ "id": "499eb495",
118
  "metadata": {},
119
  "source": [
120
  "## Save Model"
 
123
  {
124
  "cell_type": "code",
125
  "execution_count": 9,
126
+ "id": "bdd7821c",
127
  "metadata": {},
128
  "outputs": [
129
  {
 
148
  {
149
  "cell_type": "code",
150
  "execution_count": 12,
151
+ "id": "3c78a0bf",
152
  "metadata": {},
153
  "outputs": [],
154
  "source": [
 
158
  {
159
  "cell_type": "code",
160
  "execution_count": null,
161
+ "id": "202fbb76",
162
  "metadata": {},
163
  "outputs": [],
164
  "source": []
eval.sh CHANGED
@@ -1,5 +1,5 @@
1
  ./eval.py \
2
- --model_id ./ \
3
  --dataset openslr \
4
  --config km \
5
  --split test \
 
1
  ./eval.py \
2
+ --model_id vitouphy/wav2vec2-xls-r-1b-km \
3
  --dataset openslr \
4
  --config km \
5
  --split test \
preprocessor_config.json CHANGED
@@ -4,6 +4,7 @@
4
  "feature_size": 1,
5
  "padding_side": "right",
6
  "padding_value": 0.0,
 
7
  "return_attention_mask": true,
8
  "sampling_rate": 16000
9
  }
 
4
  "feature_size": 1,
5
  "padding_side": "right",
6
  "padding_value": 0.0,
7
+ "processor_class": "Wav2Vec2ProcessorWithLM",
8
  "return_attention_mask": true,
9
  "sampling_rate": 16000
10
  }
special_tokens_map.json CHANGED
@@ -1 +1 @@
1
- {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}
 
1
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}
tokenizer_config.json CHANGED
@@ -1 +1 @@
1
- {"unk_token": "[UNK]", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "|", "special_tokens_map_file": null, "tokenizer_file": null, "name_or_path": "./", "tokenizer_class": "Wav2Vec2CTCTokenizer"}
 
1
+ {"unk_token": "[UNK]", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "|", "special_tokens_map_file": null, "tokenizer_file": null, "name_or_path": "vitouphy/wav2vec2-xls-r-1b-km", "tokenizer_class": "Wav2Vec2CTCTokenizer", "processor_class": "Wav2Vec2ProcessorWithLM"}