model added
Browse files- .gitattributes +2 -0
- classifier.ckpt +3 -0
- config.json +3 -0
- embedding_model.ckpt +3 -0
- hyperparams.yaml +52 -0
- label_encoder.txt +310 -0
- test.py +7 -0
.gitattributes
CHANGED
@@ -32,3 +32,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
32 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
33 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
34 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
32 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
33 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
34 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
35 |
+
classifier.ckpt filter=lfs diff=lfs merge=lfs -text
|
36 |
+
embedding_model.ckpt filter=lfs diff=lfs merge=lfs -text
|
classifier.ckpt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6a59a8ff03fc9b88c20e56c05dafca58b4947da0b13109bb8f2a85f0a55f90f1
|
3 |
+
size 237355
|
config.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"speechbrain_interface": "SpeakerRecognition"
|
3 |
+
}
|
embedding_model.ckpt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b137c365f7b6399196dcfde86a60de175309bf3e464aa5b0ebde9651f1695a37
|
3 |
+
size 83310835
|
hyperparams.yaml
ADDED
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# ############################################################################
|
2 |
+
# Model: ECAPA-TDNN (large) for VGGSound audio event classification (308 classes)
|
3 |
+
# ############################################################################
|
4 |
+
|
5 |
+
# Feature parameters
|
6 |
+
n_mels: 80
|
7 |
+
|
8 |
+
# Pretrained model source (HuggingFace repo id)
|
9 |
+
pretrained_path: gorinars/sb-ecapa-vggsound
|
10 |
+
|
11 |
+
# Output parameters
|
12 |
+
out_n_neurons: 308
|
13 |
+
|
14 |
+
# Model params
|
15 |
+
compute_features: !new:speechbrain.lobes.features.Fbank
|
16 |
+
n_mels: !ref <n_mels>
|
17 |
+
|
18 |
+
mean_var_norm: !new:speechbrain.processing.features.InputNormalization
|
19 |
+
norm_type: sentence
|
20 |
+
std_norm: False
|
21 |
+
|
22 |
+
embedding_model: !new:speechbrain.lobes.models.ECAPA_TDNN.ECAPA_TDNN
|
23 |
+
input_size: !ref <n_mels>
|
24 |
+
channels: [1024, 1024, 1024, 1024, 3072]
|
25 |
+
kernel_sizes: [5, 3, 3, 3, 1]
|
26 |
+
dilations: [1, 2, 3, 4, 1]
|
27 |
+
attention_channels: 128
|
28 |
+
lin_neurons: 192
|
29 |
+
|
30 |
+
classifier: !new:speechbrain.lobes.models.ECAPA_TDNN.Classifier
|
31 |
+
input_size: 192
|
32 |
+
out_neurons: !ref <out_n_neurons>
|
33 |
+
|
34 |
+
modules:
|
35 |
+
compute_features: !ref <compute_features>
|
36 |
+
mean_var_norm: !ref <mean_var_norm>
|
37 |
+
embedding_model: !ref <embedding_model>
|
38 |
+
classifier: !ref <classifier>
|
39 |
+
|
40 |
+
label_encoder: !new:speechbrain.dataio.encoder.CategoricalEncoder
|
41 |
+
|
42 |
+
|
43 |
+
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
|
44 |
+
loadables:
|
45 |
+
embedding_model: !ref <embedding_model>
|
46 |
+
classifier: !ref <classifier>
|
47 |
+
label_encoder: !ref <label_encoder>
|
48 |
+
paths:
|
49 |
+
embedding_model: !ref <pretrained_path>/embedding_model.ckpt
|
50 |
+
classifier: !ref <pretrained_path>/classifier.ckpt
|
51 |
+
label_encoder: !ref <pretrained_path>/label_encoder.txt
|
52 |
+
|
label_encoder.txt
ADDED
@@ -0,0 +1,310 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
'air conditioning noise' => 0
|
2 |
+
'air horn' => 1
|
3 |
+
'airplane' => 2
|
4 |
+
'airplane flyby' => 3
|
5 |
+
'alarm clock ringing' => 4
|
6 |
+
'alligators, crocodiles hissing' => 5
|
7 |
+
'ambulance siren' => 6
|
8 |
+
'arc welding' => 7
|
9 |
+
'baby babbling' => 8
|
10 |
+
'baby crying' => 9
|
11 |
+
'baby laughter' => 10
|
12 |
+
'baltimore oriole calling' => 11
|
13 |
+
'barn swallow calling' => 12
|
14 |
+
'basketball bounce' => 13
|
15 |
+
'bathroom ventilation fan running' => 14
|
16 |
+
'beat boxing' => 15
|
17 |
+
'bee, wasp, etc. buzzing' => 16
|
18 |
+
'bird chirping, tweeting' => 17
|
19 |
+
'bird squawking' => 18
|
20 |
+
'bird wings flapping' => 19
|
21 |
+
'black capped chickadee calling' => 20
|
22 |
+
'blowtorch igniting' => 21
|
23 |
+
'bouncing on trampoline' => 22
|
24 |
+
'bowling impact' => 23
|
25 |
+
'bull bellowing' => 24
|
26 |
+
'canary calling' => 25
|
27 |
+
'cap gun shooting' => 26
|
28 |
+
'car engine idling' => 27
|
29 |
+
'car engine knocking' => 28
|
30 |
+
'car engine starting' => 29
|
31 |
+
'car passing by' => 30
|
32 |
+
'cat caterwauling' => 31
|
33 |
+
'cat growling' => 32
|
34 |
+
'cat hissing' => 33
|
35 |
+
'cat meowing' => 34
|
36 |
+
'cat purring' => 35
|
37 |
+
'cattle mooing' => 36
|
38 |
+
'cattle, bovinae cowbell' => 37
|
39 |
+
'cell phone buzzing' => 38
|
40 |
+
'chainsawing trees' => 39
|
41 |
+
'cheetah chirrup' => 40
|
42 |
+
'chicken clucking' => 41
|
43 |
+
'chicken crowing' => 42
|
44 |
+
'child singing' => 43
|
45 |
+
'child speech, kid speaking' => 44
|
46 |
+
'children shouting' => 45
|
47 |
+
'chimpanzee pant-hooting' => 46
|
48 |
+
'chinchilla barking' => 47
|
49 |
+
'chipmunk chirping' => 48
|
50 |
+
'chopping food' => 49
|
51 |
+
'chopping wood' => 50
|
52 |
+
'church bell ringing' => 51
|
53 |
+
'civil defense siren' => 52
|
54 |
+
'cow lowing' => 53
|
55 |
+
'coyote howling' => 54
|
56 |
+
'cricket chirping' => 55
|
57 |
+
'crow cawing' => 56
|
58 |
+
'cuckoo bird calling' => 57
|
59 |
+
'cutting hair with electric trimmers' => 58
|
60 |
+
'dinosaurs bellowing' => 59
|
61 |
+
'disc scratching' => 60
|
62 |
+
'dog barking' => 61
|
63 |
+
'dog baying' => 62
|
64 |
+
'dog bow-wow' => 63
|
65 |
+
'dog growling' => 64
|
66 |
+
'dog howling' => 65
|
67 |
+
'dog whimpering' => 66
|
68 |
+
'donkey, ass braying' => 67
|
69 |
+
'door slamming' => 68
|
70 |
+
'driving buses' => 69
|
71 |
+
'driving motorcycle' => 70
|
72 |
+
'driving snowmobile' => 71
|
73 |
+
'duck quacking' => 72
|
74 |
+
'eagle screaming' => 73
|
75 |
+
'eating with cutlery' => 74
|
76 |
+
'electric grinder grinding' => 75
|
77 |
+
'electric shaver, electric razor shaving' => 76
|
78 |
+
'elephant trumpeting' => 77
|
79 |
+
'eletric blender running' => 78
|
80 |
+
'elk bugling' => 79
|
81 |
+
'engine accelerating, revving, vroom' => 80
|
82 |
+
'female singing' => 81
|
83 |
+
'female speech, woman speaking' => 82
|
84 |
+
'ferret dooking' => 83
|
85 |
+
'fire crackling' => 84
|
86 |
+
'fire truck siren' => 85
|
87 |
+
'fireworks banging' => 86
|
88 |
+
'firing cannon' => 87
|
89 |
+
'firing muskets' => 88
|
90 |
+
'fly, housefly buzzing' => 89
|
91 |
+
'foghorn' => 90
|
92 |
+
'footsteps on snow' => 91
|
93 |
+
'forging swords' => 92
|
94 |
+
'fox barking' => 93
|
95 |
+
'francolin calling' => 94
|
96 |
+
'frog croaking' => 95
|
97 |
+
'gibbon howling' => 96
|
98 |
+
'goat bleating' => 97
|
99 |
+
'golf driving' => 98
|
100 |
+
'goose honking' => 99
|
101 |
+
'hail' => 100
|
102 |
+
'hair dryer drying' => 101
|
103 |
+
'hammering nails' => 102
|
104 |
+
'heart sounds, heartbeat' => 103
|
105 |
+
'hedge trimmer running' => 104
|
106 |
+
'helicopter' => 105
|
107 |
+
'horse clip-clop' => 106
|
108 |
+
'horse neighing' => 107
|
109 |
+
'ice cracking' => 108
|
110 |
+
'ice cream truck, ice cream van' => 109
|
111 |
+
'lathe spinning' => 110
|
112 |
+
'lawn mowing' => 111
|
113 |
+
'lighting firecrackers' => 112
|
114 |
+
'lions growling' => 113
|
115 |
+
'lions roaring' => 114
|
116 |
+
'lip smacking' => 115
|
117 |
+
'machine gun shooting' => 116
|
118 |
+
'magpie calling' => 117
|
119 |
+
'male singing' => 118
|
120 |
+
'male speech, man speaking' => 119
|
121 |
+
'metronome' => 120
|
122 |
+
'missile launch' => 121
|
123 |
+
'mosquito buzzing' => 122
|
124 |
+
'motorboat, speedboat acceleration' => 123
|
125 |
+
'mouse clicking' => 124
|
126 |
+
'mouse pattering' => 125
|
127 |
+
'mouse squeaking' => 126
|
128 |
+
'mynah bird singing' => 127
|
129 |
+
'ocean burbling' => 128
|
130 |
+
'opening or closing car doors' => 129
|
131 |
+
'opening or closing car electric windows' => 130
|
132 |
+
'opening or closing drawers' => 131
|
133 |
+
'orchestra' => 132
|
134 |
+
'otter growling' => 133
|
135 |
+
'owl hooting' => 134
|
136 |
+
'parrot talking' => 135
|
137 |
+
'penguins braying' => 136
|
138 |
+
'people babbling' => 137
|
139 |
+
'people battle cry' => 138
|
140 |
+
'people belly laughing' => 139
|
141 |
+
'people booing' => 140
|
142 |
+
'people burping' => 141
|
143 |
+
'people cheering' => 142
|
144 |
+
'people clapping' => 143
|
145 |
+
'people coughing' => 144
|
146 |
+
'people crowd' => 145
|
147 |
+
'people eating' => 146
|
148 |
+
'people eating apple' => 147
|
149 |
+
'people eating crisps' => 148
|
150 |
+
'people eating noodle' => 149
|
151 |
+
'people farting' => 150
|
152 |
+
'people finger snapping' => 151
|
153 |
+
'people gargling' => 152
|
154 |
+
'people giggling' => 153
|
155 |
+
'people hiccup' => 154
|
156 |
+
'people humming' => 155
|
157 |
+
'people marching' => 156
|
158 |
+
'people nose blowing' => 157
|
159 |
+
'people running' => 158
|
160 |
+
'people screaming' => 159
|
161 |
+
'people shuffling' => 160
|
162 |
+
'people slapping' => 161
|
163 |
+
'people slurping' => 162
|
164 |
+
'people sneezing' => 163
|
165 |
+
'people sniggering' => 164
|
166 |
+
'people sobbing' => 165
|
167 |
+
'people whispering' => 166
|
168 |
+
'people whistling' => 167
|
169 |
+
'pheasant crowing' => 168
|
170 |
+
'pig oinking' => 169
|
171 |
+
'pigeon, dove cooing' => 170
|
172 |
+
'planing timber' => 171
|
173 |
+
'plastic bottle crushing' => 172
|
174 |
+
'playing accordion' => 173
|
175 |
+
'playing acoustic guitar' => 174
|
176 |
+
'playing badminton' => 175
|
177 |
+
'playing bagpipes' => 176
|
178 |
+
'playing banjo' => 177
|
179 |
+
'playing bass drum' => 178
|
180 |
+
'playing bass guitar' => 179
|
181 |
+
'playing bassoon' => 180
|
182 |
+
'playing bongo' => 181
|
183 |
+
'playing bugle' => 182
|
184 |
+
'playing castanets' => 183
|
185 |
+
'playing cello' => 184
|
186 |
+
'playing clarinet' => 185
|
187 |
+
'playing congas' => 186
|
188 |
+
'playing cornet' => 187
|
189 |
+
'playing cymbal' => 188
|
190 |
+
'playing darts' => 189
|
191 |
+
'playing didgeridoo' => 190
|
192 |
+
'playing djembe' => 191
|
193 |
+
'playing double bass' => 192
|
194 |
+
'playing drum kit' => 193
|
195 |
+
'playing electric guitar' => 194
|
196 |
+
'playing electronic organ' => 195
|
197 |
+
'playing erhu' => 196
|
198 |
+
'playing flute' => 197
|
199 |
+
'playing french horn' => 198
|
200 |
+
'playing glockenspiel' => 199
|
201 |
+
'playing gong' => 200
|
202 |
+
'playing guiro' => 201
|
203 |
+
'playing hammond organ' => 202
|
204 |
+
'playing harmonica' => 203
|
205 |
+
'playing harp' => 204
|
206 |
+
'playing harpsichord' => 205
|
207 |
+
'playing hockey' => 206
|
208 |
+
'playing lacrosse' => 207
|
209 |
+
'playing mandolin' => 208
|
210 |
+
'playing marimba, xylophone' => 209
|
211 |
+
'playing oboe' => 210
|
212 |
+
'playing piano' => 211
|
213 |
+
'playing saxophone' => 212
|
214 |
+
'playing shofar' => 213
|
215 |
+
'playing sitar' => 214
|
216 |
+
'playing snare drum' => 215
|
217 |
+
'playing squash' => 216
|
218 |
+
'playing steel guitar, slide guitar' => 217
|
219 |
+
'playing steelpan' => 218
|
220 |
+
'playing synthesizer' => 219
|
221 |
+
'playing tabla' => 220
|
222 |
+
'playing table tennis' => 221
|
223 |
+
'playing tambourine' => 222
|
224 |
+
'playing tennis' => 223
|
225 |
+
'playing theremin' => 224
|
226 |
+
'playing timbales' => 225
|
227 |
+
'playing timpani' => 226
|
228 |
+
'playing trombone' => 227
|
229 |
+
'playing trumpet' => 228
|
230 |
+
'playing tuning fork' => 229
|
231 |
+
'playing tympani' => 230
|
232 |
+
'playing ukulele' => 231
|
233 |
+
'playing vibraphone' => 232
|
234 |
+
'playing violin, fiddle' => 233
|
235 |
+
'playing volleyball' => 234
|
236 |
+
'playing washboard' => 235
|
237 |
+
'playing zither' => 236
|
238 |
+
'police car (siren)' => 237
|
239 |
+
'police radio chatter' => 238
|
240 |
+
'popping popcorn' => 239
|
241 |
+
'printer printing' => 240
|
242 |
+
'pumping water' => 241
|
243 |
+
'race car, auto racing' => 242
|
244 |
+
'railroad car, train wagon' => 243
|
245 |
+
'raining' => 244
|
246 |
+
'rapping' => 245
|
247 |
+
'reversing beeps' => 246
|
248 |
+
'ripping paper' => 247
|
249 |
+
'roller coaster running' => 248
|
250 |
+
'rope skipping' => 249
|
251 |
+
'rowboat, canoe, kayak rowing' => 250
|
252 |
+
'running electric fan' => 251
|
253 |
+
'sailing' => 252
|
254 |
+
'scuba diving' => 253
|
255 |
+
'sea lion barking' => 254
|
256 |
+
'sea waves' => 255
|
257 |
+
'sharpen knife' => 256
|
258 |
+
'sheep bleating' => 257
|
259 |
+
'shot football' => 258
|
260 |
+
'singing bowl' => 259
|
261 |
+
'singing choir' => 260
|
262 |
+
'skateboarding' => 261
|
263 |
+
'skidding' => 262
|
264 |
+
'skiing' => 263
|
265 |
+
'sliding door' => 264
|
266 |
+
'sloshing water' => 265
|
267 |
+
'slot machine' => 266
|
268 |
+
'smoke detector beeping' => 267
|
269 |
+
'snake hissing' => 268
|
270 |
+
'snake rattling' => 269
|
271 |
+
'splashing water' => 270
|
272 |
+
'spraying water' => 271
|
273 |
+
'squishing water' => 272
|
274 |
+
'stream burbling' => 273
|
275 |
+
'strike lighter' => 274
|
276 |
+
'striking bowling' => 275
|
277 |
+
'striking pool' => 276
|
278 |
+
'subway, metro, underground' => 277
|
279 |
+
'swimming' => 278
|
280 |
+
'tap dancing' => 279
|
281 |
+
'tapping guitar' => 280
|
282 |
+
'telephone bell ringing' => 281
|
283 |
+
'thunder' => 282
|
284 |
+
'toilet flushing' => 283
|
285 |
+
'tornado roaring' => 284
|
286 |
+
'tractor digging' => 285
|
287 |
+
'train horning' => 286
|
288 |
+
'train wheels squealing' => 287
|
289 |
+
'train whistling' => 288
|
290 |
+
'turkey gobbling' => 289
|
291 |
+
'typing on computer keyboard' => 290
|
292 |
+
'typing on typewriter' => 291
|
293 |
+
'underwater bubbling' => 292
|
294 |
+
'using sewing machines' => 293
|
295 |
+
'vacuum cleaner cleaning floors' => 294
|
296 |
+
'vehicle horn, car horn, honking' => 295
|
297 |
+
'volcano explosion' => 296
|
298 |
+
'warbler chirping' => 297
|
299 |
+
'waterfall burbling' => 298
|
300 |
+
'whale calling' => 299
|
301 |
+
'wind chime' => 300
|
302 |
+
'wind noise' => 301
|
303 |
+
'wind rustling leaves' => 302
|
304 |
+
'wood thrush calling' => 303
|
305 |
+
'woodpecker pecking tree' => 304
|
306 |
+
'writing on blackboard with chalk' => 305
|
307 |
+
'yodelling' => 306
|
308 |
+
'zebra braying' => 307
|
309 |
+
================
|
310 |
+
'starting_index' => 0
|
test.py
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torchaudio
|
2 |
+
from speechbrain.pretrained import EncoderClassifier
|
3 |
+
# Smoke test: load the model from this directory, embed one audio file, and print the result.
# source="." points at the current directory; presumably hyperparams.yaml and the
# checkpoints are resolved from there -- confirm against SpeechBrain's from_hparams docs.
classifier = EncoderClassifier.from_hparams(source=".")
|
4 |
+
# torchaudio.load returns the waveform and its sample rate; `fs` is not used below.
# NOTE(review): no resampling is done here -- confirm example1.wav already has the
# sample rate the model expects.
signal, fs =torchaudio.load('example1.wav')
|
5 |
+
# Compute embeddings for the loaded signal with the pretrained encoder.
embeddings = classifier.encode_batch(signal)
|
6 |
+
|
7 |
+
# Print the embeddings for manual inspection.
print(embeddings)
|