Haopeng committed on
Commit
f70a1dc
1 Parent(s): 4faedb5

Change Hypothesis highlight and WER

Browse files
Files changed (3) hide show
  1. .gitignore +534 -0
  2. app.py +12 -4
  3. local/wer.py +33 -0
.gitignore ADDED
@@ -0,0 +1,534 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # generated by: https://github.com/michaelliao/gitignore-online-generator
2
+
3
+ #################### Python.gitignore ####################
4
+
5
+ # Byte-compiled / optimized / DLL files
6
+ __pycache__/
7
+ *.py[cod]
8
+ *$py.class
9
+
10
+ # C extensions
11
+ *.so
12
+
13
+ # Distribution / packaging
14
+ .Python
15
+ build/
16
+ develop-eggs/
17
+ dist/
18
+ downloads/
19
+ eggs/
20
+ .eggs/
21
+ lib/
22
+ lib64/
23
+ parts/
24
+ sdist/
25
+ var/
26
+ wheels/
27
+ share/python-wheels/
28
+ *.egg-info/
29
+ .installed.cfg
30
+ *.egg
31
+ MANIFEST
32
+
33
+ # PyInstaller
34
+ # Usually these files are written by a python script from a template
35
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
36
+ *.manifest
37
+ *.spec
38
+
39
+ # Installer logs
40
+ pip-log.txt
41
+ pip-delete-this-directory.txt
42
+
43
+ # Unit test / coverage reports
44
+ htmlcov/
45
+ .tox/
46
+ .nox/
47
+ .coverage
48
+ .coverage.*
49
+ .cache
50
+ nosetests.xml
51
+ coverage.xml
52
+ *.cover
53
+ *.py,cover
54
+ .hypothesis/
55
+ .pytest_cache/
56
+ cover/
57
+
58
+ # Translations
59
+ *.mo
60
+ *.pot
61
+
62
+ # Django stuff:
63
+ *.log
64
+ local_settings.py
65
+ db.sqlite3
66
+ db.sqlite3-journal
67
+
68
+ # Flask stuff:
69
+ instance/
70
+ .webassets-cache
71
+
72
+ # Scrapy stuff:
73
+ .scrapy
74
+
75
+ # Sphinx documentation
76
+ docs/_build/
77
+
78
+ # PyBuilder
79
+ .pybuilder/
80
+ target/
81
+
82
+ # Jupyter Notebook
83
+ .ipynb_checkpoints
84
+
85
+ # IPython
86
+ profile_default/
87
+ ipython_config.py
88
+
89
+ # pyenv
90
+ # For a library or package, you might want to ignore these files since the code is
91
+ # intended to run in multiple environments; otherwise, check them in:
92
+ # .python-version
93
+
94
+ # pipenv
95
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
96
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
97
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
98
+ # install all needed dependencies.
99
+ #Pipfile.lock
100
+
101
+ # poetry
102
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
103
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
104
+ # commonly ignored for libraries.
105
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
106
+ #poetry.lock
107
+
108
+ # pdm
109
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
110
+ #pdm.lock
111
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
112
+ # in version control.
113
+ # https://pdm.fming.dev/#use-with-ide
114
+ .pdm.toml
115
+
116
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
117
+ __pypackages__/
118
+
119
+ # Celery stuff
120
+ celerybeat-schedule
121
+ celerybeat.pid
122
+
123
+ # SageMath parsed files
124
+ *.sage.py
125
+
126
+ # Environments
127
+ .env
128
+ .venv
129
+ env/
130
+ venv/
131
+ ENV/
132
+ env.bak/
133
+ venv.bak/
134
+
135
+ # Spyder project settings
136
+ .spyderproject
137
+ .spyproject
138
+
139
+ # Rope project settings
140
+ .ropeproject
141
+
142
+ # mkdocs documentation
143
+ /site
144
+
145
+ # mypy
146
+ .mypy_cache/
147
+ .dmypy.json
148
+ dmypy.json
149
+
150
+ # Pyre type checker
151
+ .pyre/
152
+
153
+ # pytype static type analyzer
154
+ .pytype/
155
+
156
+ # Cython debug symbols
157
+ cython_debug/
158
+
159
+ # PyCharm
160
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
161
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
162
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
163
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
164
+ #.idea/
165
+
166
+ #################### Archives.gitignore ####################
167
+
168
+ # It's better to unpack these files and commit the raw source because
169
+ # git has its own built in compression methods.
170
+ *.7z
171
+ *.jar
172
+ *.rar
173
+ *.zip
174
+ *.gz
175
+ *.gzip
176
+ *.tgz
177
+ *.bzip
178
+ *.bzip2
179
+ *.bz2
180
+ *.xz
181
+ *.lzma
182
+ *.cab
183
+ *.xar
184
+
185
+ # Packing-only formats
186
+ *.iso
187
+ *.tar
188
+
189
+ # Package management formats
190
+ *.dmg
191
+ *.xpi
192
+ *.gem
193
+ *.egg
194
+ *.deb
195
+ *.rpm
196
+ *.msi
197
+ *.msm
198
+ *.msp
199
+ *.txz
200
+
201
+ #################### Backup.gitignore ####################
202
+
203
+ *.bak
204
+ *.gho
205
+ *.ori
206
+ *.orig
207
+ *.tmp
208
+
209
+ #################### Emacs.gitignore ####################
210
+
211
+ # -*- mode: gitignore; -*-
212
+ *~
213
+ \#*\#
214
+ /.emacs.desktop
215
+ /.emacs.desktop.lock
216
+ *.elc
217
+ auto-save-list
218
+ tramp
219
+ .\#*
220
+
221
+ # Org-mode
222
+ .org-id-locations
223
+ *_archive
224
+
225
+ # flymake-mode
226
+ *_flymake.*
227
+
228
+ # eshell files
229
+ /eshell/history
230
+ /eshell/lastdir
231
+
232
+ # elpa packages
233
+ /elpa/
234
+
235
+ # reftex files
236
+ *.rel
237
+
238
+ # AUCTeX auto folder
239
+ /auto/
240
+
241
+ # cask packages
242
+ .cask/
243
+ dist/
244
+
245
+ # Flycheck
246
+ flycheck_*.el
247
+
248
+ # server auth directory
249
+ /server/
250
+
251
+ # projectiles files
252
+ .projectile
253
+
254
+ # directory configuration
255
+ .dir-locals.el
256
+
257
+ # network security
258
+ /network-security.data
259
+
260
+
261
+ #################### JetBrains.gitignore ####################
262
+
263
+ # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
264
+ # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
265
+
266
+ # User-specific stuff
267
+ .idea/**/workspace.xml
268
+ .idea/**/tasks.xml
269
+ .idea/**/usage.statistics.xml
270
+ .idea/**/dictionaries
271
+ .idea/**/shelf
272
+
273
+ # AWS User-specific
274
+ .idea/**/aws.xml
275
+
276
+ # Generated files
277
+ .idea/**/contentModel.xml
278
+
279
+ # Sensitive or high-churn files
280
+ .idea/**/dataSources/
281
+ .idea/**/dataSources.ids
282
+ .idea/**/dataSources.local.xml
283
+ .idea/**/sqlDataSources.xml
284
+ .idea/**/dynamic.xml
285
+ .idea/**/uiDesigner.xml
286
+ .idea/**/dbnavigator.xml
287
+
288
+ # Gradle
289
+ .idea/**/gradle.xml
290
+ .idea/**/libraries
291
+
292
+ # Gradle and Maven with auto-import
293
+ # When using Gradle or Maven with auto-import, you should exclude module files,
294
+ # since they will be recreated, and may cause churn. Uncomment if using
295
+ # auto-import.
296
+ # .idea/artifacts
297
+ # .idea/compiler.xml
298
+ # .idea/jarRepositories.xml
299
+ # .idea/modules.xml
300
+ # .idea/*.iml
301
+ # .idea/modules
302
+ # *.iml
303
+ # *.ipr
304
+
305
+ # CMake
306
+ cmake-build-*/
307
+
308
+ # Mongo Explorer plugin
309
+ .idea/**/mongoSettings.xml
310
+
311
+ # File-based project format
312
+ *.iws
313
+
314
+ # IntelliJ
315
+ out/
316
+
317
+ # mpeltonen/sbt-idea plugin
318
+ .idea_modules/
319
+
320
+ # JIRA plugin
321
+ atlassian-ide-plugin.xml
322
+
323
+ # Cursive Clojure plugin
324
+ .idea/replstate.xml
325
+
326
+ # SonarLint plugin
327
+ .idea/sonarlint/
328
+
329
+ # Crashlytics plugin (for Android Studio and IntelliJ)
330
+ com_crashlytics_export_strings.xml
331
+ crashlytics.properties
332
+ crashlytics-build.properties
333
+ fabric.properties
334
+
335
+ # Editor-based Rest Client
336
+ .idea/httpRequests
337
+
338
+ # Android studio 3.1+ serialized cache file
339
+ .idea/caches/build_file_checksums.ser
340
+
341
+ #################### Linux.gitignore ####################
342
+
343
+ *~
344
+
345
+ # temporary files which can be created if a process still has a handle open of a deleted file
346
+ .fuse_hidden*
347
+
348
+ # KDE directory preferences
349
+ .directory
350
+
351
+ # Linux trash folder which might appear on any partition or disk
352
+ .Trash-*
353
+
354
+ # .nfs files are created when an open file is removed but is still being accessed
355
+ .nfs*
356
+
357
+ #################### NotepadPP.gitignore ####################
358
+
359
+ # Notepad++ backups #
360
+ *.bak
361
+
362
+ #################### PuTTY.gitignore ####################
363
+
364
+ # Private key
365
+ *.ppk
366
+
367
+ #################### SublimeText.gitignore ####################
368
+
369
+ # Cache files for Sublime Text
370
+ *.tmlanguage.cache
371
+ *.tmPreferences.cache
372
+ *.stTheme.cache
373
+
374
+ # Workspace files are user-specific
375
+ *.sublime-workspace
376
+
377
+ # Project files should be checked into the repository, unless a significant
378
+ # proportion of contributors will probably not be using Sublime Text
379
+ # *.sublime-project
380
+
381
+ # SFTP configuration file
382
+ sftp-config.json
383
+ sftp-config-alt*.json
384
+
385
+ # Package control specific files
386
+ Package Control.last-run
387
+ Package Control.ca-list
388
+ Package Control.ca-bundle
389
+ Package Control.system-ca-bundle
390
+ Package Control.cache/
391
+ Package Control.ca-certs/
392
+ Package Control.merged-ca-bundle
393
+ Package Control.user-ca-bundle
394
+ oscrypto-ca-bundle.crt
395
+ bh_unicode_properties.cache
396
+
397
+ # Sublime-github package stores a github token in this file
398
+ # https://packagecontrol.io/packages/sublime-github
399
+ GitHub.sublime-settings
400
+
401
+ #################### Vim.gitignore ####################
402
+
403
+ # Swap
404
+ [._]*.s[a-v][a-z]
405
+ !*.svg # comment out if you don't need vector files
406
+ [._]*.sw[a-p]
407
+ [._]s[a-rt-v][a-z]
408
+ [._]ss[a-gi-z]
409
+ [._]sw[a-p]
410
+
411
+ # Session
412
+ Session.vim
413
+ Sessionx.vim
414
+
415
+ # Temporary
416
+ .netrwhist
417
+ *~
418
+ # Auto-generated tag files
419
+ tags
420
+ # Persistent undo
421
+ [._]*.un~
422
+
423
+ #################### VirtualEnv.gitignore ####################
424
+
425
+ # Virtualenv
426
+ # http://iamzed.com/2009/05/07/a-primer-on-virtualenv/
427
+ .Python
428
+ [Bb]in
429
+ [Ii]nclude
430
+ [Ll]ib
431
+ [Ll]ib64
432
+ [Ll]ocal
433
+ [Ss]cripts
434
+ pyvenv.cfg
435
+ .venv
436
+ pip-selfcheck.json
437
+
438
+ #################### VisualStudioCode.gitignore ####################
439
+
440
+ .vscode/*
441
+ !.vscode/settings.json
442
+ !.vscode/tasks.json
443
+ !.vscode/launch.json
444
+ !.vscode/extensions.json
445
+ !.vscode/*.code-snippets
446
+
447
+ # Local History for Visual Studio Code
448
+ .history/
449
+
450
+ # Built Visual Studio Code Extensions
451
+ *.vsix
452
+
453
+ #################### Windows.gitignore ####################
454
+
455
+ # Windows thumbnail cache files
456
+ Thumbs.db
457
+ Thumbs.db:encryptable
458
+ ehthumbs.db
459
+ ehthumbs_vista.db
460
+
461
+ # Dump file
462
+ *.stackdump
463
+
464
+ # Folder config file
465
+ [Dd]esktop.ini
466
+
467
+ # Recycle Bin used on file shares
468
+ $RECYCLE.BIN/
469
+
470
+ # Windows Installer files
471
+ *.cab
472
+ *.msi
473
+ *.msix
474
+ *.msm
475
+ *.msp
476
+
477
+ # Windows shortcuts
478
+ *.lnk
479
+
480
+ #################### macOS.gitignore ####################
481
+
482
+ # General
483
+ .DS_Store
484
+ .AppleDouble
485
+ .LSOverride
486
+
487
+ # Icon must end with two \r
488
+ Icon
489
+
490
+
491
+ # Thumbnails
492
+ ._*
493
+
494
+ # Files that might appear in the root of a volume
495
+ .DocumentRevisions-V100
496
+ .fseventsd
497
+ .Spotlight-V100
498
+ .TemporaryItems
499
+ .Trashes
500
+ .VolumeIcon.icns
501
+ .com.apple.timemachine.donotpresent
502
+
503
+ # Directories potentially created on remote AFP share
504
+ .AppleDB
505
+ .AppleDesktop
506
+ Network Trash Folder
507
+ Temporary Items
508
+ .apdisk
509
+
510
+ #################### Custom.gitignore ####################
511
+
512
+ # add your custom gitignore here:
513
+ !.gitignore
514
+ !.gitsubmodules
515
+
516
+ # ignore data
517
+ data/
518
+ exp/
519
+ !src/lightning_module.py
520
+ *.wav
521
+ # ignore plots
522
+ *.png
523
+ # ignore csv
524
+ *.csv
525
+
526
+
527
+ !config/template.yaml
528
+ config
529
+ !local
530
+
531
+ ## Currently
532
+ src/wav2vec_small.pt
533
+ *.ckpt
534
+ *.bak
app.py CHANGED
@@ -24,10 +24,12 @@ import librosa.display
24
  import matplotlib.pyplot as plt
25
  import soundfile as sf
26
 
 
27
  # local import
28
  import sys
29
 
30
  from local.vis import token_plot
 
31
  sys.path.append("src")
32
 
33
  # Load automos
@@ -189,6 +191,8 @@ def calc_wer(audio_path, ref):
189
  trans = jiwer.ToUpperCase()(trans)
190
  # WER
191
  ref = jiwer.ToUpperCase()(ref)
 
 
192
  wer = jiwer.wer(
193
  ref,
194
  trans,
@@ -196,10 +200,12 @@ def calc_wer(audio_path, ref):
196
  hypothesis_transform=transformation,
197
  )
198
  # pdb.set_trace()
199
- word_acc = 1.0 - float(wer)
200
- return [trans, word_acc, token_wav_plot]
201
  # calc_wer(examples[1][0], examples[1][1])
 
202
  # pdb.set_trace()
 
203
  iface = gr.Interface(
204
  fn=calc_wer,
205
  inputs=[
@@ -207,13 +213,15 @@ iface = gr.Interface(
207
  source="upload",
208
  type="filepath",
209
  label="Audio_to_evaluate",
 
210
  ),
211
  reference_textbox,
212
  ],
 
213
  outputs=[
214
- gr.Textbox(placeholder="Hypothesis", label="Recognition by AI"),
215
  gr.Textbox(placeholder="Word Accuracy", label="Word Accuracy (The Higher the better)"),
216
- gr.Plot(label="waveform")
217
  ],
218
  description=description,
219
  examples=examples,
 
24
  import matplotlib.pyplot as plt
25
  import soundfile as sf
26
 
27
+
28
  # local import
29
  import sys
30
 
31
  from local.vis import token_plot
32
+ from local.wer import get_WER_highlight
33
  sys.path.append("src")
34
 
35
  # Load automos
 
191
  trans = jiwer.ToUpperCase()(trans)
192
  # WER
193
  ref = jiwer.ToUpperCase()(ref)
194
+ highlight_hyp = get_WER_highlight(ref.split(" "), trans.split(" "))
195
+
196
  wer = jiwer.wer(
197
  ref,
198
  trans,
 
200
  hypothesis_transform=transformation,
201
  )
202
  # pdb.set_trace()
203
+ word_acc = "%0.2f%%" %((1.0 - float(wer))*100)
204
+ return [highlight_hyp, word_acc, token_wav_plot]
205
  # calc_wer(examples[1][0], examples[1][1])
206
+ # # calc_wer()
207
  # pdb.set_trace()
208
+
209
  iface = gr.Interface(
210
  fn=calc_wer,
211
  inputs=[
 
213
  source="upload",
214
  type="filepath",
215
  label="Audio_to_evaluate",
216
+ show_label=False
217
  ),
218
  reference_textbox,
219
  ],
220
+ #gr.Textbox(placeholder="Hypothesis", label="Recognition by AI"),
221
  outputs=[
222
+ gr.HighlightedText(placeholder="Hypothesis", label="Diff", combine_adjacent=True, adjacent_separator=" ", show_label=False).style(color_map={"1": "#78bd91", "0": "#ddbabf"}),
223
  gr.Textbox(placeholder="Word Accuracy", label="Word Accuracy (The Higher the better)"),
224
+ gr.Plot(label="waveform", show_label=False)
225
  ],
226
  description=description,
227
  examples=examples,
local/wer.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import Levenshtein
2
+ import numpy as np
3
+ import pdb
4
+ import difflib
5
+
6
+ ref = "I love you too".split(' ')
7
+ hyp = "I really don't loie him".split(' ')
8
+
9
+ # return hypothesis wrong word with labels
10
def get_WER_highlight(ref, hyp):
    """Label every hypothesis token as matching or not matching the reference.

    The two token sequences are aligned with ``difflib.ndiff``. Each line it
    produces starts with a two-character prefix (``"  "`` common, ``"+ "``
    only in ``hyp``, ``"- "`` only in ``ref``, ``"? "`` intraline hint)
    followed by the token itself.

    Args:
        ref: Sequence of reference tokens (e.g. words).
        hyp: Sequence of hypothesis tokens (e.g. words).

    Returns:
        A list of ``(token, label)`` tuples in hypothesis order, where the
        label is ``"1"`` for a token common to both sequences and ``"0"``
        for a token that appears only in ``hyp``. Tokens that appear only in
        ``ref`` and the ``"?"`` hint lines are left out.
    """
    highlighted = []
    for line in difflib.ndiff(ref, hyp):
        # Read the tag and the token by position rather than by splitting on
        # spaces, so a token that itself contains a space is neither
        # truncated nor mislabelled.
        tag, token = line[0], line[2:]
        if tag == " ":
            highlighted.append((token, "1"))
        elif tag == "+":
            highlighted.append((token, "0"))
    return highlighted
22
+
23
+
24
def diff_texts(text1, text2):
    """Return a token-level diff of two sequences as ``(token, tag)`` pairs.

    Args:
        text1: First sequence to compare (a string is compared character by
            character, a list element by element).
        text2: Second sequence to compare.

    Returns:
        A list of ``(token, tag)`` tuples in the order produced by
        ``difflib.Differ.compare``. ``tag`` is ``"-"`` for a token only in
        ``text1``, ``"+"`` for a token only in ``text2`` and ``None`` for a
        token common to both.
    """
    differ = difflib.Differ()
    return [
        (line[2:], None if line[0] == " " else line[0])
        for line in differ.compare(text1, text2)
        # "? " lines are intraline hints generated by Differ, not tokens of
        # either input, so they must not be reported as diff entries.
        if line[0] != "?"
    ]
30
+
31
+ # x = diff_texts(ref, hyp)
32
+
33
+ # pdb.set_trace()