sanchit-gandhi (HF staff) committed
Commit 3155f54
1 Parent(s): e676bd8

single tab

Files changed (1):
  1. app.py +47 -77
app.py CHANGED
@@ -1,5 +1,4 @@
 import os
-from functools import partial
 
 import numpy as np
 import unicodedata
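The import dropped here was only needed by the old three-tab UI (removed in the last hunk below), where each tab's button pre-bound the model name via `functools.partial`. A minimal sketch of that pattern, using a hypothetical stand-in for the real handler:

```python
from functools import partial

def get_visualisation(idx, model="v2"):
    # stand-in for the real handler; only the call signature matters here
    return idx, model

# the old per-tab wiring bound the model name at setup time...
analyse_v2 = partial(get_visualisation, model="v2")

# ...so each button's callback took only the slider value
assert analyse_v2(3) == get_visualisation(3, model="v2")
```

With a single view driven by `get_side_by_side_visualisation`, no per-model binding remains, so the import can go.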
@@ -64,30 +63,40 @@ target_dtype = np.int16
 max_range = np.iinfo(target_dtype).max
 
 
-def get_visualisation(idx, model="v2"):
+def get_visualisation(idx, model="large-v2", round_dp=2):
     idx -= 1
     audio = dataset[idx]["audio"]
     array = (audio["array"] * max_range).astype(np.int16)
     sampling_rate = audio["sampling_rate"]
 
     text1 = norm_target[idx]
-    text2 = norm_pred_v2[idx] if model == "v2" else norm_pred_32_2[idx]
+    if model == "large-v2":
+        text2 = norm_pred_v2[idx]
+    elif model == "large-32-2":
+        text2 = norm_pred_32_2[idx]
+    else:
+        raise ValueError(f"Got unknown model {model}, should be one of `'large-v2'` or `'large-32-2'`.")
 
     wer_output = process_words(text1, text2, wer_default, wer_default)
-    wer_percentage = round(100 * wer_output.wer, 2)
-    ier_percentage = round(100 * wer_output.insertions / len(wer_output.references[0]), 2)
+    wer_percentage = round(100 * wer_output.wer, round_dp)
+    ier_percentage = round(
+        100 * wer_output.insertions / len(wer_output.references[0]), round_dp
+    )
 
-    rel_length = round(len(text2.split()) / len(text1.split()), 2)
+    rel_length = round(len(text2.split()) / len(text1.split()), round_dp)
 
     diff = compare_string(text1, text2)
     full_text = style_text(diff)
 
     return (sampling_rate, array), wer_percentage, ier_percentage, rel_length, full_text
 
+
 def get_side_by_side_visualisation(idx):
-    large_v2 = get_visualisation(idx, model="v2")
-    large_32_2 = get_visualisation(idx, model="32-2")
+    large_v2 = get_visualisation(idx, model="large-v2")
+    large_32_2 = get_visualisation(idx, model="large-32-2")
+    # format the rows
     table = [large_v2[1:-1], large_32_2[1:-1]]
+    # format the model names
     table[0] = ["large-v2", *table[0]]
     table[1] = ["large-32-2", *table[1]]
     return large_v2[0], table, large_v2[-1], large_32_2[-1]
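The metrics above come from jiwer's `process_words`; the `wer_default` transform passed for both reference and hypothesis is imported elsewhere in app.py and not shown in this diff (presumably jiwer's stock word-level transform). A minimal sketch of the same WER/IER computation on a toy sentence pair, relying on jiwer's defaults for illustration:

```python
from jiwer import process_words

reference = "the cat sat on the mat"          # 6 reference words
hypothesis = "the cat sat on on the big mat"  # 2 inserted words

out = process_words(reference, hypothesis)

# WER: substitutions + deletions + insertions over the reference length
wer_percentage = round(100 * out.wer, 2)

# IER: insertions alone, normalised by the number of reference words,
# mirroring the expression in get_visualisation
ier_percentage = round(100 * out.insertions / len(out.references[0]), 2)

print(wer_percentage, ier_percentage)  # 33.33 33.33
```

`rel_length` is then just the ratio of predicted to reference word counts, so values well above 1 flag hallucinated insertions and values well below 1 flag truncated transcriptions.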
@@ -95,76 +104,37 @@ def get_side_by_side_visualisation(idx):
 
 if __name__ == "__main__":
     with gr.Blocks() as demo:
-        with gr.Tab("large-v2"):
-            gr.Markdown(
-                "Analyse the transcriptions generated by the Whisper large-v2 model on the TEDLIUM dev set."
-            )
-
-            slider = gr.Slider(
-                minimum=1, maximum=len(norm_target), step=1, label="Dataset sample"
+        gr.Markdown(
+            "Analyse the transcriptions generated by the Whisper large-v2 and large-32-2 models on the TEDLIUM dev set. "
+            "The transcriptions for both models are shown at the bottom of the demo. The text diff for each is computed "
+            "relative to the target transcriptions. Insertions are displayed in <span style='background-color:Lightgreen'>green</span>, and "
+            "deletions in <span style='background-color:#FFCCCB'><s>red</s></span>."
+        )
+        slider = gr.Slider(
+            minimum=1, maximum=len(norm_target), step=1, label="Dataset sample"
+        )
+        btn = gr.Button("Analyse")
+        audio_out = gr.Audio(label="Audio input")
+        with gr.Column():
+            table = gr.Dataframe(
+                headers=[
+                    "Model",
+                    "Word Error Rate (WER)",
+                    "Insertion Error Rate (IER)",
+                    "Rel length (ref length / tgt length)",
+                ],
+                height=1000,
             )
-            btn = gr.Button("Analyse")
-            audio_out = gr.Audio(label="Audio input")
             with gr.Row():
-                wer = gr.Number(label="Word Error Rate (WER)")
-                ier = gr.Number(
-                    label="Insertion Error Rate (IER)"
-                )
-                relative_length = gr.Number(
-                    label="Relative length (reference length / target length)"
-                )
-            text_out = gr.Markdown(label="Text difference")
-
-            btn.click(
-                fn=partial(get_visualisation, model="v2"),
-                inputs=slider,
-                outputs=[audio_out, wer, ier, relative_length, text_out],
-            )
-        with gr.Tab("large-32-2"):
-            gr.Markdown(
-                "Analyse the transcriptions generated by the Whisper large-32-2 model on the TEDLIUM dev set."
-            )
-            slider = gr.Slider(
-                minimum=1, maximum=len(norm_target), step=1, label="Dataset sample"
-            )
-            btn = gr.Button("Analyse")
-            audio_out = gr.Audio(label="Audio input")
+                gr.Markdown("**large-v2 text diff**")
+                gr.Markdown("**large-32-2 text diff**")
             with gr.Row():
-                wer = gr.Number(label="Word Error Rate (WER)")
-                ier = gr.Number(
-                    label="Insertion Error Rate (IER)"
-                )
-                relative_length = gr.Number(
-                    label="Relative length (reference length / target length)"
-                )
-            text_out = gr.Markdown(label="Text difference")
-
-            btn.click(
-                fn=partial(get_visualisation, model="32-2"),
-                inputs=slider,
-                outputs=[audio_out, wer, ier, relative_length, text_out],
-            )
-        with gr.Tab("side-by-side"):
-            gr.Markdown(
-                "Analyse the transcriptions generated by the Whisper large-32-2 model on the TEDLIUM dev set."
-            )
-            slider = gr.Slider(
-                minimum=1, maximum=len(norm_target), step=1, label="Dataset sample"
-            )
-            btn = gr.Button("Analyse")
-            audio_out = gr.Audio(label="Audio input")
-            with gr.Column():
-                table = gr.Dataframe(headers=["Model", "Word Error Rate (WER)", "Insertion Error Rate (IER)", "Rel length (ref length / tgt length)"], height=1000)
-                with gr.Row():
-                    gr.Markdown("large-v2 text diff")
-                    gr.Markdown("large-32-2 text diff")
-                with gr.Row():
-                    text_out_v2 = gr.Markdown(label="Text difference")
-                    text_out_32_2 = gr.Markdown(label="Text difference")
-
-            btn.click(
-                fn=get_side_by_side_visualisation,
-                inputs=slider,
-                outputs=[audio_out, table, text_out_v2, text_out_32_2],
-            )
+                text_out_v2 = gr.Markdown(label="Text difference")
+                text_out_32_2 = gr.Markdown(label="Text difference")
+
+        btn.click(
+            fn=get_side_by_side_visualisation,
+            inputs=slider,
+            outputs=[audio_out, table, text_out_v2, text_out_32_2],
+        )
     demo.launch()
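`compare_string` and `style_text` are defined earlier in app.py and untouched by this commit. For readers without the full file, a hypothetical sketch of the kind of word-level diff styling the new Markdown note describes (insertions highlighted green, deletions struck through in red), built here on Python's difflib rather than the app's actual helpers:

```python
import difflib

def style_diff(reference: str, prediction: str) -> str:
    """Render a word-level diff as HTML: insertions green, deletions red."""
    styled = []
    for token in difflib.ndiff(reference.split(), prediction.split()):
        tag, word = token[:2], token[2:]
        if tag == "+ ":  # word in the prediction but not the reference (insertion)
            styled.append(f"<span style='background-color:Lightgreen'>{word}</span>")
        elif tag == "- ":  # word in the reference but not the prediction (deletion)
            styled.append(f"<span style='background-color:#FFCCCB'><s>{word}</s></span>")
        elif tag == "  ":  # unchanged word
            styled.append(word)
        # "? " hint lines emitted by ndiff carry no words and are skipped
    return " ".join(styled)

print(style_diff("the cat sat on the mat", "the cat sat on on the big mat"))
```

Whatever the real helpers do, the returned string is fed to a gr.Markdown output, which is presumably why the demo's text diffs can carry inline HTML spans like the ones quoted in the new description above.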
 