Remsky committed on
Commit
3fd1314
·
1 Parent(s): 165abce

Enhance performance metrics visualization in app.py and update plot saving format in tts_model.py

Browse files
Files changed (2) hide show
  1. app.py +63 -21
  2. tts_model.py +1 -1
app.py CHANGED
@@ -2,6 +2,8 @@ import os
2
  import gradio as gr
3
  import spaces
4
  import time
 
 
5
  from tts_model import TTSModel
6
  from lib import format_audio_output
7
 
@@ -32,18 +34,27 @@ def generate_speech_from_ui(text, voice_name, speed, progress=gr.Progress(track_
32
  # Create progress state
33
  progress_state = {
34
  "progress": 0.0,
35
- "tokens_per_sec": 0.0,
36
- "gpu_time_left": gpu_timeout
 
 
 
37
  }
38
 
39
  def update_progress(chunk_num, total_chunks, tokens_per_sec, rtf):
40
  progress_state["progress"] = chunk_num / total_chunks
41
- progress_state["tokens_per_sec"] = tokens_per_sec
 
42
 
43
  # Update GPU time remaining
44
  elapsed = time.time() - start_time
45
  gpu_time_left = max(0, gpu_timeout - elapsed)
46
  progress_state["gpu_time_left"] = gpu_time_left
 
 
 
 
 
47
 
48
  # Only update progress display during processing
49
  progress(progress_state["progress"], desc=f"Processing chunk {chunk_num}/{total_chunks} | GPU Time Left: {int(gpu_time_left)}s")
@@ -62,19 +73,51 @@ def generate_speech_from_ui(text, voice_name, speed, progress=gr.Progress(track_
62
  # Calculate final metrics
63
  total_time = time.time() - start_time
64
  total_duration = len(audio_array) / 24000 # audio duration in seconds
65
- final_rtf = total_time / total_duration if total_duration > 0 else 0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
 
67
- # Prepare final metrics display
 
 
 
 
 
68
  metrics_text = (
69
- f"Tokens/sec: {progress_state['tokens_per_sec']:.1f}\n" +
70
- f"Real-time factor: {final_rtf:.2f}x (Processing Time / Audio Duration)\n" +
71
- f"GPU Time Used: {int(total_time)}s of {gpu_timeout}s"
 
 
72
  )
73
 
74
  return (
75
  audio_output,
76
- metrics_text,
77
- duration_text
78
  )
79
  except Exception as e:
80
  raise gr.Error(f"Generation failed: {str(e)}")
@@ -83,11 +126,11 @@ def generate_speech_from_ui(text, voice_name, speed, progress=gr.Progress(track_
83
  with gr.Blocks(title="Kokoro TTS Demo") as demo:
84
  gr.HTML(
85
  """
86
- <div style="display: flex; justify-content: flex-end; padding: 10px; gap: 10px;">
 
87
  <a href="https://huggingface.co/hexgrad/Kokoro-82M" target="_blank">
88
- <img src="https://huggingface.co/datasets/huggingface/badges/resolve/main/model-on-hf-md-dark.svg" alt="Model on HF">
89
  </a>
90
- <a class="github-button" href="https://github.com/remsky/Kokoro-FastAPI" data-color-scheme="no-preference: light; light: light; dark: dark;" data-size="large" data-show-count="true" aria-label="Star remsky/Kokoro-FastAPI on GitHub">Repo for Local Use</a>
91
  </div>
92
  <div style="text-align: center; max-width: 800px; margin: 0 auto;">
93
  <h1>Kokoro TTS Demo</h1>
@@ -155,21 +198,21 @@ with gr.Blocks(title="Kokoro TTS Demo") as demo:
155
  )
156
  progress_bar = gr.Progress(track_tqdm=False)
157
  metrics_text = gr.Textbox(
158
- label="Processing Metrics",
159
  interactive=False,
160
- lines=3
161
  )
162
- duration_text = gr.Textbox(
163
- label="Processing Info",
164
- interactive=False,
165
- lines=2
166
  )
167
 
168
  # Set up event handler
169
  submit_btn.click(
170
  fn=generate_speech_from_ui,
171
  inputs=[text_input, voice_dropdown, speed_slider],
172
- outputs=[audio_output, metrics_text, duration_text],
173
  show_progress=True
174
  )
175
 
@@ -180,7 +223,6 @@ with gr.Blocks(title="Kokoro TTS Demo") as demo:
180
  ### Demo Text Info
181
  The demo text is loaded from H.G. Wells' "The Time Machine". This classic text demonstrates the system's ability to handle long-form content through chunking.
182
  """)
183
-
184
 
185
  # Launch the app
186
  if __name__ == "__main__":
 
2
  import gradio as gr
3
  import spaces
4
  import time
5
+ import matplotlib.pyplot as plt
6
+ import numpy as np
7
  from tts_model import TTSModel
8
  from lib import format_audio_output
9
 
 
34
  # Create progress state
35
  progress_state = {
36
  "progress": 0.0,
37
+ "tokens_per_sec": [],
38
+ "rtf": [],
39
+ "chunk_times": [],
40
+ "gpu_time_left": gpu_timeout,
41
+ "total_chunks": 0
42
  }
43
 
44
  def update_progress(chunk_num, total_chunks, tokens_per_sec, rtf):
45
  progress_state["progress"] = chunk_num / total_chunks
46
+ progress_state["tokens_per_sec"].append(tokens_per_sec)
47
+ progress_state["rtf"].append(rtf)
48
 
49
  # Update GPU time remaining
50
  elapsed = time.time() - start_time
51
  gpu_time_left = max(0, gpu_timeout - elapsed)
52
  progress_state["gpu_time_left"] = gpu_time_left
53
+ progress_state["total_chunks"] = total_chunks
54
+
55
+ # Track individual chunk processing time
56
+ chunk_time = elapsed - (sum(progress_state["chunk_times"]) if progress_state["chunk_times"] else 0)
57
+ progress_state["chunk_times"].append(chunk_time)
58
 
59
  # Only update progress display during processing
60
  progress(progress_state["progress"], desc=f"Processing chunk {chunk_num}/{total_chunks} | GPU Time Left: {int(gpu_time_left)}s")
 
73
  # Calculate final metrics
74
  total_time = time.time() - start_time
75
  total_duration = len(audio_array) / 24000 # audio duration in seconds
76
+ rtf = total_time / total_duration if total_duration > 0 else 0
77
+ mean_tokens_per_sec = np.mean(progress_state["tokens_per_sec"])
78
+
79
+ # Create plot of tokens per second with median line
80
+ fig, ax = plt.subplots(figsize=(10, 5))
81
+ fig.patch.set_facecolor('black')
82
+ ax.set_facecolor('black')
83
+ chunk_nums = list(range(1, len(progress_state["tokens_per_sec"]) + 1))
84
+
85
+ # Plot bars for tokens per second
86
+ ax.bar(chunk_nums, progress_state["tokens_per_sec"], color='#ff2a6d', alpha=0.8)
87
+
88
+ # Add median line
89
+ median_tps = np.median(progress_state["tokens_per_sec"])
90
+ ax.axhline(y=median_tps, color='#05d9e8', linestyle='--', label=f'Median: {median_tps:.1f} tokens/sec')
91
+
92
+ # Style improvements
93
+ ax.set_xlabel('Chunk Number', fontsize=24, labelpad=20)
94
+ ax.set_ylabel('Tokens per Second', fontsize=24, labelpad=20)
95
+ ax.set_title('Processing Speed by Chunk', fontsize=28, pad=30)
96
+
97
+ # Increase tick label size
98
+ ax.tick_params(axis='both', which='major', labelsize=20)
99
+
100
+ # Remove gridlines
101
+ ax.grid(False)
102
 
103
+ # Style legend and position it in bottom left
104
+ ax.legend(fontsize=20, facecolor='black', edgecolor='#05d9e8', loc='lower left')
105
+
106
+ plt.tight_layout()
107
+
108
+ # Prepare final metrics display including audio duration and real-time speed
109
  metrics_text = (
110
+ f"Median Processing Speed: {np.median(progress_state['tokens_per_sec']):.1f} tokens/sec\n" +
111
+ f"Real-time Factor: {rtf:.3f}\n" +
112
+ f"Real Time Generation Speed: {int(1/rtf)}x \n" +
113
+ f"Processing Time: {int(total_time)}s\n" +
114
+ f"Output Audio Duration: {total_duration:.2f}s"
115
  )
116
 
117
  return (
118
  audio_output,
119
+ fig,
120
+ metrics_text
121
  )
122
  except Exception as e:
123
  raise gr.Error(f"Generation failed: {str(e)}")
 
126
  with gr.Blocks(title="Kokoro TTS Demo") as demo:
127
  gr.HTML(
128
  """
129
+ <div style="display: flex; justify-content: flex-end; padding: 5px; gap: 5px;">
130
+ <a class="github-button" href="https://github.com/remsky/Kokoro-FastAPI" data-color-scheme="no-preference: light; light: light; dark: dark;" data-size="large" data-show-count="true" aria-label="Star remsky/Kokoro-FastAPI on GitHub">Kokoro-FastAPI Repo</a>
131
  <a href="https://huggingface.co/hexgrad/Kokoro-82M" target="_blank">
132
+ <img src="https://huggingface.co/datasets/huggingface/badges/resolve/main/model-on-hf-lg-dark.svg" alt="Model on HF">
133
  </a>
 
134
  </div>
135
  <div style="text-align: center; max-width: 800px; margin: 0 auto;">
136
  <h1>Kokoro TTS Demo</h1>
 
198
  )
199
  progress_bar = gr.Progress(track_tqdm=False)
200
  metrics_text = gr.Textbox(
201
+ label="Performance Summary",
202
  interactive=False,
203
+ lines=4
204
  )
205
+ metrics_plot = gr.Plot(
206
+ label="Processing Metrics",
207
+ show_label=True,
208
+ format="png" # Explicitly set format to PNG which is supported by matplotlib
209
  )
210
 
211
  # Set up event handler
212
  submit_btn.click(
213
  fn=generate_speech_from_ui,
214
  inputs=[text_input, voice_dropdown, speed_slider],
215
+ outputs=[audio_output, metrics_plot, metrics_text],
216
  show_progress=True
217
  )
218
 
 
223
  ### Demo Text Info
224
  The demo text is loaded from H.G. Wells' "The Time Machine". This classic text demonstrates the system's ability to handle long-form content through chunking.
225
  """)
 
226
 
227
  # Launch the app
228
  if __name__ == "__main__":
tts_model.py CHANGED
@@ -308,7 +308,7 @@ class TTSModel:
308
  setup_plot(fig, ax2, 'Chunk Sizes')
309
 
310
  # Save plot
311
- plt.savefig('chunk_times.png')
312
  plt.close()
313
 
314
  # Calculate metrics
 
308
  setup_plot(fig, ax2, 'Chunk Sizes')
309
 
310
  # Save plot
311
+ plt.savefig('chunk_times.png', format='png')
312
  plt.close()
313
 
314
  # Calculate metrics