Spaces: Running on Zero

Add initial implementation of Kokoro TTS Demo Space with Gradio interface and dependencies

Files changed:
- README.md +59 -1
- app.py +127 -0
- packages.txt +1 -0
- requirements.txt +12 -0
- the_time_machine_hgwells.txt +0 -0
- tts_model.py +329 -0
README.md CHANGED
@@ -11,4 +11,62 @@ license: apache-2.0
 short_description: A100 GPU Accelerated Inference applied to Kokoro-82M TTS
 ---
 
-
+# Kokoro TTS Demo Space
+
+A Zero GPU-optimized Hugging Face Space for the Kokoro TTS model.
+
+## Overview
+
+This Space provides a Gradio interface for the Kokoro TTS model, allowing users to:
+- Convert text to speech using multiple voices
+- Adjust speech speed
+- Get instant audio playback
+
+## Technical Details
+
+- Uses Zero GPU for efficient GPU resource management
+- Dynamically loads required modules from the hexgrad/Kokoro-82M repository
+- Automatically downloads model and voice files from the Hugging Face Hub
+- Implements proper GPU memory handling
+- Caches models and voices in /data/.huggingface for faster restarts
+
+## Dependencies
+
+The Space uses modules from two repositories:
+- remsky/Kokoro-FastAPI: this repository (UI and Zero GPU implementation)
+- hexgrad/Kokoro-82M: the original model repository (core TTS functionality)
+
+All dependencies are handled automatically:
+- Core modules (kokoro.py, models.py, etc.) are downloaded from hexgrad/Kokoro-82M
+- Model weights and voice files are cached in /data/.huggingface
+- System dependencies (espeak-ng) are installed via packages.txt
+
+## Environment
+
+- Python 3.10.13
+- PyTorch 2.2.2
+- Gradio 5.9.1
+- Zero GPU compatible
+
+## Available Voices
+
+Adult female voices:
+- af: Confident and friendly
+- af_sky: You know and love her
+- af_bella: Warm and self-assured
+- af_nicole: Whispered, ASMR
+- af_sarah: Bright and professional
+- bf_emma: Pensive and confident, British
+- bf_isabella: Young professional, British
+
+Adult male voices:
+- am_adam: Deep narrative voice
+- am_michael: Trustworthy and thoughtful
+- bm_george: Distinguished older voice, British
+- bm_lewis: Assured and raspy, British
+
+## Notes
+
+- The first generation may take longer due to model initialization
+- The GPU is allocated only during speech generation
+- The model and voices are cached in /data/.huggingface for faster subsequent runs
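The Zero GPU behavior noted in the README (GPU allocated only during speech generation) comes from the `spaces` decorator pattern used in `app.py` below. A minimal sketch of the pattern (the function name and body are illustrative, not part of this commit):

```python
import spaces
import torch

@spaces.GPU(duration=120)  # GPU is requested when the call starts and released when it returns
def synthesize(text: str):
    # On Zero GPU Spaces, CUDA is only usable inside a decorated call
    assert torch.cuda.is_available()
    ...  # run the model here
```

The `duration` argument is the per-call allocation budget in seconds, so it should be sized to the longest expected generation.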
app.py ADDED
@@ -0,0 +1,127 @@
+import os
+
+# Set HF_HOME before anything imports huggingface_hub, so cached
+# models/voices in /data persist across restarts
+os.environ["HF_HOME"] = "/data/.huggingface"
+
+import gradio as gr
+import spaces
+import numpy as np
+from tts_model import TTSModel
+
+# Create TTS model instance
+model = TTSModel()
+
+@spaces.GPU(duration=10)  # Quick initialization
+def initialize_model():
+    """Initialize model and get voices"""
+    if model.model is None:
+        if not model.initialize():
+            raise gr.Error("Failed to initialize model")
+    return model.list_voices()
+
+# Get initial voice list
+voice_list = initialize_model()
+
+@spaces.GPU(duration=120)  # Allow two minutes for processing
+def generate_speech_from_ui(text, voice_name, speed):
+    """Handle text-to-speech generation from the Gradio UI"""
+    try:
+        audio_array, duration = model.generate_speech(text, voice_name, speed)
+        # Convert float array to the int16 range (-32768 to 32767)
+        audio_array = np.array(audio_array, dtype=np.float32)
+        audio_array = (audio_array * 32767).astype(np.int16)
+        return (24000, audio_array), f"Audio Duration: {duration:.2f} seconds\nProcessing complete - check console for detailed metrics"
+    except Exception as e:
+        raise gr.Error(str(e))
+
+# Read the demo text once and reuse it for the input box and the statistics
+with open("the_time_machine_hgwells.txt") as f:
+    demo_text = f.read()
+
+# Create Gradio interface
+with gr.Blocks(title="Kokoro TTS Demo") as demo:
+    gr.HTML(
+        """
+        <div style="text-align: center; max-width: 800px; margin: 0 auto;">
+            <h1>Kokoro TTS Demo</h1>
+            <p>Convert text to natural-sounding speech using various voices.</p>
+        </div>
+        """
+    )
+
+    with gr.Row():
+        with gr.Column(scale=3):
+            # Input components
+            text_input = gr.TextArea(
+                label="Text to speak",
+                placeholder="Enter text here...",
+                lines=3,
+                value=demo_text[:1000]
+            )
+            voice_dropdown = gr.Dropdown(
+                label="Voice",
+                choices=voice_list,
+                value=voice_list[0] if voice_list else None,
+                allow_custom_value=True  # Allow custom values to avoid warnings
+            )
+            speed_slider = gr.Slider(
+                label="Speed",
+                minimum=0.5,
+                maximum=2.0,
+                value=1.0,
+                step=0.1
+            )
+            submit_btn = gr.Button("Generate Speech")
+
+        with gr.Column(scale=2):
+            # Output components
+            audio_output = gr.Audio(
+                label="Generated Speech",
+                type="numpy",
+                format="wav",
+                autoplay=False
+            )
+            duration_text = gr.Textbox(
+                label="Processing Info",
+                interactive=False,
+                lines=4
+            )
+
+    # Set up event handler
+    submit_btn.click(
+        fn=generate_speech_from_ui,
+        inputs=[text_input, voice_dropdown, speed_slider],
+        outputs=[audio_output, duration_text]
+    )
+
+    # Add voice descriptions
+    gr.Markdown("""
+    ### Available Voices
+    - Adult Female (af): Base female voice
+      - Bella (af_bella): Warm and friendly
+      - Nicole (af_nicole): Warm and whispered
+      - Sarah (af_sarah): Soft and gentle
+      - Sky (af_sky): You know her, you love her
+    - Adult Male (am): Base male voice
+      - Adam (am_adam): Clear and friendly
+      - Michael (am_michael): Smooth and natural
+    - British Female (bf):
+      - Emma (bf_emma): Sweet and cheerful
+      - Isabella (bf_isabella): Lively and expressive
+    - British Male (bm):
+      - George (bm_george): Young and energetic
+      - Lewis (bm_lewis): Deep and confident
+    """)
+
+    # Add text analysis info
+    with gr.Row():
+        with gr.Column():
+            gr.Markdown("""
+            ### Demo Text Info
+            The demo text is loaded from H.G. Wells' "The Time Machine". This classic text demonstrates the system's ability to handle long-form content through chunking.
+            """)
+
+            text_stats = gr.Textbox(
+                label="Text Statistics",
+                interactive=False,
+                value=f"Characters: {len(demo_text)}\nEstimated chunks: {len(demo_text) // 300 + 1}"
+            )
+
+# Launch the app
+if __name__ == "__main__":
+    demo.launch()
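One detail worth noting in `generate_speech_from_ui` above: `gr.Audio(type="numpy")` takes a `(sample_rate, array)` tuple, and the float waveform in [-1.0, 1.0] is scaled into the int16 range before being handed to Gradio. A worked example of the conversion:

```python
import numpy as np

# Float samples in [-1.0, 1.0] scaled to the int16 range (-32768 to 32767)
float_audio = np.array([0.0, 0.5, -0.5, 1.0], dtype=np.float32)
int16_audio = (float_audio * 32767).astype(np.int16)
print(int16_audio)  # [     0  16383 -16383  32767]
```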
packages.txt ADDED
@@ -0,0 +1 @@
+espeak-ng
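espeak-ng is the system-level backend that phonemizer (pinned in requirements.txt) uses for grapheme-to-phoneme conversion. A quick sanity check that the backend is available (illustrative, not part of this commit):

```python
from phonemizer import phonemize

# Raises if the espeak-ng system package from packages.txt is missing
print(phonemize("Hello world", language="en-us", backend="espeak"))
```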
requirements.txt ADDED
@@ -0,0 +1,12 @@
+gradio==5.9.1
+torch==2.2.2
+numpy<2
+scipy==1.14.1
+huggingface_hub>=0.25.1
+soundfile==0.12.1
+phonemizer==3.3.0
+regex==2024.11.6
+tiktoken==0.8.0
+transformers==4.47.1
+munch==4.0.0
+
the_time_machine_hgwells.txt ADDED
The diff for this file is too large to render. See the raw diff.
tts_model.py ADDED
@@ -0,0 +1,329 @@
+import os
+
+# Set HF_HOME before importing huggingface_hub so the persistent /data
+# cache is used for all downloads (the hub reads this at import time)
+os.environ["HF_HOME"] = "/data/.huggingface"
+
+import io
+import spaces
+import torch
+import numpy as np
+import time
+import tiktoken
+import scipy.io.wavfile as wavfile
+from huggingface_hub import hf_hub_download
+import importlib.util
+import sys
+
+def load_module_from_file(module_name, file_path):
+    """Load a Python module from a file path"""
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    if spec is None or spec.loader is None:
+        raise ImportError(f"Cannot load module {module_name} from {file_path}")
+    module = importlib.util.module_from_spec(spec)
+    sys.modules[module_name] = module
+    spec.loader.exec_module(module)
+    return module
+
+# Download and load the required Python modules; istftnet and plbert are
+# registered in sys.modules first so that models.py can import them
+py_modules = ["istftnet", "plbert", "models"]
+for py_module in py_modules:
+    path = hf_hub_download(repo_id="hexgrad/Kokoro-82M", filename=f"{py_module}.py")
+    load_module_from_file(py_module, path)
+
+# Load the kokoro module
+kokoro_path = hf_hub_download(repo_id="hexgrad/Kokoro-82M", filename="kokoro.py")
+kokoro = load_module_from_file("kokoro", kokoro_path)
+
+# Import required functions
+generate = kokoro.generate
+normalize_text = kokoro.normalize_text
+build_model = sys.modules['models'].build_model
+
+class TTSModel:
+    """Self-contained TTS model manager for Hugging Face Spaces"""
+
+    def __init__(self):
+        self.model = None
+        self.voices_dir = "voices"
+        self.model_repo = "hexgrad/Kokoro-82M"
+        os.makedirs(self.voices_dir, exist_ok=True)
+
+    def initialize(self):
+        """Initialize the model and download the voice files"""
+        try:
+            print("Initializing model...")
+
+            # Download model weights and config
+            model_path = hf_hub_download(
+                repo_id=self.model_repo,
+                filename="kokoro-v0_19.pth"
+            )
+            config_path = hf_hub_download(
+                repo_id=self.model_repo,
+                filename="config.json"
+            )
+
+            # Build the model directly on the GPU
+            with torch.cuda.device(0):
+                torch.cuda.set_device(0)
+                self.model = build_model(model_path, 'cuda')
+                self._model_on_gpu = True
+
+            # Download all available voices
+            voices = [
+                "af_bella.pt", "af_nicole.pt", "af_sarah.pt", "af_sky.pt", "af.pt",
+                "am_adam.pt", "am_michael.pt",
+                "bf_emma.pt", "bf_isabella.pt",
+                "bm_george.pt", "bm_lewis.pt"
+            ]
+            for voice in voices:
+                try:
+                    # Build the full destination path for this voice file
+                    voice_path = os.path.join(self.voices_dir, voice)
+                    print(f"Attempting to download voice {voice} to {voice_path}")
+
+                    # Download with an explicit destination
+                    downloaded_path = hf_hub_download(
+                        repo_id=self.model_repo,
+                        filename=f"voices/{voice}",
+                        local_dir=self.voices_dir,
+                        local_dir_use_symlinks=False,
+                        force_filename=voice
+                    )
+                    print(f"Download completed to: {downloaded_path}")
+
+                    # Verify the file landed at the expected path
+                    if not os.path.exists(voice_path):
+                        print(f"Warning: file not found at expected path {voice_path}")
+                        print(f"Checking download location: {downloaded_path}")
+                        if os.path.exists(downloaded_path):
+                            print(f"Moving file from {downloaded_path} to {voice_path}")
+                            os.rename(downloaded_path, voice_path)
+                    else:
+                        print(f"Verified voice file exists: {voice_path}")
+
+                except Exception as e:
+                    print(f"Error downloading voice {voice}: {str(e)}")
+                    import traceback
+                    traceback.print_exc()
+
+            print("Model initialization complete")
+            return True
+
+        except Exception as e:
+            print(f"Error initializing model: {str(e)}")
+            return False
+
+    def list_voices(self):
+        """List available voices"""
+        voices = []
+        try:
+            # Verify the voices directory exists
+            if not os.path.exists(self.voices_dir):
+                print(f"Voices directory does not exist: {self.voices_dir}")
+                return voices
+
+            # Get the list of files
+            files = os.listdir(self.voices_dir)
+            print(f"Found {len(files)} files in voices directory")
+
+            # Filter for .pt files
+            for file in files:
+                if file.endswith(".pt"):
+                    voices.append(file[:-3])  # Strip the .pt extension
+                    print(f"Found voice: {file[:-3]}")
+
+            if not voices:
+                print("No voice files found in voices directory")
+
+        except Exception as e:
+            print(f"Error listing voices: {str(e)}")
+            import traceback
+            traceback.print_exc()
+
+        return sorted(voices)
+
+    def _ensure_model_on_gpu(self):
+        """Ensure the model is on the GPU and stays there"""
+        if not getattr(self, '_model_on_gpu', False):
+            print("Moving model to GPU...")
+            with torch.cuda.device(0):
+                torch.cuda.set_device(0)
+                if hasattr(self.model, 'to'):
+                    # Standard torch.nn.Module path
+                    self.model.to('cuda')
+                else:
+                    # Fallback for a Munch container: move tensors individually
+                    for name in self.model:
+                        if isinstance(self.model[name], torch.Tensor):
+                            self.model[name] = self.model[name].cuda()
+            self._model_on_gpu = True
+
+    def _generate_audio(self, text: str, voicepack: torch.Tensor, lang: str, speed: float) -> np.ndarray:
+        """GPU-accelerated audio generation"""
+        try:
+            with torch.cuda.device(0):
+                torch.cuda.set_device(0)
+
+                # Move the model and voicepack to the GPU in a single context
+                self._ensure_model_on_gpu()
+                voicepack = voicepack.cuda()
+
+                # Run generation with everything on the GPU
+                audio, _ = generate(
+                    self.model,
+                    text,
+                    voicepack,
+                    lang=lang,
+                    speed=speed
+                )
+
+                return audio
+
+        except Exception as e:
+            print(f"Error in audio generation: {str(e)}")
+            raise
+
+    def chunk_text(self, text: str, max_chars: int = 300) -> list[str]:
+        """Break text into chunks at natural boundaries"""
+        chunks = []
+        current_chunk = ""
+
+        # Split on sentence boundaries first
+        sentences = text.replace(".", ".|").replace("!", "!|").replace("?", "?|").replace(";", ";|").split("|")
+
+        for sentence in sentences:
+            if not sentence.strip():
+                continue
+
+            # If the sentence is already too long, break on commas
+            if len(sentence) > max_chars:
+                parts = sentence.split(",")
+                for part in parts:
+                    if len(current_chunk) + len(part) <= max_chars:
+                        current_chunk += part + ","
+                    else:
+                        # If the part is still too long, break on whitespace
+                        if len(part) > max_chars:
+                            words = part.split()
+                            for word in words:
+                                if len(current_chunk) + len(word) > max_chars:
+                                    chunks.append(current_chunk.strip())
+                                    current_chunk = word + " "
+                                else:
+                                    current_chunk += word + " "
+                        else:
+                            chunks.append(current_chunk.strip())
+                            current_chunk = part + ","
+            else:
+                if len(current_chunk) + len(sentence) <= max_chars:
+                    current_chunk += sentence
+                else:
+                    chunks.append(current_chunk.strip())
+                    current_chunk = sentence
+
+        if current_chunk:
+            chunks.append(current_chunk.strip())
+
+        return chunks
+
+    def generate_speech(self, text: str, voice_name: str, speed: float = 1.0) -> tuple[np.ndarray, float]:
+        """Generate speech from text. Returns (audio_array, duration_seconds)"""
+        try:
+            if not text or not voice_name:
+                raise ValueError("Text and voice name are required")
+
+            start_time = time.time()
+
+            # Count tokens for the throughput metric
+            enc = tiktoken.get_encoding("cl100k_base")
+            total_tokens = len(enc.encode(text))
+
+            # Normalize text
+            text = normalize_text(text)
+            if not text:
+                raise ValueError("Text is empty after normalization")
+
+            # Load the voice and process everything within one GPU context
+            with torch.cuda.device(0):
+                torch.cuda.set_device(0)
+
+                voice_path = os.path.join(self.voices_dir, f"{voice_name}.pt")
+                if not os.path.exists(voice_path):
+                    raise ValueError(f"Voice not found: {voice_name}")
+
+                # Load the voice directly to the GPU
+                voicepack = torch.load(voice_path, map_location='cuda', weights_only=True)
+
+                # Break the text into chunks for better memory management
+                chunks = self.chunk_text(text)
+                print(f"Processing {len(chunks)} chunks...")
+
+                # Ensure the model is initialized and on the GPU
+                if self.model is None:
+                    print("Model not initialized, reinitializing...")
+                    if not self.initialize():
+                        raise ValueError("Failed to initialize model")
+                self._ensure_model_on_gpu()
+
+                # Process all chunks within the same GPU context
+                audio_chunks = []
+                for i, chunk in enumerate(chunks):
+                    chunk_start = time.time()
+                    chunk_audio = self._generate_audio(
+                        text=chunk,
+                        voicepack=voicepack,
+                        lang=voice_name[0],  # First letter selects the language variant
+                        speed=speed
+                    )
+                    chunk_time = time.time() - chunk_start
+                    print(f"Chunk {i+1}/{len(chunks)} processed in {chunk_time:.2f}s")
+                    audio_chunks.append(chunk_audio)
+
+            # Concatenate audio chunks
+            audio = np.concatenate(audio_chunks)
+
+            # Calculate metrics
+            total_time = time.time() - start_time
+            tokens_per_second = total_tokens / total_time
+
+            print("\nProcessing Metrics:")
+            print(f"Total tokens: {total_tokens}")
+            print(f"Total time: {total_time:.2f}s")
+            print(f"Tokens per second: {tokens_per_second:.2f}")
+
+            return audio, len(audio) / 24000  # Audio array and duration at 24 kHz
+
+        except Exception as e:
+            print(f"Error generating speech: {str(e)}")
+            raise
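To make the chunking strategy in `chunk_text` concrete, a small usage sketch (importing `tts_model` runs the module-level downloads above, so this assumes Hub access; the sample text is illustrative):

```python
from tts_model import TTSModel

model = TTSModel()
text = "The Time Traveller was expounding a recondite matter to us. " * 20
chunks = model.chunk_text(text, max_chars=300)

# Chunks end on sentence boundaries and stay within the 300-character budget
print(f"{len(chunks)} chunks, longest: {max(len(c) for c in chunks)} chars")
```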