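"""Gradio demo: Persian text-to-speech with a fine-tuned SpeechT5 model.

Pipeline: Persian text -> phonemes (PersianG2p) -> SpeechT5 spectrogram,
conditioned on an x-vector speaker embedding -> HiFi-GAN vocoder ->
noise reduction -> loudness normalization -> audio returned to the UI.
"""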
import torch
import numpy as np
import soundfile as sf
import librosa
import random
from transformers import SpeechT5HifiGan
from datasets import load_dataset

# Reference clips for speaker embeddings come from the dataset's test split.
dataset = load_dataset('SeyedAli/Persian-Speech-Dataset')
dataset = dataset["test"]
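# Fix every RNG (torch, CUDA, numpy, random) and force deterministic cuDNN
# so repeated runs produce identical output.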
def set_seed(seed):
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(1)
# Load the fine-tuned Persian SpeechT5 model and its matching processor.
from transformers import AutoProcessor, AutoModelForTextToSpectrogram

processor = AutoProcessor.from_pretrained("Alidr79/speecht5_v2_best")
model = AutoModelForTextToSpectrogram.from_pretrained("Alidr79/speecht5_v2_best")
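# SpeechT5 conditions generation on a speaker embedding; a pretrained
# x-vector speaker-verification model from SpeechBrain provides it.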
from speechbrain.inference.classifiers import EncoderClassifier
import os

spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
device = "cuda" if torch.cuda.is_available() else "cpu"
speaker_model = EncoderClassifier.from_hparams(
    source=spk_model_name,
    run_opts={"device": device},
    savedir=os.path.join("/tmp", spk_model_name),
)
def create_speaker_embedding(waveform):
    # Encode the reference waveform, L2-normalize across the embedding
    # dimension, and return a flat numpy vector.
    with torch.no_grad():
        speaker_embeddings = speaker_model.encode_batch(torch.tensor(waveform))
        speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
        speaker_embeddings = speaker_embeddings.squeeze().cpu().numpy()
    return speaker_embeddings
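# HiFi-GAN vocoder converts predicted log-mel spectrograms to waveforms.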
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
from PersianG2p import Persian_g2p_converter
from scipy.io import wavfile

# Grapheme-to-phoneme converter; the acoustic model is fed phonemes, not raw text.
PersianG2Pconverter = Persian_g2p_converter(use_large=True)
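# Raw vocoder output can carry audible hiss; spectral-gating noise
# reduction (noisereduce) cleans it up before loudness normalization.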
import noisereduce as nr

def denoise_audio(audio, sr):
    # Perform noise reduction
    denoised_audio = nr.reduce_noise(y=audio, sr=sr)
    return denoised_audio
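# Loudness normalization: apply a constant gain so every clip averages
# the same dBFS level, keeping volume consistent across speakers.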
from pydub import AudioSegment

def match_target_amplitude(sound, target_dBFS):
    change_in_dBFS = target_dBFS - sound.dBFS
    return sound.apply_gain(change_in_dBFS)
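# End-to-end synthesis for one sentence: reference audio -> speaker
# embedding, text -> phonemes -> spectrogram -> waveform -> denoise ->
# normalize. Returns (sample_rate, int16 waveform).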
def tts_fn(slider_value, input_text):
    # Reference clip for the chosen speaker, resampled to the 16 kHz rate
    # expected by both the x-vector encoder and SpeechT5.
    audio_embedding = dataset[slider_value]['audio']['array']
    sample_rate_embedding = dataset[slider_value]['audio']['sampling_rate']
    if sample_rate_embedding != 16000:
        audio_embedding = librosa.resample(audio_embedding, orig_sr=sample_rate_embedding, target_sr=16_000)

    with torch.no_grad():
        speaker_embedding = create_speaker_embedding(audio_embedding)
        speaker_embedding = torch.tensor(speaker_embedding).unsqueeze(0)

    phonemes = PersianG2Pconverter.transliterate(input_text, tidy=False, secret=True)
    # text = "</s>"
    # for i in phonemes.replace(' .', '').split(" "):
    #     text += i + " <pad> "
    text = phonemes
    print("sentence:", input_text)
    print("sentence phonemes:", text)

    with torch.no_grad():
        inputs = processor(text=text, return_tensors="pt")
        # Decode until the stop token fires, bounded by the min/max length ratios.
        spectrogram = model.generate_speech(
            inputs["input_ids"], speaker_embedding,
            minlenratio=2, maxlenratio=4, threshold=0.3,
        )
        speech = vocoder(spectrogram)

    speech = speech.numpy().reshape(-1)
    speech_denoised = denoise_audio(speech, 16000)

    # Denoise -> write to disk -> loudness-normalize to -20 dBFS -> reload.
    sf.write("in_speech.wav", speech_denoised, 16000)
    sound = AudioSegment.from_wav("in_speech.wav")
    normalized_sound = match_target_amplitude(sound, -20.0)
    normalized_sound.export("out_sound.wav", format="wav")

    sample_rate_out, audio_out = wavfile.read("out_sound.wav")
    assert sample_rate_out == 16_000
    return 16000, audio_out.reshape(-1).astype(np.int16)
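# Top-level handler: split the input on '.' and synthesize sentence by
# sentence (shorter utterances are presumably more reliable for the model),
# then concatenate the audio chunks.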
def master_fn(slider_value, input_text):
    # Guarantee at least one terminator so the split below yields a sentence.
    if "." not in input_text:
        input_text += '.'
    print(f"speaker_id = {slider_value}")

    all_speech = []
    sampling_rate_response = 16000
    for sentence in input_text.split("."):
        # Skip empty or whitespace-only fragments between periods.
        if sentence.strip():
            sampling_rate_response, audio_chunk_response = tts_fn(slider_value, sentence)
            all_speech.append(audio_chunk_response)
    audio_response = np.concatenate(all_speech)
    return sampling_rate_response, audio_response
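# Gradio UI: a speaker-index slider and a free-text box, wired to master_fn.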
import gradio as gr

slider = gr.Slider(
    minimum=0,
    maximum=(len(dataset) - 1),
    value=86,
    step=1,
    label="Select a speaker (Good examples: 86, 7)"
)

# Create the text input component
text_input = gr.Textbox(
    label="Enter some text",
    placeholder="Type something here..."
)

demo = gr.Interface(
    fn=master_fn,
    inputs=[slider, text_input],  # list of inputs
    outputs="audio",
)

demo.launch()
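# Note: outside Hugging Face Spaces, demo.launch(share=True) can expose a
# temporary public URL; the bare launch() above is what the Space itself uses.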