import os
import io
import gradio as gr
import librosa
import numpy as np
import logging
import soundfile
import torchaudio
import asyncio
import argparse
import subprocess
import gradio.processing_utils as gr_processing_utils
logging.getLogger('numba').setLevel(logging.WARNING)
logging.getLogger('markdown_it').setLevel(logging.WARNING)
logging.getLogger('urllib3').setLevel(logging.WARNING)
logging.getLogger('matplotlib').setLevel(logging.WARNING)
limitation = os.getenv("SYSTEM") == "spaces" # limit audio length in huggingface spaces
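
# Monkey-patch gr.Audio.postprocess so that output audio is returned to the
# browser as a base64-encoded data URL instead of a temporary file path.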
audio_postprocess_ori = gr.Audio.postprocess
def audio_postprocess(self, y):
    data = audio_postprocess_ori(self, y)
    if data is None:
        return None
    return gr_processing_utils.encode_url_or_file_to_base64(data["name"])

gr.Audio.postprocess = audio_postprocess
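
# Kept for reference but not wired into the UI: in-process conversion via a
# preloaded `model` object (`model` and `sid` are not defined in this script).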
def unused_vc_fn(input_audio, vc_transform, voice):
    if input_audio is None:
        return "You need to upload an audio", None
    sampling_rate, audio = input_audio
    duration = audio.shape[0] / sampling_rate
    if duration > 20 and limitation:
        return "Please upload an audio file that is less than 20 seconds. If you need to generate a longer audio file, please use Colab.", None
    audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
    if len(audio.shape) > 1:
        audio = librosa.to_mono(audio.transpose(1, 0))
    if sampling_rate != 16000:
        audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
    raw_path = io.BytesIO()
    soundfile.write(raw_path, audio, 16000, format="wav")
    raw_path.seek(0)
    out_audio, out_sr = model.infer(sid, vc_transform, raw_path,
                                    auto_predict_f0=True,
                                    )
    return "Success", (44100, out_audio.cpu().numpy())
def run_inference(input_audio, speaker):
    if input_audio is None:
        return "You need to upload an audio", None
    sampling_rate, audio = input_audio
    duration = audio.shape[0] / sampling_rate
    if duration > 20 and limitation:
        return "Please upload an audio file that is less than 20 seconds. If you need to generate a longer audio file, please use Colab.", None
    # Normalise integer PCM to float32 in [-1, 1], downmix to mono, resample to 16 kHz.
    audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
    if len(audio.shape) > 1:
        audio = librosa.to_mono(audio.transpose(1, 0))
    if sampling_rate != 16000:
        audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
    # TODO: expose these settings in the GUI
    cluster_ratio = 1
    noise_scale = 2
    is_pitch_prediction_enabled = True
    f0_method = "dio"
    transpose = 0
    model_path = f"./models/{speaker}/{speaker}.pth"
    config_path = f"./models/{speaker}/config.json"
    cluster_path = ""
    raw_path = 'tmp.wav'
    soundfile.write(raw_path, audio, 16000, format="wav")
    inference_cmd = f"svc infer {raw_path} -m {model_path} -c {config_path} {f'-k {cluster_path} -r {cluster_ratio}' if cluster_path != '' and cluster_ratio > 0 else ''} -t {transpose} --f0-method {f0_method} -n {noise_scale} -o out.wav {'' if is_pitch_prediction_enabled else '--no-auto-predict-f0'}"
    print(inference_cmd)
    result = subprocess.run(
        inference_cmd.split(),
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True
    )
    # Surface the CLI log so failures are visible in the Space logs.
    print(result.stdout)
    # Return the first channel at the sample rate reported by the output file.
    out_audio, sr = torchaudio.load('out.wav')
    return "Success", (sr, out_audio[0].cpu().numpy())
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', type=str, default='cpu')
    parser.add_argument('--api', action="store_true", default=False)
    parser.add_argument("--share", action="store_true", default=False, help="share gradio app")
    args = parser.parse_args()

    speakers = ["chapaev", "petka", "anka"]
    models = []
    voices = []

    # !svc infer {NAME}.wav -c config.json -m G_riri_220.pth
    # display(Audio(f"{NAME}.out.wav", autoplay=True))
    with gr.Blocks() as app:
        gr.Markdown(
            "# <center> Sovits Chapay\n"
        )
        with gr.Row():
            with gr.Column():
                vc_input = gr.Audio(label="Input audio" + (" (less than 20 seconds)" if limitation else ""))
                speaker = gr.Dropdown(label="Speaker", choices=speakers, visible=True)
                vc_submit = gr.Button("Generate", variant="primary")
            with gr.Column():
                vc_output1 = gr.Textbox(label="Output Message")
                vc_output2 = gr.Audio(label="Output Audio")
        vc_submit.click(run_inference, [vc_input, speaker], [vc_output1, vc_output2])
    app.queue(concurrency_count=1, api_open=args.api).launch(share=args.share)