import os
import io
import subprocess
import tempfile
import gradio as gr
import librosa
import numpy as np
import logging
import soundfile
import argparse
import gradio.processing_utils as gr_processing_utils
logging.getLogger('numba').setLevel(logging.WARNING)
logging.getLogger('markdown_it').setLevel(logging.WARNING)
logging.getLogger('urllib3').setLevel(logging.WARNING)
logging.getLogger('matplotlib').setLevel(logging.WARNING)
limitation = os.getenv("SYSTEM") == "spaces" # limit audio length in huggingface spaces
INFERENCE_OUTPUT_DIRNAME = '/output/'
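# Patch gr.Audio.postprocess so output audio is returned as a base64
# data URL (presumably so the Spaces sandbox can serve it inline
# instead of from a temp-file path).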
audio_postprocess_ori = gr.Audio.postprocess

def audio_postprocess(self, y):
    data = audio_postprocess_ori(self, y)
    if data is None:
        return None
    return gr_processing_utils.encode_url_or_file_to_base64(data["name"])

gr.Audio.postprocess = audio_postprocess
# Legacy in-process inference path; unused by the UI below and kept for
# reference only (`model` and `sid` are never defined in this file).
def vc_fn(input_audio, vc_transform, voice):
    if input_audio is None:
        return "You need to upload an audio file", None
    sampling_rate, audio = input_audio
    duration = audio.shape[0] / sampling_rate
    if duration > 20 and limitation:
        return "Please upload an audio file shorter than 20 seconds. To convert longer audio, please use Colab.", None
    audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
    if len(audio.shape) > 1:
        audio = librosa.to_mono(audio.transpose(1, 0))
    if sampling_rate != 16000:
        audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
    raw_path = io.BytesIO()
    soundfile.write(raw_path, audio, 16000, format="wav")
    raw_path.seek(0)
    out_audio, out_sr = model.infer(sid, vc_transform, raw_path,
                                    auto_predict_f0=True)
    return "Success", (44100, out_audio.cpu().numpy())
def run_inference(input_audio, speaker):
    if input_audio is None:
        return "You need to upload an audio file", None
    sampling_rate, audio = input_audio
    duration = audio.shape[0] / sampling_rate
    if duration > 20 and limitation:
        return "Please upload an audio file shorter than 20 seconds. To convert longer audio, please use Colab.", None
    # Normalize to float32, downmix to mono, and resample to the 16 kHz
    # rate the model expects.
    audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
    if len(audio.shape) > 1:
        audio = librosa.to_mono(audio.transpose(1, 0))
    if sampling_rate != 16000:
        audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
    # TODO: expose these settings in the GUI
    cluster_ratio = 1
    noise_scale = 2
    is_pitch_prediction_enabled = True
    f0_method = "dio"
    transpose = 0
    # Each speaker's checkpoint and config are expected under /models/<name>/.
    model_path = f"/models/{speaker}/{speaker}.pth"
    config_path = f"/models/{speaker}/config.json"
    cluster_path = ""
    # The `svc` CLI reads its input from disk, so write the preprocessed
    # audio to a temporary wav file rather than an in-memory buffer.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        raw_path = tmp.name
    soundfile.write(raw_path, audio, 16000, format="wav")
    inference_cmd = f"svc infer {raw_path} -m {model_path} -c {config_path} {f'-k {cluster_path} -r {cluster_ratio}' if cluster_path != '' and cluster_ratio > 0 else ''} -t {transpose} --f0-method {f0_method} -n {noise_scale} -o {INFERENCE_OUTPUT_DIRNAME}/{os.path.basename(raw_path)} {'' if is_pitch_prediction_enabled else '--no-auto-predict-f0'}"
    result = subprocess.run(
        inference_cmd.split(),
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
    )
    print(result)
    if result.returncode != 0:
        return f"Inference failed:\n{result.stdout}", None
    # Read back the wav that `svc infer` wrote to the -o path and return
    # it to Gradio as (sample_rate, samples).
    out_path = os.path.join(INFERENCE_OUTPUT_DIRNAME, os.path.basename(raw_path))
    out_audio, out_sr = soundfile.read(out_path)
    return "Success", (out_sr, out_audio)
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', type=str, default='cpu')
    parser.add_argument('--api', action="store_true", default=False)
    parser.add_argument("--share", action="store_true", default=False, help="share gradio app")
    args = parser.parse_args()

    speakers = ["chapaev", "petka", "anka"]

    # Reference invocation from the notebook:
    # !svc infer {NAME}.wav -c config.json -m G_riri_220.pth
    # display(Audio(f"{NAME}.out.wav", autoplay=True))

    with gr.Blocks() as app:
        gr.Markdown(
            "# <center> Sovits Chapay\n"
        )
        with gr.Row():
            with gr.Column():
                vc_input = gr.Audio(label="Input audio" + (" (less than 20 seconds)" if limitation else ""))
                speaker = gr.Dropdown(label="Speaker", choices=speakers, visible=True)
                vc_submit = gr.Button("Generate", variant="primary")
            with gr.Column():
                vc_output1 = gr.Textbox(label="Output Message")
                vc_output2 = gr.Audio(label="Output Audio")
        vc_submit.click(run_inference, [vc_input, speaker], [vc_output1, vc_output2])
    app.queue(concurrency_count=1, api_open=args.api).launch(share=args.share)
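# Example invocations (assuming this file is saved as app.py):
#   python app.py --share        # launch with a public Gradio link
#   SYSTEM=spaces python app.py  # enforce the 20-second upload limit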