Plachta committed
Commit 76a6cb5 • 1 Parent(s): a50ee15

Update webui.py

Files changed (1)
  1. webui.py +118 -116
webui.py CHANGED
@@ -1,117 +1,119 @@
- import gradio as gr
- import torch
- import torchaudio
- import librosa
- import numpy as np
- import os
- from huggingface_hub import hf_hub_download
- import yaml
- from modules.commons import recursive_munch, build_model
-
- # setup device
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
-
- # load model
- def load_model(repo_id):
-     ckpt_path = hf_hub_download(repo_id, "pytorch_model.bin", cache_dir="./checkpoints")
-     config_path = hf_hub_download(repo_id, "config.yml", cache_dir="./checkpoints")
-
-     config = yaml.safe_load(open(config_path))
-     model_params = recursive_munch(config['model_params'])
-
-     if "redecoder" in repo_id:
-         model = build_model(model_params, stage="redecoder")
-     else:
-         model = build_model(model_params, stage="codec")
-
-     ckpt_params = torch.load(ckpt_path, map_location="cpu")
-
-     for key in model:
-         model[key].load_state_dict(ckpt_params[key])
-         model[key].eval()
-         model[key].to(device)
-
-     return model
-
-
- # load models
- codec_model = load_model("Plachta/FAcodec")
- redecoder_model = load_model("Plachta/FAcodec-redecoder")
-
-
- # preprocess audio
- def preprocess_audio(audio_path, sr=24000):
-     audio = librosa.load(audio_path, sr=sr)[0]
-     # if audio has two channels, take the first one
-     if len(audio.shape) > 1:
-         audio = audio[0]
-     audio = audio[:sr * 30]  # crop only the first 30 seconds
-     return torch.tensor(audio).unsqueeze(0).float().to(device)
-
-
- # audio reconstruction function
- @torch.no_grad()
- def reconstruct_audio(audio):
-     source_audio = preprocess_audio(audio)
-
-     z = codec_model.encoder(source_audio[None, ...])
-     z, _, _, _, _ = codec_model.quantizer(z, source_audio[None, ...], n_c=2)
-
-     reconstructed_wave = codec_model.decoder(z)
-
-     return (24000, reconstructed_wave[0, 0].cpu().numpy())
-
-
- # voice conversion function
- @torch.no_grad()
- def voice_conversion(source_audio, target_audio):
-     source_audio = preprocess_audio(source_audio)
-     target_audio = preprocess_audio(target_audio)
-
-     z = codec_model.encoder(source_audio[None, ...])
-     z, _, _, _, timbre, codes = codec_model.quantizer(z, source_audio[None, ...], n_c=2, return_codes=True)
-
-     z_target = codec_model.encoder(target_audio[None, ...])
-     _, _, _, _, timbre_target, _ = codec_model.quantizer(z_target, target_audio[None, ...], n_c=2, return_codes=True)
-
-     z_converted = redecoder_model.encoder(codes[0], codes[1], timbre_target, use_p_code=False, n_c=1)
-     converted_wave = redecoder_model.decoder(z_converted)
-
-     return (24000, converted_wave[0, 0].cpu().numpy())
-
-
- # gradio interface
- def gradio_interface():
-     with gr.Blocks() as demo:
-         gr.Markdown(
-             "# FAcodec reconstruction and voice conversion"
-             "[![GitHub stars](https://img.shields.io/github/stars/username/repo-name.svg?style=social&label=Star&maxAge=2592000)](https://github.com/Plachtaa/FAcodec)"
-             "FAcodec from [Natural Speech 3](https://arxiv.org/pdf/2403.03100). The checkpoint used in this demo is trained on an improved pipeline of "
-             "where all kinds of annotations are not required, enabling the scale up of training data. <br>This model is "
-             "trained on 50k hours of data with over 1 million speakers, largely improved timbre diversity compared to "
-             "the [original FAcodec](https://huggingface.co/spaces/amphion/naturalspeech3_facodec)."
-             "<br><br>This project is supported by [Amphion](https://github.com/open-mmlab/Amphion)"
-         )
-
-         with gr.Tab("reconstruction"):
-             with gr.Row():
-                 input_audio = gr.Audio(type="filepath", label="Input audio")
-                 output_audio = gr.Audio(label="Reconstructed audio")
-             reconstruct_btn = gr.Button("Reconstruct")
-             reconstruct_btn.click(reconstruct_audio, inputs=[input_audio], outputs=[output_audio])
-
-         with gr.Tab("voice conversion"):
-             with gr.Row():
-                 source_audio = gr.Audio(type="filepath", label="Source audio")
-                 target_audio = gr.Audio(type="filepath", label="Reference audio")
-                 converted_audio = gr.Audio(label="Converted audio")
-             convert_btn = gr.Button("Convert")
-             convert_btn.click(voice_conversion, inputs=[source_audio, target_audio], outputs=[converted_audio])
-
-     return demo
-
-
- if __name__ == "__main__":
-     iface = gradio_interface()
+ import gradio as gr
+ import torch
+ import torchaudio
+ import librosa
+ import numpy as np
+ import os
+ from huggingface_hub import hf_hub_download
+ import yaml
+ from modules.commons import recursive_munch, build_model
+
+ # setup device
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+
+ # load model
+ def load_model(repo_id):
+     ckpt_path = hf_hub_download(repo_id, "pytorch_model.bin", cache_dir="./checkpoints")
+     config_path = hf_hub_download(repo_id, "config.yml", cache_dir="./checkpoints")
+
+     config = yaml.safe_load(open(config_path))
+     model_params = recursive_munch(config['model_params'])
+
+     if "redecoder" in repo_id:
+         model = build_model(model_params, stage="redecoder")
+     else:
+         model = build_model(model_params, stage="codec")
+
+     ckpt_params = torch.load(ckpt_path, map_location="cpu")
+
+     for key in model:
+         model[key].load_state_dict(ckpt_params[key])
+         model[key].eval()
+         model[key].to(device)
+
+     return model
+
+
+ # load models
+ codec_model = load_model("Plachta/FAcodec")
+ redecoder_model = load_model("Plachta/FAcodec-redecoder")
+
+
+ # preprocess audio
+ def preprocess_audio(audio_path, sr=24000):
+     audio = librosa.load(audio_path, sr=sr)[0]
+     # if audio has two channels, take the first one
+     if len(audio.shape) > 1:
+         audio = audio[0]
+     audio = audio[:sr * 30]  # crop only the first 30 seconds
+     return torch.tensor(audio).unsqueeze(0).float().to(device)
+
+
+ # audio reconstruction function
+ @torch.no_grad()
+ def reconstruct_audio(audio):
+     source_audio = preprocess_audio(audio)
+
+     z = codec_model.encoder(source_audio[None, ...])
+     z, _, _, _, _ = codec_model.quantizer(z, source_audio[None, ...], n_c=2)
+
+     reconstructed_wave = codec_model.decoder(z)
+
+     return (24000, reconstructed_wave[0, 0].cpu().numpy())
+
+
+ # voice conversion function
+ @torch.no_grad()
+ def voice_conversion(source_audio, target_audio):
+     source_audio = preprocess_audio(source_audio)
+     target_audio = preprocess_audio(target_audio)
+
+     z = codec_model.encoder(source_audio[None, ...])
+     z, _, _, _, timbre, codes = codec_model.quantizer(z, source_audio[None, ...], n_c=2, return_codes=True)
+
+     z_target = codec_model.encoder(target_audio[None, ...])
+     _, _, _, _, timbre_target, _ = codec_model.quantizer(z_target, target_audio[None, ...], n_c=2, return_codes=True)
+
+     z_converted = redecoder_model.encoder(codes[0], codes[1], timbre_target, use_p_code=False, n_c=1)
+     converted_wave = redecoder_model.decoder(z_converted)
+
+     return (24000, converted_wave[0, 0].cpu().numpy())
+
+
+ # gradio interface
+ def gradio_interface():
+     with gr.Blocks() as demo:
+         gr.Markdown(
+             "# FAcodec reconstruction and voice conversion"
+             "[![GitHub stars](https://img.shields.io/github/stars/Plachtaa/FAcodec)](https://github.com/Plachtaa/FAcodec)"
+         )
+         gr.Markdown(
+             "FAcodec from [NaturalSpeech 3](https://arxiv.org/pdf/2403.03100). <br>The checkpoint used in this demo is trained with an improved pipeline "
+             "that requires no annotations of any kind, making it possible to scale up the training data. <br>This model is "
+             "trained on 50k hours of 24 kHz speech data from over 1 million speakers, with largely improved timbre diversity compared to "
+             "the [original FAcodec](https://huggingface.co/spaces/amphion/naturalspeech3_facodec)."
+             "<br><br>This project is supported by [Amphion](https://github.com/open-mmlab/Amphion)."
+         )
+
+         with gr.Tab("reconstruction"):
+             with gr.Row():
+                 input_audio = gr.Audio(type="filepath", label="Input audio")
+                 output_audio = gr.Audio(label="Reconstructed audio")
+             reconstruct_btn = gr.Button("Reconstruct")
+             reconstruct_btn.click(reconstruct_audio, inputs=[input_audio], outputs=[output_audio])
+
+         with gr.Tab("voice conversion"):
+             with gr.Row():
+                 source_audio = gr.Audio(type="filepath", label="Source audio")
+                 target_audio = gr.Audio(type="filepath", label="Reference audio")
+                 converted_audio = gr.Audio(label="Converted audio")
+             convert_btn = gr.Button("Convert")
+             convert_btn.click(voice_conversion, inputs=[source_audio, target_audio], outputs=[converted_audio])
+
+     return demo
+
+
+ if __name__ == "__main__":
+     iface = gradio_interface()
      iface.launch()
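
For quick verification of the updated file, here is a minimal smoke-test sketch (not part of the commit) that drives both entry points without the Gradio UI. It assumes the two Hugging Face checkpoints download successfully and that source.wav and reference.wav exist locally; soundfile is an arbitrary choice of audio writer, not a dependency of webui.py.

# smoke_test.py: hypothetical usage sketch, not part of this commit
import soundfile as sf  # arbitrary writer choice; webui.py itself does not use it

# importing webui runs the module-level load_model() calls,
# downloading both checkpoints on first use
from webui import reconstruct_audio, voice_conversion

sr, recon = reconstruct_audio("source.wav")  # returns (24000, float32 waveform)
sf.write("reconstructed.wav", recon, sr)

sr, converted = voice_conversion("source.wav", "reference.wav")
sf.write("converted.wav", converted, sr)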