Vladimir Alabov committed on
Commit 9376c53
1 Parent(s): 30e6a40
Files changed (3)
  1. app.py +56 -18
  2. models/chapaev/G_5400.pth +3 -0
  3. models/chapaev/config.json +104 -0
app.py CHANGED
@@ -46,19 +46,56 @@ def create_vc_fn(model, sid):
         return "Success", (44100, out_audio.cpu().numpy())
     return vc_fn
 
+def get_speakers():
+    speakers = []
+
+    for _,dirs,_ in os.walk(CUSTOM_MODELS_DIR):
+        for folder in dirs:
+            cur_speaker = {}
+            # Look for G_****.pth
+            g = glob.glob(os.path.join(CUSTOM_MODELS_DIR,folder,'G_*.pth'))
+            if not len(g):
+                continue
+            cur_speaker["model_path"] = g[0]
+            cur_speaker["model_folder"] = folder
+
+            # Look for *.pt (clustering model)
+            clst = glob.glob(os.path.join(CUSTOM_MODELS_DIR,folder,'*.pt'))
+            if not len(clst):
+                cur_speaker["cluster_path"] = ""
+            else:
+                cur_speaker["cluster_path"] = clst[0]
+
+            # Look for config.json
+            cfg = glob.glob(os.path.join(CUSTOM_MODELS_DIR,folder,'*.json'))
+            if not len(cfg):
+                continue
+            cur_speaker["cfg_path"] = cfg[0]
+            with open(cur_speaker["cfg_path"]) as f:
+                try:
+                    cfg_json = json.loads(f.read())
+                except Exception as e:
+                    print("Malformed config json in "+folder)
+                for name, i in cfg_json["spk"].items():
+                    cur_speaker["name"] = name
+                    cur_speaker["id"] = i
+                    if not name.startswith('.'):
+                        speakers.append(copy.copy(cur_speaker))
+
+    return sorted(speakers, key=lambda x:x["name"].lower())
+
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('--device', type=str, default='cpu')
     parser.add_argument('--api', action="store_true", default=False)
     parser.add_argument("--share", action="store_true", default=False, help="share gradio app")
     args = parser.parse_args()
+
+    speakers = get_speakers()
+    speaker_list = [x["name"] for x in speakers]
+
     models = []
     voices = []
-    for f in os.listdir("models"):
-        name = f
-        # = Svc(fr"models/{f}/{f}.pth", f"models/{f}/config.json", device=args.device)
-        #cover = f"models/{f}/cover.png" if os.path.exists(f"models/{f}/cover.png") else None
-        #models.append((name, cover, create_vc_fn(model, name)))
 
     # !svc infer {NAME}.wav -c config.json -m G_riri_220.pth
     # display(Audio(f"{NAME}.out.wav", autoplay=True))
@@ -67,17 +104,18 @@ if __name__ == '__main__':
         "# <center> Sovits Chapay\n"
         "## <center> The input audio should be clean and pure voice without background music.\n"
     )
-    with gr.Tabs():
-        for (name, cover, vc_fn) in models:
-            with gr.TabItem(name):
-                with gr.Row():
-                    with gr.Column():
-                        vc_input = gr.Audio(label="Input audio"+' (less than 20 seconds)' if limitation else '')
-                        vc_transform = gr.Number(label="vc_transform", value=0)
-                        auto_f0 = gr.Checkbox(label="auto_f0", value=False)
-                        vc_submit = gr.Button("Generate", variant="primary")
-                    with gr.Column():
-                        vc_output1 = gr.Textbox(label="Output Message")
-                        vc_output2 = gr.Audio(label="Output Audio")
-                        vc_submit.click(vc_fn, [vc_input, vc_transform, auto_f0], [vc_output1, vc_output2])
+
+    with gr.Row():
+        with gr.Column():
+            vc_input = gr.Audio(label="Input audio"+' (less than 20 seconds)' if limitation else '')
+
+            vc_transform = gr.Number(label="vc_transform", value=0)
+
+            voice = gr.Dropdown(choices=speaker_list, visible=True)
+
+            vc_submit = gr.Button("Generate", variant="primary")
+        with gr.Column():
+            vc_output1 = gr.Textbox(label="Output Message")
+            vc_output2 = gr.Audio(label="Output Audio")
+            vc_submit.click(vc_fn, [vc_input, vc_transform, auto_f0], [vc_output1, vc_output2])
     app.queue(concurrency_count=1, api_open=args.api).launch(share=args.share)
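Note: as committed, the rewritten `vc_submit.click(...)` still passes `vc_fn` and `auto_f0`, but both names only existed in the removed per-model tab loop (`models` is now never populated and the `auto_f0` checkbox is gone), so the button wiring would raise a `NameError` at startup; the `except` branch in `get_speakers()` likewise prints a warning but then proceeds with `cfg_json` undefined. A minimal sketch of how the handler could be rewired to the new `voice` dropdown, reusing the `Svc(model_path, config_path, device=...)` loader pattern from the commented-out code and assuming `vc_fn` keeps its `(input_audio, transform, auto_f0)` signature from the removed UI (the `convert` helper below is hypothetical, not part of this commit):

# Hypothetical glue, not part of this commit: resolve the dropdown's
# speaker name to the record produced by get_speakers(), load its
# checkpoint, and reuse the existing create_vc_fn() machinery.
def convert(input_audio, transform, speaker_name):
    speaker = next(s for s in speakers if s["name"] == speaker_name)
    model = Svc(speaker["model_path"], speaker["cfg_path"], device=args.device)
    vc_fn = create_vc_fn(model, speaker["name"])
    return vc_fn(input_audio, transform, False)  # the auto_f0 checkbox was dropped

vc_submit.click(convert, [vc_input, vc_transform, voice], [vc_output1, vc_output2])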
models/chapaev/G_5400.pth ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:33f2fd791a7a6dcd075a4c56fa992b8ef3ca1acc13aeeff2ef437a712e032fad
+size 548687709
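Note: the checkpoint itself lives in Git LFS; the committed blob is just the three-line pointer above (spec version, SHA-256 oid of the content, and size in bytes, about 523 MiB). A quick sketch for verifying that a fetched G_5400.pth matches the pointer, assuming the file has been downloaded to its repo path:

import hashlib
import os

# Values copied from the committed LFS pointer for models/chapaev/G_5400.pth.
path = "models/chapaev/G_5400.pth"
expected_oid = "33f2fd791a7a6dcd075a4c56fa992b8ef3ca1acc13aeeff2ef437a712e032fad"
expected_size = 548687709

assert os.path.getsize(path) == expected_size, "size mismatch"

sha = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        sha.update(chunk)
assert sha.hexdigest() == expected_oid, "sha256 mismatch"
print("G_5400.pth matches its LFS pointer")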
models/chapaev/config.json ADDED
@@ -0,0 +1,104 @@
+{
+  "train": {
+    "log_interval": 100,
+    "eval_interval": 200,
+    "seed": 1234,
+    "epochs": 10000,
+    "learning_rate": 0.0001,
+    "betas": [
+      0.8,
+      0.99
+    ],
+    "eps": 1e-09,
+    "batch_size": 16,
+    "fp16_run": false,
+    "bf16_run": false,
+    "lr_decay": 0.999875,
+    "segment_size": 10240,
+    "init_lr_ratio": 1,
+    "warmup_epochs": 0,
+    "c_mel": 45,
+    "c_kl": 1.0,
+    "use_sr": true,
+    "max_speclen": 512,
+    "port": "8001",
+    "keep_ckpts": 3,
+    "num_workers": 4,
+    "log_version": 0,
+    "ckpt_name_by_step": false,
+    "accumulate_grad_batches": 1
+  },
+  "data": {
+    "training_files": "filelists/44k/train.txt",
+    "validation_files": "filelists/44k/val.txt",
+    "max_wav_value": 32768.0,
+    "sampling_rate": 44100,
+    "filter_length": 2048,
+    "hop_length": 512,
+    "win_length": 2048,
+    "n_mel_channels": 80,
+    "mel_fmin": 0.0,
+    "mel_fmax": 22050,
+    "contentvec_final_proj": false
+  },
+  "model": {
+    "inter_channels": 192,
+    "hidden_channels": 192,
+    "filter_channels": 768,
+    "n_heads": 2,
+    "n_layers": 6,
+    "kernel_size": 3,
+    "p_dropout": 0.1,
+    "resblock": "1",
+    "resblock_kernel_sizes": [
+      3,
+      7,
+      11
+    ],
+    "resblock_dilation_sizes": [
+      [
+        1,
+        3,
+        5
+      ],
+      [
+        1,
+        3,
+        5
+      ],
+      [
+        1,
+        3,
+        5
+      ]
+    ],
+    "upsample_rates": [
+      8,
+      8,
+      2,
+      2,
+      2
+    ],
+    "upsample_initial_channel": 512,
+    "upsample_kernel_sizes": [
+      16,
+      16,
+      4,
+      4,
+      4
+    ],
+    "n_layers_q": 3,
+    "use_spectral_norm": false,
+    "gin_channels": 256,
+    "ssl_dim": 768,
+    "n_speakers": 200,
+    "type_": "hifi-gan",
+    "pretrained": {
+      "D_0.pth": "https://huggingface.co/datasets/ms903/sovits4.0-768vec-layer12/resolve/main/sovits_768l12_pre_large_320k/clean_D_320000.pth",
+      "G_0.pth": "https://huggingface.co/datasets/ms903/sovits4.0-768vec-layer12/resolve/main/sovits_768l12_pre_large_320k/clean_G_320000.pth"
+    }
+  },
+  "spk": {
+    "chapaev": 0
+  }
+}
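Note: this appears to be a standard so-vits-svc 4.0 config (768-dim ContentVec features per `ssl_dim`, 44.1 kHz HiFi-GAN vocoder, pretrained weights pulled from the ms903/sovits4.0-768vec-layer12 dataset). The parts app.py actually consumes are the `spk` table, which `get_speakers()` iterates to build the dropdown entries, and the 44100 Hz sampling rate, which matches the hard-coded `(44100, out_audio.cpu().numpy())` return in `create_vc_fn`. That lookup in isolation, as a sketch:

import json

# Read the speaker table the same way get_speakers() does: the "spk"
# block maps display names to integer speaker ids.
with open("models/chapaev/config.json") as f:
    cfg = json.load(f)

for name, sid in cfg["spk"].items():
    print(name, sid)                     # -> chapaev 0

print(cfg["data"]["sampling_rate"])      # -> 44100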