Vladimir Alabov committed on
Commit 9376c53
1 Parent(s): 30e6a40
Files changed (3)
  1. app.py +56 -18
  2. models/chapaev/G_5400.pth +3 -0
  3. models/chapaev/config.json +104 -0
app.py CHANGED
@@ -46,19 +46,56 @@ def create_vc_fn(model, sid):
         return "Success", (44100, out_audio.cpu().numpy())
     return vc_fn
 
+def get_speakers():
+    speakers = []
+
+    for _,dirs,_ in os.walk(CUSTOM_MODELS_DIR):
+        for folder in dirs:
+            cur_speaker = {}
+            # Look for G_****.pth
+            g = glob.glob(os.path.join(CUSTOM_MODELS_DIR,folder,'G_*.pth'))
+            if not len(g):
+                continue
+            cur_speaker["model_path"] = g[0]
+            cur_speaker["model_folder"] = folder
+
+            # Look for *.pt (clustering model)
+            clst = glob.glob(os.path.join(CUSTOM_MODELS_DIR,folder,'*.pt'))
+            if not len(clst):
+                cur_speaker["cluster_path"] = ""
+            else:
+                cur_speaker["cluster_path"] = clst[0]
+
+            # Look for config.json
+            cfg = glob.glob(os.path.join(CUSTOM_MODELS_DIR,folder,'*.json'))
+            if not len(cfg):
+                continue
+            cur_speaker["cfg_path"] = cfg[0]
+            with open(cur_speaker["cfg_path"]) as f:
+                try:
+                    cfg_json = json.loads(f.read())
+                except Exception as e:
+                    print("Malformed config json in "+folder)
+                for name, i in cfg_json["spk"].items():
+                    cur_speaker["name"] = name
+                    cur_speaker["id"] = i
+                    if not name.startswith('.'):
+                        speakers.append(copy.copy(cur_speaker))
+
+    return sorted(speakers, key=lambda x:x["name"].lower())
+
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('--device', type=str, default='cpu')
     parser.add_argument('--api', action="store_true", default=False)
     parser.add_argument("--share", action="store_true", default=False, help="share gradio app")
     args = parser.parse_args()
+
+    speakers = get_speakers()
+    speaker_list = [x["name"] for x in speakers]
+
     models = []
     voices = []
-    for f in os.listdir("models"):
-        name = f
-        # = Svc(fr"models/{f}/{f}.pth", f"models/{f}/config.json", device=args.device)
-        #cover = f"models/{f}/cover.png" if os.path.exists(f"models/{f}/cover.png") else None
-        #models.append((name, cover, create_vc_fn(model, name)))
 
     # !svc infer {NAME}.wav -c config.json -m G_riri_220.pth
     # display(Audio(f"{NAME}.out.wav", autoplay=True))
@@ -67,17 +104,18 @@ if __name__ == '__main__':
         "# <center> Sovits Chapay\n"
         "## <center> The input audio should be clean and pure voice without background music.\n"
     )
-    with gr.Tabs():
-        for (name, cover, vc_fn) in models:
-            with gr.TabItem(name):
-                with gr.Row():
-                    with gr.Column():
-                        vc_input = gr.Audio(label="Input audio"+' (less than 20 seconds)' if limitation else '')
-                        vc_transform = gr.Number(label="vc_transform", value=0)
-                        auto_f0 = gr.Checkbox(label="auto_f0", value=False)
-                        vc_submit = gr.Button("Generate", variant="primary")
-                    with gr.Column():
-                        vc_output1 = gr.Textbox(label="Output Message")
-                        vc_output2 = gr.Audio(label="Output Audio")
-                        vc_submit.click(vc_fn, [vc_input, vc_transform, auto_f0], [vc_output1, vc_output2])
+
+    with gr.Row():
+        with gr.Column():
+            vc_input = gr.Audio(label="Input audio"+' (less than 20 seconds)' if limitation else '')
+
+            vc_transform = gr.Number(label="vc_transform", value=0)
+
+            voice = gr.Dropdown(choices=speaker_list, visible=True)
+
+            vc_submit = gr.Button("Generate", variant="primary")
+        with gr.Column():
+            vc_output1 = gr.Textbox(label="Output Message")
+            vc_output2 = gr.Audio(label="Output Audio")
+            vc_submit.click(vc_fn, [vc_input, vc_transform, auto_f0], [vc_output1, vc_output2])
     app.queue(concurrency_count=1, api_open=args.api).launch(share=args.share)
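Note: as committed, the rewritten `vc_submit.click(...)` still passes `vc_fn` and `auto_f0`, but both names only existed in the removed per-model tab loop (`models` is now never populated and the `auto_f0` checkbox is gone), so the button wiring would raise a `NameError` at startup; the `except` branch in `get_speakers()` likewise prints a warning but then proceeds with `cfg_json` undefined. A minimal sketch of how the handler could be rewired to the new `voice` dropdown, reusing the `Svc(model_path, config_path, device=...)` loader pattern from the commented-out code and assuming `vc_fn` keeps its `(input_audio, transform, auto_f0)` signature from the removed UI (the `convert` helper below is hypothetical, not part of this commit):

# Hypothetical glue, not part of this commit: resolve the dropdown's
# speaker name to the record produced by get_speakers(), load its
# checkpoint, and reuse the existing create_vc_fn() machinery.
def convert(input_audio, transform, speaker_name):
    speaker = next(s for s in speakers if s["name"] == speaker_name)
    model = Svc(speaker["model_path"], speaker["cfg_path"], device=args.device)
    vc_fn = create_vc_fn(model, speaker["name"])
    return vc_fn(input_audio, transform, False)  # the auto_f0 checkbox was dropped

vc_submit.click(convert, [vc_input, vc_transform, voice], [vc_output1, vc_output2])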
models/chapaev/G_5400.pth ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:33f2fd791a7a6dcd075a4c56fa992b8ef3ca1acc13aeeff2ef437a712e032fad
+size 548687709
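Note: the checkpoint itself lives in Git LFS; the committed blob is just the three-line pointer above (spec version, SHA-256 oid of the content, and size in bytes, about 523 MiB). A quick sketch for verifying that a fetched G_5400.pth matches the pointer, assuming the file has been downloaded to its repo path:

import hashlib
import os

# Values copied from the committed LFS pointer for models/chapaev/G_5400.pth.
path = "models/chapaev/G_5400.pth"
expected_oid = "33f2fd791a7a6dcd075a4c56fa992b8ef3ca1acc13aeeff2ef437a712e032fad"
expected_size = 548687709

assert os.path.getsize(path) == expected_size, "size mismatch"

sha = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        sha.update(chunk)
assert sha.hexdigest() == expected_oid, "sha256 mismatch"
print("G_5400.pth matches its LFS pointer")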
models/chapaev/config.json ADDED
@@ -0,0 +1,104 @@
+{
+  "train": {
+    "log_interval": 100,
+    "eval_interval": 200,
+    "seed": 1234,
+    "epochs": 10000,
+    "learning_rate": 0.0001,
+    "betas": [
+      0.8,
+      0.99
+    ],
+    "eps": 1e-09,
+    "batch_size": 16,
+    "fp16_run": false,
+    "bf16_run": false,
+    "lr_decay": 0.999875,
+    "segment_size": 10240,
+    "init_lr_ratio": 1,
+    "warmup_epochs": 0,
+    "c_mel": 45,
+    "c_kl": 1.0,
+    "use_sr": true,
+    "max_speclen": 512,
+    "port": "8001",
+    "keep_ckpts": 3,
+    "num_workers": 4,
+    "log_version": 0,
+    "ckpt_name_by_step": false,
+    "accumulate_grad_batches": 1
+  },
+  "data": {
+    "training_files": "filelists/44k/train.txt",
+    "validation_files": "filelists/44k/val.txt",
+    "max_wav_value": 32768.0,
+    "sampling_rate": 44100,
+    "filter_length": 2048,
+    "hop_length": 512,
+    "win_length": 2048,
+    "n_mel_channels": 80,
+    "mel_fmin": 0.0,
+    "mel_fmax": 22050,
+    "contentvec_final_proj": false
+  },
+  "model": {
+    "inter_channels": 192,
+    "hidden_channels": 192,
+    "filter_channels": 768,
+    "n_heads": 2,
+    "n_layers": 6,
+    "kernel_size": 3,
+    "p_dropout": 0.1,
+    "resblock": "1",
+    "resblock_kernel_sizes": [
+      3,
+      7,
+      11
+    ],
+    "resblock_dilation_sizes": [
+      [
+        1,
+        3,
+        5
+      ],
+      [
+        1,
+        3,
+        5
+      ],
+      [
+        1,
+        3,
+        5
+      ]
+    ],
+    "upsample_rates": [
+      8,
+      8,
+      2,
+      2,
+      2
+    ],
+    "upsample_initial_channel": 512,
+    "upsample_kernel_sizes": [
+      16,
+      16,
+      4,
+      4,
+      4
+    ],
+    "n_layers_q": 3,
+    "use_spectral_norm": false,
+    "gin_channels": 256,
+    "ssl_dim": 768,
+    "n_speakers": 200,
+    "type_": "hifi-gan",
+    "pretrained": {
+      "D_0.pth": "https://huggingface.co/datasets/ms903/sovits4.0-768vec-layer12/resolve/main/sovits_768l12_pre_large_320k/clean_D_320000.pth",
+      "G_0.pth": "https://huggingface.co/datasets/ms903/sovits4.0-768vec-layer12/resolve/main/sovits_768l12_pre_large_320k/clean_G_320000.pth"
+    }
+  },
+  "spk": {
+    "chapaev": 0
+  }
+}
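Note: this appears to be a standard so-vits-svc 4.0 config (768-dim ContentVec features per `ssl_dim`, 44.1 kHz HiFi-GAN vocoder, pretrained weights pulled from the ms903/sovits4.0-768vec-layer12 dataset). The parts app.py actually consumes are the `spk` table, which `get_speakers()` iterates to build the dropdown entries, and the 44100 Hz sampling rate, which matches the hard-coded `(44100, out_audio.cpu().numpy())` return in `create_vc_fn`. That lookup in isolation, as a sketch:

import json

# Read the speaker table the same way get_speakers() does: the "spk"
# block maps display names to integer speaker ids.
with open("models/chapaev/config.json") as f:
    cfg = json.load(f)

for name, sid in cfg["spk"].items():
    print(name, sid)                     # -> chapaev 0

print(cfg["data"]["sampling_rate"])      # -> 44100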