ntt123 committed
Commit
7591e94
1 Parent(s): ffe1f9e

Support generating long clips

Files changed (1):
  1. app.py +36 -115
app.py CHANGED
@@ -30,102 +30,11 @@ assert phone_set[0][1:-1] == "SEP"
 assert "sil" in phone_set
 sil_idx = phone_set.index("sil")
 
-vietnamese_characters = [
-    "a",
-    "à",
-    "á",
-    "ả",
-    "ã",
-    "ạ",
-    "ă",
-    "ằ",
-    "ắ",
-    "ẳ",
-    "ẵ",
-    "ặ",
-    "â",
-    "ầ",
-    "ấ",
-    "ẩ",
-    "ẫ",
-    "ậ",
-    "e",
-    "è",
-    "é",
-    "ẻ",
-    "ẽ",
-    "ẹ",
-    "ê",
-    "ề",
-    "ế",
-    "ể",
-    "ễ",
-    "ệ",
-    "i",
-    "ì",
-    "í",
-    "ỉ",
-    "ĩ",
-    "ị",
-    "o",
-    "ò",
-    "ó",
-    "ỏ",
-    "õ",
-    "ọ",
-    "ô",
-    "ồ",
-    "ố",
-    "ổ",
-    "ỗ",
-    "ộ",
-    "ơ",
-    "ờ",
-    "ớ",
-    "ở",
-    "ỡ",
-    "ợ",
-    "u",
-    "ù",
-    "ú",
-    "ủ",
-    "ũ",
-    "ụ",
-    "ư",
-    "ừ",
-    "ứ",
-    "ử",
-    "ữ",
-    "ự",
-    "y",
-    "ỳ",
-    "ý",
-    "ỷ",
-    "ỹ",
-    "ỵ",
-    "b",
-    "c",
-    "d",
-    "đ",
-    "g",
-    "h",
-    "k",
-    "l",
-    "m",
-    "n",
-    "p",
-    "q",
-    "r",
-    "s",
-    "t",
-    "v",
-    "x",
-]
-alphabet = "".join(vietnamese_characters)
 space_re = regex.compile(r"\s+")
 number_re = regex.compile("([0-9]+)")
 digits = ["không", "một", "hai", "ba", "bốn", "năm", "sáu", "bảy", "tám", "chín"]
 num_re = regex.compile(r"([0-9.,]*[0-9])")
+alphabet = "aàáảãạăằắẳẵặâầấẩẫậeèéẻẽẹêềếểễệiìíỉĩịoòóỏõọôồốổỗộơờớởỡợuùúủũụưừứửữựyỳýỷỹỵbcdđghklmnpqrstvx"
 keep_text_and_num_re = regex.compile(rf"[^\s{alphabet}.,0-9]")
 keep_text_re = regex.compile(rf"[^\s{alphabet}]")
 
@@ -225,7 +134,7 @@ def text_to_phone_idx(text):
     return tokens
 
 
-def text_to_speech(text):
+def text_to_speech(duration_net, generator, text):
     # prevent too long text
     if len(text) > 500:
         text = text[:500]
@@ -237,9 +146,6 @@ def text_to_speech(text):
     }
 
     # predict phoneme duration
-    duration_net = DurationNet(hps.data.vocab_size, 64, 4).to(device)
-    duration_net.load_state_dict(torch.load(duration_model_path, map_location=device))
-    duration_net = duration_net.eval()
     phone_length = torch.from_numpy(batch["phone_length"].copy()).long().to(device)
     phone_idx = torch.from_numpy(batch["phone_idx"].copy()).long().to(device)
     with torch.inference_mode():
@@ -249,24 +155,7 @@
     )
     phone_duration = torch.where(phone_idx == 0, 0, phone_duration)
 
-    generator = SynthesizerTrn(
-        hps.data.vocab_size,
-        hps.data.filter_length // 2 + 1,
-        hps.train.segment_size // hps.data.hop_length,
-        **vars(hps.model),
-    ).to(device)
-    del generator.enc_q
-    ckpt = torch.load(lightspeed_model_path, map_location=device)
-    params = {}
-    for k, v in ckpt["net_g"].items():
-        k = k[7:] if k.startswith("module.") else k
-        params[k] = v
-    generator.load_state_dict(params, strict=False)
-    del ckpt, params
-    generator = generator.eval()
-    # mininum 1 frame for each phone
-    # phone_duration = torch.clamp_min(phone_duration, hps.data.hop_length * 1000 / hps.data.sampling_rate)
-    # phone_duration = torch.where(phone_idx == 0, 0, phone_duration)
+    # generate waveform
     end_time = torch.cumsum(phone_duration, dim=-1)
     start_time = end_time - phone_duration
     start_frame = start_time / 1000 * hps.data.sampling_rate / hps.data.hop_length
@@ -285,8 +174,40 @@
     return (wave * (2**15)).astype(np.int16)
 
 
+def load_models():
+    duration_net = DurationNet(hps.data.vocab_size, 64, 4).to(device)
+    duration_net.load_state_dict(torch.load(duration_model_path, map_location=device))
+    duration_net = duration_net.eval()
+    generator = SynthesizerTrn(
+        hps.data.vocab_size,
+        hps.data.filter_length // 2 + 1,
+        hps.train.segment_size // hps.data.hop_length,
+        **vars(hps.model),
+    ).to(device)
+    del generator.enc_q
+    ckpt = torch.load(lightspeed_model_path, map_location=device)
+    params = {}
+    for k, v in ckpt["net_g"].items():
+        k = k[7:] if k.startswith("module.") else k
+        params[k] = v
+    generator.load_state_dict(params, strict=False)
+    del ckpt, params
+    generator = generator.eval()
+    return duration_net, generator
+
+
 def speak(text):
-    y = text_to_speech(text)
+    duration_net, generator = load_models()
+    paragraphs = text.split("\n")
+    clips = []  # list of audio clips
+    # silence = np.zeros(hps.data.sampling_rate // 4)
+    for paragraph in paragraphs:
+        paragraph = paragraph.strip()
+        if paragraph == "":
+            continue
+        clips.append(text_to_speech(duration_net, generator, paragraph))
+        # clips.append(silence)
+    y = np.concatenate(clips)
     return hps.data.sampling_rate, y
 
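One consequence of this split worth noting: `speak` still calls `load_models` on every request, so both networks are rebuilt from their checkpoints each time audio is generated. Below is a minimal sketch of one way to load them once and reuse them across calls, assuming the rest of app.py stays as in this commit; the `get_models` wrapper is an illustration, not part of the change.

from functools import lru_cache

import numpy as np


@lru_cache(maxsize=1)
def get_models():
    # Build DurationNet and SynthesizerTrn once; later calls hit the cache.
    # Illustrative wrapper around load_models() from this commit.
    return get_models.__wrapped__.load if False else load_models()


def speak(text):
    duration_net, generator = get_models()
    # Same long-clip strategy as the commit: synthesize each non-empty
    # paragraph separately, then concatenate the int16 waveforms.
    clips = [
        text_to_speech(duration_net, generator, p.strip())
        for p in text.split("\n")
        if p.strip()
    ]
    y = np.concatenate(clips)
    return hps.data.sampling_rate, y

With this wrapper, the first request pays the checkpoint-loading cost and later requests only run inference. Note that `np.concatenate` raises on an empty list, so input consisting only of blank lines would still need a guard.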