File size: 17,726 Bytes
021dd19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d7abd36
acb3192
252a6de
021dd19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b6627c1
 
 
8b7bfeb
021dd19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c77dda2
021dd19
 
55d3ee4
 
 
 
021dd19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d7abd36
021dd19
 
 
 
 
 
0d3dcb0
021dd19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0d3dcb0
 
 
 
 
 
 
 
 
021dd19
0d3dcb0
 
 
 
 
 
 
 
 
 
c77dda2
 
 
8f027c4
ad6295f
88c0862
 
 
0d3dcb0
021dd19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62f0943
 
641ccf5
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
import argparse
import json
import os
import re
import tempfile
import logging
logging.getLogger('numba').setLevel(logging.WARNING)
import ONNXVITS_infer
import librosa
import numpy as np
import torch
from torch import no_grad, LongTensor
import commons
import utils
import gradio as gr
import gradio.utils as gr_utils
import gradio.processing_utils as gr_processing_utils
from models import SynthesizerTrn
from text import text_to_sequence, _clean_text
from text.symbols import symbols
from mel_processing import spectrogram_torch
import translators.server as tss
import psutil
from datetime import datetime
from text.cleaners import japanese_cleaners
from gradio import routes
from typing import List, Type
import os

def audio_postprocess(self, y):
    """Turn an audio output value into the base64 payload sent to the client.

    Installed below as a replacement for ``gr.Audio.postprocess``.

    Args:
        y: ``None``, a URL, a ``(sample_rate, data)`` tuple, or any other value
            accepted by ``gr_processing_utils.create_tmp_copy_of_file``.

    Returns:
        ``None`` when *y* is ``None``; otherwise whatever
        ``gr_processing_utils.encode_url_or_file_to_base64`` returns for the
        temporary file created under ``self.temp_dir``.
    """
    if y is None:
        return None

    if gr_utils.validate_url(y):
        # Remote audio: fetch it into the component's temp directory.
        tmp = gr_processing_utils.download_to_file(y, dir=self.temp_dir)
    elif not isinstance(y, tuple):
        # Anything else is handed to gradio to be copied into the temp directory.
        tmp = gr_processing_utils.create_tmp_copy_of_file(y, dir=self.temp_dir)
    else:
        # Raw samples: write them into a fresh .wav file that outlives this call.
        rate, samples = y
        tmp = tempfile.NamedTemporaryFile(suffix=".wav", dir=self.temp_dir, delete=False)
        gr_processing_utils.audio_to_file(rate, samples, tmp.name)

    encoded = gr_processing_utils.encode_url_or_file_to_base64(tmp.name)
    return encoded


# Replace gr.Audio's postprocess method with the audio_postprocess function defined above.
gr.Audio.postprocess = audio_postprocess

limitation = os.getenv("SYSTEM") == "spaces"  # limit text and audio length in huggingface spaces
# Choices offered by the language dropdown; infer() rejects any other value.
languages = ['日本語', '简体中文', 'English']
# Choices offered by the character dropdown, in "<id>:<name>" form.
# infer() and infer_from_phoneme_dur() parse the integer before the colon as the speaker id.
characters = ['0:特别周', '1:无声铃鹿', '2:东海帝王', '3:丸善斯基',
              '4:富士奇迹', '5:小栗帽', '6:黄金船', '7:伏特加',
              '8:大和赤骥', '9:大树快车', '10:草上飞', '11:菱亚马逊',
              '12:目白麦昆', '13:神鹰', '14:好歌剧', '15:成田白仁',
              '16:鲁道夫象征', '17:气槽', '18:爱丽数码', '19:青云天空',
              '20:玉藻十字', '21:美妙姿势', '22:琵琶晨光', '23:重炮',
              '24:曼城茶座', '25:美普波旁', '26:目白雷恩', '27:菱曙',
              '28:雪之美人', '29:米浴', '30:艾尼斯风神', '31:爱丽速子',
              '32:爱慕织姬', '33:稻荷一', '34:胜利奖券', '35:空中神宫',
              '36:荣进闪耀', '37:真机伶', '38:川上公主', '39:黄金城市',
              '40:樱花进王', '41:采珠', '42:新光风', '43:东商变革',
              '44:超级小溪', '45:醒目飞鹰', '46:荒漠英雄', '47:东瀛佐敦',
              '48:中山庆典', '49:成田大进', '50:西野花', '51:春乌拉拉',
              '52:青竹回忆', '53:微光飞驹', '54:美丽周日', '55:待兼福来',
              '56:Mr.C.B', '57:名将怒涛', '58:目白多伯', '59:优秀素质',
              '60:帝王光环', '61:待兼诗歌剧', '62:生野狄杜斯', '63:目白善信',
              '64:大拓太阳神', '65:双涡轮', '66:里见光钻', '67:北部玄驹',
              '68:樱花千代王', '69:天狼星象征', '70:目白阿尔丹', '71:八重无敌',
              '72:鹤丸刚志', '73:目白光明', '74:樱花桂冠', '75:成田路',
              '76:也文摄辉', '77:吉兆', '78:谷野美酒', '79:第一红宝石',
              '80:真弓快车', '81:骏川手纲', '82:凯斯奇迹', '83:小林历奇',
              '84:北港火山', '85:奇锐骏', '86:秋川理事长']
def show_memory_info(hint):
    """Print the current process's resident set size in MB, prefixed with *hint*."""
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    rss_mb = rss_bytes / 1024.0 / 1024
    report = "{} 内存占用: {} MB".format(hint, rss_mb)
    print(report)

def text_to_phoneme(text, symbols, is_symbol):
    """Return the characters of *text* that belong to *symbols*, in order.

    Args:
        text: Input string. When *is_symbol* is false it is first passed
            through ``japanese_cleaners``; otherwise it is used as-is.
        symbols: Iterable of known (hashable) symbols.
        is_symbol: True when *text* is already written in symbols.

    Returns:
        A string made of every character of the (cleaned) text that is a
        member of *symbols*; all other characters are dropped.
    """
    # Only membership is needed, so a set replaces the original symbol->id dict.
    known_symbols = set(symbols)
    clean_text = text if is_symbol else japanese_cleaners(text)
    return "".join(ch for ch in clean_text if ch in known_symbols)

def get_text(text, hps, is_symbol):
    """Convert *text* into a LongTensor of symbol ids for the model.

    Args:
        text: Text (or symbol string) to encode.
        hps: Hyperparameter object; reads ``hps.symbols``,
            ``hps.data.text_cleaners`` and ``hps.data.add_blank``.
        is_symbol: When true, no text cleaners are applied.

    Returns:
        ``LongTensor`` of the ids produced by ``text_to_sequence``, with a 0
        interspersed via ``commons.intersperse`` when ``hps.data.add_blank``
        is set.
    """
    if is_symbol:
        cleaners = []
    else:
        cleaners = hps.data.text_cleaners
    token_ids = text_to_sequence(text, hps.symbols, cleaners)
    if hps.data.add_blank:
        token_ids = commons.intersperse(token_ids, 0)
    return LongTensor(token_ids)

# Load the model hyperparameters from the bundled config file.
hps = utils.get_hparams_from_file("./configs/uma87.json")
# Rebind the module-level name `symbols` (imported from text.symbols at the top
# of the file) to the symbol list carried by the config.
symbols = hps.symbols
# Build the synthesizer from the config values: symbol count, filter_length // 2 + 1,
# segment_size // hop_length, the number of speakers and the remaining model kwargs.
net_g = ONNXVITS_infer.SynthesizerTrn(
    len(hps.symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=hps.data.n_speakers,
    **hps.model)
# NOTE(review): presumably switches the network to inference mode (torch-style
# eval()) — confirm against ONNXVITS_infer.SynthesizerTrn.
_ = net_g.eval()

# Load the pretrained generator checkpoint into net_g; the return value is discarded.
_ = utils.load_checkpoint("pretrained_models/G_1153000.pth", net_g)

def to_symbol_fn(is_symbol_input, input_text, temp_text):
    """Compute the new (textbox, stored text) pair when the symbol checkbox toggles.

    When symbol input is switched on, the textbox receives the cleaned form of
    *input_text* and the original text is stored. When it is switched off, the
    stored *temp_text* is returned for both slots.
    """
    if not is_symbol_input:
        return temp_text, temp_text
    cleaned = _clean_text(input_text, hps.data.text_cleaners)
    return cleaned, input_text

def infer2(text_raw, character, language, duration, noise_scale, noise_scale_w, is_symbol):
    """Stub with the same signature as infer(); ignores all arguments and returns four ``None`` values."""
    return (None,) * 4

def _format_duration_info(phonemes, durations):
    """Render predicted durations as the editable ``(s0), p1:d1(s1), ...`` string.

    Even positions of *durations* are treated as spacings and odd positions as
    the successive characters of *phonemes*. The final entry is always written
    as ``(d)`` with no trailing separator. infer_from_phoneme_dur() parses this
    format back, so it must not change.
    """
    last = len(durations) - 1
    pieces = []
    for i in range(len(durations)):
        dur = int(durations[i])
        # Odd slots belong to phoneme number (i - 1) // 2; even slots are spacings.
        phoneme = phonemes[(i - 1) // 2] if i % 2 == 1 else None
        if i == last:
            pieces.append(f"({dur})")
        elif phoneme is None:
            pieces.append(f"({dur}), ")
        else:
            pieces.append(f"{phoneme}:{dur}")
    return "".join(pieces)


def infer(text_raw, character, language, duration, noise_scale, noise_scale_w, is_symbol):
    """Synthesize speech for *text_raw* in the voice of *character*.

    Args:
        text_raw: User text, or a symbol string when *is_symbol* is true.
        character: One of ``characters`` (``"<id>:<name>"``); the id selects the speaker.
        language: One of ``languages``. Non-Japanese text is translated to
            Japanese with ``tss.google`` unless *is_symbol* is true.
        duration: Passed to the model as ``length_scale``.
        noise_scale: Passed to the model as ``noise_scale``.
        noise_scale_w: Passed to the model as ``noise_scale_w``.
        is_symbol: True when *text_raw* is already written in model symbols.

    Returns:
        On success ``(text, (22050, audio), phonemes, duration_info_str)`` where
        ``audio`` is a numpy array. When the request is refused,
        ``(error_message, None, None, None)``.
    """
    # check language & character parameters
    if language not in languages:
        print("Error: No such language\n")
        return "Error: No such language", None, None, None
    if character not in characters:
        print("Error: No such character\n")
        return "Error: No such character", None, None, None
    # check text length (only enforced when running on huggingface spaces)
    if limitation:
        # Language tags such as [JA] do not count towards the limit for plain text.
        counted = text_raw if is_symbol else re.sub(r"\[([A-Z]{2})\]", "", text_raw)
        text_len = len(counted)
        max_len = 150 * 3 if is_symbol else 150
        if text_len > max_len:
            print(f"Refused: Text too long ({text_len}).")
            return "Error: Text is too long", None, None, None
        if text_len == 0:
            print("Refused: Text length is zero.")
            return "Error: Please input text!", None, None, None
    # Symbol input and Japanese text are used as-is; other languages are translated to Japanese.
    translate_from = {'简体中文': 'zh', 'English': 'en'}
    if is_symbol or language == '日本語':
        text = text_raw
    else:
        text = tss.google(text_raw, from_language=translate_from[language], to_language='ja')
    char_id = int(character.split(':')[0])
    stn_tst = get_text(text, hps, is_symbol)
    with torch.no_grad():
        x_tst = stn_tst.unsqueeze(0)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
        sid = torch.LongTensor([char_id])
        try:
            jp2phoneme = text_to_phoneme(text, hps.symbols, is_symbol)
            durations = net_g.predict_duration(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale,
                                               noise_scale_w=noise_scale_w, length_scale=duration)
            # Phoneme k is expected at index 2 * k + 1 of durations (get_text
            # intersperses blanks when hps.data.add_blank is set). Probing the
            # highest such index raises IndexError when the input produced
            # fewer model tokens than phonemes.
            if jp2phoneme:
                _ = durations[2 * len(jp2phoneme) - 1]
        except IndexError:
            print("Refused: Phoneme input contains non-phoneme character.")
            return "Error: You can only input phoneme under phoneme input model", None, None, None
        # convert duration information to string
        duration_info_str = _format_duration_info(jp2phoneme, durations)
        audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale,
                            noise_scale_w=noise_scale_w,
                            length_scale=duration)[0][0, 0].data.cpu().float().numpy()
    currentDateAndTime = datetime.now()
    print(f"\nCharacter {character} inference successful: {text}")
    if language != '日本語':
        print(f"translate from {language}: {text_raw}")
    show_memory_info(str(currentDateAndTime) + " infer调用后")
    # The success path must return the same 4-slot shape as the error paths above.
    return text, (22050, audio), jp2phoneme, duration_info_str

def infer_from_phoneme_dur(duration_info_str, character, duration, noise_scale, noise_scale_w):
    """Synthesize speech from a user-edited phoneme/duration string.

    Args:
        duration_info_str: Text in the ``(s0), p1:d1(s1), p2:d2(s2), ...`` form
            produced by infer(); every number must be an integer.
        character: One of ``characters`` (``"<id>:<name>"``); the id selects the speaker.
        duration: Passed to the model as ``length_scale``.
        noise_scale: Passed to the model as ``noise_scale``.
        noise_scale_w: Passed to the model as ``noise_scale_w``.

    Returns:
        ``(phonemes, (22050, audio))`` on success, or ``(error_message, None)``
        when the string cannot be parsed or the character is unknown.
    """
    recons_durs = []
    recons_phonemes = ""
    try:
        for i, item in enumerate(duration_info_str.split(", ")):
            if i == 0:
                # The first item is the leading spacing only: "(n)".
                recons_durs.append(int(item.strip("()")))
                continue
            # Every following item is "<phoneme>:<dur>(<spacing>)".
            phoneme_n_dur, spacing_dur = item.split("(")
            fields = phoneme_n_dur.split(":")
            recons_phonemes += fields[0]
            recons_durs.append(int(fields[1]))  # IndexError when the ':' is missing
            recons_durs.append(int(spacing_dur.strip(")")))
    except (ValueError, IndexError):
        # ValueError: non-integer number or wrong number of "(" in an item.
        # IndexError: an item without the "phoneme:duration" colon.
        return ("Error: Format must not be changed!", None)
    # Same character validation as infer(), so a bad value is reported instead of crashing.
    if character not in characters:
        print("Error: No such character\n")
        return ("Error: No such character", None)
    char_id = int(character.split(':')[0])
    stn_tst = get_text(recons_phonemes, hps, is_symbol=True)
    with torch.no_grad():
        x_tst = stn_tst.unsqueeze(0)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
        sid = torch.LongTensor([char_id])
        audio = net_g.infer_with_duration(x_tst, x_tst_lengths, w_ceil=recons_durs, sid=sid, noise_scale=noise_scale, noise_scale_w=noise_scale_w,
                            length_scale=duration)[0][0, 0].data.cpu().float().numpy()
    print(f"\nCharacter {character} inference successful: {recons_phonemes}, from {duration_info_str}")
    return (recons_phonemes, (22050, audio))

# JavaScript run in the browser by the "Download Audio" button.
# This is a str.format template: {audio_id} is replaced with the element id of
# the audio component, and the doubled braces ({{ }}) become literal JS braces.
# The script finds the <audio> element under that id (inside the gradio-app
# shadow root when there is one) and triggers a download of its src through a
# temporary <a> element with a random numeric .wav file name.
download_audio_js = """
() =>{{
    let root = document.querySelector("body > gradio-app");
    if (root.shadowRoot != null)
        root = root.shadowRoot;
    let audio = root.querySelector("#{audio_id}").querySelector("audio");
    if (audio == undefined)
        return;
    audio = audio.src;
    let oA = document.createElement("a");
    oA.download = Math.floor(Math.random()*100000000)+'.wav';
    oA.href = audio;
    document.body.appendChild(oA);
    oA.click();
    oA.remove();
}}
"""


if __name__ == "__main__":
    # Script entry point: build the Gradio UI and launch it.
    parser = argparse.ArgumentParser()
    parser.add_argument("--share", action="store_true", default=False, help="share gradio app")
    args = parser.parse_args()
    app = gr.Blocks()
    with app:
        gr.Markdown("# Umamusume voice synthesizer 赛马娘语音合成器\n\n")
        with gr.Row():
            with gr.Column():
                # Main text input; its elem_id is referenced by the symbol-list JS below.
                textbox = gr.TextArea(label="Text", placeholder="Type your sentence here (Maximum 150 words)", value="こんにちわ。", elem_id="tts-input")
                with gr.Accordion(label="Phoneme Input", open=False):
                    # Holds the text that to_symbol_fn swaps in and out of the textbox.
                    temp_text_var = gr.Variable()
                    symbol_input = gr.Checkbox(value=False, label="Symbol input")
                    symbol_list = gr.Dataset(label="Symbol list", components=[textbox],
                                             samples=[[x] for x in symbols],
                                             elem_id="symbol-list")
                    symbol_list_json = gr.Json(value=symbols, visible=False)
                symbol_input.change(to_symbol_fn,
                                    [symbol_input, textbox, temp_text_var],
                                    [textbox, temp_text_var])
                # Clicking a symbol inserts it at the caret of the text area (client-side only).
                symbol_list.click(None, [symbol_list, symbol_list_json], textbox,
                                  _js=f"""
                (i, symbols, text) => {{
                    let root = document.querySelector("body > gradio-app");
                    if (root.shadowRoot != null)
                        root = root.shadowRoot;
                    let text_input = root.querySelector("#tts-input").querySelector("textarea");
                    let startPos = text_input.selectionStart;
                    let endPos = text_input.selectionEnd;
                    let oldTxt = text_input.value;
                    let result = oldTxt.substring(0, startPos) + symbols[i] + oldTxt.substring(endPos);
                    text_input.value = result;
                    let x = window.scrollX, y = window.scrollY;
                    text_input.focus();
                    text_input.selectionStart = startPos + symbols[i].length;
                    text_input.selectionEnd = startPos + symbols[i].length;
                    text_input.blur();
                    window.scrollTo(x, y);

                    text = text_input.value;
                    
                    return text;
                }}""")
                # select character
                char_dropdown = gr.Dropdown(choices=characters, value="0:特别周", label='character')
                language_dropdown = gr.Dropdown(choices=languages, value="日本語", label='language')

                duration_slider = gr.Slider(minimum=0.1, maximum=5, value=1, step=0.1, label='时长 Duration')
                noise_scale_slider = gr.Slider(minimum=0.1, maximum=5, value=0.667, step=0.001, label='噪声比例 noise_scale')
                noise_scale_w_slider = gr.Slider(minimum=0.1, maximum=5, value=0.8, step=0.1, label='噪声偏差 noise_scale_w')

            text_output = gr.Textbox(label="Output Text")
            phoneme_output = gr.Textbox(label="Output Phonemes", interactive=False)
            audio_output = gr.Audio(label="Output Audio", elem_id="tts-audio")
            btn = gr.Button("Generate!")
            cus_dur_gn_btn = gr.Button("Regenerate with custom phoneme durations")

            download = gr.Button("Download Audio")
            download.click(None, [], [], _js=download_audio_js.format(audio_id="tts-audio"), api_name="download_audio")
            with gr.Accordion(label="Speaking Pace Control", open=True):
                duration_output = gr.Textbox(label="Duration of each phoneme", placeholder="After you generate a sentence, the detailed information of each phoneme's duration will be presented here.",
                                            interactive=True)
            gr.Markdown(
                "The number after the : mark represents the length of each phoneme in the generated audio, while the number inside ( ) represents the length of spacing between each phoneme and its next phoneme. "
                "You can manually change the numbers to adjust the length of each phoneme, so that speaking pace can be completely controlled. "
                "Note that these numbers should be integers only. \n\n(1 represents a length of 0.01161 seconds)\n\n"
                "音素冒号后的数字代表音素在生成音频中的长度,( )内的数字代表每个音素与下一个音素之间间隔的长度。"
                "您可以手动修改这些数字来控制每个音素以及间隔的长度,从而完全控制合成音频的说话节奏。"
                "注意这些数字只能是整数。 \n\n(1 代表 0.01161 秒的长度)\n\n"
            )
        # infer() returns four values (text, audio, phonemes, duration string), so all
        # four components are wired; the duration box feeds the regenerate button below.
        btn.click(infer, inputs=[textbox, char_dropdown, language_dropdown, duration_slider, noise_scale_slider, noise_scale_w_slider, symbol_input],
                  outputs=[text_output, audio_output, phoneme_output, duration_output], api_name="1")
        cus_dur_gn_btn.click(infer_from_phoneme_dur, inputs=[duration_output, char_dropdown, duration_slider, noise_scale_slider, noise_scale_w_slider],
                  outputs=[phoneme_output, audio_output])

        examples = [['haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......', '29:米浴', '日本語', 1, 0.667, 0.8, True],
                    ['お疲れ様です,トレーナーさん。', '1:无声铃鹿', '日本語', 1, 0.667, 0.8, False],
                    ['張り切っていこう!', '67:北部玄驹', '日本語', 1, 0.667, 0.8, False],
                    ['何でこんなに慣れでんのよ,私のほが先に好きだっだのに。', '10:草上飞', '日本語', 1, 0.667, 0.8, False],
                    ['授業中に出しだら,学校生活終わるですわ。', '12:目白麦昆', '日本語', 1, 0.667, 0.8, False],
                    ['お帰りなさい,お兄様!', '29:米浴', '日本語', 1, 0.667, 0.8, False],
                    ['私の処女をもらっでください!', '29:米浴', '日本語', 1, 0.667, 0.8, False]]
        gr.Examples(
            examples=examples,
            inputs=[textbox, char_dropdown, language_dropdown,
                    duration_slider, noise_scale_slider, noise_scale_w_slider, symbol_input],
            outputs=[text_output, audio_output, phoneme_output, duration_output],
            fn=infer
        )

    app.queue(concurrency_count=3).launch(show_api=True, share=args.share)