cymic committed on
Commit
0d3dcb0
1 Parent(s): 252a6de

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -89
app.py CHANGED
@@ -26,7 +26,6 @@ from text.cleaners import japanese_cleaners
26
  from gradio import routes
27
  from typing import List, Type
28
  import os
29
- os.system('pip install gradio==3.18.0')
30
 
31
  def audio_postprocess(self, y):
32
  if y is None:
@@ -242,31 +241,6 @@ download_audio_js = """
242
  }}
243
  """
244
 
245
- def monkey_patch():
246
- def postprocess(self, y):
247
- """
248
- Any postprocessing needed to be performed on a block context.
249
- """
250
- return y
251
- gr.blocks.BlockContext.postprocess = postprocess
252
-
253
- def get_types(cls_set: List[Type], component: str):
254
- docset = []
255
- types = []
256
- if component == "input":
257
- for cls in cls_set:
258
- doc = inspect.getdoc(cls)
259
- doc_lines = doc.split("\n")
260
- docset.append(doc_lines[1].split(":")[-1])
261
- types.append(doc_lines[1].split(")")[0].split("(")[-1])
262
- else:
263
- for cls in cls_set:
264
- doc = inspect.getdoc(cls)
265
- doc_lines = doc.split("\n")
266
- docset.append(doc_lines[-1].split(":")[-1])
267
- types.append(doc_lines[-1].split(")")[0].split("(")[-1])
268
- return docset, types
269
- routes.get_types = get_types
270
 
271
  if __name__ == "__main__":
272
  parser = argparse.ArgumentParser()
@@ -274,18 +248,7 @@ if __name__ == "__main__":
274
  args = parser.parse_args()
275
  app = gr.Blocks()
276
  with app:
277
- gr.Markdown("# Umamusume voice synthesizer 赛马娘语音合成器\n\n"
278
- "![visitor badge](https://visitor-badge.glitch.me/badge?page_id=Plachta.VITS-Umamusume-voice-synthesizer)\n\n"
279
- "This synthesizer is created based on [VITS](https://arxiv.org/abs/2106.06103) model, trained on voice data extracted from mobile game Umamusume Pretty Derby \n\n"
280
- "这个合成器是基于VITS文本到语音模型,在从手游《賽馬娘:Pretty Derby》解包的语音数据上训练得到。[Dataset Link](https://huggingface.co/datasets/Plachta/Umamusume-voice-text-pairs/tree/main)\n\n"
281
- "[introduction video / 模型介绍视频](https://www.bilibili.com/video/BV1T84y1e7p5/?vd_source=6d5c00c796eff1cbbe25f1ae722c2f9f#reply607277701)\n\n"
282
- "You may duplicate this space or [open in Colab](https://colab.research.google.com/drive/1J2Vm5dczTF99ckyNLXV0K-hQTxLwEaj5?usp=sharing) to run it privately and without any queue.\n\n"
283
- "您可以复制该空间至私人空间运行或打开[Google Colab](https://colab.research.google.com/drive/1J2Vm5dczTF99ckyNLXV0K-hQTxLwEaj5?usp=sharing)在线运行。\n\n"
284
- "If you have any suggestions or bug reports, feel free to open discussion in [Community](https://huggingface.co/spaces/Plachta/VITS-Umamusume-voice-synthesizer/discussions).\n\n"
285
- "若有bug反馈或建议,请在[Community](https://huggingface.co/spaces/Plachta/VITS-Umamusume-voice-synthesizer/discussions)下开启一个新的Discussion。 \n\n"
286
- "If your input language is not Japanese, it will be translated to Japanese by Google translator, but accuracy is not guaranteed.\n\n"
287
- "如果您的输入语言不是日语,则会由谷歌翻译自动翻译为日语,但是准确性不能保证。\n\n"
288
- )
289
  with gr.Row():
290
  with gr.Column():
291
  # We instantiate the Textbox class
@@ -333,35 +296,34 @@ if __name__ == "__main__":
333
  noise_scale_w_slider = gr.Slider(minimum=0.1, maximum=5, value=0.8, step=0.1, label='噪声偏差 noise_scale_w')
334
 
335
 
 
 
 
 
 
 
 
 
 
336
 
337
- with gr.Column():
338
- text_output = gr.Textbox(label="Output Text")
339
- phoneme_output = gr.Textbox(label="Output Phonemes", interactive=False)
340
- audio_output = gr.Audio(label="Output Audio", elem_id="tts-audio")
341
- btn = gr.Button("Generate!")
342
- cus_dur_gn_btn = gr.Button("Regenerate with custom phoneme durations")
343
-
344
- download = gr.Button("Download Audio")
345
- download.click(None, [], [], _js=download_audio_js.format(audio_id="tts-audio"), api_name="download_audio")
346
- with gr.Accordion(label="Speaking Pace Control", open=True):
347
-
348
- duration_output = gr.Textbox(label="Duration of each phoneme", placeholder="After you generate a sentence, the detailed information of each phoneme's duration will be presented here.",
349
- interactive = True)
350
- gr.Markdown(
351
- "The number after the : mark represents the length of each phoneme in the generated audio, while the number inside ( ) represents the lenght of spacing between each phoneme and its next phoneme. "
352
- "You can manually change the numbers to adjust the length of each phoneme, so that speaking pace can be completely controlled. "
353
- "Note that these numbers should be integers only. \n\n(1 represents a length of 0.01161 seconds)\n\n"
354
- "音素冒号后的数字代表音素在生成音频中的长度,( )内的数字代表每个音素与下一个音素之间间隔的长度。"
355
- "您可以手动修改这些数字来控制每个音素以及间隔的长度,从而完全控制合成音频的说话节奏。"
356
- "注意这些数字只能是整数。 \n\n(1 代表 0.01161 秒的长度)\n\n"
357
- )
358
 
359
- monkey_patch()
360
-
361
- btn.click(infer, inputs=[textbox, char_dropdown, language_dropdown, duration_slider, noise_scale_slider, noise_scale_w_slider, symbol_input],
362
- outputs=[text_output, audio_output, phoneme_output, duration_output])#, api_name="1")
363
- cus_dur_gn_btn.click(infer_from_phoneme_dur, inputs=[duration_output, char_dropdown, duration_slider, noise_scale_slider, noise_scale_w_slider],
364
- outputs=[phoneme_output, audio_output])#, api_name="2")
 
365
 
366
  examples = [['haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......', '29:米浴', '日本語', 1, 0.667, 0.8, True],
367
  ['お疲れ様です,トレーナーさん。', '1:无声铃鹿', '日本語', 1, 0.667, 0.8, False],
@@ -377,30 +339,6 @@ if __name__ == "__main__":
377
  outputs=[text_output, audio_output],
378
  fn=infer
379
  )
380
- gr.Markdown("# Updates Logs 更新日志:\n\n"
381
- "2023/1/24:\n\n"
382
- "Improved the format of phoneme length control.\n\n"
383
- "改善了音素控制的格式。\n\n"
384
- "2023/1/24:\n\n"
385
- "Added more precise control on pace of speaking by modifying the duration of each phoneme.\n\n"
386
- "增加了对说话节奏的音素级控制。\n\n"
387
- "2023/1/13:\n\n"
388
- "Added one example of phoneme input.\n\n"
389
- "增加了音素输入的example(米浴喘气)\n\n"
390
- "2023/1/12:\n\n"
391
- "Added phoneme input, which enables more precise control on output audio.\n\n"
392
- "增加了音素输入的功能,可以对语气和语调做到一定程度的精细控制。\n\n"
393
- "Adjusted UI arrangements.\n\n"
394
- "调整了UI的布局。\n\n"
395
- "2023/1/10:\n\n"
396
- "Dataset used for training is now uploaded to [here](https://huggingface.co/datasets/Plachta/Umamusume-voice-text-pairs/tree/main)\n\n"
397
- "数据集已上传,您可以在[这里](https://huggingface.co/datasets/Plachta/Umamusume-voice-text-pairs/tree/main)下载。\n\n"
398
- "2023/1/9:\n\n"
399
- "Model inference has been fully converted to onnxruntime. There will be no more Runtime Error: Memory Limit Exceeded\n\n"
400
- "模型推理已全面转为onnxruntime,现在不会出现Runtime Error: Memory Limit Exceeded了。\n\n"
401
- "Now integrated to [Moe-tts](https://huggingface.co/spaces/skytnt/moe-tts) collection.\n\n"
402
- "现已加入[Moe-tts](https://huggingface.co/spaces/skytnt/moe-tts)模型大全。\n\n"
403
- )
404
  ifa = gr.Interface(lambda: None, inputs=[textbox], outputs=[text_output])
405
 
406
  app.queue(concurrency_count=3).launch(show_api=True, share=args.share)
 
26
  from gradio import routes
27
  from typing import List, Type
28
  import os
 
29
 
30
  def audio_postprocess(self, y):
31
  if y is None:
 
241
  }}
242
  """
243
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
244
 
245
  if __name__ == "__main__":
246
  parser = argparse.ArgumentParser()
 
248
  args = parser.parse_args()
249
  app = gr.Blocks()
250
  with app:
251
+ gr.Markdown("# Umamusume voice synthesizer 赛马娘语音合成器\n\n")
 
 
 
 
 
 
 
 
 
 
 
252
  with gr.Row():
253
  with gr.Column():
254
  # We instantiate the Textbox class
 
296
  noise_scale_w_slider = gr.Slider(minimum=0.1, maximum=5, value=0.8, step=0.1, label='噪声偏差 noise_scale_w')
297
 
298
 
299
+ text_output = gr.Textbox(label="Output Text")
300
+ phoneme_output = gr.Textbox(label="Output Phonemes", interactive=False)
301
+ audio_output = gr.Audio(label="Output Audio", elem_id="tts-audio")
302
+ btn = gr.Button("Generate!")
303
+ cus_dur_gn_btn = gr.Button("Regenerate with custom phoneme durations")
304
+
305
+ download = gr.Button("Download Audio")
306
+ download.click(None, [], [], _js=download_audio_js.format(audio_id="tts-audio"), api_name="download_audio")
307
+ with gr.Accordion(label="Speaking Pace Control", open=True):
308
 
309
+ duration_output = gr.Textbox(label="Duration of each phoneme", placeholder="After you generate a sentence, the detailed information of each phoneme's duration will be presented here.",
310
+ interactive = True)
311
+ gr.Markdown(
312
+ "The number after the : mark represents the length of each phoneme in the generated audio, while the number inside ( ) represents the lenght of spacing between each phoneme and its next phoneme. "
313
+ "You can manually change the numbers to adjust the length of each phoneme, so that speaking pace can be completely controlled. "
314
+ "Note that these numbers should be integers only. \n\n(1 represents a length of 0.01161 seconds)\n\n"
315
+ "音素冒号后的数字代表音素在生成音频中的长度,( )内的数字代表每个音素与下一个音素之间间隔的长度。"
316
+ "您可以手动修改这些数字来控制每个音素以及间隔的长度,从而完全控制合成音频的说话节奏。"
317
+ "注意这些数字只能是整数。 \n\n(1 代表 0.01161 秒的长度)\n\n"
318
+ )
 
 
 
 
 
 
 
 
 
 
 
319
 
320
+ monkey_patch()
321
+
322
+ btn.click(infer, inputs=[textbox, char_dropdown, language_dropdown, duration_slider, noise_scale_slider, noise_scale_w_slider, symbol_input],
323
+ outputs=[text_output, audio_output, phoneme_output, duration_output])#, api_name="1")
324
+ cus_dur_gn_btn.click(infer_from_phoneme_dur, inputs=[duration_output, char_dropdown, duration_slider, noise_scale_slider, noise_scale_w_slider],
325
+ outputs=[phoneme_output, audio_output])#, api_name="2")
326
+
327
 
328
  examples = [['haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......haa\u2193......', '29:米浴', '日本語', 1, 0.667, 0.8, True],
329
  ['お疲れ様です,トレーナーさん。', '1:无声铃鹿', '日本語', 1, 0.667, 0.8, False],
 
339
  outputs=[text_output, audio_output],
340
  fn=infer
341
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
342
  ifa = gr.Interface(lambda: None, inputs=[textbox], outputs=[text_output])
343
 
344
  app.queue(concurrency_count=3).launch(show_api=True, share=args.share)