Artrajz committed on
Commit
d94ccbe
·
1 Parent(s): ea294dc

Upload 44 files

Browse files
Dockerfile CHANGED
@@ -6,15 +6,13 @@ WORKDIR /app
6
  ENV DEBIAN_FRONTEND=noninteractive
7
 
8
  RUN apt-get update && \
9
- apt install build-essential -yq && \
10
- apt install espeak-ng -yq && \
11
- apt install cmake -yq && \
12
- apt install -y wget -yq && \
13
  apt-get clean && \
14
  apt-get purge -y --auto-remove -o APT::AutoRemove::RecommendsImportant=false && \
15
  rm -rf /var/lib/apt/lists/*
16
 
17
- RUN pip install MarkupSafe==2.1.2 numpy==1.23.3 cython six==1.16.0
 
18
 
19
  RUN wget https://raw.githubusercontent.com/Artrajz/archived/main/openjtalk/openjtalk-0.3.0.dev2.tar.gz && \
20
  tar -zxvf openjtalk-0.3.0.dev2.tar.gz && \
@@ -25,13 +23,15 @@ RUN wget https://raw.githubusercontent.com/Artrajz/archived/main/openjtalk/openj
25
  rm -f openjtalk-0.3.0.dev2.tar.gz && \
26
  rm -rf openjtalk-0.3.0.dev2
27
 
28
- RUN pip install torch --index-url https://download.pytorch.org/whl/cpu
29
 
30
  COPY requirements.txt /app
31
- RUN pip install -r requirements.txt
 
 
32
 
33
  COPY . /app
34
 
35
  EXPOSE 23456
36
 
37
- CMD ["python", "/app/app.py"]
 
6
  ENV DEBIAN_FRONTEND=noninteractive
7
 
8
  RUN apt-get update && \
9
+ apt-get install -yq build-essential espeak-ng cmake wget && \
 
 
 
10
  apt-get clean && \
11
  apt-get purge -y --auto-remove -o APT::AutoRemove::RecommendsImportant=false && \
12
  rm -rf /var/lib/apt/lists/*
13
 
14
+ RUN pip install --upgrade pip --no-cache-dir && \
15
+ pip install MarkupSafe==2.1.2 numpy==1.23.3 cython six==1.16.0 safetensors==0.3.2 --no-cache-dir
16
 
17
  RUN wget https://raw.githubusercontent.com/Artrajz/archived/main/openjtalk/openjtalk-0.3.0.dev2.tar.gz && \
18
  tar -zxvf openjtalk-0.3.0.dev2.tar.gz && \
 
23
  rm -f openjtalk-0.3.0.dev2.tar.gz && \
24
  rm -rf openjtalk-0.3.0.dev2
25
 
26
+ RUN pip install torch --index-url https://download.pytorch.org/whl/cpu --no-cache-dir
27
 
28
  COPY requirements.txt /app
29
+ RUN pip install -r requirements.txt --no-cache-dir
30
+
31
+ RUN pip install gunicorn --no-cache-dir
32
 
33
  COPY . /app
34
 
35
  EXPOSE 23456
36
 
37
+ CMD ["gunicorn", "-c", "gunicorn_config.py", "app:app"]
Dockerfile_GPU ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.10.11-slim-bullseye
2
+
3
+ RUN mkdir -p /app
4
+ WORKDIR /app
5
+
6
+ ENV DEBIAN_FRONTEND=noninteractive
7
+
8
+ RUN apt-get update && \
9
+ apt-get install -yq build-essential espeak-ng cmake wget && \
10
+ apt-get clean && \
11
+ apt-get purge -y --auto-remove -o APT::AutoRemove::RecommendsImportant=false && \
12
+ rm -rf /var/lib/apt/lists/*
13
+
14
+ RUN pip install --upgrade pip --no-cache-dir && \
15
+ pip install MarkupSafe==2.1.2 numpy==1.23.3 cython six==1.16.0 safetensors==0.3.2 --no-cache-dir
16
+
17
+ RUN wget https://raw.githubusercontent.com/Artrajz/archived/main/openjtalk/openjtalk-0.3.0.dev2.tar.gz && \
18
+ tar -zxvf openjtalk-0.3.0.dev2.tar.gz && \
19
+ cd openjtalk-0.3.0.dev2 && \
20
+ rm -rf ./pyopenjtalk/open_jtalk_dic_utf_8-1.11 && \
21
+ python setup.py install && \
22
+ cd ../ && \
23
+ rm -f openjtalk-0.3.0.dev2.tar.gz && \
24
+ rm -rf openjtalk-0.3.0.dev2
25
+
26
+ RUN pip install torch --index-url https://download.pytorch.org/whl/cu117 --no-cache-dir
27
+
28
+ COPY requirements.txt /app
29
+ RUN pip install -r requirements.txt --no-cache-dir
30
+
31
+ RUN pip install gunicorn --no-cache-dir
32
+
33
+ COPY . /app
34
+
35
+ EXPOSE 23456
36
+
37
+ CMD ["gunicorn", "-c", "gunicorn_config.py", "app:app"]
README_zh.md CHANGED
@@ -63,7 +63,7 @@
63
 
64
 
65
  - `https://artrajz-vits-simple-api.hf.space/voice/vits?text=你好,こんにちは&id=164`
66
- - `https://artrajz-vits-simple-api.hf.space/voice/vits?text=你知道1+1=几吗?我觉得1+1≠3&id=164&lang=zh`
67
  - `https://artrajz-vits-simple-api.hf.space/voice/vits?text=Difficult the first time, easy the second.&id=4`
68
  - 激动:`https://artrajz-vits-simple-api.hf.space/voice/w2v2-vits?text=こんにちは&id=3&emotion=111`
69
  - 小声:`https://artrajz-vits-simple-api.hf.space/voice/w2v2-vits?text=こんにちは&id=3&emotion=2077`
@@ -495,14 +495,15 @@ def voice_dimensional_emotion(upload_path):
495
 
496
  | Name | Parameter | Is must | Default | Type | Instruction |
497
  | ------------- | --------- | ------- | ------- | ----- | ------------------------------------------------------------ |
498
- | 合成文本 | text | true | | str | |
499
- | 角色id | id | false | 0 | int | |
500
  | 音频格式 | format | false | wav | str | 支持wav,ogg,silk,mp3,flac |
501
  | 文本语言 | lang | false | auto | str | auto为自动识别语言模式,也是默认模式。lang=mix时,文本应该用[ZH] 或 [JA] 包裹。方言无法自动识别。 |
502
- | 语音长度/语速 | length | false | 1.0 | float | 调节语音长度,相当于调节语速,该数值越大语速越慢 |
503
- | 噪声 | noise | false | 0.667 | float | |
504
- | 噪声偏差 | noisew | false | 0.8 | float | |
505
  | 分段阈值 | max | false | 50 | int | 按标点符号分段,加起来大于max时为一段文本。max<=0表示不分段。 |
 
506
 
507
  ## VITS 语音转换
508
 
@@ -516,12 +517,12 @@ def voice_dimensional_emotion(upload_path):
516
 
517
  | Name | Parameter | Is must | Default | Type | Instruction |
518
  | ------------- | --------- | ------- | ------- | ----- | ------------------------------------------------ |
519
- | 上传音频 | upload | true | | file | |
520
- | 目标角色id | id | true | | int | |
521
  | 音频格式 | format | true | | str | wav,ogg,silk |
522
  | 语音长度/语速 | length | true | | float | 调节语音长度,相当于调节语速,该数值越大语速越慢 |
523
- | 噪声 | noise | true | | float | |
524
- | 噪声偏差 | noisew | true | | float | |
525
 
526
  ## Dimensional emotion
527
 
@@ -533,13 +534,13 @@ def voice_dimensional_emotion(upload_path):
533
 
534
  | Name | Parameter | Is must | Default | Type | Instruction |
535
  | ------------- | --------- | ------- | ------- | ----- | ------------------------------------------------------------ |
536
- | 合成文本 | text | true | | str | |
537
- | 角色id | id | false | 0 | int | |
538
  | 音频格式 | format | false | wav | str | 支持wav,ogg,silk,mp3,flac |
539
  | 文本语言 | lang | false | auto | str | auto为自动识别语言模式,也是默认模式。lang=mix时,文本应该用[ZH] 或 [JA] 包裹。方言无法自动识别。 |
540
  | 语音长度/语速 | length | false | 1.0 | float | 调节语音长度,相当于调节语速,该数值越大语速越慢 |
541
- | 噪声 | noise | false | 0.667 | float | |
542
- | 噪声偏差 | noisew | false | 0.8 | float | |
543
  | 分段阈值 | max | false | 50 | int | 按标点符号分段,加起来大于max时为一段文本。max<=0表示不分段。 |
544
  | 维度情感 | emotion | false | 0 | int | 范围取决于npy情感参考文件,如[innnky](https://huggingface.co/spaces/innnky/nene-emotion/tree/main)的all_emotions.npy模型范围是0-5457 |
545
 
@@ -623,4 +624,5 @@ def voice_dimensional_emotion(upload_path):
623
  - MoeGoe:https://github.com/CjangCjengh/MoeGoe
624
  - emotional-vits:https://github.com/innnky/emotional-vits
625
  - vits-uma-genshin-honkai:https://huggingface.co/spaces/zomehwh/vits-uma-genshin-honkai
 
626
 
 
63
 
64
 
65
  - `https://artrajz-vits-simple-api.hf.space/voice/vits?text=你好,こんにちは&id=164`
66
+ - `https://artrajz-vits-simple-api.hf.space/voice/vits?text=我觉得1%2B1≠3&id=164&lang=zh`(get中一些字符需要转义不然会被过滤掉)
67
  - `https://artrajz-vits-simple-api.hf.space/voice/vits?text=Difficult the first time, easy the second.&id=4`
68
  - 激动:`https://artrajz-vits-simple-api.hf.space/voice/w2v2-vits?text=こんにちは&id=3&emotion=111`
69
  - 小声:`https://artrajz-vits-simple-api.hf.space/voice/w2v2-vits?text=こんにちは&id=3&emotion=2077`
 
495
 
496
  | Name | Parameter | Is must | Default | Type | Instruction |
497
  | ------------- | --------- | ------- | ------- | ----- | ------------------------------------------------------------ |
498
+ | 合成文本 | text | true | | str | 需要合成语音的文本。 |
499
+ | 角色id | id | false | 0 | int | 即说话人id。 |
500
  | 音频格式 | format | false | wav | str | 支持wav,ogg,silk,mp3,flac |
501
  | 文本语言 | lang | false | auto | str | auto为自动识别语言模式,也是默认模式。lang=mix时,文本应该用[ZH] 或 [JA] 包裹。方言无法自动识别。 |
502
+ | 语音长度/语速 | length | false | 1.0 | float | 调节语音长度,相当于调节语速,该数值越大语速越慢。 |
503
+ | 噪声 | noise | false | 0.33 | float | 样本噪声,控制合成的随机性。 |
504
+ | sdp噪声 | noisew | false | 0.4 | float | 随机时长预测器噪声,控制音素发音长度。 |
505
  | 分段阈值 | max | false | 50 | int | 按标点符号分段,加起来大于max时为一段文本。max<=0表示不分段。 |
506
+ | 流式响应 | streaming | false | false | bool | 流式合成语音,更快的首包响应。 |
507
 
508
  ## VITS 语音转换
509
 
 
517
 
518
  | Name | Parameter | Is must | Default | Type | Instruction |
519
  | ------------- | --------- | ------- | ------- | ----- | ------------------------------------------------ |
520
+ | 上传音频 | upload | true | | file | 需要转换说话人的音频文件。 |
521
+ | 目标角色id | id | true | | int | 目标说话人id。 |
522
  | 音频格式 | format | true | | str | wav,ogg,silk |
523
  | 语音长度/语速 | length | true | | float | 调节语音长度,相当于调节语速,该数值越大语速越慢 |
524
+ | 噪声 | noise | true | | float | 样本噪声,控制合成的随机性。 |
525
+ | sdp噪声 | noisew | true | | float | 随机时长预测器噪声,控制音素发音长度。 |
526
 
527
  ## Dimensional emotion
528
 
 
534
 
535
  | Name | Parameter | Is must | Default | Type | Instruction |
536
  | ------------- | --------- | ------- | ------- | ----- | ------------------------------------------------------------ |
537
+ | 合成文本 | text | true | | str | 需要合成语音的文本。 |
538
+ | 角色id | id | false | 0 | int | 即说话人id。 |
539
  | 音频格式 | format | false | wav | str | 支持wav,ogg,silk,mp3,flac |
540
  | 文本语言 | lang | false | auto | str | auto为自动识别语言模式,也是默认模式。lang=mix时,文本应该用[ZH] 或 [JA] 包裹。方言无法自动识别。 |
541
  | 语音长度/语速 | length | false | 1.0 | float | 调节语音长度,相当于调节语速,该数值越大语速越慢 |
542
+ | 噪声 | noise | false | 0.33 | float | 样本噪声,控制合成的随机性。 |
543
+ | sdp噪声 | noisew | false | 0.4 | float | 随机时长预测器噪声,控制音素发音长度。 |
544
  | 分段阈值 | max | false | 50 | int | 按标点符号分段,加起来大于max时为一段文本。max<=0表示不分段。 |
545
  | 维度情感 | emotion | false | 0 | int | 范围取决于npy情感参考文件,如[innnky](https://huggingface.co/spaces/innnky/nene-emotion/tree/main)的all_emotions.npy模型范围是0-5457 |
546
 
 
624
  - MoeGoe:https://github.com/CjangCjengh/MoeGoe
625
  - emotional-vits:https://github.com/innnky/emotional-vits
626
  - vits-uma-genshin-honkai:https://huggingface.co/spaces/zomehwh/vits-uma-genshin-honkai
627
+ - vits_chinese:https://github.com/PlayVoice/vits_chinese
628
 
app.py CHANGED
@@ -1,8 +1,7 @@
1
  import os
2
- import logging
3
  import time
4
- import logzero
5
  import uuid
 
6
  from flask import Flask, request, send_file, jsonify, make_response, render_template
7
  from werkzeug.utils import secure_filename
8
  from flask_apscheduler import APScheduler
@@ -19,24 +18,15 @@ scheduler.init_app(app)
19
  if app.config.get("CLEAN_INTERVAL_SECONDS", 3600) > 0:
20
  scheduler.start()
21
 
22
- logzero.loglevel(logging.WARNING)
23
- logger = logging.getLogger("vits-simple-api")
24
- level = app.config.get("LOGGING_LEVEL", "DEBUG")
25
- level_dict = {'DEBUG': logging.DEBUG, 'INFO': logging.INFO, 'WARNING': logging.WARNING, 'ERROR': logging.ERROR,
26
- 'CRITICAL': logging.CRITICAL}
27
- logging.basicConfig(level=level_dict[level])
28
- logging.getLogger('numba').setLevel(logging.WARNING)
29
- logging.getLogger("langid.langid").setLevel(logging.INFO)
30
- logging.getLogger("apscheduler.scheduler").setLevel(logging.INFO)
31
 
 
32
  tts = merge_model(app.config["MODEL_LIST"])
33
 
34
- if not os.path.exists(app.config['UPLOAD_FOLDER']):
35
- os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
36
-
37
- if not os.path.exists(app.config['CACHE_PATH']):
38
- os.makedirs(app.config['CACHE_PATH'], exist_ok=True)
39
-
40
 
41
  def require_api_key(func):
42
  @wraps(func)
@@ -57,7 +47,10 @@ def require_api_key(func):
57
  def index():
58
  kwargs = {
59
  "speakers": tts.voice_speakers,
60
- "speakers_count": tts.speakers_count
 
 
 
61
  }
62
  return render_template("index.html", **kwargs)
63
 
@@ -362,25 +355,18 @@ def ssml():
362
  return make_response(jsonify({"status": "error", "message": f"parameter error"}), 400)
363
 
364
  logger.debug(ssml)
365
-
366
  fname = f"{str(uuid.uuid1())}.{format}"
367
  file_type = f"audio/{format}"
368
 
369
  t1 = time.time()
370
- audio, format = tts.create_ssml_infer_task(ssml, fname)
371
  t2 = time.time()
372
  if app.config.get("SAVE_AUDIO", False):
373
  logger.debug(f"[ssml] {fname}")
374
  logger.info(f"[ssml] finish in {(t2 - t1):.2f}s")
375
 
376
- if eval(ssml.get('streaming', False)):
377
- audio = tts.generate_audio_chunks(audio)
378
- response = make_response(audio)
379
- response.headers['Content-Disposition'] = f'attachment; filename={fname}'
380
- response.headers['Content-Type'] = file_type
381
- return response
382
- else:
383
- return send_file(path_or_file=audio, mimetype=file_type, download_name=fname)
384
 
385
 
386
  @app.route('/voice/dimension-emotion', methods=["POST"])
 
1
  import os
 
2
  import time
 
3
  import uuid
4
+ from logger import logger
5
  from flask import Flask, request, send_file, jsonify, make_response, render_template
6
  from werkzeug.utils import secure_filename
7
  from flask_apscheduler import APScheduler
 
18
  if app.config.get("CLEAN_INTERVAL_SECONDS", 3600) > 0:
19
  scheduler.start()
20
 
21
+ for path in (app.config['LOGS_PATH'], app.config['UPLOAD_FOLDER'], app.config['CACHE_PATH']):
22
+ try:
23
+ os.makedirs(path, exist_ok=True)
24
+ except Exception as e:
25
+ logger.error(f"Unable to create directory {path}: {str(e)}")
 
 
 
 
26
 
27
+ # load model
28
  tts = merge_model(app.config["MODEL_LIST"])
29
 
 
 
 
 
 
 
30
 
31
  def require_api_key(func):
32
  @wraps(func)
 
47
  def index():
48
  kwargs = {
49
  "speakers": tts.voice_speakers,
50
+ "speakers_count": tts.speakers_count,
51
+ "vits_speakers_count":tts._vits_speakers_count,
52
+ "w2v2_speakers_count":tts._w2v2_speakers_count,
53
+ "w2v2_emotion_count":tts._w2v2_emotion_count
54
  }
55
  return render_template("index.html", **kwargs)
56
 
 
355
  return make_response(jsonify({"status": "error", "message": f"parameter error"}), 400)
356
 
357
  logger.debug(ssml)
358
+ voice_tasks, format = tts.parse_ssml(ssml)
359
  fname = f"{str(uuid.uuid1())}.{format}"
360
  file_type = f"audio/{format}"
361
 
362
  t1 = time.time()
363
+ audio = tts.create_ssml_infer_task(voice_tasks, format, fname)
364
  t2 = time.time()
365
  if app.config.get("SAVE_AUDIO", False):
366
  logger.debug(f"[ssml] {fname}")
367
  logger.info(f"[ssml] finish in {(t2 - t1):.2f}s")
368
 
369
+ return send_file(path_or_file=audio, mimetype=file_type, download_name=fname)
 
 
 
 
 
 
 
370
 
371
 
372
  @app.route('/voice/dimension-emotion', methods=["POST"])
config.py CHANGED
@@ -12,7 +12,7 @@ DEBUG = False
12
  PORT = 7860
13
 
14
  # Absolute path of vits-simple-api
15
- ABS_PATH = os.path.join(os.path.dirname(os.path.realpath(sys.argv[0])))
16
 
17
  # Upload path
18
  UPLOAD_FOLDER = ABS_PATH + "/upload"
@@ -20,6 +20,12 @@ UPLOAD_FOLDER = ABS_PATH + "/upload"
20
  # Cache path
21
  CACHE_PATH = ABS_PATH + "/cache"
22
 
 
 
 
 
 
 
23
  # If CLEAN_INTERVAL_SECONDS <= 0, the cleaning task will not be executed.
24
  CLEAN_INTERVAL_SECONDS = 3600
25
 
@@ -39,7 +45,7 @@ API_KEY = "api-key"
39
  LOGGING_LEVEL = "DEBUG"
40
 
41
  # Language identification library. Optional fastlid, langid
42
- LANGUAGE_IDENTIFICATION_LIBRARY = "langid"
43
 
44
  # To use the english_cleaner, you need to install espeak and provide the path of libespeak-ng.dll as input here.
45
  # If ESPEAK_LIBRARY is set to empty, it will be read from the environment variable.
@@ -48,7 +54,7 @@ ESPEAK_LIBRARY = ""
48
 
49
  # Fill in the model path here
50
  MODEL_LIST = [
51
- # VITS
52
  [ABS_PATH + "/Model/Nene_Nanami_Rong_Tang/1374_epochs.pth", ABS_PATH + "/Model/Nene_Nanami_Rong_Tang/config.json"],
53
  [ABS_PATH + "/Model/vctk/pretrained_vctk.pth", ABS_PATH + "/Model/vctk/vctk_base.json"],
54
  [ABS_PATH + "/Model/paimon/paimon6k_390000.pth", ABS_PATH + "/Model/paimon/paimon6k.json"],
@@ -73,7 +79,7 @@ HUBERT_SOFT_MODEL = ABS_PATH + "/Model/hubert-soft-0d54a1f4.pt"
73
  DIMENSIONAL_EMOTION_NPY = ABS_PATH + "/Model/npy"
74
 
75
  # w2v2-vits: Need to have both `model.onnx` and `model.yaml` files in the same path.
76
- DIMENSIONAL_EMOTION_MODEL = ABS_PATH + "/Model/model.yaml"
77
 
78
  """
79
  Default parameter
 
12
  PORT = 7860
13
 
14
  # Absolute path of vits-simple-api
15
+ ABS_PATH = os.path.dirname(os.path.realpath(__file__))
16
 
17
  # Upload path
18
  UPLOAD_FOLDER = ABS_PATH + "/upload"
 
20
  # Cache path
21
  CACHE_PATH = ABS_PATH + "/cache"
22
 
23
+ # Logs path
24
+ LOGS_PATH = ABS_PATH + "/logs"
25
+
26
+ # Set the number of backup log files to keep.
27
+ LOGS_BACKUPCOUNT = 30
28
+
29
  # If CLEAN_INTERVAL_SECONDS <= 0, the cleaning task will not be executed.
30
  CLEAN_INTERVAL_SECONDS = 3600
31
 
 
45
  LOGGING_LEVEL = "DEBUG"
46
 
47
  # Language identification library. Optional fastlid, langid
48
+ LANGUAGE_IDENTIFICATION_LIBRARY = "fastlid"
49
 
50
  # To use the english_cleaner, you need to install espeak and provide the path of libespeak-ng.dll as input here.
51
  # If ESPEAK_LIBRARY is set to empty, it will be read from the environment variable.
 
54
 
55
  # Fill in the model path here
56
  MODEL_LIST = [
57
+ # VITS
58
  [ABS_PATH + "/Model/Nene_Nanami_Rong_Tang/1374_epochs.pth", ABS_PATH + "/Model/Nene_Nanami_Rong_Tang/config.json"],
59
  [ABS_PATH + "/Model/vctk/pretrained_vctk.pth", ABS_PATH + "/Model/vctk/vctk_base.json"],
60
  [ABS_PATH + "/Model/paimon/paimon6k_390000.pth", ABS_PATH + "/Model/paimon/paimon6k.json"],
 
79
  DIMENSIONAL_EMOTION_NPY = ABS_PATH + "/Model/npy"
80
 
81
  # w2v2-vits: Need to have both `model.onnx` and `model.yaml` files in the same path.
82
+ # DIMENSIONAL_EMOTION_MODEL = ABS_PATH + "/Model/model.yaml"
83
 
84
  """
85
  Default parameter
docker-compose-gpu.yaml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ version: '3.4'
2
+ services:
3
+ vits:
4
+ image: artrajz/vits-simple-api:latest-gpu
5
+ restart: always
6
+ ports:
7
+ - 23456:23456
8
+ environment:
9
+ LANG: 'C.UTF-8'
10
+ TZ: Asia/Shanghai #timezone
11
+ volumes:
12
+ - ./Model:/app/Model # 挂载模型文件夹
13
+ - ./config.py:/app/config.py # 挂载配置文件
14
+ - ./logs:/app/logs # logging logs
15
+ - ./gunicorn_config.py:/app/gunicorn_config.py # gunicorn configuration
docker-compose.yaml CHANGED
@@ -10,4 +10,6 @@ services:
10
  TZ: Asia/Shanghai #timezone
11
  volumes:
12
  - ./Model:/app/Model # 挂载模型文件夹
13
- - ./config.py:/app/config.py # 挂载配置文件
 
 
 
10
  TZ: Asia/Shanghai #timezone
11
  volumes:
12
  - ./Model:/app/Model # 挂载模型文件夹
13
+ - ./config.py:/app/config.py # 挂载配置文件
14
+ - ./logs:/app/logs # logging logs
15
+ - ./gunicorn_config.py:/app/gunicorn_config.py # gunicorn configuration
gunicorn_config.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ import multiprocessing
2
+
3
+ bind = "0.0.0.0:23456"
4
+ workers = multiprocessing.cpu_count()
logger.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import logging
4
+ import logzero
5
+ import config
6
+ from logging.handlers import TimedRotatingFileHandler
7
+
8
+ logzero.loglevel(logging.WARNING)
9
+ logger = logging.getLogger("vits-simple-api")
10
+ level = getattr(config, "LOGGING_LEVEL", "DEBUG")
11
+ level_dict = {'DEBUG': logging.DEBUG, 'INFO': logging.INFO, 'WARNING': logging.WARNING, 'ERROR': logging.ERROR,
12
+ 'CRITICAL': logging.CRITICAL}
13
+ logging.basicConfig(level=level_dict[level])
14
+ logging.getLogger('numba').setLevel(logging.WARNING)
15
+ logging.getLogger("langid.langid").setLevel(logging.INFO)
16
+ logging.getLogger("apscheduler.scheduler").setLevel(logging.INFO)
17
+
18
+ os.makedirs(config.LOGS_PATH, exist_ok=True)
19
+ log_file = os.path.join(config.LOGS_PATH, 'latest.log')
20
+ backup_count = getattr(config, "LOGS_BACKUPCOUNT", 30)
21
+ handler = TimedRotatingFileHandler(log_file, when="midnight", interval=1, backupCount=backup_count, encoding='utf-8')
22
+ handler.suffix = "%Y-%m-%d.log"
23
+ formatter = logging.Formatter('%(levelname)s:%(name)s %(message)s')
24
+ handler.setFormatter(formatter)
25
+ logger.addHandler(handler)
26
+
27
+ logging.getLogger("werkzeug").addHandler(handler)
28
+ logging.getLogger("apscheduler.scheduler").addHandler(handler)
29
+
30
+
31
+ # Custom function to handle uncaught exceptions
32
+ def handle_exception(exc_type, exc_value, exc_traceback):
33
+ # If it's a keyboard interrupt, don't handle it, just return
34
+ if issubclass(exc_type, KeyboardInterrupt):
35
+ sys.__excepthook__(exc_type, exc_value, exc_traceback)
36
+ return
37
+
38
+ logger.error("Uncaught exception", exc_info=(exc_type, exc_value, exc_traceback))
39
+
40
+
41
+ # Set the global exception handler in Python
42
+ sys.excepthook = handle_exception
requirements.txt CHANGED
@@ -27,4 +27,5 @@ fasttext
27
  fastlid
28
  langid
29
  phonemizer==3.2.1
30
- transformers
 
 
27
  fastlid
28
  langid
29
  phonemizer==3.2.1
30
+ transformers
31
+ pydantic==1.10.6
static/css/style.css ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .main-container {
2
+ position: relative;
3
+ width: 100%;
4
+ min-height: 300px;
5
+ }
6
+
7
+ .container {
8
+ width: 300px;
9
+ position: relative;
10
+ }
11
+
12
+
13
+ /*tabs*/
14
+ .tabs {
15
+ display: flex;
16
+ left: 0;
17
+ }
18
+
19
+ .tab-button {
20
+ display: inline-block;
21
+ background-color: transparent;
22
+ padding: 5px 10px;
23
+ cursor: pointer;
24
+ margin-bottom: -2px;
25
+ border-top: 2px solid transparent;
26
+ border-left: 2px solid transparent;
27
+ border-right: 2px solid transparent;
28
+ border-bottom: 0px;
29
+ border-top-left-radius: 0.5rem;
30
+ border-top-right-radius: 0.5rem;
31
+ color: gray;
32
+ }
33
+
34
+ .tab-button.active {
35
+ background-color: white;
36
+ border-top: 2px solid #dee2e6;
37
+ border-left: 2px solid #dee2e6;
38
+ border-right: 2px solid #dee2e6;
39
+ color: black;
40
+ }
41
+
42
+ /*content*/
43
+
44
+ .content {
45
+ border: gray;
46
+ border-left-width: 2px;
47
+ }
48
+
49
+ .content-pane {
50
+ display: none;
51
+ padding: 20px;
52
+ }
53
+
54
+ .content-pane.active {
55
+ display: flex;
56
+ -ms-flex-wrap: wrap;
57
+ flex-wrap: wrap;
58
+ }
59
+
60
+ *, :before, :after {
61
+ box-sizing: border-box;
62
+ border-width: 0;
63
+ border-style: solid;
64
+ border-color: #e5e7eb;
65
+ }
66
+
67
+
68
+ .flex {
69
+ display: flex;
70
+ }
71
+
72
+ .border-transparent {
73
+ border-color: transparent;
74
+ }
75
+
76
+ .border-b-2 {
77
+ border-bottom: 2px solid #dee2e6;
78
+ }
79
+
80
+ .border-lr-2 {
81
+ border-left: 2px solid #dee2e6;
82
+ border-right: 2px solid #dee2e6;
83
+ }
84
+
templates/index.html CHANGED
@@ -4,126 +4,230 @@
4
  <meta charset="UTF-8"/>
5
  <meta name="viewport" content="width=device-width, initial-scale=1.0"/>
6
  <title>vits-simple-api</title>
7
-
8
  <link rel="stylesheet" href="/static/css/bootstrap.min.css"/>
9
  </head>
10
  <body>
11
- <main style="margin: 0 auto; width: 1024px">
12
- <h1>
13
- <a href="https://github.com/Artrajz/vits-simple-api" target="_blank"
14
- style="text-decoration: none; color: black"> vits-simple-api </a>
15
- </h1>
 
 
 
16
 
17
- <div>
18
- <label>文档:</label>
19
- <a href="https://github.com/Artrajz/vits-simple-api" target="_blank"
20
- style="text-decoration: none; color: black"> https://github.com/Artrajz/vits-simple-api </a>
21
- </div>
22
- <div>
23
- <label>返回speakers(json):</label>
24
- <a id="speakersLink" href="https://artrajz-vits-simple-api.hf.space/voice/speakers" target="_blank"
25
- style="text-decoration: none; color: black">
26
- https://artrajz-vits-simple-api.hf.space/voice/speakers
27
- </a>
28
- </div>
29
- <div>
30
- <label>简单调用api:</label>
31
- <a id="vitsLink" href="https://artrajz-vits-simple-api.hf.space/voice/vits?text=你好,こんにちは&id=164"
32
- style="text-decoration: none; color: black">
33
- https://artrajz-vits-simple-api.hf.space/voice/vits?text=你好,こんにちは&id=164
34
- </a>
35
- </div>
36
 
37
- <!-- <div style="display: flex; justify-content: center; align-items: center"> -->
38
- <div>
39
- <form>
40
- <div class="form-group">
41
- <label>text</label>
42
- <textarea class="form-control" id="inputText" rows="3" oninput="updateLink()">你好,こんにちは</textarea>
43
- </div>
44
- <div class="form-group">
45
- <label>id</label>
46
- <select class="form-control" id="inputId" oninput="updateLink()">
47
- {% for speaker in speakers["VITS"] %}
48
- {% if speaker["name"] == "雷电将军(雷神)" %}
 
 
 
 
 
 
49
  <option value="{{ speaker["id"] }}" selected>{{ speaker["id"] }} | {{ speaker["name"] }}
50
  | {{ speaker["lang"] }}</option>
51
  {% else %}
52
  <option value="{{ speaker["id"] }}">{{ speaker["id"] }} | {{ speaker["name"] }}
53
  | {{ speaker["lang"] }}</option>
54
  {% endif %}
55
- {% endfor %}
56
- </select>
57
- </div>
58
- </form>
59
- </div>
60
- <p>
61
- <button class="btn btn-primary" type="button" data-toggle="collapse" data-target="#collapseExample"
62
- aria-expanded="false" aria-controls="collapseExample">
63
- Advanced
64
- </button>
65
- {% if speakers_count == 0 %}
66
- <div style="color: red;">未加载任何模型</div>
67
- {% endif %}
68
- </p>
69
- <div class="collapse" id="collapseExample">
70
- <div class="card card-body">
71
- <form>
72
- <div class="form-group">
73
- <label>format</label>
74
- <select class="form-control" id="inputFormat" oninput="updateLink()">
75
- <option></option>
76
- <option>wav</option>
77
- <option>mp3</option>
78
- <option>ogg</option>
79
- <option>silk</option>
80
- </select>
81
- </div>
82
- <div class="form-group">
83
- <label>lang</label>
84
- <input type="text" class="form-control" id="inputLang" oninput="updateLink()" value=""
85
- placeholder="auto"/>
86
- </div>
87
- <div class="form-group">
88
- <label>length</label>
89
- <input type="text" class="form-control" id="inputLength" oninput="updateLink()" value=""
90
- placeholder="1"/>
91
- </div>
92
- <div class="form-group">
93
- <label>noise</label>
94
- <input type="text" class="form-control" id="inputNoise" oninput="updateLink()" value=""
95
- placeholder="0.33"/>
96
- </div>
97
- <div class="form-group">
98
- <label>noisew</label>
99
- <input type="text" class="form-control" id="inputNoisew" oninput="updateLink()" value=""
100
- placeholder="0.4"/>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
  </div>
102
- <div class="form-group">
103
- <label>max</label>
104
- <input type="text" class="form-control" id="inputMax" oninput="updateLink()" value=""
105
- placeholder="50"/>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
  </div>
107
- </form>
108
  </div>
109
- </div>
110
 
111
- <div style="display: flex; justify-content: center; align-items: center; height: 80px; margin-top: 20px; margin-bottom: 20px; border: 1px solid rgba(0,0,0,.125); border-radius: 0.25rem;">
112
- <button type="button" class="btn btn-outline-secondary" id="getAudio" style="margin-right: 10px">播放器生成</button>
113
- <audio id="audioPlayer" controls>
114
- <source src="" type="audio/mp3"/>
115
- Your browser does not support the audio element.
116
- </audio>
117
- <div class="form-group form-check">
118
- <input type="checkbox" id="streaming" onchange="updateLink()">
119
- <label class="form-check-label">流式响应</label>
 
 
 
 
 
 
 
 
 
120
  </div>
121
- </div>
122
- <div>自动识别语言:可识别的语言根据不同speaker而不同,方言无法自动识别</div>
123
- <div>方言模型需要手动指定语言,比如粤语Cantonese要指定参数lang=gd</div>
124
- <br/>
125
-
126
- <h2>所有模型均为网络搜集,感谢模型原作者的付出!</h2>
127
  <p>
128
  Nene_Nanami_Rong_Tang:
129
  <a href="https://github.com/CjangCjengh/TTSModels" rel="noreferrer" target="_blank">CjangCjengh/TTSModels</a>
@@ -164,6 +268,8 @@
164
  vits_chinese:
165
  <a href="https://github.com/PlayVoice/vits_chinese" rel="noreferrer" target="_blank">PlayVoice/vits_chinese</a>
166
  </p>
 
 
167
 
168
  </main>
169
 
@@ -171,6 +277,10 @@
171
  <script src="/static/js/bootstrap.bundle.min.js"></script>
172
 
173
  <script>
 
 
 
 
174
  function getProtocol() {
175
  return 'https:' == location.protocol ? "https://" : "http://";
176
  }
@@ -181,12 +291,21 @@
181
  }
182
 
183
  var baseUrl = getProtocol() + getUrl();
 
 
 
 
 
 
 
 
 
184
 
185
  setBaseUrl();
186
 
187
  function setBaseUrl() {
188
- var text = document.getElementById("inputText").value;
189
- var id = document.getElementById("inputId").value;
190
 
191
  var vitsLink = document.getElementById("vitsLink");
192
  var speakersLink = document.getElementById("speakersLink");
@@ -202,17 +321,22 @@
202
  }
203
 
204
  function getLink() {
205
- var text = document.getElementById("inputText").value;
206
- var id = document.getElementById("inputId").value;
207
- var format = document.getElementById("inputFormat").value;
208
- var lang = document.getElementById("inputLang").value;
209
- var length = document.getElementById("inputLength").value;
210
- var noise = document.getElementById("inputNoise").value;
211
- var noisew = document.getElementById("inputNoisew").value;
212
- var max = document.getElementById("inputMax").value;
213
- var streaming = document.getElementById('streaming');
214
 
215
- var url = baseUrl + "/voice/vits?text=" + text + "&id=" + id;
 
 
 
 
 
216
  if (format != "") {
217
  url += "&format=" + format;
218
  }
@@ -231,6 +355,7 @@
231
  if (max != "") {
232
  url += "&max=" + max;
233
  }
 
234
  if (streaming.checked) {
235
  url += '&streaming=true';
236
  }
@@ -245,16 +370,37 @@
245
  }
246
 
247
  function setAudioSource() {
 
 
 
 
 
 
 
 
248
  var url = getLink();
249
- var audioPlayer = document.getElementById("audioPlayer");
250
  audioPlayer.src = url;
251
  audioPlayer.play();
252
  }
253
 
254
- var button = document.getElementById("getAudio");
255
- button.addEventListener("click", function () {
256
- setAudioSource();
257
- });
 
 
 
 
 
 
 
 
 
 
 
 
 
258
  </script>
259
  </body>
260
  </html>
 
4
  <meta charset="UTF-8"/>
5
  <meta name="viewport" content="width=device-width, initial-scale=1.0"/>
6
  <title>vits-simple-api</title>
7
+ <link rel="stylesheet" href="/static/css/style.css">
8
  <link rel="stylesheet" href="/static/css/bootstrap.min.css"/>
9
  </head>
10
  <body>
11
+ <main class="main-container">
12
+ <div class="container flex flex-wrap mx-auto">
13
+ <div class="text-center d-flex align-items-center w-100" style="height: 100px;" id="component-1">
14
+ <h1 class="w-100">
15
+ <a href="https://github.com/Artrajz/vits-simple-api" target="_blank"
16
+ style="text-decoration: none; color: black"> vits-simple-api </a>
17
+ </h1>
18
+ </div>
19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
+ <div class="tabs w-100 border-b-2" id="component-2">
22
+ <button class="tab-button px-4 pb-2 pt-2 active " onclick="showContent(0)">VITS</button>
23
+ <button class="tab-button px-4 pb-2 pt-2" onclick="showContent(1)">W2V2-VITS</button>
24
+ </div>
25
+
26
+ <div class="content w-100 border-lr-2 border-b-2" id="component-3">
27
+ <div class="content-pane active w-100 flex-wrap">
28
+ <form class="w-100">
29
+ <div class="form-group">
30
+ <label>text</label>
31
+ <textarea class="form-control" id="inputText1" rows="3"
32
+ oninput="updateLink()">你好,こんにちは</textarea>
33
+ </div>
34
+ <div class="form-group">
35
+ <label>id</label>
36
+ <select class="form-control" id="inputId1" oninput="updateLink()">
37
+ {% for speaker in speakers["VITS"] %}
38
+ {% if speaker["name"] == "雷电将军(雷神)" %}
39
  <option value="{{ speaker["id"] }}" selected>{{ speaker["id"] }} | {{ speaker["name"] }}
40
  | {{ speaker["lang"] }}</option>
41
  {% else %}
42
  <option value="{{ speaker["id"] }}">{{ speaker["id"] }} | {{ speaker["name"] }}
43
  | {{ speaker["lang"] }}</option>
44
  {% endif %}
45
+ {% endfor %}
46
+ </select>
47
+ </div>
48
+ </form>
49
+ <form class="w-100">
50
+ <div class="row">
51
+ <div class="col-md-4 form-group">
52
+ <label data-toggle="tooltip" data-placement="top"
53
+ title="默认为wav">format</label>
54
+ <select class="form-control" id="inputFormat1" oninput="updateLink()">
55
+ <option></option>
56
+ <option>wav</option>
57
+ <option>mp3</option>
58
+ <option>ogg</option>
59
+ <option>silk</option>
60
+ </select>
61
+ </div>
62
+ <div class="col-md-4 form-group">
63
+ <label data-toggle="tooltip" data-placement="top"
64
+ title="自动识别语言auto:可识别的语言根据不同speaker而不同,方言无法自动识别。方言模型需要手动指定语言,比如粤语Cantonese要指定参数lang=gd">lang</label>
65
+ <input type="text" class="form-control" id="inputLang1" oninput="updateLink()" value=""
66
+ placeholder="auto"/>
67
+ </div>
68
+ <div class="col-md-4 form-group">
69
+ <label data-toggle="tooltip" data-placement="top"
70
+ title="调节语音长度,相当于调节语速,该数值越大语速越慢。">length</label>
71
+ <input type="number" class="form-control" id="inputLength1" oninput="updateLink()" value=""
72
+ placeholder="1" min="0" step="0.001"/>
73
+ </div>
74
+ </div>
75
+ <div class="row">
76
+ <div class="col-md-4 form-group">
77
+ <label data-toggle="tooltip" data-placement="top"
78
+ title="样本噪声,控制合成的随机性。">noise</label>
79
+ <input type="number" class="form-control" id="inputNoise1" oninput="updateLink()" value=""
80
+ placeholder="0.33" min="0" step="0.001"/>
81
+ </div>
82
+ <div class="col-md-4 form-group">
83
+ <label data-toggle="tooltip" data-placement="top"
84
+ title="随机时长预测器噪声,控制音素发音长度。">noisew</label>
85
+ <input type="number" class="form-control" id="inputNoisew1" oninput="updateLink()" value=""
86
+ placeholder="0.4" min="0" step="0.001"/>
87
+ </div>
88
+ <div class="col-md-4 form-group">
89
+ <label data-toggle="tooltip" data-placement="top"
90
+ title="按标点符号分段,加起来大于max时为一段文本。max<=0表示不分段。">max</label>
91
+ <input type="number" class="form-control" id="inputMax1" oninput="updateLink()" value=""
92
+ placeholder="50" step="1"/>
93
+ </div>
94
+ </div>
95
+ </form>
96
+
97
+
98
+ <div class="flex flex-wrap w-100"
99
+ style="justify-content: center; align-items: center; height: 80px; margin-top: 20px; margin-bottom: 20px; border: 1px solid rgba(0,0,0,.125); border-radius: 0.25rem;">
100
+ <button type="button" class="btn btn-outline-secondary" onclick="setAudioSource()"
101
+ style="margin-right: 10px">
102
+ 播放器生成
103
+ </button>
104
+ <audio id="audioPlayer1" controls>
105
+ <source src="" type="audio/mp3"/>
106
+ Your browser does not support the audio element.
107
+ </audio>
108
+ <div class="form-group form-check">
109
+ <input type="checkbox" id="streaming1" onchange="updateLink()">
110
+ <label class="form-check-label" data-toggle="tooltip" data-placement="top"
111
+ title="按照max分段推理文本,推理好一段即输出,无需等待所有文本都推理完毕">流式响应</label>
112
+ </div>
113
  </div>
114
+ </div>
115
+ <div class="content-pane">
116
+ <form class="w-100">
117
+ <div class="form-group">
118
+ <label>text</label>
119
+ <textarea class="form-control" id="inputText2" rows="3"
120
+ oninput="updateLink()">你好,こんにちは</textarea>
121
+ </div>
122
+ <div class="form-group">
123
+ <label>id</label>
124
+ <select class="form-control" id="inputId2" oninput="updateLink()">
125
+ {% for speaker in speakers["W2V2-VITS"] %}
126
+ <option value="{{ speaker["id"] }}">{{ speaker["id"] }} | {{ speaker["name"] }}
127
+ | {{ speaker["lang"] }}</option>
128
+ {% endfor %}
129
+ </select>
130
+ </div>
131
+ <div class="form-group mb-3">
132
+ <label data-toggle="tooltip" data-placement="top"
133
+ title="情感嵌入,{% if w2v2_emotion_count > 0 %}
134
+ 可输入范围是0-{{ w2v2_emotion_count-1 }}
135
+ {% else %}
136
+ 未加载emotion
137
+ {% endif %}">emotion</label>
138
+ <input type="number" class="form-control" min="0" max="{{ w2v2_emotion_count-1 }}" step="1"
139
+ id="emotion" value="0" oninput="updateLink()">
140
+ </div>
141
+ </form>
142
+
143
+
144
+ <form class="w-100">
145
+ <div class="row">
146
+ <div class="col-md-4 form-group">
147
+ <label data-toggle="tooltip" data-placement="top"
148
+ title="默认为wav">format</label>
149
+ <select class="form-control" id="inputFormat2" oninput="updateLink()">
150
+ <option></option>
151
+ <option>wav</option>
152
+ <option>mp3</option>
153
+ <option>ogg</option>
154
+ <option>silk</option>
155
+ </select>
156
+ </div>
157
+ <div class="col-md-4 form-group">
158
+ <label data-toggle="tooltip" data-placement="top"
159
+ title="自动识别语言auto:可识别的语言根据不同speaker而不同,方言无法自动识别。方言模型需要手动指定语言,比如粤语Cantonese要指定参数lang=gd">lang</label>
160
+ <input type="text" class="form-control" id="inputLang2" oninput="updateLink()" value=""
161
+ placeholder="auto"/>
162
+ </div>
163
+ <div class="col-md-4 form-group">
164
+ <label data-toggle="tooltip" data-placement="top"
165
+ title="调节语音长度,相当于调节语速,该数值越大语速越慢。">length</label>
166
+ <input type="number" class="form-control" id="inputLength2" oninput="updateLink()" value=""
167
+ placeholder="1" min="0" step="0.001"/>
168
+ </div>
169
+ </div>
170
+ <div class="row">
171
+ <div class="col-md-4 form-group">
172
+ <label data-toggle="tooltip" data-placement="top"
173
+ title="样本噪声,控制合成的随机性。">noise</label>
174
+ <input type="number" class="form-control" id="inputNoise2" oninput="updateLink()" value=""
175
+ placeholder="0.33" min="0" step="0.001"/>
176
+ </div>
177
+ <div class="col-md-4 form-group">
178
+ <label data-toggle="tooltip" data-placement="top"
179
+ title="随机时长预测器噪声,控制音素发音长度。">noisew</label>
180
+ <input type="number" class="form-control" id="inputNoisew2" oninput="updateLink()" value=""
181
+ placeholder="0.4" min="0" step="0.001"/>
182
+ </div>
183
+ <div class="col-md-4 form-group">
184
+ <label data-toggle="tooltip" data-placement="top"
185
+ title="按标点符号分段,加起来大于max时为一段文本。max<=0表示不分段。">max</label>
186
+ <input type="number" class="form-control" id="inputMax2" oninput="updateLink()" value=""
187
+ placeholder="50" step="1"/>
188
+ </div>
189
+ </div>
190
+ </form>
191
+
192
+ <div class="flex flex-wrap w-100"
193
+ style="justify-content: center; align-items: center; height: 80px; margin-top: 20px; margin-bottom: 20px; border: 1px solid rgba(0,0,0,.125); border-radius: 0.25rem;">
194
+ <button type="button" class="btn btn-outline-secondary" onclick="setAudioSource()"
195
+ style="margin-right: 10px">
196
+ 播放器生成
197
+ </button>
198
+ <audio id="audioPlayer2" controls>
199
+ <source src="" type="audio/mp3"/>
200
+ Your browser does not support the audio element.
201
+ </audio>
202
+ <div class="form-group form-check">
203
+ <input type="checkbox" id="streaming2" onchange="updateLink()">
204
+ <label class="form-check-label">流式响应</label>
205
+ </div>
206
  </div>
207
+ </div>
208
  </div>
 
209
 
210
+ <div class="mt-2">
211
+ {% if speakers_count == 0 %}
212
+ <div style="color: red;">未加载任何模型</div>
213
+ {% endif %}
214
+ <div>
215
+ <label>返回speakers(json):</label>
216
+ <a id="speakersLink" href="https://artrajz-vits-simple-api.hf.space/voice/speakers" target="_blank"
217
+ style="text-decoration: none; color: black">
218
+ https://artrajz-vits-simple-api.hf.space/voice/speakers
219
+ </a>
220
+ </div>
221
+ <div>
222
+ <label>API调用:</label>
223
+ <a id="vitsLink" href="https://artrajz-vits-simple-api.hf.space/voice/vits?text=你好,こんにちは&id=164"
224
+ style="text-decoration: none; color: black">
225
+ https://artrajz-vits-simple-api.hf.space/voice/vits?text=你好,こんにちは&id=164
226
+ </a>
227
+ </div>
228
  </div>
229
+ <h2>所有模型均为网络搜集,感谢模型原作者的付出!</h2>
230
+ <h2>请严格遵循模型原作者使用协议!</h2>
 
 
 
 
231
  <p>
232
  Nene_Nanami_Rong_Tang:
233
  <a href="https://github.com/CjangCjengh/TTSModels" rel="noreferrer" target="_blank">CjangCjengh/TTSModels</a>
 
268
  vits_chinese:
269
  <a href="https://github.com/PlayVoice/vits_chinese" rel="noreferrer" target="_blank">PlayVoice/vits_chinese</a>
270
  </p>
271
+ </div>
272
+ <br/>
273
 
274
  </main>
275
 
 
277
  <script src="/static/js/bootstrap.bundle.min.js"></script>
278
 
279
  <script>
280
+ $(function () {
281
+ $('[data-toggle="tooltip"]').tooltip()
282
+ })
283
+
284
  function getProtocol() {
285
  return 'https:' == location.protocol ? "https://" : "http://";
286
  }
 
291
  }
292
 
293
  var baseUrl = getProtocol() + getUrl();
294
+ var modelType = 1;
295
+ var vitsStatus = false;
296
+ var w2v2Status = false;
297
+ {% if vits_speakers_count > 0 %}
298
+ vitsStatus = true;
299
+ {% endif %}
300
+ {% if w2v2_speakers_count > 0 %}
301
+ w2v2Status = true;
302
+ {% endif %}
303
 
304
  setBaseUrl();
305
 
306
  function setBaseUrl() {
307
+ var text = document.getElementById("inputText" + modelType).value;
308
+ var id = document.getElementById("inputId" + modelType).value;
309
 
310
  var vitsLink = document.getElementById("vitsLink");
311
  var speakersLink = document.getElementById("speakersLink");
 
321
  }
322
 
323
  function getLink() {
324
+ var text = document.getElementById("inputText" + modelType).value;
325
+ var id = document.getElementById("inputId" + modelType).value;
326
+ var format = document.getElementById("inputFormat" + modelType).value;
327
+ var lang = document.getElementById("inputLang" + modelType).value;
328
+ var length = document.getElementById("inputLength" + modelType).value;
329
+ var noise = document.getElementById("inputNoise" + modelType).value;
330
+ var noisew = document.getElementById("inputNoisew" + modelType).value;
331
+ var max = document.getElementById("inputMax" + modelType).value;
332
+ var streaming = document.getElementById('streaming' + modelType);
333
 
334
+ if (modelType == 1) {
335
+ var url = baseUrl + "/voice/vits?text=" + text + "&id=" + id;
336
+ } else if (modelType == 2) {
337
+ var emotion = document.getElementById('emotion').value;
338
+ var url = baseUrl + "/voice/w2v2-vits?text=" + text + "&id=" + id + "&emotion=" + emotion;
339
+ }
340
  if (format != "") {
341
  url += "&format=" + format;
342
  }
 
355
  if (max != "") {
356
  url += "&max=" + max;
357
  }
358
+
359
  if (streaming.checked) {
360
  url += '&streaming=true';
361
  }
 
370
  }
371
 
372
  function setAudioSource() {
373
+ if (modelType==1 && !vitsStatus){
374
+ alert("未加载VITS模型");
375
+ return;
376
+ }
377
+ if (modelType==2 && !w2v2Status){
378
+ alert("未加载W2V2-VITS模型");
379
+ return;
380
+ }
381
  var url = getLink();
382
+ var audioPlayer = document.getElementById("audioPlayer" + modelType);
383
  audioPlayer.src = url;
384
  audioPlayer.play();
385
  }
386
 
387
// Activate the tab at position `index` (0-based): mark its pane and button as
// "active", deactivate every other pane/button, record the matching model
// type, and refresh the displayed API link.
function showContent(index) {
    // Tabs are numbered from 0 in the markup; modelType is numbered from 1.
    modelType = index + 1;

    const tabButtons = document.querySelectorAll(".tab-button");
    document.querySelectorAll(".content-pane").forEach(function (pane, position) {
        const selected = position === index;
        pane.classList.toggle("active", selected);
        tabButtons[position].classList.toggle("active", selected);
    });

    updateLink();
}
404
  </script>
405
  </body>
406
  </html>
text/cleaners.py CHANGED
@@ -186,6 +186,21 @@ def cjke_cleaners2(text):
186
 
187
 
188
  def cje_cleaners(text):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
189
  from text.mandarin import chinese_to_ipa
190
  from text.japanese import japanese_to_ipa2
191
  from text.english import english_to_ipa2
 
186
 
187
 
188
  def cje_cleaners(text):
189
+ from text.mandarin import chinese_to_lazy_ipa
190
+ from text.japanese import japanese_to_ipa
191
+ from text.english import english_to_ipa2
192
+ text = re.sub(r'\[ZH\](.*?)\[ZH\]', lambda x: chinese_to_lazy_ipa(x.group(1)).replace(
193
+ 'ʧ', 'tʃ').replace('ʦ', 'ts').replace('ɥan', 'ɥæn') + ' ', text)
194
+ text = re.sub(r'\[JA\](.*?)\[JA\]', lambda x: japanese_to_ipa(x.group(1)).replace('ʧ', 'tʃ').replace(
195
+ 'ʦ', 'ts').replace('ɥan', 'ɥæn').replace('ʥ', 'dz') + ' ', text)
196
+ text = re.sub(r'\[EN\](.*?)\[EN\]', lambda x: english_to_ipa2(x.group(1)).replace('ɑ', 'a').replace(
197
+ 'ɔ', 'o').replace('ɛ', 'e').replace('ɪ', 'i').replace('ʊ', 'u') + ' ', text)
198
+ text = re.sub(r'\s+$', '', text)
199
+ text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
200
+ return text
201
+
202
+
203
+ def cje_cleaners2(text):
204
  from text.mandarin import chinese_to_ipa
205
  from text.japanese import japanese_to_ipa2
206
  from text.english import english_to_ipa2
text/mandarin.py CHANGED
@@ -1,5 +1,4 @@
1
- import os
2
- import sys
3
  import re
4
  from pypinyin import lazy_pinyin, BOPOMOFO
5
  import jieba
@@ -7,7 +6,7 @@ import cn2an
7
  import logging
8
 
9
  logging.getLogger('jieba').setLevel(logging.WARNING)
10
- jieba.set_dictionary(os.path.dirname(os.path.realpath(sys.argv[0])) + '/jieba/dict.txt')
11
  jieba.initialize()
12
 
13
  # List of (Latin alphabet, bopomofo) pairs:
 
1
+ import config
 
2
  import re
3
  from pypinyin import lazy_pinyin, BOPOMOFO
4
  import jieba
 
6
  import logging
7
 
8
  logging.getLogger('jieba').setLevel(logging.WARNING)
9
+ jieba.set_dictionary(config.ABS_PATH + '/jieba/dict.txt')
10
  jieba.initialize()
11
 
12
  # List of (Latin alphabet, bopomofo) pairs:
utils/merge.py CHANGED
@@ -19,12 +19,13 @@ lang_dict = {
19
  "cjke_cleaners": ["zh", "ja", "ko", "en"],
20
  "cjke_cleaners2": ["zh", "ja", "ko", "en"],
21
  "cje_cleaners": ["zh", "ja", "en"],
 
22
  "thai_cleaners": ["th"],
23
  "shanghainese_cleaners": ["sh"],
24
  "chinese_dialect_cleaners": ["zh", "ja", "sh", "gd", "en", "SZ", "WX", "CZ", "HZ", "SX", "NB", "JJ", "YX", "JD",
25
  "ZR", "PH", "TX", "JS", "HN", "LP", "XS", "FY", "RA", "CX", "SM", "TT", "WZ", "SC",
26
  "YB"],
27
- "bert_chinese_cleaners":["zh"],
28
  }
29
 
30
 
@@ -109,11 +110,16 @@ def merge_model(merging_model):
109
  for obj_id, i in enumerate(vits_list):
110
  obj = vits(model=i[0], config=i[1], model_type="vits")
111
  lang = lang_dict.get(obj.get_cleaner(), ["unknown"])
112
-
113
- for id, name in enumerate(obj.get_speakers()):
114
- vits_obj.append([int(id), obj, obj_id])
115
- vits_speakers.append({"id": new_id, "name": name, "lang": lang})
116
- new_id += 1
 
 
 
 
 
117
 
118
  # merge hubert-vits
119
  if len(hubert_vits_list) != 0:
@@ -136,6 +142,7 @@ def merge_model(merging_model):
136
  new_id += 1
137
 
138
  # merge w2v2-vits
 
139
  if len(w2v2_vits_list) != 0:
140
  if getattr(config, "DIMENSIONAL_EMOTION_NPY", None) == None or check_is_none(config.DIMENSIONAL_EMOTION_NPY):
141
  raise ValueError(f"Please configure DIMENSIONAL_EMOTION_NPY path in config.py")
@@ -156,7 +163,8 @@ def merge_model(merging_model):
156
 
157
  voice_obj = {"VITS": vits_obj, "HUBERT-VITS": hubert_vits_obj, "W2V2-VITS": w2v2_vits_obj}
158
  voice_speakers = {"VITS": vits_speakers, "HUBERT-VITS": hubert_vits_speakers, "W2V2-VITS": w2v2_vits_speakers}
159
-
160
- tts = TTS(voice_obj, voice_speakers)
 
161
 
162
  return tts
 
19
  "cjke_cleaners": ["zh", "ja", "ko", "en"],
20
  "cjke_cleaners2": ["zh", "ja", "ko", "en"],
21
  "cje_cleaners": ["zh", "ja", "en"],
22
+ "cje_cleaners2": ["zh", "ja", "en"],
23
  "thai_cleaners": ["th"],
24
  "shanghainese_cleaners": ["sh"],
25
  "chinese_dialect_cleaners": ["zh", "ja", "sh", "gd", "en", "SZ", "WX", "CZ", "HZ", "SX", "NB", "JJ", "YX", "JD",
26
  "ZR", "PH", "TX", "JS", "HN", "LP", "XS", "FY", "RA", "CX", "SM", "TT", "WZ", "SC",
27
  "YB"],
28
+ "bert_chinese_cleaners": ["zh"],
29
  }
30
 
31
 
 
110
  for obj_id, i in enumerate(vits_list):
111
  obj = vits(model=i[0], config=i[1], model_type="vits")
112
  lang = lang_dict.get(obj.get_cleaner(), ["unknown"])
113
+ if isinstance(obj.get_speakers(), list):
114
+ for id, name in enumerate(obj.get_speakers()):
115
+ vits_obj.append([int(id), obj, obj_id])
116
+ vits_speakers.append({"id": new_id, "name": name, "lang": lang})
117
+ new_id += 1
118
+ else:
119
+ for id, (name, _) in enumerate(obj.get_speakers().items()):
120
+ vits_obj.append([int(id), obj, obj_id])
121
+ vits_speakers.append({"id": new_id, "name": name, "lang": lang})
122
+ new_id += 1
123
 
124
  # merge hubert-vits
125
  if len(hubert_vits_list) != 0:
 
142
  new_id += 1
143
 
144
  # merge w2v2-vits
145
+ emotion_reference = None
146
  if len(w2v2_vits_list) != 0:
147
  if getattr(config, "DIMENSIONAL_EMOTION_NPY", None) == None or check_is_none(config.DIMENSIONAL_EMOTION_NPY):
148
  raise ValueError(f"Please configure DIMENSIONAL_EMOTION_NPY path in config.py")
 
163
 
164
  voice_obj = {"VITS": vits_obj, "HUBERT-VITS": hubert_vits_obj, "W2V2-VITS": w2v2_vits_obj}
165
  voice_speakers = {"VITS": vits_speakers, "HUBERT-VITS": hubert_vits_speakers, "W2V2-VITS": w2v2_vits_speakers}
166
+ w2v2_emotion_count = len(emotion_reference) if emotion_reference is not None else 0
167
+
168
+ tts = TTS(voice_obj, voice_speakers, w2v2_emotion_count=w2v2_emotion_count)
169
 
170
  return tts
utils/nlp.py CHANGED
@@ -1,13 +1,7 @@
1
  import regex as re
2
- import logging
3
  import config
4
  from .utils import check_is_none
5
-
6
- logger = logging.getLogger("vits-simple-api")
7
- level = getattr(config, "LOGGING_LEVEL", "DEBUG")
8
- level_dict = {'DEBUG': logging.DEBUG, 'INFO': logging.INFO, 'WARNING': logging.WARNING, 'ERROR': logging.ERROR,
9
- 'CRITICAL': logging.CRITICAL}
10
- logger.setLevel(level_dict[level])
11
 
12
 
13
  def clasify_lang(text, speaker_lang):
 
1
  import regex as re
 
2
  import config
3
  from .utils import check_is_none
4
+ from logger import logger
 
 
 
 
 
5
 
6
 
7
  def clasify_lang(text, speaker_lang):
vits-simple-api-installer-latest.sh CHANGED
@@ -12,7 +12,32 @@ if [ ! -f config.py ]; then
12
  wget -O $INSTALL_DIR/config.py https://raw.githubusercontent.com/Artrajz/vits-simple-api/main/config.py
13
  fi
14
 
15
- wget -O $INSTALL_DIR/docker-compose.yaml https://raw.githubusercontent.com/Artrajz/vits-simple-api/main/docker-compose.yaml
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
  echo -e "${YELLOW}Pulling the image might take a while, so why not grab a cup of java first?\n${PLAIN}"
18
 
 
12
  wget -O $INSTALL_DIR/config.py https://raw.githubusercontent.com/Artrajz/vits-simple-api/main/config.py
13
  fi
14
 
15
# Fetch the default gunicorn configuration unless one is already present.
# NOTE(review): like the config.py check above, this tests the current
# directory while downloading into $INSTALL_DIR - confirm they are the same.
if [ ! -f gunicorn_config.py ]; then
    # The message previously said "config.py", which misreported the file being fetched.
    echo -e "${YELLOW}download gunicorn_config.py\n${PLAIN}"
    wget -O "$INSTALL_DIR/gunicorn_config.py" https://raw.githubusercontent.com/Artrajz/vits-simple-api/main/gunicorn_config.py
fi

# Ask which compose file to install. Both variants are saved under the same
# name, docker-compose.yaml, inside $INSTALL_DIR.
while true; do
    echo -e "${GREEN}Which version of docker-compose.yaml do you want to download?"
    echo -e "1. docker-compose.yaml (CPU version)"
    echo -e "2. docker-compose-gpu.yaml (GPU version)"
    # -r keeps backslashes literal. A failed read means stdin is closed
    # (e.g. the script is being piped into the shell); without this check the
    # loop would spin forever on an empty choice.
    if ! read -r -p "Enter your choice (1 or 2): " choice; then
        echo -e "\n${RED}No input available. Please run this script interactively.${PLAIN}"
        exit 1
    fi
    case $choice in
        1)
            echo -e "${YELLOW}Downloading docker-compose.yaml (CPU version)\n${PLAIN}"
            wget -O "$INSTALL_DIR/docker-compose.yaml" https://raw.githubusercontent.com/Artrajz/vits-simple-api/main/docker-compose.yaml
            break
            ;;
        2)
            echo -e "${YELLOW}Downloading docker-compose-gpu.yaml (GPU version)\n${PLAIN}"
            wget -O "$INSTALL_DIR/docker-compose.yaml" https://raw.githubusercontent.com/Artrajz/vits-simple-api/main/docker-compose-gpu.yaml
            break
            ;;
        *)
            echo -e "${RED}Invalid choice. Please enter 1 or 2.${PLAIN}"
            ;;
    esac
done
41
 
42
  echo -e "${YELLOW}Pulling the image might take a while, so why not grab a cup of java first?\n${PLAIN}"
43
 
voice.py CHANGED
@@ -6,7 +6,6 @@ import numpy as np
6
  import torch
7
  import xml.etree.ElementTree as ET
8
  import config
9
- import logging
10
  import soundfile as sf
11
  from torch import no_grad, LongTensor, inference_mode, FloatTensor
12
  from io import BytesIO
@@ -16,6 +15,7 @@ from mel_processing import spectrogram_torch
16
  from text import text_to_sequence
17
  from models import SynthesizerTrn
18
  from utils import utils
 
19
 
20
  # torch.set_num_threads(1) # 设置torch线程为1
21
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -251,7 +251,7 @@ class vits:
251
 
252
 
253
  class TTS:
254
- def __init__(self, voice_obj, voice_speakers):
255
  self._voice_obj = voice_obj
256
  self._voice_speakers = voice_speakers
257
  self._strength_dict = {"x-weak": 0.25, "weak": 0.5, "Medium": 0.75, "Strong": 1, "x-strong": 1.25}
@@ -259,10 +259,11 @@ class TTS:
259
  self._vits_speakers_count = len(self._voice_speakers["VITS"])
260
  self._hubert_speakers_count = len(self._voice_speakers["HUBERT-VITS"])
261
  self._w2v2_speakers_count = len(self._voice_speakers["W2V2-VITS"])
 
262
  self.dem = None
263
 
264
  # Initialization information
265
- self.logger = logging.getLogger("vits-simple-api")
266
  self.logger.info(f"torch:{torch.__version__} cuda_available:{torch.cuda.is_available()}")
267
  self.logger.info(f'device:{device} device.type:{device.type}')
268
 
@@ -420,9 +421,7 @@ class TTS:
420
 
421
  return voice_tasks, format
422
 
423
- def create_ssml_infer_task(self, ssml, fname):
424
- voice_tasks, format = self.parse_ssml(ssml)
425
-
426
  audios = []
427
  for voice in voice_tasks:
428
  if voice.get("break"):
@@ -438,10 +437,10 @@ class TTS:
438
 
439
  audio = np.concatenate(audios, axis=0)
440
  encoded_audio = self.encode(voice_obj.hps_ms.data.sampling_rate, audio, format)
441
- if config.SAVE_AUDIO:
442
  path = f"{config.CACHE_PATH}/{fname}"
443
  utils.save_audio(encoded_audio.getvalue(), path)
444
- return encoded_audio, format
445
 
446
  def vits_infer(self, voice, fname):
447
  format = voice.get("format", "wav")
@@ -450,7 +449,7 @@ class TTS:
450
  sampling_rate = voice_obj.hps_ms.data.sampling_rate
451
  audio = voice_obj.get_audio(voice, auto_break=True)
452
  encoded_audio = self.encode(sampling_rate, audio, format)
453
- if config.SAVE_AUDIO:
454
  path = f"{config.CACHE_PATH}/{fname}"
455
  utils.save_audio(encoded_audio.getvalue(), path)
456
  return encoded_audio
@@ -466,9 +465,9 @@ class TTS:
466
  encoded_audio = self.encode(sampling_rate, chunk, format)
467
  for encoded_audio_chunk in self.generate_audio_chunks(encoded_audio):
468
  yield encoded_audio_chunk
469
- if config.SAVE_AUDIO:
470
- audio.write(encoded_audio.getvalue())
471
- if config.SAVE_AUDIO:
472
  path = f"{config.CACHE_PATH}/{fname}"
473
  utils.save_audio(audio.getvalue(), path)
474
 
@@ -479,7 +478,7 @@ class TTS:
479
  sampling_rate = voice_obj.hps_ms.data.sampling_rate
480
  audio = voice_obj.get_audio(voice)
481
  encoded_audio = self.encode(sampling_rate, audio, format)
482
- if config.SAVE_AUDIO:
483
  path = f"{config.CACHE_PATH}/{fname}"
484
  utils.save_audio(encoded_audio.getvalue(), path)
485
  return encoded_audio
@@ -491,7 +490,7 @@ class TTS:
491
  sampling_rate = voice_obj.hps_ms.data.sampling_rate
492
  audio = voice_obj.get_audio(voice, auto_break=True)
493
  encoded_audio = self.encode(sampling_rate, audio, format)
494
- if config.SAVE_AUDIO:
495
  path = f"{config.CACHE_PATH}/{fname}"
496
  utils.save_audio(encoded_audio.getvalue(), path)
497
  return encoded_audio
@@ -515,7 +514,7 @@ class TTS:
515
 
516
  audio = voice_obj.voice_conversion(voice)
517
  encoded_audio = self.encode(sampling_rate, audio, format)
518
- if config.SAVE_AUDIO:
519
  path = f"{config.CACHE_PATH}/{fname}"
520
  utils.save_audio(encoded_audio.getvalue(), path)
521
  return encoded_audio
 
6
  import torch
7
  import xml.etree.ElementTree as ET
8
  import config
 
9
  import soundfile as sf
10
  from torch import no_grad, LongTensor, inference_mode, FloatTensor
11
  from io import BytesIO
 
15
  from text import text_to_sequence
16
  from models import SynthesizerTrn
17
  from utils import utils
18
+ from logger import logger
19
 
20
  # torch.set_num_threads(1) # 设置torch线程为1
21
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
251
 
252
 
253
  class TTS:
254
+ def __init__(self, voice_obj, voice_speakers, w2v2_emotion_count=0):
255
  self._voice_obj = voice_obj
256
  self._voice_speakers = voice_speakers
257
  self._strength_dict = {"x-weak": 0.25, "weak": 0.5, "Medium": 0.75, "Strong": 1, "x-strong": 1.25}
 
259
  self._vits_speakers_count = len(self._voice_speakers["VITS"])
260
  self._hubert_speakers_count = len(self._voice_speakers["HUBERT-VITS"])
261
  self._w2v2_speakers_count = len(self._voice_speakers["W2V2-VITS"])
262
+ self._w2v2_emotion_count = w2v2_emotion_count
263
  self.dem = None
264
 
265
  # Initialization information
266
+ self.logger = logger
267
  self.logger.info(f"torch:{torch.__version__} cuda_available:{torch.cuda.is_available()}")
268
  self.logger.info(f'device:{device} device.type:{device.type}')
269
 
 
421
 
422
  return voice_tasks, format
423
 
424
+ def create_ssml_infer_task(self, voice_tasks, format, fname):
 
 
425
  audios = []
426
  for voice in voice_tasks:
427
  if voice.get("break"):
 
437
 
438
  audio = np.concatenate(audios, axis=0)
439
  encoded_audio = self.encode(voice_obj.hps_ms.data.sampling_rate, audio, format)
440
+ if getattr(config, "SAVE_AUDIO", False):
441
  path = f"{config.CACHE_PATH}/{fname}"
442
  utils.save_audio(encoded_audio.getvalue(), path)
443
+ return encoded_audio
444
 
445
  def vits_infer(self, voice, fname):
446
  format = voice.get("format", "wav")
 
449
  sampling_rate = voice_obj.hps_ms.data.sampling_rate
450
  audio = voice_obj.get_audio(voice, auto_break=True)
451
  encoded_audio = self.encode(sampling_rate, audio, format)
452
+ if getattr(config, "SAVE_AUDIO", False):
453
  path = f"{config.CACHE_PATH}/{fname}"
454
  utils.save_audio(encoded_audio.getvalue(), path)
455
  return encoded_audio
 
465
  encoded_audio = self.encode(sampling_rate, chunk, format)
466
  for encoded_audio_chunk in self.generate_audio_chunks(encoded_audio):
467
  yield encoded_audio_chunk
468
+ if getattr(config, "SAVE_AUDIO", False):
469
+ audio.write(encoded_audio.getvalue())
470
+ if getattr(config, "SAVE_AUDIO", False):
471
  path = f"{config.CACHE_PATH}/{fname}"
472
  utils.save_audio(audio.getvalue(), path)
473
 
 
478
  sampling_rate = voice_obj.hps_ms.data.sampling_rate
479
  audio = voice_obj.get_audio(voice)
480
  encoded_audio = self.encode(sampling_rate, audio, format)
481
+ if getattr(config, "SAVE_AUDIO", False):
482
  path = f"{config.CACHE_PATH}/{fname}"
483
  utils.save_audio(encoded_audio.getvalue(), path)
484
  return encoded_audio
 
490
  sampling_rate = voice_obj.hps_ms.data.sampling_rate
491
  audio = voice_obj.get_audio(voice, auto_break=True)
492
  encoded_audio = self.encode(sampling_rate, audio, format)
493
+ if getattr(config, "SAVE_AUDIO", False):
494
  path = f"{config.CACHE_PATH}/{fname}"
495
  utils.save_audio(encoded_audio.getvalue(), path)
496
  return encoded_audio
 
514
 
515
  audio = voice_obj.voice_conversion(voice)
516
  encoded_audio = self.encode(sampling_rate, audio, format)
517
+ if getattr(config, "SAVE_AUDIO", False):
518
  path = f"{config.CACHE_PATH}/{fname}"
519
  utils.save_audio(encoded_audio.getvalue(), path)
520
  return encoded_audio