# GSVI_Test1/Inference/src/classic_inference/classic_text_cleaner.py
# XTer - "Automated commit from batch script" (5bbd2a7)
# (file-viewer residue: raw | history | blame | contribute | delete; No virus; 3.4 kB)
import sys
sys.path.insert(0, "GPT_SoVITS")
from text import cleaned_text_to_sequence
from text.cleaner import clean_text
import LangSegment
import re
# Punctuation marks at which get_first() cuts a text.
# The fullwidth comma, question mark, exclamation mark and colon are spelled
# as \u escapes so they stay distinct from their ASCII twins even if this
# file passes through tooling that applies Unicode compatibility folding.
splits = {
    ",", ".", "?", "!", "~", ":",   # ASCII marks
    "。", "—", "…",                  # ideographic full stop, em dash, ellipsis
    "\uff0c", "\uff1f", "\uff01", "\uff1a",  # fullwidth , ? ! :
}
def clean_text_inf(text, language):
    """Clean *text* for *language* and convert its phones to a sequence.

    Args:
        text: The text handed to ``clean_text``.
        language: The language tag handed to ``clean_text``.

    Returns:
        tuple: ``(phones, word2ph, norm_text)`` where ``phones`` is the result
        of ``cleaned_text_to_sequence`` applied to the phones produced by
        ``clean_text``; the other two values are passed through unchanged.
    """
    raw_phones, word2ph, norm_text = clean_text(text, language)
    return cleaned_text_to_sequence(raw_phones), word2ph, norm_text
def get_first(text):
    """Return the part of *text* before its first split mark, stripped.

    The split marks are the members of the module-level ``splits`` set; if
    none of them occurs, the whole (stripped) text is returned.
    """
    # One character class holding every split mark, each escaped for regex use.
    separator_class = "[{}]".format("".join(map(re.escape, splits)))
    # Only the leading piece is needed, so a single split is enough.
    leading = re.split(separator_class, text, maxsplit=1)[0]
    return leading.strip()
def splite_en_inf(sentence, language):
    """
    Break a sentence into text segments, each paired with a language tag.

    Runs of ASCII letters and spaces are tagged "en"; everything else keeps
    the caller-supplied tag. Segments made only of non-word characters or
    underscores are glued onto the segment before them, and neighbouring
    segments that end up with the same tag are joined.

    Args:
        sentence (str): The input sentence to be split.
        language (str): The language tag for the non-English parts.

    Returns:
        tuple: A tuple containing two lists - textlist and langlist.
            - textlist: The text segments, in order of appearance.
            - langlist: The language tag of each segment in textlist.
    """
    latin_run = re.compile(r'[a-zA-Z ]+')
    punctuation_only = re.compile(r'^[\W_]+$')

    # Step 1: cut the sentence into alternating (text, tag) pieces.
    pieces = []
    cursor = 0
    for found in latin_run.finditer(sentence):
        begin, finish = found.span()
        if cursor < begin:
            pieces.append((sentence[cursor:begin], language))
        pieces.append((found.group(), "en"))
        cursor = finish
    remainder = sentence[cursor:]
    if remainder:
        pieces.append((remainder, language))

    # Step 2: fold each piece into the previous segment when it is pure
    # punctuation or carries the same tag; otherwise start a new segment.
    # The very first piece always starts a segment, even if it is punctuation.
    textlist = []
    langlist = []
    for chunk, tag in pieces:
        if textlist and (punctuation_only.match(chunk) or tag == langlist[-1]):
            textlist[-1] += chunk
        else:
            textlist.append(chunk)
            langlist.append(tag)
    return textlist, langlist
def merge_short_text_in_array(texts, threshold):
    """Concatenate neighbouring strings until each chunk reaches *threshold*.

    Args:
        texts: Sequence of strings. With fewer than two items it is returned
            unchanged (the same object).
        threshold: Minimum length a chunk must reach before it is emitted.

    Returns:
        A list of merged strings. Text left over at the end that never
        reached the threshold is appended to the last chunk, or becomes the
        only chunk when nothing was emitted.
    """
    if len(texts) < 2:
        return texts

    merged = []
    pending = ""
    for piece in texts:
        pending = pending + piece
        if len(pending) < threshold:
            continue
        merged.append(pending)
        pending = ""

    if pending:
        if merged:
            merged[-1] += pending
        else:
            merged.append(pending)
    return merged
# Sentence-ending marks: the ASCII forms, the ideographic full stop, and the
# fullwidth forms of "?", "!" and ":". Fullwidth characters are written as
# \u escapes so they cannot be folded into their ASCII twins by tooling.
_SENTENCE_ENDERS = "?!。~:\uff1f\uff01\uff1a"
_SENTENCE_END_RE = re.compile("([" + re.escape(_SENTENCE_ENDERS) + "])")

# Clause separators: ASCII and fullwidth comma, ideographic comma, ellipsis,
# and the two-character dash. The dash alternative comes first so it is
# matched as one unit.
_CLAUSE_SEPARATOR_RE = re.compile("——|[,\uff0c、…]")


def _break_long_clauses(text, limit=12):
    """Insert a newline after a clause separator once a clause grows long.

    A running count of non-separator characters is kept. When a separator is
    reached and the count exceeds *limit*, a newline is inserted right after
    the separator and the count restarts. Separators reached earlier leave
    the count untouched, so short clauses accumulate.
    """
    pieces = []
    run_length = 0
    consumed = 0
    for found in _CLAUSE_SEPARATOR_RE.finditer(text):
        run_length += found.start() - consumed
        pieces.append(text[consumed:found.end()])
        if run_length > limit:
            pieces.append("\n")
            run_length = 0
        consumed = found.end()
    pieces.append(text[consumed:])
    return "".join(pieces)


def auto_cut(inp):
    """Split *inp* into lines at sentence ends and after long clauses.

    Args:
        inp (str): Text to cut. Leading and trailing newlines are removed.

    Returns:
        str: The text with one sentence per line. A sentence ends at any of
        ``? ! 。 ~ :`` or the fullwidth forms of ``? ! :``. If the text does
        not already end with one of these, "。" is appended. Inside each
        sentence a further newline follows a clause separator once more than
        12 non-separator characters have accumulated. Empty input yields an
        empty string.
    """
    inp = inp.strip("\n")
    if not inp:
        # Nothing to cut; also avoids indexing into an empty string.
        return ""

    # Make sure the final sentence has a terminator, otherwise the pairing
    # of (sentence, terminator) below would drop it.
    if inp[-1] not in _SENTENCE_ENDERS:
        inp += "。"

    # The capturing group keeps the terminators: even indices hold sentence
    # bodies, odd indices hold the mark that ended them.
    parts = _SENTENCE_END_RE.split(inp)
    sentences = [body + mark for body, mark in zip(parts[::2], parts[1::2])]

    return "\n".join(_break_long_clauses(sentence) for sentence in sentences)