Spaces:

XTer123
/

GSVI_Test1

Runtime error

App Files Files Community

GSVI_Test1 / Inference /src /classic_inference /classic_text_cleaner.py

XTer

Automated commit from batch script

5bbd2a7 6 months ago

raw

history blame contribute delete

No virus

3.4 kB

	import sys
	sys.path.insert(0, "GPT_SoVITS")
	from text import cleaned_text_to_sequence
	from text.cleaner import clean_text
	import LangSegment
	import re

	# Half- and full-width punctuation marks; get_first() cuts its input at the
	# first occurrence of any of these characters.
	splits = set("，。？！,.?!~:：—…")

	def clean_text_inf(text, language):
	    """Clean ``text`` for ``language`` and convert its phones to a sequence.

	    Args:
	        text: Input text, forwarded unchanged to ``clean_text``.
	        language: Language tag, forwarded unchanged to ``clean_text``.

	    Returns:
	        A 3-tuple ``(phones, word2ph, norm_text)`` where ``phones`` is the
	        result of ``cleaned_text_to_sequence`` applied to the phones returned
	        by ``clean_text``; the other two items are passed through untouched.
	        (Presumably ``phones`` ends up as integer symbol IDs -- confirm
	        against the ``text`` package.)
	    """
	    raw_phones, word2ph, norm_text = clean_text(text, language)
	    return cleaned_text_to_sequence(raw_phones), word2ph, norm_text





	def get_first(text):
	    """Return the part of ``text`` before its first punctuation mark.

	    The punctuation marks are the members of the module-level ``splits`` set.
	    The returned fragment has surrounding whitespace stripped; if ``text``
	    contains none of the marks, the whole (stripped) text is returned.
	    """
	    escaped_marks = [re.escape(mark) for mark in splits]
	    char_class = "[{}]".format("".join(escaped_marks))
	    head = re.split(char_class, text)[0]
	    return head.strip()

	def splite_en_inf(sentence, language):
	    """Partition ``sentence`` into English and non-English segments.

	    Runs of ASCII letters and spaces are tagged ``"en"``; everything else is
	    tagged with ``language``. Segments made only of non-word characters or
	    underscores (punctuation, spaces) are then glued onto the segment before
	    them, and finally neighbouring segments that share a tag are joined.

	    Args:
	        sentence (str): Text to partition.
	        language (str): Tag given to the non-English parts.

	    Returns:
	        tuple: ``(textlist, langlist)`` -- two parallel lists holding the
	        segment texts and the tag of each segment.
	    """
	    latin_run = re.compile(r'[a-zA-Z ]+')
	    symbols_only = re.compile(r'^[\W_]+$')

	    # Pass 1: cut the sentence into [text, tag] records.
	    pieces = []
	    cursor = 0
	    for found in latin_run.finditer(sentence):
	        begin, stop = found.span()
	        if cursor < begin:
	            pieces.append([sentence[cursor:begin], language])
	        pieces.append([found.group(), "en"])
	        cursor = stop
	    if cursor < len(sentence):
	        pieces.append([sentence[cursor:], language])

	    # Pass 2: walking backwards, fold symbol-only records into their
	    # predecessor. The first record is never folded (nothing precedes it).
	    for idx in range(len(pieces) - 1, 0, -1):
	        fragment = pieces[idx][0]
	        if symbols_only.match(fragment):
	            pieces[idx - 1][0] += fragment
	            del pieces[idx]

	    # Pass 3: collapse each run of neighbours carrying the same tag.
	    textlist = []
	    langlist = []
	    for fragment, tag in pieces:
	        if langlist and langlist[-1] == tag:
	            textlist[-1] += fragment
	        else:
	            textlist.append(fragment)
	            langlist.append(tag)

	    return textlist, langlist


	def merge_short_text_in_array(texts, threshold):
	    """Concatenate consecutive strings until each chunk reaches ``threshold``.

	    Args:
	        texts: Sequence of strings. When it holds fewer than two items it is
	            returned as-is (the same object, not a copy).
	        threshold: Minimum length a chunk must reach before it is emitted.

	    Returns:
	        A list of merged chunks. A trailing remainder shorter than
	        ``threshold`` is appended to the last chunk, or becomes the only
	        chunk when nothing reached the threshold.
	    """
	    if len(texts) < 2:
	        return texts

	    merged = []
	    pending = ""
	    for piece in texts:
	        pending = pending + piece
	        if len(pending) < threshold:
	            continue
	        merged.append(pending)
	        pending = ""

	    if not pending:
	        return merged
	    if merged:
	        merged[-1] += pending
	    else:
	        merged.append(pending)
	    return merged


	def auto_cut(inp):
	    """Break ``inp`` into lines at sentence ends and after long comma clauses.

	    Leading/trailing newlines are stripped. If the text does not already end
	    with one of the sentence-ending marks ``?!。？！~：`` a ``。`` is appended
	    so that the final sentence is kept. The text is then split after every
	    sentence-ending mark, and inside each sentence a newline is inserted
	    after a comma-like separator once more than 12 non-separator characters
	    have been seen since the previous inserted break.

	    Args:
	        inp (str): Text to cut.

	    Returns:
	        str: The sentences joined with ``"\n"``. An input that is empty (or
	        consists only of newlines) yields ``""``.
	    """
	    # Kept as a plain string of characters so the membership test below
	    # checks the marks themselves and not the regex brackets around them
	    # (text ending in "]" must still get a closing "。").
	    sentence_enders = "?!。？！~："

	    inp = inp.strip("\n")
	    if not inp:
	        # Nothing to cut; also avoids indexing into an empty string.
	        return ""
	    if inp[-1] not in sentence_enders:
	        inp += "。"

	    # The capture group makes re.split keep the delimiters, so the result
	    # alternates text, mark, text, mark, ..., "" and can be re-paired.
	    items = re.split(f'([{sentence_enders}])', inp)
	    items = ["".join(group) for group in zip(items[::2], items[1::2])]

	    def process_commas(text):
	        """Insert a newline after a separator that follows a long clause."""
	        # NOTE: the loop compares single characters, so the two-character
	        # entry '——' can never match; it is kept unchanged on purpose.
	        separators = ['，', ',', '、', '——', '…']
	        run_length = 0
	        pieces = []
	        for char in text:
	            pieces.append(char)
	            if char in separators:
	                if run_length > 12:
	                    pieces.append('\n')
	                    run_length = 0
	            else:
	                # Count every non-separator character.
	                run_length += 1
	        return "".join(pieces)

	    final_items = [process_commas(item) for item in items]

	    return "\n".join(final_items)