Spaces:

XTer123
/

GSVI_Test1

Runtime error

GSVI_Test1 / GPT_SoVITS /TTS_infer_pack /text_segmentation_method.py

XTer

Automated commit from batch script

5bbd2a7 8 months ago

8.54 kB

	import re
	from typing import Callable
	from tools.i18n.i18n import I18nAuto

	# Project i18n helper, created at import time (not referenced by the code visible in this file).
	i18n = I18nAuto()

	# Registry of text-cutting functions keyed by method name; filled by register_method().
	METHODS = {}

	def get_method(name: str) -> Callable:
	    """Return the cutting function registered under ``name``.

	    Raises:
	        ValueError: if ``METHODS`` has no entry for ``name`` (or the entry is ``None``).
	    """
	    found = METHODS.get(name)
	    if found is not None:
	        return found
	    raise ValueError(f"Method {name} not found")

	def register_method(name):
	    """Return a decorator that stores the decorated function in ``METHODS`` under ``name``.

	    The decorated function is returned unchanged, so it stays callable by its own name.
	    """
	    def _store(fn):
	        METHODS[name] = fn
	        return fn

	    return _store

	# Full-width and half-width punctuation marks that split() and split_big_text() treat as boundaries.
	splits = {"，", "。", "？", "！", ",", ".", "?", "!", "~", ":", "：", "—", "…", }



	def split_big_text(text, max_len=510):
	    """Split ``text`` at punctuation into chunks of roughly ``max_len`` characters.

	    The text is cut at every mark in ``splits`` (marks are kept), and the pieces
	    are accumulated greedily: a chunk is closed as soon as adding the next piece
	    would push it past ``max_len``.

	    Args:
	        text: the text to split.
	        max_len: maximum chunk length in characters. A single piece that is
	            longer than ``max_len`` on its own is emitted as one oversized chunk.

	    Returns:
	        A list of non-empty chunks whose concatenation equals ``text``.
	    """
	    # Escape every mark so none of them can be read as character-class syntax.
	    pattern = "([" + "".join(re.escape(mark) for mark in splits) + "])"

	    chunks = []
	    current = ""
	    for piece in re.split(pattern, text):
	        if len(current) + len(piece) > max_len:
	            # Close the running chunk; skip it when empty, otherwise an
	            # oversized first piece would put a spurious '' into the result.
	            if current:
	                chunks.append(current)
	            current = piece
	        else:
	            current += piece

	    # Flush whatever is left after the last piece.
	    if current:
	        chunks.append(current)

	    return chunks

	def split(todo_text):
	    """Split ``todo_text`` into sentences, each ending with a mark from ``splits``.

	    "……" is first replaced by "。" and "——" by "，". If the text does not end
	    with a mark from ``splits``, a "。" is appended so the final sentence is
	    terminated as well.

	    Args:
	        todo_text: the text to split.

	    Returns:
	        A list of sentences, each including its trailing mark. Empty input
	        yields an empty list (it previously raised IndexError).
	    """
	    todo_text = todo_text.replace("……", "。").replace("——", "，")
	    if not todo_text:
	        return []
	    if todo_text[-1] not in splits:
	        todo_text += "。"

	    sentences = []
	    start = 0
	    for pos, char in enumerate(todo_text):
	        if char in splits:
	            # Cut right after the mark; the text always ends with a mark, so nothing is left over.
	            sentences.append(todo_text[start:pos + 1])
	            start = pos + 1
	    return sentences

	def cut_sentence_multilang(text, max_length=30):
	    """Cut ``text`` in two at the point where its word count first exceeds ``max_length``.

	    Counting rules: every non-ASCII, non-whitespace character counts as one
	    word; a run of ASCII characters counts as one word. Only whitespace ends
	    an ASCII run, so ASCII characters that follow a non-ASCII character
	    without whitespace in between continue the previous ASCII run.

	    Returns:
	        ``(head, tail)`` where ``tail`` starts at the character that pushed the
	        count above ``max_length``; ``(text, "")`` if the limit is never exceeded.
	    """
	    units = 0
	    inside_ascii_word = False

	    for pos, ch in enumerate(text):
	        if ch.isspace():
	            # Whitespace closes the current ASCII word.
	            inside_ascii_word = False
	        elif not ch.isascii():
	            # Each non-ASCII character is its own word.
	            units += 1
	        elif not inside_ascii_word:
	            # First character of a new ASCII word.
	            inside_ascii_word = True
	            units += 1

	        if units > max_length:
	            return text[:pos], text[pos:]

	    return text, ""

	# contributed by XTer
	# Plain length-based splitting, so that no over-long sentence is left.
	def split_long_sentence(text, max_length=510):
	    """Break every line of ``text`` into pieces of at most ``max_length`` words.

	    Each input line is repeatedly cut with ``cut_sentence_multilang``; the
	    pieces are joined with newlines, so existing line breaks (including empty
	    lines) are kept. A remainder that consists only of whitespace is dropped.

	    Args:
	        text: the text to split; may contain newlines.
	        max_length: maximum number of words per piece, counted as in
	            ``cut_sentence_multilang``.

	    Returns:
	        The pieces joined by ``"\\n"``.

	    Raises:
	        ValueError: if ``max_length`` is less than 1. With such a limit the
	            cutter returns an empty head every time and the loop never ends.
	    """
	    if max_length < 1:
	        raise ValueError(f"max_length must be at least 1, got {max_length}")

	    opts = []
	    for sentence in text.split("\n"):
	        prev_text, sentence = cut_sentence_multilang(sentence, max_length)
	        # Keep cutting while there is non-blank text left over.
	        while sentence.strip() != "":
	            opts.append(prev_text)
	            prev_text, sentence = cut_sentence_multilang(sentence, max_length)
	        opts.append(prev_text)
	    return "\n".join(opts)

	# No cutting.
	@register_method("cut0")
	def cut0(inp):
	    """Return ``inp`` unchanged (registered as method "cut0")."""
	    return inp


	# Group sentences four at a time.
	@register_method("cut1")
	def cut1(inp):
	    """Join the sentences of ``inp`` into newline-separated groups of four.

	    Sentences come from ``split``. Group boundaries are placed every four
	    sentences, but the last boundary is replaced by an open end, so the final
	    group absorbs everything after the previous boundary (for example, eight
	    sentences produce a single group). When that leaves only one group, the
	    stripped input itself is returned rather than the re-joined sentences.

	    Args:
	        inp: the text to cut; leading/trailing newlines are removed first.

	    Returns:
	        The groups joined by ``"\\n"``. Empty input returns ``""`` (it
	        previously raised IndexError).
	    """
	    inp = inp.strip("\n")
	    if not inp:
	        return inp

	    inps = split(inp)
	    split_idx = list(range(0, len(inps), 4))
	    # Open-ended final slice: the last group runs to the end of the list.
	    split_idx[-1] = None
	    if len(split_idx) > 1:
	        opts = [
	            "".join(inps[start:stop])
	            for start, stop in zip(split_idx, split_idx[1:])
	        ]
	    else:
	        opts = [inp]
	    return "\n".join(opts)


	# Cut roughly every 50 characters.
	@register_method("cut2")
	def cut2(inp, max_length=50):
	    """Group sentences into newline-separated segments of just over ``max_length`` characters.

	    The text first goes through ``split_long_sentence`` (with its default
	    limit) and is then split into sentences with ``split``. Sentences are
	    accumulated until the running length exceeds ``max_length``, at which
	    point the segment is closed. A final segment shorter than ``max_length``
	    is merged into the one before it.

	    Args:
	        inp: the text to cut.
	        max_length: character threshold for closing a segment and for
	            deciding whether the last segment is too short to stand alone
	            (the latter was previously hard-coded to 50).

	    Returns:
	        The segments joined by ``"\\n"``; the stripped input itself when it
	        is empty or contains fewer than two sentences.
	    """
	    inp = split_long_sentence(inp).strip("\n")
	    if not inp:
	        # split() cannot handle empty text; there is nothing to cut anyway.
	        return inp

	    inps = split(inp)
	    if len(inps) < 2:
	        return inp

	    opts = []
	    summ = 0
	    tmp_str = ""
	    for piece in inps:
	        summ += len(piece)
	        tmp_str += piece
	        if summ > max_length:
	            opts.append(tmp_str)
	            summ = 0
	            tmp_str = ""
	    if tmp_str:
	        opts.append(tmp_str)

	    # If the last segment is too short, merge it into the previous one.
	    if len(opts) > 1 and len(opts[-1]) < max_length:
	        opts[-2] += opts[-1]
	        opts.pop()
	    return "\n".join(opts)


	# Cut at the Chinese full stop "。".
	@register_method("cut3")
	def cut3(inp):
	    """Put each "。"-delimited sentence of ``inp`` on its own line.

	    The text is first passed through ``split_long_sentence`` and stripped of
	    surrounding newlines and "。" marks; the "。" separators themselves are
	    not kept in the output.
	    """
	    prepared = split_long_sentence(inp).strip("\n")
	    sentences = prepared.strip("。").split("。")
	    return "\n".join(sentences)


	# Cut at the English full stop ".".
	@register_method("cut4")
	def cut4(inp):
	    """Put each "."-delimited sentence of ``inp`` on its own line.

	    Surrounding newlines and "." marks are stripped first, and the "."
	    separators are not kept in the output. A "." that sits between two digits
	    (e.g. "3.14") is treated as a decimal point and is not a cut position;
	    previously such numbers were split in two.
	    """
	    inp = inp.strip("\n")
	    # Split on "." unless it has a digit on both sides.
	    return "\n".join(re.split(r"(?<!\d)\.|\.(?!\d)", inp.strip(".")))

	# Cut at punctuation marks.
	# contributed by https://github.com/AI-Hobbyist/GPT-SoVITS/blob/main/GPT_SoVITS/inference_webui.py
	@register_method("cut5")
	def cut5(inp):
	    """Put every punctuation-terminated fragment of ``inp`` on its own line.

	    The text is split after each mark in ``punds`` (the mark stays attached
	    to the fragment before it). Text after the last mark, or the whole text
	    if it contains no mark, is kept as a final fragment.

	    Returns:
	        The fragments joined by ``"\\n"``. No empty trailing fragment is
	        emitted when the text already ends with a mark.
	    """
	    inp = inp.strip("\n")
	    # The original class listed the half-width ";" twice; the second one is
	    # now the full-width "；" so both semicolon forms are cut positions.
	    punds = r'[,.;?!、，。？！；：…]'
	    items = re.split(f'({punds})', inp)
	    mergeitems = ["".join(group) for group in zip(items[::2], items[1::2])]
	    # With one capture group re.split always returns an odd number of items;
	    # the last one is the text after the final mark and may be empty.
	    if items[-1]:
	        mergeitems.append(items[-1])
	    return "\n".join(mergeitems)

	def count_words_multilang(text):
	    """Count the words in mixed-language ``text``.

	    Every non-ASCII, non-whitespace character counts as one word; a run of
	    ASCII characters counts as one word. Only whitespace ends an ASCII run,
	    so ASCII characters that follow a non-ASCII character without whitespace
	    in between continue the previous ASCII run.
	    """
	    total = 0
	    inside_ascii_word = False

	    for ch in text:
	        if ch.isspace():
	            # Whitespace closes the current ASCII word.
	            inside_ascii_word = False
	            continue
	        if not ch.isascii():
	            # Each non-ASCII character is its own word.
	            total += 1
	            continue
	        if not inside_ascii_word:
	            # First character of a new ASCII word.
	            inside_ascii_word = True
	            total += 1

	    return total


	def _process_commas(text, max_length):
	    """Pack the clauses of one sentence into lines of at most ``max_length`` words.

	    ``text`` is split at clause separators (kept as separate items), any item
	    longer than ``max_length`` words is broken up with
	    ``split_long_sentence``, and the items are then appended greedily to the
	    current line until the next one would exceed ``max_length``.

	    Returns:
	        The packed lines joined by ``"\\n"``.
	    """
	    separators = ['，', ',', '、', '——', '…']
	    # The capture group makes re.split return the separators as items too.
	    # Alternatives are joined with a plain "|"; the former "\|" produced a
	    # pattern of literal pipes, so no separator ever matched.
	    regex_pattern = '(' + '|'.join(map(re.escape, separators)) + ')'

	    clauses = []
	    for clause in re.split(regex_pattern, text):
	        if count_words_multilang(clause) > max_length:
	            clauses += split_long_sentence(clause, max_length=max_length).split("\n")
	        else:
	            clauses.append(clause)

	    processed_text = ""
	    current_line = ""
	    for clause in clauses:
	        if count_words_multilang(current_line + clause) <= max_length:
	            # The clause still fits on the current line.
	            current_line += clause
	        else:
	            # Close the current line and start a new one with this clause.
	            processed_text += current_line.strip() + '\n'
	            current_line = clause + " "

	    # Add whatever is left in the current line.
	    processed_text += current_line.strip()
	    return processed_text


	# contributed by https://github.com/X-T-E-R/GPT-SoVITS-Inference/blob/main/GPT_SoVITS/TTS_infer_pack/text_segmentation_method.py
	@register_method("auto_cut")
	def auto_cut(inp, max_length=30):
	    """Cut ``inp`` into newline-separated lines of at most ``max_length`` words.

	    Steps: strip surrounding newlines, turn ". " into "。", remove quotes and
	    brackets, make sure the text ends with a sentence mark, split into
	    sentences at the marks in ``split_punds``, pack each sentence's clauses
	    with ``_process_commas``, and finally drop blank lines and lines that
	    consist of a single punctuation mark.

	    Args:
	        inp: the text to cut.
	        max_length: maximum words per line, counted by ``count_words_multilang``.

	    Returns:
	        The lines joined by ``"\\n"``; ``""`` when nothing is left after
	        cleaning (this previously raised IndexError).
	    """
	    inp = inp.strip("\n")
	    inp = inp.replace(". ", "。")
	    erase_punds = r'[“”"‘’\'（）()【】[\]{}<>《》〈〉〔〕〖〗〘〙〚〛〛〞〟]'
	    inp = re.sub(erase_punds, '', inp)
	    if not inp:
	        return ""

	    split_punds = r'[?!。？！~：]'
	    # Substring test against the pattern text itself; its "[" and "]" cannot
	    # occur in inp any more because they were erased above.
	    if inp[-1] not in split_punds:
	        inp += "。"
	    items = re.split(f'({split_punds})', inp)
	    # Re-attach every sentence mark to the sentence before it.
	    items = ["".join(group) for group in zip(items[::2], items[1::2])]

	    final_items = []
	    for item in items:
	        final_items += _process_commas(item, max_length=max_length).split("\n")

	    final_items = [
	        item for item in final_items
	        if item.strip() and not (len(item.strip()) == 1 and item.strip() in "?!，,。？！~：")
	    ]

	    return "\n".join(final_items)


	if __name__ == '__main__':
	    # Small manual check: print the word count of a mixed-language sample
	    # and the result of cutting it at 20 words.
	    sample = """我有i一个j k 1"""
	    total_words = count_words_multilang(sample)
	    print(total_words)
	    head_and_tail = cut_sentence_multilang(sample, 20)
	    print(head_and_tail)