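# Residue of the file-hosting web page this module was copied from (not code):
# uploader "Artrajz", commit "init" (960cd20), views "raw" / "history" / "blame", size 7.65 kB.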
import logging
import regex as re
from utils.data_utils import check_is_none
from utils.classify_language import classify_language, split_alpha_nonalpha
def _expand_abbreviations(text):
pattern = r'(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z])'
return re.sub(pattern, ' ', text)
def _expand_hyphens(text):
pattern = r'(?<=[a-zA-Z])-(?=[a-zA-Z])'
expanded_text = re.sub(pattern, ' ', text)
return expanded_text
def markup_language(text: str, target_languages: list = None) -> str:
    """Wrap each run of same-language text in ``[LANG]`` tags.

    The text is cut at punctuation, every piece is further divided by
    ``split_alpha_nonalpha`` and classified with ``classify_language``.  An
    opening tag is inserted before the first classified piece, a closing tag
    plus a new opening tag is inserted wherever the detected language
    changes, and a closing tag is appended at the very end.  Tags are the
    upper-cased language code in square brackets.

    Args:
        text: Text to annotate.
        target_languages: Candidate languages, forwarded unchanged to
            ``classify_language``.

    Returns:
        The annotated text.  When no piece can be classified (empty or
        punctuation-only input) the text is returned unchanged rather than
        with an empty ``[]`` tag appended.

    Raises:
        ValueError: If a piece produced by the splitters cannot be found in
            ``text`` at or after the current position.
    """
    # One character class of every punctuation mark that separates pieces.
    # The fullwidth/halfwidth forms are written as \u escapes so they cannot
    # be silently folded to their ASCII look-alikes (which would also
    # introduce a bare quote that terminates the literal).
    pattern = (
        # ASCII punctuation.
        r'[\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\>\=\?\@\[\]\{\}\\\\\^\_\`'
        r'''\!?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」'''
        # Fullwidth and halfwidth punctuation (letters and digits excluded).
        r'\uFF01-\uFF0F\uFF1A-\uFF20\uFF3B-\uFF40\uFF5B-\uFF64'
        # CJK brackets, dashes, quotes and ellipsis.
        r'『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘\'\‛\“\”\„\‟…‧﹏.]+'
    )

    pieces = []
    for chunk in re.split(pattern, text):
        pieces.extend(split_alpha_nonalpha(chunk))

    pre_lang = ""
    p = 0  # offset in ``text`` up to which pieces have already been handled
    for sentence in pieces:
        if check_is_none(sentence):
            continue

        lang = classify_language(sentence, target_languages)
        if pre_lang == "":
            tag = f"[{lang.upper()}]"
        elif pre_lang != lang:
            # Close the previous language run, then open the new one.
            tag = f"[{pre_lang.upper()}][{lang.upper()}]"
        else:
            tag = ""

        # Search from ``p`` so an identical earlier piece is never matched.
        pos = text.index(sentence, p)
        text = text[:pos] + tag + text[pos:]
        p = pos + len(tag) + len(sentence)
        pre_lang = lang

    if pre_lang == "":
        # Nothing was classified, so there is no run to close.
        return text
    return text + f"[{pre_lang.upper()}]"
def _append_language_segment(sentences_list, segment, lang, segment_size,
                             expand_abbreviations, expand_hyphens):
    """Append one language run to ``sentences_list``, expanding and re-splitting it as requested."""
    if lang == "en":
        if expand_abbreviations:
            segment = _expand_abbreviations(segment)
        if expand_hyphens:
            segment = _expand_hyphens(segment)

    if len(segment) >= segment_size:
        sentences_list.extend(
            (part, lang) for part in sentence_split(segment, segment_size))
    else:
        sentences_list.append((segment, lang))


def split_languages(text: str, target_languages: list = None, segment_size: int = 50,
                    expand_abbreviations: bool = False, expand_hyphens: bool = False) -> list:
    """Split ``text`` into consecutive runs of a single detected language.

    The text is cut at punctuation, every piece is further divided by
    ``split_alpha_nonalpha`` and classified with ``classify_language``.
    Adjacent pieces with the same language are merged (together with the
    punctuation between them) into one run.  A run whose length reaches
    ``segment_size`` is broken up again with ``sentence_split``.

    Args:
        text: Text to split.
        target_languages: Candidate languages, forwarded unchanged to
            ``classify_language``.
        segment_size: Runs at least this long are re-split with
            ``sentence_split``.
        expand_abbreviations: For runs classified as ``"en"``, insert spaces
            at letter/uppercase boundaries.
        expand_hyphens: For runs classified as ``"en"``, replace hyphens
            between letters with spaces.

    Returns:
        A list of ``(segment, language)`` tuples in text order.  If no piece
        can be classified, the language of the returned segment(s) is the
        empty string.

    Raises:
        ValueError: If a piece produced by the splitters cannot be found in
            ``text`` at or after the current position.
    """
    # Same punctuation class as ``markup_language``; fullwidth/halfwidth
    # forms are \u escapes so they cannot be folded into ASCII look-alikes.
    pattern = (
        # ASCII punctuation.
        r'[\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\>\=\?\@\[\]\{\}\\\\\^\_\`'
        r'''\!?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」'''
        # Fullwidth and halfwidth punctuation (letters and digits excluded).
        r'\uFF01-\uFF0F\uFF1A-\uFF20\uFF3B-\uFF40\uFF5B-\uFF64'
        # CJK brackets, dashes, quotes and ellipsis.
        r'『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘\'\‛\“\”\„\‟…‧﹏.]+'
    )

    pieces = []
    for chunk in re.split(pattern, text):
        pieces.extend(split_alpha_nonalpha(chunk))

    sentences_list = []
    pre_lang = ""
    start = 0  # where the current language run begins
    end = 0    # end of the last piece handled so far
    for sentence in pieces:
        if check_is_none(sentence):
            continue

        lang = classify_language(sentence, target_languages)

        # Move ``end`` to the start of this piece (skipping punctuation).
        end = text.index(sentence, end)
        if pre_lang != "" and pre_lang != lang:
            # Language changed: flush the run that ends right before this piece.
            _append_language_segment(sentences_list, text[start:end], pre_lang,
                                     segment_size, expand_abbreviations, expand_hyphens)
            start = end
        end += len(sentence)
        pre_lang = lang

    # Flush the final run, including any trailing punctuation.
    _append_language_segment(sentences_list, text[start:], pre_lang,
                             segment_size, expand_abbreviations, expand_hyphens)
    return sentences_list
def sentence_split(text: str, segment_size: int) -> list:
    """Cut ``text`` into segments of roughly ``segment_size`` characters.

    The text is first divided into paragraphs at line breaks.  Inside a
    paragraph, a cut is made after a run of delimiter punctuation as soon as
    at least ``segment_size`` characters have accumulated since the previous
    cut, so segments always end on punctuation.  Both ASCII and fullwidth
    forms of the sentence punctuation are recognised.

    Args:
        text: Text to split.
        segment_size: Minimum number of characters a segment must reach
            before it is cut at the next delimiter.

    Returns:
        The segments in order.  Cut segments are stripped of surrounding
        whitespace.  Text left over after the last cut of a paragraph is
        added as its own segment, unless it is at most 4 characters long and
        a previous segment exists, in which case it is appended to that
        segment.  Empty paragraphs contribute nothing.
    """
    paragraphs = re.split(r'\r\n|\n', text)
    # ASCII / CJK delimiters followed by the fullwidth forms of ! ( ) , : ; ?
    # (written as escapes so they cannot be folded into their ASCII twins).
    delimiter = re.compile(
        r'[!(),—+\-.:;?。、'
        r'\uFF01\uFF08\uFF09\uFF0C\uFF1A\uFF1B\uFF1F]+'
    )
    sentences_list = []

    for paragraph in paragraphs:
        p = 0  # start of the segment currently being accumulated
        for match in delimiter.finditer(paragraph):
            # Cut only once enough characters have piled up since ``p``.
            if match.end() - p >= segment_size:
                sentences_list.append(paragraph[p:match.end()].strip())
                p = match.end()

        remainder = paragraph[p:]
        if remainder:
            if len(remainder) <= 4 and sentences_list:
                # Too short to stand alone: glue it to the previous segment.
                sentences_list[-1] += remainder
            else:
                sentences_list.append(remainder)

    return sentences_list
def sentence_split_reading(text: str) -> list:
    """Separate quoted speech from the surrounding narration.

    The text is scanned for spans enclosed in curly double quotes.  A span
    counts as a quote only when the character just before its closing quote
    is one of the listed punctuation marks; such spans are returned without
    their quote marks.  Everything else, including quoted spans that fail
    that test, is narration, and consecutive narration chunks are merged.

    Returns:
        A list of ``(segment, is_quote)`` tuples in text order.
    """
    closing_marks = "!!?。,;……?!.,;"
    result = []
    for chunk in re.findall(r'“[^“”]*”|[^“”]+', text):
        if not chunk:
            continue

        quoted = chunk.startswith("“") and chunk.endswith("”") and chunk[-2] in closing_marks
        if quoted:
            result.append((chunk.strip("“”"), quoted))
        elif result and not result[-1][1]:
            # Previous entry is narration too: extend it instead of adding a new one.
            previous_text, previous_flag = result[-1]
            result[-1] = (previous_text + chunk, previous_flag)
        else:
            result.append((chunk, quoted))
    return result
def sentence_split_and_markup(text, segment_size=50, lang="auto", speaker_lang=None):
    """Split ``text`` into segments and wrap each one in language tags.

    Args:
        text: Text to process.
        segment_size: Passed to ``sentence_split``; when it is ``<= 0`` the
            text is not split and is tagged as a single segment.
        lang: ``"auto"`` tags each segment with ``markup_language``,
            ``"mix"`` returns the text untouched as a single element, and any
            other value wraps each segment in ``[LANG]...[LANG]`` using the
            upper-cased value.  Matching is case-insensitive.
        speaker_lang: Languages supported by the speaker; forwarded to
            ``markup_language`` in auto mode.

    Returns:
        The list of tagged segments.  Each one is also logged at debug level.
    """
    # If the speaker supports exactly one language, that language is used.
    # NOTE(review): the source had lost its indentation; the override below is
    # assumed to apply whenever the speaker has a single language, not only
    # when the debug message is emitted - confirm against the original file.
    if speaker_lang is not None and len(speaker_lang) == 1:
        explicitly_requested = lang.upper() not in ["AUTO", "MIX"]
        if explicitly_requested and lang.lower() != speaker_lang[0]:
            logging.debug(
                f"lang \"{lang}\" is not in speaker_lang {speaker_lang},automatically set lang={speaker_lang[0]}")
        lang = speaker_lang[0]

    def _tag(piece):
        # Auto mode detects languages per piece; otherwise one fixed tag is used.
        if lang.upper() == "AUTO":
            return markup_language(piece, speaker_lang)
        return f"[{lang.upper()}]{piece}[{lang.upper()}]"

    sentences_list = []
    if lang.upper() == "MIX":
        sentences_list.append(text)
    elif segment_size <= 0:
        sentences_list.append(_tag(text))
    else:
        for piece in sentence_split(text, segment_size):
            if check_is_none(piece):
                continue
            sentences_list.append(_tag(piece))

    for entry in sentences_list:
        logging.debug(entry)

    return sentences_list
if __name__ == '__main__':
    # Manual smoke test: split a two-paragraph Chinese sample and print the segments.
    sample_text = """这几天心里颇不宁静。
今晚在院子里坐着乘凉,忽然想起日日走过的荷塘,在这满月的光里,总该另有一番样子吧。月亮渐渐地升高了,墙外马路上孩子们的欢笑,已经听不见了;妻在屋里拍着闰儿,迷迷糊糊地哼着眠歌。我悄悄地披了大衫,带上门出去。"""
    print(sentence_split(sample_text, segment_size=50))

    # Other entry points that can be tried by hand in the same way:
    #   print(markup_language(sample_text, target_languages=None))
    #   print(sentence_split_and_markup(sample_text, segment_size=50, lang="auto", speaker_lang=None))
    #   mixed = "你好hello,这是一段用来测试vits自动标注的文本。こんにちは,これは自動ラベリングのテスト用テキストです.Hello, this is a piece of text to test autotagging."
    #   print(split_languages(mixed, ["zh", "ja", "en"]))