import tiktoken


def normalize_text(text: str) -> str:
    """Normalize text for TTS processing."""
    if not text:
        return ""
    # Basic normalization - can be expanded based on needs
    # (e.g. collapsing repeated whitespace or expanding abbreviations).
    return text.strip()
def chunk_text(text: str, max_chars: int = 300) -> list[str]:
    """Break text into chunks of at most max_chars at natural boundaries."""
    chunks: list[str] = []
    current_chunk = ""

    # Split on sentence-ending punctuation first, keeping the punctuation
    # attached to its sentence.
    sentences = (
        text.replace(".", ".|")
        .replace("!", "!|")
        .replace("?", "?|")
        .replace(";", ";|")
        .split("|")
    )

    for sentence in sentences:
        if not sentence.strip():
            continue

        if len(sentence) > max_chars:
            # Sentence is too long on its own: fall back to comma boundaries.
            parts = sentence.split(",")
            for i, part in enumerate(parts):
                # Re-add the comma removed by split(), except after the last part.
                suffix = "," if i < len(parts) - 1 else ""
                if len(current_chunk) + len(part) <= max_chars:
                    current_chunk += part + suffix
                elif len(part) > max_chars:
                    # Part is still too long: fall back to whitespace.
                    for word in part.split():
                        if len(current_chunk) + len(word) > max_chars:
                            if current_chunk.strip():
                                chunks.append(current_chunk.strip())
                            current_chunk = word + " "
                        else:
                            current_chunk += word + " "
                else:
                    if current_chunk.strip():
                        chunks.append(current_chunk.strip())
                    current_chunk = part + suffix
        else:
            # Sentence fits: append it to the current chunk if there is room.
            if len(current_chunk) + len(sentence) <= max_chars:
                current_chunk += sentence
            else:
                if current_chunk.strip():
                    chunks.append(current_chunk.strip())
                current_chunk = sentence

    if current_chunk.strip():
        chunks.append(current_chunk.strip())
    return chunks
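
# Illustrative behavior (the input and max_chars below are an example, not
# from the original app):
#   chunk_text("First sentence. Second one! A rather long clause, with commas.",
#              max_chars=30)
#   -> ["First sentence. Second one!", "A rather long clause,", "with commas."]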
def count_tokens(text: str) -> int:
    """Count tokens in text using tiktoken's cl100k_base encoding."""
    enc = tiktoken.get_encoding("cl100k_base")
    return len(enc.encode(text))
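

# A minimal sketch of how these helpers compose (assumed usage; the sample
# text and max_chars value are illustrative, not from the original app):
if __name__ == "__main__":
    sample = "  Hello there! This is a fairly long sentence, stuffed with clauses, meant to force the chunker past its limit.  "
    for i, chunk in enumerate(chunk_text(normalize_text(sample), max_chars=60)):
        print(f"chunk {i}: {count_tokens(chunk)} tokens -> {chunk!r}")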