import tiktoken


def normalize_text(text: str) -> str:
    """Normalize text for TTS processing."""
    if not text:
        return ""
    # Basic normalization - can be expanded based on needs
    return text.strip()
def chunk_text(text: str, max_chars: int = 300) -> list[str]:
    """Break text into chunks at natural boundaries."""
    chunks = []
    current_chunk = ""

    # Split on sentence boundaries first, keeping the punctuation attached.
    # This is a simple heuristic: it also splits on periods inside numbers
    # and abbreviations.
    sentences = (
        text.replace(".", ".|")
        .replace("!", "!|")
        .replace("?", "?|")
        .replace(";", ";|")
        .split("|")
    )

    for sentence in sentences:
        if not sentence.strip():
            continue

        # If the sentence is already too long, break on commas
        if len(sentence) > max_chars:
            parts = sentence.split(",")
            for i, part in enumerate(parts):
                # Restore the comma that split() removed, except after the last part
                separator = "," if i < len(parts) - 1 else ""
                if len(current_chunk) + len(part) <= max_chars:
                    current_chunk += part + separator
                elif len(part) > max_chars:
                    # If the part is still too long, break on whitespace
                    for word in part.split():
                        if len(current_chunk) + len(word) > max_chars:
                            if current_chunk.strip():
                                chunks.append(current_chunk.strip())
                            current_chunk = word + " "
                        else:
                            current_chunk += word + " "
                else:
                    if current_chunk.strip():
                        chunks.append(current_chunk.strip())
                    current_chunk = part + separator
        elif len(current_chunk) + len(sentence) <= max_chars:
            current_chunk += sentence
        else:
            if current_chunk.strip():
                chunks.append(current_chunk.strip())
            current_chunk = sentence

    if current_chunk.strip():
        chunks.append(current_chunk.strip())

    return chunks
def count_tokens(text: str) -> int:
    """Count tokens in text using tiktoken."""
    enc = tiktoken.get_encoding("cl100k_base")
    return len(enc.encode(text))
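

if __name__ == "__main__":
    # Usage sketch with made-up sample text: normalize, chunk with a small
    # max_chars so the splitting is visible, then report per-chunk token counts.
    sample = (
        "  Text-to-speech models work best on short inputs. Long passages are "
        "split at sentence boundaries; very long sentences fall back to commas, "
        "and then to single words. Does the chunker respect that order? Yes!  "
    )
    for chunk in chunk_text(normalize_text(sample), max_chars=80):
        print(f"{count_tokens(chunk):>3} tokens | {chunk!r}")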