wl-dub

Running

App Files Files Community

wl-dub / soni_translate /speech_segmentation.py

r3gm

v0.5.0

b152010 7 months ago

raw

history blame

15.4 kB

	from whisperx.alignment import (
	DEFAULT_ALIGN_MODELS_TORCH as DAMT,
	DEFAULT_ALIGN_MODELS_HF as DAMHF,
	)
	from whisperx.utils import TO_LANGUAGE_CODE
	import whisperx
	import torch
	import gc
	import os
	import soundfile as sf
	from IPython.utils import capture # noqa
	from .language_configuration import EXTRA_ALIGN, INVERTED_LANGUAGES
	from .logging_setup import logger
	from .postprocessor import sanitize_file_name
	from .utils import remove_directory_contents, run_command

	# ZERO GPU CONFIG
	import spaces
	import copy
	import random
	import time

	def random_sleep():
	if os.environ.get("ZERO_GPU") == "TRUE":
	print("Random sleep")
	sleep_time = round(random.uniform(7.2, 9.9), 1)
	time.sleep(sleep_time)


	@spaces.GPU(duration=120)
	def load_and_transcribe_audio(asr_model, audio, compute_type, language, asr_options, batch_size, segment_duration_limit):
	# Load model
	model = whisperx.load_model(
	asr_model,
	os.environ.get("SONITR_DEVICE") if os.environ.get("ZERO_GPU") != "TRUE" else "cuda",
	compute_type=compute_type,
	language=language,
	asr_options=asr_options,
	)

	# Transcribe audio
	result = model.transcribe(
	audio,
	batch_size=batch_size,
	chunk_size=segment_duration_limit,
	print_progress=True,
	)

	del model
	gc.collect()
	torch.cuda.empty_cache() # noqa

	return result

	def load_align_and_align_segments(result, audio, DAMHF):

	# Load alignment model
	model_a, metadata = whisperx.load_align_model(
	language_code=result["language"],
	device=os.environ.get("SONITR_DEVICE") if os.environ.get("ZERO_GPU") != "TRUE" else "cuda",
	model_name=None
	if result["language"] in DAMHF.keys()
	else EXTRA_ALIGN[result["language"]],
	)

	# Align segments
	alignment_result = whisperx.align(
	result["segments"],
	model_a,
	metadata,
	audio,
	os.environ.get("SONITR_DEVICE") if os.environ.get("ZERO_GPU") != "TRUE" else "cuda",
	return_char_alignments=True,
	print_progress=False,
	)

	# Clean up
	del model_a
	gc.collect()
	torch.cuda.empty_cache() # noqa

	return alignment_result

	@spaces.GPU(duration=120)
	def diarize_audio(diarize_model, audio_wav, min_speakers, max_speakers):

	if os.environ.get("ZERO_GPU") == "TRUE":
	diarize_model.model.to(torch.device("cuda"))
	diarize_segments = diarize_model(
	audio_wav,
	min_speakers=min_speakers,
	max_speakers=max_speakers
	)
	return diarize_segments

	# ZERO GPU CONFIG

	ASR_MODEL_OPTIONS = [
	"tiny",
	"base",
	"small",
	"medium",
	"large",
	"large-v1",
	"large-v2",
	"large-v3",
	"distil-large-v2",
	"Systran/faster-distil-whisper-large-v3",
	"tiny.en",
	"base.en",
	"small.en",
	"medium.en",
	"distil-small.en",
	"distil-medium.en",
	"OpenAI_API_Whisper",
	]

	COMPUTE_TYPE_GPU = [
	"default",
	"auto",
	"int8",
	"int8_float32",
	"int8_float16",
	"int8_bfloat16",
	"float16",
	"bfloat16",
	"float32"
	]

	COMPUTE_TYPE_CPU = [
	"default",
	"auto",
	"int8",
	"int8_float32",
	"int16",
	"float32",
	]

	WHISPER_MODELS_PATH = './WHISPER_MODELS'


	def openai_api_whisper(
	input_audio_file,
	source_lang=None,
	chunk_duration=1800
	):

	info = sf.info(input_audio_file)
	duration = info.duration

	output_directory = "./whisper_api_audio_parts"
	os.makedirs(output_directory, exist_ok=True)
	remove_directory_contents(output_directory)

	if duration > chunk_duration:
	# Split the audio file into smaller chunks with 30-minute duration
	cm = f'ffmpeg -i "{input_audio_file}" -f segment -segment_time {chunk_duration} -c:a libvorbis "{output_directory}/output%03d.ogg"'
	run_command(cm)
	# Get list of generated chunk files
	chunk_files = sorted(
	[f"{output_directory}/{f}" for f in os.listdir(output_directory) if f.endswith('.ogg')]
	)
	else:
	one_file = f"{output_directory}/output000.ogg"
	cm = f'ffmpeg -i "{input_audio_file}" -c:a libvorbis {one_file}'
	run_command(cm)
	chunk_files = [one_file]

	# Transcript
	segments = []
	language = source_lang if source_lang else None
	for i, chunk in enumerate(chunk_files):
	from openai import OpenAI
	client = OpenAI()

	audio_file = open(chunk, "rb")
	transcription = client.audio.transcriptions.create(
	model="whisper-1",
	file=audio_file,
	language=language,
	response_format="verbose_json",
	timestamp_granularities=["segment"],
	)

	try:
	transcript_dict = transcription.model_dump()
	except: # noqa
	transcript_dict = transcription.to_dict()

	if language is None:
	logger.info(f'Language detected: {transcript_dict["language"]}')
	language = TO_LANGUAGE_CODE[transcript_dict["language"]]

	chunk_time = chunk_duration * (i)

	for seg in transcript_dict["segments"]:

	if "start" in seg.keys():
	segments.append(
	{
	"text": seg["text"],
	"start": seg["start"] + chunk_time,
	"end": seg["end"] + chunk_time,
	}
	)

	audio = whisperx.load_audio(input_audio_file)
	result = {"segments": segments, "language": language}

	return audio, result


	def find_whisper_models():
	path = WHISPER_MODELS_PATH
	folders = []

	if os.path.exists(path):
	for folder in os.listdir(path):
	folder_path = os.path.join(path, folder)
	if (
	os.path.isdir(folder_path)
	and 'model.bin' in os.listdir(folder_path)
	):
	folders.append(folder)
	return folders

	def transcribe_speech(
	audio_wav,
	asr_model,
	compute_type,
	batch_size,
	SOURCE_LANGUAGE,
	literalize_numbers=True,
	segment_duration_limit=15,
	):
	"""
	Transcribe speech using a whisper model.

	Parameters:
	- audio_wav (str): Path to the audio file in WAV format.
	- asr_model (str): The whisper model to be loaded.
	- compute_type (str): Type of compute to be used (e.g., 'int8', 'float16').
	- batch_size (int): Batch size for transcription.
	- SOURCE_LANGUAGE (str): Source language for transcription.

	Returns:
	- Tuple containing:
	- audio: Loaded audio file.
	- result: Transcription result as a dictionary.
	"""

	if asr_model == "OpenAI_API_Whisper":
	if literalize_numbers:
	logger.info(
	"OpenAI's API Whisper does not support "
	"the literalization of numbers."
	)
	return openai_api_whisper(audio_wav, SOURCE_LANGUAGE)

	# https://github.com/openai/whisper/discussions/277
	prompt = "以下是普通话的句子。" if SOURCE_LANGUAGE == "zh" else None
	SOURCE_LANGUAGE = (
	SOURCE_LANGUAGE if SOURCE_LANGUAGE != "zh-TW" else "zh"
	)
	asr_options = {
	"initial_prompt": prompt,
	"suppress_numerals": literalize_numbers
	}

	if asr_model not in ASR_MODEL_OPTIONS:

	base_dir = WHISPER_MODELS_PATH
	if not os.path.exists(base_dir):
	os.makedirs(base_dir)
	model_dir = os.path.join(base_dir, sanitize_file_name(asr_model))

	if not os.path.exists(model_dir):
	from ctranslate2.converters import TransformersConverter

	quantization = "float32"
	# Download new model
	try:
	converter = TransformersConverter(
	asr_model,
	low_cpu_mem_usage=True,
	copy_files=[
	"tokenizer_config.json", "preprocessor_config.json"
	]
	)
	converter.convert(
	model_dir,
	quantization=quantization,
	force=False
	)
	except Exception as error:
	if "File tokenizer_config.json does not exist" in str(error):
	converter._copy_files = [
	"tokenizer.json", "preprocessor_config.json"
	]
	converter.convert(
	model_dir,
	quantization=quantization,
	force=True
	)
	else:
	raise error

	asr_model = model_dir
	logger.info(f"ASR Model: {str(model_dir)}")

	audio = whisperx.load_audio(audio_wav)

	result = load_and_transcribe_audio(
	asr_model, audio, compute_type, SOURCE_LANGUAGE, asr_options, batch_size, segment_duration_limit
	)

	if result["language"] == "zh" and not prompt:
	result["language"] = "zh-TW"
	logger.info("Chinese - Traditional (zh-TW)")


	return audio, result


	def align_speech(audio, result):
	"""
	Aligns speech segments based on the provided audio and result metadata.

	Parameters:
	- audio (array): The audio data in a suitable format for alignment.
	- result (dict): Metadata containing information about the segments
	and language.

	Returns:
	- result (dict): Updated metadata after aligning the segments with
	the audio. This includes character-level alignments if
	'return_char_alignments' is set to True.

	Notes:
	- This function uses language-specific models to align speech segments.
	- It performs language compatibility checks and selects the
	appropriate alignment model.
	- Cleans up memory by releasing resources after alignment.
	"""
	DAMHF.update(DAMT) # lang align
	if (
	not result["language"] in DAMHF.keys()
	and not result["language"] in EXTRA_ALIGN.keys()
	):
	logger.warning(
	"Automatic detection: Source language not compatible with align"
	)
	raise ValueError(
	f"Detected language {result['language']} incompatible, "
	"you can select the source language to avoid this error."
	)
	if (
	result["language"] in EXTRA_ALIGN.keys()
	and EXTRA_ALIGN[result["language"]] == ""
	):
	lang_name = (
	INVERTED_LANGUAGES[result["language"]]
	if result["language"] in INVERTED_LANGUAGES.keys()
	else result["language"]
	)
	logger.warning(
	"No compatible wav2vec2 model found "
	f"for the language '{lang_name}', skipping alignment."
	)
	return result

	random_sleep()
	result = load_align_and_align_segments(result, audio, DAMHF)

	return result


	diarization_models = {
	"pyannote_3.1": "pyannote/speaker-diarization-3.1",
	"pyannote_2.1": "pyannote/speaker-diarization@2.1",
	"disable": "",
	}


	def reencode_speakers(result):

	if result["segments"][0]["speaker"] == "SPEAKER_00":
	return result

	speaker_mapping = {}
	counter = 0

	logger.debug("Reencode speakers")

	for segment in result["segments"]:
	old_speaker = segment["speaker"]
	if old_speaker not in speaker_mapping:
	speaker_mapping[old_speaker] = f"SPEAKER_{counter:02d}"
	counter += 1
	segment["speaker"] = speaker_mapping[old_speaker]

	return result


	def diarize_speech(
	audio_wav,
	result,
	min_speakers,
	max_speakers,
	YOUR_HF_TOKEN,
	model_name="pyannote/speaker-diarization@2.1",
	):
	"""
	Performs speaker diarization on speech segments.

	Parameters:
	- audio_wav (array): Audio data in WAV format to perform speaker
	diarization.
	- result (dict): Metadata containing information about speech segments
	and alignments.
	- min_speakers (int): Minimum number of speakers expected in the audio.
	- max_speakers (int): Maximum number of speakers expected in the audio.
	- YOUR_HF_TOKEN (str): Your Hugging Face API token for model
	authentication.
	- model_name (str): Name of the speaker diarization model to be used
	(default: "pyannote/speaker-diarization@2.1").

	Returns:
	- result_diarize (dict): Updated metadata after assigning speaker
	labels to segments.

	Notes:
	- This function utilizes a speaker diarization model to label speaker
	segments in the audio.
	- It assigns speakers to word-level segments based on diarization results.
	- Cleans up memory by releasing resources after diarization.
	- If only one speaker is specified, each segment is automatically assigned
	as the first speaker, eliminating the need for diarization inference.
	"""

	if max(min_speakers, max_speakers) > 1 and model_name:
	try:

	diarize_model = whisperx.DiarizationPipeline(
	model_name=model_name,
	use_auth_token=YOUR_HF_TOKEN,
	device=os.environ.get("SONITR_DEVICE"),
	)

	except Exception as error:
	error_str = str(error)
	gc.collect()
	torch.cuda.empty_cache() # noqa
	if "'NoneType' object has no attribute 'to'" in error_str:
	if model_name == diarization_models["pyannote_2.1"]:
	raise ValueError(
	"Accept the license agreement for using Pyannote 2.1."
	" You need to have an account on Hugging Face and "
	"accept the license to use the models: "
	"https://huggingface.co/pyannote/speaker-diarization "
	"and https://huggingface.co/pyannote/segmentation "
	"Get your KEY TOKEN here: "
	"https://hf.co/settings/tokens "
	)
	elif model_name == diarization_models["pyannote_3.1"]:
	raise ValueError(
	"New Licence Pyannote 3.1: You need to have an account"
	" on Hugging Face and accept the license to use the "
	"models: https://huggingface.co/pyannote/speaker-diarization-3.1 " # noqa
	"and https://huggingface.co/pyannote/segmentation-3.0 "
	)
	else:
	raise error

	random_sleep()
	diarize_segments = diarize_audio(diarize_model, audio_wav, min_speakers, max_speakers)

	result_diarize = whisperx.assign_word_speakers(
	diarize_segments, result
	)

	for segment in result_diarize["segments"]:
	if "speaker" not in segment:
	segment["speaker"] = "SPEAKER_00"
	logger.warning(
	f"No speaker detected in {segment['start']}. First TTS "
	f"will be used for the segment text: {segment['text']} "
	)

	del diarize_model
	gc.collect()
	torch.cuda.empty_cache() # noqa
	else:
	result_diarize = result
	result_diarize["segments"] = [
	{**item, "speaker": "SPEAKER_00"}
	for item in result_diarize["segments"]
	]
	return reencode_speakers(result_diarize)