Spaces:

darag
/

kurdish-kurmanci-to-text-srt

Sleeping

App Files Files Community

kurdish-kurmanci-to-text-srt / app.py

darag

Update app.py

2c66d5b verified 2 months ago

raw

history blame contribute delete

3.55 kB

	import os
	import torch
	import librosa
	import numpy as np
	import gradio as gr
	from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

	def format_time(milliseconds):
	seconds, milliseconds = divmod(int(milliseconds), 1000)
	minutes, seconds = divmod(seconds, 60)
	hours, minutes = divmod(minutes, 60)
	return f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}"

	def detect_speech_activity(y, sr, frame_length=1024, hop_length=512, threshold=0.01):
	energy = librosa.feature.rms(y=y, frame_length=frame_length, hop_length=hop_length)[0]
	speech_frames = energy > threshold

	speech_regions = []
	in_speech = False
	for i, speech in enumerate(speech_frames):
	if speech and not in_speech:
	start = i
	in_speech = True
	elif not speech and in_speech:
	end = i
	speech_regions.append((start * hop_length / sr, end * hop_length / sr))
	in_speech = False

	if in_speech:
	speech_regions.append((start * hop_length / sr, len(y) / sr))

	return speech_regions

	def post_process_text(text):
	text = text.replace(" ", " ")
	text = text.strip()
	return text

	def transcribe_audio(audio_file):
	device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

	model_name = "Akashpb13/xlsr_kurmanji_kurdish"
	processor = Wav2Vec2Processor.from_pretrained(model_name)
	model = Wav2Vec2ForCTC.from_pretrained(model_name).to(device)

	y, sr = librosa.load(audio_file, sr=16000)
	voiced_segments = detect_speech_activity(y, sr, threshold=0.005)

	srt_content = ""
	for i, (start, end) in enumerate(voiced_segments, start=1):
	segment_audio = y[int(start * sr):int(end * sr)]

	input_values = processor(segment_audio, sampling_rate=sr, return_tensors="pt").input_values
	input_values = input_values.to(device)

	with torch.no_grad():
	logits = model(input_values).logits

	predicted_ids = torch.argmax(logits, dim=-1)
	transcription = processor.batch_decode(predicted_ids)[0]

	transcription = post_process_text(transcription)

	if transcription:
	start_time = format_time(start * 1000)
	end_time = format_time(end * 1000)

	srt_content += f"{i}\n"
	srt_content += f"{start_time} --> {end_time}\n"

	# Break long lines into shorter ones (max 50 characters)
	words = transcription.split()
	lines = []
	current_line = ""
	for word in words:
	if len(current_line) + len(word) > 50:
	lines.append(current_line.strip())
	current_line = ""
	current_line += word + " "
	if current_line:
	lines.append(current_line.strip())

	srt_content += "\n".join(lines) + "\n\n"

	return srt_content

	def save_srt(audio_file):
	srt_content = transcribe_audio(audio_file)
	output_filename = "output.srt"
	with open(output_filename, "w", encoding="utf-8") as f:
	f.write(srt_content)
	return output_filename, srt_content

	iface = gr.Interface(
	fn=save_srt,
	inputs=gr.Audio(type="filepath"),
	outputs=[
	gr.File(label="Download SRT"),
	gr.Textbox(label="SRT Content", lines=10)
	],
	title="Kurdish Speech-to-Text Transcription",
	description="Upload an audio file to generate a SRT subtitle file with Kurdish transcription."
	)

	if __name__ == "__main__":
	iface.launch()