import logging
import os
import time
from pathlib import Path

import gradio as gr
import psutil
import pynvml
from faster_whisper import WhisperModel

from languages import get_language_names, get_language_from_name
from subtitle_manager import Subtitle

logging.basicConfig(level=logging.INFO)

last_model = None
model = None
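# Together these globals act as a one-slot model cache: repeated requests with
# the same model name reuse the loaded WhisperModel instead of reloading weights.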


def get_free_gpu_memory():
    """Return the free memory (in bytes) of the first CUDA device."""
    pynvml.nvmlInit()
    handle = pynvml.nvmlDeviceGetHandleByIndex(0)
    meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
    pynvml.nvmlShutdown()
    return meminfo.free
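# Note: pynvml.nvmlInit() raises when no NVIDIA driver is present;
# get_workers_count() below relies on that to fall back to system RAM.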


def get_workers_count():
    """Estimate how many transcription jobs can run concurrently."""
    try:
        memory = get_free_gpu_memory()
        logging.info("Sizing workers from free CUDA memory")
    except Exception:
        memory = psutil.virtual_memory().available
        logging.info("No usable GPU, sizing workers from available RAM")
    logging.info(f"memory: {memory / 1_000_000_000:.1f} GB")
    workers = max(1, int(memory / 2_000_000_000))  # never return 0 workers
    logging.info(f"workers: {workers}")
    return workers
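# The 2 GB-per-worker budget above is a rough heuristic, not a measured figure;
# larger models (e.g. large-v3 in float16, roughly 3 GB of weights) need more.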


def transcribe_webui_simple_progress(modelName, languageName, urlData, multipleFiles, microphoneData, task,
                                     chunk_length, compute_type, beam_size, vad_filter, min_silence_duration_ms,
                                     progress=gr.Progress()):
    global last_model
    global model

    progress(0, desc="Loading Audio..")
    logging.info(f"languageName: {languageName}")
    logging.info(f"urlData: {urlData}")
    logging.info(f"multipleFiles: {multipleFiles}")
    logging.info(f"microphoneData: {microphoneData}")
    logging.info(f"task: {task}")
    logging.info(f"chunk_length: {chunk_length}")
    if last_model is None or modelName != last_model:
        logging.info("First call or model changed, loading model")
        progress(0.1, desc="Loading Model..")
        model = None  # drop the old model first so its memory can be reclaimed
        model = WhisperModel(modelName, device="auto", compute_type=compute_type, cpu_threads=os.cpu_count())
        logging.info("Model loaded")
    else:
        logging.info("Model not changed")
    last_model = modelName

    srt_sub = Subtitle("srt")
    # vtt_sub = Subtitle("vtt")
    # txt_sub = Subtitle("txt")

    files = []
    if multipleFiles:
        files += multipleFiles
    if urlData:
        files.append(urlData)
    if microphoneData:
        files.append(microphoneData)
    logging.info(files)

    languageName = None if languageName == "Automatic Detection" else get_language_from_name(languageName).code

    files_out = []
    vtt = ""
    txt = ""
    for file in progress.tqdm(files, desc="Working..."):
        start_time = time.time()
        segments, info = model.transcribe(
            file,
            task=task,  # was previously ignored; "transcribe" or "translate"
            beam_size=beam_size,
            vad_filter=vad_filter,
            language=languageName,
            vad_parameters=dict(min_silence_duration_ms=min_silence_duration_ms),
            # max_new_tokens=128,
            condition_on_previous_text=False,
            chunk_length=chunk_length,
        )
        file_name = Path(file).stem
        files_out_srt = srt_sub.write_subtitle(segments, file_name, modelName, progress)
        # txt = txt_sub.get_subtitle(segments, progress)
        logging.info(f"transcribe: {time.time() - start_time:.1f} sec.")
        files_out += [files_out_srt]
    return files_out, vtt, txt


with gr.Blocks(title="Fast Whisper WebUI") as demo:
    description = "faster-whisper is a reimplementation of OpenAI's Whisper model using CTranslate2, which is a fast inference engine for Transformer models."
    article = "Read the [documentation here](https://github.com/SYSTRAN/faster-whisper)."

    whisper_models = [
        "tiny", "tiny.en",
        "base", "base.en",
        "small", "small.en", "distil-small.en",
        "medium", "medium.en", "distil-medium.en",
        "large",
        "large-v1",
        "large-v2", "distil-large-v2",
        "large-v3", "distil-large-v3",
    ]
    compute_types = [
        "auto", "default", "int8", "int8_float32",
        "int8_float16", "int8_bfloat16", "int16",
        "float16", "float32", "bfloat16",
    ]
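    # These map to CTranslate2 quantization/precision modes; "auto" selects
    # the fastest type supported by the current device.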

    # Settings were originally grouped in a gr.Accordion, but components
    # created there can't be passed to gr.Interface's inputs, so every
    # control (task, chunk_length, compute_type, beam_size, vad options)
    # is defined inline in the inputs list below instead.
    gr.Interface(
        fn=transcribe_webui_simple_progress,
        description=description,
        article=article,
        inputs=[
            gr.Dropdown(choices=whisper_models, value="distil-large-v2", label="Model", info="Select whisper model", interactive=True),
            gr.Dropdown(choices=["Automatic Detection"] + sorted(get_language_names()), value="Automatic Detection", label="Language", info="Select audio voice language", interactive=True),
            gr.Text(label="URL", info="(YouTube, etc.)", interactive=True),
            gr.File(label="Upload Files", file_count="multiple"),
            gr.Audio(sources=["upload", "microphone"], type="filepath", label="Input Audio"),
            gr.Dropdown(choices=["transcribe", "translate"], label="Task", value="transcribe", interactive=True),
            gr.Number(label="chunk_length", value=30, interactive=True),
            gr.Dropdown(label="compute_type", choices=compute_types, value="auto", interactive=True),
            gr.Number(label="beam_size", value=5, interactive=True),
            gr.Checkbox(label="vad_filter", info="Use vad_filter", value=True),
            gr.Number(label="Vad min_silence_duration_ms", value=500, interactive=True),
        ],
        outputs=[
            gr.File(label="Download"),
            gr.Text(label="Transcription"),
            gr.Text(label="Segments"),
        ],
    )
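    # Because the Interface above is created inside this Blocks context, it
    # renders as part of demo; queueing and launch are configured on demo below.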


if __name__ == "__main__":
    demo.queue(default_concurrency_limit=get_workers_count())
    demo.launch(share=True)