nb / app.py
camparchimedes's picture
Update app.py
32e6e2c verified
raw
history blame
6.92 kB
### -----------------------------------------------------------------------
### Transkriber version_1.00
### app.py
### -----------------------------------------------------------------------
# -------------------------------------------------------------------------
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# -------------------------------------------------------------------------
import os
import re
import uuid
import time
import psutil
import subprocess
from tqdm import tqdm
import tempfile
from fpdf import FPDF
from pathlib import Path
import numpy as np
import torch
from transformers import pipeline
from gpuinfo import GPUInfo
from pydub import AudioSegment
from IPython.display import Audio
import gradio as gr
import huggingface_hub
###############################################################################
# # Configuration | @version 1.05?
# You are an intelligent assistant specializing in interviews with business clients
# for in-depth content creation, etc..()
###############################################################################
# Choose the inference device once at import time: the first CUDA GPU when
# PyTorch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
###############################################################################
# Function to detect leading silence
###############################################################################
def milliseconds_until_sound(sound, silence_threshold_in_decibels=-20.0, chunk_size=10):
    """Return the length of the leading silence in ``sound``.

    The audio is scanned from its start in windows of ``chunk_size`` until a
    window's loudness (its ``dBFS`` attribute) is no longer below
    ``silence_threshold_in_decibels`` or the end of the audio is reached.

    Args:
        sound: Sliceable audio object supporting ``len()`` and exposing
            ``dBFS`` on each slice. In this file it is a pydub
            ``AudioSegment``, whose length and slice indices are in
            milliseconds.
        silence_threshold_in_decibels: Windows quieter than this count as
            silence.
        chunk_size: Window / step size, in the same unit as ``len(sound)``.

    Returns:
        The offset at which sound starts, always a multiple of
        ``chunk_size``. For fully silent audio this is the first multiple of
        ``chunk_size`` that is >= ``len(sound)``.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        # A non-positive step never advances, so the scan below would spin
        # forever. Raise explicitly: an ``assert`` is stripped under ``-O``.
        raise ValueError("chunk_size must be a positive number of milliseconds")

    total_ms = len(sound)
    trim_ms = 0
    # Check the bound first so loudness is never measured on an empty slice
    # past the end of the audio.
    while (
        trim_ms < total_ms
        and sound[trim_ms:trim_ms + chunk_size].dBFS < silence_threshold_in_decibels
    ):
        trim_ms += chunk_size
    return trim_ms
###############################################################################
# Trim the start of the audio file
###############################################################################
def trim_start(filepath):
    """Remove leading silence from an audio file and save the result as WAV.

    Args:
        filepath: Location of the audio file to load (string or path-like).

    Returns:
        A ``(trimmed, new_filename)`` tuple: the trimmed pydub audio segment
        and the ``Path`` of the exported file, which is written next to the
        input as ``trimmed_<stem>.wav``.
    """
    path = Path(filepath)

    # Do not force ``format="wav"`` here: letting pydub/ffmpeg detect the
    # container allows MP3, M4A, OGG, ... inputs to load as well as WAV.
    audio = AudioSegment.from_file(filepath)

    # Drop everything before the first non-silent window.
    start_trim = milliseconds_until_sound(audio)
    trimmed = audio[start_trim:]

    # The export is always WAV, so give the file a matching extension
    # regardless of what the source file was called.
    new_filename = path.parent / f"trimmed_{path.stem}.wav"
    trimmed.export(new_filename, format="wav")
    return trimmed, new_filename
###############################################################################
# -- segment the audio into smaller parts (1-minute segments for large files)
###############################################################################
def segment_audio(trimmed_audio, output_dir_trimmed, segment_length_ms=60_000):
    """Split audio into consecutive fixed-length WAV files.

    Args:
        trimmed_audio: Sliceable audio object supporting ``len()`` and whose
            slices provide ``export(path, format=...)``. In this file it is a
            pydub audio segment indexed in milliseconds.
        output_dir_trimmed: Directory that receives the segment files. It is
            created when missing; an empty value means the current working
            directory.
        segment_length_ms: Integer length of each segment. Defaults to one
            minute (60 000 ms). The final segment may be shorter.

    Returns:
        List of the written file paths (strings), in playback order, named
        ``trimmed_00.wav``, ``trimmed_01.wav``, ... An empty input yields an
        empty list.

    Raises:
        ValueError: If ``segment_length_ms`` is not positive.
    """
    if segment_length_ms <= 0:
        raise ValueError("segment_length_ms must be a positive number of milliseconds")

    # Honour the requested output directory instead of silently writing into
    # the current working directory.
    out_dir = Path(output_dir_trimmed or ".")
    out_dir.mkdir(parents=True, exist_ok=True)

    segmented_files = []
    segment_starts = range(0, len(trimmed_audio), segment_length_ms)
    for index, start_ms in enumerate(segment_starts):
        segment = trimmed_audio[start_ms:start_ms + segment_length_ms]
        file_path = str(out_dir / f"trimmed_{index:02d}.wav")
        segment.export(file_path, format="wav")
        segmented_files.append(file_path)
    return segmented_files
###############################################################################
# Transcription logic
###############################################################################
# Lazily-created speech-recognition pipeline. It is shared by every request so
# the model is loaded once per process rather than on each button click.
_ASR_PIPELINE = None


def _get_asr_pipeline():
    """Return the shared NB-Whisper pipeline, building it on first use."""
    global _ASR_PIPELINE
    if _ASR_PIPELINE is None:
        _ASR_PIPELINE = pipeline(
            "automatic-speech-recognition",
            model="NbAiLab/nb-whisper-large",
            chunk_length_s=30,
            device=device,
        )
    return _ASR_PIPELINE


def transcribe(file_upload, progress=gr.Progress(track_tqdm=True)):
    """Transcribe an uploaded audio file and report basic run statistics.

    The audio has its leading silence trimmed, is split into segments, and
    each segment is passed through the speech-recognition pipeline. The
    intermediate audio files are deleted afterwards.

    Args:
        file_upload: Path of the uploaded audio file, as supplied by the
            Gradio audio component.
        progress: Gradio progress tracker. It is not used directly; with
            ``track_tqdm=True`` it mirrors the ``tqdm`` loop below in the UI.

    Returns:
        A ``(text, system_info)`` tuple: the full transcription and a short
        multi-line report with processing time, word count and CPU usage.

    Raises:
        gr.Error: If no audio file was provided.
    """
    if not file_upload:
        # Clicking the button without an upload passes None. Show a readable
        # message in the UI instead of crashing with a TypeError.
        raise gr.Error("Please upload an audio file before transcribing.")

    start_time = time.time()
    # Prime the CPU counter: the non-blocking read at the end then reports the
    # average utilisation since this call, i.e. over the whole run.
    psutil.cpu_percent(interval=None)

    temp_paths = []
    try:
        # A per-request scratch directory keeps concurrent requests from
        # overwriting each other's segment files.
        with tempfile.TemporaryDirectory() as segment_dir:
            # -- trim audio, segment it for processing
            trimmed_audio, trimmed_filename = trim_start(file_upload)
            temp_paths.append(trimmed_filename)

            segmented_files = segment_audio(trimmed_audio, segment_dir)
            temp_paths.extend(segmented_files)

            pipe = _get_asr_pipeline()
            transcriptions = [
                pipe(seg_file)["text"]
                for seg_file in tqdm(segmented_files, desc="Transcribing")
            ]
    finally:
        # Also remove intermediates by path, in case any of them were written
        # outside the scratch directory. Cleanup is best effort.
        for temp_path in temp_paths:
            try:
                os.remove(temp_path)
            except OSError:
                pass

    # Join with an explicit separator so the last word of one segment never
    # runs into the first word of the next.
    text = " ".join(part.strip() for part in transcriptions if part.strip())

    output_time = time.time() - start_time

    # -- Word count
    word_count = len(text.split())

    # -- CPU metric (non-blocking; average since the priming call above)
    cpu_usage = psutil.cpu_percent(interval=None)

    # -- system info string
    system_info = (
        "\n"
        f"Processing time: {output_time:.2f} seconds.\n"
        f"Number of words: {word_count}\n"
        f"CPU Usage: {cpu_usage}%\n"
    )
    return text, system_info
###############################################################################
# Interface
###############################################################################
# Markdown heading rendered at the top of the page.
HEADER_INFO = (
    "# This space uses the *Norwegian NB-Whisper Large* model by **NbAiLab** "
    "to transcribe long-form microphone or audio inputs in Norwegian of "
    "arbitrary length."
)

# Custom CSS injected into the Gradio page; the selectors match the
# ``elem_id`` values given to the two output text boxes.
css = (
    "\n"
    # Transcription box: green text on a black background.
    "#transcription_output textarea {\n"
    "background-color: #000000; /* black */\n"
    "color: #00FF00 !important; /* text color */\n"
    "font-size: 18px; /* font size */\n"
    "}\n"
    # Statistics box: bold black text on a light orange background.
    "#system_info_box textarea {\n"
    "background-color: #ffe0b3; /* orange */\n"
    "color: black !important; /* text color */\n"
    "font-size: 16px; /* font size */\n"
    "font-weight: bold; /* bold font */\n"
    "}\n"
)
# Footer badges (open source + Apache 2.0 licence), passed to gr.Markdown as
# raw HTML.
_FOOTER_BADGES_HTML = '''
<div style="text-align:center;">
<a href="https://opensource.com/resources/what-open-source" style="display: inline-block;">
<img src="https://badgen.net/badge/Open%20Source%20%3F/Yes%21/blue?icon=github" alt="Open Source? Yes!" style="vertical-align: middle;">
</a>
<span style="display:inline-block; width: 20px;"></span> <!-- This adds space between the logos -->
<a href="https://opensource.org/licenses/Apache-2.0" style="display: inline-block;">
<img src="https://img.shields.io/badge/License-Apache_2.0-blue.svg" alt="License: Apache 2.0" style="vertical-align: middle;">
</a>
</div>
'''

# Page layout: heading, an input row (upload + button), an output row
# (transcription + statistics) and a footer row with the badges.
with gr.Blocks(css=css) as iface:
    gr.Markdown(HEADER_INFO)

    # -- input row
    with gr.Row():
        upload = gr.Audio(
            label="Upload audio",
            sources="upload",
            type="filepath",
        )
        transcribe_btn = gr.Button("Transkriber")

    # -- output row: wide transcription column, narrow statistics column
    with gr.Row():
        with gr.Column(scale=3):
            text_output = gr.Textbox(
                label="Transkribert Tekst",
                placeholder="t r a n s c r i p t i o",
                elem_id="transcription_output",
            )
        with gr.Column(scale=1):
            system_info = gr.Textbox(
                label="Antall sekunder, ord, system data:",
                elem_id="system_info_box",
            )

    # -- footer row
    with gr.Row():
        gr.Markdown(_FOOTER_BADGES_HTML)

    # Run the transcription when the button is pressed and route its two
    # return values to the two text boxes.
    transcribe_btn.click(fn=transcribe, inputs=[upload], outputs=[text_output, system_info])

# Start the web server as soon as the module is executed.
iface.launch(debug=True)