### -----------------------------------------------------------------------
### Transkriber version 1.00
### app.py
### -----------------------------------------------------------------------
# -------------------------------------------------------------------------
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# -------------------------------------------------------------------------

import os
import time
import psutil
from pathlib import Path

import torch
from transformers import pipeline
from pydub import AudioSegment
import gradio as gr

###############################################################################
# Configuration
###############################################################################

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

###############################################################################
# Detect leading silence
###############################################################################

def milliseconds_until_sound(sound, silence_threshold_in_decibels=-20.0, chunk_size=10):
    """Return the number of milliseconds before the audio rises above the silence threshold."""
    assert chunk_size > 0
    trim_ms = 0
    while trim_ms < len(sound) and sound[trim_ms:trim_ms + chunk_size].dBFS < silence_threshold_in_decibels:
        trim_ms += chunk_size
    return trim_ms

###############################################################################
# Trim leading silence from the start of the audio file
###############################################################################

def trim_start(filepath):
    path = Path(filepath)
    directory = path.parent
    filename = path.name
    audio = AudioSegment.from_file(filepath, format="wav")
    start_trim = milliseconds_until_sound(audio)
    trimmed = audio[start_trim:]
    new_filename = directory / f"trimmed_{filename}"
    trimmed.export(new_filename, format="wav")
    return trimmed, new_filename

###############################################################################
# Segment the audio into smaller parts (1-minute segments for large files)
###############################################################################

def segment_audio(trimmed_audio, output_dir):
    one_minute = 60 * 1000  # one minute in milliseconds
    os.makedirs(output_dir, exist_ok=True)
    start_time = 0
    i = 0
    # Iterate through the trimmed audio and export one local WAV file per minute.
    segmented_files = []
    while start_time < len(trimmed_audio):
        segment = trimmed_audio[start_time:start_time + one_minute]
        file_path = os.path.join(output_dir, f"trimmed_{i:02d}.wav")
        segment.export(file_path, format="wav")
        segmented_files.append(file_path)
        start_time += one_minute
        i += 1
    return segmented_files
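
###############################################################################
# Usage sketch (illustrative only, not executed by the app): the two helpers
# above compose as below. "interview.wav" is a hypothetical local file.
#
#     trimmed_audio, trimmed_path = trim_start("interview.wav")
#     segments = segment_audio(trimmed_audio, "trimmed_audio")
#     # -> ["trimmed_audio/trimmed_00.wav", "trimmed_audio/trimmed_01.wav", ...]
###############################################################################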
###############################################################################
# Transcription logic
###############################################################################

# Load the ASR pipeline once at startup so repeated transcriptions reuse the model.
pipe = pipeline(
    "automatic-speech-recognition",
    model="NbAiLab/nb-whisper-large",
    chunk_length_s=30,
    device=device,
)

def transcribe(file_upload, progress=gr.Progress(track_tqdm=True)):
    file = file_upload
    start_time = time.time()

    # Trim leading silence, then segment the audio for processing.
    trimmed_audio, trimmed_filename = trim_start(file)
    segmented_files = segment_audio(trimmed_audio, "trimmed_audio")

    # Transcribe each segment; join with spaces so words at segment
    # boundaries do not run together.
    transcriptions = [pipe(seg_file)["text"] for seg_file in segmented_files]
    text = ' '.join(transcriptions)

    end_time = time.time()
    output_time = end_time - start_time

    # Word count
    word_count = len(text.split())

    # CPU metric
    cpu_usage = psutil.cpu_percent(interval=1)

    # System info string
    system_info = f"""
    Processing time: {output_time:.2f} seconds.
    Number of words: {word_count}
    CPU Usage: {cpu_usage}%
    """

    return text, system_info

###############################################################################
# Interface
###############################################################################

HEADER_INFO = """
# This space uses the *Norwegian NB-Whisper Large* model by **NbAiLab** to transcribe long-form microphone or uploaded audio in Norwegian, of arbitrary length.
""".strip()

css = """
#transcription_output textarea {
    background-color: #000000;   /* black background */
    color: #00FF00 !important;   /* green text */
    font-size: 18px;             /* font size */
}

#system_info_box textarea {
    background-color: #ffe0b3;   /* light orange background */
    color: black !important;     /* text color */
    font-size: 16px;             /* font size */
    font-weight: bold;           /* bold font */
}
"""

iface = gr.Blocks(css=css)

with iface:
    gr.Markdown(HEADER_INFO)
    with gr.Row():
        upload = gr.Audio(label="Upload audio", sources=["upload"], type="filepath")
        transcribe_btn = gr.Button("Transkriber")
    with gr.Row():
        with gr.Column(scale=3):
            text_output = gr.Textbox(
                label="Transkribert Tekst",
                placeholder="t r a n s c r i p t i o n",
                elem_id="transcription_output",
            )
        with gr.Column(scale=1):
            system_info = gr.Textbox(
                label="Antall sekunder, ord, system data:",
                elem_id="system_info_box",
            )
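
###############################################################################
# Optional: GPU metrics could be appended to the system_info box above. A
# minimal sketch, assuming the `gpuinfo` package's GPUInfo.gpu_usage() reports
# utilization and memory the way other Whisper demo spaces use it:
#
#     from gpuinfo import GPUInfo
#
#     if torch.cuda.is_available():
#         gpu_utilization, gpu_memory = GPUInfo.gpu_usage()
#         system_info += f"GPU Utilization: {gpu_utilization}%, GPU Memory: {gpu_memory} MiB"
###############################################################################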
    transcribe_btn.click(
        fn=transcribe,
        inputs=[upload],
        outputs=[text_output, system_info],
    )

iface.launch(debug=True)
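
###############################################################################
# Optional: PDF export of the transcript. A minimal sketch using fpdf; the
# helper name and output path are hypothetical, and fpdf's built-in fonts are
# latin-1 only, which does cover Norwegian æ/ø/å:
#
#     from fpdf import FPDF
#
#     def save_transcript_pdf(text, out_path="transcript.pdf"):
#         pdf = FPDF()
#         pdf.add_page()
#         pdf.set_font("Helvetica", size=12)
#         pdf.multi_cell(0, 10, text)
#         pdf.output(out_path)
#         return out_path
#
# This could be wired to a gr.File output via a second button in the Blocks
# layout above.
###############################################################################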