# app.py
import gradio as gr
import warnings
import torch
from transformers import pipeline, WhisperTokenizer, WhisperForConditionalGeneration, WhisperProcessor
warnings.filterwarnings("ignore")
# Load tokenizer and model
tokenizer = WhisperTokenizer.from_pretrained("NbAiLabBeta/nb-whisper-medium")
model = WhisperForConditionalGeneration.from_pretrained("NbAiLabBeta/nb-whisper-medium")
processor = WhisperProcessor.from_pretrained("NbAiLabBeta/nb-whisper-medium")
# Set up the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch_dtype = torch.float32
# Initialize pipeline
asr = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    device=device,
    torch_dtype=torch_dtype,
)
def transcribe_audio(audio_file):
    # Perform transcription with the pipeline above
    # (NOTE: this function is shadowed by the batched transcribe_audio() defined further below)
    with torch.no_grad():
        output = asr(audio_file, chunk_length_s=28, generate_kwargs={"num_beams": 5, "task": "transcribe", "language": "no"})
    return output["text"]
# Create Gradio interface (NOTE: this Interface is never launched; it is superseded by the Blocks UI at the bottom of the file)
iface = gr.Interface(
fn=transcribe_audio,
inputs=gr.Audio(type="filepath"),
outputs="text",
title="Audio Transcription App",
description="Upload an audio file to get the transcription",
theme="default",
live=False
)
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, AutoTokenizer, AutoModelForSeq2SeqLM
from pydub import AudioSegment
import soundfile as sf
import numpy as np
import os
import nltk
from fpdf import FPDF
import time
nltk.download('punkt')
HF_AUTH_TOKEN = os.getenv('HF_AUTH_TOKEN')
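# NOTE: HF_AUTH_TOKEN is read here but never passed on; if the checkpoints below are gated,
# it would have to be forwarded explicitly, e.g. from_pretrained(..., token=HF_AUTH_TOKEN).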
# transcription
processor = AutoProcessor.from_pretrained("NbAiLabBeta/nb-whisper-large-semantic")
transcription_model = AutoModelForSpeechSeq2Seq.from_pretrained("NbAiLabBeta/nb-whisper-large-semantic")
# summarization
summarization_tokenizer = AutoTokenizer.from_pretrained("NbAiLab/norbert-summarization")
summarization_model = AutoModelForSeq2SeqLM.from_pretrained("NbAiLab/norbert-summarization")
# setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch_dtype = torch.float32
# move the models to the selected device
transcription_model.to(device)
summarization_model.to(device)  # NOTE: still needs to be configured for the graph-based summary method (Lexname?); see the extractive sketch below summarize_text()
def convert_to_wav(audio_file):
    # Convert .m4a input to 16 kHz mono WAV, which transcribe_audio() below assumes
    audio = AudioSegment.from_file(audio_file, format="m4a")
    audio = audio.set_frame_rate(16000).set_channels(1)
    wav_file = "temp.wav"
    audio.export(wav_file, format="wav")
    return wav_file
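# --- Optional: resample non-.m4a uploads as well ---
# transcribe_audio() below assumes 16 kHz audio (chunk_size = 16000 * 30 and sampling_rate=16000),
# but only .m4a files go through convert_to_wav(). A minimal, format-agnostic sketch using pydub;
# the helper name `ensure_16k_wav` is illustrative and not part of the original app.
def ensure_16k_wav(audio_file):
    audio = AudioSegment.from_file(audio_file)  # pydub/ffmpeg auto-detects the container format
    audio = audio.set_frame_rate(16000).set_channels(1)
    wav_file = "temp_16k.wav"
    audio.export(wav_file, format="wav")
    return wav_file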
def transcribe_audio(audio_file, batch_size=4):
    start_time = time.time()

    # Convert .m4a to .wav
    if audio_file.endswith(".m4a"):
        audio_file = convert_to_wav(audio_file)

    audio_input, sample_rate = sf.read(audio_file)  # expected to be 16 kHz
    chunk_size = 16000 * 30  # 30-second chunks at 16 kHz
    chunks = [audio_input[i:i + chunk_size] for i in range(0, len(audio_input), chunk_size)]

    transcription = ""
    for i in range(0, len(chunks), batch_size):
        batch_chunks = chunks[i:i + batch_size]
        inputs = processor(batch_chunks, sampling_rate=16000, return_tensors="pt", padding=True)
        inputs = inputs.to(device)
        attention_mask = inputs.attention_mask.to(device) if 'attention_mask' in inputs else None
        with torch.no_grad():
            output = transcription_model.generate(
                inputs.input_features,
                max_length=2048,  # NB: Whisper's decoder generates at most 448 tokens per chunk, so this acts only as an upper bound
                num_beams=7,
                task="transcribe",
                attention_mask=attention_mask,
                # forced_decoder_ids must not be set when task/language are passed; intentionally left commented out
                language="no"
            )
        transcription += " ".join(processor.batch_decode(output, skip_special_tokens=True)) + " "

    end_time = time.time()
    transcription_time = end_time - start_time
    word_count = len(transcription.split())
    result = f"Transcription: {transcription.strip()}\n\nTime taken: {transcription_time:.2f} seconds\nNumber of words: {word_count}"
    return transcription.strip(), result
def summarize_text(text):
    inputs = summarization_tokenizer([text], max_length=1024, return_tensors="pt", truncation=True)
    inputs = inputs.to(device)
    summary_ids = summarization_model.generate(inputs.input_ids, num_beams=4, max_length=150, early_stopping=True)
    summary = summarization_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary
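# --- Optional: graph-based extractive summary ---
# A minimal sketch of the "graph-based summary method" mentioned above, assuming an extractive
# fallback is acceptable. It is a simplified degree-centrality variant (not full LexRank/PageRank)
# and only uses nltk (punkt is downloaded above) and numpy. The function name `graph_based_summary`
# and the `num_sentences` parameter are illustrative, not part of the original app.
def graph_based_summary(text, num_sentences=3):
    sentences = nltk.sent_tokenize(text)
    if len(sentences) <= num_sentences:
        return text

    # Bag-of-words vectors for every sentence
    vocab = list({word.lower() for sent in sentences for word in nltk.word_tokenize(sent)})
    vectors = np.zeros((len(sentences), len(vocab)))
    for i, sent in enumerate(sentences):
        for word in nltk.word_tokenize(sent):
            vectors[i, vocab.index(word.lower())] += 1

    # Cosine-similarity graph between sentences
    norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    normalized = vectors / norms
    similarity = normalized @ normalized.T

    # Score each sentence by its total similarity to the others and keep the top ones in document order
    scores = similarity.sum(axis=1)
    top = sorted(np.argsort(scores)[-num_sentences:])
    return " ".join(sentences[i] for i in top)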
# HTML for the banner images
image_html = """
<div style="text-align: center;">
<img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/raw/main/Olas%20AudioSwitch%20Shop.png" alt="Banner" width="87%" height="auto">
</div>
<div style="text-align: center; margin-top: 20px;">
<img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/raw/main/picture.jpg" alt="Additional Image" width="50%" height="auto">
</div>
"""
# Gradio UI
iface = gr.Blocks()
with iface:
    gr.HTML(image_html)
    gr.Markdown("# Switch Work Audio Transcription App\nUpload an audio file to get a transcription and a summary.")
    audio_input = gr.Audio(type="filepath")
    batch_size_input = gr.Slider(minimum=1, maximum=16, step=1, value=4, label="Batch Size")  # 'value' replaces the old 'default' argument
    transcription_output = gr.Textbox(label="Transcription")
    summary_output = gr.Textbox(label="Summary")
    transcribe_button = gr.Button("Transcribe and Summarize")

    def transcribe_and_summarize(audio_file, batch_size):
        transcription, result = transcribe_audio(audio_file, batch_size)
        summary = summarize_text(transcription)
        return result, summary

    transcribe_button.click(
        fn=transcribe_and_summarize,
        inputs=[audio_input, batch_size_input],
        outputs=[transcription_output, summary_output],
    )
def save_to_pdf(transcription, summary):
    # NOTE: not yet wired into the UI; see the sketch below for one way to expose it
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)

    # include transcription
    pdf.multi_cell(0, 10, "Transcription:\n" + transcription)

    # paragraph space
    pdf.ln(10)

    # include summary
    pdf.multi_cell(0, 10, "Summary:\n" + summary)

    pdf_output_path = "transcription_summary.pdf"
    pdf.output(pdf_output_path)
    return pdf_output_path
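# --- Optional: expose the PDF export in the UI ---
# A minimal sketch, assuming a "Export to PDF" button that re-uses the two text boxes is the
# desired behaviour and that gr.Blocks can be re-entered to append components before launch().
# If re-entering is not available in the installed Gradio version, move these lines into the
# `with iface:` block above and define save_to_pdf() before it. Component names are illustrative.
with iface:
    pdf_output = gr.File(label="Transcription + summary as PDF")
    pdf_button = gr.Button("Export to PDF")
    pdf_button.click(
        fn=save_to_pdf,
        inputs=[transcription_output, summary_output],
        outputs=pdf_output,
    )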
# run
iface.launch(share=True, debug=True)