# app.py
import gradio as gr
import warnings
import torch
from transformers import pipeline, WhisperTokenizer, WhisperForConditionalGeneration, WhisperProcessor

warnings.filterwarnings("ignore")

# Load tokenizer and model
tokenizer = WhisperTokenizer.from_pretrained("NbAiLabBeta/nb-whisper-medium")
model = WhisperForConditionalGeneration.from_pretrained("NbAiLabBeta/nb-whisper-medium")
processor = WhisperProcessor.from_pretrained("NbAiLabBeta/nb-whisper-medium")

# Set up the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch_dtype = torch.float32

# Initialize pipeline
asr = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    device=device,
    torch_dtype=torch_dtype,
)


def transcribe_audio(audio_file):
    # Perform transcription
    with torch.no_grad():
        output = asr(
            audio_file,
            chunk_length_s=28,
            generate_kwargs={"num_beams": 5, "task": "transcribe", "language": "no"},
        )
    return output["text"]


# Create Gradio interface
iface = gr.Interface(
    fn=transcribe_audio,
    inputs=gr.Audio(type="filepath"),
    outputs="text",
    title="Audio Transcription App",
    description="Upload an audio file to get the transcription",
    theme="default",
    live=False,
)

# --- Extended pipeline: batched transcription + summarization (overrides the simpler setup above) ---
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, AutoTokenizer, AutoModelForSeq2SeqLM
from pydub import AudioSegment
import soundfile as sf
import numpy as np
import os
import nltk
from fpdf import FPDF
import time

nltk.download('punkt')

HF_AUTH_TOKEN = os.getenv('HF_AUTH_TOKEN')

# transcription
processor = AutoProcessor.from_pretrained("NbAiLabBeta/nb-whisper-large-semantic")
transcription_model = AutoModelForSpeechSeq2Seq.from_pretrained("NbAiLabBeta/nb-whisper-large-semantic")

# summarization
summarization_tokenizer = AutoTokenizer.from_pretrained("NbAiLab/norbert-summarization")
summarization_model = AutoModelForSeq2SeqLM.from_pretrained("NbAiLab/norbert-summarization")

# setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch_dtype = torch.float32

# move models to the device
transcription_model.to(device)
summarization_model.to(device)
# PS: the summarization model still needs to be told to use a graph-based summary method (Lexname?)


def convert_to_wav(audio_file):
    audio = AudioSegment.from_file(audio_file, format="m4a")
    wav_file = "temp.wav"
    audio.export(wav_file, format="wav")
    return wav_file


def transcribe_audio(audio_file, batch_size=4):
    start_time = time.time()

    # Convert .m4a to .wav
    if audio_file.endswith(".m4a"):
        audio_file = convert_to_wav(audio_file)

    audio_input, sample_rate = sf.read(audio_file)  # audio is assumed to be 16 kHz mono
    chunk_size = 16000 * 30  # 30-second chunks at 16 kHz
    chunks = [audio_input[i:i + chunk_size] for i in range(0, len(audio_input), chunk_size)]

    transcription = ""
    for i in range(0, len(chunks), batch_size):
        batch_chunks = chunks[i:i + batch_size]
        inputs = processor(batch_chunks, sampling_rate=16000, return_tensors="pt", padding=True)
        inputs = inputs.to(device)
        attention_mask = inputs.attention_mask.to(device) if 'attention_mask' in inputs else None

        with torch.no_grad():
            output = transcription_model.generate(
                inputs.input_features,
                max_length=2048,  # Increase max_length for longer outputs
                num_beams=7,
                task="transcribe",
                attention_mask=attention_mask,
                # forced_decoder_ids=None,  # OBS! forced_decoder_ids must not be set; left commented out just in case.
                language="no",
            )
        transcription += " ".join(processor.batch_decode(output, skip_special_tokens=True)) + " "

    end_time = time.time()
    transcription_time = end_time - start_time
    word_count = len(transcription.split())

    result = f"Transcription: {transcription.strip()}\n\nTime taken: {transcription_time:.2f} seconds\nNumber of words: {word_count}"
    return transcription.strip(), result


def summarize_text(text):
    inputs = summarization_tokenizer([text], max_length=1024, return_tensors="pt", truncation=True)
    inputs = inputs.to(device)
    summary_ids = summarization_model.generate(inputs.input_ids, num_beams=4, max_length=150, early_stopping=True)
    summary = summarization_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary


# HTML syntax for imagery
image_html = """
<!-- Banner -->
<!-- Additional Image -->
"""

# Gradio UI
iface = gr.Blocks()

with iface:
    gr.HTML(image_html)
    gr.Markdown("# Switch Work Audio Transcription App\nUpload an audio file to get the transcription")
    audio_input = gr.Audio(type="filepath")
    batch_size_input = gr.Slider(minimum=1, maximum=16, step=1, value=4, label="Batch Size")  # `value` (not `default`) sets the initial value in Gradio 3+
    transcription_output = gr.Textbox()
    summary_output = gr.Textbox()
    transcribe_button = gr.Button("Transcribe and Summarize")

    def transcribe_and_summarize(audio_file, batch_size):
        transcription, result = transcribe_audio(audio_file, batch_size)
        summary = summarize_text(transcription)
        return result, summary

    transcribe_button.click(
        fn=transcribe_and_summarize,
        inputs=[audio_input, batch_size_input],
        outputs=[transcription_output, summary_output],
    )


def save_to_pdf(transcription, summary):
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)

    # include transcription
    pdf.multi_cell(0, 10, "Transcription:\n" + transcription)

    # paragraph space
    pdf.ln(10)

    # include summary
    pdf.multi_cell(0, 10, "Summary:\n" + summary)

    pdf_output_path = "transcription_summary.pdf"
    pdf.output(pdf_output_path)
    return pdf_output_path


# run
iface.launch(share=True, debug=True)