# app.py
import gradio as gr
import warnings
import torch
from transformers import pipeline, WhisperTokenizer, WhisperForConditionalGeneration, WhisperProcessor

warnings.filterwarnings("ignore")

# Load tokenizer and model
tokenizer = WhisperTokenizer.from_pretrained("NbAiLabBeta/nb-whisper-medium")
model = WhisperForConditionalGeneration.from_pretrained("NbAiLabBeta/nb-whisper-medium")
processor = WhisperProcessor.from_pretrained("NbAiLabBeta/nb-whisper-medium")

# Set up the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch_dtype = torch.float32

# Initialize pipeline
asr = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    device=device,
    torch_dtype=torch_dtype,
)

def transcribe_audio(audio_file):
    # Perform transcription
    with torch.no_grad():
        output = asr(
            audio_file,
            chunk_length_s=28,
            generate_kwargs={"num_beams": 5, "task": "transcribe", "language": "no"},
        )
    return output["text"]

# Create Gradio interface
iface = gr.Interface(
    fn=transcribe_audio,
    inputs=gr.Audio(type="filepath"),
    outputs="text",
    title="Audio Transcription App",
    description="Upload an audio file to get the transcription",
    theme="default",
    live=False,
)

from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, AutoTokenizer, AutoModelForSeq2SeqLM
from pydub import AudioSegment
import soundfile as sf
import numpy as np
import os
import nltk
from fpdf import FPDF
import time

nltk.download('punkt')

HF_AUTH_TOKEN = os.getenv('HF_AUTH_TOKEN')

# Transcription model
processor = AutoProcessor.from_pretrained("NbAiLabBeta/nb-whisper-large-semantic")
transcription_model = AutoModelForSpeechSeq2Seq.from_pretrained("NbAiLabBeta/nb-whisper-large-semantic")

# Summarization model
summarization_tokenizer = AutoTokenizer.from_pretrained("NbAiLab/norbert-summarization")
summarization_model = AutoModelForSeq2SeqLM.from_pretrained("NbAiLab/norbert-summarization")

# Device setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch_dtype = torch.float32

# Move the models to the device
transcription_model.to(device)
summarization_model.to(device)

# PS: the summarization model needs to be told to use a graph-based summary method (Lexname?)

def convert_to_wav(audio_file):
    audio = AudioSegment.from_file(audio_file, format="m4a")
    wav_file = "temp.wav"
    audio.export(wav_file, format="wav")
    return wav_file

def transcribe_audio(audio_file, batch_size=4):
    start_time = time.time()

    # Convert .m4a to .wav
    if audio_file.endswith(".m4a"):
        audio_file = convert_to_wav(audio_file)

    audio_input, sample_rate = sf.read(audio_file)
    chunk_size = 16000 * 30  # 30-second chunks at 16 kHz
    chunks = [audio_input[i:i + chunk_size] for i in range(0, len(audio_input), chunk_size)]

    transcription = ""
    for i in range(0, len(chunks), batch_size):
        batch_chunks = chunks[i:i + batch_size]
        inputs = processor(batch_chunks, sampling_rate=16000, return_tensors="pt", padding=True)
        inputs = inputs.to(device)
        attention_mask = inputs.attention_mask.to(device) if 'attention_mask' in inputs else None

        with torch.no_grad():
            output = transcription_model.generate(
                inputs.input_features,
                max_length=2048,  # increase max_length for longer outputs
                num_beams=7,
                task="transcribe",
                attention_mask=attention_mask,
                # forced_decoder_ids=None,  # NB: forced_decoder_ids must not be set; left commented out just in case
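                # Note: passing task/language straight to generate() assumes a recent
                # transformers release in which Whisper's generate() accepts these
                # keywords and builds the decoder prompt from them itself.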
language="no" ) transcription += " ".join(processor.batch_decode(output, skip_special_tokens=True)) + " " end_time = time.time() transcription_time = end_time - start_time word_count = len(transcription.split()) result = f"Transcription: {transcription.strip()}\n\nTime taken: {transcription_time:.2f} seconds\nNumber of words: {word_count}" return transcription.strip(), result def summarize_text(text): inputs = summarization_tokenizer([text], max_length=1024, return_tensors="pt", truncation=True) inputs = inputs.to(device) summary_ids = summarization_model.generate(inputs.input_ids, num_beams=4, max_length=150, early_stopping=True) summary = summarization_tokenizer.decode(summary_ids[0], skip_special_tokens=True) return summary # HTML syntax for imagery image_html = """