# app.py
import gradio as gr
import warnings
import torch
# from transformers import WhisperTokenizer, WhisperForConditionalGeneration, WhisperProcessor
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
import soundfile as sf
import ffmpeg
import os
from fpdf import FPDF
import time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import pandas as pd

warnings.filterwarnings("ignore")

nltk.download('punkt')
nltk.download('stopwords')

# tokenizer = WhisperTokenizer.from_pretrained("NbAiLabBeta/nb-whisper-large")
# model = WhisperForConditionalGeneration.from_pretrained("NbAiLabBeta/nb-whisper-large")
# processor = WhisperProcessor.from_pretrained("NbAiLabBeta/nb-whisper-large")

# Note: this config is not currently passed to model.generate(); top_k must be
# an integer cutoff (the previous value 0.5 is invalid for top-k sampling).
generation_config = {
    "temperature": 0.8,
    "top_p": 0.9,
    "top_k": 50,
    "max_output_tokens": 2048
}

processor = AutoProcessor.from_pretrained("NbAiLabBeta/nb-whisper-large-semantic")
model = AutoModelForSpeechSeq2Seq.from_pretrained("NbAiLabBeta/nb-whisper-large-semantic")

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch_dtype = torch.float32
model.to(device)


def convert_audio_format(audio_path):
    # Resample to 16 kHz mono WAV, the input format Whisper expects.
    # ac=1 guards against stereo files, which would otherwise give
    # soundfile a 2-D array that the processor cannot handle.
    output_path = "converted_audio.wav"
    ffmpeg.input(audio_path).output(output_path, format='wav', ar='16000', ac=1).run(overwrite_output=True)
    return output_path


def transcribe_audio(audio_file, batch_size=4):
    start_time = time.time()
    audio_path = convert_audio_format(audio_file)
    audio_input, sample_rate = sf.read(audio_path)

    # Split the audio into 30-second chunks (Whisper's fixed input window).
    chunk_size = 16000 * 30
    chunks = [audio_input[i:i + chunk_size] for i in range(0, len(audio_input), chunk_size)]

    transcription = ""
    for i in range(0, len(chunks), batch_size):
        batch_chunks = chunks[i:i + batch_size]
        inputs = processor(batch_chunks, sampling_rate=16000, return_tensors="pt", padding=True)
        inputs = inputs.to(device)
        attention_mask = inputs.attention_mask if 'attention_mask' in inputs else None

        with torch.no_grad():
            # Whisper's decoder is capped at 448 target positions, so the
            # previous max_length=2048 exceeded the model maximum.
            output = model.generate(
                inputs.input_features,
                max_length=448,
                num_beams=7,
                task="transcribe",
                attention_mask=attention_mask
            )
        transcription += " ".join(processor.batch_decode(output, skip_special_tokens=True)) + " "

    end_time = time.time()
    transcription_time = end_time - start_time
    word_count = len(transcription.split())

    result = f"Transcription: {transcription.strip()}\n\nTime taken: {transcription_time:.2f} seconds\nNumber of words: {word_count}"
    return transcription.strip(), result


def save_to_pdf(transcription):
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)
    pdf.multi_cell(0, 10, transcription)
    pdf_output_path = "transcription.pdf"
    pdf.output(pdf_output_path)
    return pdf_output_path


def summarize_text(transcription):
    sentences = transcription.split(". ")
    # scikit-learn only ships an English stop-word list, so the string
    # 'norwegian' raises ValueError; use NLTK's Norwegian stop words instead
    # (downloaded above).
    vectorizer = TfidfVectorizer(stop_words=stopwords.words('norwegian'))
    X = vectorizer.fit_transform(sentences)
    kmeans = KMeans(n_clusters=1, n_init=10)
    kmeans.fit(X)
    # Pick the sentence whose TF-IDF vector lies closest to the cluster
    # centroid; calling predict() on the mean vector, as before, only
    # returns the cluster label 0 rather than a sentence index.
    closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, X)
    summary = [sentences[i] for i in closest]
    return ". ".join(summary) + "."


# HTML
banner_html = """