nb / app.py
camparchimedes's picture
Update app.py
2d9e081 verified
raw
history blame
4.86 kB
# app.py
import gradio as gr
import warnings
import torch
#from transformers import WhisperTokenizer, WhisperForConditionalGeneration, WhisperProcessor
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
import soundfile as sf
import ffmpeg
import os
from fpdf import FPDF
import time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import pandas as pd
# --- Module-level setup (runs once, at import time) ---

# Suppress every Python warning for the lifetime of the process.
warnings.filterwarnings("ignore")
# Fetch the NLTK 'punkt' tokenizer data and the 'stopwords' corpus.
nltk.download('punkt')
nltk.download('stopwords')
# Previous loading code for the "NbAiLabBeta/nb-whisper-large" checkpoint, kept for reference.
#tokenizer = WhisperTokenizer.from_pretrained("NbAiLabBeta/nb-whisper-large")
#model = WhisperForConditionalGeneration.from_pretrained("NbAiLabBeta/nb-whisper-large")
#processor = WhisperProcessor.from_pretrained("NbAiLabBeta/nb-whisper-large")
# NOTE(review): generation_config is not referenced anywhere else in the visible
# part of this file, so these values do not appear to influence model.generate().
# NOTE(review): a fractional "top_k" and the key "max_output_tokens" do not look
# like Hugging Face generate() arguments -- confirm before wiring this dict in.
generation_config = {
"temperature": 0.8,
"top_p": 0.9,
"top_k": 0.5,
"max_output_tokens": 2048
}
# Load the processor and the speech-to-text model from the same checkpoint id.
processor = AutoProcessor.from_pretrained("NbAiLabBeta/nb-whisper-large-semantic")
model = AutoModelForSpeechSeq2Seq.from_pretrained("NbAiLabBeta/nb-whisper-large-semantic")
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# NOTE(review): torch_dtype is assigned here but not used in the visible part of this file.
torch_dtype = torch.float32
# Move the model weights to the selected device.
model.to(device)
def convert_audio_format(audio_path):
    """Convert an audio file to a 16 kHz, single-channel WAV file with ffmpeg.

    The result is always written to ``converted_audio.wav`` in the current
    working directory, overwriting any previous conversion.

    Args:
        audio_path: Path of the input audio file (any format ffmpeg can read).

    Returns:
        The path of the converted WAV file.
    """
    output_path = "converted_audio.wav"
    # ac=1 downmixes to mono. Without it a stereo input stays stereo, and
    # soundfile then returns a 2-D (frames, channels) array that the caller
    # slices and feeds to the speech processor as if it were 1-D.
    stream = ffmpeg.input(audio_path).output(
        output_path, format='wav', ar='16000', ac=1
    )
    stream.run(overwrite_output=True)
    return output_path
def transcribe_audio(audio_file, batch_size=4):
    """Transcribe an audio file with the module-level speech model.

    The file is converted to 16 kHz WAV, cut into 30-second chunks, and the
    chunks are decoded in batches of ``batch_size``.

    Args:
        audio_file: Path of the audio file to transcribe.
        batch_size: Number of 30-second chunks decoded per ``generate`` call.
            Non-integer values (e.g. from a UI slider) are truncated and
            values below 1 are treated as 1.

    Returns:
        A ``(transcription, result)`` tuple: the plain transcript, and a
        report string holding the transcript, elapsed seconds and word count.
    """
    start_time = time.time()

    # range() below needs a positive integer step.
    batch_size = max(1, int(batch_size))

    audio_path = convert_audio_format(audio_file)
    audio_input, sample_rate = sf.read(audio_path)

    # soundfile yields a (frames, channels) array for multi-channel files;
    # average the channels so every chunk is a 1-D waveform.
    if audio_input.ndim > 1:
        audio_input = audio_input.mean(axis=1)

    chunk_size = 16000 * 30  # 30 seconds of samples at 16 kHz
    chunks = [
        audio_input[start:start + chunk_size]
        for start in range(0, len(audio_input), chunk_size)
    ]

    pieces = []
    for start in range(0, len(chunks), batch_size):
        batch_chunks = chunks[start:start + batch_size]
        # Pad every chunk to the full 30-second window. Padding only to the
        # longest chunk in the batch produces too few mel frames for a short
        # clip or a short trailing batch, which the Whisper encoder rejects.
        inputs = processor(
            batch_chunks,
            sampling_rate=16000,
            return_tensors="pt",
            padding="max_length",
        )
        inputs = inputs.to(device)
        attention_mask = inputs.attention_mask.to(device) if 'attention_mask' in inputs else None
        with torch.no_grad():
            # NOTE(review): max_length=2048 is kept from the original; confirm
            # it does not exceed the decoder's maximum target length.
            output = model.generate(
                inputs.input_features,
                max_length=2048,
                num_beams=7,
                task="transcribe",
                attention_mask=attention_mask
            )
        pieces.extend(processor.batch_decode(output, skip_special_tokens=True))

    transcription = " ".join(pieces).strip()

    transcription_time = time.time() - start_time
    word_count = len(transcription.split())
    result = f"Transcription: {transcription}\n\nTime taken: {transcription_time:.2f} seconds\nNumber of words: {word_count}"
    return transcription, result
def save_to_pdf(transcription):
    """Write the transcription to ``transcription.pdf`` and return its path.

    Args:
        transcription: Text to place in the PDF body.

    Returns:
        The path of the generated PDF file.
    """
    # FPDF's built-in (core) fonts only cover Latin-1. Norwegian letters such
    # as æ, ø and å are fine, but any other character (curly quotes, dashes,
    # emoji) would raise an encoding error, so those are replaced with '?'.
    printable_text = transcription.encode("latin-1", "replace").decode("latin-1")

    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)
    pdf.multi_cell(0, 10, printable_text)
    pdf_output_path = "transcription.pdf"
    pdf.output(pdf_output_path)
    return pdf_output_path
def summarize_text(transcription):
    """Return a one-sentence extractive summary of the transcription.

    The text is split into sentences, each sentence is turned into a TF-IDF
    vector (Norwegian stop words removed), and the sentence most similar to
    the average of all sentence vectors is returned.

    Args:
        transcription: The text to summarize.

    Returns:
        The most representative sentence, ending in sentence punctuation, or
        an empty string when the transcription contains no text.
    """
    # Split after sentence-ending punctuation so each sentence keeps its own.
    sentences = [
        part.strip()
        for part in re.split(r"(?<=[.!?])\s+", transcription)
        if part.strip()
    ]
    if not sentences:
        return ""

    def _terminated(sentence):
        # Guarantee the summary ends with sentence punctuation, without doubling it.
        return sentence if sentence[-1] in ".!?" else sentence + "."

    if len(sentences) == 1:
        return _terminated(sentences[0])

    # scikit-learn only ships an 'english' built-in stop list; any other
    # string raises ValueError, so the Norwegian list comes from NLTK.
    vectorizer = TfidfVectorizer(stop_words=stopwords.words('norwegian'))
    try:
        X = vectorizer.fit_transform(sentences)
    except ValueError:
        # Raised when no vocabulary is left (e.g. only stop words remain).
        return _terminated(sentences[0])

    # Score each sentence against the mean TF-IDF vector and keep the best one.
    # With a single column of scores, the flat argmax is the sentence index.
    centroid = X.mean(axis=0)
    scores = X @ centroid.T
    best_index = int(scores.argmax())
    return _terminated(sentences[best_index])
# HTML
# Static HTML snippets for the page. Each one is a centered <div> wrapping an
# image that is loaded from a Hugging Face Space URL.
# banner_html: banner image, shown at 87% width.
banner_html = """
<div style="text-align: center;">
<img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/raw/main/Olas%20AudioSwitch%20Shop.png" alt="Banner" width="87%" height="auto">
</div>
"""
# image_html: secondary picture, shown at 50% width with a 20px top margin.
# NOTE(review): image_html is not referenced in the visible part of this file.
image_html = """
<div style="text-align: center; margin-top: 20px;">
<img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/raw/main/500x_picture.png" alt="picture" width="50%" height="auto">
</div>
"""
# Gradio interface
# Layout: banner, heading, audio upload, batch-size slider, three outputs
# (transcription report, downloadable PDF, summary) and a trigger button.
iface = gr.Blocks()

with iface:
    gr.HTML(banner_html)
    # The heading text was mojibake (UTF-8 decoded through a single-byte code
    # page); the intended characters are restored here.
    gr.Markdown("# 𝐍𝐯𝐢𝐝𝐢𝐚 𝐀𝟏𝟎𝟎 👋🏼👾🦾⚡ @{NbAiLab/whisper-norwegian-medium}\nUpload audio file: ☕")
    audio_input = gr.Audio(type="filepath")
    batch_size_input = gr.Slider(minimum=1, maximum=16, step=1, label="Batch Size")
    transcription_output = gr.Textbox(label="Transcription")
    pdf_output = gr.File(label="Download Transcription as PDF")
    summary_output = gr.Textbox(label="Summary")
    transcribe_button = gr.Button("Transcribe")

    def process_audio(audio_file, batch_size):
        """Transcribe the upload, then build the PDF and the summary.

        Args:
            audio_file: File path supplied by the audio component, or None
                when nothing has been uploaded.
            batch_size: Slider value; number of chunks decoded per batch.

        Returns:
            A ``(report, pdf_path, summary)`` tuple matching the three
            output components.
        """
        # Clicking the button without an upload passes None; report that
        # instead of letting the audio conversion fail on a missing path.
        if not audio_file:
            return "No audio file provided. Please upload a file first.", None, ""

        transcription, result = transcribe_audio(audio_file, int(batch_size))
        pdf_path = save_to_pdf(transcription)
        summary = summarize_text(transcription)
        return result, pdf_path, summary

    transcribe_button.click(
        fn=process_audio,
        inputs=[audio_input, batch_size_input],
        outputs=[transcription_output, pdf_output, summary_output],
    )

# Launch interface
iface.launch(share=True, debug=True)