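# NOTE: the next line is page chrome captured together with this file; it is
# kept as a comment so the module can be parsed.
# Spaces: Build error / Build error
# app.py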
import gradio as gr | |
import warnings | |
import torch | |
#from transformers import WhisperTokenizer, WhisperForConditionalGeneration, WhisperProcessor | |
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq | |
import soundfile as sf | |
import ffmpeg | |
import os | |
from fpdf import FPDF | |
import time | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
from sklearn.cluster import KMeans | |
import re | |
import nltk | |
from nltk.tokenize import word_tokenize | |
from nltk.corpus import stopwords | |
import pandas as pd | |
# Silence library warnings for the whole process.
warnings.filterwarnings("ignore")

# Fetch the NLTK tokenizer models and stop-word lists backing the
# `nltk.tokenize` / `nltk.corpus` imports above.
nltk.download('punkt')
nltk.download('stopwords')

#tokenizer = WhisperTokenizer.from_pretrained("NbAiLabBeta/nb-whisper-large")
#model = WhisperForConditionalGeneration.from_pretrained("NbAiLabBeta/nb-whisper-large")
#processor = WhisperProcessor.from_pretrained("NbAiLabBeta/nb-whisper-large")

# NOTE(review): nothing visible in this file reads `generation_config`.
# `top_k` is normally an integer token count and `max_output_tokens` is not a
# `transformers` generation argument -- confirm intent before wiring it in.
generation_config = {
    "temperature": 0.8,
    "top_p": 0.9,
    "top_k": 0.5,
    "max_output_tokens": 2048,
}

# Single source of truth for the checkpoint used by processor and model.
MODEL_ID = "NbAiLabBeta/nb-whisper-large-semantic"

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch_dtype = torch.float32

processor = AutoProcessor.from_pretrained(MODEL_ID)
# Pass the dtype explicitly so `torch_dtype` actually governs the weights.
model = AutoModelForSpeechSeq2Seq.from_pretrained(MODEL_ID, torch_dtype=torch_dtype)
model.to(device)
def convert_audio_format(audio_path, output_path="converted_audio.wav"):
    """Re-encode an audio file as a 16 kHz mono WAV and return the new path.

    Args:
        audio_path: Path of the source audio file, in any format ffmpeg reads.
        output_path: Destination WAV path. An existing file is overwritten.

    Returns:
        ``output_path``.

    Raises:
        ValueError: If ``audio_path`` is ``None`` or empty.
    """
    if not audio_path:
        raise ValueError("audio_path must be a non-empty path to an audio file")

    # `ac=1` down-mixes to a single channel so that readers of the WAV get a
    # 1-D sample array instead of a (frames, channels) matrix.
    stream = ffmpeg.input(audio_path).output(
        output_path, format='wav', ar='16000', ac=1
    )
    stream.run(overwrite_output=True)
    return output_path
def transcribe_audio(audio_file, batch_size=4):
    """Transcribe an audio file in 30-second chunks.

    The file is converted with ``convert_audio_format``, read with
    ``soundfile``, split into 30 s chunks, and decoded ``batch_size`` chunks
    at a time with the module-level ``processor`` and ``model``.

    Args:
        audio_file: Path of the audio file to transcribe.
        batch_size: Number of chunks sent to the model per call. Values that
            are whole-number floats (e.g. from a UI slider) are accepted.

    Returns:
        A ``(transcription, result)`` tuple: the plain transcript, and a
        report string holding the transcript, elapsed seconds and word count.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # `range()` needs a positive integer step; validate before any heavy work.
    batch_size = int(batch_size)
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    start_time = time.time()
    audio_path = convert_audio_format(audio_file)
    audio_input, sample_rate = sf.read(audio_path)

    # Multi-channel files are read as (frames, channels); average to mono so
    # every chunk is a 1-D waveform.
    if audio_input.ndim > 1:
        audio_input = audio_input.mean(axis=1)

    # Use the rate that was actually read instead of assuming 16 kHz.
    chunk_size = int(sample_rate) * 30
    chunks = [
        audio_input[start:start + chunk_size]
        for start in range(0, len(audio_input), chunk_size)
    ]

    # Never ask for more tokens than the decoder has positions for.
    max_length = min(2048, getattr(model.config, "max_target_positions", 2048))

    pieces = []
    for start in range(0, len(chunks), batch_size):
        batch_chunks = chunks[start:start + batch_size]
        # "max_length" pads every chunk to the feature extractor's full
        # window. `padding=True` pads only to the longest clip in the batch,
        # which yields too-short features whenever a batch has no full chunk.
        inputs = processor(
            batch_chunks,
            sampling_rate=sample_rate,
            return_tensors="pt",
            padding="max_length",
        )
        inputs = inputs.to(device)
        # Present only if the processor was configured to return it.
        attention_mask = inputs.get("attention_mask")
        with torch.no_grad():
            output = model.generate(
                inputs.input_features,
                max_length=max_length,
                num_beams=7,
                task="transcribe",
                attention_mask=attention_mask,
            )
        decoded = processor.batch_decode(output, skip_special_tokens=True)
        pieces.extend(text.strip() for text in decoded)

    transcription = " ".join(piece for piece in pieces if piece)

    transcription_time = time.time() - start_time
    word_count = len(transcription.split())
    result = f"Transcription: {transcription}\n\nTime taken: {transcription_time:.2f} seconds\nNumber of words: {word_count}"
    return transcription, result
def save_to_pdf(transcription, pdf_output_path="transcription.pdf"):
    """Write the transcription to a single-font PDF and return the file path.

    Args:
        transcription: Text to place in the document. ``None`` is treated as
            an empty string.
        pdf_output_path: Destination path of the PDF.

    Returns:
        ``pdf_output_path``.
    """
    # The built-in "Arial" font only covers Latin-1. Substitute anything
    # outside that range with "?" instead of letting the PDF writer raise an
    # encoding error on e.g. typographic quotes or emoji.
    printable = (transcription or "").encode("latin-1", "replace").decode("latin-1")

    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)
    pdf.multi_cell(0, 10, printable)
    pdf.output(pdf_output_path)
    return pdf_output_path
def summarize_text(transcription, num_sentences=1):
    """Return an extractive summary made of the most central sentence(s).

    Sentences are embedded with TF-IDF and ranked by their similarity to the
    centroid of all sentences; the best ``num_sentences`` are returned in
    their original order.

    Args:
        transcription: Text to summarise. ``None`` or blank text yields ``""``.
        num_sentences: How many sentences to keep (values below 1 count as 1).

    Returns:
        The summary, always ending in sentence punctuation, or ``""`` when
        there is nothing to summarise.
    """
    num_sentences = max(1, int(num_sentences))

    # Split after sentence punctuation, keeping the punctuation attached, so
    # no period has to be re-added (and none is doubled) when joining.
    sentences = [
        part.strip()
        for part in re.split(r"(?<=[.!?])\s+", transcription or "")
        if part.strip()
    ]
    if not sentences:
        return ""

    if len(sentences) <= num_sentences:
        chosen = sentences
    else:
        # scikit-learn only accepts 'english' as a stop-word *name*; other
        # languages must be supplied as an explicit list.
        vectorizer = TfidfVectorizer(stop_words=stopwords.words('norwegian'))
        try:
            X = vectorizer.fit_transform(sentences)
        except ValueError:
            # Raised when nothing but stop words is left; fall back to the
            # leading sentences rather than failing the whole request.
            chosen = sentences[:num_sentences]
        else:
            kmeans = KMeans(n_clusters=1, n_init=10, random_state=0)
            kmeans.fit(X)
            centroid = kmeans.cluster_centers_[0]
            # One score per sentence: its projection onto the centroid.
            # (`predict` returns cluster labels, not sentence indices.)
            scores = X.dot(centroid)
            ranked = sorted(
                range(len(sentences)), key=lambda i: scores[i], reverse=True
            )
            chosen = [sentences[i] for i in sorted(ranked[:num_sentences])]

    summary = " ".join(chosen)
    if summary[-1] not in ".!?":
        summary += "."
    return summary
# HTML snippets rendered at the top of the page.
# NOTE(review): both images are fetched through `raw/main`. If the files are
# stored via Git LFS that endpoint may serve a pointer file instead of the
# image -- confirm the images actually render.
banner_html = """
<div style="text-align: center;">
<img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/raw/main/Olas%20AudioSwitch%20Shop.png" alt="Banner" width="87%" height="auto">
</div>
"""

image_html = """
<div style="text-align: center; margin-top: 20px;">
<img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/raw/main/500x_picture.png" alt="picture" width="50%" height="auto">
</div>
"""
# Gradio interface
def process_audio(audio_file, batch_size):
    """Run the full pipeline for one upload.

    Args:
        audio_file: Path of the uploaded audio, or ``None`` if nothing was
            uploaded.
        batch_size: Slider value; coerced to ``int`` before use.

    Returns:
        ``(result, pdf_path, summary)`` in the order of the output components
        wired to the button below.

    Raises:
        gr.Error: If no audio file was provided.
    """
    if audio_file is None:
        # Show a readable message in the UI instead of a traceback.
        raise gr.Error("Please upload an audio file before transcribing.")
    transcription, result = transcribe_audio(audio_file, int(batch_size))
    pdf_path = save_to_pdf(transcription)
    summary = summarize_text(transcription)
    return result, pdf_path, summary


iface = gr.Blocks()
with iface:
    gr.HTML(banner_html)
    # The heading had been mis-decoded (UTF-8 read as ISO-8859-7). Only the
    # glyphs that could be recovered unambiguously are restored here.
    # TODO(review): a four-character word after "Nvidia", a hand emoji and a
    # symbol after "Upload audio file:" were lost -- restore from the original.
    # NOTE(review): the heading names NbAiLab/whisper-norwegian-medium, while
    # this file loads NbAiLabBeta/nb-whisper-large-semantic -- confirm.
    gr.Markdown("# 𝐍𝐯𝐢𝐝𝐢𝐚 👾🦾⚡ @{NbAiLab/whisper-norwegian-medium}\nUpload audio file:")
    audio_input = gr.Audio(type="filepath")
    batch_size_input = gr.Slider(minimum=1, maximum=16, step=1, label="Batch Size")
    transcription_output = gr.Textbox(label="Transcription")
    pdf_output = gr.File(label="Download Transcription as PDF")
    summary_output = gr.Textbox(label="Summary")
    transcribe_button = gr.Button("Transcribe")

    transcribe_button.click(
        fn=process_audio,
        inputs=[audio_input, batch_size_input],
        outputs=[transcription_output, pdf_output, summary_output],
    )

# Launch interface (only when run as a script, so importing this module does
# not start a server).
if __name__ == "__main__":
    iface.launch(share=True, debug=True)