# app.py
# Version: 1.07 (08.24.24), ALPHA
#---------------------------------------------------------------------------------------------------------------------------------------------
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#---------------------------------------------------------------------------------------------------------------------------------------------
import gradio as gr
from PIL import Image
from pydub import AudioSegment
import os
import re
import time
import warnings
#import datetime
import subprocess
from pathlib import Path
from fpdf import FPDF
import psutil
from gpuinfo import GPUInfo
#import pandas as pd
#import csv
import numpy as np
import torch
#import torchaudio
#import torchaudio.transforms as transforms
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
import spacy
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
#---------------------------------------------------------------------------------------------------------------------------------------------
warnings.filterwarnings("ignore")
HEADER_INFO = """
# WEB APP ✨| Norwegian WHISPER Model
Switch Work [Transkribering av lydfiler til norsk skrift]
""".strip()
LOGO = "https://huggingface.co/spaces/camparchimedes/transcription_app/resolve/main/pic09w9678yhit.png"
SIDEBAR_INFO = f"""
"""
def convert_to_wav(filepath):
_,file_ending = os.path.splitext(f'{filepath}')
audio_file = filepath.replace(file_ending, ".wav")
os.system(f'ffmpeg -i "{filepath}" -ar 16000 -ac 1 -c:a pcm_s16le "{audio_file}"')
return audio_file
device = "cuda" if torch.cuda.is_available() else "cpu"
pipe = pipeline(
task="automatic-speech-recognition",
model="NbAiLab/nb-whisper-large",
chunk_length_s=30,
device=device,
)
def transcribe_audio(audio_file, batch_size=10):
#if audio_file.endswith(".m4a"):
#audio_file = convert_to_wav(audio_file)
start_time = time.time()
outputs = pipe(audio_file, batch_size=batch_size, return_timestamps=False, generate_kwargs={'task': 'transcribe', 'language': 'no'}) # skip_special_tokens=True
text = outputs["text"]
end_time = time.time()
output_time = end_time - start_time
word_count = len(text.split())
memory = psutil.virtual_memory()
gpu_utilization, gpu_memory = GPUInfo.gpu_usage()
gpu_utilization = gpu_utilization[0] if len(gpu_utilization) > 0 else 0
gpu_memory = gpu_memory[0] if len(gpu_memory) > 0 else 0
system_info = f"""
*Memory: {memory.total / (1024 * 1024 * 1024):.2f}GB, used: {memory.percent}%, available: {memory.available / (1024 * 1024 * 1024):.2f}GB.*
*Processing time: {output_time:.2f} seconds.*
*Number of words: {word_count}*
*GPU Utilization: {gpu_utilization}%, GPU Memory: {gpu_memory}*"""
return text.strip(), system_info
#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
# Clean/preprocess text
def clean_text(text):
text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)
text = re.sub(r'[^\w\s]', '', text)
text = re.sub(r'\s+', ' ', text).strip()
return text
nlp = spacy.blank("nb") # 'nb' ==> codename = Norwegian Bokmål
nlp.add_pipe('sentencizer')
spacy_stop_words = spacy.lang.nb.stop_words.STOP_WORDS
def preprocess_text(text):
# Process the text with SpaCy
doc = nlp(text)
# SpaCy's stop top wrds direct
stop_words = spacy_stop_words
# Filter out stop words
words = [token.text for token in doc if token.text.lower() not in stop_words]
return ' '.join(words)
# Summarize w/T5 model
def summarize_text(text):
preprocessed_text = preprocess_text(text)
inputs = summarization_tokenizer(preprocessed_text, max_length=1024, return_tensors="pt", truncation=True)
inputs = inputs.to(device)
summary_ids = summarization_model.generate(inputs.input_ids, num_beams=5, max_length=150, early_stopping=True)
return summarization_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
# Builds similarity matrix
def build_similarity_matrix(sentences, stop_words):
similarity_matrix = nx.Graph()
for i, tokens_a in enumerate(sentences):
for j, tokens_b in enumerate(sentences):
if i != j:
common_words = set(tokens_a) & set(tokens_b)
similarity_matrix.add_edge(i, j, weight=len(common_words))
return similarity_matrix
# "Graph-based summarization" =====>
def graph_based_summary(text, num_paragraphs=3):
doc = nlp(text)
sentences = [sent.text for sent in doc.sents]
if len(sentences) < num_paragraphs:
return sentences
sentence_tokens = [nlp(sent) for sent in sentences]
stop_words = spacy_stop_words
filtered_tokens = [[token.text for token in tokens if token.text.lower() not in stop_words] for tokens in sentence_tokens]
similarity_matrix = build_similarity_matrix(filtered_tokens, stop_words)
scores = nx.pagerank(similarity_matrix)
ranked_sentences = sorted(((scores[i], sent) for i, sent in enumerate(sentences)), reverse=True)
return ' '.join([sent for _, sent in ranked_sentences[:num_paragraphs]])
# LexRank
def lex_rank_summary(text, num_paragraphs=3, threshold=0.1):
doc = nlp(text)
sentences = [sent.text for sent in doc.sents]
if len(sentences) < num_paragraphs:
return sentences
stop_words = spacy_stop_words
vectorizer = TfidfVectorizer(stop_words=list(stop_words))
X = vectorizer.fit_transform(sentences)
similarity_matrix = cosine_similarity(X, X)
# Apply threshold@similarity matrix
similarity_matrix[similarity_matrix < threshold] = 0
nx_graph = nx.from_numpy_array(similarity_matrix)
scores = nx.pagerank(nx_graph)
ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
return ' '.join([ranked_sentences[i][1] for i in range(num_paragraphs)])
# TextRank
def text_rank_summary(text, num_paragraphs=3):
doc = nlp(text)
sentences = [sent.text for sent in doc.sents]
if len(sentences) < num_paragraphs:
return sentences
stop_words = spacy_stop_words
vectorizer = TfidfVectorizer(stop_words=list(stop_words))
X = vectorizer.fit_transform(sentences)
similarity_matrix = cosine_similarity(X, X)
nx_graph = nx.from_numpy_array(similarity_matrix)
scores = nx.pagerank(nx_graph)
ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
return ' '.join([ranked_sentences[i][1] for i in range(num_paragraphs)])
# Save text+summary/PDF
def save_to_pdf(text, summary):
pdf = FPDF()
pdf.add_page()
pdf.set_font("Arial", size=12)
if text:
pdf.multi_cell(0, 10, "Text:\n" + text)
pdf.ln(10) # Paragraph space
if summary:
pdf.multi_cell(0, 10, "Summary:\n" + summary)
pdf_output_path = "transcription.pdf"
pdf.output(pdf_output_path)
return pdf_output_path
iface = gr.Blocks()
with iface:
gr.HTML(SIDEBAR_INFO)
gr.Markdown(HEADER_INFO)
with gr.Tabs():
with gr.TabItem("Transcription"):
audio_input = gr.Audio(type="filepath")
text_output = gr.Textbox(label="Text")
result_output = gr.Textbox(label="Transcription Details")
transcribe_button = gr.Button("Transcribe")
transcribe_button.click(fn=transcribe_audio, inputs=[audio_input], outputs=[text_output, result_output])
with gr.TabItem("Summary | Graph-based"):
summary_output = gr.Textbox(label="Summary | Graph-based")
summarize_button = gr.Button("Summarize")
summarize_button.click(fn=lambda text: graph_based_summary(text), inputs=[text_output], outputs=[summary_output])
with gr.TabItem("Summary | LexRank"):
summary_output = gr.Textbox(label="Summary | LexRank")
summarize_button = gr.Button("Summarize")
summarize_button.click(fn=lambda text: lex_rank_summary(text), inputs=[text_output], outputs=[summary_output])
with gr.TabItem("Summary | TextRank"):
summary_output = gr.Textbox(label="Summary | TextRank")
summarize_button = gr.Button("Summarize")
summarize_button.click(fn=lambda text: text_rank_summary(text), inputs=[text_output], outputs=[summary_output])
with gr.TabItem("Download PDF"):
pdf_text_only = gr.Button("Download PDF with Text Only")
pdf_summary_only = gr.Button("Download PDF with Summary Only")
pdf_both = gr.Button("Download PDF with Both")
pdf_output = gr.File(label="Download PDF")
pdf_text_only.click(fn=lambda text: save_to_pdf(text, ""), inputs=[text_output], outputs=[pdf_output])
pdf_summary_only.click(fn=lambda summary: save_to_pdf("", summary), inputs=[summary_output], outputs=[pdf_output])
pdf_both.click(fn=lambda text, summary: save_to_pdf(text, summary), inputs=[text_output, summary_output], outputs=[pdf_output])
iface.launch(share=True, debug=True)