Spaces:

camparchimedes
/

nb

Build error

App Files Files

camparchimedes committed on Aug 11

Commit

f7e87b9

•

1 Parent(s): 9e722fb

Update app.py

Browse files

Files changed (1) hide show

app.py +27 -38

app.py CHANGED Viewed

@@ -12,64 +12,50 @@ def convert_to_wav(audio_file):
     audio.export(wav_file, format="wav")
     return wav_file
 import torch
 from transformers import pipeline # AutoProcessor, AutoModelForSpeechSeq2Seq
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 torch_dtype = torch.float32
-# ASR pipeline
 asr = pipeline("automatic-speech-recognition", model="NbAiLabBeta/nb-whisper-large", device=device, torch_dtype=torch_dtype)
-# ASR
 def transcribe_audio(audio_file):
     if audio_file.endswith(".m4a"):
         audio_file = convert_to_wav(audio_file)
     start_time = time.time()
     with torch.no_grad():
-        output = asr(audio_file, chunk_length_s=30, generate_kwargs={"num_beams": 8}) # "task": "transcribe", "language": "no" + ..batch_size=10 ?
     transcription = output["text"]
     end_time = time.time()
     output_time = end_time - start_time
     word_count = len(transcription.split())
-    result = f"Time taken: {transcription_time:.2f} seconds\nNumber of words: {word_count}"
     return transcription.strip(), result
-# -------------------------------------[VERSION 3: full-on t5-base + NLTK + 3 styles for summarization]-------------------------------------
 import nltk
 from nltk.tokenize import word_tokenize, sent_tokenize
 from nltk.corpus import stopwords
 import networkx as nx
-from sklearn.feature_extraction.text import TfidfVectorizer # from before
-from sklearn.metrics.pairwise import cosine_similarity # from before
 import pandas as pd
 import numpy as np
-from rouge_score import rouge_scorer
-nltk.download('punkt') # from before
 nltk.download('stopwords')
 WHITESPACE_HANDLER = lambda k: re.sub('\s+', ' ', re.sub('\n+', ' ', k.strip()))
-# Clean text using Norwegian-specific replacements
 def clean_text(text):
     text = re.sub(r'https?:\/\/.*[\r\n]*', '', str(text), flags=re.MULTILINE)
     text = re.sub(r'\<a href', ' ', str(text))
@@ -98,15 +84,11 @@ def preprocess_text(text):
     except Exception as e:
         st.error(f"Error during text preprocessing: {e}")
         return None
-# ROUGE metric
-scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=False)
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 summarization_model = AutoModelForSeq2SeqLM.from_pretrained("t5-base", return_dict=True, torch_dtype=torch.float16)
 summarization_tokenizer = AutoTokenizer.from_pretrained("t5-base")
 summarization_model.to(device)
 def summarize_text(text):
@@ -128,7 +110,6 @@ def build_similarity_matrix(sentences, stop_words):
                 similarity_matrix.add_edge(i, j, weight=len(common_words))
     return similarity_matrix
 def graph_based_summary(text, num_paragraphs=3):
     sentences = text.strip().split(".")
     if len(sentences) < num_paragraphs:
@@ -142,7 +123,7 @@ def graph_based_summary(text, num_paragraphs=3):
     summary = [sent for _, sent in ranked_sentences[:num_paragraphs]]
     return summary
-def lex_rank_summary(text, num_paragraphs=3):
     sentences = nltk.sent_tokenize(text)
     if len(sentences) < num_paragraphs:
         return sentences
@@ -150,26 +131,37 @@ def lex_rank_summary(text, num_paragraphs=3):
     vectorizer = TfidfVectorizer(stop_words=list(stop_words))
     X = vectorizer.fit_transform(sentences)
     similarity_matrix = cosine_similarity(X, X)
     nx_graph = nx.from_numpy_array(similarity_matrix)
-    scores = nx.pagerank_numpy(nx_graph)
     ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
     summary = [ranked_sentences[i][1] for i in range(num_paragraphs)]
     return summary
 def text_rank_summary(text, num_paragraphs=3):
     sentences = nltk.sent_tokenize(text)
     if len(sentences) < num_paragraphs:
         return sentences
     stop_words = set(stopwords.words('norwegian'))
     vectorizer = TfidfVectorizer(stop_words=list(stop_words))
     X = vectorizer.fit_transform(sentences)
     similarity_matrix = cosine_similarity(X, X)
-    nx_graph = nx.from_numpy_array(similarity_matrix)
-    scores = nx.pagerank(nx_graph)
-    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
-    summary = [ranked_sentences[i][1] for i in range(num_paragraphs)]
-    return summary
 import gradio as gr
@@ -194,7 +186,6 @@ def save_to_pdf(transcription, summary):
     pdf.output(pdf_output_path)
     return pdf_output_path
 banner_html = """
 <div style="text-align: center;">
     <img src="https://huggingface.co/spaces/camparchimedes/transcription_app/raw/main/picture.png" alt="Banner" width="100%" height="auto">
@@ -267,7 +258,6 @@ with iface:
                 outputs=summary_output
             )
         with gr.TabItem("Summary_t3"):
             summary_output = gr.Textbox(label="Summary | TextRank")
             summarize_button = gr.Button("Summarize")
@@ -284,7 +274,6 @@ with iface:
                 outputs=summary_output
             )
         with gr.TabItem("Download PDF"):
             pdf_transcription_only = gr.Button("Download PDF with Transcription Only")
             pdf_summary_only = gr.Button("Download PDF with Summary Only")

     audio.export(wav_file, format="wav")
     return wav_file
 import torch
 from transformers import pipeline # AutoProcessor, AutoModelForSpeechSeq2Seq
 # Run inference on the GPU when CUDA is available, otherwise on the CPU.
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 # Precision handed to the speech-recognition pipeline below.
 torch_dtype = torch.float32
 # Speech-recognition pipeline built from the NbAiLabBeta/nb-whisper-large
 # checkpoint; created once at import time and called by transcribe_audio().
 asr = pipeline("automatic-speech-recognition", model="NbAiLabBeta/nb-whisper-large", device=device, torch_dtype=torch_dtype)
 def transcribe_audio(audio_file):
     """Transcribe an audio file and report timing and word count.

     Paths ending in ``.m4a`` are first converted with ``convert_to_wav``; the
     (possibly converted) path is then passed to the module-level ``asr``
     pipeline.

     Args:
         audio_file: Path to the audio file, as a string.

     Returns:
         A ``(transcription, result)`` tuple: the stripped transcription text
         and a two-line status string with the elapsed seconds and the number
         of whitespace-separated words.
     """
     if audio_file.endswith(".m4a"):
         audio_file = convert_to_wav(audio_file)

     start_time = time.time()
     with torch.no_grad():
         # `batch_size` is an argument of the pipeline call itself. Inside
         # `generate_kwargs` it would be forwarded to `model.generate()`,
         # which rejects keyword arguments it does not recognise.
         output = asr(
             audio_file,
             chunk_length_s=30,
             batch_size=10,
             generate_kwargs={"num_beams": 8},
         )
     output_time = time.time() - start_time

     transcription = output["text"]
     word_count = len(transcription.split())
     result = f"Time taken: {output_time:.2f} seconds\nNumber of words: {word_count}"
     return transcription.strip(), result
+# [VERSION 3: full-on w/ 3 styles for summarization]
 import nltk
 from nltk.tokenize import word_tokenize, sent_tokenize
 from nltk.corpus import stopwords
 import networkx as nx
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
 import pandas as pd
 import numpy as np
+import re
+nltk.download('punkt')
 nltk.download('stopwords')
 # Trim the text and collapse every run of whitespace (newlines included)
 # into a single space. Raw strings keep `\s` from being read as an invalid
 # string escape.
 WHITESPACE_HANDLER = lambda k: re.sub(r'\s+', ' ', re.sub(r'\n+', ' ', k.strip()))
 # NOTE(review): the module-level `text = transcription` alias was dropped.
 # In the visible code `transcription` exists only as a local inside
 # transcribe_audio(), so evaluating it at import time raises NameError.
 # Pass the transcription to the text helpers as an argument instead.
 def clean_text(text):
     text = re.sub(r'https?:\/\/.*[\r\n]*', '', str(text), flags=re.MULTILINE)
     text = re.sub(r'\<a href', ' ', str(text))
     except Exception as e:
         st.error(f"Error during text preprocessing: {e}")
         return None
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 # Load the t5-base sequence-to-sequence checkpoint in half precision.
 # NOTE(review): `device` falls back to CPU when CUDA is unavailable, and the
 # ASR pipeline is created with float32 — confirm float16 is intended on CPU.
 summarization_model = AutoModelForSeq2SeqLM.from_pretrained("t5-base", return_dict=True, torch_dtype=torch.float16)
 # Tokenizer from the same "t5-base" checkpoint as the model above.
 summarization_tokenizer = AutoTokenizer.from_pretrained("t5-base")
 # Move the model weights to the selected device.
 summarization_model.to(device)
 def summarize_text(text):
                 similarity_matrix.add_edge(i, j, weight=len(common_words))
     return similarity_matrix
 def graph_based_summary(text, num_paragraphs=3):
     sentences = text.strip().split(".")
     if len(sentences) < num_paragraphs:
     summary = [sent for _, sent in ranked_sentences[:num_paragraphs]]
     return summary
+def lex_rank_summary(text, num_paragraphs=3, threshold=0.1):
     sentences = nltk.sent_tokenize(text)
     if len(sentences) < num_paragraphs:
         return sentences
     vectorizer = TfidfVectorizer(stop_words=list(stop_words))
     X = vectorizer.fit_transform(sentences)
     similarity_matrix = cosine_similarity(X, X)
+    for i in range(len(similarity_matrix)): # threshold
+        for j in range(len(similarity_matrix[i])):
+            if similarity_matrix[i][j] < threshold:
+                similarity_matrix[i][j] = 0.0
     nx_graph = nx.from_numpy_array(similarity_matrix)
+    scores = nx.pagerank(nx_graph)
     ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
     summary = [ranked_sentences[i][1] for i in range(num_paragraphs)]
     return summary
 def text_rank_summary(text, num_paragraphs=3):
     """Summarise ``text`` by ranking its sentences with PageRank.

     Sentences are vectorised with TF-IDF (Norwegian stop words removed),
     their pairwise cosine similarities become the edge weights of a graph,
     and PageRank scores on that graph decide which sentences are kept.

     Args:
         text: Text to summarise.
         num_paragraphs: Number of top-ranked sentences to keep.

     Returns:
         The selected sentences joined by single spaces, highest score first.
         When the text has fewer than ``num_paragraphs`` sentences, all of
         them are returned, joined the same way.
     """
     # NOTE(review): sent_tokenize uses its default (English) model while the
     # stop words are Norwegian — confirm whether language='norwegian' is wanted.
     sentences = nltk.sent_tokenize(text)
     if len(sentences) < num_paragraphs:
         # Join here too, so every path returns a string rather than a list
         # on short inputs and a string otherwise.
         return ' '.join(sentences)

     stop_words = set(stopwords.words('norwegian'))
     vectorizer = TfidfVectorizer(stop_words=list(stop_words))
     X = vectorizer.fit_transform(sentences)
     similarity_matrix = cosine_similarity(X, X)

     # Nodes are sentence indices; edge weights are the cosine similarities.
     nx_graph = nx.from_numpy_array(similarity_matrix)
     scores = nx.pagerank(nx_graph)

     # Highest score first; ties fall back to comparing the sentence text.
     ranked_sentences = sorted(
         ((scores[i], s) for i, s in enumerate(sentences)), reverse=True
     )
     # Clamp at zero so a negative count selects nothing instead of slicing
     # from the end of the ranking.
     top_sentences = ranked_sentences[:max(num_paragraphs, 0)]
     return ' '.join(sentence for _, sentence in top_sentences)
 import gradio as gr
     pdf.output(pdf_output_path)
     return pdf_output_path
 banner_html = """
 <div style="text-align: center;">
     <img src="https://huggingface.co/spaces/camparchimedes/transcription_app/raw/main/picture.png" alt="Banner" width="100%" height="auto">
                 outputs=summary_output
             )
         with gr.TabItem("Summary_t3"):
             summary_output = gr.Textbox(label="Summary | TextRank")
             summarize_button = gr.Button("Summarize")
                 outputs=summary_output
             )
         with gr.TabItem("Download PDF"):
             pdf_transcription_only = gr.Button("Download PDF with Transcription Only")
             pdf_summary_only = gr.Button("Download PDF with Summary Only")