Spaces: Build error
camparchimedes committed
Commit • f7e87b9 • 1 Parent(s): 9e722fb
Update app.py
app.py CHANGED
@@ -12,64 +12,50 @@ def convert_to_wav(audio_file):
     audio.export(wav_file, format="wav")
     return wav_file
 
-
-
 import torch
 from transformers import pipeline # AutoProcessor, AutoModelForSpeechSeq2Seq
 
-
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 torch_dtype = torch.float32
 
-# ASR pipeline
 asr = pipeline("automatic-speech-recognition", model="NbAiLabBeta/nb-whisper-large", device=device, torch_dtype=torch_dtype)
 
-# ASR
 def transcribe_audio(audio_file):
     if audio_file.endswith(".m4a"):
         audio_file = convert_to_wav(audio_file)
 
     start_time = time.time()
 
-
     with torch.no_grad():
-        output = asr(audio_file, chunk_length_s=30, generate_kwargs={"num_beams": 8}) # "task": "transcribe", "language": "no" + ..batch_size=10 ?
+        output = asr(audio_file, chunk_length_s=30, generate_kwargs={"num_beams": 8, "batch_size": 10}) # "task": "transcribe", "language": "no" + ..batch_size=10 ?
 
     transcription = output["text"]
     end_time = time.time()
-
     output_time = end_time - start_time
     word_count = len(transcription.split())
 
-    result = f"Time taken: {
+    result = f"Time taken: {output_time:.2f} seconds\nNumber of words: {word_count}"
 
     return transcription.strip(), result
 
-
-
-
-
-
-
-
-# -------------------------------------[VERSION 3: full-on t5-base + NLTK + 3 styles for summarization]-------------------------------------
+# [VERSION 3: full-on w/ 3 styles for summarization]
 import nltk
 from nltk.tokenize import word_tokenize, sent_tokenize
 from nltk.corpus import stopwords
 import networkx as nx
-from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.metrics.pairwise import cosine_similarity
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
 import pandas as pd
 import numpy as np
-
+import re
 
-nltk.download('punkt')
+nltk.download('punkt')
 nltk.download('stopwords')
 
-
 WHITESPACE_HANDLER = lambda k: re.sub('\s+', ' ', re.sub('\n+', ' ', k.strip()))
 
-
+text = transcription
+
 def clean_text(text):
     text = re.sub(r'https?:\/\/.*[\r\n]*', '', str(text), flags=re.MULTILINE)
     text = re.sub(r'\<a href', ' ', str(text))
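A note on the changed asr(...) call above: in the transformers pipeline API, batch_size is an argument of the pipeline call itself (it controls how many 30-second chunks are decoded per forward pass), while generate_kwargs is forwarded to model.generate(), which does not accept batch_size; recent transformers releases reject unrecognized generation kwargs. A hedged sketch of the more conventional spelling, reusing the objects defined in this file:

    with torch.no_grad():
        output = asr(
            audio_file,
            chunk_length_s=30,  # split long audio into 30 s chunks
            batch_size=10,      # decode several chunks per forward pass
            generate_kwargs={"num_beams": 8, "task": "transcribe", "language": "no"},
        )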
@@ -98,15 +84,11 @@ def preprocess_text(text):
     except Exception as e:
         st.error(f"Error during text preprocessing: {e}")
         return None
-# ROUGE metric
-scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=False)
-
 
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 
 summarization_model = AutoModelForSeq2SeqLM.from_pretrained("t5-base", return_dict=True, torch_dtype=torch.float16)
 summarization_tokenizer = AutoTokenizer.from_pretrained("t5-base")
-
 summarization_model.to(device)
 
 def summarize_text(text):
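The body of summarize_text sits outside the context lines of this hunk. For orientation only, a minimal sketch of how a t5-base summarization call typically looks with the objects loaded above (hypothetical helper name; also note torch.float16 weights generally require a CUDA device, so a CPU-only Space would want float32 here):

    def summarize_text_sketch(text):
        # T5 models are trained with task prefixes; "summarize: " selects summarization
        inputs = summarization_tokenizer(
            "summarize: " + WHITESPACE_HANDLER(str(text)),
            return_tensors="pt", max_length=512, truncation=True,
        ).to(device)
        with torch.no_grad():
            ids = summarization_model.generate(
                inputs.input_ids, max_length=150, num_beams=4, early_stopping=True,
            )
        return summarization_tokenizer.decode(ids[0], skip_special_tokens=True)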
@@ -128,7 +110,6 @@ def build_similarity_matrix(sentences, stop_words):
         similarity_matrix.add_edge(i, j, weight=len(common_words))
     return similarity_matrix
 
-
 def graph_based_summary(text, num_paragraphs=3):
     sentences = text.strip().split(".")
     if len(sentences) < num_paragraphs:
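An aside on graph_based_summary: splitting on "." produces an empty trailing string and breaks on abbreviations, while the NLTK tokenizer imported above handles both; an equivalent but more robust first line would be:

    sentences = [s for s in nltk.sent_tokenize(text.strip()) if s]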
@@ -142,7 +123,7 @@ def graph_based_summary(text, num_paragraphs=3):
     summary = [sent for _, sent in ranked_sentences[:num_paragraphs]]
     return summary
 
-def lex_rank_summary(text, num_paragraphs=3):
+def lex_rank_summary(text, num_paragraphs=3, threshold=0.1):
     sentences = nltk.sent_tokenize(text)
     if len(sentences) < num_paragraphs:
         return sentences
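The new threshold parameter follows the original LexRank formulation (Erkan & Radev, 2004), in which sentence pairs whose cosine similarity falls below a cutoff are disconnected before PageRank runs; the loop implementing it appears in the next hunk. Since cosine_similarity returns a NumPy array, that loop also has a one-line vectorized equivalent:

    similarity_matrix[similarity_matrix < threshold] = 0.0  # same effect as the nested loop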
@@ -150,26 +131,37 @@ def lex_rank_summary(text, num_paragraphs=3):
     vectorizer = TfidfVectorizer(stop_words=list(stop_words))
     X = vectorizer.fit_transform(sentences)
     similarity_matrix = cosine_similarity(X, X)
+
+
+    for i in range(len(similarity_matrix)): # threshold
+        for j in range(len(similarity_matrix[i])):
+            if similarity_matrix[i][j] < threshold:
+                similarity_matrix[i][j] = 0.0
+
     nx_graph = nx.from_numpy_array(similarity_matrix)
-    scores = nx.
+    scores = nx.pagerank(nx_graph)
     ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
     summary = [ranked_sentences[i][1] for i in range(num_paragraphs)]
     return summary
 
+
 def text_rank_summary(text, num_paragraphs=3):
     sentences = nltk.sent_tokenize(text)
     if len(sentences) < num_paragraphs:
         return sentences
+
     stop_words = set(stopwords.words('norwegian'))
     vectorizer = TfidfVectorizer(stop_words=list(stop_words))
     X = vectorizer.fit_transform(sentences)
     similarity_matrix = cosine_similarity(X, X)
-    nx_graph = nx.from_numpy_array(similarity_matrix)
-    scores = nx.pagerank(nx_graph)
-    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
-    summary = [ranked_sentences[i][1] for i in range(num_paragraphs)]
-    return summary
+
+    nx_graph = nx.from_numpy_array(similarity_matrix) # graph, nodes (i.e sentences) & edges are similarity scores (is cool)
+    scores = nx.pagerank(nx_graph) # PageRank algorithm, scoring sentences
+    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True) # rank by PageRank scores
+
+    summary = [ranked_sentences[i][1] for i in range(num_paragraphs)] # top sentences for summary
+
+    return ' '.join(summary)
 
 
 import gradio as gr
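Two properties of the graph construction above are worth noting (general networkx behavior, not part of this commit): nx.from_numpy_array creates an edge only for nonzero matrix entries, so zeroing weak similarities genuinely sparsifies the graph that PageRank scores, and because cosine_similarity puts 1.0 on the diagonal, every node also carries a self-loop. A tiny self-contained check:

    import numpy as np
    import networkx as nx

    m = np.array([[1.0, 0.0, 0.3],
                  [0.0, 1.0, 0.0],
                  [0.3, 0.0, 1.0]])
    g = nx.from_numpy_array(m)
    print(sorted(g.edges(data="weight")))
    # [(0, 0, 1.0), (0, 2, 0.3), (1, 1, 1.0), (2, 2, 1.0)] -- zero entries yield no edge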
@@ -194,7 +186,6 @@ def save_to_pdf(transcription, summary):
     pdf.output(pdf_output_path)
     return pdf_output_path
 
-
banner_html = """
<div style="text-align: center;">
    <img src="https://huggingface.co/spaces/camparchimedes/transcription_app/raw/main/picture.png" alt="Banner" width="100%" height="auto">
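Most of save_to_pdf falls outside the context lines; judging by pdf.output(pdf_output_path), it uses the FPDF API. A hedged sketch of that pattern, as a hypothetical reconstruction rather than the committed code:

    from fpdf import FPDF

    def save_to_pdf_sketch(transcription, summary):
        pdf = FPDF()
        pdf.add_page()
        pdf.set_font("Arial", size=12)
        if transcription:
            pdf.multi_cell(0, 10, "Transcription:\n" + transcription)
        if summary:
            pdf.multi_cell(0, 10, "Summary:\n" + summary)
        pdf_output_path = "transcription.pdf"  # hypothetical output name
        pdf.output(pdf_output_path)
        return pdf_output_path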
@@ -267,7 +258,6 @@ with iface:
         outputs=summary_output
     )
 
-
     with gr.TabItem("Summary_t3"):
         summary_output = gr.Textbox(label="Summary | TextRank")
         summarize_button = gr.Button("Summarize")
@@ -284,7 +274,6 @@ with iface:
         outputs=summary_output
     )
 
-
     with gr.TabItem("Download PDF"):
        pdf_transcription_only = gr.Button("Download PDF with Transcription Only")
        pdf_summary_only = gr.Button("Download PDF with Summary Only")
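The two hunks above only remove a blank line each inside the Blocks layout; for readers unfamiliar with the pattern, a minimal self-contained sketch of how such a tab is typically wired in Gradio (a hypothetical stub stands in for the real summarizer and input textbox):

    import gradio as gr

    def text_rank_summary_stub(text):  # stand-in for the summarizer defined above
        return text

    with gr.Blocks() as demo:
        with gr.Tabs():
            with gr.TabItem("Summary_t3"):
                text_input = gr.Textbox(label="Text")
                summary_output = gr.Textbox(label="Summary | TextRank")
                summarize_button = gr.Button("Summarize")
                summarize_button.click(
                    fn=text_rank_summary_stub,
                    inputs=text_input,
                    outputs=summary_output,
                )

    demo.launch()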