|
import streamlit as st |
|
from transformers import pipeline |
|
import re |
|
|
|
def custom_sentence_splitter(text):
    """Split *text* into sentences.

    A sentence boundary is any run of whitespace (spaces, tabs, newlines)
    that directly follows a '.', '!' or '?'. The terminating punctuation
    stays attached to the sentence it ends.

    Args:
        text: The text to split.

    Returns:
        A list of non-empty sentence strings in their original order. Empty
        or whitespace-only input yields an empty list.
    """
    # Strip first so leading/trailing whitespace cannot produce empty
    # fragments; use \s+ (not just spaces) so sentences separated by line
    # breaks in a text file are split as well.
    fragments = re.split(r'(?<=[.!?])\s+', text.strip())
    return [fragment for fragment in fragments if fragment]
|
|
|
# Page heading.
st.title('Hugging Face BERT Summarizer')

# Model identifiers offered in the sidebar; the selected one is handed to the
# summarization pipeline further down in this script.
models = [
    "sshleifer/distilbart-cnn-12-6",
    "facebook/bart-large-cnn",
    "t5-base",
    "t5-large",
    "google/pegasus-newsroom",
]
model = st.sidebar.selectbox("Choose a model", models)

# Source document: a single plain-text upload.
uploaded_file = st.file_uploader("Choose a .txt file", type="txt")

# Only sentences containing at least one of these keywords are summarized.
keywords = st.text_input("Enter keywords (comma-separated)")

# Target summary size as a percentage of each chunk's word count; the script
# derives the min/max summary length from this value -/+ 10 points.
scale_percentage = st.sidebar.slider(
    'Scale %',
    min_value=1,
    max_value=100,
    value=50,
)

# Number of words sent to the summarizer per call.
chunk_size = st.sidebar.slider(
    'Chunk size (words)',
    min_value=100,
    max_value=1000,
    value=500,
)
|
|
|
# Main flow: once a file and keywords are provided, keep only the sentences
# that mention a keyword, then summarize them chunk by chunk on demand.
if uploaded_file is not None and keywords:
    # getvalue() returns the complete upload regardless of the stream
    # position. 'utf-8-sig' also accepts plain UTF-8 but drops a leading BOM;
    # undecodable bytes are replaced instead of aborting the whole run.
    user_input = uploaded_file.getvalue().decode('utf-8-sig', errors='replace')

    # Drop empty entries (e.g. from "a,,b" or a trailing comma): an empty
    # keyword is a substring of every sentence and would disable filtering.
    keywords = [keyword.strip() for keyword in keywords.split(",") if keyword.strip()]
    lowered_keywords = [keyword.lower() for keyword in keywords]

    sentences = custom_sentence_splitter(user_input)

    # Case-insensitive substring match; each sentence is lowercased once.
    filtered_sentences = [
        sentence
        for sentence, lowered_sentence in zip(sentences, map(str.lower, sentences))
        if any(keyword in lowered_sentence for keyword in lowered_keywords)
    ]
    filtered_text = ' '.join(filtered_sentences)

    if st.button('Summarize'):
        words = filtered_text.split()

        if not words:
            # Nothing matched: skip the (expensive) model load entirely.
            st.warning('No sentences in the uploaded file match the given keywords.')
        else:
            # NOTE(review): the pipeline is rebuilt on every click; consider
            # caching it (e.g. st.cache_resource) if the installed Streamlit
            # version provides it.
            summarizer = pipeline('summarization', model=model)

            # Percentage window around the requested scale, clamped to 1..100.
            # Invariant across chunks, so computed once.
            min_length_percentage = max(scale_percentage - 10, 1)
            max_length_percentage = min(scale_percentage + 10, 100)

            chunk_summaries = []
            for chunk_start in range(0, len(words), chunk_size):
                chunk_words = words[chunk_start:chunk_start + chunk_size]
                chunk_length = len(chunk_words)

                # NOTE(review): these bounds are derived from word counts,
                # while the pipeline interprets min/max_length in tokens.
                min_length = max(chunk_length * min_length_percentage // 100, 1)
                # Keep max_length above min_length so short chunks never
                # produce a zero or inverted length range.
                max_length = max(
                    chunk_length * max_length_percentage // 100,
                    min_length + 1,
                )

                # truncation=True keeps chunks whose token count exceeds the
                # model's input limit from failing inside the model.
                summarized = summarizer(
                    ' '.join(chunk_words),
                    max_length=max_length,
                    min_length=min_length,
                    do_sample=False,
                    truncation=True,
                )
                chunk_summaries.append(summarized[0]['summary_text'])

            summarized_text = ' '.join(chunk_summaries)
            st.text_area('Summarized Text', summarized_text, height=200)
|
|