import streamlit as st
from transformers import pipeline
import re

def custom_sentence_splitter(text):
    # Split on a period, exclamation mark, or question mark followed by whitespace,
    # so sentence breaks across line endings in the uploaded file are also caught
    return re.split(r'(?<=[.!?])\s+', text)

st.title('Hugging Face Text Summarizer')

# List of models
models = ["sshleifer/distilbart-cnn-12-6", "facebook/bart-large-cnn", "t5-base", "t5-large", "google/pegasus-newsroom"]

# Dropdown model selector
model = st.sidebar.selectbox("Choose a model", models)

uploaded_file = st.file_uploader("Choose a .txt file", type="txt")

# Add text input for keywords
keywords = st.text_input("Enter keywords (comma-separated)")

# Sidebar slider for the target summary length, as a percentage of each chunk
scale_percentage = st.sidebar.slider('Scale %', min_value=1, max_value=100, value=50)

# Sidebar slider for the chunk size; chunking keeps each model input within
# typical sequence limits (roughly 1024 tokens for the BART variants)
chunk_size = st.sidebar.slider('Chunk size (words)', min_value=100, max_value=1000, value=500)

if uploaded_file is not None and keywords:
    user_input = uploaded_file.read().decode('utf-8')
    keywords = [keyword.strip() for keyword in keywords.split(",")]

    # Split text into sentences using the custom function
    sentences = custom_sentence_splitter(user_input)
    
    # Filter sentences based on keywords
    filtered_sentences = [sentence for sentence in sentences if any(keyword.lower() in sentence.lower() for keyword in keywords)]
    filtered_text = ' '.join(filtered_sentences)
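
    # Warn early if the keyword filter removed everything; otherwise the
    # summarizer would silently produce an empty summary
    if not filtered_sentences:
        st.warning('No sentences matched the given keywords.')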

    if st.button('Summarize'):
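        # Load the selected model from the Hugging Face Hub
        # (downloaded and cached on first use, which can take a while)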
        summarizer = pipeline('summarization', model=model)
        summarized_text = ""

        # Split filtered text into chunks by words
        words = filtered_text.split()
        chunks = [' '.join(words[i:i+chunk_size]) for i in range(0, len(words), chunk_size)]
        
        # Summarize each chunk, scaling the target length to the chunk size.
        # Note: the pipeline's min_length/max_length are counted in tokens,
        # so the word count below is only a rough approximation.
        for chunk in chunks:
            chunk_length = len(chunk.split())
            min_length_percentage = max(scale_percentage - 10, 1)
            max_length_percentage = min(scale_percentage + 10, 100)
            min_length = max(int(chunk_length * min_length_percentage / 100), 1)
            # Keep max_length strictly above min_length so very short chunks don't error out
            max_length = max(int(chunk_length * max_length_percentage / 100), min_length + 1)
            summarized = summarizer(chunk, max_length=max_length, min_length=min_length, do_sample=False)
            summarized_text += summarized[0]['summary_text'] + " "

        st.text_area('Summarized Text', summarized_text, height=200)
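
# To run locally (assuming this file is saved as app.py and that streamlit,
# transformers, and torch are installed):
#   streamlit run app.py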