MatthiasC committed
Commit 8545c27 • 1 Parent(s): 3d2eedb

Add more functionalities

app.py CHANGED
@@ -4,6 +4,9 @@ from typing import AnyStr
 import streamlit as st
 from bs4 import BeautifulSoup
 
+import spacy
+from spacy import displacy
+
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
 from transformers import pipeline
 import os
@@ -98,15 +101,20 @@ def format_explainer_html(html_string):
 
     return p
 
-def list_all_filenames() -> list:
+def list_all_article_names() -> list:
     filenames = []
     for file in os.listdir('./sample-articles/'):
         if file.endswith('.txt'):
             filenames.append(file.replace('.txt', ''))
     return filenames
 
-def fetch_file_contents(filename: str) -> AnyStr:
-    with open(f'./sample-terms-and-conditions/{filename.lower()}.txt', 'r') as f:
+def fetch_article_contents(filename: str) -> AnyStr:
+    with open(f'./sample-articles/{filename.lower()}.txt', 'r') as f:
+        data = f.read()
+    return data
+
+def fetch_summary_contents(filename: str) -> AnyStr:
+    with open(f'./sample-summaries/{filename.lower()}.txt', 'r') as f:
         data = f.read()
     return data
 
@@ -141,25 +149,25 @@ if 'results' not in st.session_state:
     st.session_state.results = []
 
 # Page
-st.title('🤬 Dutch Toxic Comment Detection')
-st.markdown("""This demo showcases two Dutch toxic comment detection models.""")
-
-# Introduction
-st.markdown(f"""Both models were trained using a sequence classification task on a translated [Jigsaw Toxicity dataset](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge) which contains toxic online comments.
-The first model is a fine-tuned multilingual [DistilBERT](https://huggingface.co/distilbert-base-multilingual-cased) model whereas the second is a fine-tuned Dutch RoBERTa-based model called [RobBERT](https://huggingface.co/pdelobelle/robbert-v2-dutch-base).""")
-st.markdown(f"""For a more comprehensive overview of the models check out their model card on 🤗 Model Hub: [distilbert-base-dutch-toxic-comments]({model_names_to_URLs['ml6team/distilbert-base-dutch-cased-toxic-comments']}) and [RobBERT-dutch-base-toxic-comments]({model_names_to_URLs['ml6team/robbert-dutch-base-toxic-comments']}).
-""")
-st.markdown("""Enter a comment that you want to classify below. The model will determine the probability that it is toxic and highlights how much each token contributes to its decision:
-<font color="black">
-<span style="background-color: rgb(250, 219, 219); opacity: 1;">r</span><span style="background-color: rgb(244, 179, 179); opacity: 1;">e</span><span style="background-color: rgb(238, 135, 135); opacity: 1;">d</span>
-</font>
-tokens indicate toxicity whereas
-<font color="black">
-<span style="background-color: rgb(224, 251, 224); opacity: 1;">g</span><span style="background-color: rgb(197, 247, 197); opacity: 1;">re</span><span style="background-color: rgb(121, 236, 121); opacity: 1;">en</span>
-</font> tokens indicate the opposite.
-
-Try it yourself! 👇""",
-unsafe_allow_html=True)
+# st.title('🤬 Dutch Toxic Comment Detection')
+# st.markdown("""This demo showcases two Dutch toxic comment detection models.""")
+#
+# # Introduction
+# st.markdown(f"""Both models were trained using a sequence classification task on a translated [Jigsaw Toxicity dataset](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge) which contains toxic online comments.
+# The first model is a fine-tuned multilingual [DistilBERT](https://huggingface.co/distilbert-base-multilingual-cased) model whereas the second is a fine-tuned Dutch RoBERTa-based model called [RobBERT](https://huggingface.co/pdelobelle/robbert-v2-dutch-base).""")
+# st.markdown(f"""For a more comprehensive overview of the models check out their model card on 🤗 Model Hub: [distilbert-base-dutch-toxic-comments]({model_names_to_URLs['ml6team/distilbert-base-dutch-cased-toxic-comments']}) and [RobBERT-dutch-base-toxic-comments]({model_names_to_URLs['ml6team/robbert-dutch-base-toxic-comments']}).
+# """)
+# st.markdown("""Enter a comment that you want to classify below. The model will determine the probability that it is toxic and highlights how much each token contributes to its decision:
+# <font color="black">
+# <span style="background-color: rgb(250, 219, 219); opacity: 1;">r</span><span style="background-color: rgb(244, 179, 179); opacity: 1;">e</span><span style="background-color: rgb(238, 135, 135); opacity: 1;">d</span>
+# </font>
+# tokens indicate toxicity whereas
+# <font color="black">
+# <span style="background-color: rgb(224, 251, 224); opacity: 1;">g</span><span style="background-color: rgb(197, 247, 197); opacity: 1;">re</span><span style="background-color: rgb(121, 236, 121); opacity: 1;">en</span>
+# </font> tokens indicate the opposite.
+#
+# Try it yourself! 👇""",
+# unsafe_allow_html=True)
 
 
 # Demo
@@ -172,24 +180,37 @@ Try it yourself! 👇""",
 # submitted = rightmost_col.form_submit_button("Classify",
 #                                               help="Classify comment")
 
-with st.form("article-inpu"):
+with st.form("article-input"):
     # TODO: should probably set a minimum length of article or something
-    selected_article = st.selectbox('Select an article or provide your own:', list_all_filenames(),
+    selected_article = st.selectbox('Select an article or provide your own:', list_all_article_names(),
                                     )#index=0, format_func=special_internal_function, key=None, help=None, on_change=None, args=None, kwargs=None, *, disabled=False)
-    st.session_state.article_text = fetch_file_contents(selected_article)
+    st.session_state.article_text = fetch_article_contents(selected_article)
     article_text = st.text_area(
-        #label='Enter the comment you want to classify below (in Dutch):')
+        label='Enter the comment you want to classify below (in Dutch):',
        value = st.session_state.article_text)
     _, rightmost_col = st.columns([6,1])
     get_summary = rightmost_col.form_submit_button("Generate summary",
                                                     help="Generate summary for the given article text")
 
+
+def display_summary(article_name: str):
+    st.subheader("GENERATED SUMMARY")
+    st.markdown("######")
+    summary_content = fetch_summary_contents(article_name)
+    nlp = spacy.load('en_core_web_sm')
+    doc = nlp(summary_content)
+    html = displacy.render(doc, style="ent")
+    html = html.replace("\n", " ")
+    HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem">{}</div>"""
+    st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True)
+    st.markdown(summary_content)
+
 # Listener
 if get_summary:
     if article_text:
-        with st.spinner('Analysing comment...'):
+        with st.spinner('Generating summary...'):
             #classify_comment(article_text, selected_model)
-            print("TEST")
+            display_summary(selected_article)
     else:
         st.error('**Error**: No comment to classify. Please provide a comment.')
 
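For reference, a minimal standalone sketch of the displacy entity rendering that the new display_summary helper builds on, assuming spacy and the en_core_web_sm model pinned in requirements.txt are installed (the sample sentence is illustrative only):

import spacy
from spacy import displacy

# Load the small English pipeline pinned in requirements.txt.
nlp = spacy.load("en_core_web_sm")

# Run NER and render the entities as an HTML string, the same call that
# display_summary embeds into the Streamlit page via st.write(..., unsafe_allow_html=True).
doc = nlp("Protestors demonstrate at Boise State University after a professor's remarks.")
html = displacy.render(doc, style="ent")
print(html.replace("\n", " "))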
requirements.txt CHANGED
@@ -2,3 +2,5 @@ beautifulsoup4==4.10.0
 streamlit==1.2.0
 transformers==4.15.0
 transformers-interpret==0.5.2
+spacy==3.0.0
+en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz
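As a quick sanity check (a sketch, not part of the commit), both the pinned spacy release and the en_core_web_sm wheel should resolve after running pip install -r requirements.txt:

import spacy

# Both values are expected to be 3.0.0, matching the pins above.
print(spacy.__version__)
nlp = spacy.load("en_core_web_sm")
print(nlp.meta["version"])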
sample-articles/{Biden.txt → biden.txt} RENAMED
File without changes
sample-articles/{Protestors.txt → protestors.txt} RENAMED
File without changes
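The lowercase renames line up with fetch_article_contents and fetch_summary_contents, which lowercase the selected article name before opening it; a small sketch of the lookup the app now relies on (the article_path helper is illustrative, not part of the commit):

import os

def article_path(name: str) -> str:
    # Mirrors fetch_article_contents: the name is lowercased before the file
    # is opened, so the files on disk must be lowercase (hence Biden.txt -> biden.txt).
    return os.path.join("./sample-articles", f"{name.lower()}.txt")

print(article_path("Biden"))  # ./sample-articles/biden.txt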
sample-summaries/protestors.txt ADDED
@@ -0,0 +1 @@
+Protestors demonstrate at Boise State University after professor's remarks. Professor said men should be prioritized for fields of study such as engineering, medicine and law. Boise State issues statement saying it doesn't endorse comments, but "cannot infringe" on First Amendment rights. State representative: Professor's beliefs "completely sexist and reflect a society that no longer exists.")."CNN has reached out to the professor for comment but has not heard back; he didn't respond to CNN's request for comment. "Women shouldn't have to spend time today defending our value in society," she says.