MatthiasC committed
Commit 8545c27 • 1 Parent(s): 3d2eedb

Add more functionalities

app.py CHANGED
@@ -4,6 +4,9 @@ from typing import AnyStr
 import streamlit as st
 from bs4 import BeautifulSoup
 
+import spacy
+from spacy import displacy
+
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
 from transformers import pipeline
 import os
@@ -98,15 +101,20 @@ def format_explainer_html(html_string):
 
     return p
 
-def list_all_filenames() -> list:
+def list_all_article_names() -> list:
     filenames = []
     for file in os.listdir('./sample-articles/'):
         if file.endswith('.txt'):
             filenames.append(file.replace('.txt', ''))
     return filenames
 
-def fetch_file_contents(filename: str) -> AnyStr:
-    with open(f'./sample-terms-and-conditions/{filename.lower()}.txt', 'r') as f:
+def fetch_article_contents(filename: str) -> AnyStr:
+    with open(f'./sample-articles/{filename.lower()}.txt', 'r') as f:
+        data = f.read()
+    return data
+
+def fetch_summary_contents(filename: str) -> AnyStr:
+    with open(f'./sample-summaries/{filename.lower()}.txt', 'r') as f:
         data = f.read()
     return data
 
@@ -141,25 +149,25 @@ if 'results' not in st.session_state:
     st.session_state.results = []
 
 # Page
-st.title('🤬 Dutch Toxic Comment Detection')
-st.markdown("""This demo showcases two Dutch toxic comment detection models.""")
-
-# Introduction
-st.markdown(f"""Both models were trained using a sequence classification task on a translated [Jigsaw Toxicity dataset](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge) which contains toxic online comments.
-The first model is a fine-tuned multilingual [DistilBERT](https://huggingface.co/distilbert-base-multilingual-cased) model whereas the second is a fine-tuned Dutch RoBERTa-based model called [RobBERT](https://huggingface.co/pdelobelle/robbert-v2-dutch-base).""")
-st.markdown(f"""For a more comprehensive overview of the models check out their model card on 🤗 Model Hub: [distilbert-base-dutch-toxic-comments]({model_names_to_URLs['ml6team/distilbert-base-dutch-cased-toxic-comments']}) and [RobBERT-dutch-base-toxic-comments]({model_names_to_URLs['ml6team/robbert-dutch-base-toxic-comments']}).
-""")
-st.markdown("""Enter a comment that you want to classify below. The model will determine the probability that it is toxic and highlights how much each token contributes to its decision:
-<font color="black">
-<span style="background-color: rgb(250, 219, 219); opacity: 1;">r</span><span style="background-color: rgb(244, 179, 179); opacity: 1;">e</span><span style="background-color: rgb(238, 135, 135); opacity: 1;">d</span>
-</font>
-tokens indicate toxicity whereas
-<font color="black">
-<span style="background-color: rgb(224, 251, 224); opacity: 1;">g</span><span style="background-color: rgb(197, 247, 197); opacity: 1;">re</span><span style="background-color: rgb(121, 236, 121); opacity: 1;">en</span>
-</font> tokens indicate the opposite.
-
-Try it yourself! 👇""",
-unsafe_allow_html=True)
+# st.title('🤬 Dutch Toxic Comment Detection')
+# st.markdown("""This demo showcases two Dutch toxic comment detection models.""")
+#
+# # Introduction
+# st.markdown(f"""Both models were trained using a sequence classification task on a translated [Jigsaw Toxicity dataset](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge) which contains toxic online comments.
+# The first model is a fine-tuned multilingual [DistilBERT](https://huggingface.co/distilbert-base-multilingual-cased) model whereas the second is a fine-tuned Dutch RoBERTa-based model called [RobBERT](https://huggingface.co/pdelobelle/robbert-v2-dutch-base).""")
+# st.markdown(f"""For a more comprehensive overview of the models check out their model card on 🤗 Model Hub: [distilbert-base-dutch-toxic-comments]({model_names_to_URLs['ml6team/distilbert-base-dutch-cased-toxic-comments']}) and [RobBERT-dutch-base-toxic-comments]({model_names_to_URLs['ml6team/robbert-dutch-base-toxic-comments']}).
+# """)
+# st.markdown("""Enter a comment that you want to classify below. The model will determine the probability that it is toxic and highlights how much each token contributes to its decision:
+# <font color="black">
+# <span style="background-color: rgb(250, 219, 219); opacity: 1;">r</span><span style="background-color: rgb(244, 179, 179); opacity: 1;">e</span><span style="background-color: rgb(238, 135, 135); opacity: 1;">d</span>
+# </font>
+# tokens indicate toxicity whereas
+# <font color="black">
+# <span style="background-color: rgb(224, 251, 224); opacity: 1;">g</span><span style="background-color: rgb(197, 247, 197); opacity: 1;">re</span><span style="background-color: rgb(121, 236, 121); opacity: 1;">en</span>
+# </font> tokens indicate the opposite.
+#
+# Try it yourself! 👇""",
+# unsafe_allow_html=True)
 
 
 # Demo
@@ -172,24 +180,37 @@ Try it yourself! 👇""",
 # submitted = rightmost_col.form_submit_button("Classify",
 #                                               help="Classify comment")
 
-with st.form("article-inpu"):
+with st.form("article-input"):
     # TODO: should probably set a minimum length of article or something
-    selected_article = st.selectbox('Select an article or provide your own:', list_all_filenames(),
+    selected_article = st.selectbox('Select an article or provide your own:', list_all_article_names(),
                                     )#index=0, format_func=special_internal_function, key=None, help=None, on_change=None, args=None, kwargs=None, *, disabled=False)
-    st.session_state.article_text = fetch_file_contents(selected_article)
+    st.session_state.article_text = fetch_article_contents(selected_article)
     article_text = st.text_area(
-        #label='Enter the comment you want to classify below (in Dutch):')
+        label='Enter the comment you want to classify below (in Dutch):',
        value = st.session_state.article_text)
     _, rightmost_col = st.columns([6,1])
     get_summary = rightmost_col.form_submit_button("Generate summary",
                                                     help="Generate summary for the given article text")
 
+
+def display_summary(article_name: str):
+    st.subheader("GENERATED SUMMARY")
+    st.markdown("######")
+    summary_content = fetch_summary_contents(article_name)
+    nlp = spacy.load('en_core_web_sm')
+    doc = nlp(summary_content)
+    html = displacy.render(doc, style="ent")
+    html = html.replace("\n", " ")
+    HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem">{}</div>"""
+    st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True)
+    st.markdown(summary_content)
+
 # Listener
 if get_summary:
     if article_text:
-        with st.spinner('Analysing comment...'):
+        with st.spinner('Generating summary...'):
             #classify_comment(article_text, selected_model)
-            print("TEST")
+            display_summary(selected_article)
     else:
         st.error('**Error**: No comment to classify. Please provide a comment.')
 
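For reference, a minimal standalone sketch of the displacy entity rendering that the new display_summary helper builds on, assuming spacy and the en_core_web_sm model pinned in requirements.txt are installed (the sample sentence is illustrative only):

import spacy
from spacy import displacy

# Load the small English pipeline pinned in requirements.txt.
nlp = spacy.load("en_core_web_sm")

# Run NER and render the entities as an HTML string, the same call that
# display_summary embeds into the Streamlit page via st.write(..., unsafe_allow_html=True).
doc = nlp("Protestors demonstrate at Boise State University after a professor's remarks.")
html = displacy.render(doc, style="ent")
print(html.replace("\n", " "))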
requirements.txt CHANGED
@@ -2,3 +2,5 @@ beautifulsoup4==4.10.0
 streamlit==1.2.0
 transformers==4.15.0
 transformers-interpret==0.5.2
+spacy==3.0.0
+en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz
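As a quick sanity check (a sketch, not part of the commit), both the pinned spacy release and the en_core_web_sm wheel should resolve after running pip install -r requirements.txt:

import spacy

# Both values are expected to be 3.0.0, matching the pins above.
print(spacy.__version__)
nlp = spacy.load("en_core_web_sm")
print(nlp.meta["version"])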
sample-articles/{Biden.txt → biden.txt} RENAMED
File without changes
sample-articles/{Protestors.txt → protestors.txt} RENAMED
File without changes
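The lowercase renames line up with fetch_article_contents and fetch_summary_contents, which lowercase the selected article name before opening it; a small sketch of the lookup the app now relies on (the article_path helper is illustrative, not part of the commit):

import os

def article_path(name: str) -> str:
    # Mirrors fetch_article_contents: the name is lowercased before the file
    # is opened, so the files on disk must be lowercase (hence Biden.txt -> biden.txt).
    return os.path.join("./sample-articles", f"{name.lower()}.txt")

print(article_path("Biden"))  # ./sample-articles/biden.txt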
sample-summaries/protestors.txt ADDED
@@ -0,0 +1 @@
+Protestors demonstrate at Boise State University after professor's remarks. Professor said men should be prioritized for fields of study such as engineering, medicine and law. Boise State issues statement saying it doesn't endorse comments, but "cannot infringe" on First Amendment rights. State representative: Professor's beliefs "completely sexist and reflect a society that no longer exists.")."CNN has reached out to the professor for comment but has not heard back; he didn't respond to CNN's request for comment. "Women shouldn't have to spend time today defending our value in society," she says.