MatthiasC committed
Commit 065051d
1 Parent(s): 558912e

Delete non-used code

.idea/HFSummSpace.iml ADDED
@@ -0,0 +1,17 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="PYTHON_MODULE" version="4">
+  <component name="NewModuleRootManager">
+    <content url="file://$MODULE_DIR$">
+      <excludeFolder url="file://$MODULE_DIR$/venv" />
+    </content>
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+  <component name="PyDocumentationSettings">
+    <option name="format" value="PLAIN" />
+    <option name="myDocStringFormat" value="Plain" />
+  </component>
+  <component name="TestRunnerService">
+    <option name="PROJECT_TEST_RUNNER" value="py.test" />
+  </component>
+</module>
.idea/inspectionProfiles/Project_Default.xml ADDED
@@ -0,0 +1,12 @@
+<component name="InspectionProjectProfileManager">
+  <profile version="1.0">
+    <option name="myName" value="Project Default" />
+    <inspection_tool class="PyPep8NamingInspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
+      <option name="ignoredErrors">
+        <list>
+          <option value="N806" />
+        </list>
+      </option>
+    </inspection_tool>
+  </profile>
+</component>
.idea/inspectionProfiles/profiles_settings.xml ADDED
@@ -0,0 +1,6 @@
+<component name="InspectionProjectProfileManager">
+  <settings>
+    <option name="USE_PROJECT_PROFILE" value="false" />
+    <version value="1.0" />
+  </settings>
+</component>
.idea/misc.xml ADDED
@@ -0,0 +1,4 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7 (HFSummSpace)" project-jdk-type="Python SDK" />
+</project>
.idea/modules.xml ADDED
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/HFSummSpace.iml" filepath="$PROJECT_DIR$/.idea/HFSummSpace.iml" />
+    </modules>
+  </component>
+</project>
.idea/vcs.xml ADDED
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="$PROJECT_DIR$" vcs="Git" />
+  </component>
+</project>
.idea/workspace.xml ADDED
@@ -0,0 +1,107 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ChangeListManager">
+    <list default="true" id="57f23431-346d-451d-8d77-db859508e831" name="Changes" comment="">
+      <change beforePath="$PROJECT_DIR$/app.py" beforeDir="false" afterPath="$PROJECT_DIR$/app.py" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/custom_renderer.py" beforeDir="false" afterPath="$PROJECT_DIR$/custom_renderer.py" afterDir="false" />
+    </list>
+    <option name="SHOW_DIALOG" value="false" />
+    <option name="HIGHLIGHT_CONFLICTS" value="true" />
+    <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
+    <option name="LAST_RESOLUTION" value="IGNORE" />
+  </component>
+  <component name="FileTemplateManagerImpl">
+    <option name="RECENT_TEMPLATES">
+      <list>
+        <option value="Python Script" />
+      </list>
+    </option>
+  </component>
+  <component name="Git.Settings">
+    <option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$" />
+  </component>
+  <component name="GitSEFilterConfiguration">
+    <file-type-list>
+      <filtered-out-file-type name="LOCAL_BRANCH" />
+      <filtered-out-file-type name="REMOTE_BRANCH" />
+      <filtered-out-file-type name="TAG" />
+      <filtered-out-file-type name="COMMIT_BY_MESSAGE" />
+    </file-type-list>
+  </component>
+  <component name="HighlightingSettingsPerFile">
+    <setting file="file://$PROJECT_DIR$/venv/lib/python3.7/site-packages/flair/models/sequence_tagger_model.py" root0="SKIP_INSPECTION" />
+  </component>
+  <component name="MarkdownSettingsMigration">
+    <option name="stateVersion" value="1" />
+  </component>
+  <component name="ProjectId" id="27jdqgqsSB1v523dZaR7czhkX4c" />
+  <component name="ProjectLevelVcsManager" settingsEditedManually="true" />
+  <component name="ProjectViewState">
+    <option name="hideEmptyMiddlePackages" value="true" />
+    <option name="showLibraryContents" value="true" />
+  </component>
+  <component name="PropertiesComponent"><![CDATA[{
+  "keyToString": {
+    "last_opened_file_path": "/home/matthias/Documents/Summarization-fact-checker/HugginfaceSpace/HFSummSpace",
+    "settings.editor.selected.configurable": "editor.preferences.fonts.default"
+  }
+}]]></component>
+  <component name="RecentsManager">
+    <key name="CopyFile.RECENT_KEYS">
+      <recent name="$PROJECT_DIR$" />
+    </key>
+    <key name="MoveFile.RECENT_KEYS">
+      <recent name="$PROJECT_DIR$/sample-articles-temp" />
+    </key>
+  </component>
+  <component name="RunManager">
+    <configuration name="app" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
+      <module name="HFSummSpace" />
+      <option name="INTERPRETER_OPTIONS" value="" />
+      <option name="PARENT_ENVS" value="true" />
+      <envs>
+        <env name="PYTHONUNBUFFERED" value="1" />
+      </envs>
+      <option name="SDK_HOME" value="" />
+      <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
+      <option name="IS_MODULE_SDK" value="true" />
+      <option name="ADD_CONTENT_ROOTS" value="true" />
+      <option name="ADD_SOURCE_ROOTS" value="true" />
+      <option name="SCRIPT_NAME" value="$PROJECT_DIR$/app.py" />
+      <option name="PARAMETERS" value="" />
+      <option name="SHOW_COMMAND_LINE" value="false" />
+      <option name="EMULATE_TERMINAL" value="false" />
+      <option name="MODULE_MODE" value="false" />
+      <option name="REDIRECT_INPUT" value="false" />
+      <option name="INPUT_FILE" value="" />
+      <method v="2" />
+    </configuration>
+    <recent_temporary>
+      <list>
+        <item itemvalue="Python.app" />
+      </list>
+    </recent_temporary>
+  </component>
+  <component name="SpellCheckerSettings" RuntimeDictionaries="0" Folders="0" CustomDictionaries="0" DefaultDictionary="application-level" UseSingleDictionary="true" transferred="true" />
+  <component name="TaskManager">
+    <task active="true" id="Default" summary="Default task">
+      <changelist id="57f23431-346d-451d-8d77-db859508e831" name="Changes" comment="" />
+      <created>1649837622575</created>
+      <option name="number" value="Default" />
+      <option name="presentableId" value="Default" />
+      <updated>1649837622575</updated>
+    </task>
+    <servers />
+  </component>
+  <component name="Vcs.Log.Tabs.Properties">
+    <option name="TAB_STATES">
+      <map>
+        <entry key="MAIN">
+          <value>
+            <State />
+          </value>
+        </entry>
+      </map>
+    </option>
+  </component>
+</project>
__pycache__/custom_renderer.cpython-37.pyc CHANGED
Binary files a/__pycache__/custom_renderer.cpython-37.pyc and b/__pycache__/custom_renderer.cpython-37.pyc differ
 
app.py CHANGED
@@ -1,10 +1,6 @@
-import random
-from typing import AnyStr, List, Dict
-# import tensorflow_hub as hub
+from typing import AnyStr, Dict
 
 import itertools
-
-#import en_core_web_sm
 import streamlit as st
 import en_core_web_lg
 
@@ -13,25 +9,15 @@ from bs4 import BeautifulSoup
 import numpy as np
 import base64
 
-import validators
 from spacy_streamlit.util import get_svg
-from validators import ValidationFailure
 
 from custom_renderer import render_sentence_custom
-# from flair.data import Sentence
-# from flair.models import SequenceTagger
 from sentence_transformers import SentenceTransformer
 
-import spacy
-from spacy import displacy
-from spacy_streamlit import visualize_parser
-
-from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForTokenClassification
+from transformers import AutoTokenizer, AutoModelForTokenClassification
 from transformers import pipeline
 import os
-from transformers_interpret import SequenceClassificationExplainer
 
-# USE_model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
 
@@ -42,19 +28,10 @@ def get_sentence_embedding_model():
 
 @st.experimental_singleton
 def get_spacy():
-    # nlp = spacy.load('en_core_web_lg')
     nlp = en_core_web_lg.load()
     return nlp
 
 
-# TODO: might look into which one is the best here
-# TODO: might be useful to make an ml6 preloaded model for flair as this takes ridiculously long to load the first time
-# @st.experimental_singleton
-# @st.cache(suppress_st_warning=True, allow_output_mutation=True)
-# def get_flair_tagger():
-#     return SequenceTagger.load("flair/ner-english-ontonotes-fast")
-
-
 @st.experimental_singleton
 def get_transformer_pipeline():
     tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-large-finetuned-conll03-english")
@@ -96,7 +73,7 @@ def list_all_article_names() -> list:
 
 
 def fetch_article_contents(filename: str) -> AnyStr:
-    if (filename == "Provide your own input"):
+    if filename == "Provide your own input":
         return " "
     with open(f'./sample-articles/{filename.lower()}.txt', 'r') as f:
         data = f.read()
@@ -174,13 +151,13 @@ def get_all_entities(text):
 
 # TODO: this functionality can be cached (e.g. by storing html file output) if wanted (or just store list of entities idk)
 def get_and_compare_entities():
-    #article_content = fetch_article_contents(article_name)
+    # article_content = fetch_article_contents(article_name)
     article_content = st.session_state.article_text
     all_entities_per_sentence = get_all_entities_per_sentence(article_content)
     # st.session_state.entities_per_sentence_article = all_entities_per_sentence
     entities_article = list(itertools.chain.from_iterable(all_entities_per_sentence))
 
-    #summary_content = fetch_summary_contents(article_name)
+    # summary_content = fetch_summary_contents(article_name)
     summary_content = st.session_state.summary_output
     all_entities_per_sentence = get_all_entities_per_sentence(summary_content)
     # st.session_state.entities_per_sentence_summary = all_entities_per_sentence
@@ -193,7 +170,8 @@
         if any(entity.lower() in substring_entity.lower() for substring_entity in entities_article):
             matched_entities.append(entity)
         elif any(
-                np.inner(sentence_embedding_model.encode(entity, show_progress_bar=False), sentence_embedding_model.encode(art_entity, show_progress_bar=False)) > 0.9 for
+                np.inner(sentence_embedding_model.encode(entity, show_progress_bar=False),
+                         sentence_embedding_model.encode(art_entity, show_progress_bar=False)) > 0.9 for
                 art_entity in entities_article):
             matched_entities.append(entity)
         else:
@@ -202,7 +180,7 @@
 
 
 def highlight_entities():
-    #summary_content = fetch_summary_contents(article_name)
+    # summary_content = fetch_summary_contents(article_name)
     summary_content = st.session_state.summary_output
     markdown_start_red = "<mark class=\"entity\" style=\"background: rgb(238, 135, 135);\">"
     markdown_start_green = "<mark class=\"entity\" style=\"background: rgb(121, 236, 121);\">"
@@ -277,13 +255,6 @@ def check_dependency(article: bool):
     # return all_deps
 
 
-def is_valid_url(url: str) -> bool:
-    result = validators.url(url)
-    if isinstance(result, ValidationFailure):
-        return False
-    return True
-
-
 def render_svg(svg_file):
     with open(svg_file, "r") as f:
         lines = f.readlines()
@@ -296,7 +267,6 @@
 
 
 def generate_abstractive_summary(text, type, min_len=120, max_len=512, **kwargs):
-    summarization_model = get_summarizer_model()
     text = text.strip().replace("\n", " ")
     if type == "top_p":
         text = summarization_model(text, min_length=min_len,
@@ -316,10 +286,6 @@ def generate_abstractive_summary(text, type, min_len=120, max_len=512, **kwargs)
     return summary
 
 
-# Start session
-if 'results' not in st.session_state:
-    st.session_state.results = []
-
 # Page
 st.title('Summarization fact checker')
 
@@ -341,11 +307,11 @@ metric, indicating the trustworthiness of the generated summary. Throughout this
 results for some methods on specific examples. These text blocks will be indicated and they change according to the
 currently selected article.""")
 
+# Load all different models (cached) at start time of the hugginface space
 sentence_embedding_model = get_sentence_embedding_model()
-# tagger = get_flair_tagger()
 ner_model = get_transformer_pipeline()
 nlp = get_spacy()
+summarization_model = get_summarizer_model()
-# nlp = en_core_web_sm.load()
 
 # GENERATING SUMMARIES PART
 st.header("Generating summaries")
@@ -353,7 +319,6 @@ st.markdown("Let’s start by selecting an article text for which we want to gen
             "text yourself. Note that it’s suggested to provide a sufficiently large text, as otherwise the summary "
             "generated from it might not be optimal, leading to suboptimal performance of the post-processing steps.")
 
-# TODO: NEED TO CHECK ARTICLE TEXT INSTEAD OF ARTICLE NAME ALSO FREE INPUT OPTION
 selected_article = st.selectbox('Select an article or provide your own:',
                                 list_all_article_names())  # index=0, format_func=special_internal_function, key=None, help=None, on_change=None, args=None, kwargs=None, *, disabled=False)
 st.session_state.article_text = fetch_article_contents(selected_article)
@@ -363,23 +328,27 @@ article_text = st.text_area(
     height=150
 )
 
-summarize_button = st.button(label='Process article content', help="Generates summary and applies entity matching and dependency parsing for given article")
+summarize_button = st.button(label='Process article content',
+                             help="Generates summary and applies entity matching and dependency parsing for given article")
 
 if summarize_button:
     st.session_state.article_text = article_text
-    st.markdown("Below you can find the generated summary for the article. Based on empirical research, we will discuss "
-                "two main methods that detect some common errors. We can then score different summaries, to indicate how "
-                "factual a summary is for a given article. The idea is that in production, you could generate a set of "
-                "summaries for the same article, with different parameters (or even different models). By using "
-                "post-processing error detection, we can then select the best possible summary.")
+    st.markdown(
+        "Below you can find the generated summary for the article. Based on empirical research, we will discuss "
+        "two main methods that detect some common errors. We can then score different summaries, to indicate how "
+        "factual a summary is for a given article. The idea is that in production, you could generate a set of "
+        "summaries for the same article, with different parameters (or even different models). By using "
+        "post-processing error detection, we can then select the best possible summary.")
    if st.session_state.article_text:
        with st.spinner('Generating summary...'):
            # classify_comment(article_text, selected_model)
-            if selected_article != "Provide your own input" and article_text == fetch_article_contents(selected_article):
+            if selected_article != "Provide your own input" and article_text == fetch_article_contents(
+                    selected_article):
                st.session_state.unchanged_text = True
                summary_content = fetch_summary_contents(selected_article)
            else:
-                summary_content = generate_abstractive_summary(article_text, type="beam", do_sample=True, num_beams=15, no_repeat_ngram_size=4)
+                summary_content = generate_abstractive_summary(article_text, type="beam", do_sample=True, num_beams=15,
+                                                               no_repeat_ngram_size=4)
                st.session_state.unchanged_text = False
            summary_displayed = display_summary(summary_content)
            st.write("**Generated summary:**", summary_displayed, unsafe_allow_html=True)
@@ -428,10 +397,11 @@ if summarize_button:
 
    # DEPENDENCY PARSING PART
    st.header("Dependency comparison")
-    st.markdown("The second method we use for post-processing is called **Dependency parsing**: the process in which the "
-                "grammatical structure in a sentence is analysed, to find out related words as well as the type of the "
-                "relationship between them. For the sentence “Jan’s wife is called Sarah” you would get the following "
-                "dependency graph:")
+    st.markdown(
+        "The second method we use for post-processing is called **Dependency parsing**: the process in which the "
+        "grammatical structure in a sentence is analysed, to find out related words as well as the type of the "
+        "relationship between them. For the sentence “Jan’s wife is called Sarah” you would get the following "
+        "dependency graph:")
 
    # TODO: I wonder why the first doesn't work but the second does (it doesn't show deps otherwise)
    # st.image("ExampleParsing.svg")
@@ -442,14 +412,15 @@ if summarize_button:
                "are still correct. “The borders of Ukraine” have a different dependency between “borders” and “Ukraine” "
                "than “Ukraine’s borders”, while both descriptions have the same meaning. So just matching all "
                "dependencies between article and summary (as we did with entity matching) would not be a robust method.")
-    st.markdown("However, by empirical testing, we have found that there are certain dependencies which can be used for "
-                "such matching techniques. When unmatched, these specific dependencies are often an indication of a "
-                "wrongly constructed sentence. **Should I explain this more/better or is it enough that I explain by "
-                "example specific run throughs?**. We found 2(/3 TODO) common dependencies which, when present in the "
-                "summary but not in the article, are highly indicative of factualness errors. Furthermore, we only check "
-                "dependencies between an existing **entity** and its direct connections. Below we highlight all unmatched "
-                "dependencies that satisfy the discussed constraints. We also discuss the specific results for the "
-                "currently selected article.")
+    st.markdown(
+        "However, by empirical testing, we have found that there are certain dependencies which can be used for "
+        "such matching techniques. When unmatched, these specific dependencies are often an indication of a "
+        "wrongly constructed sentence. **Should I explain this more/better or is it enough that I explain by "
+        "example specific run throughs?**. We found 2(/3 TODO) common dependencies which, when present in the "
+        "summary but not in the article, are highly indicative of factualness errors. Furthermore, we only check "
+        "dependencies between an existing **entity** and its direct connections. Below we highlight all unmatched "
+        "dependencies that satisfy the discussed constraints. We also discuss the specific results for the "
+        "currently selected article.")
    with st.spinner("Doing dependency parsing..."):
        # TODO RIGHT IF FUNCTION (IF EXAMPLE AND IF INPUT UNCHANGED)
        # if selected_article == 'article11':
@@ -474,12 +445,13 @@ if summarize_button:
 
    # OUTRO/CONCLUSION
    st.header("Wrapping up")
-    st.markdown("We have presented 2 methods that try to improve summaries via post-processing steps. Entity matching can "
-                "be used to solve hallucinations, while dependency comparison can be used to filter out some bad "
-                "sentences (and thus worse summaries). These methods highlight the possibilities of post-processing "
-                "AI-made summaries, but are only a basic introduction. As the methods were empirically tested they are "
-                "definitely not sufficiently robust for general use-cases. (something about that we tested also RE and "
-                "maybe other things).")
+    st.markdown(
+        "We have presented 2 methods that try to improve summaries via post-processing steps. Entity matching can "
+        "be used to solve hallucinations, while dependency comparison can be used to filter out some bad "
+        "sentences (and thus worse summaries). These methods highlight the possibilities of post-processing "
+        "AI-made summaries, but are only a basic introduction. As the methods were empirically tested they are "
+        "definitely not sufficiently robust for general use-cases. (something about that we tested also RE and "
+        "maybe other things).")
    st.markdown("####")
    st.markdown("Below we generated 5 different kind of summaries from the article in which their ranks are estimated, "
                "and hopefully the best summary (read: the one that a human would prefer or indicate as the best one) "
custom_renderer.py CHANGED
@@ -1,6 +1,4 @@
 from typing import Dict
-
-import spacy
 from PIL import ImageFont
 
 
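
A note on the reflowed entity-matching condition in the app.py hunks above: the `elif any(np.inner(...) > 0.9 ...)` branch treats a summary entity as matched when its sentence embedding lies close to that of some article entity. Below is a minimal, self-contained sketch of that idea; the checkpoint name is an assumed placeholder, since `get_sentence_embedding_model()` is defined outside the hunks shown in this diff.

# Sketch only: mirrors the np.inner(...) > 0.9 test from app.py; the checkpoint
# below is an assumed placeholder, not necessarily the one the app loads.
from typing import List

import numpy as np
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")  # assumed checkpoint


def soft_entity_match(entity: str, article_entities: List[str], threshold: float = 0.9) -> bool:
    # np.inner equals cosine similarity only for unit-length vectors, so this
    # sketch normalizes explicitly; app.py relies on the model's output scale.
    emb = model.encode(entity, show_progress_bar=False, normalize_embeddings=True)
    for art_entity in article_entities:
        art_emb = model.encode(art_entity, show_progress_bar=False, normalize_embeddings=True)
        if np.inner(emb, art_emb) > threshold:
            return True
    return False


print(soft_entity_match("U.S.", ["United States", "Kyiv"]))  # plausibly True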
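
The dependency-comparison prose reflowed above matches links between an entity and its direct syntactic connections. A short sketch of what those links look like, using the `en_core_web_lg` model app.py already loads; the exact relation labels depend on the spaCy model version, so the example output is indicative only.

# Sketch: dependency triples for the example sentence from the app's prose.
import en_core_web_lg

nlp = en_core_web_lg.load()
doc = nlp("Jan's wife is called Sarah")
for token in doc:
    # Prints (child, relation, head) triples, e.g. wife -> nsubjpass -> called;
    # these are the links the dependency-comparison step matches between
    # article and summary.
    print(token.text, token.dep_, token.head.text)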
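
Finally, the commit hoists `summarization_model = get_summarizer_model()` out of `generate_abstractive_summary()` to module level, beside the other cached loaders, so the summarizer is built once per process instead of being re-fetched on every call. A hedged sketch of that pattern follows; the summarization checkpoint is a placeholder, as `get_summarizer_model()`'s body lies outside the shown hunks.

# Sketch of the cached-loader pattern this commit consolidates.
import streamlit as st
from transformers import pipeline


@st.experimental_singleton  # one shared instance per process, as in app.py
def get_summarizer_model():
    # Placeholder checkpoint; the real model name is defined elsewhere in app.py.
    return pipeline("summarization", model="facebook/bart-large-cnn")


# Loaded once at start-up, alongside sentence_embedding_model, ner_model and nlp,
# instead of inside generate_abstractive_summary().
summarization_model = get_summarizer_model()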