MatthiasC committed
Commit 357d42c
Parent: 08e0095

Add general dependency comparison functionality, fix issues and add more examples

ExampleParsing.svg ADDED
__pycache__/custom_renderer.cpython-37.pyc CHANGED
Binary files a/__pycache__/custom_renderer.cpython-37.pyc and b/__pycache__/custom_renderer.cpython-37.pyc differ
 
app.py CHANGED
@@ -8,7 +8,9 @@ from bs4 import BeautifulSoup
 import numpy as np
 import base64
 
+import validators
 from spacy_streamlit.util import get_svg
+from validators import ValidationFailure
 
 from custom_renderer import render_sentence_custom
 from flair.data import Sentence
@@ -134,6 +136,18 @@ def fetch_summary_contents(filename: str) -> AnyStr:
     return data
 
 
+def fetch_entity_specific_contents(filename: str) -> AnyStr:
+    with open(f'./entity-specific-text/{filename.lower()}.txt', 'r') as f:
+        data = f.read()
+    return data
+
+
+def fetch_dependency_specific_contents(filename: str) -> AnyStr:
+    with open(f'./dependency-specific-text/{filename.lower()}.txt', 'r') as f:
+        data = f.read()
+    return data
+
+
 def classify_comment(comment, selected_model):
     """Classify the given comment and augment with additional information."""
     toxicity_pipeline, cls_explainer = load_pipeline(selected_model)
@@ -162,9 +176,10 @@ def classify_comment(comment, selected_model):
 
 def display_summary(article_name: str):
     summary_content = fetch_summary_contents(article_name)
+    st.session_state.summary_output = summary_content
     soup = BeautifulSoup(summary_content, features="html.parser")
     HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem">{}</div>"""
-    st.session_state.summary_output = HTML_WRAPPER.format(soup)
+    return HTML_WRAPPER.format(soup)
 
 
 ##@st.cache(hash_funcs={preshed.maps.PreshMap: my_hash_func})
@@ -215,12 +230,12 @@ def get_all_entities(text):
 def get_and_compare_entities(article_name: str):
     article_content = fetch_article_contents(article_name)
     all_entities_per_sentence = get_all_entities_per_sentence(article_content)
-    #st.session_state.entities_per_sentence_article = all_entities_per_sentence
+    # st.session_state.entities_per_sentence_article = all_entities_per_sentence
     entities_article = list(itertools.chain.from_iterable(all_entities_per_sentence))
 
     summary_content = fetch_summary_contents(article_name)
     all_entities_per_sentence = get_all_entities_per_sentence(summary_content)
-    #st.session_state.entities_per_sentence_summary = all_entities_per_sentence
+    # st.session_state.entities_per_sentence_summary = all_entities_per_sentence
     entities_summary = list(itertools.chain.from_iterable(all_entities_per_sentence))
 
     matched_entities = []
@@ -268,45 +283,55 @@ def check_dependency(article: bool):
     if article:
         text = st.session_state.article_text
         all_entities = get_all_entities_per_sentence(text)
-        #all_entities = st.session_state.entities_per_sentence_article
+        # all_entities = st.session_state.entities_per_sentence_article
     else:
         text = st.session_state.summary_output
        all_entities = get_all_entities_per_sentence(text)
-        #all_entities = st.session_state.entities_per_sentence_summary
+        # all_entities = st.session_state.entities_per_sentence_summary
     doc = nlp(text)
     tok_l = doc.to_json()['tokens']
-    all_deps = ""
-    print(str(all_deps))
-    print("OOPS")
+    # all_deps = ""
+    test_list_dict_output = []
 
     sentences = list(doc.sents)
-    print(sentences)
     for i, sentence in enumerate(sentences):
-        #TODO MONDAY: THE PROBLEM LIES HERE WITH THE SENTENCE!!! (I THINK I KNOW PROBLEM: TEXT SAVED AS SESSION STATE IS HTML NOT PURE TEXT!)
-        print(str(sentence))
         start_id = sentence.start
         end_id = sentence.end
         for t in tok_l:
+            # print(t)
            if t["id"] < start_id or t["id"] > end_id:
                continue
            head = tok_l[t['head']]
-            if t['dep'] == 'amod':
-                print("AMOD FOUND")
+            if t['dep'] == 'amod' or t['dep'] == "pobj":
                object_here = text[t['start']:t['end']]
                object_target = text[head['start']:head['end']]
-
+                if t['dep'] == "pobj" and str.lower(object_target) != "in":
+                    continue
                # ONE NEEDS TO BE ENTITY
                if object_here in all_entities[i]:
-                    print("SENTENCE ADDED")
-                    print(all_deps)
-                    all_deps = all_deps.join(str(sentence))
+                    # all_deps = all_deps.join(str(sentence))
+                    identifier = object_here + t['dep'] + object_target
+                    test_list_dict_output.append({"dep": t['dep'], "cur_word_index": (t['id'] - sentence.start),
+                                                  "target_word_index": (t['head'] - sentence.start),
+                                                  "identifier": identifier, "sentence": str(sentence)})
                elif object_target in all_entities[i]:
-                    all_deps = all_deps.join(str(sentence))
+                    # all_deps = all_deps.join(str(sentence))
+                    identifier = object_here + t['dep'] + object_target
+                    test_list_dict_output.append({"dep": t['dep'], "cur_word_index": (t['id'] - sentence.start),
+                                                  "target_word_index": (t['head'] - sentence.start),
+                                                  "identifier": identifier, "sentence": str(sentence)})
                else:
                    continue
-    #print(f'all depps are {all_deps}')
-    #print(all_deps)
-    return all_deps
+    # print(f'NOW TEST LIST DICT: {test_list_dict_output}')
+    return test_list_dict_output
+    # return all_deps
+
+
+def is_valid_url(url: str) -> bool:
+    result = validators.url(url)
+    if isinstance(result, ValidationFailure):
+        return False
+    return True
 
 
 # Start session
@@ -359,13 +384,28 @@ if st.session_state.article_text:
     with st.spinner('Generating summary...'):
         # classify_comment(article_text, selected_model)
 
-        display_summary(selected_article)
+        summary_displayed = display_summary(selected_article)
 
-        st.write("**Generated summary:**", st.session_state.summary_output, unsafe_allow_html=True)
+        st.write("**Generated summary:**", summary_displayed, unsafe_allow_html=True)
 else:
     st.error('**Error**: No comment to classify. Please provide a comment.',
              help="Generate summary for the given article text")
 
+if is_valid_url(article_text):
+    print("YES")
+else:
+    print("NO")
+def render_svg(svg_file):
+    with open(svg_file, "r") as f:
+        lines = f.readlines()
+    svg = "".join(lines)
+
+    # """Renders the given svg string."""
+    b64 = base64.b64encode(svg.encode("utf-8")).decode("utf-8")
+    html = r'<img src="data:image/svg+xml;base64,%s"/>' % b64
+    return html
+
+
 # ENTITY MATCHING PART
 st.header("Entity matching")
 st.markdown("**Named entity recognition** (NER) is the task of identifying and categorising key information ("
@@ -376,23 +416,67 @@ st.markdown("**Named entity recognition** (NER) is the task of identifying and c
 with st.spinner("Calculating and matching entities..."):
     entity_match_html = highlight_entities(selected_article)
     st.write(entity_match_html, unsafe_allow_html=True)
+    red_text = """<font color="black"><span style="background-color: rgb(238, 135, 135); opacity:
+    1;">red</span></font> """
+    green_text = """<font color="black">
+    <span style="background-color: rgb(121, 236, 121); opacity: 1;">green</span>
+    </font>"""
+
+    markdown_start_red = "<mark class=\"entity\" style=\"background: rgb(238, 135, 135);\">"
+    markdown_start_green = "<mark class=\"entity\" style=\"background: rgb(121, 236, 121);\">"
+    st.markdown("Here you can see what this looks like when we apply entity-matching on the summary (compared to the "
+                "original article). Entities in this summary are marked " + green_text + " when the entity also "
+                "exists in the article, while unmatched entities are marked " + red_text + ".",
+                unsafe_allow_html=True)
+    entity_specific_text = fetch_entity_specific_contents(selected_article)
+    st.markdown(entity_specific_text)
 
 # DEPENDENCY PARSING PART
 st.header("Dependency comparison")
+st.markdown("**Dependency parsing** is the process in which the grammatical structure in a sentence is analysed, "
+            "to find out related words as well as the type of the relationship between them. For the sentence “Jan’s "
+            "wife is called Sarah” you would get the following dependency graph:")
+
+# TODO: I wonder why the first doesn't work but the second does (it doesn't show deps otherwise)
+# st.image("ExampleParsing.svg")
+st.write(render_svg('ExampleParsing.svg'), unsafe_allow_html=True)
+st.markdown("Here, “Jan” is the “poss” (possession modifier) of “wife”. If suddenly the summary would read “Jan’s "
+            "husband…”, there would be a dependency in the summary that is non-existent in the article itself. "
+            "However, it could be that such a new dependency is not per se correct, “The borders of Ukraine” have a "
+            "different dependency between “borders” and “Ukraine” than “Ukraine’s borders”, while this would also be "
+            "correct. So general matching between summary and article wont work.")
+st.markdown("There is however a simple method that we found has potential in post-processing. Based on empirical "
+            "results, we have found that when there are specific kinds of dependencies in the summary that are not in "
+            "the article, these specific types are often an indication of a wrongly constructed sentence. Let’s take "
+            "a look at an example:")
 with st.spinner("Doing dependency parsing..."):
-    render_dependency_parsing(check_dependency(False))
-    # Results
-    # if 'results' in st.session_state and st.session_state.results:
-    #     first = True
-    #     for result in st.session_state.results[::-1]:
-    #         if not first:
-    #             st.markdown("---")
-    #         st.markdown(f"Text:\n> {result['text']}")
-    #         col_1, col_2, col_3 = st.columns([1,2,2])
-    #         col_1.metric(label='', value=f"{result['emoji']}")
-    #         col_2.metric(label='Label', value=f"{result['label']}")
-    #         col_3.metric(label='Score', value=f"{result['score']:.3f}")
-    #         st.markdown(f"Token Attribution:\n{result['tokens_with_background']}",
-    #                     unsafe_allow_html=True)
-    #         st.caption(f"Model: {result['model_name']}")
-    #         first = False
+    summary_deps = check_dependency(False)
+    article_deps = check_dependency(True)
+    total_unmatched_deps = []
+    for summ_dep in summary_deps:
+        if not any(summ_dep['identifier'] in art_dep['identifier'] for art_dep in article_deps):
+            total_unmatched_deps.append(summ_dep)
+    # print(f'ALL UNMATCHED DEPS ARE: {total_unmatched_deps}')
+    # render_dependency_parsing(check_dependency(False))
+    if total_unmatched_deps:
+        for current_drawing_list in total_unmatched_deps:
+            render_dependency_parsing(current_drawing_list)
+    dep_spec_text = fetch_dependency_specific_contents(selected_article)
+    st.markdown(dep_spec_text)
+    soup = BeautifulSoup("Example text option with box", features="html.parser")
+    HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem;
+    margin-bottom: 2.5rem">{}</div> """
+    st.write(HTML_WRAPPER.format(soup), unsafe_allow_html=True)
+
+# OUTRO/CONCLUSION
+st.header("Wrapping up")
+st.markdown("We have presented 2 methods that try to improve summaries via post-processing steps. Entity matching can "
+            "be used to solve hallucinations, while checking if specific dependencies are matched between summary and "
+            "article can be used to filter out some bad sentences (and thus worse summaries). Of course these are "
+            "only basic methods which were empirically tested, but they are a start at actually making something good "
+            "(???). (something about that we tested also RE and maybe other things).")
st.markdown("####")
+st.markdown("Now based on these methods you can check summaries and whether they are “good” or “bad”. Below you can "
+            "generate 5 different kind of summaries for the starting article (based on different model params) in "
+            "which their ranks are estimated, and hopefully the best summary (read: the one that a human would prefer "
+            "or indicate as the best one) will be at the top.")
custom_renderer.py CHANGED
@@ -1,10 +1,12 @@
 from typing import Dict, Any
 
+import numpy as np
 import spacy
 from PIL import ImageFont
 
 from spacy.tokens import Doc
 
+
 def get_pil_text_size(text, font_size, font_name):
     font = ImageFont.truetype(font_name, font_size)
     size = font.getsize(text)
@@ -32,8 +34,8 @@ def render_arrow(
     <path class="displacy-arrowhead" d="{head}" fill="red"/>
     </g>
     """
-    arc = get_arc(start + 20, 50, 5, end + 20)
-    arrowhead = get_arrowhead(direction, start + 20, 50, end + 20)
+    arc = get_arc(start + 10, 50, 5, end + 10)
+    arrowhead = get_arrowhead(direction, start + 10, 50, end + 10)
     label_side = "right" if direction == "rtl" else "left"
     return TPL_DEP_ARCS.format(
         id=0,
@@ -77,7 +79,7 @@ def get_arrowhead(direction: str, x: int, y: int, end: int) -> str:
 
 
 # parsed = [{'words': [{'text': 'The', 'tag': 'DET', 'lemma': None}, {'text': 'OnePlus', 'tag': 'PROPN', 'lemma': None}, {'text': '10', 'tag': 'NUM', 'lemma': None}, {'text': 'Pro', 'tag': 'PROPN', 'lemma': None}, {'text': 'is', 'tag': 'AUX', 'lemma': None}, {'text': 'the', 'tag': 'DET', 'lemma': None}, {'text': 'company', 'tag': 'NOUN', 'lemma': None}, {'text': "'s", 'tag': 'PART', 'lemma': None}, {'text': 'first', 'tag': 'ADJ', 'lemma': None}, {'text': 'flagship', 'tag': 'NOUN', 'lemma': None}, {'text': 'phone.', 'tag': 'NOUN', 'lemma': None}], 'arcs': [{'start': 0, 'end': 3, 'label': 'det', 'dir': 'left'}, {'start': 1, 'end': 3, 'label': 'nmod', 'dir': 'left'}, {'start': 1, 'end': 2, 'label': 'nummod', 'dir': 'right'}, {'start': 3, 'end': 4, 'label': 'nsubj', 'dir': 'left'}, {'start': 5, 'end': 6, 'label': 'det', 'dir': 'left'}, {'start': 6, 'end': 10, 'label': 'poss', 'dir': 'left'}, {'start': 6, 'end': 7, 'label': 'case', 'dir': 'right'}, {'start': 8, 'end': 10, 'label': 'amod', 'dir': 'left'}, {'start': 9, 'end': 10, 'label': 'compound', 'dir': 'left'}, {'start': 4, 'end': 10, 'label': 'attr', 'dir': 'right'}], 'settings': {'lang': 'en', 'direction': 'ltr'}}]
-def render_sentence_custom(parsed: str):
+def render_sentence_custom(unmatched_list: Dict):
     TPL_DEP_WORDS = """
     <text class="displacy-token" fill="currentColor" text-anchor="start" y="{y}">
     <tspan class="displacy-word" fill="currentColor" x="{x}">{text}</tspan>
@@ -89,43 +91,94 @@ def render_sentence_custom(parsed: str):
     <svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xml:lang="{lang}" id="{id}" class="displacy" width="{width}" height="{height}" direction="{dir}" style="max-width: none; height: {height}px; color: {color}; background: {bg}; font-family: {font}; direction: {dir}">{content}</svg>
     """
     arcs_svg = []
-    couples = []
-    nlp = spacy.load('en_core_web_sm')
-    doc = nlp(parsed)
-    arcs = {}
-    words = {}
-    parsed = [parse_deps(doc)]
-    for i, p in enumerate(parsed):
-        arcs = p["arcs"]
-        words = p["words"]
-    for i, a in enumerate(arcs):
-        if a["label"] == "amod":
-            couples = (a["start"], a["end"])
+    nlp = spacy.load('en_core_web_lg')
+    doc = nlp(unmatched_list["sentence"])
+    # words = {}
+    # unmatched_list = [parse_deps(doc)]
+    # #print(parsed)
+    # for i, p in enumerate(unmatched_list):
+    #     arcs = p["arcs"]
+    #     words = p["words"]
+    # for i, a in enumerate(arcs):
+    #     #CHECK CERTAIN DEPS (ALSO ADD/CHANGE BELOW WHEN CHANGING HERE)
+    #     if a["label"] == "amod":
+    #         couples = (a["start"], a["end"])
+    #     elif a["label"] == "pobj":
+    #         couples = (a["start"], a["end"])
+    #     #couples = (3,5)
+    #
+    # x_value_counter = 10
+    # index_counter = 0
+    # svg_words = []
+    # coords_test = []
+    # for i, word in enumerate(words):
+    #     word = word["text"]
+    #     word = word + " "
+    #     pixel_x_length = get_pil_text_size(word, 16, 'arial.ttf')[0]
+    #     svg_words.append(TPL_DEP_WORDS.format(text=word, tag="", x=x_value_counter, y=70))
+    #     if index_counter >= couples[0] and index_counter <= couples[1]:
+    #         coords_test.append(x_value_counter)
+    #         x_value_counter += 50
+    #     index_counter += 1
+    #     x_value_counter += pixel_x_length + 4
+    # for i, a in enumerate(arcs):
+    #     if a["label"] == "amod":
+    #         arcs_svg.append(render_arrow(a["label"], coords_test[0], coords_test[-1], a["dir"], i))
+    #     elif a["label"] == "pobj":
+    #         arcs_svg.append(render_arrow(a["label"], coords_test[0], coords_test[-1], a["dir"], i))
+    #
+    # content = "".join(svg_words) + "".join(arcs_svg)
+    #
+    # full_svg = TPL_DEP_SVG.format(
+    #     id=0,
+    #     width=1200, #600
+    #     height=250, #125
+    #     color="#00000",
+    #     bg="#ffffff",
+    #     font="Arial",
+    #     content=content,
+    #     dir="ltr",
+    #     lang="en",
+    # )
 
     x_value_counter = 10
     index_counter = 0
     svg_words = []
+    words = unmatched_list["sentence"].split(" ")
     coords_test = []
-    for i, word in enumerate(words):
-        word = word["text"]
+    #print(unmatched_list)
+    #print(words)
+    #print("NOW")
+    direction_current = "rtl"
+    if unmatched_list["cur_word_index"] < unmatched_list["target_word_index"]:
+        min_index = unmatched_list["cur_word_index"]
+        max_index = unmatched_list["target_word_index"]
+        direction_current = "left"
+    else:
+        max_index = unmatched_list["cur_word_index"]
+        min_index = unmatched_list["target_word_index"]
+    for i, token in enumerate(doc):
+        word = str(token)
         word = word + " "
         pixel_x_length = get_pil_text_size(word, 16, 'arial.ttf')[0]
         svg_words.append(TPL_DEP_WORDS.format(text=word, tag="", x=x_value_counter, y=70))
-        if index_counter >= couples[0] and index_counter <= couples[1]:
+        if min_index <= index_counter <= max_index:
            coords_test.append(x_value_counter)
-            x_value_counter += 50
+            if index_counter < max_index - 1:
+                x_value_counter += 50
        index_counter += 1
        x_value_counter += pixel_x_length + 4
-    for i, a in enumerate(arcs):
-        if a["label"] == "amod":
-            arcs_svg.append(render_arrow(a["label"], coords_test[0], coords_test[-1], a["dir"], i))
+
+    # TODO: DYNAMIC DIRECTION MAKING (SHOULD GIVE WITH DICT I THINK)
+    #print(coords_test)
+    arcs_svg.append(render_arrow(unmatched_list['dep'], coords_test[0], coords_test[-1], direction_current, i))
 
     content = "".join(svg_words) + "".join(arcs_svg)
 
     full_svg = TPL_DEP_SVG.format(
         id=0,
-        width=1975,
-        height=574.5,
+        width=1200, # 600
+        height=75, # 125
        color="#00000",
        bg="#ffffff",
        font="Arial",
@@ -133,9 +186,9 @@ def render_sentence_custom(parsed: str):
         dir="ltr",
         lang="en",
     )
-
     return full_svg
 
+
 def parse_deps(orig_doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
     """Generate dependency parse in {'words': [], 'arcs': []} format.
 
@@ -196,8 +249,9 @@ def parse_deps(orig_doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
     )
     return {"words": words, "arcs": arcs, "settings": get_doc_settings(orig_doc)}
 
+
 def get_doc_settings(doc: Doc) -> Dict[str, Any]:
     return {
         "lang": doc.lang_,
         "direction": doc.vocab.writing_system.get("direction", "ltr"),
-    }
+    }
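render_sentence_custom() above spaces the words of the sentence by measuring each word's pixel width with PIL and advancing a running x offset, then draws a single arc between the two words of the unmatched dependency. A minimal sketch of that spacing idea (arial.ttf must be resolvable by PIL and the sentence is an arbitrary example; note that newer Pillow releases replace font.getsize() with font.getbbox()/getlength()):

    from PIL import ImageFont

    font = ImageFont.truetype("arial.ttf", 16)

    x = 10
    positions = []
    for word in "Jan 's wife is called Sarah".split():
        positions.append((word, x))            # x coordinate used for this word in the SVG
        x += font.getsize(word + " ")[0] + 4   # advance by the measured width plus padding
    print(positions)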
dependency-specific-text/article11.txt ADDED
@@ -0,0 +1,4 @@
+One of the dependencies that, when found in the summary but not in the article, indicates a possible error is the "pobj" (object of preposition) dependency.
+In the image above, you can see the unmatched dependency that is found in the summary but not present in the article. For the "pobj" dependency, we only check matches when the target word is "in", as it is here. U.S. is the entity here.
+For this specific example, it's obvious that the dependency of "in U.S." is not found in the article: the entity matching paragraph already shows that U.S. is a hallucinated entity that doesn't occur in the article itself,
+so technically we don't need dependency comparison here to spot this particular error.
dependency-specific-text/article13.txt ADDED
@@ -0,0 +1,3 @@
+One of the dependencies that, when found in the summary but not in the article, indicates a possible error is the "amod" (adjectival modifier) dependency.
+In the image above, you can see the unmatched dependency that is found in the summary but not present in the article. "First" is the entity here, and it's the adjectival modifier of the word "phone".
+However, this sentence is not factual, since the article talks about a **new** type of flagship phone, and not at all the **first** flagship phone. This is wrong, and the error was found by filtering on this specific kind of dependency.
dependency-specific-text/example.txt ADDED
@@ -0,0 +1 @@
+This is an example explanation.
entity-specific-text/article11.txt ADDED
@@ -0,0 +1,3 @@
+For this summary, there are 2 unmatched entities: "The Mark Levinson" and "U.S". The first one
+is not actually a real error per se, but rather a "the" before "Mark Levinson" (TODO EXPLAIN BIT BETTER).
+The "U.S." however is a hallucinated entity not present in the article, and via this method this can be found.
entity-specific-text/article13.txt ADDED
@@ -0,0 +1,2 @@
+For this summary, there are 2 unmatched entities: "January 18" and "U.S". January 18 is indeed a hallucinated entity, as there is no sentence containing this exact date. U.S. does occur in the article, but as "US" instead of "U.S.". This can be solved
+by comparing to a list of abbreviations (of embeddings :TODO?)
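The abbreviation comparison mentioned above could be as simple as normalising entity strings before matching, so that "U.S." and "US" compare equal. A small sketch of that idea; the normalise_entity helper and its mapping are hypothetical and not part of this commit:

    def normalise_entity(entity: str) -> str:
        # Strip periods and case, then map known abbreviations to a canonical
        # form (hypothetical example table).
        stripped = entity.replace(".", "").strip().lower()
        abbreviations = {"us": "united states", "uk": "united kingdom"}
        return abbreviations.get(stripped, stripped)

    article_entities = {normalise_entity(e) for e in ["US", "OnePlus"]}
    print(normalise_entity("U.S.") in article_entities)  # True: "U.S." now matches "US"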
sample-articles/article11.txt ADDED
@@ -0,0 +1,13 @@
+Not so long ago, the internet was debating what the folks at Apple were thinking charging $549 for the AirPods Max. At CES 2022, luxury audio brand Mark Levinson would like a word. The Harman-owned company (which also owns AKG, JBL and Harman Kardon, and is itself a Samsung subsidiary) mostly known for its high-end home and car audio systems has announced its first wireless headphones: the No. 5909. While they offer everything you'd expect from a premium set, they have a nearly $1,000 price tag that only a select few might commit to.
+
+The over-ear No. 5909 packs 40mm Beryllium drivers "expertly tuned to the Harman curve." The company explains that "the Harman curve" is acoustic response that it says has taken decades of research to construct. The result here is "incredible acoustic performance" in a set of "reference class" wireless headphones. Mark Levinson says that audio performance meets the guidelines for Hi-Res Audio certification thanks to 24-bit/96kHz signal processing and 40kHz acoustic response. The No. 5909 supports LDAC, AAC and aptX Adaptive wireless codecs via Bluetooth 5.1.
+
+Mark Levinson promises you'll hear details you haven't before, like "the slightest breath an artist takes" or "a hidden harmony." The company explains that the same "world-class sound engineers" that built the luxury brand's amps, turntables and streaming players are behind the tuning of the ultra pricey No. 5909.
+
+Mark Levinson/Harman
+
+Sound quality isn't the only consideration though. The No. 5909 has adaptive active noise cancellation (ANC) with three modes "for premium sound isolation" and an Ambient Aware feature that lets you tune into your surroundings as needed. The company also packed in four microphones for calls that are equipped with a so-called Smart Wind Adaption feature. The materials used to make the headphones are also better than the mostly plastic sets we typically see. The No. 5909 is built with an aluminum frame, painted metallic earcups, leather headband and replaceable leather ear cushions. An included hard shell travel case comes stocked with a USB-C charging cable, USB-C to USB-A adaptor, two USB-C to 3.5mm cables, 3.5mm to 6.3mm adaptor, airplane adaptor and a polishing cloth. Basically, it's everything you'd need to use the headphones on any setup — wired, wireless or while traveling.
+
+Mark Levinson says you can expect up to 30 hours of use with adaptive ANC active and up to 34 hours with the feature disabled. A quick-charge feature will give you up to six hours of play time in 15 minutes. Via an app for Android and iOS, you'll get some control over the headphones, but the company didn't go into specifics there.
+
+The No. 5909 will be available in black, pewter and red color options starting today for $999.
sample-summaries/article11.txt ADDED
@@ -0,0 +1 @@
+The Mark Levinson No. 5909 is the company's first wireless headphones. It's "reference class" and comes in black, pewter and red color options.. The headphones start at $999 and will be available starting today in the U.S. A quick-charge feature will give you up to six hours of play time in 15 minutes, the company says, via an app for Android and iOS.The company also packed in four microphones for calls that are equipped with a so-called Smart Wind Adaption feature., via Bluetooth 5.1.
sample-summaries/article13.txt CHANGED
@@ -1 +1 @@
-The OnePlus 10 Pro is the company's first flagship phone. It's the result of a merger between OnePlus and Oppo, which will be called "SuperVOOC" The phone is launching in China first on January 11. There's also no word on a US release date yet. The 10 Pro will have a 6.7-inch display and three cameras on the back. We don't have a price yet, but OnePlus' flagship prices have gone up every year so far, and the 9 Pro was $969.The phone will go on sale January 11 in China and January 18 in the U.S.
+The OnePlus 10 Pro is the company's first flagship phone. It's the result of a merger between OnePlus and Oppo, which will be called "SuperVOOC" The phone is launching in China first on January 11. There's also no word on a US release date yet. The 10 Pro will have a 6.7-inch display and three cameras on the back. We don't have a price yet, but OnePlus' flagship prices have gone up every year so far, and the 9 Pro was $969. The phone will go on sale January 11 in China and January 18 in the U.S.