Spaces:
Runtime error
Runtime error
from collections import defaultdict | |
from itertools import count | |
from operator import itemgetter | |
from pathlib import Path | |
from typing import Dict, Optional | |
from typing import List, Tuple, Union | |
import htbuilder | |
import streamlit as st | |
from htbuilder import span, div, script, style, link, styles, HtmlElement, br | |
from htbuilder.units import px | |
from spacy.tokens import Doc | |
# Highlight colours; get_color() cycles through them by sentence index.
palette = [
    "#66c2a5", "#fc8d62", "#8da0cb", "#e78ac3",
    "#a6d854", "#ffd92f", "#e5c494", "#b3b3b3",
]
# Colour for inactive elements (not referenced by the code visible in this file).
inactive_color = "#BBB"
def local_stylesheet(path):
    """Return a ``<style>`` element containing the text of the CSS file at *path*.

    Args:
        path: Location of the stylesheet (anything ``open`` accepts).

    Returns:
        An htbuilder ``style`` element wrapping the file's contents.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(path, encoding="utf-8") as f:
        css = f.read()
    return style()(css)
def remote_stylesheet(url):
    """Return a ``<link>`` element that loads the stylesheet located at *url*.

    Args:
        url: Address of the CSS resource.

    Returns:
        An htbuilder ``link`` element with ``rel="stylesheet"`` and ``href=url``.
    """
    # Without rel="stylesheet" the element creates no link and the browser
    # never fetches or applies the CSS.
    return link(rel="stylesheet", href=url)
def local_script(path):
    """Return a ``<script>`` element containing the text of the file at *path*.

    Args:
        path: Location of the script file (anything ``open`` accepts).

    Returns:
        An htbuilder ``script`` element wrapping the file's contents.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(path, encoding="utf-8") as f:
        code = f.read()
    return script()(code)
def remote_script(url):
    """Return a ``<script>`` element whose ``src`` attribute is *url*."""
    attrs = {"src": url}
    return script(**attrs)
def get_color(sent_idx):
    """Return the palette colour for *sent_idx*, wrapping around when the index exceeds the palette size."""
    slot = sent_idx % len(palette)
    return palette[slot]
def hex_to_rgb(hex):
    """Convert a hex colour string into an ``(r, g, b)`` tuple of ints.

    Args:
        hex: Colour such as ``"#66c2a5"``; any ``#`` characters are ignored.
            Three-digit shorthand (``"#BBB"``) is expanded to six digits
            before parsing. The parameter name shadows the builtin but is
            kept so keyword callers keep working.

    Returns:
        Tuple of three ints parsed from the first six hex digits.

    Raises:
        ValueError: If the digits cannot be parsed as hexadecimal.
    """
    digits = hex.replace("#", "")
    if len(digits) == 3:
        # CSS shorthand: each digit stands for a doubled pair ("fc0" -> "ffcc00").
        digits = "".join(ch * 2 for ch in digits)
    return tuple(int(digits[i:i + 2], 16) for i in (0, 2, 4))
def color_with_opacity(hex_color, opacity):
    """Format *hex_color* as a CSS ``rgba(r,g,b,a)`` string, with *opacity* printed to two decimals."""
    channels = hex_to_rgb(hex_color)
    red, green, blue = channels[0], channels[1], channels[2]
    return f"rgba({red},{green},{blue},{opacity:.2f})"
class Component:
    """Base class for HTML components that are displayed through Streamlit."""

    def show(self, width=None, height=None, scrolling=True, **kwargs):
        """Render the component via ``st.components.v1.html``.

        Args:
            width: Forwarded to ``st.components.v1.html``.
            height: Forwarded to ``st.components.v1.html``.
            scrolling: Forwarded to ``st.components.v1.html``.
            **kwargs: CSS properties passed to ``styles`` for the wrapping ``div``.
        """
        out = div(style=styles(
            **kwargs
        ))(self.html())
        html = str(out)
        st.components.v1.html(html, width=width, height=height, scrolling=scrolling)

    def html(self):
        """Return the component's content; subclasses must override this.

        Raises:
            NotImplementedError: Always, in the base class.
        """
        # ``NotImplemented`` is a comparison sentinel, not an exception; raising
        # it produces a confusing TypeError instead of the intended error.
        raise NotImplementedError
class MainView(Component):
    """Side-by-side view of a document and its summaries with alignment annotations.

    For every summary a separate copy of the document is rendered (only the
    one for the first summary is marked visible), followed by the list of
    summaries. Semantic alignments become background highlights; lexical
    alignments become stacked underlines produced by ``MultiUnderline``.
    """

    def __init__(
            self,
            document: Doc,
            summaries: List[Doc],
            semantic_alignments: Optional[List[Dict]],
            lexical_alignments: Optional[List[Dict]],
            layout: str,
            scroll: bool,
            gray_out_stopwords: bool
    ):
        """Store the inputs used by ``html``.

        Args:
            document: Document whose tokens are rendered in the left-hand view.
            summaries: Summaries rendered in the summary list.
            semantic_alignments: One dict per summary mapping a summary token
                index to ``(doc_token_idx, similarity)`` pairs, or ``None``.
            lexical_alignments: One dict per summary mapping a
                ``(summary_span_start, summary_span_end)`` pair to
                ``(doc_span_start, doc_span_end)`` pairs, or ``None``.
            layout: Inserted into the container class as ``"{layout}-layout"``.
            scroll: Whether to add the ``scroll`` class to the document and
                summary-list containers.
            gray_out_stopwords: Whether stopword/punctuation tokens also get the
                ``grayed-out`` class.
        """
        self.document = document
        self.summaries = summaries
        self.semantic_alignments = semantic_alignments
        self.lexical_alignments = lexical_alignments
        self.layout = layout
        self.scroll = scroll
        self.gray_out_stopwords = gray_out_stopwords

    def html(self):
        """Build the complete view.

        Returns:
            A list mixing raw HTML strings (CDN links/scripts) and htbuilder
            elements (local stylesheet, body, local scripts), in page order.
        """
        # Add document elements
        if self.document._.name == 'Document':
            document_name = 'Source Document'
        else:
            document_name = self.document._.name + ' summary'
        doc_header = div(
            id_="document-header"
        )(
            document_name
        )

        # Add document content, which comprises multiple elements, one for each summary. Only the element
        # corresponding to the selected summary will be visible.
        mu = MultiUnderline()
        doc_elements = self._document_elements(mu)

        summary_title = "Summary"
        summary_header = div(
            id_="summary-header"
        )(
            summary_title,
            div(id="summary-header-gap"),
        )
        summary_items = self._summary_items(mu)

        classes = ["summary-list", "bordered"]
        if self.scroll:
            classes.append("scroll")
        if self.lexical_alignments is not None:
            classes.append("has-lexical-alignment")
        if self.semantic_alignments is not None:
            classes.append("has-semantic-alignment")
        summary_list = div(
            _class=" ".join(classes)
        )(
            summary_items
        )

        annotation_key = \
            """
<ul class="annotation-key">
<li class="annotation-key-label">Annotations:</li>
<li id="option-lexical" class="option selected">
<span class="annotation-key-ngram">N-Gram overlap</span>
</li>
<li id="option-semantic" class="option selected">
<span class="annotation-key-semantic">Semantic overlap</span>
</li>
<li id="option-novel" class="option selected">
<span class="annotation-key-novel">Novel words</span>
</li>
<li id="option-entity" class="option selected">
<span class="annotation-key-entity">Novel entities</span>
</li>
</ul>
"""

        body = div(
            annotation_key,
            div(
                _class=f"vis-container {self.layout}-layout"
            )(
                div(
                    _class="doc-container"
                )(
                    doc_header,
                    *doc_elements
                ),
                div(
                    _class="summary-container"
                )(
                    summary_header,
                    summary_list
                )
            ),
        )

        resources = Path(__file__).parent / "resources"
        # NOTE(review): both the Bootstrap 4.6.0 and 5.0.1 JS bundles are loaded
        # below; confirm whether both are really needed.
        return [
            """<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.0.1/dist/css/bootstrap.min.css" rel="stylesheet" integrity="sha384-+0n0xVW2eSR5OomGNYDnhzAbDsOXxcvSN1TPprVMTNDbiYZCxYbOOl7+AMvyTG2x" crossorigin="anonymous">""",
            local_stylesheet(resources / "summvis.css"),
            """<link rel="preconnect" href="https://fonts.gstatic.com">
<link href="https://fonts.googleapis.com/css2?family=Roboto:wght@400;500&display=swap" rel="stylesheet">""",
            body,
            """<script
src="https://code.jquery.com/jquery-3.5.1.min.js"
integrity="sha256-9/aliU8dGd2tb6OSsuzixeV4y/faTqgFtohetphbbj0="
crossorigin="anonymous"></script>
<script src="https://cdn.jsdelivr.net/npm/bootstrap@4.6.0/dist/js/bootstrap.bundle.min.js"
integrity="sha384-Piv4xVNRyMGpqkS2by6br4gNJ7DXjqk09RmUpJ8jgGtD7zP9yug3goQfGII0yAns"
crossorigin="anonymous"></script>""",
            local_script(resources / "jquery.color-2.1.2.min.js"),
            local_script(resources / "summvis.js"),
            """<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.0.1/dist/js/bootstrap.bundle.min.js" integrity="sha384-gtEjrD/SeCtmISkJkNUaaKMoLD0//ElJ19smozuHV6z3Iehds+3Ulb9Bn9Plx0x4" crossorigin="anonymous"></script>"""
        ]

    @staticmethod
    def _token_to_sentence_index(summary):
        """Map each token index (``token.i``) of *summary* to the index of its sentence."""
        return {
            token.i: sent_idx
            for sent_idx, sent in enumerate(summary.sents)
            for token in sent
        }

    def _stopword_element(self, text):
        """Wrap a stopword/punctuation token in a ``stopword`` span (optionally grayed out)."""
        # NOTE(review): token text is inserted as-is; raw HTML strings are also
        # passed as children elsewhere in this class, so htbuilder presumably
        # does not escape children. Confirm whether token text needs escaping.
        classes = ["stopword"]
        if self.gray_out_stopwords:
            classes.append("grayed-out")
        return span(
            _class=" ".join(classes)
        )(
            text
        )

    def _document_elements(self, mu):
        """Render one annotated copy of the document per summary.

        Args:
            mu: ``MultiUnderline`` instance used to draw lexical-alignment underlines.

        Returns:
            List of ``div`` elements with class ``doc``, one per summary.
        """
        doc_elements = []
        for summary_idx, summary in enumerate(self.summaries):
            token_idx_to_sent_idx = self._token_to_sentence_index(summary)
            is_selected_summary = (summary_idx == 0)  # By default, first summary is selected

            # Invert the semantic alignment: doc token -> [(summary token, similarity), ...]
            doc_token_idx_to_matches = defaultdict(list)
            if self.semantic_alignments is not None:
                semantic_alignment = self.semantic_alignments[summary_idx]
                for summary_token_idx, matches in semantic_alignment.items():
                    for doc_token_idx, sim in matches:
                        doc_token_idx_to_matches[doc_token_idx].append((summary_token_idx, sim))

            token_elements = []
            for doc_token_idx, doc_token in enumerate(self.document):
                if doc_token.is_stop or doc_token.is_punct:
                    el = self._stopword_element(doc_token.text)
                else:
                    # .get() so that lookups never insert keys into the defaultdict
                    matches = doc_token_idx_to_matches.get(doc_token_idx)
                    if matches:
                        top_summary_token_idx, top_sim = max(matches, key=itemgetter(1))
                        color_primary = get_color(token_idx_to_sent_idx[top_summary_token_idx])
                        # Clamp at 0, as the summary side does, so that a negative
                        # similarity never yields a negative alpha value.
                        highlight_color_primary = color_with_opacity(color_primary, max(top_sim, 0))
                        props = {
                            'data-highlight-id': str(doc_token_idx),
                            'data-primary-color': highlight_color_primary
                        }
                        match_classes = []
                        for summary_token_idx, sim in matches:
                            sent_idx = token_idx_to_sent_idx[summary_token_idx]
                            match_classes.append(f"summary-highlight-{summary_idx}-{summary_token_idx}")
                            color = color_with_opacity(get_color(sent_idx), max(sim, 0))
                            props[f"data-color-{summary_idx}-{summary_token_idx}"] = color
                        props["data-match-classes"] = " ".join(match_classes)
                        el = self._highlight(
                            doc_token.text,
                            highlight_color_primary,
                            color_primary,
                            match_classes + ["annotation-hidden"],
                            **props
                        )
                    else:
                        el = doc_token.text
                token_elements.append(el)

            # One underline span per aligned document span, coloured by the
            # sentence of the summary span it matches.
            spans = []
            if self.lexical_alignments is not None:
                lexical_alignment = self.lexical_alignments[summary_idx]
                for summary_span, doc_spans in lexical_alignment.items():
                    summary_span_start, summary_span_end = summary_span
                    span_id = f"{summary_idx}-{summary_span_start}-{summary_span_end}"
                    sent_idx = token_idx_to_sent_idx[summary_span_start]
                    for doc_span_start, doc_span_end in doc_spans:
                        spans.append((
                            doc_span_start,
                            doc_span_end,
                            sent_idx,
                            get_color(sent_idx),
                            span_id
                        ))
            token_elements = mu.markup(token_elements, spans)

            classes = ["main-doc", "bordered"]
            if self.scroll:
                classes.append("scroll")
            # No trailing comma here: main_doc must be the element itself, not a 1-tuple.
            main_doc = div(
                _class=" ".join(classes)
            )(
                token_elements
            )

            classes = ["doc"]
            if is_selected_summary:
                classes.append("display")
            else:
                classes.append("nodisplay")
            doc_elements.append(
                div(
                    **{
                        "class": " ".join(classes),
                        "data-index": summary_idx
                    }
                )(
                    main_doc,
                    div(_class="proxy-doc"),
                    div(_class="proxy-scroll")
                )
            )
        return doc_elements

    def _summary_items(self, mu):
        """Render every summary as an annotated ``summary-item`` element.

        Args:
            mu: ``MultiUnderline`` instance used to draw lexical-alignment underlines.

        Returns:
            List of ``div`` elements with class ``summary-item``, one per summary.
        """
        summary_items = []
        for summary_idx, summary in enumerate(self.summaries):
            token_idx_to_sent_idx = self._token_to_sentence_index(summary)

            spans = []
            matches_ngram = [False] * len(summary)
            if self.lexical_alignments is not None:
                lexical_alignment = self.lexical_alignments[summary_idx]
                for start, end in lexical_alignment:
                    matches_ngram[start:end] = [True] * (end - start)
                    span_id = f"{summary_idx}-{start}-{end}"
                    sent_idx = token_idx_to_sent_idx[start]
                    spans.append((
                        start,
                        end,
                        sent_idx,
                        get_color(sent_idx),
                        span_id
                    ))

            if self.semantic_alignments is not None:
                semantic_alignment = self.semantic_alignments[summary_idx]
            else:
                semantic_alignment = {}

            token_elements = []
            for token_idx, token in enumerate(summary):
                if token.is_stop or token.is_punct:
                    el = self._stopword_element(token.text)
                else:
                    classes = []
                    if token.ent_iob_ in ('I', 'B'):
                        classes.append("entity")
                    if matches_ngram[token_idx]:
                        classes.append("matches-ngram")
                    matches = semantic_alignment.get(token_idx)
                    if matches:
                        top_doc_token_idx, top_sim = max(matches, key=itemgetter(1))
                        top_sim = max(top_sim, 0)  # negative similarity is shown as fully transparent
                        props = {
                            "data-highlight-id": f"{summary_idx}-{token_idx}",
                            "data-top-doc-highlight-id": str(top_doc_token_idx),
                            "data-top-doc-sim": f"{top_sim:.2f}",
                        }
                        classes.extend([
                            "annotation-hidden",
                            f"summary-highlight-{summary_idx}-{token_idx}"
                        ])
                        sent_idx = token_idx_to_sent_idx[token_idx]
                        el = self._highlight(
                            token.text,
                            color_with_opacity(get_color(sent_idx), top_sim),
                            color_with_opacity(get_color(sent_idx), 1),
                            classes,
                            **props
                        )
                    elif classes:
                        el = span(_class=" ".join(classes))(token.text)
                    else:
                        el = token.text
                token_elements.append(el)

            token_elements = mu.markup(token_elements, spans)

            classes = ["summary-item"]
            if summary_idx == 0:  # Default is for first summary to be selected
                classes.append("selected")
            summary_items.append(
                div(
                    **{"class": ' '.join(classes), "data-index": summary_idx}
                )(
                    div(_class="name")(summary._.name),
                    div(_class="content")(token_elements)
                )
            )
        return summary_items

    def _highlight(
            self,
            token: Union[str, HtmlElement],
            background_color,
            dotted_underline_color,
            classes: List[str],
            **props
    ):
        """Wrap *token* in a ``highlight`` span.

        Args:
            token: Text or element to wrap.
            background_color: CSS value for the span's background colour.
            dotted_underline_color: CSS colour of the 4px dotted bottom border.
            classes: Class names; ``"highlight"`` is appended to them.
            **props: Extra attributes set on the span (e.g. ``data-*`` values).

        Returns:
            The htbuilder ``span`` element.
        """
        return span(
            _class=" ".join(classes + ["highlight"]),
            style=styles(
                background_color=background_color,
                border_bottom=f"4px dotted {dotted_underline_color}",
            ),
            **props
        )(token)
SPACE = " "


class MultiUnderline:
    """Draws stacked coloured underlines beneath tokens for possibly overlapping spans."""

    def __init__(
            self,
            underline_thickness=3,
            underline_spacing=1
    ):
        """Store the underline thickness (px) and the spacing (px) between stacked underlines."""
        self.underline_thickness = underline_thickness
        self.underline_spacing = underline_spacing

    def markup(
            self,
            tokens: List[Union[str, HtmlElement]],
            spans: List[Tuple[int, int, int, str, str]]
    ):
        """Style text with multiple layers of colored underlines.

        Tokens are emitted in order with a ``SPACE`` element between them; a
        token equal to a newline string is emitted as ``<br>`` and no space is
        put in front of the token that follows it.

        Args:
            tokens: list of tokens, either string or html element
            spans: list of (start_pos, end_pos, rank, color, id) tuples defined as:
                start_pos: start position of underline span
                end_pos: end position of underline span (exclusive: a span is
                    dropped once the current position reaches it)
                rank: rank for stacking order of underlines, all else being equal
                color: color of underline
                id: id of underline (encoded as a class label in resulting html element)
        Returns:
            List of HTML elements
        """
        # Group the spans by the position at which they begin.
        pending = defaultdict(list)
        for item in spans:
            pending[item[0]].append(item)

        slot_to_spans = {}  # underline slot index -> spans currently occupying it
        elements = []
        at_line_start = True

        for pos, token in enumerate(tokens):
            # Keep only spans that end after this position; slots that were
            # already empty are discarded altogether.
            survivors = {}
            for slot, members in slot_to_spans.items():
                if members:
                    survivors[slot] = [m for m in members if m[1] > pos]
            slot_to_spans = defaultdict(list, survivors)

            # Continue running underlines through the gap before this token.
            if at_line_start:
                at_line_start = False
            else:
                elements.append(self._get_underline_element(SPACE, slot_to_spans))

            # Place the spans that begin here: longest first, then by rank.
            starting = pending.pop(pos, None)
            if starting:
                starting.sort(key=lambda s: (-(s[1] - s[0]), s[2]))
                for candidate in starting:
                    self._assign_slot(candidate, slot_to_spans)

            if token in ("\n", "\r", "\r\n"):
                elements.append(br())
                at_line_start = True
            else:
                # Underline the token itself for every active span.
                elements.append(self._get_underline_element(token, slot_to_spans))
        return elements

    @staticmethod
    def _assign_slot(new_span, slot_to_spans):
        """Append *new_span* to a compatible occupied slot, else to the lowest free slot."""
        start, end = new_span[0:2]
        color = new_span[3]
        for _, members in sorted(slot_to_spans.items(), key=itemgetter(0)):  # lowest slot first
            if not members:
                continue
            outer = members[0]  # the first span in a slot contains all the others
            outer_start, outer_end = outer[0:2]
            outer_color = outer[3]
            nested = start > outer_start or end < outer_end
            duplicate = start == outer_start and end == outer_end and color == outer_color
            # Share the slot when the new span is strictly inside the outer
            # span, or is an exact copy of it with the same colour.
            if end <= outer_end and (nested or duplicate):
                members.append(new_span)
                return
        # No occupied slot fits: take the first slot index that is free.
        for slot_index in count():
            members = slot_to_spans[slot_index]
            if not members:
                members.append(new_span)
                return

    def _get_underline_element(self, token, slot_to_spans):
        """Wrap *token* in one nested span per slot, innermost slot first.

        Returns *token* unchanged when *slot_to_spans* is empty. Slots without
        spans still get a wrapper, drawn with a fully transparent border.
        """
        if not slot_to_spans:
            return token
        element = token
        top_slot = max(slot_to_spans.keys())
        for slot_index in range(top_slot + 1):
            members = slot_to_spans[slot_index]
            if members:
                color = members[0][3]
                classes = ["underline"]
                if token != SPACE:
                    classes.append("token-underline")
                classes.extend(f"span-{m[4]}" for m in members)  # Encode ids in class names
                props = {
                    "class": " ".join(classes),
                    "data-primary-color": color
                }
            else:
                color = "rgba(0, 0, 0, 0)"  # Transparent element w/opacity=0
                props = {}
            padding_bottom = 0 if slot_index == 0 else self.underline_spacing
            element = htbuilder.span(
                style=styles(
                    display="inline-block",
                    border_bottom=f"{self.underline_thickness}px solid",
                    border_color=color,
                    padding_bottom=px(padding_bottom),
                ),
                **props
            )(element)
        # The last wrapper created is the outermost nested span.
        return element
if __name__ == "__main__":
    from htbuilder import div

    # Manual smoke test: print the markup for a short sentence with overlapping spans.
    text = "The quick brown fox jumps"
    tokens = text.split()
    # Replace the second word with an element to exercise mixed str/element input.
    tokens[1] = htbuilder.span(style=styles(color="red"))("quick")
    spans = [
        (0, 2, 0, "green", "green1"),
        (1, 3, 0, "orange", "orange1"),
        (3, 4, 0, "red", "red1"),
        (2, 4, 0, "blue", "blue1"),
        (1, 5, 0, "orange", "orange1"),
    ]
    mu = MultiUnderline()
    html = str(div(mu.markup(tokens, spans)))
    print(html)