gabrielaltay committed on
Commit
c94b761
1 Parent(s): 71f3ee6

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +299 -0
app.py ADDED
@@ -0,0 +1,299 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ BigBIO Dataset Explorer Demo
3
+ """
4
+
5
+ from collections import Counter
6
+ from collections import defaultdict
7
+ import string
8
+
9
+ from datasets import load_dataset
10
+ from loguru import logger
11
+ import numpy as np
12
+ import pandas as pd
13
+ import plotly.express as px
14
+ import spacy
15
+ from spacy import displacy
16
+ import streamlit as st
17
+
18
+ from bigbio.dataloader import BigBioConfigHelpers
19
+ from bigbio.hf_maps import BATCH_MAPPERS_TEXT_FROM_SCHEMA
20
+ from sklearn.feature_extraction.text import CountVectorizer
21
+
22
+
23
# Request Streamlit's "wide" page layout for the whole app.
st.set_page_config(layout="wide")


# Hex colours passed to the plotly charts (color_discrete_sequence) and used
# for the entity highlight colours in the sample explorer.
# NOTE(review): presumably the IBM colour-blind-safe palette followed by
# black and white -- confirm the origin of these values.
IBM_COLORS = [
    "#648fff",
    "#dc267f",
    "#ffb000",
    "#fe6100",
    "#785ef0",
    "#000000",
    "#ffffff",
]
35
+
36
+
37
def get_html(html: str):
    """Wrap an HTML fragment in a bordered, horizontally scrollable <div>.

    Every newline in ``html`` is replaced by a single space first, because
    newlines seem to interfere with the rendering.
    """
    opening_tag = (
        '<div style="overflow-x: auto; border: 1px solid #e6e9ef; '
        'border-radius: 0.25rem; padding: 1rem;'
        'margin-bottom: 2.5rem">'
    )
    single_line = " ".join(html.split("\n"))
    return opening_tag + single_line + "</div>"
44
+
45
+
46
@st.cache()
def load_conhelps():
    """Build the collection of BigBIO config helpers used by the app.

    The complete collection is logged, then narrowed in three steps to the
    helpers that are not large, follow a BigBIO schema and are not local.
    """
    all_helpers = BigBioConfigHelpers()
    logger.info(all_helpers)
    return (
        all_helpers
        .filtered(lambda helper: not helper.is_large)
        .filtered(lambda helper: helper.is_bigbio_schema)
        .filtered(lambda helper: not helper.is_local)
    )
54
+
55
+
56
def update_axis_font(fig):
    """Set the x- and y-axis title font size of ``fig`` to 20 and return it."""
    axis_settings = {
        axis_name: {"title_font": {"size": 20}}
        for axis_name in ("xaxis", "yaxis")
    }
    fig.update_layout(**axis_settings)
    return fig
62
+
63
+
64
def draw_histogram(hist_data, col_name, histnorm=None, nbins=25, xmax=None, loc=st):
    """Draw a histogram of ``col_name``, grouped and coloured by "split".

    A box plot is added in the margin.  The x axis is limited to
    ``(0, xmax)`` when ``xmax`` is truthy; otherwise no explicit range is
    passed.  The finished figure is handed to ``loc.plotly_chart``, where
    ``loc`` defaults to the ``st`` module.
    """
    x_range = (0, xmax) if xmax else None
    histogram = px.histogram(
        hist_data,
        x=col_name,
        color="split",
        color_discrete_sequence=IBM_COLORS,
        marginal="box",  # or violin, rug
        barmode="group",
        hover_data=hist_data.columns,
        histnorm=histnorm,
        nbins=nbins,
        range_x=x_range,
    )
    loc.plotly_chart(update_axis_font(histogram), use_container_width=True)
79
+
80
+
81
def draw_bar(bar_data, x, y, loc=st):
    """Draw a bar chart of ``y`` against ``x``, grouped and coloured by "split".

    The finished figure is handed to ``loc.plotly_chart``, where ``loc``
    defaults to the ``st`` module.
    """
    chart = update_axis_font(
        px.bar(
            bar_data,
            x=x,
            y=y,
            color="split",
            color_discrete_sequence=IBM_COLORS,
            barmode="group",
            hover_data=bar_data.columns,
        )
    )
    loc.plotly_chart(chart, use_container_width=True)
93
+
94
+
95
def parse_metrics(metadata, loc):
    """Show every integer-valued metadata attribute as a metric on ``loc``.

    For each split, one ``loc.metric`` call is made per instance attribute
    whose value is an ``int``; the label is ``"<split>-<attribute>"``.
    """
    for split_name, split_meta in metadata.items():
        int_attributes = [
            (attr_name, attr_value)
            for attr_name, attr_value in split_meta.__dict__.items()
            if isinstance(attr_value, int)
        ]
        for attr_name, attr_value in int_attributes:
            loc.metric(label=f"{split_name}-{attr_name}", value=attr_value)
100
+
101
+
102
def parse_counters(metadata):
    """Return the names of the non-empty counter attributes of the metadata.

    Counter names are read from the "train" split when it exists.  Data sets
    without a training split used to raise ``KeyError`` here; for those the
    first available split is inspected instead.

    Args:
        metadata: Mapping of split name to a metadata object whose instance
            attributes include the counters.

    Returns:
        The attribute names that contain "counter" and whose value has a
        non-zero length, in attribute order.  An empty list when
        ``metadata`` has no splits.
    """
    if not metadata:
        return []
    # Prefer the training split; otherwise use whichever split comes first.
    if "train" in metadata:
        meta = metadata["train"]
    else:
        meta = next(iter(metadata.values()))
    return [
        name
        for name, value in meta.__dict__.items()
        if "counter" in name and len(value) > 0
    ]
109
+
110
+
111
+ # generate the df for histogram
112
def parse_label_counter(metadata, counter_type):
    """Flatten one counter of every split into a long-format DataFrame.

    Each ``(label, count)`` entry of the ``counter_type`` attribute becomes
    one row with the columns "labels", ``counter_type`` and "split".
    """
    rows = [
        {"labels": label, counter_type: count, "split": split_name}
        for split_name, split_meta in metadata.items()
        for label, count in getattr(split_meta, counter_type).items()
    ]
    return pd.DataFrame(rows)
123
+
124
+
125
+
126
+
127
# load BigBioConfigHelpers
#==================================
logger.info("about to call load_conhelps")
conhelps = load_conhelps()
logger.info("exiting call load_conhelps")

# Build both lookup tables in one pass over the helpers:
#   config name  -> helper
#   display name -> config names registered under it
config_name_to_conhelp = {}
ds_display_name_to_config_names = defaultdict(list)
for ch in conhelps:
    config_name_to_conhelp[ch.config.name] = ch
    ds_display_name_to_config_names[ch.display_name].append(ch.config.name)

# The keys of the second table are exactly the distinct display names.
ds_display_names = sorted(ds_display_name_to_config_names)
137
+
138
+
139
# dataset selection
#==================================

# Sidebar widgets: first pick a dataset by display name, then one of the
# config names registered under that display name.
st.sidebar.title("Dataset Selection")
ds_display_name = st.sidebar.selectbox("dataset name", ds_display_names, index=0)

config_names = ds_display_name_to_config_names[ds_display_name]
config_name = st.sidebar.selectbox("config name", config_names)
# Helper for the selected config; the rest of the script reads from it.
conhelp = config_name_to_conhelp[config_name]


st.header(f"Dataset stats for {ds_display_name}")
151
+
152
+
153
@st.cache()
def load_data(conhelp):
    """Load the dataset and the metadata behind ``conhelp``.

    The loaded dataset is run through the batched mapper registered in
    ``BATCH_MAPPERS_TEXT_FROM_SCHEMA`` under the helper's lower-cased schema
    name.

    Returns:
        A ``(dataset, metadata)`` tuple.
    """
    split_metadata = conhelp.get_metadata()
    raw_dataset = conhelp.load_dataset()
    schema_key = conhelp.bigbio_schema_caps.lower()
    text_mapper = BATCH_MAPPERS_TEXT_FROM_SCHEMA[schema_key]
    return raw_dataset.map(text_mapper, batched=True), split_metadata
162
+
163
@st.cache()
def count_vectorize(dsd):
    """Count-vectorize the "text" column of every split in ``dsd``.

    One ``CountVectorizer`` instance is re-fitted on each split in turn.

    Returns:
        ``(xcvs, df_tok_per_samp)``: a dict mapping each split name to the
        matrix returned by ``fit_transform``, and a DataFrame over all
        splits with the columns "tokens per sample" (the row sums of that
        matrix) and "split".
    """
    vectorizer = CountVectorizer()
    matrices = {}
    frames = []
    for split_name, split_ds in dsd.items():
        matrix = vectorizer.fit_transform(split_ds['text'])
        matrices[split_name] = matrix
        tokens_per_sample = np.asarray(matrix.sum(axis=1)).flatten()
        frame = pd.DataFrame(tokens_per_sample, columns=["tokens per sample"])
        frames.append(frame.assign(split=split_name))
    return matrices, pd.concat(frames)
177
+
178
+
179
# Show an info box while each st.cache-decorated loader runs, then clear it.
dsd_load_state = st.info(f"Loading {ds_display_name} - {config_name} ...")
dsd, metadata = load_data(conhelp)
dsd_load_state.empty()

cv_load_state = st.info(f"Count Vectorizing {ds_display_name} - {config_name} ...")
xcvs, df_tok_per_samp = count_vectorize(dsd)
cv_load_state.empty()
186
+
187
+
188
# Sidebar: static facts about the selected config, one subheader per topic.
st.sidebar.subheader(f"BigBIO Schema = {conhelp.bigbio_schema_caps}")

st.sidebar.subheader("Tasks Supported by Dataset")
tasks = conhelp.tasks
# Turn identifiers such as "foo_bar" into "Foo Bar" for display.
tasks = [string.capwords(task.replace("_", " ")) for task in tasks]
# Rendered as a markdown bullet list, one task per line.
st.sidebar.markdown(
    """
{}
""".format(
        "\n".join([
            f"- {task}" for task in tasks
        ]))
)

st.sidebar.subheader("Languages")
langs = conhelp.languages
# Same bullet-list rendering as for the tasks.
st.sidebar.markdown(
    """
{}
""".format("\n".join([f"- {lang}" for lang in langs]))
)

st.sidebar.subheader("Home Page")
st.sidebar.write(conhelp.homepage)

st.sidebar.subheader("Description")
st.sidebar.write(conhelp.description)

st.sidebar.subheader("Citation")
# The citation is placed inside a fenced markdown code block.
# NOTE(review): the closing fence has four backticks while the opening one
# has three -- confirm this is intended.
st.sidebar.markdown(f"""\
```
{conhelp.citation}
````
"""
)
st.sidebar.subheader("Counts")
# One metric widget per integer attribute of each split's metadata.
parse_metrics(metadata, st.sidebar)
225
+
226
+
227
+
228
+ # dataframe display
229
+ #if "train" in dsd.keys():
230
+ # st.subheader("Sample Preview")
231
+ # df = pd.DataFrame.from_dict(dsd["train"])
232
+ # st.write(df.head(10))
233
+
234
+
235
+
236
# draw token distribution
st.subheader("Sample Length Distribution")
# The largest per-sample token count is both the slider's upper bound and
# its initial value.
max_xmax = int(df_tok_per_samp["tokens per sample"].max())
xmax = st.slider("xmax", min_value=0, max_value=max_xmax, value=max_xmax)
# Choices forwarded to draw_histogram as ``histnorm``; None is selectable too.
histnorms = ['percent', 'probability', 'density', 'probability density', None]
histnorm = st.selectbox("histnorm", histnorms)
draw_histogram(df_tok_per_samp, "tokens per sample", histnorm=histnorm, xmax=xmax, loc=st)
243
+
244
+
245
+
246
st.subheader("Counter Distributions")
counters = parse_counters(metadata)
if not counters:
    # With no options the selectbox returns None and the attribute lookup in
    # parse_label_counter would fail, so stop here with a notice instead.
    st.info("No counters available for this dataset.")
else:
    counter_type = st.selectbox("counter_type", counters)
    label_df = parse_label_counter(metadata, counter_type)
    label_max = int(label_df[counter_type].max() - 1)
    label_min = int(label_df[counter_type].min())
    if label_max > label_min:
        filter_value = st.slider("minimum cutoff", label_min, label_max)
    else:
        # All counts are (nearly) identical, so the slider bounds would be
        # equal or inverted, which st.slider cannot represent.  There is
        # nothing to filter in that case: keep every label.
        filter_value = label_min
    label_df = label_df[label_df[counter_type] >= filter_value]
    # draw bar chart for counter
    draw_bar(label_df, "labels", counter_type, st)
256
+
257
+
258
st.subheader("Sample Explorer")
split = st.selectbox("split", list(dsd.keys()))
# Valid indices run from 0 to the last row of the chosen split.
sample_index = st.number_input(
    "sample index",
    min_value=0,
    max_value=len(dsd[split])-1,
    value=0,
)

# The selected record; it is rendered further down the page.
sample = dsd[split][sample_index]
268
+
269
+
270
if conhelp.bigbio_schema_caps == "KB":
    # For KB-schema configs, highlight the sample's entity annotations with
    # displaCy's span style on top of a blank English pipeline.
    nlp = spacy.blank("en")
    text = sample["text"]
    doc = nlp(text)
    spans = []
    for bb_ent in sample["entities"]:
        offsets = bb_ent["offsets"]
        if not offsets:
            # No character offsets recorded: nothing to highlight.
            continue
        # Only the first offset pair of an entity is rendered.
        # Annotation boundaries need not coincide with the tokenizer's token
        # boundaries.  In the default strict mode char_span returns None for
        # such offsets, and a None entry breaks the span group assignment
        # below.  "expand" snaps to the enclosing tokens instead.
        span = doc.char_span(
            offsets[0][0],
            offsets[0][1],
            label=bb_ent["type"],
            alignment_mode="expand",
        )
        if span is not None:
            spans.append(span)
    doc.spans["sc"] = spans
    # Pair each entity type seen in this split's metadata with a palette
    # colour; the palette is repeated so that more types can be covered.
    entity_colors = dict(
        zip(metadata[split].entities_type_counter.keys(), IBM_COLORS*10)
    )
    html = displacy.render(
        doc,
        style="span",
        options={"colors": entity_colors},
    )
    style = "<style>mark.entity { display: inline-block }</style>"
    st.write(f"{style}{get_html(html)}", unsafe_allow_html=True)


# Always show the raw record as well, whatever the schema.
st.write(sample)