greco committed on
Commit
0c0c0b8
1 Parent(s): 9485a72

update library

Browse files
Files changed (1) hide show
  1. survey_analytics_library.py +0 -150
survey_analytics_library.py CHANGED
@@ -1,20 +1,7 @@
1
 
2
  # imports
3
  import pandas as pd
4
- import numpy as np
5
- import streamlit as st
6
- from tqdm.notebook import tqdm
7
- import matplotlib.pyplot as plt
8
- import plotly.express as px
9
-
10
- from sklearn.cluster import KMeans
11
- from sklearn.metrics import silhouette_score
12
-
13
- import zipfile
14
- from xml.etree.cElementTree import XML
15
-
16
  import re
17
- from nltk.corpus import stopwords
18
 
19
 
20
 
@@ -82,143 +69,6 @@ def clean_text(text_string, list_of_replacements, lowercase=True, ignorecase=Fal
82
 
83
 
84
 
85
- # remove stopwords from tokens
86
- def remove_stopwords(tokens, language='english'):
87
- '''
88
- remove stopwords from tokens using list comprehension
89
- default to using english stopwords
90
- arguments:
91
- tokens (list): list of tokens, output of word_tokenize()
92
- language (str): default to english
93
- returns:
94
- a list of tokens without stopwords
95
- '''
96
- # define stopwords and store as a set
97
- stopwords_set = set(stopwords.words(language))
98
- # check if word is in list of stopwords
99
- # returns a list of words not found in list of stopwords
100
- stopwords_removed = [word for word in tokens if word not in stopwords_set]
101
- # return
102
- return stopwords_removed
103
-
104
-
105
-
106
- import itertools
107
- from typing import List
108
- import plotly.graph_objects as go
109
- from plotly.subplots import make_subplots
110
- def visualize_barchart_titles(topic_model,
111
- topics: List[int] = None,
112
- subplot_titles: List[str] = None,
113
- top_n_topics: int = 8,
114
- n_words: int = 5,
115
- width: int = 250,
116
- height: int = 250) -> go.Figure:
117
- """ Visualize a barchart of selected topics
118
-
119
- Arguments:
120
- topic_model: A fitted BERTopic instance.
121
- topics: A selection of topics to visualize.
122
- top_n_topics: Only select the top n most frequent topics.
123
- n_words: Number of words to show in a topic
124
- width: The width of each figure.
125
- height: The height of each figure.
126
-
127
- Returns:
128
- fig: A plotly figure
129
-
130
- Usage:
131
-
132
- To visualize the barchart of selected topics
133
- simply run:
134
-
135
- ```python
136
- topic_model.visualize_barchart()
137
- ```
138
-
139
- Or if you want to save the resulting figure:
140
-
141
- ```python
142
- fig = topic_model.visualize_barchart()
143
- fig.write_html("path/to/file.html")
144
- ```
145
- <iframe src="../../getting_started/visualization/bar_chart.html"
146
- style="width:1100px; height: 660px; border: 0px;"></iframe>
147
- """
148
- colors = itertools.cycle(["#D55E00", "#0072B2", "#CC79A7", "#E69F00", "#56B4E9", "#009E73", "#F0E442"])
149
-
150
- # Select topics based on top_n and topics args
151
- freq_df = topic_model.get_topic_freq()
152
- freq_df = freq_df.loc[freq_df.Topic != -1, :]
153
- if topics is not None:
154
- topics = list(topics)
155
- elif top_n_topics is not None:
156
- topics = sorted(freq_df.Topic.to_list()[:top_n_topics])
157
- else:
158
- topics = sorted(freq_df.Topic.to_list()[0:6])
159
-
160
- # Initialize figure
161
- if subplot_titles is None:
162
- subplot_titles = [f"Topic {topic}" for topic in topics]
163
- else:
164
- subplot_titles = subplot_titles
165
- columns = 4
166
- rows = int(np.ceil(len(topics) / columns))
167
- fig = make_subplots(rows=rows,
168
- cols=columns,
169
- shared_xaxes=False,
170
- horizontal_spacing=.1,
171
- vertical_spacing=.4 / rows if rows > 1 else 0,
172
- subplot_titles=subplot_titles)
173
-
174
- # Add barchart for each topic
175
- row = 1
176
- column = 1
177
- for topic in topics:
178
- words = [word + " " for word, _ in topic_model.get_topic(topic)][:n_words][::-1]
179
- scores = [score for _, score in topic_model.get_topic(topic)][:n_words][::-1]
180
-
181
- fig.add_trace(
182
- go.Bar(x=scores,
183
- y=words,
184
- orientation='h',
185
- marker_color=next(colors)),
186
- row=row, col=column)
187
-
188
- if column == columns:
189
- column = 1
190
- row += 1
191
- else:
192
- column += 1
193
-
194
- # Stylize graph
195
- fig.update_layout(
196
- template="plotly_white",
197
- showlegend=False,
198
- title={
199
- 'text': "<b>Topic Word Scores",
200
- 'x': .5,
201
- 'xanchor': 'center',
202
- 'yanchor': 'top',
203
- 'font': dict(
204
- size=22,
205
- color="Black")
206
- },
207
- width=width*4,
208
- height=height*rows if rows > 1 else height * 1.3,
209
- hoverlabel=dict(
210
- bgcolor="white",
211
- font_size=16,
212
- font_family="Rockwell"
213
- ),
214
- )
215
-
216
- fig.update_xaxes(showgrid=True)
217
- fig.update_yaxes(showgrid=True)
218
-
219
- return fig
220
-
221
-
222
 
223
  # convert transformer model zero shot classification prediction into dataframe
224
  def convert_zero_shot_classification_output_to_dataframe(model_output):
 
1
 
2
  # imports
3
  import pandas as pd
 
 
 
 
 
 
 
 
 
 
 
 
4
  import re
 
5
 
6
 
7
 
 
69
 
70
 
71
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
 
73
  # convert transformer model zero shot classification prediction into dataframe
74
  def convert_zero_shot_classification_output_to_dataframe(model_output):