Rainsilves committed on
Commit
ea5c8f0
1 Parent(s): 034e89a

initial release

Browse files
Files changed (2) hide show
  1. app.py +153 -0
  2. requirments.txt +7 -0
app.py ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from re import split
2
+ import streamlit as st
3
+ import more_itertools
4
+ import spacy
5
+ from sentence_transformers import SentenceTransformer, util
6
+ from sty import fg, bg, ef, rs
7
+
8
# Configure the browser-tab title; st.set_page_config must be the first
# Streamlit command the script executes.
st.set_page_config(
    page_title="Semantic_Search/Summarization")
10
+
11
+
12
# Default demo document pre-filled into the "Enter a document" box below.
# NOTE(review): "choclate" is a typo, but the name is referenced later in
# the script, so it is kept as-is for compatibility.
choclate_example = """You’d be hard-pressed to find anyone who doesn’t like chocolate. While it's mostly known for its taste (and the associated cravings), it’s also a good source of nutrients when in its pure form and eaten in moderation.

This well-loved food, once called the "drink of the gods" by the Maya people, has a rich history as well. Chocolate comes from the seeds of the cacao pod, which grows on the cacao tree. Theobroma cacao is native to the tropical rainforests of Central America, where it has grown for thousands of years.

"""


# Small English spaCy pipeline; used in this script only for sentence
# segmentation (doc.sents) in the "Sentence" granularity branch.
nlp = spacy.load('en_core_web_sm')
21
+
22
+
23
+
24
# ASCII-art portrait plus a Wittgenstein quote, displayed verbatim inside
# the "Dedicated to the late..." expander below.  Purely decorative text;
# the exact character layout carries no program logic.
wit_text = """

. .
. .
.
. .. . .*/, . . .
.*&#&@@@&@@@@&%&&&@&(. .
*#&&@&&@@@@@@@@@%@@@@@@&,... . . . . .
#@@@@@@@@@@@@@@@@@@@@@@@@##... .. .. .
,&@&&@@&&&@@@@@%*........,&@@@&(,,...... .
.&&&@@&/,...... . ......(@@@@@&#...... . . .
.&@@&,... ... .. . ......#@@@@@&....... . .
.&@@&,... .. .............,,*@@@@@(....... . .
.@@@@/*..****,,*,*(&@@@@%(,...&@@@/....... . ....
,@@&,,/&(&@@@@(..(%(,&%,((,...&@&......... . . .. .
(@(..(%**,,(/....,,.............%/........ .. . .
../...........................(#.......... . . . .
.(.(...,....,/@@&&&#......,,,............ .. .....
..,.,,,,.....,,,*,........,..,.......... .. .. . .
../,**,,,.../@@@@@%,.,.,,,,............ . . . .
. .,**,,*#,**#%%/,,,,,,,,,,. ...... . .. .
. ..**********,......,,,**,. . ...... . .
. ....,/(/***,,,**,,,,*/((**,...... .. .
.. ....,/#&@@@@@@@@@@&%%#/**,........ .
....,/*,.,,**/#%&@@@@%#(%#/*,%..(*#(*..
.,(/*,(/#/(..,,,*///(////(((***,..*###%/%/(/,,. .
....,,*(((**/**/#(...,.,,*/(##/,,***/.@..%&#%%///(##/,*,,,. .
.....,,,*%(,&%(/////#*....,.,,***,,,*/(.@..&&%&#%/***#%(,(((*,*,,. ..
..*...,,,*,#%,*%(/*(/*#(##..&,..,,.,**,,*/,.#.&&&&&###((//%%/*****,,,,*.,,
...,,(*,,.,.*&&&%((**#((/(/&,.&..../(////,,..@@@@&&&&%(//,*(##(**/*,*,,/****,
,,/**/#(//*(%%/%@@@%(*/(/%##/,@.....,,*,...(@@@&%%(##(*/@@&((/(#/*/((%/*//***
,,/&/(#(###*/#&**#/%#///*//#//&@(... ....&@&&&###(%#####/%((*(#%%(#&&#%,((%&/
.,(,,*&%*/%&#(&((//#/,*/*/(/*//#&&/,.&.. ./&(((%%%#((%/(#(##,%*#/((%@%(##%&&*/(
.*,,**@#**%%&&#%%#(#/*/***/*/**(//&... , .%####%%&%(#/(%/(/(%&%#(#%@%%(&@(,*%%(
***(@%,,&@&##&%(@(/(//,,///(//*/**%.. ....#((###%&##(/%(&,(&@&#%###&&%@/@&(&%#&#
**/#&(((&&#@@@#%%&*/(**///(/,/,**/*.. ,*##/(((&%(%%#((%%&@@@#####&&@@&(%&@&#/(%%
(%#(*,%@@@(@@@&%%@&//(//(////**,/(#(,*(&%(%#%%&(%%#&(%(#@@@@&&%%%&@@@&/(&&&/*%&%
,*#&@@@@@&/(%@@%&&&%#(#//#/#/****##%*/*/*%&&%&&%%#%%(%@@@&@#%%&&@@@&&@@*#%&#(***
#(/***//%(((((&@&&&@&(/#(**(*((//**#/,/%&%&%&&%&##%%&&&@@@&#%&&&@@%(#*(*((%(%%&%
*,#%&&&@%&&&%(&@@&&&%&/%//(***/(///%(@&#&&&%&&&%&%@&&&##@@@&&&@@@&&((((###%(#/(*
/#**..,,*,/*#&@&@@&%&@%#(((#**(/(//%@@#&@#&&%#%%@@&&@@@@@@&@@@@@&#(###%##%%(%&#/
#%&%(,*,##&&&&&@@@&&@&&#*/*/(/*((//#@&(@&&%&&%&@&&&&@&@@@@@@&&@&&%#%(%(%#&%%#/(%
***,/((&%(/(**//&@@&@@@@(*/(,((#/((&@%%&#%&&&@&&#&&@@@@@@&@&@@@&((#&%#%%////(###
*,/*/#&&@@&@&*,%@@@@@@@&&(**#*(*#%#&@%%&%#&@@@@&@&&&@@@@@@@@@@@%%%%#//(*/#/#(%%%
#%#%&#&%%*,/##&@@@@@@&&&&&#(*#//#%&&&%&(/&@&@@@@@@&&&&@@@@@@@@@&%(%%(%(##%*//(%&
&&%(**/%&&&%&&@&@&@@@%&&@@@#####%&&@##//&%%%&@&&@&@@&@&&@@@@@@&&&%#/##(%##%&&&&#
**(#&%&%%&&%&&@@@@@@@@@@@@@@&%&%@@&&#%(@&&&&%&&&&@@@&&&@@@@@@@@/*%%%&&&&&%%%%##/
%&&%&%/***%&@@@&&@@@@@@&&@&&&@&&@@@%(%&&&&%%&&%&&&%@@@@@@@@@@@@&&&@&&%/,***,***/

The limits of my language means the limits of my world

- Ludwig Wittgenstein
"""
77
+
78
+
79
st.title("Unsupervised Extractive Text Summarization and Semantic Search - by Der_Einzige!")
st.caption("Under active development! Please contact me or drop an issue on https://github.com/Hellisotherpeople/CX_DB8")

# Collapsible section showing the ASCII-art dedication defined above.
with st.expander("Dedicated to the late..."):
    st.text(wit_text)


# Let the user pick any sentence-transformers checkpoint by name; the model
# is downloaded on first use, so large models can be slow or fail.
model_name = st.text_area("Enter the name of the pre-trained model from sentence transformers that we are using for summarization", value = "paraphrase-MiniLM-L3-v2")
st.caption("This will download a new model, so it may take awhile or even break if the model is too large")
st.caption("See the list of pre-trained models that are available here! https://www.sbert.net/docs/pretrained_models.html")
# Shared embedder used for both the query and the document units below.
embedder = SentenceTransformer(model_name)
90
+
91
+
92
# --- User inputs -------------------------------------------------------------
# Unit of text that gets embedded and scored against the query.
granularity = st.radio("What level of granularity do you want to summarize at?", ('Sentence', 'Word', 'Paragraph'))

# Query and document text boxes.  (Fixed: capitalization of the query prompt
# to match the document prompt, and the "choclate" typo in the default query.
# The choclate_example variable name is unchanged for compatibility.)
y = st.text_area('Enter a query', value = "I love chocolate")
x = st.text_area('Enter a document', value = choclate_example)
# Fraction of the document's units to highlight in the output summary.
percentage = st.number_input("Enter the percentage of the text you want highlighted", max_value = 0.99, min_value = 0.01, value = 0.3)

# Embed the query once; reused by util.semantic_search below.
query_embedding = embedder.encode(y, convert_to_tensor=True)
99
+
100
# Split the document at the chosen granularity and embed each unit.
# Exactly one branch runs; each defines corpus_embeddings and len_doc plus
# its own unit list (doc_sents / split_words+window_words / split_lines).
if granularity == "Sentence":
    doc = nlp(x)
    doc_sents = [str(sent) for sent in doc.sents]
    corpus_embeddings = embedder.encode(doc_sents, convert_to_tensor=True)
    len_doc = len(doc_sents)
elif granularity == "Word":
    split_words = x.split()
    ww_size = st.number_input("What size do you want the word window to be?", value = 3, min_value = 1, max_value=int(len(split_words)/4))
    window_words = list(more_itertools.windowed(split_words, ww_size))
    len_doc = len(window_words)
    # BUG FIX: more_itertools.windowed yields tuples of words, but the
    # embedder expects strings — join each window (skipping any None
    # fill values) before encoding so the windows are actually embedded.
    window_texts = [" ".join(word for word in window if word is not None) for window in window_words]
    corpus_embeddings = embedder.encode(window_texts, convert_to_tensor=True)
elif granularity == "Paragraph":
    split_lines = x.splitlines()
    corpus_embeddings = embedder.encode(split_lines, convert_to_tensor=True)
    len_doc = len(split_lines)

# Retrieve the best-matching units.  top_k is clamped to at least 1 so a
# small percentage on a short document still returns a result (the original
# int(...) could truncate to 0).
semantic_search_results = util.semantic_search(query_embedding, corpus_embeddings, top_k= max(1, int(percentage * len_doc)))
117
+
118
# Indices (into the corpus unit list) of the hits for the single query.
list_of_selected_extracts = [hit["corpus_id"] for hit in semantic_search_results[0]]
121
+
122
# Pick the display unit list matching the chosen granularity.  Only the
# variable from the branch that actually ran above is defined here.
# NOTE(review): for "Word" granularity this selects the individual words
# (split_words) while the search was run over word *windows*, so the
# highlight indices refer to different units than the search results —
# likely the mismatch hinted at in the comment above; confirm intent.
if granularity == "Sentence":
    original_granularity = doc_sents
elif granularity == "Word":
    original_granularity = split_words
elif granularity == "Paragraph":
    original_granularity = split_lines
128
+
129
# Build the displayed summary string.  Selected units are "underlined" by
# interleaving the combining low line character (U+0332) between the
# characters of " " + chunk; unselected units are appended plainly with a
# single leading space as separator.
string_to_print = ""
for count, chunk in enumerate(original_granularity):
    if count in list_of_selected_extracts:
        string_to_add = "\u0332".join(" " + chunk)
        string_to_print += string_to_add
    else:
        string_to_print += " "
        string_to_print += chunk
137
+
138
# --- Output ------------------------------------------------------------------
st.subheader("Output summary")
st.write(string_to_print)

st.subheader("Raw results")
# (Fixed user-facing typo: "similarty" -> "similarity".)
st.caption("corpus_id is the number of the word, sentence, or paragraph. Score is the raw cosine similarity score between the document and the query")
st.write(semantic_search_results[0])

st.subheader("Results of segmentation/tokenization")
st.caption("This shows the representation that the webapp gets of the input document. Useful for debugging if you get strange output")

# Show the unit list produced for the active granularity; only the variable
# from the branch that ran earlier is defined here.
if granularity == "Sentence":
    st.write(doc_sents)
elif granularity == "Word":
    st.write(window_words)
elif granularity == "Paragraph":
    st.write(split_lines)
requirments.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ spacy==3.2.0
2
+ streamlit==1.2.0
3
+ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl
4
+ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0.tar.gz
5
+ more_itertools
6
+ simple_transformers
7
+ torch