Linseypass committed on
Commit e77c4bf
1 Parent(s): 84ea4c2

Update app.py

Files changed (1)
  1. app.py +86 -116
app.py CHANGED
@@ -1,4 +1,5 @@
  import gradio as gr
  from nltk.tokenize import sent_tokenize
  import torch
  import ujson as json
@@ -26,143 +27,112 @@ stop_token_ids = [0]
  print('Guanaco model loaded into memory.')


- def keyphraseElaboration(title, abstract, userGivenKeyphrases, maxTokensElaboration, numAbstractSentencesKeyphrase):
-     numKeywordsToExtract = 2
-     if userGivenKeyphrases == "":
-         '''
-         Process Abstract (eliminate word abstract at front and put into sentences)
-         '''
-         # eliminate word lowercase "abstract" or "abstract." at beginning of abstract text
-         if abstract.lower()[0:9] == "abstract.":
-             abstract = abstract[9:]
-         elif abstract.lower()[0:8] == "abstract":
-             abstract = abstract[8:]
-         abstractSentences = sent_tokenize(abstract)
-         tooShort = True # if the document only has one or fewer abstract sentences, then the document is too short for the keyphrase extraction/elaboration to give a meaningful output.
-         numAbstractSentences = len(abstractSentences)
-         if numAbstractSentences > 1:
-             tooShort = False
-         numAbstractSentencesKeyphrase = min(numAbstractSentences, numAbstractSentencesKeyphrase)
-         doc = f"{title}. {' '.join(abstractSentences[:numAbstractSentencesKeyphrase])}"
-         kw_model = KeyBERT(model="all-MiniLM-L6-v2")
-         vectorizer = KeyphraseCountVectorizer()
-         keywordsOut = kw_model.extract_keywords(doc, stop_words="english", top_n = numKeywordsToExtract, vectorizer=vectorizer, use_mmr=True)
-         keyBERTKeywords = [x[0] for x in keywordsOut]
-         for entry in keyBERTKeywords:
-             print(entry)
-
-     keywordString = ""
-     if userGivenKeyphrases != "":
-         keywordString = userGivenKeyphrases
-     elif not tooShort:
-         separator = ', '
-         keywordString = separator.join(keyBERTKeywords)
-     prompt = "What is the purpose of studying " + keywordString + "? Comment on areas of application."
-     if keywordString != "":
-         formatted_prompt = (
-             f"A chat between a curious human and an artificial intelligence assistant."
-             f"The assistant gives helpful, detailed, and polite answers to the user's questions.\n"
-             f"### Human: {prompt} \n"
-             f"### Assistant:"
-         )
-         inputs = tok(formatted_prompt, return_tensors="pt").to(deviceElaboration)
-         outputs = model.generate(inputs=inputs.input_ids, max_new_tokens=maxTokensElaboration)
-         output = tok.decode(outputs[0], skip_special_tokens=True)
-         index_response = output.find("### Assistant: ") + 15
-         end_response = output.rfind('.') + 1
-         response = output[index_response:end_response]
-     return keywordString, response
-
- def plainLanguageSummary(title, abstract, maxTokensSummary, numAbstractSentencesSummary):
-     '''
-     Process Abstract (eliminate word abstract at front and put into sentences)
      '''
      # eliminate word lowercase "abstract" or "abstract." at beginning of abstract text
-     if abstract.lower()[0:9] == "abstract.":
-         abstract = abstract[9:]
-     elif abstract.lower()[0:8] == "abstract":
-         abstract = abstract[8:]
-     abstractSentences = sent_tokenize(abstract)
      '''
      This is for summarization
      '''
      prompt = """
      Can you explain the main idea of what is being studied in the following paragraph for someone who is not familiar with the topic. Comment on areas of application.:
      """
-     text = ""
-     if text == "":
-         numAbstractSentences = len(abstractSentences)
-         numAbstractSentencesSummary = min(numAbstractSentences, numAbstractSentencesSummary)
-         text = f"{title}. {' '.join(abstractSentences[:numAbstractSentencesSummary])}"
-
      formatted_prompt = (
          f"A chat between a curious human and an artificial intelligence assistant."
          f"The assistant gives helpful, detailed, and polite answers to the user's questions.\n"
-         f"### Human: {prompt + text} \n"
          f"### Assistant:"
      )
-     inputs = tok(formatted_prompt, return_tensors="pt").to(deviceSummary)
-     outputs = model.generate(inputs=inputs.input_ids, max_new_tokens=maxTokensSummary)
      output = tok.decode(outputs[0], skip_special_tokens=True)
      index_response = output.find("### Assistant: ") + 15
-     if (output[index_response:index_response + 10] == "Certainly!" or output[index_response:index_response + 10] == "Certainly,"):
          index_response += 10
      end_response = output.rfind('.') + 1
      response = output[index_response:end_response]
-     return response


- with gr.Blocks() as demo:
-     with gr.Row():
-         with gr.Column():
-             title = gr.Textbox(label="Title")
-             abstract = gr.Textbox(label="Abstract")
-             userDefinedKeyphrases = gr.Textbox(label="Your keyphrases (Optional - Model will elaborate on these keyphrases without using the title or abstract)")
-             keyphraseButton = gr.Button("Generate Keyphrase Elaboration")
-             summaryButton = gr.Button("Generate Plain Language Summary")
-             with gr.Accordion(label="Parameters", open=False):
-                 maxTokensElaboration = gr.Slider(
-                     label="Maximum Number of Elaboration Tokens",
-                     value=500,
-                     minimum=0,
-                     maximum=2048,
-                     step=10,
-                     interactive=True,
-                     info="Length of Keyphrase Elaboration",
-                 )
-                 maxTokensSummary = gr.Slider(
-                     label="Maximum Number of Summary Tokens",
-                     value=300,
-                     minimum=0,
-                     maximum=2048,
-                     step=10,
-                     interactive=True,
-                     info="Length of Plain Language Summary",
-                 )
-                 numAbstractSentencesKeyphrase = gr.Slider(
-                     label="Number of Abstract Sentences to use for Keyphrase Extraction",
-                     value=2,
-                     minimum=0,
-                     maximum=20,
-                     step=1,
-                     interactive=True,
-                     info="Default: use first two sentences of abstract."
-                 )
-                 numAbstractSentencesSummary = gr.Slider(
-                     label="Number of Abstract Sentences to use for Plain Language Summary",
-                     value=2,
-                     minimum=0,
-                     maximum=20,
-                     step=1,
-                     interactive=True,
-                     info="Default: use first two sentences of abstract."
-                 )
-         with gr.Column():
-             outputKeyphrase = [gr.Textbox(label="Keyphrases"), gr.Textbox(label="Keyphrase Elaboration")]
-             outputSummary = gr.Textbox(label="Plain Language Summary")

-     keyphraseButton.click(fn=keyphraseElaboration, inputs=[title, abstract, userDefinedKeyphrases, maxTokensElaboration, numAbstractSentencesKeyphrase], outputs=outputKeyphrase)
-     summaryButton.click(fn=plainLanguageSummary, inputs=[title, abstract, maxTokensSummary, numAbstractSentencesSummary], outputs = outputSummary)

  demo.launch()

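Both the removed functions above and the new generate() below carve the assistant's reply out of the decoded model output the same way: find the literal "### Assistant: " marker (15 characters long) and cut at the last period. A small self-contained illustration of that slicing, using an invented decoded string (only the marker arithmetic comes from the diff):

# Standalone illustration of the response slicing shared by both versions of app.py.
# The decoded string below is invented for the example.
output = (
    "A chat between a curious human and an artificial intelligence assistant."
    "The assistant gives helpful, detailed, and polite answers to the user's questions.\n"
    "### Human: What is the purpose of studying knot theory? \n"
    "### Assistant: Knot theory studies closed curves in space. It is applied in DNA topology and cryptography. Hope that helps"
)

index_response = output.find("### Assistant: ") + 15   # 15 == len("### Assistant: ")
end_response = output.rfind('.') + 1                   # keep text up to and including the last period
response = output[index_response:end_response]
print(response)
# Knot theory studies closed curves in space. It is applied in DNA topology and cryptography.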
  import gradio as gr
+
  from nltk.tokenize import sent_tokenize
  import torch
  import ujson as json
  print('Guanaco model loaded into memory.')


+ def generate(title, abstract):
+     print("Started running.")
      '''
+     Take gradio input and output data to sample-data.jsonl in readable form for classifier.py to run.
+     '''
+     newline = {}
+     text = abstract
      # eliminate word lowercase "abstract" or "abstract." at beginning of abstract text
+     if text.lower()[0:9] == "abstract.":
+         text = text[9:]
+     elif text.lower()[0:8] == "abstract":
+         text = text[8:]
+     sentences = sent_tokenize(text)
+     newline["target"] = sentences
+     newline["title"] = title
+     print("Tokenized abstract to sentences.")
+     '''
+     Main part
+     '''
      '''
      This is for summarization
      '''
+     tooShortForKeyword = False
+     obj = newline
+     doc = ""
+     if len(obj["target"]) > 1:
+         doc += obj["title"] + ". " + obj["target"][0] + " " + obj["target"][1]
+     elif len(obj["target"]) == 1:
+         tooShortForKeyword = True
+         doc += obj["title"] + ". " + obj["target"][0]
+     else:
+         tooShortForKeyword = True
+         doc += obj["title"]
+     text = doc
      prompt = """
      Can you explain the main idea of what is being studied in the following paragraph for someone who is not familiar with the topic. Comment on areas of application.:
      """
      formatted_prompt = (
          f"A chat between a curious human and an artificial intelligence assistant."
          f"The assistant gives helpful, detailed, and polite answers to the user's questions.\n"
+         f"### Human: {prompt + doc} \n"
          f"### Assistant:"
      )
+     inputs = tok(formatted_prompt, return_tensors="pt")#.to("cuda:1")
+     outputs = m.generate(inputs=inputs.input_ids, max_new_tokens=300)
      output = tok.decode(outputs[0], skip_special_tokens=True)
      index_response = output.find("### Assistant: ") + 15
+     if (output[index_response:index_response + 10] == "Certainly!"):
          index_response += 10
      end_response = output.rfind('.') + 1
      response = output[index_response:end_response]
+     print('Plain Language Summary Created.')

+     '''
+     Keyphrase extraction.
+     '''
+     # the document is the title and first two sentences of the abstract.

+     obj = newline
+     doc = ""
+     if len(obj["target"]) > 1:
+         doc += obj["title"] + ". " + obj["target"][0] + " " + obj["target"][1]
+     kw_model = KeyBERT(model="all-MiniLM-L6-v2")
+     vectorizer = KeyphraseCountVectorizer()
+     top_n = 2
+     keywords = kw_model.extract_keywords(doc, stop_words="english", top_n = top_n, vectorizer=vectorizer, use_mmr=True)
+     my_keywords = []
+     for i in range(top_n):
+         add = True
+         for j in range(top_n):
+             if i != j:
+                 if keywords[i][0] in keywords[j][0]:
+                     add = False
+         if add:
+             my_keywords.append(keywords[i][0])
+     for entry in my_keywords:
+         print(entry)
+     '''
+     This is for feeding the keyphrases into Guanaco.
+     '''
+     responseTwo = ""
+     keyword_string = ""
+     if not tooShortForKeyword:
+         separator = ', '
+         keyword_string = separator.join(my_keywords)
+         prompt = "What is the purpose of studying " + keyword_string + "? Comment on areas of application."

+         formatted_prompt = (
+             f"A chat between a curious human and an artificial intelligence assistant."
+             f"The assistant gives helpful, detailed, and polite answers to the user's questions.\n"
+             f"### Human: {prompt} \n"
+             f"### Assistant:"
+         )
+         inputs = tok(formatted_prompt, return_tensors="pt")#.to("cuda:2")
+         outputs = m.generate(inputs=inputs.input_ids, max_new_tokens=300)
+         output = tok.decode(outputs[0], skip_special_tokens=True)
+         index_response = output.find("### Assistant: ") + 15
+         end_response = output.rfind('.') + 1
+         responseTwo = output[index_response:end_response]
+         print('Keyphrase elaboration ran.')
+     return keyword_string, responseTwo, response

+ demo = gr.Interface(
+     fn=generate,
+     inputs=[gr.Textbox(label="Title"), gr.Textbox(label="Abstract")],
+     outputs=[gr.Textbox(label="Keyphrases"), gr.Textbox(label="Keyphrase Elaboration"), gr.Textbox(label="Plain Language Summary")],
+ )
  demo.launch()
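For reference, the keyphrase-extraction step that generate() adds can be exercised on its own. The sketch below mirrors the KeyBERT and KeyphraseCountVectorizer calls and the substring filter from the diff; the sample title and abstract are invented.

# Minimal, standalone sketch of the keyphrase-extraction step in generate().
# The document below is invented; the calls mirror those in app.py.
from keybert import KeyBERT
from keyphrase_vectorizers import KeyphraseCountVectorizer

# "Title. First sentence. Second sentence." -- same shape as the doc built in app.py.
doc = (
    "Graph neural networks for molecular property prediction. "
    "We study message-passing architectures on molecular graphs. "
    "The learned representations transfer to downstream screening tasks."
)

kw_model = KeyBERT(model="all-MiniLM-L6-v2")
vectorizer = KeyphraseCountVectorizer()
top_n = 2
keywords = kw_model.extract_keywords(
    doc, stop_words="english", top_n=top_n, vectorizer=vectorizer, use_mmr=True
)

# extract_keywords returns (phrase, score) pairs; drop any phrase that is
# contained in another returned phrase, as the nested loop in app.py does.
phrases = [phrase for phrase, _ in keywords]
my_keywords = [
    phrases[i]
    for i in range(len(phrases))
    if not any(i != j and phrases[i] in phrases[j] for j in range(len(phrases)))
]
print(my_keywords)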