Linseypass committed on
Commit e77c4bf
1 Parent(s): 84ea4c2

Update app.py

Files changed (1)
  1. app.py +86 -116
app.py CHANGED
@@ -1,4 +1,5 @@
  import gradio as gr
  from nltk.tokenize import sent_tokenize
  import torch
  import ujson as json
@@ -26,143 +27,112 @@ stop_token_ids = [0]
  print('Guanaco model loaded into memory.')


- def keyphraseElaboration(title, abstract, userGivenKeyphrases, maxTokensElaboration, numAbstractSentencesKeyphrase):
-     numKeywordsToExtract = 2
-     if userGivenKeyphrases == "":
-         '''
-         Process Abstract (eliminate word abstract at front and put into sentences)
-         '''
-         # eliminate word lowercase "abstract" or "abstract." at beginning of abstract text
-         if abstract.lower()[0:9] == "abstract.":
-             abstract = abstract[9:]
-         elif abstract.lower()[0:8] == "abstract":
-             abstract = abstract[8:]
-         abstractSentences = sent_tokenize(abstract)
-         tooShort = True # if the document only has one or fewer abstract sentences, then the document is too short for the keyphrase extraction/elaboration to give a meaningful output.
-         numAbstractSentences = len(abstractSentences)
-         if numAbstractSentences > 1:
-             tooShort = False
-         numAbstractSentencesKeyphrase = min(numAbstractSentences, numAbstractSentencesKeyphrase)
-         doc = f"{title}. {' '.join(abstractSentences[:numAbstractSentencesKeyphrase])}"
-         kw_model = KeyBERT(model="all-MiniLM-L6-v2")
-         vectorizer = KeyphraseCountVectorizer()
-         keywordsOut = kw_model.extract_keywords(doc, stop_words="english", top_n = numKeywordsToExtract, vectorizer=vectorizer, use_mmr=True)
-         keyBERTKeywords = [x[0] for x in keywordsOut]
-         for entry in keyBERTKeywords:
-             print(entry)
-
-     keywordString = ""
-     if userGivenKeyphrases != "":
-         keywordString = userGivenKeyphrases
-     elif not tooShort:
-         separator = ', '
-         keywordString = separator.join(keyBERTKeywords)
-     prompt = "What is the purpose of studying " + keywordString + "? Comment on areas of application."
-     if keywordString != "":
-         formatted_prompt = (
-             f"A chat between a curious human and an artificial intelligence assistant."
-             f"The assistant gives helpful, detailed, and polite answers to the user's questions.\n"
-             f"### Human: {prompt} \n"
-             f"### Assistant:"
-         )
-         inputs = tok(formatted_prompt, return_tensors="pt").to(deviceElaboration)
-         outputs = model.generate(inputs=inputs.input_ids, max_new_tokens=maxTokensElaboration)
-         output = tok.decode(outputs[0], skip_special_tokens=True)
-         index_response = output.find("### Assistant: ") + 15
-         end_response = output.rfind('.') + 1
-         response = output[index_response:end_response]
-     return keywordString, response
-
- def plainLanguageSummary(title, abstract, maxTokensSummary, numAbstractSentencesSummary):
-     '''
-     Process Abstract (eliminate word abstract at front and put into sentences)
      '''
      # eliminate word lowercase "abstract" or "abstract." at beginning of abstract text
-     if abstract.lower()[0:9] == "abstract.":
-         abstract = abstract[9:]
-     elif abstract.lower()[0:8] == "abstract":
-         abstract = abstract[8:]
-     abstractSentences = sent_tokenize(abstract)
      '''
      This is for summarization
      '''
      prompt = """
      Can you explain the main idea of what is being studied in the following paragraph for someone who is not familiar with the topic. Comment on areas of application.:
      """
-     text = ""
-     if text == "":
-         numAbstractSentences = len(abstractSentences)
-         numAbstractSentencesSummary = min(numAbstractSentences, numAbstractSentencesSummary)
-         text = f"{title}. {' '.join(abstractSentences[:numAbstractSentencesSummary])}"
-
      formatted_prompt = (
          f"A chat between a curious human and an artificial intelligence assistant."
          f"The assistant gives helpful, detailed, and polite answers to the user's questions.\n"
-         f"### Human: {prompt + text} \n"
          f"### Assistant:"
      )
-     inputs = tok(formatted_prompt, return_tensors="pt").to(deviceSummary)
-     outputs = model.generate(inputs=inputs.input_ids, max_new_tokens=maxTokensSummary)
      output = tok.decode(outputs[0], skip_special_tokens=True)
      index_response = output.find("### Assistant: ") + 15
-     if (output[index_response:index_response + 10] == "Certainly!" or output[index_response:index_response + 10] == "Certainly,"):
          index_response += 10
      end_response = output.rfind('.') + 1
      response = output[index_response:end_response]
-     return response


- with gr.Blocks() as demo:
-     with gr.Row():
-         with gr.Column():
-             title = gr.Textbox(label="Title")
-             abstract = gr.Textbox(label="Abstract")
-             userDefinedKeyphrases = gr.Textbox(label="Your keyphrases (Optional - Model will elaborate on these keyphrases without using the title or abstract)")
-             keyphraseButton = gr.Button("Generate Keyphrase Elaboration")
-             summaryButton = gr.Button("Generate Plain Language Summary")
-             with gr.Accordion(label="Parameters", open=False):
-                 maxTokensElaboration = gr.Slider(
-                     label="Maximum Number of Elaboration Tokens",
-                     value=500,
-                     minimum=0,
-                     maximum=2048,
-                     step=10,
-                     interactive=True,
-                     info="Length of Keyphrase Elaboration",
-                 )
-                 maxTokensSummary = gr.Slider(
-                     label="Maximum Number of Summary Tokens",
-                     value=300,
-                     minimum=0,
-                     maximum=2048,
-                     step=10,
-                     interactive=True,
-                     info="Length of Plain Language Summary",
-                 )
-                 numAbstractSentencesKeyphrase = gr.Slider(
-                     label="Number of Abstract Sentences to use for Keyphrase Extraction",
-                     value=2,
-                     minimum=0,
-                     maximum=20,
-                     step=1,
-                     interactive=True,
-                     info="Default: use first two sentences of abstract."
-                 )
-                 numAbstractSentencesSummary = gr.Slider(
-                     label="Number of Abstract Sentences to use for Plain Language Summary",
-                     value=2,
-                     minimum=0,
-                     maximum=20,
-                     step=1,
-                     interactive=True,
-                     info="Default: use first two sentences of abstract."
-                 )
-         with gr.Column():
-             outputKeyphrase = [gr.Textbox(label="Keyphrases"), gr.Textbox(label="Keyphrase Elaboration")]
-             outputSummary = gr.Textbox(label="Plain Language Summary")

-     keyphraseButton.click(fn=keyphraseElaboration, inputs=[title, abstract, userDefinedKeyphrases, maxTokensElaboration, numAbstractSentencesKeyphrase], outputs=outputKeyphrase)
-     summaryButton.click(fn=plainLanguageSummary, inputs=[title, abstract, maxTokensSummary, numAbstractSentencesSummary], outputs = outputSummary)

  demo.launch()

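Both the removed functions above and the new generate() below carve the assistant's reply out of the decoded model output the same way: find the literal "### Assistant: " marker (15 characters long) and cut at the last period. A small self-contained illustration of that slicing, using an invented decoded string (only the marker arithmetic comes from the diff):

# Standalone illustration of the response slicing shared by both versions of app.py.
# The decoded string below is invented for the example.
output = (
    "A chat between a curious human and an artificial intelligence assistant."
    "The assistant gives helpful, detailed, and polite answers to the user's questions.\n"
    "### Human: What is the purpose of studying knot theory? \n"
    "### Assistant: Knot theory studies closed curves in space. It is applied in DNA topology and cryptography. Hope that helps"
)

index_response = output.find("### Assistant: ") + 15   # 15 == len("### Assistant: ")
end_response = output.rfind('.') + 1                   # keep text up to and including the last period
response = output[index_response:end_response]
print(response)
# Knot theory studies closed curves in space. It is applied in DNA topology and cryptography.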
  import gradio as gr
+
  from nltk.tokenize import sent_tokenize
  import torch
  import ujson as json
  print('Guanaco model loaded into memory.')


+ def generate(title, abstract):
+     print("Started running.")
      '''
+     Take gradio input and output data to sample-data.jsonl in readable form for classifier.py to run.
+     '''
+     newline = {}
+     text = abstract
      # eliminate word lowercase "abstract" or "abstract." at beginning of abstract text
+     if text.lower()[0:9] == "abstract.":
+         text = text[9:]
+     elif text.lower()[0:8] == "abstract":
+         text = text[8:]
+     sentences = sent_tokenize(text)
+     newline["target"] = sentences
+     newline["title"] = title
+     print("Tokenized abstract to sentences.")
+     '''
+     Main part
+     '''
      '''
      This is for summarization
      '''
+     tooShortForKeyword = False
+     obj = newline
+     doc = ""
+     if len(obj["target"]) > 1:
+         doc += obj["title"] + ". " + obj["target"][0] + " " + obj["target"][1]
+     elif len(obj["target"]) == 1:
+         tooShortForKeyword = True
+         doc += obj["title"] + ". " + obj["target"][0]
+     else:
+         tooShortForKeyword = True
+         doc += obj["title"]
+     text = doc
      prompt = """
      Can you explain the main idea of what is being studied in the following paragraph for someone who is not familiar with the topic. Comment on areas of application.:
      """
      formatted_prompt = (
          f"A chat between a curious human and an artificial intelligence assistant."
          f"The assistant gives helpful, detailed, and polite answers to the user's questions.\n"
+         f"### Human: {prompt + doc} \n"
          f"### Assistant:"
      )
+     inputs = tok(formatted_prompt, return_tensors="pt")#.to("cuda:1")
+     outputs = m.generate(inputs=inputs.input_ids, max_new_tokens=300)
      output = tok.decode(outputs[0], skip_special_tokens=True)
      index_response = output.find("### Assistant: ") + 15
+     if (output[index_response:index_response + 10] == "Certainly!"):
          index_response += 10
      end_response = output.rfind('.') + 1
      response = output[index_response:end_response]
+     print('Plain Language Summary Created.')

+     '''
+     Keyphrase extraction.
+     '''
+     # the document is the title and first two sentences of the abstract.

+     obj = newline
+     doc = ""
+     if len(obj["target"]) > 1:
+         doc += obj["title"] + ". " + obj["target"][0] + " " + obj["target"][1]
+     kw_model = KeyBERT(model="all-MiniLM-L6-v2")
+     vectorizer = KeyphraseCountVectorizer()
+     top_n = 2
+     keywords = kw_model.extract_keywords(doc, stop_words="english", top_n = top_n, vectorizer=vectorizer, use_mmr=True)
+     my_keywords = []
+     for i in range(top_n):
+         add = True
+         for j in range(top_n):
+             if i != j:
+                 if keywords[i][0] in keywords[j][0]:
+                     add = False
+         if add:
+             my_keywords.append(keywords[i][0])
+     for entry in my_keywords:
+         print(entry)
+     '''
+     This is for feeding the keyphrases into Guanaco.
+     '''
+     responseTwo = ""
+     keyword_string = ""
+     if not tooShortForKeyword:
+         separator = ', '
+         keyword_string = separator.join(my_keywords)
+         prompt = "What is the purpose of studying " + keyword_string + "? Comment on areas of application."

+         formatted_prompt = (
+             f"A chat between a curious human and an artificial intelligence assistant."
+             f"The assistant gives helpful, detailed, and polite answers to the user's questions.\n"
+             f"### Human: {prompt} \n"
+             f"### Assistant:"
+         )
+         inputs = tok(formatted_prompt, return_tensors="pt")#.to("cuda:2")
+         outputs = m.generate(inputs=inputs.input_ids, max_new_tokens=300)
+         output = tok.decode(outputs[0], skip_special_tokens=True)
+         index_response = output.find("### Assistant: ") + 15
+         end_response = output.rfind('.') + 1
+         responseTwo = output[index_response:end_response]
+         print('Keyphrase elaboration ran.')
+     return keyword_string, responseTwo, response

+ demo = gr.Interface(
+     fn=generate,
+     inputs=[gr.Textbox(label="Title"), gr.Textbox(label="Abstract")],
+     outputs=[gr.Textbox(label="Keyphrases"), gr.Textbox(label="Keyphrase Elaboration"), gr.Textbox(label="Plain Language Summary")],
+ )
  demo.launch()
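For reference, the keyphrase-extraction step that generate() adds can be exercised on its own. The sketch below mirrors the KeyBERT and KeyphraseCountVectorizer calls and the substring filter from the diff; the sample title and abstract are invented.

# Minimal, standalone sketch of the keyphrase-extraction step in generate().
# The document below is invented; the calls mirror those in app.py.
from keybert import KeyBERT
from keyphrase_vectorizers import KeyphraseCountVectorizer

# "Title. First sentence. Second sentence." -- same shape as the doc built in app.py.
doc = (
    "Graph neural networks for molecular property prediction. "
    "We study message-passing architectures on molecular graphs. "
    "The learned representations transfer to downstream screening tasks."
)

kw_model = KeyBERT(model="all-MiniLM-L6-v2")
vectorizer = KeyphraseCountVectorizer()
top_n = 2
keywords = kw_model.extract_keywords(
    doc, stop_words="english", top_n=top_n, vectorizer=vectorizer, use_mmr=True
)

# extract_keywords returns (phrase, score) pairs; drop any phrase that is
# contained in another returned phrase, as the nested loop in app.py does.
phrases = [phrase for phrase, _ in keywords]
my_keywords = [
    phrases[i]
    for i in range(len(phrases))
    if not any(i != j and phrases[i] in phrases[j] for j in range(len(phrases)))
]
print(my_keywords)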