dromerosm committed on
Commit
f439c8b
·
0 Parent(s):

Duplicate from dromerosm/chatgpt-info-extraction

Browse files
Files changed (4) hide show
  1. .gitattributes +34 -0
  2. README.md +14 -0
  3. app.py +100 -0
  4. requirements.txt +4 -0
.gitattributes ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Chatgpt Info Extraction
3
+ emoji: 😻
4
+ colorFrom: red
5
+ colorTo: blue
6
+ sdk: gradio
7
+ sdk_version: 3.20.1
8
+ app_file: app.py
9
+ pinned: false
10
+ license: cc-by-4.0
11
+ duplicated_from: dromerosm/chatgpt-info-extraction
12
+ ---
13
+
14
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+ import openai
4
+ from newspaper import Article
5
+ import json
6
+ import re
7
+ from transformers import GPT2Tokenizer
8
+ import requests
9
+
10
+
11
+ # define the text summarizer function
12
# Maximum number of article tokens forwarded to the model.
_MAX_ARTICLE_TOKENS = 3100
# Articles counted at this many tokens or fewer are not sent to the model.
_MIN_ARTICLE_TOKENS = 10
# Seconds to wait for the article server before giving up.
_REQUEST_TIMEOUT_SECONDS = 30
# Lazily created tokenizer shared by all calls (loading it is expensive).
_TOKENIZER = None


def _get_tokenizer():
    """Return the shared GPT-2 tokenizer, loading it on first use."""
    global _TOKENIZER
    if _TOKENIZER is None:
        _TOKENIZER = GPT2Tokenizer.from_pretrained("gpt2")
    return _TOKENIZER


def _trim_to_token_budget(text, tokenizer, max_tokens):
    """Keep the leading sentences of *text* that fit within *max_tokens*.

    Sentences are obtained by splitting on '.', and each kept sentence is
    re-terminated with ". ". Returns ``(trimmed_text, token_count)`` where
    ``token_count`` covers only the sentences that were kept.
    """
    kept = []
    token_count = 0
    for sentence in text.split('.'):
        sentence_tokens = len(tokenizer.tokenize(sentence))
        # Stop before the sentence that would push the total over budget.
        if token_count + sentence_tokens > max_tokens:
            break
        token_count += sentence_tokens
        kept.append(sentence + ". ")
    return "".join(kept).strip(), token_count


def text_prompt(request, system_role, page_url, contraseña, temp):
    """Download an article, trim it, and query gpt-3.5-turbo about it.

    Args:
        request: User prompt; it is placed before the article text.
        system_role: Content of the system message sent to the model.
        page_url: URL of the article to download and parse.
        contraseña: OpenAI API key; it must start with "sk-".
        temp: Sampling temperature passed to the API.

    Returns:
        A 3-tuple of strings ``(article_text, output, tokens)``:
        the full parsed article text, the Markdown answer (or an error
        message), and either the API's total token usage with an estimated
        cost or the locally counted article tokens.
    """
    if not page_url or not page_url.strip():
        return "", "--- An error occurred while processing the URL: no URL provided ---", ""

    try:
        headers = {'User-Agent': 'Chrome/83.0.4103.106'}
        # A timeout keeps a stalled server from blocking the worker forever.
        response = requests.get(page_url, headers=headers, timeout=_REQUEST_TIMEOUT_SECONDS)
        # Report HTTP errors instead of parsing an error page as the article.
        response.raise_for_status()

        page = Article('')
        page.set_html(response.text)
        page.parse()
    except Exception as e:
        return "", f"--- An error occurred while processing the URL: {e} ---", ""

    page_text, num_tokens = _trim_to_token_budget(
        page.text, _get_tokenizer(), _MAX_ARTICLE_TOKENS
    )

    api_key = contraseña or ""
    if num_tokens <= _MIN_ARTICLE_TOKENS or not api_key.startswith("sk-"):
        return page.text, "--- Check API-Key or Min number of tokens:", str(num_tokens)

    openai.api_key = api_key
    try:
        # get the response from openai API
        completion = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": system_role},
                {"role": "user", "content": request + "\n\n" + 'Text:\n\n"' + page_text + '\n"'},
            ],
            max_tokens=512,
            temperature=temp,
            top_p=1.0,
        )
        response_text = completion['choices'][0]['message']['content']
        total_tokens = completion["usage"]["total_tokens"]

        # Collapse all whitespace runs, then prefix a Markdown link to the article.
        response_text = re.sub(r'\s+', ' ', response_text)
        response_text = f"#### [{page.title}]({page_url})\n\n{response_text.strip()}"
        total_tokens_str = str(total_tokens) + " (${:.2f} USD)".format(total_tokens/1000*0.002)
    except Exception as e:
        # Return the count as text so every path yields the same output types.
        return page.text, f"--- An error occurred while processing the request: {e} ---", str(num_tokens)

    return page.text, response_text, total_tokens_str
72
+
73
+ # define the gradio interface
74
# Gradio UI: five inputs (prompt, system role, article URL, API key,
# temperature) wired to text_prompt, and three outputs (parsed article text,
# Markdown answer, token summary). The examples leave the API key empty.
iface = gr.Interface(
    fn=text_prompt,
    inputs=[
        gr.Textbox(lines=1, placeholder="Enter your prompt here...", label="Prompt:", type="text"),
        gr.Textbox(lines=1, placeholder="Enter your system-role description here...", label="System Role:", type="text"),
        gr.Textbox(lines=1, placeholder="Enter the Article's URL here...", label="Article's URL to parse:", type="text"),
        gr.Textbox(lines=1, placeholder="Enter your API-key here...", label="API-Key:", type="password"),
        gr.Slider(0.0, 1.0, value=0.3, label="Temperature:"),
    ],
    outputs=[gr.Textbox(label="Input:"), gr.Markdown(label="Output:"), gr.Markdown(label="Total Tokens:")],
    examples=[
        ["Resumen el siguiente texto en un máximo de 100 palabras.", "Actuar como consultor de negocio. La respuesta deberá aparentar ser novedosa. Formatea la respuesta en Markdown. El texto deberá ser traducido siempre al español. Deberás añadir al final una lista de topics del texto en forma de lista separada por comas.", "https://blog.google/outreach-initiatives/google-org/our-commitment-on-using-ai-to-accelerate-progress-on-global-development-goals/", "", 0.3],
        ["Generate a summary of the following text. Give me an overview of the main business impact from the text following this template:\n- Summary:\n- Business Impact:\n- Companies:", "Act as a Business Consultant", "https://ai.googleblog.com/2019/10/quantum-supremacy-using-programmable.html", "", 0.7],
        ["Generate the next insights based on the following text. Indicates N/A if the information is not available in the text.\n- Summary:\n- Acquisition Price:\n- Why is this important for the acquirer:\n- Business Line for the acquirer:\n- Tech Focus for the acquired (list):", "Act as a Business Consultant", "https://techcrunch.com/2022/09/28/eqt-acquires-billtrust-a-company-automating-the-invoice-to-cash-process-for-1-7b/", "", 0.3],
    ],
    title="ChatGPT info extraction from URL",
    # The token figure below matches the 3100-token trim applied in text_prompt.
    description=(
        "This tool allows querying the text retrieved from the URL with newspaper3k lib and using OpenAI's [gpt-3.5-turbo] engine.\n"
        "The URL text can be referenced in the prompt as \"following text\".\n"
        "A GPT2 tokenizer is included to ensure that the 3,100 token limit for OpenAI queries is not exceeded. "
        "Provide a prompt with your request, the description for the system role, the url for text retrieval, your api-key and temperature to process the text."
    ),
)
90
+
91
+ # error capturing in integration as a component
92
+
93
# Start the app with request queuing enabled. If startup raises, the failure
# text is stored on the interface's second output component.
error_message = ""

try:
    iface.queue(concurrency_count=20)
    iface.launch()
except Exception as launch_error:
    error_message = f"An error occurred: {launch_error}"
    # NOTE(review): assumes the interface exposes its output components as
    # `iface.outputs` — confirm against the installed gradio version.
    iface.outputs[1].value = error_message
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ openai==0.27
2
+ transformers
3
+ newspaper3k
4
+ requests