onurnsfw committed on
Commit
32063f6
1 Parent(s): b4ecccf

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +121 -0
app.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Brochure app: given a company website, fetch its links, scrape the relevant pages, and generate a brochure from the collected content
2
+ from bs4 import BeautifulSoup
3
+ import requests
4
+ import json
5
+ import gradio as gr
6
+ from langchain_groq import ChatGroq
7
+ import os
8
+ # A class to represent a Webpage
9
+
10
class Website:
    """
    A scraped web page: its title, visible body text and outgoing links.

    Attributes:
        url: The address that was requested.
        body: Raw response payload as returned by ``response.content``.
        title: Text of the ``<title>`` tag, or ``"No title found"``.
        text: Text of ``<body>`` with script/style/img/input elements removed,
            pieces separated by newlines; empty string when there is no body.
        links: Every non-empty ``href`` found on an ``<a>`` tag, exactly as
            written in the page (so entries may be relative).
    """

    def __init__(self, url, timeout=10):
        """
        Download *url* and extract title, text and links from it.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds to wait on the server before giving up. Without
                a timeout a stalled server would block the caller forever.

        Raises:
            Whatever ``requests.get`` raises (for example on a connection
            failure or when the timeout expires) propagates to the caller.
        """
        self.url = url
        response = requests.get(url, timeout=timeout)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')

        # ``.string`` is None when <title> is empty or contains nested tags;
        # fall back to the placeholder instead of storing None.
        title = soup.title.string if soup.title else None
        self.title = title if title is not None else "No title found"

        if soup.body:
            # Drop elements that carry no readable prose before extracting text.
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""

        hrefs = (anchor.get('href') for anchor in soup.find_all('a'))
        self.links = [href for href in hrefs if href]

    def get_contents(self):
        """Return the title and body text formatted as one prompt section."""
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"
32
+
33
+ ## Check whether the links are relevant or not
34
def get_links_system_prompt(website):
    """
    Build the system prompt that asks the model to pick brochure-worthy links.

    Args:
        website: Accepted for signature symmetry with
            ``get_links_user_prompt``; the prompt text does not depend on it.

    Returns:
        The instruction text followed by an example of the expected JSON reply.
    """
    link_system_prompt = (
        "You are provided with a list of links found on a webpage. "
        "You are able to decide which of the links would be most relevant to include in a brochure about the company, "
        "such as links to an About page, or a Company page, or Careers/Jobs pages.\n"
    )
    link_system_prompt += "You should respond in JSON as in this example, do not say anything else:"
    # The example must itself be valid JSON: the reply is parsed with
    # json.loads, and a malformed example invites malformed replies.
    link_system_prompt += """
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
}
"""
    return link_system_prompt
48
+
49
def get_links_user_prompt(website):
    """
    Build the user prompt listing every link scraped from *website*.

    Args:
        website: Object exposing ``url`` (str) and ``links`` (iterable of str).

    Returns:
        The request text followed by the links, one per line.
    """
    sections = [
        f"Here is the list of links on the website of {website.url} - ",
        "please decide which of these are relevant web links for a brochure about the company, "
        "respond with the full https URL in JSON format. "
        "Do not include Terms of Service, Privacy, email links.\n",
        "Links (some might be relative links):\n",
        "\n".join(website.links),
    ]
    return "".join(sections)
56
+
57
def get_relevant_links(url, llm):
    """
    Ask the model which links on the page at *url* belong in a brochure.

    Args:
        url: Address of the page whose links are to be classified.
        llm: Chat model exposing ``invoke(messages)`` whose result has a
            ``content`` string.

    Returns:
        The parsed JSON reply. The prompt requests the shape
        ``{"links": [{"type": ..., "url": ...}, ...]}``, but the reply is
        returned as parsed without validation.

    Raises:
        json.JSONDecodeError: If no JSON object can be recovered from the reply.
    """
    website = Website(url)
    messages = [
        ("system", get_links_system_prompt(website)),
        ("human", get_links_user_prompt(website)),
    ]
    reply = llm.invoke(messages).content
    try:
        return json.loads(reply)
    except json.JSONDecodeError:
        # Chat models frequently wrap the JSON in code fences or a sentence of
        # prose despite the instructions; retry on the outermost {...} span.
        start, end = reply.find("{"), reply.rfind("}")
        if start == -1 or end < start:
            raise
        return json.loads(reply[start:end + 1])
65
+
66
def get_all_details(url, llm):
    """
    Collect the landing page plus every relevant linked page as one text blob.

    Args:
        url: Address of the company's landing page.
        llm: Chat model forwarded to ``get_relevant_links``.

    Returns:
        The landing-page contents followed by one section per linked page
        that could be fetched, each headed by the link's type.
    """
    from urllib.parse import urljoin

    sections = ["Landing page:\n", Website(url).get_contents()]
    links = get_relevant_links(url, llm)
    print("Found links:", links)
    for link in links.get("links", []):
        target = link.get("url")
        if not target:
            # Nothing to fetch for an entry without a URL.
            continue
        try:
            # The model may echo a relative link; resolve it against the
            # landing page. Absolute URLs pass through urljoin unchanged.
            page = Website(urljoin(url, target))
        except requests.RequestException as exc:
            # One unreachable page should not abort the whole brochure.
            print(f"Skipping {target}: {exc}")
            continue
        sections.append(f"\n\n{link.get('type', 'page')}\n")
        sections.append(page.get_contents())
    return "".join(sections)
75
+
76
def get_brochure_system_prompt():
    """
    Return the system prompt instructing the model to write the brochure.

    Returns:
        A single-paragraph instruction asking for a short markdown brochure
        aimed at prospective customers, investors and recruits.
    """
    # Adjacent literals keep the sentences separated by exactly one space; a
    # backslash continuation right after "markdown." would glue it to the
    # next sentence.
    system_prompt = (
        "You are an assistant that analyzes the contents of several relevant pages from a company website "
        "and creates a short brochure about the company for prospective customers, investors and recruits. "
        "Respond in markdown. "
        "Include details of company culture, customers and careers/jobs if you have the information."
    )
    return system_prompt
81
+
82
def get_brochure_user_prompt(company_name, url, llm):
    """
    Build the user prompt carrying the scraped site contents.

    Args:
        company_name: Name of the company to mention in the prompt.
        url: Landing-page address passed to ``get_all_details``.
        llm: Chat model passed to ``get_all_details``.

    Returns:
        The prompt text, cut to at most 20,000 characters.
    """
    max_chars = 20_000  # Truncate if more than 20,000 characters
    intro = f"You are looking at a company called: {company_name}\n"
    request = "Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\n"
    full_prompt = intro + request + get_all_details(url, llm)
    return full_prompt[:max_chars]
88
+
89
def create_brochure(company_name, url, llm_choice, groq_api_key):
    """
    Generate a markdown brochure for a company from its website.

    Args:
        company_name: Name of the company, inserted into the prompt.
        url: Address of the company's landing page.
        llm_choice: A key of ``llm_options`` selecting the Groq model.
        groq_api_key: The caller's Groq API key.

    Returns:
        The model's reply text.

    Raises:
        gr.Error: If no valid model is selected or the API key is empty, so
            the UI shows a readable message instead of a bare KeyError.
    """
    if llm_choice not in llm_options:
        raise gr.Error("Please select one of the available LLMs.")
    if not groq_api_key:
        raise gr.Error("Please enter your Groq API key.")

    # Hand the key to the client directly rather than writing it to
    # os.environ: the environment is process-wide, so concurrent requests
    # would overwrite each other's key and the last one would linger.
    llm = ChatGroq(model=llm_options[llm_choice], api_key=groq_api_key)
    messages = [
        ("system", get_brochure_system_prompt()),
        ("human", get_brochure_user_prompt(company_name, url, llm)),
    ]
    return llm.invoke(messages).content
99
+
100
+
101
# Dropdown label -> model name handed to ChatGroq.
llm_options = {
    "Gemma2": "gemma2-9b-it",
    "LLama": "llama-3.2-3b-preview",
    "Mixtral": "mixtral-8x7b-32768",
}

# Gradio interface: the four inputs map positionally onto
# create_brochure(company_name, url, llm_choice, groq_api_key).
demo = gr.Interface(
    fn=create_brochure,
    inputs=[
        "text",  # company name
        "text",  # landing-page URL
        gr.Dropdown(choices=list(llm_options), label="Select LLM"),
        gr.Textbox(type="password", label="Enter Groq API Key"),
    ],
    outputs="markdown",  # the brochure is rendered as markdown
    title="Brochure Generator",
    description="Generate brochures by selecting a company name, URL, and LLM. Provide your Groq API Key.",
)

demo.launch()