qdqd commited on
Commit
e72495e
1 Parent(s): dc4dd17

Create sraper.py

Browse files
Files changed (1) hide show
  1. sraper.py +440 -0
sraper.py ADDED
@@ -0,0 +1,440 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import random
3
+ import time
4
+ import re
5
+ import json
6
+ from datetime import datetime
7
+ from typing import List, Dict, Type
8
+
9
+ import pandas as pd
10
+ from bs4 import BeautifulSoup
11
+ from pydantic import BaseModel, Field, create_model
12
+ import html2text
13
+ import tiktoken
14
+
15
+ from dotenv import load_dotenv
16
+ from selenium import webdriver
17
+ from selenium.webdriver.chrome.service import Service
18
+ from selenium.webdriver.chrome.options import Options
19
+ from selenium.webdriver.common.by import By
20
+ from selenium.webdriver.common.action_chains import ActionChains
21
+ from selenium.webdriver.support.ui import WebDriverWait
22
+ from selenium.webdriver.support import expected_conditions as EC
23
+
24
+
25
+ from openai import OpenAI
26
+ import google.generativeai as genai
27
+ from groq import Groq
28
+
29
+
30
+ from assets import USER_AGENTS,PRICING,HEADLESS_OPTIONS,SYSTEM_MESSAGE,USER_MESSAGE,LLAMA_MODEL_FULLNAME,GROQ_LLAMA_MODEL_FULLNAME
31
+ load_dotenv()
32
+
33
+ # Set up the Chrome WebDriver options
34
+
35
def setup_selenium():
    """Build and return a Chrome WebDriver configured for scraping.

    A user agent is picked at random from ``USER_AGENTS`` and every entry of
    ``HEADLESS_OPTIONS`` is forwarded to Chrome as a command-line argument.
    The driver binary is loaded from ``./chromedriver-win64/chromedriver.exe``.

    Returns:
        The newly started ``webdriver.Chrome`` instance; the caller is
        responsible for quitting it.
    """
    chrome_options = Options()

    # Present a randomly chosen user agent for this browser session.
    chrome_options.add_argument(f"user-agent={random.choice(USER_AGENTS)}")

    # Forward the remaining Chrome flags unchanged.
    for flag in HEADLESS_OPTIONS:
        chrome_options.add_argument(flag)

    # Path to the ChromeDriver executable.
    chromedriver = Service(r"./chromedriver-win64/chromedriver.exe")

    return webdriver.Chrome(service=chromedriver, options=chrome_options)
52
+
53
def click_accept_cookies(driver):
    """Best-effort click on a cookie-consent control.

    Waits up to 10 seconds for any ``button``, ``a`` or ``div`` element to be
    present, then searches those tags (in that order) for an element whose
    text contains one of several common "accept" phrases, compared
    case-insensitively. The first element that can be found and clicked wins.

    Args:
        driver: Selenium WebDriver positioned on the page to inspect.

    Returns:
        None. The outcome is reported on stdout; failures never propagate
        as ``Exception`` to the caller.
    """
    # All phrases must be lower-case: the XPath lower-cases the element text
    # before comparing, so a phrase containing capitals could never match.
    accept_text_variations = [
        "accept", "agree", "allow", "consent", "continue", "ok", "i agree", "got it"
    ]
    upper = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    lower = 'abcdefghijklmnopqrstuvwxyz'

    try:
        # Wait for the page (and any cookie popup) to render some candidates.
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, "//button | //a | //div"))
        )

        for tag in ("button", "a", "div"):
            for text in accept_text_variations:
                xpath = (
                    f"//{tag}[contains(translate(text(), '{upper}', '{lower}'), '{text}')]"
                )
                try:
                    # find_element raises when nothing matches, so no
                    # truthiness check on the result is needed.
                    driver.find_element(By.XPATH, xpath).click()
                except Exception:
                    # Not found or not clickable: try the next candidate.
                    # ``Exception`` (rather than a bare ``except``) lets
                    # KeyboardInterrupt and SystemExit propagate.
                    continue
                print(f"Clicked the '{text}' button.")
                return

        print("No 'Accept Cookies' button found.")

    except Exception as e:
        print(f"Error finding 'Accept Cookies' button: {e}")
85
+
86
def fetch_html_selenium(url):
    """Load *url* in a fresh Chrome session and return the rendered HTML.

    The page is opened, the window maximised, and the view scrolled to the
    bottom twice with fixed pauses in between so that content loaded on
    scroll has a chance to appear. The browser is always shut down, even if
    loading or scrolling raises.

    Args:
        url: Address of the page to fetch.

    Returns:
        The page source as reported by the driver after scrolling.
    """
    browser = setup_selenium()
    scroll_to_bottom = "window.scrollTo(0, document.body.scrollHeight);"
    try:
        browser.get(url)

        # Fixed pause to give the page time to settle before interacting.
        time.sleep(1)
        browser.maximize_window()

        # Cookie-banner handling is currently disabled:
        # click_accept_cookies(driver)

        # Scroll to the bottom twice, pausing 2 s and then 1 s afterwards.
        for pause_seconds in (2, 1):
            browser.execute_script(scroll_to_bottom)
            time.sleep(pause_seconds)

        return browser.page_source
    finally:
        browser.quit()
108
+
109
def clean_html(html_content):
    """Return *html_content* with all ``<header>`` and ``<footer>`` elements removed.

    The markup is parsed with BeautifulSoup's ``html.parser`` backend and
    serialised back to a string after the matching elements (and everything
    inside them) have been dropped.
    """
    document = BeautifulSoup(html_content, 'html.parser')

    unwanted = document.find_all(['header', 'footer'])
    for node in unwanted:
        # decompose() removes the tag together with its content.
        node.decompose()

    return str(document)
117
+
118
+
119
def html_to_markdown_with_readability(html_content):
    """Convert an HTML document to Markdown.

    The HTML is first passed through ``clean_html`` and then rendered by
    ``html2text`` with links kept in the output.

    Args:
        html_content: Raw HTML markup.

    Returns:
        The Markdown text produced by ``html2text``.
    """
    stripped_html = clean_html(html_content)

    converter = html2text.HTML2Text()
    converter.ignore_links = False  # keep hyperlinks in the Markdown
    return converter.handle(stripped_html)
130
+
131
+
132
+
133
def save_raw_data(raw_data, timestamp, output_folder='output'):
    """Write *raw_data* to ``rawData_<timestamp>.md`` inside *output_folder*.

    The folder is created if it does not exist. The file is written as UTF-8
    and replaces any existing file of the same name.

    Args:
        raw_data: Text to store (the scraped Markdown).
        timestamp: Value embedded in the file name.
        output_folder: Destination directory.

    Returns:
        The path of the file that was written.
    """
    os.makedirs(output_folder, exist_ok=True)

    destination = os.path.join(output_folder, f'rawData_{timestamp}.md')
    with open(destination, 'w', encoding='utf-8') as handle:
        handle.write(raw_data)

    print(f"Raw data saved to {destination}")
    return destination
143
+
144
+
145
def remove_urls_from_file(file_path):
    """Strip http(s) URLs from a text file and save the result next to it.

    The cleaned text is written to ``<name>_cleaned<ext>`` in the same
    directory as *file_path*; the original file is left untouched.

    A URL runs from ``http://`` or ``https://`` up to the next whitespace,
    quote, ``<``, ``>``, ``)`` or ``]``, and never ends on sentence
    punctuation. This keeps Markdown link syntax balanced
    (``[text](url)`` becomes ``[text]()``) and removes fragments and ``~``
    paths together with the rest of the URL.

    Args:
        file_path: Path of the UTF-8 text file to clean.

    Returns:
        The cleaned text.
    """
    # The negative lookbehind makes the greedy match back off trailing
    # punctuation such as the comma in "see https://example.com, then".
    url_pattern = r'https?://[^\s<>"\')\]]+(?<![.,;:!?])'

    base, ext = os.path.splitext(file_path)
    new_file_path = f"{base}_cleaned{ext}"

    with open(file_path, 'r', encoding='utf-8') as file:
        markdown_content = file.read()

    cleaned_content = re.sub(url_pattern, '', markdown_content)

    with open(new_file_path, 'w', encoding='utf-8') as file:
        file.write(cleaned_content)
    print(f"Cleaned file saved as: {new_file_path}")
    return cleaned_content
165
+
166
+
167
def create_dynamic_listing_model(field_names: List[str]) -> Type[BaseModel]:
    """Create a Pydantic model class with one required ``str`` field per name.

    Args:
        field_names: Names of the fields to extract from the markdown; each
            becomes a required string field of the generated model.

    Returns:
        A new model class named ``DynamicListingModel``.
    """
    required_str_fields = {}
    for name in field_names:
        # (type, ...) declares a required field with no default.
        required_str_fields[name] = (str, ...)

    return create_model('DynamicListingModel', **required_str_fields)
176
+
177
+
178
def create_listings_container_model(listing_model: Type[BaseModel]) -> Type[BaseModel]:
    """Create a wrapper model holding a list of *listing_model* items.

    Returns:
        A new model class named ``DynamicListingsContainer`` with a single
        required field ``listings`` of type ``List[listing_model]``.
    """
    listings_field = (List[listing_model], ...)
    return create_model('DynamicListingsContainer', listings=listings_field)
183
+
184
+
185
+
186
+
187
def trim_to_token_limit(text, model, max_tokens=120000):
    """Truncate *text* so that it encodes to at most *max_tokens* tokens.

    Args:
        text: Text to trim.
        model: Model name used to select the tiktoken encoding.
        max_tokens: Maximum number of tokens to keep.

    Returns:
        *text* unchanged when it already fits, otherwise the decoded first
        *max_tokens* tokens.
    """
    try:
        encoder = tiktoken.encoding_for_model(model)
    except KeyError:
        # tiktoken only knows OpenAI model names; for any other model fall
        # back to a general-purpose encoding so the limit is applied
        # approximately instead of failing.
        encoder = tiktoken.get_encoding("cl100k_base")

    tokens = encoder.encode(text)
    if len(tokens) <= max_tokens:
        return text
    return encoder.decode(tokens[:max_tokens])
194
+
195
def _schema_type_label(field_info):
    """Return a printable type name for one JSON-schema property."""
    if "type" in field_info:
        return field_info["type"]
    # Union-typed properties list their alternatives under "anyOf" instead
    # of carrying a top-level "type".
    variants = [option["type"] for option in field_info.get("anyOf", []) if "type" in option]
    if variants:
        return " | ".join(variants)
    # No type information at all: ask for a string, which is what every
    # dynamically created listing field in this file is declared as.
    return "string"


def generate_system_message(listing_model: "Type[BaseModel]") -> str:
    """Build the system prompt describing the JSON the model must return.

    The prompt lists every property of *listing_model*'s JSON schema as
    ``"<name>": "<type>"`` inside a ``listings`` array template.

    Args:
        listing_model: Pydantic model class (anything exposing
            ``model_json_schema()``) describing a single listing.

    Returns:
        The complete system message text.
    """
    schema_info = listing_model.model_json_schema()

    field_descriptions = [
        f'"{field_name}": "{_schema_type_label(field_info)}"'
        for field_name, field_info in schema_info.get("properties", {}).items()
    ]

    # One '"name": "type"' entry per line, comma separated.
    schema_structure = ",\n".join(field_descriptions)

    system_message = f"""
You are an intelligent text extraction and conversion assistant. Your task is to extract structured information
from the given text and convert it into a pure JSON format. The JSON should contain only the structured data extracted from the text,
with no additional commentary, explanations, or extraneous information.
You could encounter cases where you can't find the data of the fields you have to extract or the data will be in a foreign language.
Please process the following text and provide the output in pure JSON format with no words before or after the JSON:
Please ensure the output strictly follows this schema:

{{
"listings": [
{{
{schema_structure}
}}
]
}} """

    return system_message
230
+
231
+
232
+
233
def _parse_chat_json(completion):
    """Decode a chat completion's JSON text and collect its reported token usage."""
    parsed_response = json.loads(completion.choices[0].message.content)
    token_counts = {
        "input_tokens": completion.usage.prompt_tokens,
        "output_tokens": completion.usage.completion_tokens,
    }
    return parsed_response, token_counts


def format_data(data, DynamicListingsContainer, DynamicListingModel, selected_model):
    """Ask the selected LLM to turn scraped text into structured listings.

    Args:
        data: Scraped text (Markdown) to extract from.
        DynamicListingsContainer: Container model class used as the response
            schema for the OpenAI and Gemini back ends.
        DynamicListingModel: Single-listing model class used to build the
            system prompt for the Llama back ends.
        selected_model: One of ``"gpt-4o-mini"``, ``"gpt-4o-2024-08-06"``,
            ``"gemini-1.5-flash"``, ``"Llama3.1 8B"`` or
            ``"Groq Llama3.1 70b"``.

    Returns:
        A ``(result, token_counts)`` tuple. ``result`` is the parsed
        container object for OpenAI, the raw JSON text for Gemini, and a
        decoded Python object for the Llama back ends. ``token_counts`` has
        the keys ``"input_tokens"`` and ``"output_tokens"``.

    Raises:
        ValueError: If *selected_model* is not supported.
        json.JSONDecodeError: If a Llama back end replies with text that is
            not valid JSON.
    """
    if selected_model in ["gpt-4o-mini", "gpt-4o-2024-08-06"]:
        # OpenAI structured-output API
        client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))
        completion = client.beta.chat.completions.parse(
            model=selected_model,
            messages=[
                {"role": "system", "content": SYSTEM_MESSAGE},
                {"role": "user", "content": USER_MESSAGE + data},
            ],
            response_format=DynamicListingsContainer
        )
        parsed = completion.choices[0].message.parsed

        # Token counts are estimated locally with tiktoken.
        # NOTE: the input estimate covers the user message only, not
        # SYSTEM_MESSAGE.
        encoder = tiktoken.encoding_for_model(selected_model)
        token_counts = {
            "input_tokens": len(encoder.encode(USER_MESSAGE + data)),
            # model_dump() is the pydantic v2 replacement for the
            # deprecated .dict().
            "output_tokens": len(encoder.encode(json.dumps(parsed.model_dump()))),
        }
        return parsed, token_counts

    elif selected_model == "gemini-1.5-flash":
        # Google Gemini API with a JSON response constrained to the schema
        genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
        model = genai.GenerativeModel('gemini-1.5-flash',
                generation_config={
                    "response_mime_type": "application/json",
                    "response_schema": DynamicListingsContainer
                })
        prompt = SYSTEM_MESSAGE + "\n" + USER_MESSAGE + data
        completion = model.generate_content(prompt)

        # Token counts come from the response's usage metadata, so no
        # separate count_tokens request is made.
        usage_metadata = completion.usage_metadata
        token_counts = {
            "input_tokens": usage_metadata.prompt_token_count,
            "output_tokens": usage_metadata.candidates_token_count
        }
        return completion.text, token_counts

    elif selected_model == "Llama3.1 8B":
        # The system prompt is generated from the listing schema.
        sys_message = generate_system_message(DynamicListingModel)

        # OpenAI-compatible server on localhost (LM Studio)
        client = OpenAI(base_url="http://localhost:1234/v1", api_key="lm-studio")

        completion = client.chat.completions.create(
            model=LLAMA_MODEL_FULLNAME,  # change this if needed (use a better model)
            messages=[
                {"role": "system", "content": sys_message},
                {"role": "user", "content": USER_MESSAGE + data}
            ],
            temperature=0.7,
        )

        print(completion.choices[0].message.content)
        return _parse_chat_json(completion)

    elif selected_model == "Groq Llama3.1 70b":
        # The system prompt is generated from the listing schema.
        sys_message = generate_system_message(DynamicListingModel)

        # Groq hosted API
        client = Groq(api_key=os.environ.get("GROQ_API_KEY"),)

        completion = client.chat.completions.create(
            messages=[
                {"role": "system", "content": sys_message},
                {"role": "user", "content": USER_MESSAGE + data}
            ],
            model=GROQ_LLAMA_MODEL_FULLNAME,
        )

        return _parse_chat_json(completion)

    else:
        raise ValueError(f"Unsupported model: {selected_model}")
339
+
340
+
341
+
342
def save_formatted_data(formatted_data, timestamp, output_folder='output'):
    """Persist structured results as JSON and as an Excel sheet.

    Writes ``sorted_data_<timestamp>.json`` and, when a DataFrame can be
    built, ``sorted_data_<timestamp>.xlsx`` into *output_folder* (created if
    missing).

    Args:
        formatted_data: A JSON string, a model object exposing
            ``model_dump()`` (or the older ``dict()``), or an already
            decoded ``dict``/``list``.
        timestamp: Value embedded in both file names.
        output_folder: Destination directory.

    Returns:
        The DataFrame built from the data, or ``None`` when creating the
        DataFrame or writing the Excel file fails (the error is printed).

    Raises:
        ValueError: If *formatted_data* is a string that is not valid JSON,
            or the decoded data is neither a dictionary nor a list.
    """
    os.makedirs(output_folder, exist_ok=True)

    if isinstance(formatted_data, str):
        # JSON text (e.g. from the Gemini API)
        try:
            formatted_data_dict = json.loads(formatted_data)
        except json.JSONDecodeError as err:
            # Chain the cause so the position of the JSON error is not lost.
            raise ValueError("The provided formatted data is a string but not valid JSON.") from err
    elif hasattr(formatted_data, 'model_dump'):
        # pydantic v2 model: .dict() is deprecated in favour of model_dump()
        formatted_data_dict = formatted_data.model_dump()
    elif hasattr(formatted_data, 'dict'):
        formatted_data_dict = formatted_data.dict()
    else:
        formatted_data_dict = formatted_data

    json_output_path = os.path.join(output_folder, f'sorted_data_{timestamp}.json')
    with open(json_output_path, 'w', encoding='utf-8') as f:
        json.dump(formatted_data_dict, f, indent=4)
    print(f"Formatted data saved to JSON at {json_output_path}")

    if isinstance(formatted_data_dict, dict):
        # A single-key dictionary is treated as a wrapper around the records.
        if len(formatted_data_dict) == 1:
            data_for_df = next(iter(formatted_data_dict.values()))
        else:
            data_for_df = formatted_data_dict
    elif isinstance(formatted_data_dict, list):
        data_for_df = formatted_data_dict
    else:
        raise ValueError("Formatted data is neither a dictionary nor a list, cannot convert to DataFrame")

    # Best effort: the JSON file above is already on disk, so a failure here
    # is reported and signalled with None rather than raised.
    try:
        df = pd.DataFrame(data_for_df)
        print("DataFrame created successfully.")

        excel_output_path = os.path.join(output_folder, f'sorted_data_{timestamp}.xlsx')
        df.to_excel(excel_output_path, index=False)
        print(f"Formatted data saved to Excel at {excel_output_path}")

        return df
    except Exception as e:
        print(f"Error creating DataFrame or saving Excel: {str(e)}")
        return None
385
+
386
def calculate_price(token_counts, model):
    """Compute the cost of a request from its token counts.

    Args:
        token_counts: Mapping with optional ``"input_tokens"`` and
            ``"output_tokens"`` entries; missing entries count as 0.
        model: Key into ``PRICING`` selecting the per-token rates.

    Returns:
        A tuple ``(input_tokens, output_tokens, total_cost)``.

    Raises:
        KeyError: If *model* has no entry in ``PRICING``.
    """
    tokens_in = token_counts.get("input_tokens", 0)
    tokens_out = token_counts.get("output_tokens", 0)

    # Each side is priced at its own per-token rate.
    cost_in = tokens_in * PRICING[model]["input"]
    cost_out = tokens_out * PRICING[model]["output"]

    return tokens_in, tokens_out, cost_in + cost_out
396
+
397
+
398
+
399
+
400
+
401
if __name__ == "__main__":
    # Demo run: scrape a test site, extract two fields, report token cost.
    url = 'https://webscraper.io/test-sites/e-commerce/static'
    fields = ['Name of item', 'Price']
    # Single source of truth for the model used for extraction and pricing.
    selected_model = "Groq Llama3.1 70b"

    try:
        # Timestamp shared by every output file of this run
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

        # Scrape the page and convert it to Markdown
        raw_html = fetch_html_selenium(url)
        markdown = html_to_markdown_with_readability(raw_html)

        # Save raw data
        save_raw_data(markdown, timestamp)

        # Build the per-listing model and the container holding a list of them
        DynamicListingModel = create_dynamic_listing_model(fields)
        DynamicListingsContainer = create_listings_container_model(DynamicListingModel)

        # Format data (from the Markdown, not the raw HTML)
        formatted_data, token_counts = format_data(
            markdown, DynamicListingsContainer, DynamicListingModel, selected_model
        )
        print(formatted_data)

        # Save formatted data
        save_formatted_data(formatted_data, timestamp)

        # Report token usage and the estimated cost
        input_tokens, output_tokens, total_cost = calculate_price(token_counts, selected_model)
        print(f"Input token count: {input_tokens}")
        print(f"Output token count: {output_tokens}")
        print(f"Estimated total cost: ${total_cost:.4f}")

    except Exception as e:
        print(f"An error occurred: {e}")