# lyca-mobile-chatbot/scrape.py
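"""Multithreaded link scraper.

Prompts for a starting URL and a maximum crawl depth, then uses headless
Chrome (driven by Selenium) to recursively collect every same-domain link up
to that depth. The collected links are written to links.txt.
"""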
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
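
# Third-party dependencies (assumed to be installed via pip):
#   pip install selenium webdriver-manager beautifulsoup4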

# Lock for thread-safe access to the shared visited set
visited_lock = threading.Lock()

# URLs already collected (shared across threads, guarded by visited_lock)
visited = set()
# Function to scrape links with depth control
def get_all_links(url, max_depth, current_depth=0):
if current_depth > max_depth:
return []
try:
# Print the current URL being scraped
print(f"Scraping: {url} at depth {current_depth}")
# Set up Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless") # Run in headless mode
# Set up the Chrome driver
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)
# Navigate to the URL
driver.get(url)
# Wait for the page to load (adjust the sleep time if needed)
time.sleep(5)
# Get the page source and parse it with BeautifulSoup
soup = BeautifulSoup(driver.page_source, 'html.parser')
# Find all 'a' tags and extract the 'href' attribute
links = set()
for a_tag in soup.find_all('a', href=True):
href = a_tag['href']
full_url = urljoin(url, href)
# Only include links from the same domain and not already visited
with visited_lock:
if urlparse(full_url).netloc == urlparse(url).netloc and full_url not in visited:
visited.add(full_url)
links.add(full_url)
# Close the browser
driver.quit()
return list(links)
except Exception as e:
print(f"Error fetching the URL: {e}")
return []
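
# Note: each call above starts (and tears down) its own headless Chrome
# instance, which keeps the worker threads independent at the cost of
# per-page startup overhead; a shared driver pool would be faster but would
# need additional locking.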


# Recursively scrape a batch of URLs, one thread-pool task per URL,
# until max_depth is reached.
def scrape_recursive(urls, max_depth, current_depth, executor):
    if current_depth > max_depth:
        return set()

    # Submit one scraping task per URL to the ThreadPoolExecutor
    futures = [executor.submit(get_all_links, url, max_depth, current_depth) for url in urls]
    all_links = set()
    for future in as_completed(futures):
        try:
            all_links.update(future.result())
        except Exception as e:
            print(f"Error in thread: {e}")

    # Recursively scrape the newly discovered links at the next depth
    if current_depth + 1 <= max_depth:
        all_links.update(scrape_recursive(all_links, max_depth, current_depth + 1, executor))

    return all_links


def main():
    # Get the starting URL and maximum crawl depth from the user
    input_url = input("Enter the URL to scrape: ")
    max_depth = int(input("Enter the maximum depth: "))

    # ThreadPoolExecutor for scraping pages concurrently
    with ThreadPoolExecutor(max_workers=10) as executor:
        # Start scraping from the input URL at depth 0
        all_links = scrape_recursive([input_url], max_depth, 0, executor)

    # Save the results to links.txt
    with open("links.txt", "w") as file:
        for link in all_links:
            file.write(f"{link}\n")

    print(f"\nFound {len(all_links)} links up to depth {max_depth}. Saved to links.txt.")


if __name__ == "__main__":
    main()
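
# Illustrative run (the URL, depth, and link count below are placeholders):
#   $ python scrape.py
#   Enter the URL to scrape: https://www.example.com
#   Enter the maximum depth: 1
#   Scraping: https://www.example.com at depth 0
#   ...
#   Found 42 links up to depth 1. Saved to links.txt.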