# lyca-mobile-chatbot /
# poemsforaphrodite's picture
# Upload folder using huggingface_hub
# c9e6ba4 verified
import threading
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
# URLs that have already been seen; shared by every worker thread.
visited = set()
# Held while checking `visited` so concurrent workers do not race on it.
visited_lock = threading.Lock()
# Function to scrape links with depth control
def get_all_links(url, max_depth, current_depth=0):
    """Return the not-yet-visited same-domain links found on the page at *url*.

    The page is rendered in a fresh headless Chrome instance so that links
    injected by JavaScript are included, then parsed with BeautifulSoup.

    Args:
        url: Absolute URL of the page to load.
        max_depth: Deepest crawl level that is still scraped.
        current_depth: Crawl level of *url*; defaults to 0.

    Returns:
        A list of absolute URLs on the same network location as *url* that
        were not already in the shared ``visited`` set. Each returned URL is
        added to ``visited``. An empty list is returned when
        ``current_depth > max_depth`` or when loading the page fails.
    """
    if current_depth > max_depth:
        return []

    # Print the current URL being scraped
    print(f"Scraping: {url} at depth {current_depth}")

    # Fixed delay that gives client-side rendering time to finish.
    # NOTE(review): 2 seconds is an assumed value -- adjust if pages load slowly.
    page_load_wait_seconds = 2

    driver = None
    try:
        # Set up Chrome options
        chrome_options = Options()
        chrome_options.add_argument("--headless")  # Run in headless mode

        # Set up the Chrome driver
        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=chrome_options)

        # Navigate to the URL and wait for the page to load
        driver.get(url)
        time.sleep(page_load_wait_seconds)

        # Get the page source and parse it with BeautifulSoup
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # The base page's domain does not change while we walk its anchors.
        base_netloc = urlparse(url).netloc

        # Find all 'a' tags and extract the 'href' attribute
        links = set()
        for a_tag in soup.find_all('a', href=True):
            full_url = urljoin(url, a_tag['href'])
            # Only include links from the same domain...
            if urlparse(full_url).netloc != base_netloc:
                continue
            # ...that no worker has seen yet. Check and mark under one lock
            # acquisition so two threads cannot both claim the same URL.
            with visited_lock:
                if full_url not in visited:
                    visited.add(full_url)
                    links.add(full_url)

        return list(links)
    except Exception as e:
        print(f"Error fetching the URL: {e}")
        return []
    finally:
        # Close the browser even when scraping failed, so Chrome processes
        # are not leaked.
        if driver is not None:
            driver.quit()
def scrape_recursive(urls, max_depth, current_depth, executor):
    """Scrape *urls* concurrently, then recurse into the links they yield.

    Args:
        urls: Iterable of page URLs to scrape at this level.
        max_depth: Deepest crawl level that is still scraped.
        current_depth: Crawl level of *urls*.
        executor: Executor whose ``submit`` runs ``get_all_links`` for each
            URL.

    Returns:
        A set of every link collected at this level and all deeper levels,
        or an empty list when ``current_depth > max_depth``.
    """
    if current_depth > max_depth:
        return []

    # Submit tasks for the URLs to the ThreadPoolExecutor
    futures = [executor.submit(get_all_links, url, max_depth, current_depth) for url in urls]

    all_links = set()
    for future in as_completed(futures):
        try:
            all_links.update(future.result())
        except Exception as e:
            # One failed page must not abort the rest of the crawl.
            print(f"Error in thread: {e}")

    # Recursively scrape the new set of links; skip the call when there is
    # nothing to follow or the next level would exceed the depth limit.
    if all_links and current_depth + 1 <= max_depth:
        new_links = scrape_recursive(all_links, max_depth, current_depth + 1, executor)
        all_links.update(new_links)

    return all_links
def main():
    """Prompt for a start URL and depth, crawl, and save the links found.

    Reads the URL and the maximum depth from standard input, crawls with a
    pool of 10 worker threads, writes one link per line to ``links.txt`` in
    the current directory, and prints a summary.

    Raises:
        ValueError: If the depth entered is not an integer.
    """
    # Get input URL and depth from the user
    input_url = input("Enter the URL to scrape: ")
    max_depth = int(input("Enter the maximum depth: "))

    # ThreadPoolExecutor for multithreading
    with ThreadPoolExecutor(max_workers=10) as executor:
        # Start scraping
        all_links = scrape_recursive([input_url], max_depth, 0, executor)

    # Save the results to links.txt; sorted so repeated runs produce the
    # same file ordering.
    with open("links.txt", "w", encoding="utf-8") as file:
        file.writelines(f"{link}\n" for link in sorted(all_links))

    print(f"\nFound {len(all_links)} links on the page. Saved to links.txt.")
# Run the crawler only when executed as a script, not when imported.
if __name__ == "__main__":
    main()