# lyca-mobile-chatbot/scrape.py
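"""Multithreaded link scraper.

Prompts for a starting URL and a maximum crawl depth, then uses headless
Chrome (driven by Selenium) to recursively collect every same-domain link up
to that depth. The collected links are written to links.txt.
"""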
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
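
# Third-party dependencies (assumed to be installed via pip):
#   pip install selenium webdriver-manager beautifulsoup4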

# Lock for thread-safe access to the shared visited set
visited_lock = threading.Lock()

# URLs already collected (shared across threads, guarded by visited_lock)
visited = set()
# Function to scrape links with depth control
def get_all_links(url, max_depth, current_depth=0):
if current_depth > max_depth:
return []
try:
# Print the current URL being scraped
print(f"Scraping: {url} at depth {current_depth}")
# Set up Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless") # Run in headless mode
# Set up the Chrome driver
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)
# Navigate to the URL
driver.get(url)
# Wait for the page to load (adjust the sleep time if needed)
time.sleep(5)
# Get the page source and parse it with BeautifulSoup
soup = BeautifulSoup(driver.page_source, 'html.parser')
# Find all 'a' tags and extract the 'href' attribute
links = set()
for a_tag in soup.find_all('a', href=True):
href = a_tag['href']
full_url = urljoin(url, href)
# Only include links from the same domain and not already visited
with visited_lock:
if urlparse(full_url).netloc == urlparse(url).netloc and full_url not in visited:
visited.add(full_url)
links.add(full_url)
# Close the browser
driver.quit()
return list(links)
except Exception as e:
print(f"Error fetching the URL: {e}")
return []
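
# Note: each call above starts (and tears down) its own headless Chrome
# instance, which keeps the worker threads independent at the cost of
# per-page startup overhead; a shared driver pool would be faster but would
# need additional locking.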


# Recursively scrape a batch of URLs, one thread-pool task per URL,
# until max_depth is reached.
def scrape_recursive(urls, max_depth, current_depth, executor):
    if current_depth > max_depth:
        return set()

    # Submit one scraping task per URL to the ThreadPoolExecutor
    futures = [executor.submit(get_all_links, url, max_depth, current_depth) for url in urls]
    all_links = set()
    for future in as_completed(futures):
        try:
            all_links.update(future.result())
        except Exception as e:
            print(f"Error in thread: {e}")

    # Recursively scrape the newly discovered links at the next depth
    if current_depth + 1 <= max_depth:
        all_links.update(scrape_recursive(all_links, max_depth, current_depth + 1, executor))

    return all_links


def main():
    # Get the starting URL and maximum crawl depth from the user
    input_url = input("Enter the URL to scrape: ")
    max_depth = int(input("Enter the maximum depth: "))

    # ThreadPoolExecutor for scraping pages concurrently
    with ThreadPoolExecutor(max_workers=10) as executor:
        # Start scraping from the input URL at depth 0
        all_links = scrape_recursive([input_url], max_depth, 0, executor)

    # Save the results to links.txt
    with open("links.txt", "w") as file:
        for link in all_links:
            file.write(f"{link}\n")

    print(f"\nFound {len(all_links)} links up to depth {max_depth}. Saved to links.txt.")


if __name__ == "__main__":
    main()
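
# Illustrative run (the URL, depth, and link count below are placeholders):
#   $ python scrape.py
#   Enter the URL to scrape: https://www.example.com
#   Enter the maximum depth: 1
#   Scraping: https://www.example.com at depth 0
#   ...
#   Found 42 links up to depth 1. Saved to links.txt.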