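"""Recursively collect same-domain links from a website.

Each page is rendered in headless Chrome (via Selenium and webdriver-manager),
links are extracted from the rendered HTML with BeautifulSoup, and the crawl
fans out across a thread pool up to a user-supplied depth. Results are written
to links.txt.
"""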

import threading
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# URLs seen so far, shared across worker threads and guarded by a lock
# because threads read and update it concurrently.
visited_lock = threading.Lock()
visited = set()


def get_all_links(url, max_depth, current_depth=0):
    """Render one page in headless Chrome and return newly discovered same-domain links."""
    if current_depth > max_depth:
        return []

    driver = None
    try:
        print(f"Scraping: {url} at depth {current_depth}")

        # Run Chrome headless; webdriver-manager fetches a matching chromedriver.
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=chrome_options)

        driver.get(url)

        # Fixed wait for JavaScript-rendered content; an explicit wait would be more robust.
        time.sleep(5)

        soup = BeautifulSoup(driver.page_source, 'html.parser')

        links = set()
        for a_tag in soup.find_all('a', href=True):
            href = a_tag['href']
            full_url = urljoin(url, href)

            # Keep only same-domain links that no thread has seen before.
            with visited_lock:
                if urlparse(full_url).netloc == urlparse(url).netloc and full_url not in visited:
                    visited.add(full_url)
                    links.add(full_url)

        return list(links)

    except Exception as e:
        print(f"Error fetching {url}: {e}")
        return []

    finally:
        # Always close the browser, even if the page failed to load or parse.
        if driver is not None:
            driver.quit()


def scrape_recursive(urls, max_depth, current_depth, executor):
    """Scrape a batch of URLs in parallel, then recurse into the links they yield."""
    if current_depth > max_depth:
        return set()

    # Fan the current batch of URLs out across the thread pool.
    futures = [executor.submit(get_all_links, url, max_depth, current_depth) for url in urls]
    all_links = set()

    for future in as_completed(futures):
        try:
            all_links.update(future.result())
        except Exception as e:
            print(f"Error in thread: {e}")

    # Descend one level and fold in the links found there.
    if current_depth + 1 <= max_depth:
        new_links = scrape_recursive(all_links, max_depth, current_depth + 1, executor)
        all_links.update(new_links)

    return all_links


def main():
    input_url = input("Enter the URL to scrape: ")
    max_depth = int(input("Enter the maximum depth: "))

    # A single shared thread pool drives every level of the crawl.
    with ThreadPoolExecutor(max_workers=10) as executor:
        all_links = scrape_recursive([input_url], max_depth, 0, executor)

    with open("links.txt", "w") as file:
        for link in all_links:
            file.write(f"{link}\n")

    print(f"\nFound {len(all_links)} links. Saved to links.txt.")


if __name__ == "__main__":
    main()