Solar-Eyes-Dockerized / download_pdfs.py
3martini's picture
Upload folder using huggingface_hub
786d4da verified
raw
history blame contribute delete
958 Bytes
import os
from urllib.parse import urljoin, urlsplit

import requests
from bs4 import BeautifulSoup
# Download every PDF linked from the server's listing page into a local
# directory. Runs at import/execution time as a flat script.

# URL setup
base_url = "http://74.208.61.158:8888"
page_url = base_url + "/list-pdfs"  # page whose <a href> links are scanned

# Directory for storing PDFs
pdf_dir = "pdf_downloads"
os.makedirs(pdf_dir, exist_ok=True)

# Seconds to wait on the server before giving up. Without a timeout,
# requests blocks forever if the server stops responding.
request_timeout = 30

# Fetch the listing page
response = requests.get(page_url, timeout=request_timeout)
response.raise_for_status()  # will raise an exception for HTTP error codes

# Parse the listing page
soup = BeautifulSoup(response.content, 'html.parser')

# Find all PDF links
for link in soup.find_all('a', href=True):
    href = link['href']

    # Resolve the href against the listing page URL so that root-relative
    # ("/files/a.pdf"), page-relative ("a.pdf") and absolute ("http://...")
    # links all produce a valid URL; plain string concatenation only worked
    # for the root-relative form.
    pdf_url = urljoin(page_url, href)

    # Inspect only the path component, so a query string or fragment does
    # not hide the extension or leak into the local filename.
    url_path = urlsplit(pdf_url).path
    if not url_path.lower().endswith('.pdf'):
        continue

    pdf_response = requests.get(pdf_url, timeout=request_timeout)
    pdf_response.raise_for_status()

    # Write the PDF to a file named after the last path segment
    pdf_filename = os.path.join(pdf_dir, url_path.rsplit('/', 1)[-1])
    with open(pdf_filename, 'wb') as file:
        file.write(pdf_response.content)
    print(f"Downloaded: {pdf_filename}")