Spaces:
Sleeping
Sleeping
File size: 1,993 Bytes
71e47a3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 |
import concurrent.futures
import requests
import re
from bs4 import BeautifulSoup
import extensions.superboogav2.parameters as parameters
from .data_processor import process_and_add_to_collector
from .utils import create_metadata_source
def _download_single(url):
    """Download one URL and return its raw response body as bytes.

    Raises:
        Exception: on any non-200 HTTP status. Network errors/timeouts from
            ``requests`` (5-second timeout) propagate as-is.
    """
    response = requests.get(url, timeout=5)
    if response.status_code == 200:
        return response.content
    # Include the URL and status code so failures are diagnosable upstream.
    # The type must stay a plain Exception: callers catch bare `Exception`.
    raise Exception(f"Failed to download {url}: HTTP status {response.status_code}")
def _download_urls(urls, threads=1):
    """Fetch all URLs concurrently and yield progress as downloads complete.

    Generator yielding ``(status_string, results_so_far)`` pairs: one
    ``"k/total"`` update per successful download, then a final
    ``("Done", results)`` pair. Failed downloads are skipped silently.
    """
    with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
        pending = [executor.submit(_download_single, url) for url in urls]

        downloaded = []
        succeeded = 0
        total = len(urls)
        for task in concurrent.futures.as_completed(pending):
            try:
                downloaded.append(task.result())
            except Exception:
                # Best-effort: a failed URL is dropped without aborting the batch.
                continue
            succeeded += 1
            yield f"{succeeded}/{total}", downloaded

        yield "Done", downloaded
def feed_url_into_collector(urls, collector):
    """Download newline-separated URLs, strip their HTML to text, and feed it to *collector*.

    Generator: yields human-readable progress strings suitable for a UI.
    """
    collected_text = ''
    status = ''

    url_list = urls.strip().split('\n')
    status += f'Loading {len(url_list)} URLs with {parameters.get_num_threads()} threads...\n\n'
    yield status
    for progress, contents in _download_urls(url_list, threads=parameters.get_num_threads()):
        yield status + progress

    status += 'Processing the HTML sources...'
    yield status
    # `contents` holds the final results list from the download loop above.
    for html in contents:
        soup = BeautifulSoup(html, features="lxml")
        # Remove script/style elements so only visible page text remains.
        for tag in soup(["script", "style"]):
            tag.extract()

        strings = soup.stripped_strings
        if parameters.get_is_strong_cleanup():
            # Strong cleanup: keep only strings containing a letter followed by a space.
            strings = [s for s in strings if re.search("[A-Za-z] ", s)]

        collected_text += '\n'.join([s.strip() for s in strings])

    process_and_add_to_collector(collected_text, collector, False, create_metadata_source('url-download'))