# Spaces: Runtime error
from bs4 import BeautifulSoup as bs | |
import requests | |
from typing import Dict, List, Optional | |
from fake_http_header import FakeHttpHeader | |
class Scraper:
    """Download an Amazon product page and pull out its title, bullets and categories.

    The public entry point is :meth:`get_product`; :meth:`sanity_url` is the
    cheap pre-check it applies before any network traffic happens.
    """

    # Copy of the most recently downloaded page, kept as a debugging aid.
    _DUMP_PATH = 'webpage_out.html'
    # Seconds to wait for the server; without a timeout ``requests`` can block forever.
    _TIMEOUT = 10
    _BOT_MESSAGE = (
        'Detected as a Bot. Please Try Again Later. Till then, you can continue '
        'to type in your description, or manually copy from Amazon.'
    )

    def __init__(self):
        ...

    def sanity_url(self, url: str) -> bool:
        """Return True when *url* is a string that mentions ``amazon``.

        This is a plain substring check, not a full URL validation.
        Non-string input is rejected instead of raising.
        """
        return isinstance(url, str) and 'amazon' in url

    @staticmethod
    def _element_text(soup, tag: str, element_id: str) -> Optional[str]:
        """Return the text of the first ``<tag id=element_id>`` or None if absent."""
        node = soup.find(tag, attrs={'id': element_id})
        return None if node is None else node.text

    @staticmethod
    def _parse_categories(breadcrumb_text: str) -> List[str]:
        """Split breadcrumb text into lower-cased labels of at least 3 characters.

        Shorter fragments are dropped; this filters out the separator glyphs
        that sit between breadcrumb entries.
        """
        pieces = (piece.strip() for piece in breadcrumb_text.split('\n'))
        return [piece.lower() for piece in pieces if len(piece) >= 3]

    @staticmethod
    def _looks_like_captcha(html: str) -> bool:
        """Return True when the page source mentions a captcha (case-insensitive)."""
        return 'captcha' in html.lower()

    def get_product(self, url: str) -> Dict:
        """Fetch *url* and return the product description and category labels.

        Returns:
            * ``'Invalid URL'`` (str) when :meth:`sanity_url` rejects *url*.
            * ``'Error Loading Link'`` (str) when the request fails or the
              response status is not 200.
            * ``{'description': <bot message>}`` when expected page elements
              are missing and the page source mentions a captcha.
            * ``{'description': '<title>\\n<bullets>', 'labels': [...]}``
              otherwise. Elements that could not be found contribute an empty
              string / empty list instead of raising.

        Side effect: the raw response body is written to ``webpage_out.html``
        in the current working directory.
        """
        if not self.sanity_url(url):
            return 'Invalid URL'

        try:
            response = requests.get(
                url,
                headers=FakeHttpHeader().as_header_dict(),
                timeout=self._TIMEOUT,
            )
        except requests.RequestException:
            # Connection problems and timeouts are reported the same way as
            # a non-200 status so callers only have one failure string to handle.
            return 'Error Loading Link'

        # Dump the raw bytes: no decode step that could fail on odd encodings,
        # and the context manager closes the file even if the write raises.
        with open(self._DUMP_PATH, 'wb') as dump:
            dump.write(response.content)

        if response.status_code != 200:
            return 'Error Loading Link'

        # Name the parser explicitly so results do not depend on which optional
        # parser libraries happen to be installed.
        soup = bs(response.content, 'html.parser')

        title = self._element_text(soup, 'span', 'productTitle')
        breadcrumbs = self._element_text(soup, 'div', 'wayfinding-breadcrumbs_feature_div')
        bullets = self._element_text(soup, 'div', 'featurebullets_feature_div')

        anything_missing = any(part is None for part in (title, breadcrumbs, bullets))
        # The parsed soup is a separate object from the response, so the raw
        # page text is still available for the captcha check.
        if anything_missing and self._looks_like_captcha(response.text):
            return {'description': self._BOT_MESSAGE}

        title = '' if title is None else title.strip()
        desc = '' if bullets is None else bullets.replace('About this item', '').strip()
        categories = [] if breadcrumbs is None else self._parse_categories(breadcrumbs)

        return {'description': f'{title}\n{desc}', 'labels': categories}