Spaces:
Runtime error
Runtime error
import re | |
import unicodedata | |
import requests | |
from bs4 import BeautifulSoup | |
def retrieve_parsed_doc(patent_information, summaries_generated):
    """Download a Google Patents page and extract the requested sections.

    Args:
        patent_information: Either a bare patent code, or a string
            containing "https" whose fifth "/"-separated component is the
            patent code (e.g. ``https://patents.google.com/patent/<code>/en``).
        summaries_generated: Container (or string) tested with ``in`` for
            the names "Abstract", "Background" and "Claims"; only the
            sections named there are extracted.

    Returns:
        ``[abstract, background, claim_list]``. ``abstract`` and
        ``background`` are cleaned strings, ``claim_list`` is a list of
        cleaned claim strings ordered longest first; any section that was
        not requested is ``None``. If anything goes wrong (network error,
        HTTP error status, unexpected page layout) the function prints a
        diagnostic and returns ``None``.
    """
    # Without a timeout requests.get can block forever on a stalled socket.
    request_timeout_seconds = 30
    try:
        language_config = "en"
        if "https" in patent_information:
            patent_code = patent_information.split("/")[4]
        else:
            patent_code = patent_information
        URL = f"https://patents.google.com/patent/{patent_code}/{language_config}"
        page = requests.get(URL, timeout=request_timeout_seconds)
        # Do not try to scrape sections out of an error page.
        page.raise_for_status()
        soup = BeautifulSoup(page.content, 'lxml')

        abstract = None
        if "Abstract" in summaries_generated:
            # The original call passed {"div": {"class": "abstract"}} as the
            # tag name. bs4 treats a dict there as an iterable of tag names,
            # so the class filter was ignored and the first <div> of the
            # page was used. Prefer the real abstract container and fall
            # back to the previous behaviour when it is absent.
            abstract_div = soup.find("div", class_="abstract")
            if abstract_div is None:
                abstract_div = soup.find("div")
            abstract = clean_text(abstract_div.prettify())

        background = None
        if "Background" in summaries_generated:
            description_nodes = soup.find_all(itemprop="description", itemscope="")
            # The last matching node is the one the original code used.
            background = clean_text(description_nodes[-1].prettify())

        claim_list = None
        if "Claims" in summaries_generated:
            claims = soup.find(itemprop="claims")
            # First <div> inside the claims section (same element the
            # original dict-based lookup resolved to), then every
            # descendant whose class attribute is exactly "claim".
            claims_container = claims.find("div")
            claim_nodes = claims_container.select("div[class=claim]")
            # A set removes claims that clean to the same text.
            formatted_claims = {clean_text(node.prettify()) for node in claim_nodes}
            formatted_claims.discard('')
            # Longest first; the text itself breaks ties so the order does
            # not depend on set iteration order.
            claim_list = sorted(
                formatted_claims,
                key=lambda claim: (len(claim), claim),
                reverse=True,
            )

        return [abstract, background, claim_list]
    except Exception as exc:
        # Callers rely on None as the failure signal, so keep that contract
        # but say what actually failed instead of printing a bare marker.
        print(f"retrieve_parsed_doc failed for {patent_information!r}: {exc!r}")
        return None
def get_word_index(s, limit):
    """Return the character offset in ``s`` where word number ``limit`` starts.

    Words are maximal runs of non-whitespace characters and are numbered
    from 0; ``limit`` follows normal list indexing, so negative values
    count from the end.

    Args:
        s: Text to scan.
        limit: Index of the word whose start offset is wanted.

    Returns:
        The offset of that word's first character, or ``len(s)`` when ``s``
        has no word at that index or ``limit`` is not a valid index.
    """
    try:
        word_starts = [match.start() for match in re.finditer(r'\S+', s)]
        return word_starts[limit]
    except (IndexError, TypeError):
        # IndexError: fewer words than requested. TypeError: non-integer
        # limit or non-text input; the original fell back to len(s) here too.
        return len(s)
def post_process(s):
    """Tidy a piece of text and cut it back to its last full stop.

    Removes one leading space, joins "- " into "-", removes the space in
    " .", then drops everything after the final "." and ends the result
    with a single ".".

    Args:
        s: Text to tidy. An empty string is accepted.

    Returns:
        The tidied text ending in ".". If ``s`` contains no "." at all the
        result is just ".".
    """
    # Basic post-processing
    # Slicing instead of indexing so an empty string does not raise IndexError.
    if s[:1] == " ":
        s = s[1:]
    s = s.replace("- ", "-").replace(" .", ".")
    # rpartition yields an empty head when there is no ".", which matches
    # joining all but the last "."-separated piece.
    return s.rpartition(".")[0] + "."
def clean_text(text):
    """Strip markup and normalise punctuation/spacing in scraped text.

    Steps, in order: remove ``<...>`` tags, remove parenthesised spans,
    apply the capital-letter substitution (see note below), trim, drop
    Unicode control/format characters (category "C", which includes
    newlines), turn non-breaking spaces into spaces, replace ";" with
    ", and", delete ":", collapse runs of spaces, remove the space before
    "." and ",", strip leading digits/dots/dashes/spaces, and collapse
    consecutively repeated words.

    Args:
        text: Raw text, typically HTML produced by ``prettify()``.

    Returns:
        The cleaned string.
    """
    # TODO: optimize text cleaning
    cleaned = re.sub(r'<.*?>', '', text)
    cleaned = re.sub(r'\([^)]*\)', '', cleaned)
    # NOTE(review): this replaces a word character followed by one or more
    # capitals with a single "." (e.g. "LED" -> "."), which deletes
    # acronyms and camel-case joins. Kept as-is because the intended
    # replacement cannot be determined from this file - confirm.
    cleaned = re.sub(r"(\w)([A-Z]+)", r'.', cleaned)
    cleaned = cleaned.strip()
    cleaned = "".join(ch for ch in cleaned if unicodedata.category(ch)[0] != "C")
    # Normalise non-breaking spaces and remove colons BEFORE collapsing
    # spaces; doing it afterwards left double spaces in the output.
    cleaned = cleaned.replace("\xa0", " ")
    cleaned = cleaned.replace(";", ", and")
    cleaned = cleaned.replace(":", "")
    cleaned = re.sub(' +', ' ', cleaned)
    cleaned = cleaned.replace(" .", ".")
    cleaned = cleaned.replace(" ,", ",")
    cleaned = cleaned.lstrip('0123456789.- ')  # remove nums at start
    cleaned = re.sub(r'\b(\w+)( \1\b)+', r'\1', cleaned)  # remove repeated consecutive words
    return cleaned