Spaces:

amielle
/

patent-summarizer

Runtime error

App Files Files Community

patent-summarizer / util /textproc.py

amielle

feat: Add text processing functions

82331bb over 2 years ago

raw

history blame

2.81 kB

	import re
	import unicodedata
	import requests
	from bs4 import BeautifulSoup

	def retrieve_parsed_doc(patent_information, summaries_generated, *, timeout=30):
	    """Fetch a Google Patents page and extract the requested sections.

	    Args:
	        patent_information: Either a patent code, or a string containing
	            "https", in which case the code is taken from the fifth
	            "/"-separated component (the position it has in the URL
	            built below).
	        summaries_generated: Container tested with ``in`` for the section
	            names "Abstract", "Background" and "Claims".
	        timeout: Seconds to wait for the HTTP request; without a timeout
	            ``requests.get`` can block indefinitely.

	    Returns:
	        ``[abstract, background, claim_list]``. Sections that were not
	        requested are ``None``; ``claim_list`` holds the distinct cleaned
	        claim texts sorted longest first. Returns ``None`` if fetching or
	        parsing raises.
	    """
	    try:
	        language_config = "en"
	        if "https" in patent_information:
	            patent_code = patent_information.split("/")[4]
	        else:
	            patent_code = patent_information
	        url = f"https://patents.google.com/patent/{patent_code}/{language_config}"
	        page = requests.get(url, timeout=timeout)

	        soup = BeautifulSoup(page.content, 'lxml')

	        if "Abstract" in summaries_generated:
	            # NOTE(review): a dict passed as the tag filter appears to be
	            # treated as an iterable of tag names (its keys), so this may
	            # match the first <div> rather than class="abstract" - confirm
	            # against the Beautiful Soup docs before changing the selector.
	            abstract = clean_text(
	                soup.find({"div": {"class": "abstract"}}).prettify())
	        else:
	            abstract = None

	        if "Background" in summaries_generated:
	            # The last matching description element is the one used.
	            descriptions = soup.find_all(itemprop="description", itemscope="")
	            background = clean_text(descriptions[-1].prettify())
	        else:
	            background = None

	        if "Claims" in summaries_generated:
	            claims = soup.find(itemprop="claims")
	            # NOTE(review): same dict-as-filter caveat as for the abstract.
	            main_claim = claims.find_all({"div": {"class": "claim"}})
	            main_claims = main_claim[0].select("div[class=claim]")
	            # A set drops duplicate claim texts.
	            formatted_claims = {clean_text(c.prettify()) for c in main_claims}
	            # Claims that cleaned down to nothing are not worth keeping.
	            formatted_claims.discard('')
	            claim_list = sorted(formatted_claims, key=len, reverse=True)
	        else:
	            claim_list = None

	        return [abstract, background, claim_list]
	    except Exception:
	        # Best-effort boundary: any network or parsing failure yields None.
	        # Catching Exception (not a bare except) lets KeyboardInterrupt and
	        # SystemExit propagate.
	        print("here")
	        return None


	def get_word_index(s, limit):
	    """Return a character count derived from the word matches in ``s``.

	    A "word match" is a run of non-whitespace with one whitespace character
	    on each side. The result is the total length of the first ``limit``
	    matches plus the amount of leading whitespace on the match at index
	    ``limit``.

	    Args:
	        s: Text to scan.
	        limit: Index of the match at which counting stops.

	    Returns:
	        The computed count, or ``len(s)`` when ``s`` has no match at index
	        ``limit`` (or the inputs cannot be scanned/indexed).
	    """
	    try:
	        words = re.findall(r'\s\S+\s', s)
	        boundary = words[limit]
	        leading_ws = len(boundary) - len(boundary.lstrip())
	        return sum(map(len, words[:limit])) + leading_ws
	    except (IndexError, TypeError):
	        # Fewer matches than requested (or unusable input): fall back to
	        # the full length instead of hiding every possible error.
	        return len(s)


	def post_process(s):
	    """Apply basic post-processing to a piece of generated text.

	    Drops one leading space, joins "- " into "-", removes the space before
	    a full stop, and cuts everything after the last full stop so the
	    result always ends with ".".

	    Args:
	        s: Text to tidy; may be empty.

	    Returns:
	        The tidied text. Input without any full stop (including the empty
	        string) yields ".".
	    """
	    # startswith() is safe on an empty string, unlike indexing s[0].
	    if s.startswith(" "):
	        s = s[1:]
	    s = s.replace("- ", "-").replace(" .", ".")
	    # Everything after the final "." is discarded.
	    return ".".join(s.split(".")[:-1]) + "."


	def clean_text(text):
	    """Strip markup and tidy spacing/punctuation in a text fragment.

	    Steps, in order: remove ``<...>`` tags and parenthesised spans, apply
	    the uppercase-run substitution, turn non-breaking spaces into plain
	    spaces, trim, drop Unicode control/format characters (categories
	    starting with "C"), collapse runs of spaces, rewrite ";" as ", and",
	    delete ":", remove the space before "." and ",", strip leading
	    digits/dots/dashes/spaces, and collapse consecutive repeated words.

	    Args:
	        text: Raw text, possibly containing markup.

	    Returns:
	        The cleaned string.
	    """
	    # TODO: optimize text cleaning
	    cleaned = re.sub(r'<.*?>', '', text)
	    cleaned = re.sub(r'\([^)]*\)', '', cleaned)
	    # Replaces a word character followed by a run of capitals with ".".
	    cleaned = re.sub(r"(\w)([A-Z]+)", r'.', cleaned)
	    # Normalise non-breaking spaces first so the space-collapsing and
	    # punctuation fixes below see them as ordinary spaces.
	    cleaned = cleaned.replace("\xa0", " ")
	    cleaned = cleaned.strip()
	    cleaned = "".join(
	        ch for ch in cleaned if unicodedata.category(ch)[0] != "C")
	    cleaned = re.sub(' +', ' ', cleaned)
	    cleaned = cleaned.replace(";", ", and")
	    cleaned = cleaned.replace(":", "")
	    cleaned = cleaned.replace(" .", ".")
	    cleaned = cleaned.replace(" ,", ",")
	    cleaned = cleaned.lstrip('0123456789.- ')  # remove nums at start
	    # Remove repeated consecutive words.
	    cleaned = re.sub(r'\b(\w+)( \1\b)+', r'\1', cleaned)

	    return cleaned