Spaces:
Runtime error
Runtime error
feat: Add text processing functions
Browse files- util/textproc.py +65 -7
util/textproc.py
CHANGED
@@ -1,14 +1,73 @@
|
|
1 |
import re
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
|
3 |
def clean_text(text):
|
4 |
-
#
|
5 |
reg = re.compile(r'<.*?>')
|
6 |
cleaned = reg.sub('', text)
|
7 |
-
# cleaned = re.sub(r'\s([?.!"](?:\s|$))', r'\1', cleaned)
|
8 |
cleaned = re.sub(r'\([^)]*\)', '', cleaned)
|
9 |
-
# reg = re.compile(r'[\n\r\t]')
|
10 |
-
# cleaned = reg.sub(" ", cleaned)
|
11 |
-
# cleaned = re.sub('\.(?!$)', '', cleaned) # remove periods in between sentence
|
12 |
cleaned = re.sub(r"(\w)([A-Z]+)", r'.', cleaned)
|
13 |
cleaned = cleaned.strip()
|
14 |
cleaned = cleaned.lstrip()
|
@@ -21,6 +80,5 @@ def clean_text(text):
|
|
21 |
cleaned = cleaned.replace("\xa0", " ")
|
22 |
cleaned = cleaned.lstrip('0123456789.- ') # remove nums at start
|
23 |
cleaned = re.sub(r'\b(\w+)( \1\b)+', r'\1', cleaned) #remove repeated consecutive words
|
24 |
-
|
25 |
-
# cleaned = cleaned.strip()
|
26 |
return cleaned
|
|
|
1 |
import re
|
2 |
+
import unicodedata
|
3 |
+
import requests
|
4 |
+
from bs4 import BeautifulSoup
|
5 |
+
|
6 |
+
def retrieve_parsed_doc(patent_information, summaries_generated):
    """Fetch a Google Patents page and extract the requested sections.

    Args:
        patent_information: Either a full Google Patents URL or a bare
            patent code (e.g. "US1234567A").
        summaries_generated: Collection of section names to extract; any of
            "Abstract", "Background", "Claims".

    Returns:
        ``[abstract, background, claim_list]`` where each element is ``None``
        when its section was not requested, or ``None`` overall if retrieval
        or parsing fails for any reason (best-effort contract preserved).
    """
    try:
        language_config = "en"
        # A URL like https://patents.google.com/patent/<code>/en carries the
        # patent code as its 5th path component.
        if "https" in patent_information:
            patent_code = patent_information.split("/")[4]
        else:
            patent_code = patent_information
        URL = f"https://patents.google.com/patent/{patent_code}/{language_config}"
        page = requests.get(URL)

        soup = BeautifulSoup(page.content, 'lxml')

        if "Abstract" in summaries_generated:
            # NOTE(review): passing a dict as the `name` argument to find()
            # is unusual bs4 usage — confirm it actually selects the
            # abstract <div> rather than matching by tag-name set.
            abstract = clean_text(soup.find({"div": {"class": "abstract"}}).prettify())
        else:
            abstract = None

        if "Background" in summaries_generated:
            # The last itemprop="description" node is presumed to hold the
            # description body — TODO confirm against a live page.
            background = clean_text(soup.find_all(itemprop="description",
                                                  itemscope="")[-1:][0].prettify())
        else:
            background = None

        if "Claims" in summaries_generated:
            claims = soup.find(itemprop="claims")
            main_claim = claims.find_all({"div": {"class": "claim"}})
            main_claims = main_claim[0].select("div[class=claim]")
            formatted_claims = set()
            for claim_node in main_claims:
                formatted_claims.add(clean_text(claim_node.prettify()))
            # Cleaning can produce empty strings; drop them without the
            # try/remove/except dance.
            formatted_claims.discard('')
            # Longest claims first (set is already an iterable; no list() needed).
            claim_list = sorted(formatted_claims, key=len, reverse=True)
        else:
            claim_list = None

        return [abstract, background, claim_list]
    except Exception as exc:
        # Narrowed from a bare `except:` so Ctrl-C / SystemExit still
        # propagate; report the actual failure instead of the debug "here".
        print(f"retrieve_parsed_doc failed: {exc}")
        return None
|
48 |
+
|
49 |
+
|
50 |
+
def get_word_index(s, limit):
    """Return the character offset in *s* where the ``limit``-th word begins.

    Words are runs of non-whitespace; the regex also swallows surrounding
    whitespace so the summed chunk lengths are true character offsets.

    Args:
        s: Input string.
        limit: 0-based word index to locate.

    Returns:
        Offset of word ``limit``, or ``len(s)`` when *s* contains ``limit``
        or fewer words.
    """
    words = re.findall(r'\s*\S+\s*', s)
    try:
        target = words[limit]
    except IndexError:
        # Fewer than limit+1 words: the caller gets the end of the string.
        # (Narrowed from a bare `except:`, which hid real errors.)
        return len(s)
    # Offset = total length of all preceding chunks, plus any leading
    # whitespace captured inside the target's own chunk.
    return sum(map(len, words[:limit])) + len(target) - len(target.lstrip())
|
56 |
+
|
57 |
+
|
58 |
+
def post_process(s):
    """Basic post-processing of generated text.

    Removes a single leading space, fixes hyphenation and space-before-period
    artifacts, and truncates everything after the final period so only
    complete sentences remain.

    Args:
        s: Text to clean.

    Returns:
        The cleaned text, always ending in a period. A string with no period
        collapses to "." (existing behavior, preserved).
    """
    # startswith() is safe on the empty string, unlike the previous
    # `s[0] == " "` which raised IndexError for s == "".
    if s.startswith(" "):
        s = s[1:]
    s = s.replace("- ", "-").replace(" .", ".")
    # Drop any trailing text after the last period (incomplete sentence).
    return ".".join(s.split(".")[:-1]) + "."
|
64 |
+
|
65 |
|
66 |
def clean_text(text):
|
67 |
+
# TODO: optimize text cleaning
|
68 |
reg = re.compile(r'<.*?>')
|
69 |
cleaned = reg.sub('', text)
|
|
|
70 |
cleaned = re.sub(r'\([^)]*\)', '', cleaned)
|
|
|
|
|
|
|
71 |
cleaned = re.sub(r"(\w)([A-Z]+)", r'.', cleaned)
|
72 |
cleaned = cleaned.strip()
|
73 |
cleaned = cleaned.lstrip()
|
|
|
80 |
cleaned = cleaned.replace("\xa0", " ")
|
81 |
cleaned = cleaned.lstrip('0123456789.- ') # remove nums at start
|
82 |
cleaned = re.sub(r'\b(\w+)( \1\b)+', r'\1', cleaned) #remove repeated consecutive words
|
83 |
+
|
|
|
84 |
return cleaned
|