amielle committed
Commit 82331bb
1 parent: cb09dc9

feat: Add text processing functions

Files changed (1)
  util/textproc.py  +65 -7
util/textproc.py CHANGED
@@ -1,14 +1,73 @@
 import re
+import unicodedata
+import requests
+from bs4 import BeautifulSoup
+
+def retrieve_parsed_doc(patent_information, summaries_generated):
+    # Fetch a Google Patents page and pull out the requested sections.
+    try:
+        language_config = "en"
+        if "https" in patent_information:
+            # Full URL: the patent code is the 5th path segment.
+            patent_code = patent_information.split("/")[4]
+        else:
+            patent_code = patent_information
+        URL = f"https://patents.google.com/patent/{patent_code}/{language_config}"
+        page = requests.get(URL)
+
+        soup = BeautifulSoup(page.content, 'lxml')
+
+        if "Abstract" in summaries_generated:
+            abstract = clean_text(soup.find("div", class_="abstract").prettify())
+        else:
+            abstract = None
+
+        if "Background" in summaries_generated:
+            background = clean_text(soup.find_all(itemprop="description",
+                                                  itemscope="")[-1].prettify())
+        else:
+            background = None
+
+        if "Claims" in summaries_generated:
+            claims_section = soup.find(itemprop="claims")
+            formatted_claims = set()
+            for claim in claims_section.select("div[class=claim]"):
+                formatted_claims.add(clean_text(claim.prettify()))
+            formatted_claims.discard('')  # drop empty entries
+            claim_list = sorted(formatted_claims, key=len, reverse=True)
+        else:
+            claim_list = None
+
+        return [abstract, background, claim_list]
+    except Exception:
+        print(f"Failed to retrieve or parse patent: {patent_information}")
+        return None
+
+
+def get_word_index(s, limit):
+    # Character index where the (limit + 1)-th word starts, so that
+    # s[:get_word_index(s, limit)] keeps the first `limit` whole words.
+    try:
+        words = re.findall(r'\s*\S+\s*', s)
+        return sum(map(len, words[:limit])) + len(words[limit]) - len(words[limit].lstrip())
+    except IndexError:
+        # Fewer than `limit` words: keep the whole string.
+        return len(s)
+
+
+def post_process(s):
+    # Basic post-processing: trim leading space, fix hyphen/period spacing,
+    # and drop a trailing partial sentence.
+    s = s.lstrip(" ")
+    s = s.replace("- ", "-").replace(" .", ".")
+    return ".".join(s.split(".")[:-1]) + "."
+
 
 def clean_text(text):
-    # html pre-proc
+    # TODO: optimize text cleaning
     reg = re.compile(r'<.*?>')
     cleaned = reg.sub('', text)
-    # cleaned = re.sub(r'\s([?.!"](?:\s|$))', r'\1', cleaned)
     cleaned = re.sub(r'\([^)]*\)', '', cleaned)
-    # reg = re.compile(r'[\n\r\t]')
-    # cleaned = reg.sub(" ", cleaned)
-    # cleaned = re.sub('\.(?!$)', '', cleaned)  # remove periods in between sentences
     cleaned = re.sub(r"(\w)([A-Z]+)", r'.', cleaned)
     cleaned = cleaned.strip()
     cleaned = cleaned.lstrip()
@@ -21,6 +80,5 @@ def clean_text(text):
     cleaned = cleaned.replace("\xa0", " ")
     cleaned = cleaned.lstrip('0123456789.- ')  # remove nums at start
     cleaned = re.sub(r'\b(\w+)( \1\b)+', r'\1', cleaned)  # remove repeated consecutive words
-
-    # cleaned = cleaned.strip()
+
     return cleaned
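
Taken together, the new helpers form a small fetch-and-trim pipeline: retrieve_parsed_doc scrapes a Google Patents page, clean_text strips the HTML and boilerplate, and get_word_index/post_process bound and tidy the text that gets summarized. A minimal usage sketch, assuming the module lives at util/textproc.py as in this commit and that requests, beautifulsoup4, and lxml are installed; the patent code and the 150-word limit are illustrative, not part of the commit:

    from util.textproc import retrieve_parsed_doc, get_word_index, post_process

    # Hypothetical patent identifier; any Google Patents code works the same way.
    sections = retrieve_parsed_doc("US9534239B2", ["Abstract", "Claims"])
    if sections is not None:
        abstract, background, claim_list = sections
        # Keep roughly the first 150 words of the abstract, then tidy the cut.
        snippet = abstract[:get_word_index(abstract, 150)]
        print(post_process(snippet))

get_word_index returns the offset where the (limit + 1)-th word begins, so slicing with it never splits a word; when the string has fewer words than the limit, it falls back to len(s):

    s = "Claim one recites a method of parsing"
    s[:get_word_index(s, 4)]   # -> 'Claim one recites a '
    get_word_index(s, 99)      # -> len(s), i.e. 37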