Liyan06 committed on
Commit
3fbb656
·
1 Parent(s): 113a57e

update retrieval and doc display ranking

Browse files
Files changed (2) hide show
  1. handler.py +25 -4
  2. web_retrieval.py +3 -2
handler.py CHANGED
@@ -51,6 +51,7 @@ class EndpointHandler():
51
  def __init__(self, path="./"):
52
  self.scorer = MiniCheck(path=path)
53
  self.rouge = evaluate.load('rouge')
 
54
 
55
 
56
  def __call__(self, data):
@@ -82,7 +83,7 @@ class EndpointHandler():
82
  else:
83
  assert len(data['inputs']['claims']) == 1, "Only one claim is allowed for web retrieval for the current version."
84
 
85
- ranked_docs, scores, ranked_urls = self.search_relevant_docs(claim)
86
 
87
  span_to_highlight = []
88
  for doc_chunk, score in zip(ranked_docs, scores):
@@ -104,7 +105,12 @@ class EndpointHandler():
104
  return outputs
105
 
106
 
107
- def search_relevant_docs(self, claim, timeout=10, max_search_results_per_query=5, allow_duplicated_urls=False):
 
 
 
 
 
108
 
109
  search_results = search_google(claim, timeout=timeout)
110
 
@@ -133,9 +139,24 @@ class EndpointHandler():
133
  num_chunks = len([item for items in used_chunk for item in items])
134
  print(f'Finished {num_chunks} entailment checks in {round((end - start), 1)} seconds ({round(num_chunks / (end - start) * 60)} Doc./min).')
135
 
136
- ranked_docs, scores, ranked_urls = order_doc_score_url(used_chunk, support_prob_per_chunk, urls, allow_duplicated_urls=allow_duplicated_urls)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
 
138
- return ranked_docs, scores, ranked_urls
139
 
140
 
141
  def chunk_and_highest_rouge_score(self, doc, claim):
 
51
  def __init__(self, path="./"):
52
  self.scorer = MiniCheck(path=path)
53
  self.rouge = evaluate.load('rouge')
54
+ self.tfidf_order = True
55
 
56
 
57
  def __call__(self, data):
 
83
  else:
84
  assert len(data['inputs']['claims']) == 1, "Only one claim is allowed for web retrieval for the current version."
85
 
86
+ ranked_docs, scores, ranked_urls = self.search_relevant_docs(claim, tfidf_order=self.tfidf_order)
87
 
88
  span_to_highlight = []
89
  for doc_chunk, score in zip(ranked_docs, scores):
 
105
  return outputs
106
 
107
 
108
+ def search_relevant_docs(self, claim, timeout=10, max_search_results_per_query=5, allow_duplicated_urls=False, tfidf_order=False):
109
+
110
+ """
111
+ if tfidf_order == True, then display the docs in the order of TF-IDF similarity with the claim, regardless of the entailment score
112
+ otherwise, display the docs in the order of the entailment score
113
+ """
114
 
115
  search_results = search_google(claim, timeout=timeout)
116
 
 
139
  num_chunks = len([item for items in used_chunk for item in items])
140
  print(f'Finished {num_chunks} entailment checks in {round((end - start), 1)} seconds ({round(num_chunks / (end - start) * 60)} Doc./min).')
141
 
142
+ if tfidf_order:
143
+ tfidf_docs, scores = [], []
144
+ for used_c, support_prob_per_c in zip(used_chunk, support_prob_per_chunk):
145
+ # If the doc can support the claim, find the chunk with the
146
+ # highest entailment score; otherwise, use the first chunk
147
+ if max(support_prob_per_c) > 0.5:
148
+ tfidf_docs.append(used_c[np.argmax(support_prob_per_c)])
149
+ scores.append(max(support_prob_per_c))
150
+ else:
151
+ tfidf_docs.append(used_c[0])
152
+ scores.append(support_prob_per_c[0])
153
+
154
+ return tfidf_docs, scores, urls
155
+
156
+ else:
157
+ ranked_docs, scores, ranked_urls = order_doc_score_url(used_chunk, support_prob_per_chunk, urls, allow_duplicated_urls=allow_duplicated_urls)
158
 
159
+ return ranked_docs, scores, ranked_urls
160
 
161
 
162
  def chunk_and_highest_rouge_score(self, doc, claim):
web_retrieval.py CHANGED
@@ -82,7 +82,7 @@ def scrape_url(url: str, timeout=10) -> Tuple[str, str]:
82
  return web_text, url
83
 
84
 
85
- def search_google(query:str, num_web_pages:int=20, timeout:int=6, save_url:str='') -> List[str]:
86
  """Searches the query using Google.
87
  Args:
88
  query: Search query.
@@ -108,7 +108,8 @@ def search_google(query:str, num_web_pages:int=20, timeout:int=6, save_url:str='
108
  for page in range(0, num_web_pages, 10):
109
  # here page is google search's bottom page meaning, click 2 -> start=10
110
  # url = "https://www.google.com/search?q={}&start={}".format(query, page)
111
- url = "https://www.google.com/search?q={}&lr=lang_{}&hl={}&start={}".format(search_query, lang, lang, page)
 
112
  r = requests.get(url, headers=headers, timeout=timeout)
113
  # collect all urls by regular expression
114
  # how to do if I just want to have the returned top-k pages?
 
82
  return web_text, url
83
 
84
 
85
+ def search_google(query:str, num_web_pages:int=10, timeout:int=6, save_url:str='') -> List[str]:
86
  """Searches the query using Google.
87
  Args:
88
  query: Search query.
 
108
  for page in range(0, num_web_pages, 10):
109
  # here page is google search's bottom page meaning, click 2 -> start=10
110
  # url = "https://www.google.com/search?q={}&start={}".format(query, page)
111
+ # url = "https://www.google.com/search?q={}&lr=lang_{}&hl={}&start={}".format(search_query, lang, lang, page)
112
+ url = f"https://www.google.com/search?q={search_query}&start={page}"
113
  r = requests.get(url, headers=headers, timeout=timeout)
114
  # collect all urls by regular expression
115
  # how to do if I just want to have the returned top-k pages?