jianghuyihei committed on
Commit
789383a
1 Parent(s): 863d8a3
Files changed (1) hide show
  1. searcher/sementic_search.py +13 -3
searcher/sementic_search.py CHANGED
@@ -132,7 +132,11 @@ class SementicSearcher:
132
  return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
133
 
134
  def read_arxiv_from_path(self, pdf_path):
135
- article_dict = scipdf.parse_pdf_to_dict(pdf_path)
 
 
 
 
136
  return article_dict
137
 
138
  async def get_paper_embbeding_and_score_async(self,query_embedding, paper,llm):
@@ -281,7 +285,10 @@ Abstract: {paper['abstract']}
281
  abstract = result['abstract']
282
  citationCount = result['citationCount']
283
  year = result['year']
284
- article = scipdf.parse_pdf_to_dict(content)
 
 
 
285
  if not article:
286
  continue
287
  final_results.append(Result(title,abstract,article,citationCount,year))
@@ -350,7 +357,10 @@ Abstract: {paper['abstract']}
350
  url = paper[2]
351
  content = await self.download_pdf_async(url)
352
  if content:
353
- article = scipdf.parse_pdf_to_dict(content)
 
 
 
354
  if not article:
355
  continue
356
  result = Result(paper[0],paper[1],article,paper[3],paper[4])
 
132
  return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
133
 
134
  def read_arxiv_from_path(self, pdf_path):
135
+ try:
136
+ article_dict = scipdf.parse_pdf_to_dict(pdf_path)
137
+ except Exception as e:
138
+ print(f"Failed to parse the PDF file: {pdf_path}")
139
+ return None
140
  return article_dict
141
 
142
  async def get_paper_embbeding_and_score_async(self,query_embedding, paper,llm):
 
285
  abstract = result['abstract']
286
  citationCount = result['citationCount']
287
  year = result['year']
288
+ try:
289
+ article = scipdf.parse_pdf_to_dict(content)
290
+ except Exception as e:
291
+ article = None
292
  if not article:
293
  continue
294
  final_results.append(Result(title,abstract,article,citationCount,year))
 
357
  url = paper[2]
358
  content = await self.download_pdf_async(url)
359
  if content:
360
+ try:
361
+ article = scipdf.parse_pdf_to_dict(content)
362
+ except Exception as e:
363
+ article = None
364
  if not article:
365
  continue
366
  result = Result(paper[0],paper[1],article,paper[3],paper[4])