Spaces:
Running
Running
File size: 5,529 Bytes
43cd37c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 |
# Arxiv.py
# Description: This file contains the functions for searching and ingesting arXiv papers.
import time
import arxiv
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from requests.adapters import HTTPAdapter
from urllib3 import Retry
#
# Local Imports
from App_Function_Libraries.DB.DB_Manager import add_media_with_keywords
#
#####################################################################################################
#
# Functions:
# Number of results requested per page of arXiv API search results
# (used as the `max_results` value when building query URLs).
ARXIV_PAGE_SIZE = 10
def fetch_arxiv_pdf_url(paper_id):
    """Look up the PDF link of an arXiv paper through the export API.

    Args:
        paper_id: arXiv identifier; interpolated into the ``id_list``
            query parameter of the export API URL.

    Returns:
        The ``href`` of the feed's ``<link title="pdf">`` element, or
        ``None`` when the request fails or the feed has no such link.
    """
    base_url = f"http://export.arxiv.org/api/query?id_list={paper_id}"

    # Configure retry strategy
    retry_strategy = Retry(
        total=3,  # Maximum number of retries
        status_forcelist=[429, 500, 502, 503, 504],  # Retry on these status codes
        backoff_factor=1,  # Exponential backoff factor
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)

    try:
        # Use the session as a context manager so its pooled connections
        # are released even when the request raises.
        with requests.Session() as http:
            http.mount("https://", adapter)
            http.mount("http://", adapter)
            # requests never times out by default; bound the wait so a
            # stalled server cannot hang the caller indefinitely.
            response = http.get(base_url, timeout=30)
            response.raise_for_status()
            feed_xml = response.text
    except requests.exceptions.RequestException as e:
        print(f"**Error:** {e}")
        return None

    # Delay between requests to avoid rate limiting
    time.sleep(2)

    soup = BeautifulSoup(feed_xml, 'xml')
    pdf_link = soup.find('link', title='pdf')
    # A feed without a PDF link (e.g. an API error entry) previously
    # raised TypeError from subscripting None; report it and return None
    # like the other failure path.
    if pdf_link is None or not pdf_link.get('href'):
        print(f"**Error:** No PDF link found for arXiv paper '{paper_id}'")
        return None
    return pdf_link['href']
def search_arxiv(query):
    """Run a relevance-sorted arXiv search through the ``arxiv`` client.

    Args:
        query: Search string passed to ``arxiv.Search``.

    Returns:
        A list with one ``[title, paper_id, authors, summary]`` row per
        result (at most 10 are requested). ``paper_id`` is the last
        ``/``-separated segment of the result's ``entry_id`` and
        ``authors`` is a comma-separated string of author names.
    """
    api_client = arxiv.Client()
    relevance_search = arxiv.Search(
        query=query,
        max_results=10,
        sort_by=arxiv.SortCriterion.Relevance,
    )
    return [
        [
            paper.title,
            # Keep only the trailing identifier of the entry URL.
            paper.entry_id.rpartition('/')[2],
            ', '.join(person.name for person in paper.authors),
            paper.summary,
        ]
        for paper in api_client.results(relevance_search)
    ]
def fetch_arxiv_xml(paper_id):
    """Download the export-API Atom feed for a single arXiv paper.

    Args:
        paper_id: arXiv identifier placed in the ``id_list`` query
            parameter.

    Returns:
        The response body as text.

    Raises:
        requests.exceptions.RequestException: On connection failure,
            timeout, or a non-2xx status (via ``raise_for_status``).
    """
    # f-string (rather than ``+``) so a non-str identifier does not raise
    # TypeError before the request is even made.
    url = f"http://export.arxiv.org/api/query?id_list={paper_id}"
    # requests has no default timeout; without one a stalled connection
    # would block the caller forever.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    return response.text
def parse_arxiv_feed(xml_content):
    """Turn an arXiv Atom feed into a list of per-paper dictionaries.

    Args:
        xml_content: Feed XML, parsed with BeautifulSoup's ``'xml'`` parser.

    Returns:
        One dict per ``<entry>`` with the keys ``id`` (text after the last
        ``/abs/`` in the entry id), ``title``, ``authors`` (comma-separated
        names), ``published`` (the part before the first ``T``) and
        ``abstract``.
    """
    feed = BeautifulSoup(xml_content, 'xml')

    def describe(paper):
        # Fields are read in the same order as they appear in the result.
        heading = paper.find('title').text.strip()
        identifier = paper.find('id').text.strip().rpartition('/abs/')[2]
        author_names = ', '.join(
            person.find('name').text.strip()
            for person in paper.find_all('author')
        )
        publish_date = paper.find('published').text.strip().partition('T')[0]
        summary_text = paper.find('summary').text.strip()
        return {
            'id': identifier,
            'title': heading,
            'authors': author_names,
            'published': publish_date,
            'abstract': summary_text,
        }

    return [describe(paper) for paper in feed.find_all('entry')]
def build_query_url(query, author, year, start, max_results=None):
    """Build an arXiv export-API search URL.

    User-supplied values are percent-encoded so characters such as
    ``&``, ``#``, spaces or quotes cannot truncate the search expression
    or inject extra URL parameters.

    Args:
        query: Free-text search term (searched in all fields); falsy to omit.
        author: Author name, searched as a quoted phrase; falsy to omit.
        year: Year used to restrict ``submittedDate`` to that calendar
            year; falsy to omit.
        start: Zero-based offset of the first result.
        max_results: Page size; defaults to ``ARXIV_PAGE_SIZE`` when ``None``.

    Returns:
        The full query URL as a string. When no filter is given the
        search expression falls back to ``all:*``.
    """
    # Imported locally so the block does not depend on a file-level import.
    from urllib.parse import quote

    if max_results is None:
        max_results = ARXIV_PAGE_SIZE

    # HTTP? FIXME....
    base_url = "http://export.arxiv.org/api/query?"
    search_params = []

    # Build search query
    if query:
        encoded_query = quote(str(query), safe='')
        search_params.append(f"all:{encoded_query}")
    if author:
        encoded_author = quote(str(author), safe='')
        # %22 is the encoded double quote that marks a phrase search.
        search_params.append(f"au:%22{encoded_author}%22")
    if year:
        encoded_year = quote(str(year), safe='')
        # '+' encodes the spaces around TO in the date-range expression.
        search_params.append(
            f"submittedDate:[{encoded_year}01010000+TO+{encoded_year}12312359]"
        )

    search_query = "+AND+".join(search_params) if search_params else "all:*"
    return (
        f"{base_url}search_query={search_query}"
        f"&start={start}&max_results={max_results}"
    )
def convert_xml_to_markdown(xml_content):
    """Render the first ``<entry>`` of an arXiv feed as Markdown.

    Args:
        xml_content: Feed XML, parsed with BeautifulSoup's ``'xml'`` parser.

    Returns:
        A tuple ``(markdown, title, authors, categories)`` where
        ``authors`` is a list of author names and ``categories`` is the
        list of ``term`` attributes of the entry's ``<category>`` tags.
    """
    paper = BeautifulSoup(xml_content, 'xml').find('entry')

    def field_text(tag_name):
        # Stripped text of the first matching tag inside the entry.
        return paper.find(tag_name).text.strip()

    title = field_text('title')
    authors = [
        person.find('name').text.strip()
        for person in paper.find_all('author')
    ]
    abstract = field_text('summary')
    published = field_text('published')
    categories = [tag['term'] for tag in paper.find_all('category')]

    # Each section ends with a blank line; concatenated in display order.
    sections = [
        f"# {title}\n\n",
        f"**Authors:** {', '.join(authors)}\n\n",
        f"**Published Date:** {published}\n\n",
        f"**Abstract:**\n\n{abstract}\n\n",
        f"**Categories:** {', '.join(categories)}\n\n",
    ]
    return "".join(sections), title, authors, categories
def process_and_ingest_arxiv_paper(paper_id, additional_keywords):
    """Fetch an arXiv paper's feed entry and store it via ``add_media_with_keywords``.

    Args:
        paper_id: arXiv identifier of the paper to ingest.
        additional_keywords: Extra keyword text appended after the
            paper's categories; ignored when falsy.

    Returns:
        A success message containing the paper title, or an error
        message containing the exception text if any step raises.
    """
    try:
        markdown, title, authors, categories = convert_xml_to_markdown(
            fetch_arxiv_xml(paper_id)
        )

        # Keyword string always starts with "arxiv" followed by the categories.
        keywords = "arxiv," + ",".join(categories)
        if additional_keywords:
            keywords = f"{keywords},{additional_keywords}"

        media_fields = dict(
            url=f"https://arxiv.org/abs/{paper_id}",
            title=title,
            media_type='document',
            content=markdown,
            keywords=keywords,
            prompt='No prompt for arXiv papers',
            summary='arXiv paper ingested from XML',
            transcription_model='None',
            author=', '.join(authors),
            ingestion_date=datetime.now().strftime('%Y-%m-%d'),
        )
        add_media_with_keywords(**media_fields)
    except Exception as e:
        return f"Error processing arXiv paper: {e}"
    else:
        return f"arXiv paper '{title}' ingested successfully."
#
# End of Arxiv.py
####################################################################################################
|