File size: 5,529 Bytes
43cd37c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
# Arxiv.py
# Description: This file contains the functions for searching and ingesting arXiv papers.
import time

import arxiv
import requests
from bs4 import BeautifulSoup
from datetime import datetime

from requests.adapters import HTTPAdapter
from urllib3 import Retry

#
# Local Imports
from App_Function_Libraries.DB.DB_Manager import add_media_with_keywords
#
#####################################################################################################
#
# Functions:

# Number of results requested per page; build_query_url sends this value as
# the max_results parameter of each arXiv API search URL.
ARXIV_PAGE_SIZE = 10


def fetch_arxiv_pdf_url(paper_id):
    """Look up the PDF link of an arXiv paper through the export API.

    Args:
        paper_id: arXiv identifier; interpolated into the ``id_list`` query
            parameter of the API URL.

    Returns:
        The ``href`` of the feed's ``<link title="pdf">`` element, or ``None``
        when the HTTP request fails or the response contains no such link.
    """
    base_url = f"http://export.arxiv.org/api/query?id_list={paper_id}"

    # Retry transient failures (rate limiting / server errors) with backoff.
    retry_strategy = Retry(
        total=3,  # Maximum number of retries
        status_forcelist=[429, 500, 502, 503, 504],  # Retry on these status codes
        backoff_factor=1  # Exponential backoff factor
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)

    try:
        # The context manager closes the session (and its pooled connections)
        # even when the request raises.
        with requests.Session() as http:
            http.mount("https://", adapter)
            http.mount("http://", adapter)
            # A timeout keeps a stalled connection from blocking forever.
            response = http.get(base_url, timeout=30)
            response.raise_for_status()
            feed_xml = response.text
    except requests.exceptions.RequestException as e:
        print(f"**Error:** {e}")
        return None

    # Delay between requests to avoid rate limiting
    time.sleep(2)

    soup = BeautifulSoup(feed_xml, 'xml')
    pdf_link = soup.find('link', title='pdf')
    # find() returns None when nothing matches (e.g. unknown paper id);
    # report that the same way as a failed request instead of raising.
    if pdf_link is None or not pdf_link.get('href'):
        print(f"**Error:** No PDF link found for arXiv paper '{paper_id}'")
        return None
    return pdf_link['href']


def search_arxiv(query):
    """Search arXiv by relevance and return up to 10 matching papers.

    Args:
        query: Search string handed to ``arxiv.Search``.

    Returns:
        A list of ``[title, paper_id, authors, summary]`` lists, where
        ``paper_id`` is the last path segment of the result's ``entry_id``
        and ``authors`` is a comma-separated string of author names.
    """
    client = arxiv.Client()
    relevance_search = arxiv.Search(
        query=query,
        max_results=10,
        sort_by=arxiv.SortCriterion.Relevance,
    )

    return [
        [
            paper.title,
            paper.entry_id.split('/')[-1],  # trailing segment is the arXiv ID
            ', '.join(person.name for person in paper.authors),
            paper.summary,
        ]
        for paper in client.results(relevance_search)
    ]


def fetch_arxiv_xml(paper_id):
    """Download the raw Atom XML that the arXiv export API returns for a paper.

    Args:
        paper_id: arXiv identifier string, appended to the ``id_list`` query
            parameter.

    Returns:
        The response body as text.

    Raises:
        requests.exceptions.RequestException: On connection errors, timeouts,
            or a non-success HTTP status (via ``raise_for_status``).
    """
    base_url = "http://export.arxiv.org/api/query?id_list="
    # requests applies no timeout by default, so a stalled connection would
    # otherwise hang the caller indefinitely.
    response = requests.get(base_url + paper_id, timeout=30)
    response.raise_for_status()
    return response.text


def parse_arxiv_feed(xml_content):
    """Turn an arXiv Atom feed into a list of per-paper dictionaries.

    Args:
        xml_content: XML text of the feed.

    Returns:
        One dict per ``<entry>`` with the keys ``id`` (text after ``/abs/``
        in the entry id), ``title``, ``authors`` (comma-separated names),
        ``published`` (the part of the timestamp before ``T``) and
        ``abstract``.
    """
    def _clean(node):
        # Text content of a tag with surrounding whitespace removed.
        return node.text.strip()

    def _as_record(item):
        heading = _clean(item.title)
        identifier = _clean(item.id).split('/abs/')[-1]
        author_names = ', '.join(
            _clean(person.find('name')) for person in item.find_all('author')
        )
        published_day = _clean(item.published).split('T')[0]
        summary_text = _clean(item.summary)
        return {
            'id': identifier,
            'title': heading,
            'authors': author_names,
            'published': published_day,
            'abstract': summary_text,
        }

    feed = BeautifulSoup(xml_content, 'xml')
    return [_as_record(item) for item in feed.find_all('entry')]


def build_query_url(query, author, year, start, max_results=None):
    """Build an arXiv export-API search URL from optional filters.

    Each non-empty filter becomes one search term (``all:``, ``au:`` or
    ``submittedDate:``); terms are combined with ``+AND+``. When no filter
    is given the query falls back to ``all:*``.

    Args:
        query: Free-text search string, or a falsy value to omit it.
        author: Author name (wrapped in double quotes), or falsy to omit.
        year: Year used to build a Jan 1 - Dec 31 ``submittedDate`` range,
            or falsy to omit.
        start: Offset of the first result, placed in the ``start`` parameter.
        max_results: Page size placed in the ``max_results`` parameter.
            Defaults to ``ARXIV_PAGE_SIZE`` when ``None``.

    Returns:
        The complete query URL as a string.
    """
    from urllib.parse import quote

    if max_results is None:
        max_results = ARXIV_PAGE_SIZE

    # TODO: endpoint is still addressed over plain HTTP (pre-existing FIXME).
    base_url = "http://export.arxiv.org/api/query?"
    search_params = []

    # Build search query
    if query:
        search_params.append(f"all:{query}")
    if author:
        search_params.append(f'au:"{author}"')
    if year:
        search_params.append(f'submittedDate:[{year}01010000 TO {year}12312359]')

    if search_params:
        # Percent-encode each term so characters such as '&', '#', spaces,
        # quotes and brackets in user input cannot break the query string or
        # inject extra parameters. ':' keeps the field prefix readable; '+'
        # is left untouched so input that already uses '+' as a separator
        # yields the same URL as before.
        search_query = "+AND+".join(
            quote(term, safe=":+") for term in search_params
        )
    else:
        search_query = "all:*"

    return (
        f"{base_url}search_query={search_query}"
        f"&start={start}&max_results={max_results}"
    )

def convert_xml_to_markdown(xml_content):
    """Render the first entry of an arXiv Atom feed as a markdown document.

    Args:
        xml_content: XML text of the feed.

    Returns:
        A tuple ``(markdown, title, authors, categories)`` where ``authors``
        is a list of author names and ``categories`` is a list of the
        ``term`` attributes of the entry's ``<category>`` elements.
    """
    # Only the first <entry> in the feed is used.
    entry = BeautifulSoup(xml_content, 'xml').find('entry')

    def _field(tag_name):
        # Stripped text of the named child element of the entry.
        return entry.find(tag_name).text.strip()

    title = _field('title')
    authors = [
        person.find('name').text.strip() for person in entry.find_all('author')
    ]
    abstract = _field('summary')
    published = _field('published')
    categories = [tag['term'] for tag in entry.find_all('category')]

    # Assemble the markdown sections in display order.
    sections = (
        f"# {title}\n\n",
        f"**Authors:** {', '.join(authors)}\n\n",
        f"**Published Date:** {published}\n\n",
        f"**Abstract:**\n\n{abstract}\n\n",
        f"**Categories:** {', '.join(categories)}\n\n",
    )
    markdown = "".join(sections)

    return markdown, title, authors, categories


def process_and_ingest_arxiv_paper(paper_id, additional_keywords):
    """Fetch an arXiv paper's metadata, convert it to markdown and store it.

    Args:
        paper_id: arXiv identifier of the paper to ingest.
        additional_keywords: Extra keyword text appended after the arXiv
            categories; skipped when falsy.

    Returns:
        A status message: a success string naming the paper's title, or an
        error string containing the exception text if any step raised.
    """
    try:
        feed_xml = fetch_arxiv_xml(paper_id)
        markdown, title, authors, categories = convert_xml_to_markdown(feed_xml)

        # "arxiv", then the categories, then any caller-supplied keywords.
        keyword_parts = ["arxiv", ','.join(categories)]
        if additional_keywords:
            keyword_parts.append(f"{additional_keywords}")
        keywords = ",".join(keyword_parts)

        record = {
            'url': f"https://arxiv.org/abs/{paper_id}",
            'title': title,
            'media_type': 'document',
            'content': markdown,
            'keywords': keywords,
            'prompt': 'No prompt for arXiv papers',
            'summary': 'arXiv paper ingested from XML',
            'transcription_model': 'None',
            'author': ', '.join(authors),
            'ingestion_date': datetime.now().strftime('%Y-%m-%d'),
        }
        add_media_with_keywords(**record)
    except Exception as e:
        return f"Error processing arXiv paper: {str(e)}"

    return f"arXiv paper '{title}' ingested successfully."

#
# End of Arxiv.py
####################################################################################################