# download_new_papers.py
# encoding: utf-8
import datetime
import json
import os
import urllib.request

import pytz
import tqdm
from bs4 import BeautifulSoup as bs


def _download_new_papers(field_abbr):
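    """Scrape https://arxiv.org/list/{field_abbr}/new and cache today's new submissions as ./data/{field_abbr}_{date}.jsonl."""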
NEW_SUB_URL = f'https://arxiv.org/list/{field_abbr}/new' # https://arxiv.org/list/cs/new
page = urllib.request.urlopen(NEW_SUB_URL)
    soup = bs(page, "html.parser")  # name a parser explicitly to avoid bs4's "no parser was explicitly specified" warning
content = soup.body.find("div", {'id': 'content'})
    # the first h3 in content reads e.g. "New submissions for Wed, 10 May 23"
    h3 = content.find("h3").text
    date = h3.replace("New submissions for", "").strip()  # page date; informational only, the filename date is recomputed below
dt_list = content.dl.find_all("dt")
dd_list = content.dl.find_all("dd")
arxiv_base = "https://arxiv.org/abs/"
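    # each <dt> carries an arXiv id line and pairs one-to-one with a <dd> holding title/authors/subjects/abstract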
assert len(dt_list) == len(dd_list)
new_paper_list = []
for i in tqdm.tqdm(range(len(dt_list))):
paper = {}
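        # dt text looks like "[1]  arXiv:2305.12345 [pdf, other]"; keep the id after "arXiv:"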
paper_number = dt_list[i].text.strip().split(" ")[2].split(":")[-1]
paper['main_page'] = arxiv_base + paper_number
paper['pdf'] = arxiv_base.replace('abs', 'pdf') + paper_number
paper['title'] = dd_list[i].find("div", {"class": "list-title mathjax"}).text.replace("Title: ", "").strip()
paper['authors'] = dd_list[i].find("div", {"class": "list-authors"}).text \
.replace("Authors:\n", "").replace("\n", "").strip()
paper['subjects'] = dd_list[i].find("div", {"class": "list-subjects"}).text.replace("Subjects: ", "").strip()
paper['abstract'] = dd_list[i].find("p", {"class": "mathjax"}).text.replace("\n", " ").strip()
new_paper_list.append(paper)
    # make sure the ./data cache directory exists
    os.makedirs("./data", exist_ok=True)
    # save new_paper_list to a jsonl file, one JSON object per line; take "today"
    # in US Eastern time so the filename matches arXiv's announcement date
    # (the original fromtimestamp() round-trip silently converted to the local timezone)
    date = datetime.datetime.now(tz=pytz.timezone("America/New_York")).strftime("%a, %d %b %y")
with open(f"./data/{field_abbr}_{date}.jsonl", "w") as f:
for paper in new_paper_list:
f.write(json.dumps(paper) + "\n")


def get_papers(field_abbr, limit=None):
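    """Return up to `limit` of today's new papers for `field_abbr`, downloading and caching them first if needed."""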
    date = datetime.datetime.now(tz=pytz.timezone("America/New_York")).strftime("%a, %d %b %y")
if not os.path.exists(f"./data/{field_abbr}_{date}.jsonl"):
_download_new_papers(field_abbr)
results = []
with open(f"./data/{field_abbr}_{date}.jsonl", "r") as f:
        for i, line in enumerate(f):
if limit and i == limit:
return results
results.append(json.loads(line))
return results
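

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original module: print the first few
    # new papers for an example field. "cs" is just an assumed arXiv field
    # abbreviation; any valid listing name should work with the same URL pattern.
    for p in get_papers("cs", limit=3):
        print(p["title"], "->", p["main_page"])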