akhaliq HF staff committed on
Commit
35b4c0e
1 Parent(s): 8679092
Files changed (1) hide show
  1. app.py +279 -384
app.py CHANGED
@@ -1,396 +1,291 @@
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
- import requests
3
- from datetime import datetime, timezone, timedelta
4
-
5
- API_URL = "https://huggingface.co/api/daily_papers"
6
-
7
class PaperManager:
    """Fetch Daily Papers from the API, rank them, and render paginated HTML.

    The current page, the fetched papers and the active time filter are all
    stored on the instance.
    """

    # Look-back window for each named time filter; any other name disables filtering.
    _FILTER_WINDOWS = {
        'Last Week': timedelta(weeks=1),
        'Last Month': timedelta(days=30),
        'Last Year': timedelta(days=365),
    }

    def __init__(self, papers_per_page=30):
        self.papers_per_page = papers_per_page
        self.current_page = 1
        self.papers = []
        self.total_pages = 1
        self.time_filter = 'All Time'  # Default filter

    @staticmethod
    def _published_time(paper):
        """Return the paper's ``publishedAt`` as an aware datetime, or None.

        None is returned when the field is missing, is not a string, or cannot
        be parsed. A timestamp without an offset is taken to be UTC so that it
        can be compared with ``datetime.now(timezone.utc)``.
        """
        raw = paper.get('publishedAt')
        if not isinstance(raw, str):
            return None
        try:
            parsed = datetime.fromisoformat(raw.replace('Z', '+00:00'))
        except ValueError:
            return None
        if parsed.tzinfo is None:
            parsed = parsed.replace(tzinfo=timezone.utc)
        return parsed

    def calculate_score(self, paper):
        """
        Calculate the score of a paper based on upvotes and age.
        This mimics the "hotness" algorithm used by platforms like Hacker News.

        A missing or unparsable publication time is treated as "now" to
        minimize its impact on sorting.
        """
        upvotes = paper.get('paper', {}).get('upvotes', 0)
        now = datetime.now(timezone.utc)
        published_time = self._published_time(paper) or now

        # Clamp at zero: a timestamp in the future would make the base negative,
        # and a negative float raised to 1.5 is a complex number that cannot be sorted.
        age_hours = max((now - published_time).total_seconds() / 3600, 0.0)

        # The +2 offset keeps the denominator away from zero for brand-new papers.
        return upvotes / ((age_hours + 2) ** 1.5)

    def fetch_papers(self, time_filter='All Time'):
        """
        Fetch papers from the API and apply time filtering.

        On success the papers are stored sorted by score (descending), the
        pagination is reset to page 1 and True is returned. On any error a
        message is printed and False is returned.
        """
        try:
            # Bounded wait so an unresponsive API cannot hang the request forever.
            response = requests.get(f"{API_URL}?limit=100", timeout=10)
            response.raise_for_status()
            data = response.json()

            filtered_data = self.apply_time_filter(data, time_filter)

            self.papers = sorted(filtered_data, key=self.calculate_score, reverse=True)

            # Ceiling division, with a minimum of one page.
            self.total_pages = max(
                (len(self.papers) + self.papers_per_page - 1) // self.papers_per_page, 1
            )
            self.current_page = 1
            self.time_filter = time_filter
            return True
        except requests.RequestException as e:
            print(f"Error fetching papers: {e}")
            return False
        except Exception as e:
            print(f"Unexpected error: {e}")
            return False

    def apply_time_filter(self, data, time_filter):
        """
        Filter papers based on the selected timeframe.

        'All Time' and unknown filter names return ``data`` unchanged. Otherwise
        only papers published within the window are kept; papers whose date is
        missing or invalid are skipped.
        """
        window = self._FILTER_WINDOWS.get(time_filter)
        if window is None:
            return data

        threshold = datetime.now(timezone.utc) - window
        filtered = []
        for paper in data:
            published_time = self._published_time(paper)
            if published_time is not None and published_time >= threshold:
                filtered.append(paper)
        return filtered

    def format_paper(self, paper, rank):
        """Return the HTML table rows for one paper at position ``rank``."""
        from html import escape  # local import: this block is the only user

        details = paper.get('paper', {})
        # Text coming from the API is escaped before being placed into markup.
        title = escape(str(paper.get('title', 'No title')))
        url = escape(f"https://huggingface.co/papers/{details.get('id', '')}", quote=True)
        authors = escape(
            ', '.join([author.get('name', '') for author in details.get('authors', [])])
        ) or 'Unknown'
        upvotes = details.get('upvotes', 0)
        comments = paper.get('numComments', 0)

        now = datetime.now(timezone.utc)
        # Same fallback as calculate_score: an unusable date is shown as "today".
        published_time = self._published_time(paper) or now
        time_ago_days = (now - published_time).days
        if time_ago_days > 0:
            unit = "day" if time_ago_days == 1 else "days"
            time_ago = f"{time_ago_days} {unit} ago"
        else:
            time_ago = "today"

        return f"""
        <tr class="athing">
            <td align="right" valign="top" class="title"><span class="rank">{rank}.</span></td>
            <td valign="top" class="title">
                <a href="{url}" class="storylink" target="_blank">{title}</a>
            </td>
        </tr>
        <tr>
            <td colspan="1"></td>
            <td class="subtext">
                <span class="score">{upvotes} upvotes</span><br>
                authors: {authors} | {time_ago} | <a href="#">{comments} comments</a>
            </td>
        </tr>
        <tr style="height:5px"></tr>
        """

    def render_papers(self):
        """Return the HTML table for the current page, or a notice if it is empty."""
        start = (self.current_page - 1) * self.papers_per_page
        end = start + self.papers_per_page
        current_papers = self.papers[start:end]

        if not current_papers:
            return "<div class='no-papers'>No papers available for this page.</div>"

        # Ranks are 1-based and continue across pages.
        papers_html = "".join(
            self.format_paper(paper, rank)
            for rank, paper in enumerate(current_papers, start=start + 1)
        )
        return f"""
        <table border="0" cellpadding="0" cellspacing="0" class="itemlist">
            {papers_html}
        </table>
        """

    def next_page(self):
        """Advance one page unless already on the last one, then render."""
        if self.current_page < self.total_pages:
            self.current_page += 1
        return self.render_papers()

    def prev_page(self):
        """Go back one page unless already on the first one, then render."""
        if self.current_page > 1:
            self.current_page -= 1
        return self.render_papers()

    def set_time_filter(self, time_filter):
        """
        Set the time filter and fetch papers accordingly.

        Returns the rendered first page, or an error notice if the fetch failed.
        """
        if self.fetch_papers(time_filter):
            return self.render_papers()
        return "<div class='no-papers'>Failed to fetch papers. Please try again later.</div>"
159
-
160
# Single module-level manager used by every handler below.
paper_manager = PaperManager()


def initialize_app():
    """Fetch papers with the default filter and return the first page as HTML."""
    if not paper_manager.fetch_papers():
        return "<div class='no-papers'>Failed to fetch papers. Please try again later.</div>"
    return paper_manager.render_papers()


def refresh_papers():
    """Re-fetch papers using the manager's current time filter and render them."""
    reloaded = paper_manager.fetch_papers(paper_manager.time_filter)
    if not reloaded:
        return "<div class='no-papers'>Failed to refresh papers. Please try again later.</div>"
    return paper_manager.render_papers()
173
-
174
- css = """
175
- body {
176
- background-color: white;
177
- font-family: Verdana, Geneva, sans-serif;
178
- margin: 0;
179
- padding: 0;
180
- }
181
-
182
- a {
183
- color: #0000ff;
184
- text-decoration: none;
185
- }
186
-
187
- a:visited {
188
- color: #551A8B;
189
- }
190
-
191
- .container {
192
- width: 85%;
193
- margin: auto;
194
- }
195
-
196
- table {
197
- width: 100%;
198
- }
199
-
200
- .header-table {
201
- width: 100%;
202
- background-color: #ff6600;
203
- padding: 2px 10px;
204
- }
205
-
206
- .header-table a {
207
- color: black;
208
- font-weight: bold;
209
- font-size: 14pt;
210
- text-decoration: none;
211
- }
212
-
213
- .itemlist .athing {
214
- background-color: #f6f6ef;
215
- }
216
-
217
- .rank {
218
- font-size: 14pt;
219
- color: #828282;
220
- padding-right: 5px;
221
- }
222
-
223
- .storylink {
224
- font-size: 10pt;
225
- }
226
-
227
- .subtext {
228
- font-size: 8pt;
229
- color: #828282;
230
- padding-left: 40px;
231
- }
232
-
233
- .subtext a {
234
- color: #828282;
235
- text-decoration: none;
236
- }
237
-
238
- #refresh-button {
239
- background: none;
240
- border: none;
241
- color: black;
242
- font-weight: bold;
243
- font-size: 14pt;
244
- cursor: pointer;
245
- }
246
-
247
- .no-papers {
248
- text-align: center;
249
- color: #828282;
250
- padding: 1rem;
251
- font-size: 14pt;
252
- }
253
-
254
- @media (max-width: 640px) {
255
- .header-table a {
256
- font-size: 12pt;
257
- }
258
-
259
- .storylink {
260
- font-size: 9pt;
261
- }
262
-
263
- .subtext {
264
- font-size: 7pt;
265
- }
266
- }
267
-
268
- /* Dark mode */
269
- @media (prefers-color-scheme: dark) {
270
- body {
271
- background-color: #121212;
272
- color: #e0e0e0;
273
- }
274
-
275
- a {
276
- color: #add8e6;
277
- }
278
-
279
- a:visited {
280
- color: #9370db;
281
- }
282
-
283
- .header-table {
284
- background-color: #ff6600;
285
- }
286
-
287
- .header-table a {
288
- color: black;
289
- }
290
-
291
- .itemlist .athing {
292
- background-color: #1e1e1e;
293
- }
294
-
295
- .rank {
296
- color: #b0b0b0;
297
- }
298
-
299
- .subtext {
300
- color: #b0b0b0;
301
- }
302
-
303
- .subtext a {
304
- color: #b0b0b0;
305
- }
306
-
307
- #refresh-button {
308
- color: #e0e0e0;
309
- }
310
-
311
- .no-papers {
312
- color: #b0b0b0;
313
- }
314
- }
315
  """
316
 
317
- demo = gr.Blocks(css=css)
318
 
319
- with demo:
320
- with gr.Column(elem_classes=["container"]):
321
- # Accordion for Submission Instructions
322
- with gr.Accordion("How to Submit a Paper", open=False):
323
- gr.Markdown("""
324
- ### Steps to Submit Your Paper
325
 
326
- **Step 1:** Search for your paper and index on Hugging Face:
327
- [https://huggingface.co/papers?search=true](https://huggingface.co/papers?search=true)
328
 
329
- **Step 2:** Submit the paper to Daily Papers:
330
- [https://huggingface.co/papers](https://huggingface.co/papers)
 
 
 
 
 
 
331
 
332
- Once your paper is submitted, it will automatically appear in this demo.
333
- """)
334
-
335
- # Header with Refresh Button and Time Filter
 
336
  with gr.Row():
337
- gr.HTML("""
338
- <table border="0" cellpadding="0" cellspacing="0" class="header-table">
339
- <tr>
340
- <td>
341
- <span class="pagetop">
342
- <b class="hnname"><a href="#">Daily Papers</a></b>
343
- </span>
344
- </td>
345
- <td align="right">
346
- <button id="refresh-button">Refresh</button>
347
- </td>
348
- </tr>
349
- </table>
350
- """)
351
-
352
- # Time Filter Dropdown
353
- with gr.Row(elem_classes=["time-filter-row"], elem_id="time-filter-row"):
354
- gr.HTML("<label for='time-filter'>Filter by Timeframe: </label>")
355
- time_filter_dropdown = gr.Dropdown(
356
- choices=["All Time", "Last Week", "Last Month", "Last Year"],
357
- value="All Time",
358
- label="Timeframe",
359
- interactive=True,
360
- elem_id="time-filter-dropdown"
361
- )
362
-
363
- # Paper list
364
- paper_list = gr.HTML()
365
-
366
- # Navigation Buttons
367
  with gr.Row():
368
- prev_button = gr.Button("Prev")
369
- next_button = gr.Button("Next")
370
-
371
- # Load papers on app start
372
- demo.load(initialize_app, outputs=[paper_list])
373
-
374
- # Button clicks
375
- prev_button.click(paper_manager.prev_page, outputs=[paper_list])
376
- next_button.click(paper_manager.next_page, outputs=[paper_list])
377
- refresh_button = gr.Button("Refresh", visible=False, elem_id="refresh-hidden")
378
- refresh_button.click(refresh_papers, outputs=[paper_list])
379
-
380
- # Time Filter change
381
- time_filter_dropdown.change(
382
- paper_manager.set_time_filter,
383
- inputs=[time_filter_dropdown],
384
- outputs=[paper_list]
385
  )
386
 
387
- # Bind the visible Refresh button to the hidden one using JavaScript
388
- gr.HTML("""
389
- <script>
390
- document.getElementById('refresh-button').addEventListener('click', function() {
391
- document.getElementById('refresh-hidden').click();
392
- });
393
- </script>
394
- """)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
395
 
396
- demo.launch()
 
 
1
+ #!/usr/bin/env python
2
+
3
+ import datetime
4
+ import operator
5
+ import pandas as pd
6
+ import tqdm.auto
7
+ from apscheduler.schedulers.background import BackgroundScheduler
8
+ from huggingface_hub import HfApi
9
+ from ragatouille import RAGPretrainedModel
10
+
11
  import gradio as gr
12
+ from gradio_calendar import Calendar
13
+ import datasets
14
+
15
+ # --- Data Loading and Processing ---
16
+
17
api = HfApi()

INDEX_REPO_ID = "hysts-bot-data/daily-papers-abstract-index"
INDEX_DIR_PATH = ".ragatouille/colbert/indexes/daily-papers-abstract-index/"


def _load_abstract_retriever() -> RAGPretrainedModel:
    """Download the index snapshot and return a retriever that has run one query."""
    api.snapshot_download(
        repo_id=INDEX_REPO_ID,
        repo_type="dataset",
        local_dir=INDEX_DIR_PATH,
    )
    retriever = RAGPretrainedModel.from_index(INDEX_DIR_PATH)
    # Run once to initialize the retriever
    retriever.search("LLM")
    return retriever


# Built at import time so the first search already has a usable retriever.
abstract_retriever = _load_abstract_retriever()


def update_abstract_index() -> None:
    """Refresh the module-level ``abstract_retriever`` from the latest snapshot.

    The new retriever is fully built and initialized before the global name is
    rebound, so code reading ``abstract_retriever`` never sees one that has not
    yet run its first query.
    """
    global abstract_retriever

    abstract_retriever = _load_abstract_retriever()
41
+
42
+
43
# Background job that refreshes the abstract index every hour at minute 0 (UTC).
scheduler_abstract = BackgroundScheduler()
scheduler_abstract.add_job(
    update_abstract_index,
    "cron",
    minute=0,
    timezone="UTC",
    misfire_grace_time=3 * 60,
)
scheduler_abstract.start()
52
+
53
+
54
def get_df() -> pd.DataFrame:
    """Load the daily-papers and stats datasets and return one merged table.

    The two datasets are joined on ``arxiv_id``, the row order is reversed,
    ``date`` is formatted as ``YYYY-MM-DD`` text, the ``abstract`` column is
    dropped and a ``paper_page`` URL column is appended.

    The columns are transformed as a whole instead of rebuilding the frame row
    by row, so an empty merge still returns a frame with the expected columns.
    """
    papers = datasets.load_dataset("hysts-bot-data/daily-papers", split="train").to_pandas()
    stats = datasets.load_dataset("hysts-bot-data/daily-papers-stats", split="train").to_pandas()
    df = pd.merge(left=papers, right=stats, on="arxiv_id")

    # Reverse so the last dataset rows come first (presumably newest first; verify against the dataset).
    df = df[::-1].reset_index(drop=True)
    df["date"] = df["date"].dt.strftime("%Y-%m-%d")

    df = df.drop(columns=["abstract"])
    df["paper_page"] = "https://huggingface.co/papers/" + df["arxiv_id"].astype(str)
    return df
70
+
71
+
72
+ class Prettifier:
73
+ @staticmethod
74
+ def get_github_link(link: str) -> str:
75
+ if not link:
76
+ return ""
77
+ return Prettifier.create_link("github", link)
78
+
79
+ @staticmethod
80
+ def create_link(text: str, url: str) -> str:
81
+ return f'<a href="{url}" target="_blank">{text}</a>'
82
+
83
+ @staticmethod
84
+ def to_div(text: str | None, category_name: str) -> str:
85
+ if text is None:
86
+ text = ""
87
+ class_name = f"{category_name}-{text.lower()}"
88
+ return f'<div class="{class_name}">{text}</div>'
89
+
90
+ def __call__(self, df: pd.DataFrame) -> pd.DataFrame:
91
+ new_rows = []
92
+ for _, row in df.iterrows():
93
+ new_row = {
94
+ "date": Prettifier.create_link(row.date, f"https://huggingface.co/papers?date={row.date}"),
95
+ "paper_page": Prettifier.create_link(row.arxiv_id, row.paper_page),
96
+ "title": row["title"],
97
+ "github": self.get_github_link(row.github),
98
+ "👍": row["upvotes"],
99
+ "💬": row["num_comments"],
100
+ }
101
+ new_rows.append(new_row)
102
+ return pd.DataFrame(new_rows)
103
+
104
+
105
class PaperList:
    """Hold the raw paper table plus its prettified view, and answer searches."""

    # [column name, gradio Dataframe datatype] pairs, in display order.
    COLUMN_INFO = [
        ["date", "markdown"],
        ["paper_page", "markdown"],
        ["title", "str"],
        ["github", "markdown"],
        ["👍", "number"],
        ["💬", "number"],
    ]

    def __init__(self, df: pd.DataFrame):
        self.df_raw = df
        self._prettifier = Prettifier()
        self.df_prettified = self._prettify(df)

    @property
    def column_names(self):
        """Display column names, in order."""
        return list(map(operator.itemgetter(0), self.COLUMN_INFO))

    @property
    def column_datatype(self):
        """Datatype string for each display column, in order."""
        return list(map(operator.itemgetter(1), self.COLUMN_INFO))

    def _prettify(self, df: pd.DataFrame) -> pd.DataFrame:
        """Prettify ``df`` and return exactly the display columns.

        ``reindex`` is used instead of ``.loc`` so that a prettifier result
        with no rows (and possibly no columns) still comes back as an empty
        table with the right columns rather than raising ``KeyError``.
        """
        return self._prettifier(df).reindex(columns=self.column_names)

    def search(
        self,
        start_date: datetime.datetime,
        end_date: datetime.datetime,
        title_search_query: str,
        abstract_search_query: str,
        max_num_to_retrieve: int,
    ) -> pd.DataFrame:
        """Return the prettified papers that match every given criterion.

        Args:
            start_date: Inclusive lower bound on the paper date.
            end_date: Inclusive upper bound on the paper date.
            title_search_query: Case-insensitive title filter; skipped when
                empty. Interpreted as a regular expression when it is a valid
                one, otherwise matched literally.
            abstract_search_query: Query for the abstract retriever; skipped
                when empty. Matches are returned in retriever order.
            max_num_to_retrieve: Number of results requested from the
                retriever (only used for the abstract search).
        """
        import re  # local import: only needed to validate the title pattern

        df = self.df_raw.copy()
        df["date"] = pd.to_datetime(df["date"])

        # Filter by date. The bounds are normalized to Timestamps so that date,
        # datetime and string inputs all compare cleanly with the column.
        lower = pd.Timestamp(start_date)
        upper = pd.Timestamp(end_date)
        in_range = (df["date"] >= lower) & (df["date"] <= upper)
        # Copy so that the assignment below does not write into a slice.
        df = df[in_range].copy()
        df["date"] = df["date"].dt.strftime("%Y-%m-%d")

        # Filter by title
        if title_search_query:
            # Text such as "C++" is not a valid pattern; fall back to a literal
            # match instead of raising.
            try:
                re.compile(title_search_query)
                use_regex = True
            except re.error:
                use_regex = False
            # na=False: rows without a title simply do not match.
            matches = df["title"].str.contains(
                title_search_query, case=False, na=False, regex=use_regex
            )
            df = df[matches]

        # Filter by abstract
        if abstract_search_query:
            results = abstract_retriever.search(abstract_search_query, k=int(max_num_to_retrieve))
            remaining_ids = set(df["arxiv_id"])
            # dict.fromkeys removes duplicate ids and keeps the retriever's order.
            found_ids = list(
                dict.fromkeys(
                    hit["document_id"] for hit in results if hit["document_id"] in remaining_ids
                )
            )
            df = (
                df[df["arxiv_id"].isin(found_ids)]
                .set_index("arxiv_id")
                .reindex(index=found_ids)
                .reset_index()
            )

        return self._prettify(df)
165
+
166
+
167
# Module-level paper table, built once at import time.
paper_list = PaperList(get_df())


def update_paper_list() -> None:
    """Rebuild the module-level ``paper_list`` from freshly loaded data."""
    global paper_list
    refreshed = PaperList(get_df())
    paper_list = refreshed
173
+
174
+
175
# Background job that rebuilds the paper list every hour at minute 0 (UTC).
scheduler_data = BackgroundScheduler()
scheduler_data.add_job(
    update_paper_list,
    "cron",
    minute=0,
    timezone="UTC",
    misfire_grace_time=60,
)
scheduler_data.start()
184
+
185
+ # --- Gradio App ---
186
+
187
# Markdown heading shown at the top of the app.
DESCRIPTION = "# [Daily Papers](https://huggingface.co/papers)"

# Markdown footer linking to related Spaces; shown below the results table.
FOOT_NOTE = """\
Related useful Spaces:
- [Semantic Scholar Paper Recommender](https://huggingface.co/spaces/librarian-bots/recommend_similar_papers) by [davanstrien](https://huggingface.co/davanstrien)
- [ArXiv CS RAG](https://huggingface.co/spaces/bishmoy/Arxiv-CS-RAG) by [bishmoy](https://huggingface.co/bishmoy)
- [Paper Q&A](https://huggingface.co/spaces/chansung/paper_qa) by [chansung](https://huggingface.co/chansung)
"""
195
 
 
196
 
197
def update_df() -> pd.DataFrame:
    """Return the prettified table of the current module-level ``paper_list``."""
    current = paper_list
    return current.df_prettified
199
+
200
+
201
def update_num_papers(df: pd.DataFrame) -> str:
    """Return "<rows in df> / <rows in the full raw paper table>" as text."""
    shown = len(df)
    total = len(paper_list.df_raw)
    return f"{shown} / {total}"
203
 
 
 
204
 
205
def search(
    start_date: datetime.datetime,
    end_date: datetime.datetime,
    search_title: str,
    search_abstract: str,
    max_num_to_retrieve: int,
) -> pd.DataFrame:
    """Run a search on the current module-level ``paper_list`` and return its result.

    ``paper_list`` is looked up on every call, so a list replaced by the
    scheduled refresh is picked up without re-registering this handler.
    """
    return paper_list.search(
        start_date=start_date,
        end_date=end_date,
        title_search_query=search_title,
        abstract_search_query=search_abstract,
        max_num_to_retrieve=max_num_to_retrieve,
    )
213
 
214
+
215
# NOTE(review): the nesting of the layout containers below was reconstructed from
# a flattened diff; confirm that both rows belong inside the Group.
with gr.Blocks(css="style.css") as demo:
    gr.Markdown(DESCRIPTION)
    with gr.Group():
        search_title = gr.Textbox(label="Search title")
        with gr.Row():
            with gr.Column(scale=4):
                search_abstract = gr.Textbox(
                    label="Search abstract",
                    info="The result may not be accurate as the abstract does not contain all the information.",
                )
            with gr.Column(scale=1):
                max_num_to_retrieve = gr.Slider(
                    label="Max number to retrieve",
                    info="This is used only for search on abstracts.",
                    minimum=1,
                    maximum=len(paper_list.df_raw),
                    step=1,
                    value=100,
                )
        with gr.Row():
            start_date = Calendar(label="Start date", type="date", value="2023-05-05")
            # Today's date in UTC. datetime.utcnow() is deprecated, so an aware
            # "now" is used; the formatted date is the same.
            end_date = Calendar(
                label="End date",
                type="date",
                value=datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%d"),
            )

    num_papers = gr.Textbox(label="Number of papers", value=update_num_papers(paper_list.df_raw), interactive=False)
    df = gr.Dataframe(
        value=paper_list.df_prettified,
        datatype=paper_list.column_datatype,
        type="pandas",
        interactive=False,
        height=1000,
        elem_id="table",
        column_widths=["10%", "10%", "60%", "10%", "5%", "5%"],
        wrap=True,
    )

    gr.Markdown(FOOT_NOTE)

    # Every search trigger uses the same inputs and the same two-step chain:
    # run the search, then refresh the "shown / total" counter.
    search_inputs = [start_date, end_date, search_title, search_abstract, max_num_to_retrieve]

    def _bind_search(register_listener) -> None:
        """Attach the search-then-count chain to one event listener."""
        register_listener(
            fn=search,
            inputs=search_inputs,
            outputs=df,
        ).then(
            fn=update_num_papers,
            inputs=df,
            outputs=num_papers,
            queue=False,
        )

    # Define the triggers and corresponding functions
    search_event = gr.Button("Search")
    _bind_search(search_event.click)

    # Automatically trigger search when inputs change
    for trigger in search_inputs:
        _bind_search(trigger.change)

    # Load the initial dataframe and number of papers
    demo.load(
        fn=update_df,
        outputs=df,
        queue=False,
    ).then(
        fn=update_num_papers,
        inputs=df,
        outputs=num_papers,
        queue=False,
    )
289
 
290
if __name__ == "__main__":
    # Enable the queue with its API closed, then start the app with the API page hidden.
    queued_demo = demo.queue(api_open=False)
    queued_demo.launch(show_api=False)