dailypapershackernews-dev

Sleeping

App Files Files Community

akhaliq HF staff committed on Sep 20

Commit

35b4c0e

•

1 Parent(s): 8679092

updates

Browse files

Files changed (1) hide show

app.py +279 -384

app.py CHANGED Viewed

@@ -1,396 +1,291 @@
 import gradio as gr
-import requests
-from datetime import datetime, timezone, timedelta
-API_URL = "https://huggingface.co/api/daily_papers"
-class PaperManager:
-    def __init__(self, papers_per_page=30):
-        self.papers_per_page = papers_per_page
-        self.current_page = 1
-        self.papers = []
-        self.total_pages = 1
-        self.time_filter = 'All Time'  # Default filter
-    def calculate_score(self, paper):
-        """
-        Calculate the score of a paper based on upvotes and age.
-        This mimics the "hotness" algorithm used by platforms like Hacker News.
-        """
-        upvotes = paper.get('paper', {}).get('upvotes', 0)
-        published_at_str = paper.get('publishedAt', datetime.now(timezone.utc).isoformat())
-        try:
-            published_time = datetime.fromisoformat(published_at_str.replace('Z', '+00:00'))
-        except ValueError:
-            # If parsing fails, use current time to minimize the impact on sorting
-            published_time = datetime.now(timezone.utc)
-        time_diff = datetime.now(timezone.utc) - published_time
-        time_diff_hours = time_diff.total_seconds() / 3600  # Convert time difference to hours
-        # Avoid division by zero and apply the hotness formula
-        score = upvotes / ((time_diff_hours + 2) ** 1.5)
-        return score
-    def fetch_papers(self, time_filter='All Time'):
-        """
-        Fetch papers from the API and apply time filtering.
-        """
-        try:
-            response = requests.get(f"{API_URL}?limit=100")
-            response.raise_for_status()
-            data = response.json()
-            # Apply time filter
-            filtered_data = self.apply_time_filter(data, time_filter)
-            # Sort papers by calculated score descending
-            self.papers = sorted(
-                filtered_data,
-                key=lambda x: self.calculate_score(x),
-                reverse=True
-            )
-            self.total_pages = max((len(self.papers) + self.papers_per_page - 1) // self.papers_per_page, 1)
-            self.current_page = 1
-            self.time_filter = time_filter
-            return True
-        except requests.RequestException as e:
-            print(f"Error fetching papers: {e}")
-            return False
-        except Exception as e:
-            print(f"Unexpected error: {e}")
-            return False
-    def apply_time_filter(self, data, time_filter):
-        """
-        Filter papers based on the selected timeframe.
-        """
-        if time_filter == 'All Time':
-            return data
-        now = datetime.now(timezone.utc)
-        if time_filter == 'Last Week':
-            threshold = now - timedelta(weeks=1)
-        elif time_filter == 'Last Month':
-            threshold = now - timedelta(days=30)
-        elif time_filter == 'Last Year':
-            threshold = now - timedelta(days=365)
-        else:
-            # If an unknown filter is provided, default to all time
-            return data
-        filtered = []
-        for paper in data:
-            published_at_str = paper.get('publishedAt', '')
-            try:
-                published_time = datetime.fromisoformat(published_at_str.replace('Z', '+00:00'))
-                if published_time >= threshold:
-                    filtered.append(paper)
-            except ValueError:
-                # Skip papers with invalid date formats
-                continue
-        return filtered
-    def format_paper(self, paper, rank):
-        title = paper.get('title', 'No title')
-        paper_id = paper.get('paper', {}).get('id', '')
-        url = f"https://huggingface.co/papers/{paper_id}"
-        authors = ', '.join([author.get('name', '') for author in paper.get('paper', {}).get('authors', [])]) or 'Unknown'
-        upvotes = paper.get('paper', {}).get('upvotes', 0)
-        comments = paper.get('numComments', 0)
-        published_time = datetime.fromisoformat(
-            paper.get('publishedAt', datetime.now(timezone.utc).isoformat()).replace('Z', '+00:00')
-        )
-        time_diff = datetime.now(timezone.utc) - published_time
-        time_ago_days = time_diff.days
-        time_ago = f"{time_ago_days} days ago" if time_ago_days > 0 else "today"
-        return f"""
-        <tr class="athing">
-            <td align="right" valign="top" class="title"><span class="rank">{rank}.</span></td>
-            <td valign="top" class="title">
-                <a href="{url}" class="storylink" target="_blank">{title}</a>
-            </td>
-        </tr>
-        <tr>
-            <td colspan="1"></td>
-            <td class="subtext">
-                <span class="score">{upvotes} upvotes</span><br>
-                authors: {authors} | {time_ago} | <a href="#">{comments} comments</a>
-            </td>
-        </tr>
-        <tr style="height:5px"></tr>
-        """
-    def render_papers(self):
-        start = (self.current_page - 1) * self.papers_per_page
-        end = start + self.papers_per_page
-        current_papers = self.papers[start:end]
-        if not current_papers:
-            return "<div class='no-papers'>No papers available for this page.</div>"
-        papers_html = "".join([self.format_paper(paper, idx + start + 1) for idx, paper in enumerate(current_papers)])
-        return f"""
-        <table border="0" cellpadding="0" cellspacing="0" class="itemlist">
-            {papers_html}
-        </table>
-        """
-    def next_page(self):
-        if self.current_page < self.total_pages:
-            self.current_page += 1
-        return self.render_papers()
-    def prev_page(self):
-        if self.current_page > 1:
-            self.current_page -= 1
-        return self.render_papers()
-    def set_time_filter(self, time_filter):
-        """
-        Set the time filter and fetch papers accordingly.
-        """
-        if self.fetch_papers(time_filter):
-            return self.render_papers()
-        else:
-            return "<div class='no-papers'>Failed to fetch papers. Please try again later.</div>"
-paper_manager = PaperManager()
-def initialize_app():
-    if paper_manager.fetch_papers():
-        return paper_manager.render_papers()
-    else:
-        return "<div class='no-papers'>Failed to fetch papers. Please try again later.</div>"
-def refresh_papers():
-    if paper_manager.fetch_papers(paper_manager.time_filter):
-        return paper_manager.render_papers()
-    else:
-        return "<div class='no-papers'>Failed to refresh papers. Please try again later.</div>"
-css = """
-body {
-    background-color: white;
-    font-family: Verdana, Geneva, sans-serif;
-    margin: 0;
-    padding: 0;
-}
-a {
-    color: #0000ff;
-    text-decoration: none;
-}
-a:visited {
-    color: #551A8B;
-}
-.container {
-    width: 85%;
-    margin: auto;
-}
-table {
-    width: 100%;
-}
-.header-table {
-    width: 100%;
-    background-color: #ff6600;
-    padding: 2px 10px;
-}
-.header-table a {
-    color: black;
-    font-weight: bold;
-    font-size: 14pt;
-    text-decoration: none;
-}
-.itemlist .athing {
-    background-color: #f6f6ef;
-}
-.rank {
-    font-size: 14pt;
-    color: #828282;
-    padding-right: 5px;
-}
-.storylink {
-    font-size: 10pt;
-}
-.subtext {
-    font-size: 8pt;
-    color: #828282;
-    padding-left: 40px;
-}
-.subtext a {
-    color: #828282;
-    text-decoration: none;
-}
-#refresh-button {
-    background: none;
-    border: none;
-    color: black;
-    font-weight: bold;
-    font-size: 14pt;
-    cursor: pointer;
-}
-.no-papers {
-    text-align: center;
-    color: #828282;
-    padding: 1rem;
-    font-size: 14pt;
-}
-@media (max-width: 640px) {
-    .header-table a {
-        font-size: 12pt;
-    }
-    .storylink {
-        font-size: 9pt;
-    }
-    .subtext {
-        font-size: 7pt;
-    }
-}
-/* Dark mode */
-@media (prefers-color-scheme: dark) {
-    body {
-        background-color: #121212;
-        color: #e0e0e0;
-    }
-    a {
-        color: #add8e6;
-    }
-    a:visited {
-        color: #9370db;
-    }
-    .header-table {
-        background-color: #ff6600;
-    }
-    .header-table a {
-        color: black;
-    }
-    .itemlist .athing {
-        background-color: #1e1e1e;
-    }
-    .rank {
-        color: #b0b0b0;
-    }
-    .subtext {
-        color: #b0b0b0;
-    }
-    .subtext a {
-        color: #b0b0b0;
-    }
-    #refresh-button {
-        color: #e0e0e0;
-    }
-    .no-papers {
-        color: #b0b0b0;
-    }
-}
 """
-demo = gr.Blocks(css=css)
-with demo:
-    with gr.Column(elem_classes=["container"]):
-        # Accordion for Submission Instructions
-        with gr.Accordion("How to Submit a Paper", open=False):
-            gr.Markdown("""
-            ### Steps to Submit Your Paper
-            **Step 1:** Search for your paper and index on Hugging Face:
-            [https://huggingface.co/papers?search=true](https://huggingface.co/papers?search=true)
-            **Step 2:** Submit the paper to Daily Papers:
-            [https://huggingface.co/papers](https://huggingface.co/papers)
-            Once your paper is submitted, it will automatically appear in this demo.
-            """)
-        # Header with Refresh Button and Time Filter
         with gr.Row():
-            gr.HTML("""
-            <table border="0" cellpadding="0" cellspacing="0" class="header-table">
-                <tr>
-                    <td>
-                        <span class="pagetop">
-                            <b class="hnname"><a href="#">Daily Papers</a></b>
-                        </span>
-                    </td>
-                    <td align="right">
-                        <button id="refresh-button">Refresh</button>
-                    </td>
-                </tr>
-            </table>
-            """)
-        # Time Filter Dropdown
-        with gr.Row(elem_classes=["time-filter-row"], elem_id="time-filter-row"):
-            gr.HTML("<label for='time-filter'>Filter by Timeframe: </label>")
-            time_filter_dropdown = gr.Dropdown(
-                choices=["All Time", "Last Week", "Last Month", "Last Year"],
-                value="All Time",
-                label="Timeframe",
-                interactive=True,
-                elem_id="time-filter-dropdown"
-            )
-        # Paper list
-        paper_list = gr.HTML()
-        # Navigation Buttons
         with gr.Row():
-            prev_button = gr.Button("Prev")
-            next_button = gr.Button("Next")
-    # Load papers on app start
-    demo.load(initialize_app, outputs=[paper_list])
-    # Button clicks
-    prev_button.click(paper_manager.prev_page, outputs=[paper_list])
-    next_button.click(paper_manager.next_page, outputs=[paper_list])
-    refresh_button = gr.Button("Refresh", visible=False, elem_id="refresh-hidden")
-    refresh_button.click(refresh_papers, outputs=[paper_list])
-    # Time Filter change
-    time_filter_dropdown.change(
-        paper_manager.set_time_filter,
-        inputs=[time_filter_dropdown],
-        outputs=[paper_list]
     )
-    # Bind the visible Refresh button to the hidden one using JavaScript
-    gr.HTML("""
-    <script>
-    document.getElementById('refresh-button').addEventListener('click', function() {
-        document.getElementById('refresh-hidden').click();
-    });
-    </script>
-    """)
-demo.launch()

+#!/usr/bin/env python
+import datetime
+import operator
+import pandas as pd
+import tqdm.auto
+from apscheduler.schedulers.background import BackgroundScheduler
+from huggingface_hub import HfApi
+from ragatouille import RAGPretrainedModel
 import gradio as gr
+from gradio_calendar import Calendar
+import datasets
+# --- Data Loading and Processing ---
+# Hub client used for the index snapshot downloads in this module.
+api = HfApi()
+# Dataset repo holding the abstract index, and the local directory it is downloaded into.
+INDEX_REPO_ID = "hysts-bot-data/daily-papers-abstract-index"
+INDEX_DIR_PATH = ".ragatouille/colbert/indexes/daily-papers-abstract-index/"
+# Download the index at import time so the retriever below can be loaded from INDEX_DIR_PATH.
+api.snapshot_download(
+    repo_id=INDEX_REPO_ID,
+    repo_type="dataset",
+    local_dir=INDEX_DIR_PATH,
+)
+# Module-level retriever used by PaperList.search for abstract queries.
+abstract_retriever = RAGPretrainedModel.from_index(INDEX_DIR_PATH)
+# Run once to initialize the retriever
+abstract_retriever.search("LLM")
+def update_abstract_index() -> None:
+    global abstract_retriever
+    api.snapshot_download(
+        repo_id=INDEX_REPO_ID,
+        repo_type="dataset",
+        local_dir=INDEX_DIR_PATH,
+    )
+    abstract_retriever = RAGPretrainedModel.from_index(INDEX_DIR_PATH)
+    abstract_retriever.search("LLM")
+# Background scheduler that periodically runs update_abstract_index to refresh the retriever.
+scheduler_abstract = BackgroundScheduler()
+scheduler_abstract.add_job(
+    func=update_abstract_index,
+    trigger="cron",
+    minute=0,  # Every hour at minute 0
+    timezone="UTC",
+    # NOTE(review): presumably a grace period in seconds (3 minutes) — confirm against the APScheduler docs.
+    misfire_grace_time=3 * 60,
+)
+scheduler_abstract.start()
+def get_df() -> pd.DataFrame:
+    df = pd.merge(
+        left=datasets.load_dataset("hysts-bot-data/daily-papers", split="train").to_pandas(),
+        right=datasets.load_dataset("hysts-bot-data/daily-papers-stats", split="train").to_pandas(),
+        on="arxiv_id",
+    )
+    df = df[::-1].reset_index(drop=True)
+    df["date"] = df["date"].dt.strftime("%Y-%m-%d")
+    paper_info = []
+    for _, row in tqdm.auto.tqdm(df.iterrows(), total=len(df)):
+        info = row.copy()
+        del info["abstract"]
+        info["paper_page"] = f"https://huggingface.co/papers/{row.arxiv_id}"
+        paper_info.append(info)
+    return pd.DataFrame(paper_info)
+class Prettifier:
+    @staticmethod
+    def get_github_link(link: str) -> str:
+        if not link:
+            return ""
+        return Prettifier.create_link("github", link)
+    @staticmethod
+    def create_link(text: str, url: str) -> str:
+        return f'<a href="{url}" target="_blank">{text}</a>'
+    @staticmethod
+    def to_div(text: str | None, category_name: str) -> str:
+        if text is None:
+            text = ""
+        class_name = f"{category_name}-{text.lower()}"
+        return f'<div class="{class_name}">{text}</div>'
+    def __call__(self, df: pd.DataFrame) -> pd.DataFrame:
+        new_rows = []
+        for _, row in df.iterrows():
+            new_row = {
+                "date": Prettifier.create_link(row.date, f"https://huggingface.co/papers?date={row.date}"),
+                "paper_page": Prettifier.create_link(row.arxiv_id, row.paper_page),
+                "title": row["title"],
+                "github": self.get_github_link(row.github),
+                "👍": row["upvotes"],
+                "💬": row["num_comments"],
+            }
+            new_rows.append(new_row)
+        return pd.DataFrame(new_rows)
+class PaperList:
+    COLUMN_INFO = [
+        ["date", "markdown"],
+        ["paper_page", "markdown"],
+        ["title", "str"],
+        ["github", "markdown"],
+        ["👍", "number"],
+        ["💬", "number"],
+    ]
+    def __init__(self, df: pd.DataFrame):
+        self.df_raw = df
+        self._prettifier = Prettifier()
+        self.df_prettified = self._prettifier(df).loc[:, self.column_names]
+    @property
+    def column_names(self):
+        return list(map(operator.itemgetter(0), self.COLUMN_INFO))
+    @property
+    def column_datatype(self):
+        return list(map(operator.itemgetter(1), self.COLUMN_INFO))
+    def search(
+        self,
+        start_date: datetime.datetime,
+        end_date: datetime.datetime,
+        title_search_query: str,
+        abstract_search_query: str,
+        max_num_to_retrieve: int,
+    ) -> pd.DataFrame:
+        df = self.df_raw.copy()
+        df["date"] = pd.to_datetime(df["date"])
+        # Filter by date
+        df = df[(df["date"] >= start_date) & (df["date"] <= end_date)]
+        df["date"] = df["date"].dt.strftime("%Y-%m-%d")
+        # Filter by title
+        if title_search_query:
+            df = df[df["title"].str.contains(title_search_query, case=False)]
+        # Filter by abstract
+        if abstract_search_query:
+            results = abstract_retriever.search(abstract_search_query, k=max_num_to_retrieve)
+            remaining_ids = set(df["arxiv_id"])
+            found_id_set = set()
+            found_ids = []
+            for x in results:
+                arxiv_id = x["document_id"]
+                if arxiv_id not in remaining_ids:
+                    continue
+                if arxiv_id in found_id_set:
+                    continue
+                found_id_set.add(arxiv_id)
+                found_ids.append(arxiv_id)
+            df = df[df["arxiv_id"].isin(found_ids)].set_index("arxiv_id").reindex(index=found_ids).reset_index()
+        df_prettified = self._prettifier(df).loc[:, self.column_names]
+        return df_prettified
+paper_list = PaperList(get_df())
+def update_paper_list() -> None:
+    global paper_list
+    paper_list = PaperList(get_df())
+# Background scheduler that periodically runs update_paper_list to rebuild paper_list.
+scheduler_data = BackgroundScheduler()
+scheduler_data.add_job(
+    func=update_paper_list,
+    trigger="cron",
+    minute=0,  # Every hour at minute 0
+    timezone="UTC",
+    # NOTE(review): presumably a grace period in seconds — confirm against the APScheduler docs.
+    misfire_grace_time=60,
+)
+scheduler_data.start()
+# --- Gradio App ---
+DESCRIPTION = "# [Daily Papers](https://huggingface.co/papers)"
+FOOT_NOTE = """\
+Related useful Spaces:
+- [Semantic Scholar Paper Recommender](https://huggingface.co/spaces/librarian-bots/recommend_similar_papers) by [davanstrien](https://huggingface.co/davanstrien)
+- [ArXiv CS RAG](https://huggingface.co/spaces/bishmoy/Arxiv-CS-RAG) by [bishmoy](https://huggingface.co/bishmoy)
+- [Paper Q&A](https://huggingface.co/spaces/chansung/paper_qa) by [chansung](https://huggingface.co/chansung)
 """
+def update_df() -> pd.DataFrame:
+    """Return the prettified table of the current module-level ``paper_list``."""
+    return paper_list.df_prettified
+def update_num_papers(df: pd.DataFrame) -> str:
+    return f"{len(df)} / {len(paper_list.df_raw)}"
+def search(
+    start_date: datetime.datetime,
+    end_date: datetime.datetime,
+    search_title: str,
+    search_abstract: str,
+    max_num_to_retrieve: int,
+) -> pd.DataFrame:
+    return paper_list.search(start_date, end_date, search_title, search_abstract, max_num_to_retrieve)
+with gr.Blocks(css="style.css") as demo:
+    gr.Markdown(DESCRIPTION)
+    with gr.Group():
+        search_title = gr.Textbox(label="Search title")
         with gr.Row():
+            with gr.Column(scale=4):
+                search_abstract = gr.Textbox(
+                    label="Search abstract",
+                    info="The result may not be accurate as the abstract does not contain all the information.",
+                )
+            with gr.Column(scale=1):
+                max_num_to_retrieve = gr.Slider(
+                    label="Max number to retrieve",
+                    info="This is used only for search on abstracts.",
+                    minimum=1,
+                    maximum=len(paper_list.df_raw),
+                    step=1,
+                    value=100,
+                )
         with gr.Row():
+            start_date = Calendar(label="Start date", type="date", value="2023-05-05")
+            end_date = Calendar(label="End date", type="date", value=datetime.datetime.utcnow().strftime("%Y-%m-%d"))
+    num_papers = gr.Textbox(label="Number of papers", value=update_num_papers(paper_list.df_raw), interactive=False)
+    df = gr.Dataframe(
+        value=paper_list.df_prettified,
+        datatype=paper_list.column_datatype,
+        type="pandas",
+        interactive=False,
+        height=1000,
+        elem_id="table",
+        column_widths=["10%", "10%", "60%", "10%", "5%", "5%"],
+        wrap=True,
     )
+    gr.Markdown(FOOT_NOTE)
+    # Define the triggers and corresponding functions
+    search_event = gr.Button("Search")
+    search_event.click(
+        fn=search,
+        inputs=[start_date, end_date, search_title, search_abstract, max_num_to_retrieve],
+        outputs=df,
+    ).then(
+        fn=update_num_papers,
+        inputs=df,
+        outputs=num_papers,
+        queue=False,
+    )
+    # Automatically trigger search when inputs change
+    for trigger in [start_date, end_date, search_title, search_abstract, max_num_to_retrieve]:
+        trigger.change(
+            fn=search,
+            inputs=[start_date, end_date, search_title, search_abstract, max_num_to_retrieve],
+            outputs=df,
+        ).then(
+            fn=update_num_papers,
+            inputs=df,
+            outputs=num_papers,
+            queue=False,
+        )
+    # Load the initial dataframe and number of papers
+    demo.load(
+        fn=update_df,
+        outputs=df,
+        queue=False,
+    ).then(
+        fn=update_num_papers,
+        inputs=df,
+        outputs=num_papers,
+        queue=False,
+    )
+if __name__ == "__main__":
+    demo.queue(api_open=False).launch(show_api=False)