hackernews-summaries / utils /hackernews_fetcher.py
Tuana's picture
first working app
d61b6cb
raw
history blame
904 Bytes
from typing import List
from haystack import component, Document
from newspaper import Article
import requests
@component
class HackernewsFetcher():
    """Haystack component that turns Hacker News top stories into Documents.

    The component asks the Hacker News Firebase API for the current top-story
    ids, looks up each of the first ``top_k`` items, and downloads and parses
    the linked page with ``newspaper`` for every item that has a ``url`` field.
    """

    # Seconds to wait on each Hacker News API call; without a timeout,
    # ``requests`` can block forever on a stalled connection.
    _REQUEST_TIMEOUT = 10

    @component.output_types(articles=List[Document])
    def run(self, top_k: int):
        """Fetch and parse the articles linked from the top ``top_k`` stories.

        Args:
            top_k: Number of story ids, taken from the front of the top-stories
                list, to look up. Items without a ``url`` field are skipped, so
                fewer than ``top_k`` documents may be returned.

        Returns:
            A dict with a single key ``'articles'`` holding a list of
            ``Document`` objects. Each document's ``content`` is the parsed
            article text and its ``meta`` carries the article ``title`` and
            source ``url``.

        Raises:
            requests.RequestException: If the top-stories list itself cannot
                be fetched. Failures on individual items or articles are
                reported on stdout and skipped instead.
        """
        top_stories = requests.get(
            url='https://hacker-news.firebaseio.com/v0/topstories.json?print=pretty',
            timeout=self._REQUEST_TIMEOUT,
        )
        # Fail loudly on an HTTP error instead of trying to decode an error page.
        top_stories.raise_for_status()

        urls = []
        for story_id in top_stories.json()[0:top_k]:
            try:
                response = requests.get(
                    url=f"https://hacker-news.firebaseio.com/v0/item/{story_id}.json?print=pretty",
                    timeout=self._REQUEST_TIMEOUT,
                )
                response.raise_for_status()
                # The API may answer with JSON ``null`` for an item; treat that
                # as an item with no fields rather than crashing on ``in``.
                item = response.json() or {}
            except (requests.RequestException, ValueError):
                # Best-effort, like the article download below: one bad item
                # should not discard the rest of the batch.
                print(f"Couldn't fetch Hacker News item {story_id}, skipped")
                continue
            if 'url' in item:
                urls.append(item['url'])

        docs = []
        for url in urls:
            try:
                article = Article(url)
                article.download()
                article.parse()
                docs.append(Document(content=article.text, meta={'title': article.title, 'url': url}))
            except Exception:
                # Deliberately broad: any download or parse failure skips this
                # article, while KeyboardInterrupt/SystemExit still propagate.
                print(f"Couldn't download {url}, skipped")
        return {'articles': docs}