Update app.py
gunship999 committed

app.py CHANGED
@@ -4,74 +4,94 @@ import streamlit as st
 import time
 import random
 
-#
+# Target URL
 url = "https://m.news.naver.com/rankingList"
 
-#
+# Headers
 headers = {
     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:92.0) Gecko/20100101 Firefox/92.0",
     "Referer": "https://m.news.naver.com/"
 }
 
-# Random delay setting function
 def random_delay():
-    …
-    time.sleep(delay)
+    time.sleep(random.uniform(1, 3))
 
-def scrape_ranking_news():
-    …  # original body not captured in the diff view
-    })
-    return news_list
+def safe_find(element, selector, class_name, attribute=None):
+    """Safely find elements and their attributes"""
+    found = element.find(selector, class_=class_name)
+    if found and attribute:
+        return found.get(attribute)
+    return found.text if found else None
+
+def scrape_ranking_news():
+    try:
+        random_delay()
+        response = requests.get(url, headers=headers)
+        response.raise_for_status()  # Check for HTTP errors
+
+        soup = BeautifulSoup(response.text, "html.parser")
+        ranking_news_sections = soup.find_all("div", class_="rankingnews_box")
+
+        news_list = []
+        for section in ranking_news_sections:
+            office_name = safe_find(section, "strong", "rankingnews_name")
+            if not office_name:
+                continue
+
+            articles = section.find_all("li")
+            for article in articles:
+                # Safely extract all attributes
+                rank = safe_find(article, "em", "list_ranking_num")
+                title = safe_find(article, "strong", "list_title")
+                time_posted = safe_find(article, "span", "list_time")
+                link = safe_find(article, "a", None, "href")
+
+                # Handle image separately as it needs specific null checking
+                img_tag = article.find("img")
+                image = img_tag.get('src') if img_tag else None
+
+                if all([rank, title, time_posted, link]):  # Ensure all required fields exist
+                    news_list.append({
+                        "rank": rank,
+                        "title": title,
+                        "time": time_posted,
+                        "link": link,
+                        "image": image,
+                        "office": office_name
+                    })
+
+        return news_list
+    except Exception as e:
+        st.error(f"Error scraping news: {str(e)}")
+        return []
+
+def display_news(news_data, num_columns=5):
+    if not news_data:
+        st.warning("No news articles found.")
+        return
+
+    col_count = 0
+    cols = st.columns(num_columns)
+
+    for news in news_data:
+        with cols[col_count]:
+            if news['image']:
+                try:
+                    st.image(news['image'])
+                except Exception:
+                    st.warning("Image unavailable")
+
+            st.write(f"**{news['rank']}위 - {news['office']}**")
+            st.write(f"[{news['title']}]({news['link']})")
+            st.write(f"🕒 {news['time']}")
+
+        col_count = (col_count + 1) % num_columns
+        if col_count == 0:
+            cols = st.columns(num_columns)
 
-#
+# Main app
 st.title("Daily News Scrap in Korea")
 
-if st.button("start"):
-    # Scrape the ranking news data
+if st.button("Start"):
     news_data = scrape_ranking_news()
-
-    # Lay out each outlet's articles five per row (5x5 grid)
-    num_columns = 5
-    for news in news_data:
-        col_count = 0
-        cols = st.columns(num_columns)
-
-        for index, news in enumerate(news_data):
-            with cols[col_count]:
-                st.image(news['image'])
-                st.write(f"**{news['rank']}위 - {news['office']}**")
-                st.write(f"[{news['title']}]({news['link']})")
-                st.write(f"🕒 {news['time']}")
-                col_count += 1
-
-            # Start a new row after printing five items
-            if col_count == num_columns:
-                col_count = 0
-                cols = st.columns(num_columns)
+    display_news(news_data)
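
The new safe_find helper is what makes the scraper tolerant of missing nodes: it returns None instead of raising when a tag or attribute is absent. A minimal offline sketch of that behavior, reusing the helper verbatim against a hand-written fragment; the HTML below is illustrative and only mimics the class names the scraper targets, not Naver's actual markup:

# Offline sanity check for safe_find (illustrative markup, not Naver's real page)
from bs4 import BeautifulSoup

def safe_find(element, selector, class_name, attribute=None):
    """Safely find elements and their attributes"""
    found = element.find(selector, class_=class_name)
    if found and attribute:
        return found.get(attribute)
    return found.text if found else None

html = """
<li>
  <em class="list_ranking_num">1</em>
  <a href="https://m.news.naver.com/article/001/0000000001">
    <strong class="list_title">Example headline</strong>
  </a>
  <span class="list_time">1시간전</span>
</li>
"""
article = BeautifulSoup(html, "html.parser").find("li")

print(safe_find(article, "em", "list_ranking_num"))   # expect: 1
print(safe_find(article, "strong", "list_title"))     # expect: Example headline
print(safe_find(article, "a", None, "href"))          # expect: the href value
print(safe_find(article, "span", "no_such_class"))    # expect: None, no exception

Note that passing class_name=None, as the scraper does for links, makes BeautifulSoup match tags without a class attribute, which is why the plain anchor above is found.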
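
display_news also replaces the old layout loop, which iterated news_data inside itself and re-rendered the grid once per article, with a single pass and modulo column cycling. A plain-Python sketch of just that arithmetic, with the Streamlit calls left out; the 12-article count is arbitrary, for illustration:

# Row/column cycling as in display_news: columns 0..4, then a fresh row
num_columns = 5
col_count = 0
row = 0
for i in range(12):  # pretend there are 12 articles
    print(f"article {i:2d} -> row {row}, column {col_count}")
    col_count = (col_count + 1) % num_columns
    if col_count == 0:  # same condition under which display_news recreates st.columns
        row += 1

Whenever col_count wraps back to 0, display_news calls st.columns again, which is what starts a fresh visual row of five articles.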