gunship999 commited on
Commit
8298dd3
Β·
verified Β·
1 Parent(s): e666451

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +74 -54
app.py CHANGED
@@ -4,74 +4,94 @@ import streamlit as st
4
  import time
5
  import random
6
 
7
- # νƒ€κ²Ÿ URL
8
  url = "https://m.news.naver.com/rankingList"
9
 
10
- # 헀더 μ„€μ • (User-Agent 및 Referer μ„€μ •)
11
  headers = {
12
  "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:92.0) Gecko/20100101 Firefox/92.0",
13
  "Referer": "https://m.news.naver.com/"
14
  }
15
 
16
- # 랜덀 λ”œλ ˆμ΄ μ„€μ • ν•¨μˆ˜
17
  def random_delay():
18
- delay = random.uniform(1, 3) # 1μ΄ˆμ—μ„œ 3초 μ‚¬μ΄μ˜ 랜덀 λ”œλ ˆμ΄
19
- time.sleep(delay)
20
 
21
- # μŠ€ν¬λž˜ν•‘ν•  데이터가 ν¬ν•¨λœ HTML μ˜μ—­ 선택
22
- def scrape_ranking_news():
23
- random_delay() # 랜덀 λ”œλ ˆμ΄ 적용
24
- response = requests.get(url, headers=headers)
25
- soup = BeautifulSoup(response.text, "html.parser")
 
26
 
27
- # μŠ€ν¬λž˜ν•‘ν•  데이터가 ν¬ν•¨λœ HTML μ˜μ—­ 선택
28
- ranking_news_sections = soup.find_all("div", class_="rankingnews_box")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
 
30
- news_list = []
31
- for section in ranking_news_sections:
32
- office_name = section.find("strong", class_="rankingnews_name").text # 언둠사λͺ… μΆ”μΆœ
33
- articles = section.find_all("li")
34
 
35
- for article in articles:
36
- rank = article.find("em", class_="list_ranking_num").text
37
- title = article.find("strong", class_="list_title").text
38
- time_posted = article.find("span", class_="list_time").text
39
- link = article.find("a")['href']
40
- image = article.find("img")['src']
 
 
 
 
41
 
42
- news_list.append({
43
- "rank": rank,
44
- "title": title,
45
- "time": time_posted,
46
- "link": link,
47
- "image": image,
48
- "office": office_name
49
- })
50
- return news_list
51
 
52
- # λŒ€μ œλͺ© μΆ”κ°€
53
  st.title("Daily News Scrap in Korea")
54
 
55
- # μ‹€ν–‰ λ²„νŠΌ
56
- if st.button("start"):
57
- # λž­ν‚Ή λ‰΄μŠ€ 데이터λ₯Ό μŠ€ν¬λž˜ν•‘
58
  news_data = scrape_ranking_news()
59
-
60
- # 5x5 ν˜•νƒœλ‘œ 같은 μ–Έλ‘ μ‚¬μ˜ 기사λ₯Ό ν•œ 쀄에 배치
61
- num_columns = 5
62
- for news in news_data:
63
- col_count = 0
64
- cols = st.columns(num_columns)
65
-
66
- for index, news in enumerate(news_data):
67
- with cols[col_count]:
68
- st.image(news['image'])
69
- st.write(f"**{news['rank']}μœ„ - {news['office']}**")
70
- st.write(f"[{news['title']}]({news['link']})")
71
- st.write(f"πŸ•’ {news['time']}")
72
- col_count += 1
73
-
74
- # 5개 좜λ ₯ ν›„ μƒˆλ‘œμš΄ ν–‰μœΌλ‘œ
75
- if col_count == num_columns:
76
- col_count = 0
77
- cols = st.columns(num_columns)
 
4
  import time
5
  import random
6
 
7
# Target: Naver mobile ranking-news page
url = "https://m.news.naver.com/rankingList"

# Browser-like headers (User-Agent + Referer) so the request is not
# rejected as an obvious bot.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:92.0) Gecko/20100101 Firefox/92.0",
    "Referer": "https://m.news.naver.com/",
}
15
 
 
def random_delay():
    """Sleep a random 1-3 seconds to avoid hammering the server."""
    pause = random.uniform(1, 3)
    time.sleep(pause)
18
 
def safe_find(element, selector, class_name, attribute=None):
    """Safely look up a child tag and return its text or an attribute.

    Args:
        element: a BeautifulSoup tag to search within.
        selector: tag name to find (e.g. "strong", "a").
        class_name: CSS class to filter by, or None for no class filter.
        attribute: if given, return this attribute's value instead of text.

    Returns:
        The tag's text (or the requested attribute value), or None when
        the tag / attribute is missing.

    BUG FIX: previously `class_=class_name` was always passed. In
    BeautifulSoup a filter value of None matches only tags WITHOUT a
    class attribute, so calls like safe_find(article, "a", None, "href")
    silently skipped any <a> that carries a class. Only apply the class
    filter when a class name is actually supplied.
    """
    if class_name is None:
        found = element.find(selector)
    else:
        found = element.find(selector, class_=class_name)
    if found and attribute:
        return found.get(attribute)
    return found.text if found else None
 
def scrape_ranking_news():
    """Scrape the Naver mobile ranking-news page.

    Returns:
        A list of dicts with keys: rank, title, time, link, image,
        office. Returns an empty list (after showing a Streamlit error)
        on any failure -- this function is the app's error boundary.
    """
    try:
        random_delay()
        # timeout prevents a dead/slow server from hanging the Streamlit
        # worker forever (requests has no default timeout)
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # surface HTTP 4xx/5xx as exceptions

        soup = BeautifulSoup(response.text, "html.parser")
        ranking_news_sections = soup.find_all("div", class_="rankingnews_box")

        news_list = []
        for section in ranking_news_sections:
            office_name = safe_find(section, "strong", "rankingnews_name")
            if not office_name:
                continue  # skip malformed sections instead of crashing

            for article in section.find_all("li"):
                # Safely extract all attributes
                rank = safe_find(article, "em", "list_ranking_num")
                title = safe_find(article, "strong", "list_title")
                time_posted = safe_find(article, "span", "list_time")
                link = safe_find(article, "a", None, "href")

                # Image is optional; a missing <img> must not drop the article
                img_tag = article.find("img")
                image = img_tag.get("src") if img_tag else None

                # Keep only articles whose required fields all parsed
                if all([rank, title, time_posted, link]):
                    news_list.append({
                        "rank": rank,
                        "title": title,
                        "time": time_posted,
                        "link": link,
                        "image": image,
                        "office": office_name,
                    })

        return news_list
    except Exception as e:  # top-level boundary: report in UI, never crash
        st.error(f"Error scraping news: {str(e)}")
        return []
 
def display_news(news_data, num_columns=5):
    """Render scraped articles as a grid, `num_columns` cards per row.

    Shows a warning and returns early when there is nothing to display.
    """
    if not news_data:
        st.warning("No news articles found.")
        return

    cols = st.columns(num_columns)
    slot = 0

    for item in news_data:
        with cols[slot]:
            # Image is optional and may 404; never let it break the page
            if item['image']:
                try:
                    st.image(item['image'])
                except Exception:
                    st.warning("Image unavailable")

            st.write(f"**{item['rank']}위 - {item['office']}**")
            st.write(f"[{item['title']}]({item['link']})")
            st.write(f"🕒 {item['time']}")

        # Advance to the next slot; start a fresh row after the last column
        slot = (slot + 1) % num_columns
        if slot == 0:
            cols = st.columns(num_columns)
 
# --- Main app ---
st.title("Daily News Scrap in Korea")

if st.button("Start"):
    # Scrape on demand and render the results
    display_news(scrape_ranking_news())