File size: 1,903 Bytes
25e1c5e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import requests
from bs4 import BeautifulSoup
import json
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import re
import os

# URL of the website to scrape
url = 'https://XXX.com'

# Configure Chrome to run without a visible window. The two extra flags are
# passed straight through to Chrome (typically needed in container/CI
# environments -- confirm for your deployment).
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')

# Path to the chromedriver executable.
# NOTE(review): this value is not passed to webdriver.Chrome below, so the
# driver is located by Selenium's default lookup -- confirm whether it should
# be wired in or removed.
chromedriver_path = '/usr/local/bin/chromedriver'

# Start the browser, then make sure it is always shut down -- even when
# loading the page fails -- so no orphaned Chrome process is left behind.
driver = webdriver.Chrome(options=chrome_options)
try:
    # Load the website
    driver.get(url)

    # Fixed delay to give client-side rendering time to finish
    time.sleep(5)

    # Extract the rendered HTML
    html = driver.page_source
finally:
    # Close the Chrome instance
    driver.quit()

# Parse the rendered HTML using BeautifulSoup's built-in parser
soup = BeautifulSoup(html, 'html.parser')

# Maps a heading path ("h1 text-h2 text-...") to the text of the sibling
# elements that follow that heading, up to the next h1-h5 sibling.
# NOTE: two headings that produce the same path overwrite each other.
data = {}

# Collect every h1-h5 heading in document order.
HEADING_TAGS = ['h1', 'h2', 'h3', 'h4', 'h5']
headings = soup.find_all(HEADING_TAGS)

# heading_path[i] holds the text of the most recent h(i+1) heading seen.
heading_path = [""] * len(HEADING_TAGS)

for heading in headings:
    depth = HEADING_TAGS.index(heading.name)
    heading_path[depth] = heading.text

    # A new heading starts a new section: forget any deeper headings from the
    # previous section so they cannot leak into later keys.
    for deeper in range(depth + 1, len(heading_path)):
        heading_path[deeper] = ""

    # The key is the full path from h1 down to this heading's own level
    # (h5 keys now include the h4 text, matching the other levels).
    key = "-".join(heading_path[:depth + 1])

    # Gather the text of following siblings until the next heading sibling.
    parts = []
    sibling = heading.find_next_sibling()
    while sibling is not None and sibling.name not in HEADING_TAGS:
        parts.append(sibling.text)
        sibling = sibling.find_next_sibling()

    data[key] = "".join(parts).strip()

print(len(data), data.keys())