import json
import os
import random
import time
import urllib
import urllib.request

from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait


# Scrapes myanimelist.net anime pages one numeric ID at a time: saves the
# cover image to ./images/<id>.jpg and the sidebar metadata, description and
# reviews to ./anime/<id>.json.

ANIME_DIR = './anime'
IMAGE_DIR = './images'
ID_RANGE = range(3210, 10000)

# Pause (seconds) after a record has been written; one value is picked per ID.
DELAY_CHOICES = (4,)
# Pause (seconds) before restarting the browser when a page shows neither a
# Reviews tab nor the 404 marker.
BLOCKED_COOLDOWN = 150

IMAGE_XPATH = "//div[@class='leftside']//a/img"
REVIEWS_TAB_XPATH = "//div[@id='horiznav_nav']//a[text() = 'Reviews']"

# Sidebar labels read with a plain lookup after the English title, in order.
# 'Themes' is handled separately because the page may label it 'Theme'.
SIDEBAR_LABELS = (
    'Type', 'Episodes', 'Premiered', 'Broadcast', 'Producers', 'Licensors',
    'Studios', 'Source', 'Genres', 'Demographic', 'Duration', 'Rating',
)


class PageBlocked(Exception):
    """The page has no Reviews tab and is not the 404 page.

    NOTE(review): presumably rate limiting or a captcha; the script reacts by
    cooling down and restarting the browser.
    """


def scraped_ids(directory):
    """Return the set of integer IDs that already have ``<id>.json`` in *directory*.

    Files that are not ``.json`` or whose stem is not an integer are ignored
    instead of aborting the run with ``ValueError``.
    """
    ids = set()
    for name in os.listdir(directory):
        stem, ext = os.path.splitext(name)
        if ext != '.json':
            continue
        try:
            ids.add(int(stem))
        except ValueError:
            continue
    return ids


def split_list(value):
    """Split a comma-separated string into stripped parts.

    An empty string yields ``['']`` (not ``[]``); kept that way so new records
    match the ones already written by earlier runs.
    """
    return [part.strip() for part in value.split(',')]


def build_record(fields, description, reviews):
    """Assemble the JSON-serialisable record for one anime.

    *fields* maps the lower-cased sidebar label (``'synonyms'``, ``'type'``,
    ...) to its raw text; *reviews* is the list produced by
    ``collect_reviews``. Key order matches the historical output.
    """
    return {
        'synonyms': fields['synonyms'],
        'japanese': fields['japanese'],
        'english': fields['english'],
        'type': fields['type'],
        'episodes': fields['episodes'],
        'premiered': fields['premiered'],
        'broadcast': fields['broadcast'],
        'producers': split_list(fields['producers']),
        'licensors': split_list(fields['licensors']),
        'studios': split_list(fields['studios']),
        'source': split_list(fields['source']),
        'genres': split_list(fields['genres']),
        'themes': split_list(fields['themes']),
        # Historically split without stripping the parts; left unchanged.
        'demographic': fields['demographic'].split(','),
        'duration': fields['duration'],
        'rating': fields['rating'],
        'description': description,
        'reviews': reviews,
    }


def labelled_field(driver, label):
    """Return the text next to the ``<label>:`` span, or ``''`` if unavailable."""
    marker = f"{label}:"
    try:
        row = driver.find_element(By.XPATH, f"//span[text() = '{marker}']/..")
        return row.text.replace(marker, '').strip()
    except WebDriverException:
        return ''


def scrape_fields(driver):
    """Read every sidebar field of the current page into a dict of strings."""
    fields = {}
    # Synonyms/Japanese are read before the alternative-title toggle is
    # clicked, as the script has always done.
    for label in ('Synonyms', 'Japanese'):
        fields[label.lower()] = labelled_field(driver, label)

    try:
        driver.find_element(
            By.CLASS_NAME, 'js-anime-toggle-alternative-title-button'
        ).click()
    except WebDriverException:
        fields['english'] = ''
    else:
        fields['english'] = labelled_field(driver, 'English')

    for label in SIDEBAR_LABELS:
        fields[label.lower()] = labelled_field(driver, label)

    # Fall back to the singular 'Theme:' label when 'Themes:' is missing.
    fields['themes'] = (
        labelled_field(driver, 'Themes') or labelled_field(driver, 'Theme')
    )
    return fields


def scrape_description(driver):
    """Return the synopsis paragraph text, or ``''`` if it is not found."""
    try:
        return driver.find_element(
            By.XPATH, "//p[@itemprop='description']"
        ).text
    except WebDriverException:
        return ''


def download_cover(driver, anime_id):
    """Best-effort download of the cover image to ``./images/<id>.jpg``.

    Failures are reported and ignored so a missing image never blocks the
    metadata scrape.
    """
    try:
        image = WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.XPATH, IMAGE_XPATH))
        )
        src = image.get_attribute('src')
        urllib.request.urlretrieve(
            src, os.path.join(IMAGE_DIR, f"{anime_id}.jpg")
        )
    except (WebDriverException, OSError, ValueError, TypeError) as exc:
        # OSError covers URLError/HTTPError; TypeError/ValueError cover a
        # missing or malformed src attribute.
        print(f"cover image for {anime_id} not saved: {exc}")


def is_not_found_page(driver):
    """Return True when the current page contains an ``error404`` element."""
    try:
        # find_elements returns an empty list instead of raising when absent.
        return bool(driver.find_elements(By.CLASS_NAME, 'error404'))
    except WebDriverException as exc:
        print(exc)
        return False


def collect_reviews(driver):
    """Return ``[{'sentiment': ..., 'text': ...}, ...]`` for the reviews shown."""
    wait = WebDriverWait(driver, 10)
    collected = []
    for review in driver.find_elements(By.CLASS_NAME, 'js-review-element'):
        visible = review.find_element(By.CLASS_NAME, 'text')
        sentiment = review.find_element(By.CLASS_NAME, 'tag')
        wait.until(
            EC.presence_of_element_located((By.CLASS_NAME, 'js-hidden'))
        )
        hidden = review.find_element(By.CLASS_NAME, 'js-hidden')
        # textContent is read as an attribute rather than via .text,
        # presumably because the remainder of the review is collapsed.
        text = visible.text + hidden.get_attribute('textContent')
        collected.append({
            'sentiment': sentiment.text,
            'text': text.strip().replace('\n', ' '),
        })
    return collected


def scrape_anime(driver, anime_id):
    """Scrape the page already loaded in *driver* and write its JSON record.

    Returns True when a full record was written. Returns False when the page
    is a 404 (an empty ``{}`` marker file is written) or has no reviews
    (nothing is written). Raises ``PageBlocked`` when the Reviews tab is
    missing on a page that is not a 404.
    """
    record_path = os.path.join(ANIME_DIR, f"{anime_id}.json")

    download_cover(driver, anime_id)
    description = scrape_description(driver)
    fields = scrape_fields(driver)

    time.sleep(2)

    try:
        reviews_tab = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, REVIEWS_TAB_XPATH))
        )
        reviews_tab.click()
    except WebDriverException as exc:
        print(exc)
        if is_not_found_page(driver):
            # Empty marker so the ID counts as done on later runs.
            with open(record_path, "w") as outfile:
                json.dump({}, outfile)
            return False
        raise PageBlocked(f"anime {anime_id}: no Reviews tab") from exc

    driver.execute_script("window.scrollTo(0, 0)")

    reviews = collect_reviews(driver)
    if not reviews:
        return False

    with open(record_path, "w") as outfile:
        json.dump(build_record(fields, description, reviews), outfile)
    return True


def quit_quietly(driver):
    """Shut the browser and its driver process down, reporting any failure."""
    try:
        # quit() (unlike close()) also terminates the chromedriver process.
        driver.quit()
    except Exception as exc:  # cleanup boundary: never mask the real error
        print(exc)


def main():
    """Scrape every ID in ``ID_RANGE`` that has no JSON file yet."""
    os.makedirs(ANIME_DIR, exist_ok=True)
    os.makedirs(IMAGE_DIR, exist_ok=True)

    # IDs skipped after a block or an error have no file, so a later run
    # picks them up again while finished ones are not re-downloaded.
    existing = scraped_ids(ANIME_DIR)

    driver = webdriver.Chrome()
    try:
        for i in ID_RANGE:
            if i in existing:
                continue

            delay = random.choice(DELAY_CHOICES)
            try:
                driver.get(f"https://myanimelist.net/anime/{i}/")
                saved = scrape_anime(driver, i)
            except PageBlocked as exc:
                print(exc)
                quit_quietly(driver)
                time.sleep(BLOCKED_COOLDOWN)
                driver = webdriver.Chrome()
                continue
            except Exception as exc:  # top-level boundary: log and move on
                print(exc)
                time.sleep(delay)
                continue

            if saved:
                time.sleep(delay)
    finally:
        quit_quietly(driver)


if __name__ == "__main__":
    main()