# Page-header residue from the site this file was copied from, kept for provenance:
#   miscjose's picture
#   Added additional files and source data
#   699b928
import json
import os
import random
import time
import urllib
import urllib.request

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
# Scraper for myanimelist.net anime pages.
#
# For every numeric id in a range the script loads
# https://myanimelist.net/anime/<id>/ in Chrome, tries to save the cover image
# to ./images/<id>.jpg, reads the sidebar fields, opens the "Reviews" tab and
# writes everything to anime/<id>.json.  A page that shows the `error404`
# element is recorded as an empty JSON object.  Ids whose review list comes
# back empty are skipped without writing a file.

FIRST_ID = 3210          # first anime id to visit
STOP_ID = 10000          # one past the last anime id to visit
PAUSE_CHOICES = (4,)     # seconds to pause after each id; one is picked at random
RESTART_DELAY = 150      # seconds to wait before relaunching the browser
ANIME_URL = "https://myanimelist.net/anime/{}/"
COVER_XPATH = "//div[@class='leftside']//a/img"
REVIEWS_TAB_XPATH = "//div[@id='horiznav_nav']//a[text() = 'Reviews']"

# (output key, sidebar labels tried in order) for the fields read after the
# alternative-title toggle has been handled.
SIDEBAR_FIELDS = (
    ('type', ('Type:',)),
    ('episodes', ('Episodes:',)),
    ('premiered', ('Premiered:',)),
    ('broadcast', ('Broadcast:',)),
    ('producers', ('Producers:',)),
    ('licensors', ('Licensors:',)),
    ('studios', ('Studios:',)),
    ('source', ('Source:',)),
    ('genres', ('Genres:',)),
    ('themes', ('Themes:', 'Theme:')),
    ('demographic', ('Demographic:',)),
    ('duration', ('Duration:',)),
    ('rating', ('Rating:',)),
)


def strip_label(raw: str, label: str) -> str:
    """Return *raw* with every occurrence of *label* removed, then trimmed."""
    return raw.replace(label, '').strip()


def split_list(value: str) -> list:
    """Split *value* on commas and trim each piece.

    An empty string yields ``['']`` (not ``[]``); this matches the shape of
    the JSON files the script has always produced.
    """
    return [piece.strip() for piece in value.split(',')]


def join_review_text(visible: str, hidden: str) -> str:
    """Concatenate the two halves of a review into one single-line string."""
    return (visible + hidden).strip().replace('\n', ' ')


def build_information(details: dict, reviews: list) -> dict:
    """Assemble the JSON payload for one anime.

    Args:
        details: raw field strings keyed by output name, as produced by
            ``read_details``.
        reviews: list of ``{'sentiment': ..., 'text': ...}`` dicts.

    Returns:
        A dict whose key order is the order written to disk.  The multi-valued
        fields are split on commas; ``demographic`` is split but, unlike the
        others, its pieces are not trimmed (kept for output compatibility).
    """
    return {
        'synonyms': details['synonyms'],
        'japanese': details['japanese'],
        'english': details['english'],
        'type': details['type'],
        'episodes': details['episodes'],
        'premiered': details['premiered'],
        'broadcast': details['broadcast'],
        'producers': split_list(details['producers']),
        'licensors': split_list(details['licensors']),
        'studios': split_list(details['studios']),
        'source': split_list(details['source']),
        'genres': split_list(details['genres']),
        'themes': split_list(details['themes']),
        'demographic': details['demographic'].split(','),
        'duration': details['duration'],
        'rating': details['rating'],
        'description': details['description'],
        'reviews': reviews,
    }


def read_field(driver, *labels) -> str:
    """Return the text of the first sidebar row found for one of *labels*.

    A row is the parent of a ``<span>`` whose text equals the label; the label
    itself is stripped from the returned text.  Returns ``''`` when no label
    can be read.
    """
    for label in labels:
        try:
            row = driver.find_element(By.XPATH, f"//span[text() = '{label}']/..")
            return strip_label(row.text, label)
        except Exception:
            # Best effort: a missing row is normal.  Exception (rather than a
            # bare except) lets Ctrl+C / SystemExit through.
            continue
    return ''


def download_cover(driver, anime_id) -> None:
    """Try to save the page's cover image as ./images/<anime_id>.jpg."""
    try:
        WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.XPATH, COVER_XPATH)))
        src = driver.find_element(By.XPATH, COVER_XPATH).get_attribute('src')
        urllib.request.urlretrieve(src, f"./images/{anime_id}.jpg")
    except Exception as error:
        # The image is optional; report the failure and carry on.
        print(f"cover image for {anime_id} not saved: {error}")


def read_details(driver) -> dict:
    """Read the description and sidebar fields of the currently loaded page.

    Every value is a string; a field that cannot be read is ``''``.
    """
    details = {}
    try:
        details['description'] = driver.find_element(
            By.XPATH, "//p[@itemprop='description']").text
    except Exception:
        details['description'] = ''

    details['synonyms'] = read_field(driver, 'Synonyms:')
    details['japanese'] = read_field(driver, 'Japanese:')

    # The toggle button is clicked before the English row is read; if the
    # click fails the English title is left empty.
    try:
        driver.find_element(
            By.CLASS_NAME, 'js-anime-toggle-alternative-title-button').click()
        english_row = driver.find_element(
            By.XPATH, "//span[text() = 'English:']/..")
        details['english'] = strip_label(english_row.text, 'English:')
    except Exception:
        details['english'] = ''

    for key, labels in SIDEBAR_FIELDS:
        details[key] = read_field(driver, *labels)
    return details


def open_reviews_tab(driver) -> None:
    """Wait for the "Reviews" navigation link and click it (raises on failure)."""
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, REVIEWS_TAB_XPATH)))
    driver.find_element(By.XPATH, REVIEWS_TAB_XPATH).click()


def collect_reviews(driver) -> list:
    """Return ``{'sentiment', 'text'}`` dicts for each review element on the page."""
    driver.execute_script("window.scrollTo(0, 0)")
    collected = []
    for element in driver.find_elements(By.CLASS_NAME, 'js-review-element'):
        visible = element.find_element(By.CLASS_NAME, 'text')
        sentiment = element.find_element(By.CLASS_NAME, 'tag')
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'js-hidden')))
        hidden = element.find_element(By.CLASS_NAME, 'js-hidden')
        # The hidden part is read via textContent rather than .text.
        text = join_review_text(visible.text, hidden.get_attribute('textContent'))
        collected.append({'sentiment': sentiment.text, 'text': text})
    return collected


def write_json(anime_id, payload) -> None:
    """Write *payload* to anime/<anime_id>.json."""
    with open(f"anime/{anime_id}.json", "w", encoding="utf-8") as outfile:
        json.dump(payload, outfile)


def main(first_id: int = FIRST_ID, stop_id: int = STOP_ID) -> None:
    """Scrape anime ids in ``range(first_id, stop_id)``.

    Errors while processing one id are printed and the loop moves on.  The
    browser is always shut down with ``quit()`` on the way out, including on
    Ctrl+C.
    """
    os.makedirs('anime', exist_ok=True)
    os.makedirs('images', exist_ok=True)

    driver = webdriver.Chrome()
    try:
        for anime_id in range(first_id, stop_id):
            pause = random.choice(PAUSE_CHOICES)
            driver.get(ANIME_URL.format(anime_id))
            try:
                download_cover(driver, anime_id)
                details = read_details(driver)
                time.sleep(2)

                try:
                    open_reviews_tab(driver)
                except Exception as tab_error:
                    print(tab_error)
                    try:
                        driver.find_element(By.CLASS_NAME, 'error404')
                    except Exception as probe_error:
                        # No 404 marker either -- presumably the site is
                        # blocking/throttling us (TODO confirm).  Restart the
                        # browser after a long pause.  quit() (not close())
                        # so the old session is fully torn down first.
                        print(probe_error)
                        driver.quit()
                        time.sleep(RESTART_DELAY)
                        driver = webdriver.Chrome()
                    else:
                        # Page does not exist: record an empty object.
                        write_json(anime_id, {})
                    continue

                reviews = collect_reviews(driver)
                if not reviews:
                    continue

                write_json(anime_id, build_information(details, reviews))
                time.sleep(pause)
            except Exception as error:
                print(error)
                time.sleep(pause)
    finally:
        driver.quit()


if __name__ == "__main__":
    main()