"""Scrape anime metadata, cover images and reviews from MyAnimeList.

For every id in ``range(FIRST_ID, LAST_ID)`` the script opens
``https://myanimelist.net/anime/<id>/`` in Chrome, saves the cover image to
``images/<id>.jpg``, reads the description and the labelled sidebar rows,
opens the "Reviews" tab and writes everything to ``anime/<id>.json``.

Ids whose page shows the ``error404`` element get an empty JSON object so
they are not retried.  Ids that already have a JSON file are skipped.
"""

import json
import os
import random
import time
import urllib.request

from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

ANIME_DIR = "anime"
IMAGE_DIR = "images"
FIRST_ID = 3210
LAST_ID = 10000  # exclusive
# Pause (seconds) after each page is drawn from this tuple; add more values
# to introduce jitter.
PAGE_DELAY_CHOICES = (4,)
# Pause (seconds) before restarting the browser when a page has neither a
# "Reviews" link nor the 404 marker.
BACKOFF_SECONDS = 150

COVER_XPATH = "//div[@class='leftside']//a/img"
REVIEWS_LINK_XPATH = "//div[@id='horiznav_nav']//a[text() = 'Reviews']"


def _scraped_ids() -> set[int]:
    """Return the ids that already have a ``<id>.json`` file in ANIME_DIR."""
    ids = set()
    for name in os.listdir(ANIME_DIR):
        stem, ext = os.path.splitext(name)
        # Ignore stray files (e.g. .DS_Store) instead of crashing on int().
        if ext == ".json" and stem.isdigit():
            ids.add(int(stem))
    return ids


def _write_json(anime_id: int, payload: dict) -> None:
    """Serialize *payload* to ``anime/<anime_id>.json``."""
    path = os.path.join(ANIME_DIR, f"{anime_id}.json")
    with open(path, "w", encoding="utf-8") as outfile:
        json.dump(payload, outfile)


def _split_list(value: str) -> list[str]:
    """Split a comma-separated sidebar value into stripped items.

    An empty string yields ``['']``, matching the files written so far.
    """
    return [item.strip() for item in value.split(",")]


def _sidebar_field(driver, label: str, default=""):
    """Return the text of the row labelled ``<label>:`` without the label.

    *default* is returned when the row cannot be located or read.
    """
    try:
        row = driver.find_element(By.XPATH, f"//span[text() = '{label}:']/..")
        return row.text.replace(f"{label}:", "").strip()
    except WebDriverException:
        return default


def _download_cover(driver, anime_id: int) -> None:
    """Best-effort download of the cover image to ``images/<anime_id>.jpg``."""
    try:
        image = WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.XPATH, COVER_XPATH))
        )
        image_url = image.get_attribute("src")
        if not image_url:
            print(f"anime {anime_id}: cover image has no src")
            return
        urllib.request.urlretrieve(
            image_url, os.path.join(IMAGE_DIR, f"{anime_id}.jpg")
        )
    except (WebDriverException, OSError, ValueError) as exc:
        # A missing cover must not prevent the metadata from being scraped.
        print(f"anime {anime_id}: cover image not saved: {exc}")


def _read_details(driver) -> dict:
    """Read the description and sidebar rows of the currently loaded page.

    Rows that are absent come back as empty strings; list-valued rows are
    split on commas.  Keys are in the order they are written to JSON.
    """
    try:
        description = driver.find_element(
            By.XPATH, "//p[@itemprop='description']"
        ).text
    except WebDriverException:
        description = ""

    # Read these two before clicking the alternative-title toggle, as the
    # original script did.
    synonyms = _sidebar_field(driver, "Synonyms")
    japanese = _sidebar_field(driver, "Japanese")

    # The English title is only read after the toggle has been clicked
    # (presumably the row is hidden until then -- confirm against the page).
    try:
        driver.find_element(
            By.CLASS_NAME, "js-anime-toggle-alternative-title-button"
        ).click()
    except WebDriverException:
        english = ""
    else:
        english = _sidebar_field(driver, "English")

    anime_type = _sidebar_field(driver, "Type")
    episodes = _sidebar_field(driver, "Episodes")
    premiered = _sidebar_field(driver, "Premiered")
    broadcast = _sidebar_field(driver, "Broadcast")
    producers = _sidebar_field(driver, "Producers")
    licensors = _sidebar_field(driver, "Licensors")
    studios = _sidebar_field(driver, "Studios")
    source = _sidebar_field(driver, "Source")
    genres = _sidebar_field(driver, "Genres")

    # The row is labelled "Themes:" or "Theme:"; try both.
    themes = _sidebar_field(driver, "Themes", default=None)
    if themes is None:
        themes = _sidebar_field(driver, "Theme")

    demographic = _sidebar_field(driver, "Demographic")
    duration = _sidebar_field(driver, "Duration")
    rating = _sidebar_field(driver, "Rating")

    return {
        "synonyms": synonyms,
        "japanese": japanese,
        "english": english,
        "type": anime_type,
        "episodes": episodes,
        "premiered": premiered,
        "broadcast": broadcast,
        "producers": _split_list(producers),
        "licensors": _split_list(licensors),
        "studios": _split_list(studios),
        "source": _split_list(source),
        "genres": _split_list(genres),
        "themes": _split_list(themes),
        # Stripped like the other list fields (the original left padding on
        # every item after the first).
        "demographic": _split_list(demographic),
        "duration": duration,
        "rating": rating,
        "description": description,
    }


def _collect_reviews(driver) -> list[dict]:
    """Return ``{'sentiment', 'text'}`` dicts for each review on the page.

    Raises:
        WebDriverException: if a review lacks one of the expected child
            elements or the hidden part does not appear within 10 seconds.
    """
    reviews = []
    for element in driver.find_elements(By.CLASS_NAME, "js-review-element"):
        visible = element.find_element(By.CLASS_NAME, "text")
        sentiment = element.find_element(By.CLASS_NAME, "tag")
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "js-hidden"))
        )
        hidden = element.find_element(By.CLASS_NAME, "js-hidden")
        # ``textContent`` is used for the hidden part because ``.text`` only
        # returns text that is currently rendered.
        text = visible.text + hidden.get_attribute("textContent")
        reviews.append(
            {
                "sentiment": sentiment.text,
                "text": text.strip().replace("\n", " "),
            }
        )
    return reviews


def _pause() -> None:
    """Sleep for one of PAGE_DELAY_CHOICES seconds."""
    time.sleep(random.choice(PAGE_DELAY_CHOICES))


def _scrape_anime(driver, anime_id: int) -> bool:
    """Scrape the already-loaded page for *anime_id* and write its JSON.

    Nothing is written when the page has no reviews.

    Returns:
        ``False`` when the page had neither a "Reviews" link nor the 404
        marker, meaning the caller should back off and restart the browser;
        ``True`` in every other case.
    """
    _download_cover(driver, anime_id)
    details = _read_details(driver)
    time.sleep(2)

    try:
        link = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, REVIEWS_LINK_XPATH))
        )
        link.click()
    except WebDriverException as exc:
        print(exc)
        try:
            is_404 = bool(driver.find_elements(By.CLASS_NAME, "error404"))
        except WebDriverException as lookup_exc:
            print(lookup_exc)
            return False
        if is_404:
            # Record the miss so this id is not fetched again.
            _write_json(anime_id, {})
            return True
        print(f"anime {anime_id}: no Reviews link and no 404 marker")
        return False

    driver.execute_script("window.scrollTo(0, 0)")
    reviews = _collect_reviews(driver)
    if not reviews:
        return True

    _write_json(anime_id, {**details, "reviews": reviews})
    _pause()
    return True


def main() -> None:
    """Walk the configured id range and scrape every id not yet on disk."""
    os.makedirs(ANIME_DIR, exist_ok=True)
    os.makedirs(IMAGE_DIR, exist_ok=True)
    already_scraped = _scraped_ids()

    driver = webdriver.Chrome()
    try:
        for anime_id in range(FIRST_ID, LAST_ID):
            if anime_id in already_scraped:
                continue

            driver.get(f"https://myanimelist.net/anime/{anime_id}/")
            try:
                page_ok = _scrape_anime(driver, anime_id)
            except Exception as exc:  # per-page boundary: log and move on
                print(exc)
                _pause()
                continue

            if not page_ok:
                # quit() (not close()) so the chromedriver process is
                # terminated as well before a fresh session is started.
                driver.quit()
                driver = None
                time.sleep(BACKOFF_SECONDS)
                driver = webdriver.Chrome()
    finally:
        if driver is not None:
            driver.quit()


if __name__ == "__main__":
    main()