miscjose committed on
Commit
699b928
·
1 Parent(s): 125041f

Added additional files and source data

Browse files
Files changed (5) hide show
  1. .gitignore +0 -3
  2. app.py +0 -1
  3. app_test.ipynb +253 -0
  4. embed_test.ipynb +142 -0
  5. scrape.py +184 -0
.gitignore CHANGED
@@ -1,4 +1 @@
1
  .venv
2
- scrape.py
3
- app_test.ipynb
4
- embed_test.ipynb
 
1
  .venv
 
 
 
app.py CHANGED
@@ -73,7 +73,6 @@ def get_recommendation(query, number_of_recommendations, genres, themes, rating,
73
  name = data['japanese']
74
  else:
75
  name = data['english']
76
- english = data['english']
77
  description = data['description']
78
  review = data['reviews'][review_index]['text']
79
  image = data['image']
 
73
  name = data['japanese']
74
  else:
75
  name = data['english']
 
76
  description = data['description']
77
  review = data['reviews'][review_index]['text']
78
  image = data['image']
app_test.ipynb ADDED
@@ -0,0 +1,253 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 7,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "name": "stderr",
10
+ "output_type": "stream",
11
+ "text": [
12
+ "c:\\Users\\Jose\\Desktop\\Nuanced_Recommendation_System\\.venv\\Lib\\site-packages\\transformers\\tokenization_utils_base.py:1617: FutureWarning: `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be deprecated in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884\n",
13
+ " warnings.warn(\n"
14
+ ]
15
+ }
16
+ ],
17
+ "source": [
18
+ "import os\n",
19
+ "import json\n",
20
+ "\n",
21
+ "from sentence_transformers import SentenceTransformer\n",
22
+ "from sklearn.metrics.pairwise import cosine_similarity\n",
23
+ "\n",
24
+ "model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')\n",
25
+ "\n",
26
+ "import gradio as gr"
27
+ ]
28
+ },
29
+ {
30
+ "cell_type": "code",
31
+ "execution_count": 8,
32
+ "metadata": {},
33
+ "outputs": [],
34
+ "source": [
35
+ "def get_n_weighted_scores(embeddings, query, n, objective_weight, subjective_weight):\n",
36
+ " query = [model.encode(query)]\n",
37
+ "\n",
38
+ " weighted_scores = []\n",
39
+ "\n",
40
+ " for key, value in embeddings.items():\n",
41
+ " objective_embedding = value['objective_embedding']\n",
42
+ " subjective_embeddings = value['subjective_embeddings']\n",
43
+ " \n",
44
+ " objective_score = cosine_similarity(query, objective_embedding).item()\n",
45
+ " subjective_scores = cosine_similarity(query, subjective_embeddings)\n",
46
+ "\n",
47
+ " max_score = 0\n",
48
+ " max_review_index = 0\n",
49
+ " for idx, score in enumerate(subjective_scores[0].tolist()):\n",
50
+ " weighted_score = ((objective_score * objective_weight)+(score * subjective_weight))\n",
51
+ " if weighted_score > max_score:\n",
52
+ " max_score = weighted_score\n",
53
+ " max_review_index = idx\n",
54
+ " \n",
55
+ " weighted_scores.append((key, max_score, max_review_index))\n",
56
+ " \n",
57
+ " return sorted(weighted_scores, key=lambda x: x[1], reverse=True)[:n]"
58
+ ]
59
+ },
60
+ {
61
+ "cell_type": "code",
62
+ "execution_count": 9,
63
+ "metadata": {},
64
+ "outputs": [],
65
+ "source": [
66
+ "def filter_anime(embeddings, genres, themes, rating):\n",
67
+ " genres = set(genres)\n",
68
+ " themes = set(themes)\n",
69
+ " rating = set(rating)\n",
70
+ "\n",
71
+ " filtered_anime = embeddings.copy()\n",
72
+ " for key, anime in embeddings.items():\n",
73
+ "\n",
74
+ " anime_genres = set(anime['genres'])\n",
75
+ " anime_themes = set(anime['themes'])\n",
76
+ " anime_rating = set([anime['rating']])\n",
77
+ "\n",
78
+ " if genres.intersection(anime_genres) or 'ALL' in genres:\n",
79
+ " pass\n",
80
+ " else:\n",
81
+ " filtered_anime.pop(key)\n",
82
+ " continue\n",
83
+ " if themes.intersection(anime_themes) or 'ALL' in themes:\n",
84
+ " pass\n",
85
+ " else:\n",
86
+ " filtered_anime.pop(key)\n",
87
+ " continue\n",
88
+ " if rating.intersection(anime_rating) or 'ALL' in rating:\n",
89
+ " pass\n",
90
+ " else:\n",
91
+ " filtered_anime.pop(key)\n",
92
+ " continue\n",
93
+ " \n",
94
+ " return filtered_anime"
95
+ ]
96
+ },
97
+ {
98
+ "cell_type": "code",
99
+ "execution_count": 10,
100
+ "metadata": {},
101
+ "outputs": [],
102
+ "source": [
103
+ "with open('./embeddings/data.json') as f:\n",
104
+ " data = json.load(f)\n",
105
+ " embeddings = data['embeddings']\n",
106
+ " filters = data['filters']"
107
+ ]
108
+ },
109
+ {
110
+ "cell_type": "code",
111
+ "execution_count": 15,
112
+ "metadata": {},
113
+ "outputs": [],
114
+ "source": [
115
+ "def get_recommendation(query, number_of_recommendations, genres, themes, rating, objective_weight, subjective_weight):\n",
116
+ " filtered_anime = filter_anime(embeddings, genres, themes, rating)\n",
117
+ " results = []\n",
118
+ " weighted_scores = get_n_weighted_scores(filtered_anime, query, number_of_recommendations, float(objective_weight), float(subjective_weight))\n",
119
+ " for idx, (key, score, review_index) in enumerate(weighted_scores, start=1):\n",
120
+ " data = embeddings[key]\n",
121
+ " if not data['english']:\n",
122
+ " name = data['japanese']\n",
123
+ " else:\n",
124
+ " name = data['english']\n",
125
+ " description = data['description']\n",
126
+ " review = data['reviews'][review_index]['text']\n",
127
+ " image = data['image']\n",
128
+ "\n",
129
+ " results.append(gr.Image(label=f\"Recommendation {idx}: {name}\",value=image, height=435, width=500, visible=True))\n",
130
+ " results.append(gr.Textbox(label=f\"Synopsis\", value=description, max_lines=7, visible=True))\n",
131
+ " results.append(gr.Textbox(label=f\"Most Relevant User Review\",value=review, max_lines=7, visible=True))\n",
132
+ "\n",
133
+ " for _ in range(10-number_of_recommendations):\n",
134
+ " results.append(gr.Image(visible=False))\n",
135
+ " results.append(gr.Textbox(visible=False))\n",
136
+ " results.append(gr.Textbox(visible=False))\n",
137
+ " \n",
138
+ " return results"
139
+ ]
140
+ },
141
+ {
142
+ "cell_type": "code",
143
+ "execution_count": 16,
144
+ "metadata": {},
145
+ "outputs": [
146
+ {
147
+ "name": "stdout",
148
+ "output_type": "stream",
149
+ "text": [
150
+ "Running on local URL: http://127.0.0.1:7863\n",
151
+ "\n",
152
+ "To create a public link, set `share=True` in `launch()`.\n"
153
+ ]
154
+ },
155
+ {
156
+ "data": {
157
+ "text/html": [
158
+ "<div><iframe src=\"http://127.0.0.1:7863/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
159
+ ],
160
+ "text/plain": [
161
+ "<IPython.core.display.HTML object>"
162
+ ]
163
+ },
164
+ "metadata": {},
165
+ "output_type": "display_data"
166
+ },
167
+ {
168
+ "name": "stderr",
169
+ "output_type": "stream",
170
+ "text": [
171
+ "c:\\Users\\Jose\\Desktop\\Nuanced_Recommendation_System\\.venv\\Lib\\site-packages\\gradio\\analytics.py:106: UserWarning: IMPORTANT: You are using gradio version 4.44.1, however version 5.0.1 is available, please upgrade. \n",
172
+ "--------\n",
173
+ " warnings.warn(\n"
174
+ ]
175
+ }
176
+ ],
177
+ "source": [
178
+ "with gr.Blocks(theme=gr.themes.Soft(primary_hue='red')) as demo:\n",
179
+ " with gr.Row():\n",
180
+ " with gr.Column():\n",
181
+ " gr.Markdown(\n",
182
+ " '''\n",
183
+ " # Welcome to the Nuanced Recommendation System!\n",
184
+ " ### This system **combines** both objective (synopsis, episode count, themes) and subjective (user reviews) data, in order to recommend the most approprate anime. Feel free to refine using the **optional** filters below! \n",
185
+ " '''\n",
186
+ " )\n",
187
+ " with gr.Column():\n",
188
+ " pass\n",
189
+ " \n",
190
+ "\n",
191
+ " with gr.Row():\n",
192
+ " with gr.Column() as input_col:\n",
193
+ " query = gr.Textbox(label=\"What are you looking for?\")\n",
194
+ " number_of_recommendations = gr.Slider(label= \"# of Recommendations\", minimum=1, maximum=10, value=3, step=1)\n",
195
+ " genres = gr.Dropdown(label='Genres',multiselect=True,choices=filters['genres'], value=['ALL'])\n",
196
+ " themes = gr.Dropdown(label='Themes',multiselect=True,choices=filters['themes'], value=['ALL'])\n",
197
+ " rating = gr.Dropdown(label='Rating',multiselect=True,choices=filters['rating'], value=['ALL'])\n",
198
+ " objective_weight = gr.Slider(label= \"Objective Weight\", minimum=0, maximum=1, value=.5, step=.1)\n",
199
+ " subjective_weight = gr.Slider(label= \"Subjective Weight\", minimum=0, maximum=1, value=.5, step=.1)\n",
200
+ " submit_btn = gr.Button(\"Submit\")\n",
201
+ "\n",
202
+ " examples = gr.Examples(\n",
203
+ " examples=[\n",
204
+ " ['A sci-fi anime set in a future where AI and robots have become self-aware', 3, ['Action', 'Sci-Fi', 'Fantasy'], ['ALL'], ['PG-13 - Teens 13 or older'], .8, .2],\n",
205
+ " ['An anime where a group of students form a band, and the story focuses on their personal growth and struggles with adulthood', 5, ['ALL'], ['Music'], ['PG-13 - Teens 13 or older', 'R - 17+ (violence & profanity)'], .3, .7],\n",
206
+ " ['An anime where the main character starts as a villain but slowly redeems themselves', 3, ['Suspense', 'Action'], ['ALL'], ['PG-13 - Teens 13 or older', 'R - 17+ (violence & profanity)'], .2, .8],\n",
207
+ " ],\n",
208
+ " inputs=[query, number_of_recommendations, genres, themes, rating, objective_weight, subjective_weight],\n",
209
+ " )\n",
210
+ "\n",
211
+ " outputs = []\n",
212
+ " with gr.Column():\n",
213
+ " for i in range(10):\n",
214
+ " with gr.Row():\n",
215
+ " with gr.Column():\n",
216
+ " outputs.append(gr.Image(height=435, width=500, visible=False))\n",
217
+ " with gr.Column():\n",
218
+ " outputs.append(gr.Textbox(max_lines=7, visible=False))\n",
219
+ " outputs.append(gr.Textbox(max_lines=7, visible=False))\n",
220
+ " \n",
221
+ "\n",
222
+ " submit_btn.click(\n",
223
+ " get_recommendation,\n",
224
+ " [query, number_of_recommendations, genres, themes, rating, objective_weight, subjective_weight],\n",
225
+ " outputs\n",
226
+ " )\n",
227
+ "\n",
228
+ " demo.launch()"
229
+ ]
230
+ }
231
+ ],
232
+ "metadata": {
233
+ "kernelspec": {
234
+ "display_name": ".venv",
235
+ "language": "python",
236
+ "name": "python3"
237
+ },
238
+ "language_info": {
239
+ "codemirror_mode": {
240
+ "name": "ipython",
241
+ "version": 3
242
+ },
243
+ "file_extension": ".py",
244
+ "mimetype": "text/x-python",
245
+ "name": "python",
246
+ "nbconvert_exporter": "python",
247
+ "pygments_lexer": "ipython3",
248
+ "version": "3.12.7"
249
+ }
250
+ },
251
+ "nbformat": 4,
252
+ "nbformat_minor": 2
253
+ }
embed_test.ipynb ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 2,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "name": "stderr",
10
+ "output_type": "stream",
11
+ "text": [
12
+ "c:\\Users\\Jose\\Desktop\\Nuanced_Recommendation_System\\.venv\\Lib\\site-packages\\sentence_transformers\\cross_encoder\\CrossEncoder.py:13: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
13
+ " from tqdm.autonotebook import tqdm, trange\n",
14
+ "c:\\Users\\Jose\\Desktop\\Nuanced_Recommendation_System\\.venv\\Lib\\site-packages\\transformers\\tokenization_utils_base.py:1617: FutureWarning: `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be deprecated in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884\n",
15
+ " warnings.warn(\n"
16
+ ]
17
+ }
18
+ ],
19
+ "source": [
20
+ "import os\n",
21
+ "import json\n",
22
+ "\n",
23
+ "from sentence_transformers import SentenceTransformer\n",
24
+ "\n",
25
+ "model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')"
26
+ ]
27
+ },
28
+ {
29
+ "cell_type": "code",
30
+ "execution_count": 3,
31
+ "metadata": {},
32
+ "outputs": [],
33
+ "source": [
34
+ "def update_filters(filters, data):\n",
35
+ " for key, value in filters.items():\n",
36
+ " dont_add = ['Boys Love', 'Erotica', 'Girls Love', 'Hentai', 'Ecchi', 'Gore', 'Crossdressing', 'Magical Sex Shift', 'Rx - Hentai', 'R+ - Mild Nudity']\n",
37
+ " if data[key] and key == 'rating':\n",
38
+ " if data[key] not in dont_add:\n",
39
+ " value.add(data[key])\n",
40
+ " else:\n",
41
+ " for val in data[key]:\n",
42
+ " if val and val not in dont_add:\n",
43
+ " value.add(val)\n",
44
+ " return filters"
45
+ ]
46
+ },
47
+ {
48
+ "cell_type": "code",
49
+ "execution_count": null,
50
+ "metadata": {},
51
+ "outputs": [],
52
+ "source": [
53
+ "def clean_filters(filters):\n",
54
+ " for key, val in filters.items():\n",
55
+ " val.add('ALL')\n",
56
+ " filters[key] = list(val)\n",
57
+ " return filters"
58
+ ]
59
+ },
60
+ {
61
+ "cell_type": "code",
62
+ "execution_count": null,
63
+ "metadata": {},
64
+ "outputs": [],
65
+ "source": [
66
+ "print('Embedding Started')\n",
67
+ "\n",
68
+ "filters = {\n",
69
+ "'genres': set(),\n",
70
+ "'themes': set(),\n",
71
+ "'rating': set()\n",
72
+ "}\n",
73
+ "\n",
74
+ "embeddings = {}\n",
75
+ "for name in os.listdir('./anime'):\n",
76
+ " with open(f\"./anime/{name}\", 'r') as file:\n",
77
+ " data = json.load(file)\n",
78
+ "\n",
79
+ " if not data:\n",
80
+ " continue\n",
81
+ "\n",
82
+ " filters = update_filters(filters, data)\n",
83
+ "\n",
84
+ " name = name.replace('.json', '')\n",
85
+ " \n",
86
+ " data['image'] = f\"./images/{name}.jpg\"\n",
87
+ "\n",
88
+ " text = f'''Episodes: {data['episodes']} \n",
89
+ " Premiered: {data['premiered']} \n",
90
+ " Broadcast: {data['broadcast']} \n",
91
+ " Producers: {' '.join(data['producers'])} \n",
92
+ " Licensors: {' '.join(data['licensors'])} \n",
93
+ " Studios: {' '.join(data['studios'])} \n",
94
+ " Source: {' '.join(data['source'])} \n",
95
+ " Genres: {' '.join(data['genres'])} \n",
96
+ " Themes: {' '.join(data['themes'])} \n",
97
+ " Demographic: {data['demographic']} \n",
98
+ " Duration: {data['duration']} \n",
99
+ " Rating: {data['rating']} \n",
100
+ " Description: {data['description']}'''\n",
101
+ " \n",
102
+ " embeddings[name] = data.copy()\n",
103
+ " \n",
104
+ " embeddings[name]['objective_embedding'] = [model.encode(text).tolist()]\n",
105
+ " subjective_embeddings = []\n",
106
+ " for review in embeddings[name]['reviews']:\n",
107
+ " text = review['text']\n",
108
+ " subjective_embeddings.append(model.encode(text).tolist())\n",
109
+ " data['review'] = text\n",
110
+ " embeddings[name]['subjective_embeddings'] = subjective_embeddings\n",
111
+ "\n",
112
+ "filters = clean_filters(filters)\n",
113
+ "\n",
114
+ "with open('./embeddings/data.json', 'w') as f:\n",
115
+ " json.dump({'embeddings':embeddings, 'filters': filters}, f)\n",
116
+ "\n",
117
+ "print('Embedding Complete')"
118
+ ]
119
+ }
120
+ ],
121
+ "metadata": {
122
+ "kernelspec": {
123
+ "display_name": ".venv",
124
+ "language": "python",
125
+ "name": "python3"
126
+ },
127
+ "language_info": {
128
+ "codemirror_mode": {
129
+ "name": "ipython",
130
+ "version": 3
131
+ },
132
+ "file_extension": ".py",
133
+ "mimetype": "text/x-python",
134
+ "name": "python",
135
+ "nbconvert_exporter": "python",
136
+ "pygments_lexer": "ipython3",
137
+ "version": "3.12.7"
138
+ }
139
+ },
140
+ "nbformat": 4,
141
+ "nbformat_minor": 2
142
+ }
scrape.py ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from selenium import webdriver
2
+ from selenium.webdriver.common.keys import Keys
3
+ from selenium.webdriver.common.by import By
4
+ from selenium.webdriver.support.wait import WebDriverWait
5
+ from selenium.webdriver.support import expected_conditions as EC
6
+
7
+ import urllib
8
+
9
+ import time
10
+ import os
11
+ import json
12
+ import random
13
+
14
+ driver = webdriver.Chrome()
15
+
16
+ existing = set([int(name.replace('.json', '')) for name in os.listdir('./anime')])
17
+ # total = set(range(100000))
18
+ # difference = total.difference(existing)
19
+ difference = range(3210, 10000)
20
+
21
+ for i in difference:
22
+ random_num = random.choice([4])
23
+
24
+ driver.get(f"https://myanimelist.net/anime/{i}/")
25
+
26
+ try:
27
+ try:
28
+ wait = WebDriverWait(driver, 5)
29
+ wait.until(EC.presence_of_element_located((By.XPATH, "//div[@class='leftside']//a/img")))
30
+ image = driver.find_element(By.XPATH, "//div[@class='leftside']//a/img").get_attribute('src')
31
+ urllib.request.urlretrieve(image, f"./images/{i}.jpg")
32
+ except:
33
+ pass
34
+ try:
35
+ body = driver.find_element(By.TAG_NAME, 'body')
36
+ except:
37
+ body = ''
38
+ try:
39
+ description = driver.find_element(By.XPATH, "//p[@itemprop='description']").text
40
+ except:
41
+ description = ''
42
+ try:
43
+ synonyms = driver.find_element(By.XPATH, "//span[text() = 'Synonyms:']/..").text.replace('Synonyms:', '').strip()
44
+ except:
45
+ synonyms = ''
46
+ try:
47
+ japanese = driver.find_element(By.XPATH, "//span[text() = 'Japanese:']/..").text.replace('Japanese:', '').strip()
48
+ except:
49
+ japanese = ''
50
+ try:
51
+ driver.find_element(By.CLASS_NAME, 'js-anime-toggle-alternative-title-button').click()
52
+ english = driver.find_element(By.XPATH, "//span[text() = 'English:']/..").text.replace('English:', '').strip()
53
+ except:
54
+ english = ''
55
+ try:
56
+ type = driver.find_element(By.XPATH, "//span[text() = 'Type:']/..").text.replace('Type:', '').strip()
57
+ except:
58
+ type = ''
59
+
60
+ try:
61
+ episodes = driver.find_element(By.XPATH, "//span[text() = 'Episodes:']/..").text.replace('Episodes:', '').strip()
62
+ except:
63
+ episodes = ''
64
+ try:
65
+ premiered = driver.find_element(By.XPATH, "//span[text() = 'Premiered:']/..").text.replace('Premiered:', '').strip()
66
+ except:
67
+ premiered = ''
68
+ try:
69
+ broadcast = driver.find_element(By.XPATH, "//span[text() = 'Broadcast:']/..").text.replace('Broadcast:', '').strip()
70
+ except:
71
+ broadcast = ''
72
+ try:
73
+ producers = driver.find_element(By.XPATH, "//span[text() = 'Producers:']/..").text.replace('Producers:', '').strip()
74
+ except:
75
+ producers = ''
76
+ try:
77
+ licensors = driver.find_element(By.XPATH, "//span[text() = 'Licensors:']/..").text.replace('Licensors:', '').strip()
78
+ except:
79
+ licensors = ''
80
+ try:
81
+ studios = driver.find_element(By.XPATH, "//span[text() = 'Studios:']/..").text.replace('Studios:', '').strip()
82
+ except:
83
+ studios = ''
84
+ try:
85
+ source = driver.find_element(By.XPATH, "//span[text() = 'Source:']/..").text.replace('Source:', '').strip()
86
+ except:
87
+ source = ''
88
+
89
+ try:
90
+ genres = driver.find_element(By.XPATH, "//span[text() = 'Genres:']/..").text.replace('Genres:', '').strip()
91
+ except:
92
+ genres = ''
93
+ try:
94
+ themes = driver.find_element(By.XPATH, "//span[text() = 'Themes:']/..").text.replace('Themes:', '').strip()
95
+ except:
96
+ try:
97
+ themes = driver.find_element(By.XPATH, "//span[text() = 'Theme:']/..").text.replace('Theme:', '').strip()
98
+ except:
99
+ themes = ''
100
+
101
+ try:
102
+ demographic = driver.find_element(By.XPATH, "//span[text() = 'Demographic:']/..").text.replace('Demographic:', '').strip()
103
+ except:
104
+ demographic = ''
105
+ try:
106
+ duration = driver.find_element(By.XPATH, "//span[text() = 'Duration:']/..").text.replace('Duration:', '').strip()
107
+ except:
108
+ duration = ''
109
+ try:
110
+ rating = driver.find_element(By.XPATH, "//span[text() = 'Rating:']/..").text.replace('Rating:', '').strip()
111
+ except:
112
+ rating = ''
113
+
114
+ time.sleep(2)
115
+
116
+ try:
117
+ wait = WebDriverWait(driver, 10)
118
+ wait.until(EC.presence_of_element_located((By.XPATH, "//div[@id='horiznav_nav']//a[text() = 'Reviews']")))
119
+ driver.find_element(By.XPATH, "//div[@id='horiznav_nav']//a[text() = 'Reviews']").click()
120
+ except Exception as e:
121
+ print(e)
122
+ try:
123
+ driver.find_element(By.CLASS_NAME, 'error404')
124
+ with open(f"anime/{i}.json", "w") as outfile:
125
+ json.dump({}, outfile)
126
+ continue
127
+ except Exception as e:
128
+ print(e)
129
+ driver.close()
130
+ time.sleep(150)
131
+ driver = webdriver.Chrome()
132
+ continue
133
+ driver.execute_script("window.scrollTo(0, 0)")
134
+
135
+ data = []
136
+ reviews = driver.find_elements(By.CLASS_NAME, 'js-review-element')
137
+ for review in reviews:
138
+ visible = review.find_element(By.CLASS_NAME, 'text')
139
+ sentiment = review.find_element(By.CLASS_NAME, 'tag')
140
+ wait = WebDriverWait(driver, 10)
141
+ wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'js-hidden')))
142
+ hidden = review.find_element(By.CLASS_NAME, 'js-hidden')
143
+ text = visible.text + hidden.get_attribute('textContent')
144
+ text = text.strip().replace('\n', ' ')
145
+ data.append({
146
+ 'sentiment': sentiment.text,
147
+ 'text': text
148
+ })
149
+
150
+ reviews = data
151
+
152
+ if not reviews:
153
+ continue
154
+
155
+ information = {
156
+ 'synonyms': synonyms,
157
+ 'japanese': japanese,
158
+ 'english': english,
159
+ 'type': type,
160
+ 'episodes': episodes,
161
+ 'premiered': premiered,
162
+ 'broadcast': broadcast,
163
+ 'producers': [x.strip() for x in producers.split(',')],
164
+ 'licensors': [x.strip() for x in licensors.split(',')],
165
+ 'studios': [x.strip() for x in studios.split(',')],
166
+ 'source': [x.strip() for x in source.split(',')],
167
+ 'genres': [x.strip() for x in genres.split(',')],
168
+ 'themes': [x.strip() for x in themes.split(',')],
169
+ 'demographic': demographic.split(','),
170
+ 'duration': duration,
171
+ 'rating': rating,
172
+ 'description': description,
173
+ 'reviews': reviews
174
+ }
175
+
176
+ with open(f"anime/{i}.json", "w") as outfile:
177
+ json.dump(information, outfile)
178
+ time.sleep(random_num)
179
+ except Exception as e:
180
+ print(e)
181
+ time.sleep(random_num)
182
+
183
+
184
+ driver.close()