Added additional files and source data
Browse files- .gitignore +0 -3
- app.py +0 -1
- app_test.ipynb +253 -0
- embed_test.ipynb +142 -0
- scrape.py +184 -0
.gitignore
CHANGED
@@ -1,4 +1 @@
|
|
1 |
.venv
|
2 |
-
scrape.py
|
3 |
-
app_test.ipynb
|
4 |
-
embed_test.ipynb
|
|
|
1 |
.venv
|
|
|
|
|
|
app.py
CHANGED
@@ -73,7 +73,6 @@ def get_recommendation(query, number_of_recommendations, genres, themes, rating,
|
|
73 |
name = data['japanese']
|
74 |
else:
|
75 |
name = data['english']
|
76 |
-
english = data['english']
|
77 |
description = data['description']
|
78 |
review = data['reviews'][review_index]['text']
|
79 |
image = data['image']
|
|
|
73 |
name = data['japanese']
|
74 |
else:
|
75 |
name = data['english']
|
|
|
76 |
description = data['description']
|
77 |
review = data['reviews'][review_index]['text']
|
78 |
image = data['image']
|
app_test.ipynb
ADDED
@@ -0,0 +1,253 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 7,
|
6 |
+
"metadata": {},
|
7 |
+
"outputs": [
|
8 |
+
{
|
9 |
+
"name": "stderr",
|
10 |
+
"output_type": "stream",
|
11 |
+
"text": [
|
12 |
+
"c:\\Users\\Jose\\Desktop\\Nuanced_Recommendation_System\\.venv\\Lib\\site-packages\\transformers\\tokenization_utils_base.py:1617: FutureWarning: `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be deprecated in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884\n",
|
13 |
+
" warnings.warn(\n"
|
14 |
+
]
|
15 |
+
}
|
16 |
+
],
|
17 |
+
"source": [
|
18 |
+
"import os\n",
|
19 |
+
"import json\n",
|
20 |
+
"\n",
|
21 |
+
"from sentence_transformers import SentenceTransformer\n",
|
22 |
+
"from sklearn.metrics.pairwise import cosine_similarity\n",
|
23 |
+
"\n",
|
24 |
+
"model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')\n",
|
25 |
+
"\n",
|
26 |
+
"import gradio as gr"
|
27 |
+
]
|
28 |
+
},
|
29 |
+
{
|
30 |
+
"cell_type": "code",
|
31 |
+
"execution_count": 8,
|
32 |
+
"metadata": {},
|
33 |
+
"outputs": [],
|
34 |
+
"source": [
|
35 |
+
"def get_n_weighted_scores(embeddings, query, n, objective_weight, subjective_weight):\n",
|
36 |
+
" query = [model.encode(query)]\n",
|
37 |
+
"\n",
|
38 |
+
" weighted_scores = []\n",
|
39 |
+
"\n",
|
40 |
+
" for key, value in embeddings.items():\n",
|
41 |
+
" objective_embedding = value['objective_embedding']\n",
|
42 |
+
" subjective_embeddings = value['subjective_embeddings']\n",
|
43 |
+
" \n",
|
44 |
+
" objective_score = cosine_similarity(query, objective_embedding).item()\n",
|
45 |
+
" subjective_scores = cosine_similarity(query, subjective_embeddings)\n",
|
46 |
+
"\n",
|
47 |
+
" max_score = 0\n",
|
48 |
+
" max_review_index = 0\n",
|
49 |
+
" for idx, score in enumerate(subjective_scores[0].tolist()):\n",
|
50 |
+
" weighted_score = ((objective_score * objective_weight)+(score * subjective_weight))\n",
|
51 |
+
" if weighted_score > max_score:\n",
|
52 |
+
" max_score = weighted_score\n",
|
53 |
+
" max_review_index = idx\n",
|
54 |
+
" \n",
|
55 |
+
" weighted_scores.append((key, max_score, max_review_index))\n",
|
56 |
+
" \n",
|
57 |
+
" return sorted(weighted_scores, key=lambda x: x[1], reverse=True)[:n]"
|
58 |
+
]
|
59 |
+
},
|
60 |
+
{
|
61 |
+
"cell_type": "code",
|
62 |
+
"execution_count": 9,
|
63 |
+
"metadata": {},
|
64 |
+
"outputs": [],
|
65 |
+
"source": [
|
66 |
+
"def filter_anime(embeddings, genres, themes, rating):\n",
|
67 |
+
" genres = set(genres)\n",
|
68 |
+
" themes = set(themes)\n",
|
69 |
+
" rating = set(rating)\n",
|
70 |
+
"\n",
|
71 |
+
" filtered_anime = embeddings.copy()\n",
|
72 |
+
" for key, anime in embeddings.items():\n",
|
73 |
+
"\n",
|
74 |
+
" anime_genres = set(anime['genres'])\n",
|
75 |
+
" anime_themes = set(anime['themes'])\n",
|
76 |
+
" anime_rating = set([anime['rating']])\n",
|
77 |
+
"\n",
|
78 |
+
" if genres.intersection(anime_genres) or 'ALL' in genres:\n",
|
79 |
+
" pass\n",
|
80 |
+
" else:\n",
|
81 |
+
" filtered_anime.pop(key)\n",
|
82 |
+
" continue\n",
|
83 |
+
" if themes.intersection(anime_themes) or 'ALL' in themes:\n",
|
84 |
+
" pass\n",
|
85 |
+
" else:\n",
|
86 |
+
" filtered_anime.pop(key)\n",
|
87 |
+
" continue\n",
|
88 |
+
" if rating.intersection(anime_rating) or 'ALL' in rating:\n",
|
89 |
+
" pass\n",
|
90 |
+
" else:\n",
|
91 |
+
" filtered_anime.pop(key)\n",
|
92 |
+
" continue\n",
|
93 |
+
" \n",
|
94 |
+
" return filtered_anime"
|
95 |
+
]
|
96 |
+
},
|
97 |
+
{
|
98 |
+
"cell_type": "code",
|
99 |
+
"execution_count": 10,
|
100 |
+
"metadata": {},
|
101 |
+
"outputs": [],
|
102 |
+
"source": [
|
103 |
+
"with open('./embeddings/data.json') as f:\n",
|
104 |
+
" data = json.load(f)\n",
|
105 |
+
" embeddings = data['embeddings']\n",
|
106 |
+
" filters = data['filters']"
|
107 |
+
]
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"cell_type": "code",
|
111 |
+
"execution_count": 15,
|
112 |
+
"metadata": {},
|
113 |
+
"outputs": [],
|
114 |
+
"source": [
|
115 |
+
"def get_recommendation(query, number_of_recommendations, genres, themes, rating, objective_weight, subjective_weight):\n",
|
116 |
+
" filtered_anime = filter_anime(embeddings, genres, themes, rating)\n",
|
117 |
+
" results = []\n",
|
118 |
+
" weighted_scores = get_n_weighted_scores(filtered_anime, query, number_of_recommendations, float(objective_weight), float(subjective_weight))\n",
|
119 |
+
" for idx, (key, score, review_index) in enumerate(weighted_scores, start=1):\n",
|
120 |
+
" data = embeddings[key]\n",
|
121 |
+
" if not data['english']:\n",
|
122 |
+
" name = data['japanese']\n",
|
123 |
+
" else:\n",
|
124 |
+
" name = data['english']\n",
|
125 |
+
" description = data['description']\n",
|
126 |
+
" review = data['reviews'][review_index]['text']\n",
|
127 |
+
" image = data['image']\n",
|
128 |
+
"\n",
|
129 |
+
" results.append(gr.Image(label=f\"Recommendation {idx}: {name}\",value=image, height=435, width=500, visible=True))\n",
|
130 |
+
" results.append(gr.Textbox(label=f\"Synopsis\", value=description, max_lines=7, visible=True))\n",
|
131 |
+
" results.append(gr.Textbox(label=f\"Most Relevant User Review\",value=review, max_lines=7, visible=True))\n",
|
132 |
+
"\n",
|
133 |
+
" for _ in range(10-number_of_recommendations):\n",
|
134 |
+
" results.append(gr.Image(visible=False))\n",
|
135 |
+
" results.append(gr.Textbox(visible=False))\n",
|
136 |
+
" results.append(gr.Textbox(visible=False))\n",
|
137 |
+
" \n",
|
138 |
+
" return results"
|
139 |
+
]
|
140 |
+
},
|
141 |
+
{
|
142 |
+
"cell_type": "code",
|
143 |
+
"execution_count": 16,
|
144 |
+
"metadata": {},
|
145 |
+
"outputs": [
|
146 |
+
{
|
147 |
+
"name": "stdout",
|
148 |
+
"output_type": "stream",
|
149 |
+
"text": [
|
150 |
+
"Running on local URL: http://127.0.0.1:7863\n",
|
151 |
+
"\n",
|
152 |
+
"To create a public link, set `share=True` in `launch()`.\n"
|
153 |
+
]
|
154 |
+
},
|
155 |
+
{
|
156 |
+
"data": {
|
157 |
+
"text/html": [
|
158 |
+
"<div><iframe src=\"http://127.0.0.1:7863/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
|
159 |
+
],
|
160 |
+
"text/plain": [
|
161 |
+
"<IPython.core.display.HTML object>"
|
162 |
+
]
|
163 |
+
},
|
164 |
+
"metadata": {},
|
165 |
+
"output_type": "display_data"
|
166 |
+
},
|
167 |
+
{
|
168 |
+
"name": "stderr",
|
169 |
+
"output_type": "stream",
|
170 |
+
"text": [
|
171 |
+
"c:\\Users\\Jose\\Desktop\\Nuanced_Recommendation_System\\.venv\\Lib\\site-packages\\gradio\\analytics.py:106: UserWarning: IMPORTANT: You are using gradio version 4.44.1, however version 5.0.1 is available, please upgrade. \n",
|
172 |
+
"--------\n",
|
173 |
+
" warnings.warn(\n"
|
174 |
+
]
|
175 |
+
}
|
176 |
+
],
|
177 |
+
"source": [
|
178 |
+
"with gr.Blocks(theme=gr.themes.Soft(primary_hue='red')) as demo:\n",
|
179 |
+
" with gr.Row():\n",
|
180 |
+
" with gr.Column():\n",
|
181 |
+
" gr.Markdown(\n",
|
182 |
+
" '''\n",
|
183 |
+
" # Welcome to the Nuanced Recommendation System!\n",
|
184 |
+
" ### This system **combines** both objective (synopsis, episode count, themes) and subjective (user reviews) data, in order to recommend the most appropriate anime. Feel free to refine using the **optional** filters below! \n",
|
185 |
+
" '''\n",
|
186 |
+
" )\n",
|
187 |
+
" with gr.Column():\n",
|
188 |
+
" pass\n",
|
189 |
+
" \n",
|
190 |
+
"\n",
|
191 |
+
" with gr.Row():\n",
|
192 |
+
" with gr.Column() as input_col:\n",
|
193 |
+
" query = gr.Textbox(label=\"What are you looking for?\")\n",
|
194 |
+
" number_of_recommendations = gr.Slider(label= \"# of Recommendations\", minimum=1, maximum=10, value=3, step=1)\n",
|
195 |
+
" genres = gr.Dropdown(label='Genres',multiselect=True,choices=filters['genres'], value=['ALL'])\n",
|
196 |
+
" themes = gr.Dropdown(label='Themes',multiselect=True,choices=filters['themes'], value=['ALL'])\n",
|
197 |
+
" rating = gr.Dropdown(label='Rating',multiselect=True,choices=filters['rating'], value=['ALL'])\n",
|
198 |
+
" objective_weight = gr.Slider(label= \"Objective Weight\", minimum=0, maximum=1, value=.5, step=.1)\n",
|
199 |
+
" subjective_weight = gr.Slider(label= \"Subjective Weight\", minimum=0, maximum=1, value=.5, step=.1)\n",
|
200 |
+
" submit_btn = gr.Button(\"Submit\")\n",
|
201 |
+
"\n",
|
202 |
+
" examples = gr.Examples(\n",
|
203 |
+
" examples=[\n",
|
204 |
+
" ['A sci-fi anime set in a future where AI and robots have become self-aware', 3, ['Action', 'Sci-Fi', 'Fantasy'], ['ALL'], ['PG-13 - Teens 13 or older'], .8, .2],\n",
|
205 |
+
" ['An anime where a group of students form a band, and the story focuses on their personal growth and struggles with adulthood', 5, ['ALL'], ['Music'], ['PG-13 - Teens 13 or older', 'R - 17+ (violence & profanity)'], .3, .7],\n",
|
206 |
+
" ['An anime where the main character starts as a villain but slowly redeems themselves', 3, ['Suspense', 'Action'], ['ALL'], ['PG-13 - Teens 13 or older', 'R - 17+ (violence & profanity)'], .2, .8],\n",
|
207 |
+
" ],\n",
|
208 |
+
" inputs=[query, number_of_recommendations, genres, themes, rating, objective_weight, subjective_weight],\n",
|
209 |
+
" )\n",
|
210 |
+
"\n",
|
211 |
+
" outputs = []\n",
|
212 |
+
" with gr.Column():\n",
|
213 |
+
" for i in range(10):\n",
|
214 |
+
" with gr.Row():\n",
|
215 |
+
" with gr.Column():\n",
|
216 |
+
" outputs.append(gr.Image(height=435, width=500, visible=False))\n",
|
217 |
+
" with gr.Column():\n",
|
218 |
+
" outputs.append(gr.Textbox(max_lines=7, visible=False))\n",
|
219 |
+
" outputs.append(gr.Textbox(max_lines=7, visible=False))\n",
|
220 |
+
" \n",
|
221 |
+
"\n",
|
222 |
+
" submit_btn.click(\n",
|
223 |
+
" get_recommendation,\n",
|
224 |
+
" [query, number_of_recommendations, genres, themes, rating, objective_weight, subjective_weight],\n",
|
225 |
+
" outputs\n",
|
226 |
+
" )\n",
|
227 |
+
"\n",
|
228 |
+
" demo.launch()"
|
229 |
+
]
|
230 |
+
}
|
231 |
+
],
|
232 |
+
"metadata": {
|
233 |
+
"kernelspec": {
|
234 |
+
"display_name": ".venv",
|
235 |
+
"language": "python",
|
236 |
+
"name": "python3"
|
237 |
+
},
|
238 |
+
"language_info": {
|
239 |
+
"codemirror_mode": {
|
240 |
+
"name": "ipython",
|
241 |
+
"version": 3
|
242 |
+
},
|
243 |
+
"file_extension": ".py",
|
244 |
+
"mimetype": "text/x-python",
|
245 |
+
"name": "python",
|
246 |
+
"nbconvert_exporter": "python",
|
247 |
+
"pygments_lexer": "ipython3",
|
248 |
+
"version": "3.12.7"
|
249 |
+
}
|
250 |
+
},
|
251 |
+
"nbformat": 4,
|
252 |
+
"nbformat_minor": 2
|
253 |
+
}
|
embed_test.ipynb
ADDED
@@ -0,0 +1,142 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 2,
|
6 |
+
"metadata": {},
|
7 |
+
"outputs": [
|
8 |
+
{
|
9 |
+
"name": "stderr",
|
10 |
+
"output_type": "stream",
|
11 |
+
"text": [
|
12 |
+
"c:\\Users\\Jose\\Desktop\\Nuanced_Recommendation_System\\.venv\\Lib\\site-packages\\sentence_transformers\\cross_encoder\\CrossEncoder.py:13: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
13 |
+
" from tqdm.autonotebook import tqdm, trange\n",
|
14 |
+
"c:\\Users\\Jose\\Desktop\\Nuanced_Recommendation_System\\.venv\\Lib\\site-packages\\transformers\\tokenization_utils_base.py:1617: FutureWarning: `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be deprecated in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884\n",
|
15 |
+
" warnings.warn(\n"
|
16 |
+
]
|
17 |
+
}
|
18 |
+
],
|
19 |
+
"source": [
|
20 |
+
"import os\n",
|
21 |
+
"import json\n",
|
22 |
+
"\n",
|
23 |
+
"from sentence_transformers import SentenceTransformer\n",
|
24 |
+
"\n",
|
25 |
+
"model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')"
|
26 |
+
]
|
27 |
+
},
|
28 |
+
{
|
29 |
+
"cell_type": "code",
|
30 |
+
"execution_count": 3,
|
31 |
+
"metadata": {},
|
32 |
+
"outputs": [],
|
33 |
+
"source": [
|
34 |
+
"def update_filters(filters, data):\n",
|
35 |
+
" for key, value in filters.items():\n",
|
36 |
+
" dont_add = ['Boys Love', 'Erotica', 'Girls Love', 'Hentai', 'Ecchi', 'Gore', 'Crossdressing', 'Magical Sex Shift', 'Rx - Hentai', 'R+ - Mild Nudity']\n",
|
37 |
+
" if data[key] and key == 'rating':\n",
|
38 |
+
" if data[key] not in dont_add:\n",
|
39 |
+
" value.add(data[key])\n",
|
40 |
+
" else:\n",
|
41 |
+
" for val in data[key]:\n",
|
42 |
+
" if val and val not in dont_add:\n",
|
43 |
+
" value.add(val)\n",
|
44 |
+
" return filters"
|
45 |
+
]
|
46 |
+
},
|
47 |
+
{
|
48 |
+
"cell_type": "code",
|
49 |
+
"execution_count": null,
|
50 |
+
"metadata": {},
|
51 |
+
"outputs": [],
|
52 |
+
"source": [
|
53 |
+
"def clean_filters(filters):\n",
|
54 |
+
" for key, val in filters.items():\n",
|
55 |
+
" val.add('ALL')\n",
|
56 |
+
" filters[key] = list(val)\n",
|
57 |
+
" return filters"
|
58 |
+
]
|
59 |
+
},
|
60 |
+
{
|
61 |
+
"cell_type": "code",
|
62 |
+
"execution_count": null,
|
63 |
+
"metadata": {},
|
64 |
+
"outputs": [],
|
65 |
+
"source": [
|
66 |
+
"print('Embedding Started')\n",
|
67 |
+
"\n",
|
68 |
+
"filters = {\n",
|
69 |
+
"'genres': set(),\n",
|
70 |
+
"'themes': set(),\n",
|
71 |
+
"'rating': set()\n",
|
72 |
+
"}\n",
|
73 |
+
"\n",
|
74 |
+
"embeddings = {}\n",
|
75 |
+
"for name in os.listdir('./anime'):\n",
|
76 |
+
" with open(f\"./anime/{name}\", 'r') as file:\n",
|
77 |
+
" data = json.load(file)\n",
|
78 |
+
"\n",
|
79 |
+
" if not data:\n",
|
80 |
+
" continue\n",
|
81 |
+
"\n",
|
82 |
+
" filters = update_filters(filters, data)\n",
|
83 |
+
"\n",
|
84 |
+
" name = name.replace('.json', '')\n",
|
85 |
+
" \n",
|
86 |
+
" data['image'] = f\"./images/{name}.jpg\"\n",
|
87 |
+
"\n",
|
88 |
+
" text = f'''Episodes: {data['episodes']} \n",
|
89 |
+
" Premiered: {data['premiered']} \n",
|
90 |
+
" Broadcast: {data['broadcast']} \n",
|
91 |
+
" Producers: {' '.join(data['producers'])} \n",
|
92 |
+
" Licensors: {' '.join(data['licensors'])} \n",
|
93 |
+
" Studios: {' '.join(data['studios'])} \n",
|
94 |
+
" Source: {' '.join(data['source'])} \n",
|
95 |
+
" Genres: {' '.join(data['genres'])} \n",
|
96 |
+
" Themes: {' '.join(data['themes'])} \n",
|
97 |
+
" Demographic: {data['demographic']} \n",
|
98 |
+
" Duration: {data['duration']} \n",
|
99 |
+
" Rating: {data['rating']} \n",
|
100 |
+
" Description: {data['description']}'''\n",
|
101 |
+
" \n",
|
102 |
+
" embeddings[name] = data.copy()\n",
|
103 |
+
" \n",
|
104 |
+
" embeddings[name]['objective_embedding'] = [model.encode(text).tolist()]\n",
|
105 |
+
" subjective_embeddings = []\n",
|
106 |
+
" for review in embeddings[name]['reviews']:\n",
|
107 |
+
" text = review['text']\n",
|
108 |
+
" subjective_embeddings.append(model.encode(text).tolist())\n",
|
109 |
+
" data['review'] = text\n",
|
110 |
+
" embeddings[name]['subjective_embeddings'] = subjective_embeddings\n",
|
111 |
+
"\n",
|
112 |
+
"filters = clean_filters(filters)\n",
|
113 |
+
"\n",
|
114 |
+
"with open('./embeddings/data.json', 'w') as f:\n",
|
115 |
+
" json.dump({'embeddings':embeddings, 'filters': filters}, f)\n",
|
116 |
+
"\n",
|
117 |
+
"print('Embedding Complete')"
|
118 |
+
]
|
119 |
+
}
|
120 |
+
],
|
121 |
+
"metadata": {
|
122 |
+
"kernelspec": {
|
123 |
+
"display_name": ".venv",
|
124 |
+
"language": "python",
|
125 |
+
"name": "python3"
|
126 |
+
},
|
127 |
+
"language_info": {
|
128 |
+
"codemirror_mode": {
|
129 |
+
"name": "ipython",
|
130 |
+
"version": 3
|
131 |
+
},
|
132 |
+
"file_extension": ".py",
|
133 |
+
"mimetype": "text/x-python",
|
134 |
+
"name": "python",
|
135 |
+
"nbconvert_exporter": "python",
|
136 |
+
"pygments_lexer": "ipython3",
|
137 |
+
"version": "3.12.7"
|
138 |
+
}
|
139 |
+
},
|
140 |
+
"nbformat": 4,
|
141 |
+
"nbformat_minor": 2
|
142 |
+
}
|
scrape.py
ADDED
@@ -0,0 +1,184 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from selenium import webdriver
|
2 |
+
from selenium.webdriver.common.keys import Keys
|
3 |
+
from selenium.webdriver.common.by import By
|
4 |
+
from selenium.webdriver.support.wait import WebDriverWait
|
5 |
+
from selenium.webdriver.support import expected_conditions as EC
|
6 |
+
|
7 |
+
import urllib
|
8 |
+
|
9 |
+
import time
|
10 |
+
import os
|
11 |
+
import json
|
12 |
+
import random
|
13 |
+
|
14 |
+
# XPath of the "Reviews" tab in the page's horizontal navigation bar.
_REVIEWS_TAB_XPATH = "//div[@id='horiznav_nav']//a[text() = 'Reviews']"


def _labelled_text(driver, label):
    """Return the text of the row whose label span reads ``<label>:``.

    The row is the parent element of the ``<span>`` holding the label; the
    label itself is removed from the returned text and the rest is stripped.
    Returns an empty string when the row cannot be located or read, so a
    missing field never aborts the scrape of the current page.
    """
    try:
        row = driver.find_element(By.XPATH, f"//span[text() = '{label}:']/..")
        return row.text.replace(f'{label}:', '').strip()
    except Exception:
        return ''


def _split_list(raw):
    """Split a comma-separated value into a list of stripped items."""
    return [item.strip() for item in raw.split(',')]


def _download_image(driver, anime_id):
    """Best-effort download of the page's poster image to ./images/<id>.jpg.

    Failures are reported on stdout and otherwise ignored, so a page without
    an image is still scraped for its text.
    """
    # `import urllib` alone does not guarantee that the `request` submodule
    # is loaded, so import it explicitly where it is used.
    import urllib.request

    poster_xpath = "//div[@class='leftside']//a/img"
    try:
        WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.XPATH, poster_xpath))
        )
        image = driver.find_element(By.XPATH, poster_xpath).get_attribute('src')
        urllib.request.urlretrieve(image, f"./images/{anime_id}.jpg")
    except Exception as e:
        print(f"image download failed for {anime_id}: {e}")


def _scrape_details(driver):
    """Collect the descriptive fields of the currently loaded anime page.

    Every field falls back to an empty string when it is absent. The
    returned dict holds all keys of the saved JSON record except 'reviews'.
    """
    try:
        description = driver.find_element(By.XPATH, "//p[@itemprop='description']").text
    except Exception:
        description = ''

    synonyms = _labelled_text(driver, 'Synonyms')
    japanese = _labelled_text(driver, 'Japanese')

    # The alternative-title toggle is clicked before the English title is
    # read; if the toggle is missing the English title is recorded as empty.
    try:
        driver.find_element(By.CLASS_NAME, 'js-anime-toggle-alternative-title-button').click()
    except Exception:
        english = ''
    else:
        english = _labelled_text(driver, 'English')

    anime_type = _labelled_text(driver, 'Type')
    episodes = _labelled_text(driver, 'Episodes')
    premiered = _labelled_text(driver, 'Premiered')
    broadcast = _labelled_text(driver, 'Broadcast')
    producers = _labelled_text(driver, 'Producers')
    licensors = _labelled_text(driver, 'Licensors')
    studios = _labelled_text(driver, 'Studios')
    source = _labelled_text(driver, 'Source')
    genres = _labelled_text(driver, 'Genres')
    # The row is labelled either "Themes:" or "Theme:"; try both.
    themes = _labelled_text(driver, 'Themes') or _labelled_text(driver, 'Theme')
    demographic = _labelled_text(driver, 'Demographic')
    duration = _labelled_text(driver, 'Duration')
    rating = _labelled_text(driver, 'Rating')

    return {
        'synonyms': synonyms,
        'japanese': japanese,
        'english': english,
        'type': anime_type,
        'episodes': episodes,
        'premiered': premiered,
        'broadcast': broadcast,
        'producers': _split_list(producers),
        'licensors': _split_list(licensors),
        'studios': _split_list(studios),
        'source': _split_list(source),
        'genres': _split_list(genres),
        'themes': _split_list(themes),
        # Kept unstripped, exactly as the records were written before.
        'demographic': demographic.split(','),
        'duration': duration,
        'rating': rating,
        'description': description,
    }


def _collect_reviews(driver):
    """Return the reviews on the current page as sentiment/text dicts.

    Each review's text is its visible part followed by the ``textContent``
    of its 'js-hidden' element, with newlines replaced by spaces. Lookup
    errors propagate to the caller.
    """
    collected = []
    for review in driver.find_elements(By.CLASS_NAME, 'js-review-element'):
        visible = review.find_element(By.CLASS_NAME, 'text')
        sentiment = review.find_element(By.CLASS_NAME, 'tag')
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'js-hidden'))
        )
        hidden = review.find_element(By.CLASS_NAME, 'js-hidden')
        text = visible.text + hidden.get_attribute('textContent')
        collected.append({
            'sentiment': sentiment.text,
            'text': text.strip().replace('\n', ' '),
        })
    return collected


driver = webdriver.Chrome()

try:
    # IDs that already have a JSON file under ./anime.
    # NOTE: this set is not used to skip IDs; the range below is a
    # hard-coded window of IDs to visit.
    existing = {int(name.replace('.json', '')) for name in os.listdir('./anime')}
    difference = range(3210, 10000)

    for i in difference:
        # Pause between pages, in seconds (the only choice offered is 4).
        random_num = random.choice([4])

        driver.get(f"https://myanimelist.net/anime/{i}/")

        try:
            _download_image(driver, i)
            information = _scrape_details(driver)

            time.sleep(2)

            try:
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.XPATH, _REVIEWS_TAB_XPATH))
                )
                driver.find_element(By.XPATH, _REVIEWS_TAB_XPATH).click()
            except Exception as e:
                print(e)
                try:
                    # A 404 page means the ID does not exist: record an empty
                    # placeholder file for it and move on.
                    driver.find_element(By.CLASS_NAME, 'error404')
                    with open(f"anime/{i}.json", "w") as outfile:
                        json.dump({}, outfile)
                    continue
                except Exception as e:
                    # Neither a reviews tab nor a 404 marker was found --
                    # presumably the site is throttling or blocking us. End
                    # the whole session (quit, not close, so the chromedriver
                    # process is released), wait, and start a fresh browser.
                    print(e)
                    driver.quit()
                    time.sleep(150)
                    driver = webdriver.Chrome()
                    continue
            driver.execute_script("window.scrollTo(0, 0)")

            reviews = _collect_reviews(driver)

            # Pages without any review are not saved.
            if not reviews:
                continue

            information['reviews'] = reviews

            with open(f"anime/{i}.json", "w") as outfile:
                json.dump(information, outfile)
            time.sleep(random_num)
        except Exception as e:
            print(e)
            time.sleep(random_num)
finally:
    # Always end the browser session, even on Ctrl-C or an unexpected error.
    driver.quit()
|