{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "c:\\Users\\Jose\\Desktop\\Nuanced_Recommendation_System\\.venv\\Lib\\site-packages\\sentence_transformers\\cross_encoder\\CrossEncoder.py:13: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      " from tqdm.autonotebook import tqdm, trange\n",
      "c:\\Users\\Jose\\Desktop\\Nuanced_Recommendation_System\\.venv\\Lib\\site-packages\\transformers\\tokenization_utils_base.py:1617: FutureWarning: `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be deprecated in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884\n",
      " warnings.warn(\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import json\n",
    "\n",
    "from sentence_transformers import SentenceTransformer\n",
    "\n",
    "model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tags and ratings that are never offered as filter options.\n",
    "EXCLUDED_FILTER_VALUES = frozenset({\n",
    "    'Boys Love', 'Erotica', 'Girls Love', 'Hentai', 'Ecchi', 'Gore',\n",
    "    'Crossdressing', 'Magical Sex Shift', 'Rx - Hentai', 'R+ - Mild Nudity',\n",
    "})\n",
    "\n",
    "\n",
    "def update_filters(filters, data):\n",
    "    \"\"\"Add the filterable values of one record to the running filter sets.\n",
    "\n",
    "    Args:\n",
    "        filters: Dict mapping a field name (e.g. 'genres', 'rating') to the\n",
    "            set of values collected so far. The sets are updated in place.\n",
    "        data: Dict for one record. A filter field may hold a single value\n",
    "            or a list of values, or be missing, None or empty.\n",
    "\n",
    "    Returns:\n",
    "        The same ``filters`` dict that was passed in.\n",
    "    \"\"\"\n",
    "    for key, collected in filters.items():\n",
    "        raw = data.get(key)\n",
    "        if not raw:\n",
    "            # Missing, None or empty field: nothing to collect. Iterating\n",
    "            # over None here used to raise TypeError (e.g. an unrated entry).\n",
    "            continue\n",
    "        # Anything that is not a collection counts as ONE value, so a bare\n",
    "        # string is never split into characters.\n",
    "        is_collection = isinstance(raw, (list, tuple, set, frozenset))\n",
    "        candidates = raw if is_collection else [raw]\n",
    "        collected.update(\n",
    "            value for value in candidates\n",
    "            if value and value not in EXCLUDED_FILTER_VALUES\n",
    "        )\n",
    "    return filters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def clean_filters(filters):\n",
    "    \"\"\"Turn the collected filter values into JSON-serializable lists.\n",
    "\n",
    "    Every field gets the catch-all option 'ALL' as its first entry, followed\n",
    "    by the collected values in sorted order. Sorting keeps the saved file\n",
    "    identical from run to run, which plain set iteration does not.\n",
    "\n",
    "    Args:\n",
    "        filters: Dict mapping a field name to an iterable of collected values.\n",
    "\n",
    "    Returns:\n",
    "        The same dict, with every value replaced by a list.\n",
    "    \"\"\"\n",
    "    for key in filters:\n",
    "        # Dropping 'ALL' first makes the function safe to call twice.\n",
    "        values = set(filters[key]) - {'ALL'}\n",
    "        filters[key] = ['ALL'] + sorted(values, key=str)\n",
    "    return filters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
"print('Embedding Started')\n", "\n", "filters = {\n", "'genres': set(),\n", "'themes': set(),\n", "'rating': set()\n", "}\n", "\n", "embeddings = {}\n", "for name in os.listdir('./anime'):\n", " with open(f\"./anime/{name}\", 'r') as file:\n", " data = json.load(file)\n", "\n", " if not data:\n", " continue\n", "\n", " filters = update_filters(filters, data)\n", "\n", " name = name.replace('.json', '')\n", " \n", " data['image'] = f\"./images/{name}.jpg\"\n", "\n", " text = f'''Episodes: {data['episodes']} \n", " Premiered: {data['premiered']} \n", " Broadcast: {data['broadcast']} \n", " Producers: {' '.join(data['producers'])} \n", " Licensors: {' '.join(data['licensors'])} \n", " Studios: {' '.join(data['studios'])} \n", " Source: {' '.join(data['source'])} \n", " Genres: {' '.join(data['genres'])} \n", " Themes: {' '.join(data['themes'])} \n", " Demographic: {data['demographic']} \n", " Duration: {data['duration']} \n", " Rating: {data['rating']} \n", " Description: {data['description']}'''\n", " \n", " embeddings[name] = data.copy()\n", " \n", " embeddings[name]['objective_embedding'] = [model.encode(text).tolist()]\n", " subjective_embeddings = []\n", " for review in embeddings[name]['reviews']:\n", " text = review['text']\n", " subjective_embeddings.append(model.encode(text).tolist())\n", " data['review'] = text\n", " embeddings[name]['subjective_embeddings'] = subjective_embeddings\n", "\n", "filters = clean_filters(filters)\n", "\n", "with open('./embeddings/data.json', 'w') as f:\n", " json.dump({'embeddings':embeddings, 'filters': filters}, f)\n", "\n", "print('Embedding Complete')" ] } ], "metadata": { "kernelspec": { "display_name": ".venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.7" } }, "nbformat": 4, "nbformat_minor": 2 }