File size: 4,822 Bytes
699b928 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 |
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\Jose\\Desktop\\Nuanced_Recommendation_System\\.venv\\Lib\\site-packages\\sentence_transformers\\cross_encoder\\CrossEncoder.py:13: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from tqdm.autonotebook import tqdm, trange\n",
"c:\\Users\\Jose\\Desktop\\Nuanced_Recommendation_System\\.venv\\Lib\\site-packages\\transformers\\tokenization_utils_base.py:1617: FutureWarning: `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be deprecated in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884\n",
" warnings.warn(\n"
]
}
],
"source": [
"import os\n",
"import json\n",
"\n",
"from sentence_transformers import SentenceTransformer\n",
"\n",
"# Sentence-embedding model; its encode() method is used by the embedding cell below.\n",
"model = SentenceTransformer(\n",
"    'sentence-transformers/all-MiniLM-L6-v2'\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def update_filters(filters, data):\n",
"    \"\"\"Add the filterable values of one anime record to ``filters``.\n",
"\n",
"    Args:\n",
"        filters: Dict mapping a field name (e.g. 'genres', 'themes', 'rating')\n",
"            to the set of values collected so far. Mutated in place.\n",
"        data: Dict describing a single anime. The 'rating' field is read as\n",
"            one value; every other field is read as an iterable of values.\n",
"\n",
"    Returns:\n",
"        The same ``filters`` dict that was passed in.\n",
"\n",
"    Fields that are missing, ``None`` or empty are skipped, and values\n",
"    listed in ``dont_add`` are never collected.\n",
"    \"\"\"\n",
"    # Built once per call instead of once per key; a set gives O(1) lookups.\n",
"    dont_add = {\n",
"        'Boys Love', 'Erotica', 'Girls Love', 'Hentai', 'Ecchi', 'Gore',\n",
"        'Crossdressing', 'Magical Sex Shift', 'Rx - Hentai', 'R+ - Mild Nudity',\n",
"    }\n",
"    for key, collected in filters.items():\n",
"        field = data.get(key)\n",
"        if not field:\n",
"            # Absent, None or empty field: nothing to collect (and nothing to iterate).\n",
"            continue\n",
"        # 'rating' holds a single value; the other fields hold several.\n",
"        candidates = [field] if key == 'rating' else field\n",
"        for candidate in candidates:\n",
"            if candidate and candidate not in dont_add:\n",
"                collected.add(candidate)\n",
"    return filters"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def clean_filters(filters):\n",
"    \"\"\"Turn every filter's collected values into a list that includes 'ALL'.\n",
"\n",
"    Args:\n",
"        filters: Dict mapping a field name to an iterable (set or list) of\n",
"            values. Its values are replaced in place.\n",
"\n",
"    Returns:\n",
"        The same ``filters`` dict. Each value is a list ordered by string\n",
"        representation, so the result does not depend on set iteration order\n",
"        and calling the function a second time leaves it unchanged.\n",
"    \"\"\"\n",
"    for key in filters:\n",
"        # set() also accepts an already-cleaned list, which keeps re-runs safe.\n",
"        options = set(filters[key])\n",
"        options.add('ALL')\n",
"        filters[key] = sorted(options, key=str)\n",
"    return filters"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print('Embedding Started')\n",
"\n",
"ANIME_DIR = './anime'\n",
"OUTPUT_PATH = './embeddings/data.json'\n",
"\n",
"filters = {\n",
"    'genres': set(),\n",
"    'themes': set(),\n",
"    'rating': set()\n",
"}\n",
"\n",
"embeddings = {}\n",
"# Sorted so every run processes the files in the same order; os.listdir order is arbitrary.\n",
"for filename in sorted(os.listdir(ANIME_DIR)):\n",
"    # Anything that is not a .json file cannot be parsed by json.load, so skip it.\n",
"    if not filename.endswith('.json'):\n",
"        continue\n",
"\n",
"    # Explicit UTF-8: the platform default encoding may mis-decode non-ASCII text.\n",
"    with open(os.path.join(ANIME_DIR, filename), 'r', encoding='utf-8') as file:\n",
"        data = json.load(file)\n",
"\n",
"    if not data:\n",
"        continue\n",
"\n",
"    filters = update_filters(filters, data)\n",
"\n",
"    # Strip only the trailing extension; the remainder is used as the record key.\n",
"    name = os.path.splitext(filename)[0]\n",
"\n",
"    data['image'] = f\"./images/{name}.jpg\"\n",
"\n",
"    text = f'''Episodes: {data['episodes']} \n",
"    Premiered: {data['premiered']} \n",
"    Broadcast: {data['broadcast']} \n",
"    Producers: {' '.join(data['producers'])} \n",
"    Licensors: {' '.join(data['licensors'])} \n",
"    Studios: {' '.join(data['studios'])} \n",
"    Source: {' '.join(data['source'])} \n",
"    Genres: {' '.join(data['genres'])} \n",
"    Themes: {' '.join(data['themes'])} \n",
"    Demographic: {data['demographic']} \n",
"    Duration: {data['duration']} \n",
"    Rating: {data['rating']} \n",
"    Description: {data['description']}'''\n",
"\n",
"    embeddings[name] = data.copy()\n",
"\n",
"    # A single embedding of the metadata text, stored as a one-element list.\n",
"    embeddings[name]['objective_embedding'] = [model.encode(text).tolist()]\n",
"    # One embedding per review text, in review order.\n",
"    embeddings[name]['subjective_embeddings'] = [\n",
"        model.encode(review['text']).tolist()\n",
"        for review in data['reviews']\n",
"    ]\n",
"\n",
"filters = clean_filters(filters)\n",
"\n",
"# Create the output folder on a fresh checkout so the write below cannot fail on it.\n",
"os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)\n",
"with open(OUTPUT_PATH, 'w', encoding='utf-8') as f:\n",
"    json.dump({'embeddings': embeddings, 'filters': filters}, f)\n",
"\n",
"print('Embedding Complete')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
|