{"cells":[{"cell_type":"code","execution_count":null,"metadata":{"_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","execution":{"iopub.execute_input":"2024-08-15T14:44:00.362360Z","iopub.status.busy":"2024-08-15T14:44:00.361946Z","iopub.status.idle":"2024-08-15T14:44:01.523645Z","shell.execute_reply":"2024-08-15T14:44:01.522395Z","shell.execute_reply.started":"2024-08-15T14:44:00.362321Z"},"trusted":true},"outputs":[],"source":["# This Python 3 environment comes with many helpful analytics libraries installed\n","# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python\n","# For example, here are several helpful packages to load\n","\n","import numpy as np # linear algebra\n","import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n","\n","# Input data files are available in the read-only \"../input/\" directory\n","# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\n","\n","import os\n","for dirname, _, filenames in os.walk('/kaggle/input'):\n","    for filename in filenames:\n","        print(os.path.join(dirname, filename))\n","\n","# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using \"Save & Run All\"\n","# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-08-15T14:44:01.526545Z","iopub.status.busy":"2024-08-15T14:44:01.525934Z","iopub.status.idle":"2024-08-15T14:44:01.846387Z","shell.execute_reply":"2024-08-15T14:44:01.845152Z","shell.execute_reply.started":"2024-08-15T14:44:01.526503Z"},"trusted":true},"outputs":[],"source":["# Movie details, including the plot_synopsis and plot_summary fields used below\n","movie_details = pd.read_json('/kaggle/input/movie-details/IMDB_movie_details.json', lines=True)"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-08-15T14:44:01.848814Z","iopub.status.busy":"2024-08-15T14:44:01.848415Z","iopub.status.idle":"2024-08-15T14:44:17.628604Z","shell.execute_reply":"2024-08-15T14:44:17.627484Z","shell.execute_reply.started":"2024-08-15T14:44:01.848783Z"},"trusted":true},"outputs":[],"source":["# User reviews with the review_text field, linked to movies by movie_id\n","reviews = pd.read_json('/kaggle/input/bad-words-flag/better_reviews.json')"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-08-15T14:44:17.631413Z","iopub.status.busy":"2024-08-15T14:44:17.631039Z","iopub.status.idle":"2024-08-15T14:44:17.652198Z","shell.execute_reply":"2024-08-15T14:44:17.650772Z","shell.execute_reply.started":"2024-08-15T14:44:17.631381Z"},"trusted":true},"outputs":[],"source":["print(movie_details.head())\n","print(reviews.head())"]},
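{"cell_type":"code","execution_count":null,"metadata":{"trusted":true},"outputs":[],"source":["# Optional sanity check before merging (a quick sketch, assuming both dataframes carry the\n","# movie_id column that is used as the merge key below): look at the shapes and at the\n","# fraction of reviews that have matching movie metadata.\n","print(movie_details.shape, reviews.shape)\n","print('reviews with matching movie_id:', reviews['movie_id'].isin(movie_details['movie_id']).mean())"]},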
{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-08-15T14:44:17.663776Z","iopub.status.busy":"2024-08-15T14:44:17.663306Z","iopub.status.idle":"2024-08-15T14:44:23.371720Z","shell.execute_reply":"2024-08-15T14:44:23.370184Z","shell.execute_reply.started":"2024-08-15T14:44:17.663734Z"},"trusted":true},"outputs":[],"source":["from sklearn.feature_extraction.text import TfidfVectorizer\n","from sklearn.metrics.pairwise import cosine_similarity\n","\n","# Preprocess and merge data: drop rows with missing text fields, then join reviews to movie details on movie_id\n","movie_details.dropna(subset=['plot_synopsis', 'plot_summary'], inplace=True)\n","reviews.dropna(subset=['review_text'], inplace=True)\n","data = pd.merge(reviews, movie_details, on='movie_id')\n","\n","# data = data.head(10000)\n","data.head()"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-08-15T14:44:23.373876Z","iopub.status.busy":"2024-08-15T14:44:23.373308Z","iopub.status.idle":"2024-08-15T14:44:23.382136Z","shell.execute_reply":"2024-08-15T14:44:23.380852Z","shell.execute_reply.started":"2024-08-15T14:44:23.373843Z"},"trusted":true},"outputs":[],"source":["# Function to split the synopsis into three equal parts (beginning, middle, end)\n","def split_synopsis(text):\n","    words = text.split()\n","    parts = len(words) // 3\n","    return words[:parts], words[parts:2*parts], words[2*parts:]\n","\n","# Calculate the proximity of the review text to the end (final third) of the plot synopsis\n","# (unweighted baseline; the sentence-level weighted version below is the one applied to the data)\n","def calculate_proximity(review, synopsis):\n","    _, _, end = split_synopsis(synopsis)\n","    vectorizer = TfidfVectorizer()\n","    vectors = vectorizer.fit_transform([review, ' '.join(end)])\n","    return cosine_similarity(vectors)[0, 1]"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-08-15T14:44:23.383872Z","iopub.status.busy":"2024-08-15T14:44:23.383457Z","iopub.status.idle":"2024-08-15T14:44:23.399055Z","shell.execute_reply":"2024-08-15T14:44:23.397734Z","shell.execute_reply.started":"2024-08-15T14:44:23.383839Z"},"trusted":true},"outputs":[],"source":["from nltk.tokenize import sent_tokenize\n","import nltk\n","from sklearn.preprocessing import MinMaxScaler\n","\n","nltk.download('punkt')\n","\n","counter = 0\n","\n","def calculate_proximity_weighted(review, synopsis):\n","    global counter\n","    counter += 1\n","    if counter % 5000 == 0:\n","        print(counter, \"rows processed\")\n","\n","    review_sentences = sent_tokenize(review)\n","    synopsis_sentences = sent_tokenize(synopsis)\n","    if len(synopsis_sentences) == 0:\n","        return 0\n","\n","    # Weight synopsis sentences by position: sentences closer to the ending get weights rising linearly from 0.5 to 1.0\n","    synopsis_weights = np.linspace(0.5, 1.0, num=len(synopsis_sentences))\n","\n","    # Vectorize the synopsis sentences (the TF-IDF vocabulary is fitted on the synopsis only)\n","    vectorizer = TfidfVectorizer()\n","    synopsis_vectors = vectorizer.fit_transform(synopsis_sentences)\n","\n","    significant_proximity_scores = []\n","\n","    for sentence in review_sentences:\n","        sentence_vector = vectorizer.transform([sentence])\n","        similarities = cosine_similarity(sentence_vector, synopsis_vectors)[0]\n","\n","        # Keep only similarities above the 0.7 threshold, each scaled by the position weight of the matching synopsis sentence\n","        significant_similarities = [sim * weight for sim, weight in zip(similarities, synopsis_weights) if sim > 0.7]\n","\n","        if significant_similarities:\n","            significant_proximity_scores.extend(significant_similarities)\n","\n","    # Return the sum of the position-weighted significant proximity scores\n","    return sum(significant_proximity_scores)\n","\n","# Apply the function to the dataset\n","data['end_proximity'] = data.apply(lambda x: calculate_proximity_weighted(x['review_text'], x['plot_synopsis']), axis=1)"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.status.busy":"2024-08-15T14:35:59.289260Z","iopub.status.idle":"2024-08-15T14:35:59.289684Z","shell.execute_reply":"2024-08-15T14:35:59.289489Z","shell.execute_reply.started":"2024-08-15T14:35:59.289473Z"},"trusted":true},"outputs":[],"source":["# Rescale the proximity scores to the [0, 1] range\n","scaler = MinMaxScaler()\n","data['end_proximity'] = scaler.fit_transform(data[['end_proximity']])"]},
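{"cell_type":"code","execution_count":null,"metadata":{"trusted":true},"outputs":[],"source":["# Optional sanity check on the scaled feature (a quick sketch using the columns created above):\n","# the distribution should now lie in [0, 1], and the highest-scoring reviews should tend to\n","# echo sentences from their synopses.\n","print(data['end_proximity'].describe())\n","for _, row in data.nlargest(3, 'end_proximity').iterrows():\n","    print(row['movie_id'], round(row['end_proximity'], 3))\n","    print(row['review_text'][:200], '...')\n","    print('---')"]},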
{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["# /kaggle/input is read-only, so write the final dataset to /kaggle/working, where it is preserved as notebook output\n","data.to_json('/kaggle/working/final_dataset2.json', orient='records', lines=True)"]}],"metadata":{"kaggle":{"accelerator":"none","dataSources":[{"datasetId":5547860,"sourceId":9179201,"sourceType":"datasetVersion"},{"datasetId":5547886,"sourceId":9179232,"sourceType":"datasetVersion"}],"dockerImageVersionId":30746,"isGpuEnabled":false,"isInternetEnabled":true,"language":"python","sourceType":"notebook"},"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.10.13"}},"nbformat":4,"nbformat_minor":4}