hsKNN / matrixRec.py
hscrown's picture
Upload matrixRec.py
617cd6a verified
raw
history blame
3.88 kB
# -*- coding: utf-8 -*-
# Setup step (not Python): run `pip install datasets` in a shell, or as
# `!pip install datasets` inside a Jupyter/Colab cell, before running this
# script. The `!` shell escape is IPython-only syntax and is a SyntaxError
# when the file is executed as a plain .py module.
from sklearn.decomposition import TruncatedSVD
from scipy.sparse.linalg import svds
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
from datasets import load_dataset
# ## Data loading and preprocessing

# df: place data (libraries, museums, parks) with the columns
# place_id, place_name, gu_name, type -- read from a CSV file.
# read_csv must be called through the pandas namespace (only `pd` is
# imported), and `index=False` is a to_csv option that read_csv does not
# accept, so it is dropped here.
df = pd.read_csv('places.csv', encoding='utf-8')

# User rating data (later code reads its user_id, place_id and rating columns).
user_rating = pd.read_csv('user_rating_1000.csv', encoding='utf-8')
# Build the item-feature table.
# Only the place_id, type and place_name columns of the place data are kept.
item_feature = df.loc[:, ['place_id', 'type', 'place_name']]
item_feature.head()
# Recommender system implementation

# Build the user x place rating pivot: one row per user_id, one column per
# place_id, each cell holding the rating (pivot_table averages duplicates).
# The ratings are read from `user_rating`; the name `user_place_data` that
# was used here before is never defined anywhere in this script.
# (user, place) pairs without a rating are filled with 0 -- the same value
# the rating data uses for "not visited yet" -- so the mean-centring and SVD
# steps that follow never receive NaN.
df_user_place_ratings = user_rating.pivot_table(
    index='user_id', columns='place_id', values='rating'
).fillna(0)
df_user_place_ratings.head()
# Next steps:
#   1) Turn the pivot table into a plain matrix.
#   2) Take the row-wise mean (np.mean with axis=1), i.e. the average rating
#      of each user, and subtract it from the matrix built in step 1 so that
#      every user's ratings are mean-centred.

# Column labels (place ids) are converted to strings and stripped of
# surrounding whitespace.
df_user_place_ratings.columns = df_user_place_ratings.columns.astype(str).str.strip()

# matrix: the pivot table values as a NumPy array (as_matrix is deprecated).
matrix = df_user_place_ratings.values

# user_ratings_mean: mean rating of each row, i.e. of each user.
user_ratings_mean = matrix.mean(axis=1)

# matrix_user_mean: the rating matrix with each user's own mean subtracted.
matrix_user_mean = matrix - user_ratings_mean[:, np.newaxis]
pd.DataFrame(data=matrix_user_mean, columns=df_user_place_ratings.columns).head()
# Truncated SVD provided by SciPy.
# svds returns the U matrix, the singular values and the transposed V matrix.
# 12 latent factors are requested, capped at min(rows, columns) - 1 because
# the default svds solver rejects any k that is not smaller than both
# dimensions of the input matrix.
U, sigma, Vt = svds(matrix_user_mean, k=min(12, min(matrix_user_mean.shape) - 1))
# At this point sigma is a 1-D array holding only the singular values.
# np.diag expands it into the square diagonal matrix (zeros off the diagonal)
# that the matrix products in the reconstruction step need.
sigma = np.diag(sigma)
sigma.shape
# Multiplying U, Sigma and Vt back together rebuilds the (mean-centred)
# rating matrix from the truncated factors; each user's mean rating is then
# added back so the values are on the original rating scale again.
svd_user_predicted_ratings = U.dot(sigma).dot(Vt) + user_ratings_mean[:, np.newaxis]

# Wrap the predictions in a DataFrame that reuses the pivot's column labels.
df_svd_preds = pd.DataFrame(
    data=svd_user_predicted_ratings,
    columns=df_user_place_ratings.columns,
)
df_svd_preds.head()
df_svd_preds.shape
# Sort the prediction results for one user.
# sorted_user_predictions: that user's predicted ratings, highest first.
user_id = 0  # predict the ratings of member 0

# The rows of df_svd_preds follow the row order of the rating pivot, so the
# user's row position is looked up in the pivot index. Using user_id itself
# as the row number is only correct when the ids are exactly 0..n-1; an
# unknown user_id now raises KeyError instead of selecting a wrong row.
user_row_number = df_user_place_ratings.index.get_loc(user_id)

sorted_user_predictions = df_svd_preds.iloc[user_row_number].sort_values(ascending=False)
sorted_user_predictions = pd.DataFrame(sorted_user_predictions.reset_index())
sorted_user_predictions.columns = ['place_id', 'predict_rating']

# The place-id column labels were turned into strings earlier. Parsing them
# with to_numeric first also handles labels such as "12.0" (float ids
# rendered as text), on which a direct astype('int64') raises ValueError.
sorted_user_predictions['place_id'] = pd.to_numeric(
    sorted_user_predictions['place_id']
).astype('int64')
# From the raw rating data, take the rows that belong to this user_id.
user_data = user_rating.loc[user_rating['user_id'] == user_id]

# A rating of 0 marks a place the user has not visited yet, so drop those rows.
user_data = user_data.loc[user_data['rating'] != 0.0]

# (Visited places) join the user's ratings with the place features,
# ordered from the highest rating down.
user_history = pd.merge(user_data, item_feature, on='place_id').sort_values(
    by='rating', ascending=False
)

# Places the user has not rated become the recommendation candidates.
recommendations = item_feature.loc[
    ~item_feature['place_id'].isin(user_history['place_id'])
]