Spaces:
Sleeping
Sleeping
Initial commit: Add Streamlit app for movie recommendation
Browse files- app.py +88 -0
- requirements.txt +5 -0
app.py
ADDED
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import gdown
|
3 |
+
import pandas as pd
|
4 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
5 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
6 |
+
import numpy as np
|
7 |
+
import ipywidgets as widgets
|
8 |
+
from IPython.display import display
|
9 |
+
|
10 |
+
# Google Drive identifiers of the two CSV datasets. Each one is passed to
# download_file_from_google_drive() when its local copy is missing.
ratings_file_id = "1V2s1rpu4Gfjbt8z2a1Xml9IJr5KSozK1"
movies_file_id = "1HWlVK-nXM5JG4GfSDHyR-x8T1AlfQQYw"
|
13 |
+
|
14 |
+
# Download the files if they don't exist
|
15 |
+
def download_file_from_google_drive(file_id, output):
    """Download a Google Drive file to a local path via ``gdown``.

    Args:
        file_id: Google Drive file identifier, interpolated into the
            ``uc?id=`` download URL.
        output: Destination path handed to ``gdown.download``.
    """
    gdown.download(
        f"https://drive.google.com/uc?id={file_id}",
        output,
        quiet=False,
    )
|
18 |
+
|
19 |
+
# Fetch each dataset only when no local copy exists yet, movies first.
for _file_id, _csv_path in (
    (movies_file_id, "movies.csv"),
    (ratings_file_id, "ratings.csv"),
):
    if not os.path.exists(_csv_path):
        download_file_from_google_drive(_file_id, _csv_path)
|
24 |
+
|
25 |
+
# Read both CSV datasets into DataFrames (movies.csv first, then ratings.csv).
movies, ratings = map(pd.read_csv, ("movies.csv", "ratings.csv"))
|
28 |
+
|
29 |
+
# Clean movie titles
|
30 |
+
import re
|
31 |
+
def clean_title(title):
    """Return ``title`` with everything except ASCII letters, digits and spaces removed.

    Args:
        title: Movie title string to normalise.

    Returns:
        The title with all other characters (punctuation, accented
        letters, ...) dropped; the remaining characters keep their order.
    """
    return re.sub(r"[^a-zA-Z0-9 ]", "", title)
|
34 |
+
|
35 |
+
# Store a normalised copy of every title; this is the text the TF-IDF
# index below is fitted on and that search() queries are compared against.
movies["clean_title"] = movies["title"].apply(clean_title)

# Vectorize the cleaned titles using single words and adjacent word pairs.
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
tfidf = vectorizer.fit_transform(movies["clean_title"])
|
40 |
+
|
41 |
+
# Function to search for movies
|
42 |
+
def search(title, top_n=5):
    """Return the movies whose titles best match ``title``, best match first.

    The query is normalised with ``clean_title``, transformed with the
    fitted ``vectorizer`` and compared against every indexed title by
    cosine similarity.

    Args:
        title: Free-text movie title to look up.
        top_n: Maximum number of matches to return (default 5).

    Returns:
        Rows of ``movies`` for the closest titles, ordered from most to
        least similar. Empty when ``top_n`` is not positive.
    """
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # Never request more rows than are indexed: argpartition raises when
    # kth is out of bounds.
    k = min(top_n, similarity.size)
    if k <= 0:
        return movies.iloc[[]]

    # argpartition only guarantees that the k largest scores end up in the
    # last k slots, in arbitrary order. Sort those k indices by score so
    # that row 0 really is the best match.
    top = np.argpartition(similarity, -k)[-k:]
    top = top[np.argsort(similarity[top])[::-1]]

    return movies.iloc[top]
|
50 |
+
|
51 |
+
# Function to find similar movies
|
52 |
+
def find_similar_movies(movie_id):
    """Recommend movies favoured by users who rated ``movie_id`` above 4.

    Only ratings greater than 4 are considered throughout. A candidate's
    score is the share of "similar" users (those who rated ``movie_id``
    above 4) who also rated it above 4, divided by the share of all users
    of the candidate set who rated it above 4.

    Args:
        movie_id: ``movieId`` value of the reference movie.

    Returns:
        DataFrame with columns ``score``, ``title`` and ``genres`` for up
        to ten candidates, taken in descending score order.
    """
    liked = ratings[ratings["rating"] > 4]

    # Users who rated the reference movie above 4.
    fans = liked[liked["movieId"] == movie_id]["userId"].unique()

    # Fraction of those users that rated each other movie above 4;
    # keep only movies above the 10% mark.
    fan_likes = liked[liked["userId"].isin(fans)]["movieId"]
    fan_share = fan_likes.value_counts() / len(fans)
    fan_share = fan_share[fan_share > .10]

    # Same fraction, measured over every user who rated any candidate above 4.
    everyone = liked[liked["movieId"].isin(fan_share.index)]
    overall_share = everyone["movieId"].value_counts() / len(everyone["userId"].unique())

    table = pd.concat([fan_share, overall_share], axis=1)
    table.columns = ["similar", "all"]
    table["score"] = table["similar"] / table["all"]

    top_ten = table.sort_values("score", ascending=False).head(10)
    merged = top_ten.merge(movies, left_index=True, right_on="movieId")
    return merged[["score", "title", "genres"]]
|
66 |
+
|
67 |
+
|
68 |
+
# UI widgets: a text box for the movie title and an output area that
# on_type() fills with recommendations.
movie_name_input = widgets.Text(value='Toy Story', description='Movie Title:', disabled=False)
recommendation_list = widgets.Output()
|
75 |
+
|
76 |
+
def on_type(data):
    """Refresh the recommendation output when the title text changes.

    Args:
        data: Change notification from the text widget; the new text is
            read from ``data["new"]``.
    """
    with recommendation_list:
        recommendation_list.clear_output()
        typed = data["new"]
        # Titles of five characters or fewer are ignored.
        if len(typed) <= 5:
            return
        best_match = search(typed).iloc[0]
        display(find_similar_movies(best_match["movieId"]))
|
84 |
+
|
85 |
+
# Re-run on_type() whenever the text box's value changes.
movie_name_input.observe(on_type, names='value')

# Render the text box followed by the recommendation output area.
display(movie_name_input, recommendation_list)
|
requirements.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
streamlit
|
2 |
+
pandas
|
3 |
+
scikit-learn
|
4 |
+
numpy
|
5 |
+
gdown
|