Spaces:
Sleeping
Sleeping
Changed Files for Adding Streamlit app for movie recommendation
Browse files- app.py +21 -23
- requirements.txt +2 -3
app.py
CHANGED
@@ -4,18 +4,20 @@ import pandas as pd
|
|
4 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
5 |
from sklearn.metrics.pairwise import cosine_similarity
|
6 |
import numpy as np
|
7 |
-
import
|
8 |
-
|
9 |
|
10 |
# Google Drive file IDs
|
11 |
movies_file_id = "1HWlVK-nXM5JG4GfSDHyR-x8T1AlfQQYw"
|
12 |
ratings_file_id = "1V2s1rpu4Gfjbt8z2a1Xml9IJr5KSozK1"
|
13 |
|
|
|
14 |
# Download the files if they don't exist
|
15 |
def download_file_from_google_drive(file_id, output):
    """Fetch a Google Drive file with gdown.

    Builds the ``uc?id=`` download URL from ``file_id`` and hands it to
    ``gdown.download`` together with ``output``; progress output is left on
    (``quiet=False``).
    """
    gdown.download(
        f"https://drive.google.com/uc?id={file_id}",
        output,
        quiet=False,
    )
|
18 |
|
|
|
19 |
if not os.path.exists("movies.csv"):
|
20 |
download_file_from_google_drive(movies_file_id, "movies.csv")
|
21 |
|
@@ -26,18 +28,20 @@ if not os.path.exists("ratings.csv"):
|
|
26 |
movies = pd.read_csv("movies.csv")
|
27 |
ratings = pd.read_csv("ratings.csv")
|
28 |
|
|
|
29 |
# Clean movie titles
|
30 |
-
import re
|
31 |
def clean_title(title):
    """Return ``title`` with every character removed except ASCII letters, digits and spaces."""
    disallowed = "[^a-zA-Z0-9 ]"
    return re.sub(disallowed, "", title)
|
34 |
|
|
|
35 |
movies["clean_title"] = movies["title"].apply(clean_title)
|
36 |
|
37 |
# Vectorize the titles
|
38 |
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
|
39 |
tfidf = vectorizer.fit_transform(movies["clean_title"])
|
40 |
|
|
|
41 |
# Function to search for movies
|
42 |
def search(title):
|
43 |
title = clean_title(title)
|
@@ -45,9 +49,9 @@ def search(title):
|
|
45 |
similarity = cosine_similarity(query_vec, tfidf).flatten()
|
46 |
indices = np.argpartition(similarity, -5)[-5:]
|
47 |
results = movies.iloc[indices].iloc[::-1]
|
48 |
-
|
49 |
return results
|
50 |
|
|
|
51 |
# Function to find similar movies
|
52 |
def find_similar_movies(movie_id):
|
53 |
similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] > 4)]["userId"].unique()
|
@@ -65,24 +69,18 @@ def find_similar_movies(movie_id):
|
|
65 |
return rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")[["score", "title", "genres"]]
|
66 |
|
67 |
|
68 |
-
#
|
69 |
-
|
70 |
-
value='Toy Story',
|
71 |
-
description='Movie Title:',
|
72 |
-
disabled=False
|
73 |
-
)
|
74 |
-
recommendation_list = widgets.Output()
|
75 |
-
|
76 |
-
def on_type(data):
    """Refresh the recommendation output for the text held in ``data["new"]``.

    The output widget is cleared on every call. Recommendations are shown
    only when the typed text is longer than five characters; they are based
    on the first row returned by ``search``.
    """
    with recommendation_list:
        recommendation_list.clear_output()
        typed = data["new"]
        if len(typed) <= 5:
            # Too short to search; leave the output area empty.
            return
        first_hit = search(typed).iloc[0]
        display(find_similar_movies(first_hit["movieId"]))
|
84 |
|
85 |
-
|
86 |
|
87 |
-
|
88 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
5 |
from sklearn.metrics.pairwise import cosine_similarity
|
6 |
import numpy as np
|
7 |
+
import re
|
8 |
+
import streamlit as st
|
9 |
|
10 |
# Google Drive file IDs
|
11 |
movies_file_id = "1HWlVK-nXM5JG4GfSDHyR-x8T1AlfQQYw"
|
12 |
ratings_file_id = "1V2s1rpu4Gfjbt8z2a1Xml9IJr5KSozK1"
|
13 |
|
14 |
+
|
15 |
# Download the files if they don't exist
|
16 |
def download_file_from_google_drive(file_id, output):
    """Fetch a Google Drive file with gdown.

    Builds the ``uc?id=`` download URL from ``file_id`` and hands it to
    ``gdown.download`` together with ``output``; progress output is left on
    (``quiet=False``).
    """
    gdown.download(
        f"https://drive.google.com/uc?id={file_id}",
        output,
        quiet=False,
    )
|
19 |
|
20 |
+
|
21 |
if not os.path.exists("movies.csv"):
|
22 |
download_file_from_google_drive(movies_file_id, "movies.csv")
|
23 |
|
|
|
28 |
movies = pd.read_csv("movies.csv")
|
29 |
ratings = pd.read_csv("ratings.csv")
|
30 |
|
31 |
+
|
32 |
# Clean movie titles
|
|
|
33 |
def clean_title(title):
    """Return ``title`` with every character removed except ASCII letters, digits and spaces."""
    disallowed = "[^a-zA-Z0-9 ]"
    return re.sub(disallowed, "", title)
|
36 |
|
37 |
+
|
38 |
movies["clean_title"] = movies["title"].apply(clean_title)
|
39 |
|
40 |
# Vectorize the titles
|
41 |
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
|
42 |
tfidf = vectorizer.fit_transform(movies["clean_title"])
|
43 |
|
44 |
+
|
45 |
# Function to search for movies
|
46 |
def search(title):
|
47 |
title = clean_title(title)
|
|
|
49 |
similarity = cosine_similarity(query_vec, tfidf).flatten()
|
50 |
indices = np.argpartition(similarity, -5)[-5:]
|
51 |
results = movies.iloc[indices].iloc[::-1]
|
|
|
52 |
return results
|
53 |
|
54 |
+
|
55 |
# Function to find similar movies
|
56 |
def find_similar_movies(movie_id):
|
57 |
similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] > 4)]["userId"].unique()
|
|
|
69 |
return rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")[["score", "title", "genres"]]
|
70 |
|
71 |
|
72 |
+
# Streamlit UI
st.title("Movie Recommendation System")

movie_name = st.text_input("Enter a movie title", "Toy Story")

# Only search once the query is longer than five characters.
if len(movie_name) > 5:
    results = search(movie_name)
    if not results.empty:
        # The first row returned by search() is taken as the matched movie.
        matched = results.iloc[0]
        recommendations = find_similar_movies(matched["movieId"])
        if recommendations.empty:
            # Previously the heading was printed with nothing beneath it
            # when no recommendations came back; say so explicitly instead.
            st.write(f"No recommendations found for '{matched['title']}'.")
        else:
            st.write(f"Top recommendations based on '{matched['title']}':")
            for _, row in recommendations.iterrows():
                st.write(f"{row['title']} ({row['genres']}) - Score: {row['score']:.2f}")
    else:
        st.write("No movies found. Please try a different title.")
|
requirements.txt
CHANGED
@@ -1,5 +1,4 @@
|
|
1 |
-
|
2 |
pandas
|
3 |
scikit-learn
|
4 |
-
|
5 |
-
gdown
|
|
|
1 |
+
gdown
|
2 |
pandas
|
3 |
scikit-learn
|
4 |
+
streamlit
|
|