sairamn committed on
Commit
8af8121
1 Parent(s): d754603

Initial commit: Add Streamlit app for movie recommendation

Browse files
Files changed (2) hide show
  1. app.py +88 -0
  2. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import gdown
3
+ import pandas as pd
4
+ from sklearn.feature_extraction.text import TfidfVectorizer
5
+ from sklearn.metrics.pairwise import cosine_similarity
6
+ import numpy as np
7
+ import ipywidgets as widgets
8
+ from IPython.display import display
9
+
10
+ # Google Drive file IDs
11
+ movies_file_id = "1HWlVK-nXM5JG4GfSDHyR-x8T1AlfQQYw"
12
+ ratings_file_id = "1V2s1rpu4Gfjbt8z2a1Xml9IJr5KSozK1"
13
+
14
+ # Download the files if they don't exist
15
def download_file_from_google_drive(file_id, output):
    """Download a shared Google Drive file to a local path via gdown.

    Args:
        file_id: Google Drive file identifier, substituted into the
            ``https://drive.google.com/uc?id=...`` download URL.
        output: Destination path handed to ``gdown.download``.
    """
    # quiet=False keeps gdown's progress output visible during the download.
    gdown.download(f"https://drive.google.com/uc?id={file_id}", output, quiet=False)
18
+
19
# Fetch each dataset from Google Drive only when no local copy exists yet;
# movies.csv is handled first, then ratings.csv.
for _drive_id, _csv_name in (
    (movies_file_id, "movies.csv"),
    (ratings_file_id, "ratings.csv"),
):
    if not os.path.exists(_csv_name):
        download_file_from_google_drive(_drive_id, _csv_name)

# Load the data
movies = pd.read_csv("movies.csv")
ratings = pd.read_csv("ratings.csv")
28
+
29
+ # Clean movie titles
30
+ import re
31
def clean_title(title):
    """Strip every character except ASCII letters, digits and spaces.

    Args:
        title: Raw title. Normally a ``str``; missing values (``None`` or a
            float NaN, as produced when a CSV cell is empty) and other
            non-string values are tolerated.

    Returns:
        The cleaned title as a ``str``. Missing values yield ``""``;
        other non-string values are converted with ``str()`` first.
    """
    # NaN is the only value that is not equal to itself, so this catches
    # missing cells without needing pandas/numpy inside the helper.
    if title is None or title != title:
        return ""
    if not isinstance(title, str):
        title = str(title)
    return re.sub(r"[^a-zA-Z0-9 ]", "", title)
34
+
35
# Store a cleaned copy of every title so the index and the search queries
# go through the same clean_title normalization.
movies["clean_title"] = movies["title"].apply(clean_title)

# Vectorize the titles: TF-IDF over unigrams and bigrams of the cleaned text.
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
tfidf = vectorizer.fit_transform(movies["clean_title"])
40
+
41
+ # Function to search for movies
42
def search(title):
    """Return the rows of ``movies`` whose titles best match ``title``.

    The query is cleaned with ``clean_title``, projected with the fitted
    ``vectorizer`` and compared against ``tfidf`` by cosine similarity.

    Args:
        title: Free-text movie title to look up.

    Returns:
        Up to five rows of ``movies``, ordered from most to least similar,
        so ``.iloc[0]`` is the best match.
    """
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # Clamp k: argpartition raises ValueError when kth exceeds the array size.
    top_n = min(5, similarity.size)
    if top_n == 0:
        return movies.iloc[[]]

    # argpartition isolates the top_n scores but leaves them in unspecified
    # order, so rank just that small slice explicitly (highest score first).
    candidates = np.argpartition(similarity, -top_n)[-top_n:]
    ranked = candidates[np.argsort(-similarity[candidates], kind="stable")]
    return movies.iloc[ranked]
50
+
51
+ # Function to find similar movies
52
def find_similar_movies(movie_id):
    """Recommend movies favoured by users who rated ``movie_id`` above 4.

    For each candidate movie the score is the share of those users who also
    rated it above 4, divided by the share among all users who rated any
    candidate above 4.

    Args:
        movie_id: Value matched against the ``movieId`` column of ``ratings``.

    Returns:
        DataFrame with columns ``score``, ``title`` and ``genres`` for the
        ten highest-scoring candidates.
    """
    liked = ratings["rating"] > 4

    # Users who liked the seed movie, and everything else they liked.
    fans = ratings[(ratings["movieId"] == movie_id) & liked]["userId"].unique()
    fan_likes = ratings[(ratings["userId"].isin(fans)) & liked]["movieId"]
    fan_share = fan_likes.value_counts() / len(fans)

    # Keep only movies liked by more than 10% of those users.
    fan_share = fan_share[fan_share > .10]

    # Share of the same candidates among everyone who liked any of them.
    candidate_likes = ratings[(ratings["movieId"].isin(fan_share.index)) & liked]
    overall_share = candidate_likes["movieId"].value_counts() / len(candidate_likes["userId"].unique())

    rec_percentages = pd.concat([fan_share, overall_share], axis=1)
    rec_percentages.columns = ["similar", "all"]

    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    top_ten = rec_percentages.sort_values("score", ascending=False).head(10)
    merged = top_ten.merge(movies, left_index=True, right_on="movieId")
    return merged[["score", "title", "genres"]]
66
+
67
+
68
+ # Widgets for the UI
69
# Text box for the movie title, pre-filled with a sample query.
movie_name_input = widgets.Text(value='Toy Story', description='Movie Title:', disabled=False)

# Output area that on_type clears and redraws with recommendations.
recommendation_list = widgets.Output()
75
+
76
def on_type(data):
    """Refresh the recommendation output when the title text changes.

    Args:
        data: Change notification; its ``"new"`` entry is read as the
            current text of the input box.
    """
    with recommendation_list:
        recommendation_list.clear_output()
        title = data["new"]
        # Titles of five characters or fewer only clear the output.
        if len(title) <= 5:
            return
        best_match = search(title).iloc[0]
        display(find_similar_movies(best_match["movieId"]))
84
+
85
# Call on_type whenever the text box's value trait changes.
movie_name_input.observe(on_type, names='value')

# Display the widgets: the input box first, the recommendation output below it.
display(movie_name_input, recommendation_list)
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ streamlit
2
+ pandas
3
+ scikit-learn
4
+ numpy
5
+ gdown