File size: 3,082 Bytes
8af8121
 
 
 
 
 
36dc14c
 
8af8121
 
 
 
 
36dc14c
8af8121
 
 
 
 
36dc14c
8af8121
 
 
 
 
 
 
 
 
 
36dc14c
8af8121
 
 
 
 
36dc14c
8af8121
 
 
 
 
 
36dc14c
8af8121
 
 
 
 
 
 
 
 
36dc14c
8af8121
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36dc14c
 
8af8121
36dc14c
8af8121
36dc14c
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import os
import gdown
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import re
import streamlit as st

# Google Drive file IDs of the two CSV datasets used by this script.
# Each ID is interpolated into a "uc?id=<id>" download URL by
# download_file_from_google_drive() when the matching local CSV is missing.
movies_file_id = "1HWlVK-nXM5JG4GfSDHyR-x8T1AlfQQYw"  # saved locally as movies.csv
ratings_file_id = "1V2s1rpu4Gfjbt8z2a1Xml9IJr5KSozK1"  # saved locally as ratings.csv


# Download the files if they don't exist
def download_file_from_google_drive(file_id, output):
    """Download a Google Drive file to a local path using gdown.

    Args:
        file_id: Google Drive file ID; it is placed in the ``id`` query
            parameter of the download URL.
        output: Destination path handed to ``gdown.download``.

    Returns:
        None. The value returned by ``gdown.download`` is discarded.
    """
    drive_url = f"https://drive.google.com/uc?id={file_id}"
    # quiet=False keeps gdown's progress output enabled.
    gdown.download(url=drive_url, output=output, quiet=False)


# Fetch each dataset from Google Drive only when no local copy exists yet.
# Order matters only for parity with the log output: movies first, then ratings.
for _drive_id, _csv_path in (
    (movies_file_id, "movies.csv"),
    (ratings_file_id, "ratings.csv"),
):
    if os.path.exists(_csv_path):
        continue
    download_file_from_google_drive(_drive_id, _csv_path)

# Read both CSVs into the module-level DataFrames used by the rest of the script.
movies = pd.read_csv("movies.csv")
ratings = pd.read_csv("ratings.csv")


# Clean movie titles
def clean_title(title):
    """Return ``title`` with everything except ASCII letters, digits and spaces removed.

    Punctuation, underscores, tabs/newlines and non-ASCII characters are all
    dropped; spaces are kept as-is (runs of spaces are not collapsed).

    Args:
        title: The raw title string.

    Returns:
        The filtered string.
    """
    disallowed = "[^a-zA-Z0-9 ]"
    return re.sub(disallowed, "", title)


# Add a normalised copy of every title (see clean_title) so that indexed
# titles and user queries go through the same character filtering.
movies["clean_title"] = movies["title"].apply(clean_title)

# Vectorize the titles
# ngram_range=(1, 2) makes the vocabulary contain both single words and
# adjacent word pairs. `vectorizer` and `tfidf` are module-level state that
# search() reads: the fitted vectorizer transforms the query, and `tfidf`
# holds one row per entry of `movies`, in the same order.
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
tfidf = vectorizer.fit_transform(movies["clean_title"])


# Function to search for movies
def search(title):
    """Return up to five movies whose cleaned titles best match ``title``.

    The query is normalised with ``clean_title``, transformed with the fitted
    module-level ``vectorizer``, and compared against every row of ``tfidf``
    by cosine similarity.

    Args:
        title: Free-text movie title to look up.

    Returns:
        Rows of ``movies`` ordered from most to least similar, so row 0 is
        the best match. Contains ``min(5, number of movies)`` rows.

    Note:
        Rows are selected purely by rank; they are returned even when their
        similarity to the query is 0.
    """
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # Clamp k so a corpus with fewer than five titles does not make
    # argpartition raise "kth out of bounds".
    top_k = min(5, similarity.size)
    if top_k == 0:
        return movies.iloc[0:0]

    # argpartition only guarantees that the last ``top_k`` positions hold the
    # largest scores, in arbitrary order. Rank those candidates explicitly so
    # the first returned row really is the closest match.
    candidates = np.argpartition(similarity, -top_k)[-top_k:]
    ranked = candidates[np.argsort(similarity[candidates])[::-1]]
    return movies.iloc[ranked]


# Function to find similar movies
def find_similar_movies(movie_id):
    """Recommend movies favoured by the users who rated ``movie_id`` above 4.

    Steps:
      1. Collect the users who rated ``movie_id`` above 4.
      2. For every movie, compute the fraction of those users who also rated
         it above 4, and keep the movies where that fraction exceeds 10%.
      3. For the kept movies, compute the same kind of fraction over all
         users who rated at least one of them above 4.
      4. Score each movie as (fraction among similar users) / (fraction among
         all users) and keep the ten highest scores.

    Args:
        movie_id: Value matched against ``ratings["movieId"]``.

    Returns:
        DataFrame with columns ``score``, ``title`` and ``genres`` for at most
        ten movies, highest score first.
    """
    liked = ratings["rating"] > 4

    # Users who liked the target movie, and everything else they liked.
    fans = ratings.loc[(ratings["movieId"] == movie_id) & liked, "userId"].unique()
    liked_by_fans = ratings.loc[ratings["userId"].isin(fans) & liked, "movieId"]
    fan_share = liked_by_fans.value_counts() / len(fans)
    fan_share = fan_share[fan_share > .10]

    # Baseline popularity of the same candidate movies across all users.
    candidate_likes = ratings[ratings["movieId"].isin(fan_share.index) & liked]
    rater_count = len(candidate_likes["userId"].unique())
    overall_share = candidate_likes["movieId"].value_counts() / rater_count

    comparison = pd.concat([fan_share, overall_share], axis=1)
    comparison.columns = ["similar", "all"]
    # A score above 1 means the movie is liked disproportionately by the fans.
    comparison["score"] = comparison["similar"] / comparison["all"]

    best = comparison.sort_values("score", ascending=False).head(10)
    with_titles = best.merge(movies, left_index=True, right_on="movieId")
    return with_titles[["score", "title", "genres"]]


# Streamlit UI
st.title("Movie Recommendation System")

movie_name = st.text_input("Enter a movie title", "Toy Story")

# Search on any non-blank input. A minimum-length threshold would silently
# ignore short titles such as "Heat" or "Up" without telling the user why.
if movie_name.strip():
    results = search(movie_name)
    if not results.empty:
        # The first search result is treated as the movie the user meant.
        best_match = results.iloc[0]
        st.write(f"Top recommendations based on '{best_match['title']}':")
        recommendations = find_similar_movies(best_match["movieId"])
        if recommendations.empty:
            # Avoid showing a header followed by nothing when no movie
            # passes the recommendation filters.
            st.write("No recommendations found for this movie.")
        for _, row in recommendations.iterrows():
            st.write(f"{row['title']} ({row['genres']}) - Score: {row['score']:.2f}")
    else:
        st.write("No movies found. Please try a different title.")