jrno commited on
Commit
dddf97f
β€’
1 Parent(s): 52bc18b
.gitignore CHANGED
@@ -1,4 +1 @@
1
- data/*
2
- !data/music_info.csv
3
- !data/model_track_interactions.csv
4
  recommendation-api/__pycache__
 
 
 
 
1
  recommendation-api/__pycache__
README.md CHANGED
@@ -9,3 +9,34 @@ license: mit
9
  ---
10
 
11
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  ---
10
 
11
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
12
+
13
+ # ai-academy-2024-group8
14
+
15
+ A lightweight backend API for a song recommender.
16
+
17
+ The dataset used in this project is public and available [online](https://www.kaggle.com/datasets/undefinenull/million-song-dataset-spotify-lastfm)
18
+
19
+ ## What's in here
20
+
21
+ - `data/`: Contains the trained `model.pkl` and related `model.csv` that has the training set in csv format
22
+ - `notebooks/`: Contains any jupyter notebooks used in the project
23
+ - `recommendation-api/`: A FastAPI app to serve user recommendations
24
+
25
+ ## Running service locally
26
+
27
+ 1. (Optional) Create and activate a Python venv
28
+ 2. Install the requirements `pip install -r requirements.txt`
29
+ 3. Start the service `python recommendation-api/server.py`
30
+
31
+ Then
32
+
33
+ - `curl http://localhost:7860/users` to fetch list of supported users
34
+ - `curl http://localhost:7860/users/<id>` to fetch track history for individual user
35
+ - `curl http://localhost:7860/recommend/<id>` to recommend tracks for the specific user
36
+
37
+ ## Running in Huggingface
38
+
39
+ Application is built and started on push to master.
40
+
41
+ Application is available from [here](https://schibsted-ai-academy-2024-gr8-recommender-api.hf.space/docs)
42
+
data/{model_track_interactions.csv β†’ model.csv} RENAMED
File without changes
{recommendation-api β†’ data}/model.pkl RENAMED
File without changes
recommendation-api/recommender.py CHANGED
@@ -1,28 +1,25 @@
1
- from fastai.learner import Learner
2
- import pandas as pd
3
-
4
- from tracks import get_unlistened_tracks_for_user, predictions_to_tracks
5
-
6
- def get_recommendations_for_user(learn: Learner, user_id: str, limit: int = 5):
7
- not_listened_tracks = get_unlistened_tracks_for_user(user_id)
8
- print(len(not_listened_tracks))
9
-
10
- # Get predictions for the tracks user hasn't listened yet
11
- input_dataframe = pd.DataFrame({'user_id': [user_id] * len(not_listened_tracks), 'entry': not_listened_tracks})
12
- test_dl = learn.dls.test_dl(input_dataframe)
13
- predictions = learn.get_preds(dl=test_dl)
14
-
15
- # Associate them with prediction score and sort
16
- tracks_with_predictions = list(zip(not_listened_tracks, predictions[0].numpy()))
17
- tracks_with_predictions.sort(key=lambda x: x[1], reverse=True)
18
-
19
- print(tracks_with_predictions[:limit])
20
-
21
- # Pick n and return as full tracks
22
- recommendations = predictions_to_tracks(tracks_with_predictions[:limit])
23
-
24
- return {
25
- "user_id": user_id,
26
- "limit": limit,
27
- "recommendations": recommendations
28
  }
 
1
+ from fastai.learner import Learner
2
+ import pandas as pd
3
+
4
+ from tracks import get_unlistened_tracks_for_user, predictions_to_tracks
5
+
6
def get_recommendations_for_user(learn: Learner, user_id: str, limit: int = 5):
    """Score every track the user has not listened to and return the top picks.

    Args:
        learn: Trained fastai Learner used to score user/track pairs.
        user_id: Id of the user to recommend tracks for.
        limit: Maximum number of recommendations to return.

    Returns:
        Dict with the user id, the limit, and the recommended track records.
    """
    candidates = get_unlistened_tracks_for_user(user_id)

    # One (user_id, entry) row per candidate track, pushed through the
    # learner's test dataloader to obtain an affinity score for each.
    frame = pd.DataFrame({'user_id': [user_id] * len(candidates), 'entry': candidates})
    preds = learn.get_preds(dl=learn.dls.test_dl(frame))

    # Pair each candidate entry with its score, highest score first.
    scored = sorted(zip(candidates, preds[0].numpy()), key=lambda pair: pair[1], reverse=True)

    # Resolve the top-scoring entries back into full track records.
    recommendations = predictions_to_tracks(scored[:limit])

    return {
        "user_id": user_id,
        "limit": limit,
        "recommendations": recommendations
    }
recommendation-api/server.py CHANGED
@@ -5,10 +5,12 @@ import os
5
 
6
  from tracks import get_top_tracks_for_user, get_users_with_track_interactions
7
  from recommender import get_recommendations_for_user
8
- from learner import setup_learner, custom_accuracy # Note that DotProductBias must be imported to global namespace
 
 
9
 
10
  app = FastAPI()
11
- model_filename = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'model.pkl')
12
  learn = None
13
 
14
  @app.on_event("startup")
 
5
 
6
  from tracks import get_top_tracks_for_user, get_users_with_track_interactions
7
  from recommender import get_recommendations_for_user
8
+
9
+ # custom_accuracy needs to be imported to the global namespace for Learner to load
10
+ from learner import setup_learner, custom_accuracy
11
 
12
  app = FastAPI()
13
+ model_filename = 'data/model.pkl'
14
  learn = None
15
 
16
  @app.on_event("startup")
recommendation-api/tracks.py CHANGED
@@ -1,44 +1,44 @@
1
- import pandas as pd
2
-
3
- # Read the CSV files
4
- tracks_df = pd.read_csv('data/music_info.csv')
5
- tracks_df.fillna('', inplace=True)
6
- tracks_df["entry"] = tracks_df["name"] + ", " + tracks_df["artist"] + ", " + tracks_df["year"].astype(str)
7
- track_interactions_df = pd.read_csv('data/model_track_interactions.csv')[['user_id', 'track_id']]
8
-
9
- # Merge data on those two csvs
10
- dataframe = pd.merge(tracks_df, track_interactions_df, on='track_id', how='left')
11
- # Convert all columns to string type
12
- dataframe = dataframe.astype(str)
13
- # Create a history lookup dictionary by 'user_id'
14
- user_to_track_history_dict = {user_id: group.drop('user_id', axis=1).to_dict('records')
15
- for user_id, group in dataframe.groupby('user_id')}
16
-
17
- def get_users_with_track_interactions(ascending=False, limit=10):
18
- playcount_summary = track_interactions_df.groupby('user_id').size().reset_index(name='track_interactions')
19
- playcount_summary.sort_values(by='track_interactions', ascending=ascending, inplace=True)
20
- if limit is not None:
21
- playcount_summary = playcount_summary.head(limit)
22
- return playcount_summary.to_dict(orient='records')
23
-
24
- def get_top_tracks_for_user(user_id: str, limit=10):
25
- track_list = user_to_track_history_dict.get(user_id, [])
26
- sorted_tracks = sorted(track_list, key=lambda x: int(x['playcount']) if 'playcount' in x and x['playcount'].isdigit() else 0, reverse=True)
27
- if limit is not None:
28
- sorted_tracks = sorted_tracks[:limit]
29
- return sorted_tracks
30
-
31
- def get_unlistened_tracks_for_user(user_id:str):
32
- all_tracks = tracks_df['entry'].tolist()
33
- listened_tracks = [track['entry'] for track in user_to_track_history_dict.get(user_id, [])]
34
- return list(set(all_tracks) - set(listened_tracks))
35
-
36
- def predictions_to_tracks(entries_and_predictions):
37
- tracks = []
38
- for entry, score in entries_and_predictions:
39
- track_info = tracks_df[tracks_df['entry'] == entry]
40
- if not track_info.empty:
41
- track_dict = track_info.to_dict('records')[0]
42
- track_dict['score'] = score.astype(str)
43
- tracks.append(track_dict)
44
  return tracks
 
1
+ import pandas as pd
2
+
3
# Track metadata; NaNs are blanked so the string concatenation below is safe.
tracks_df = pd.read_csv('data/music_info.csv')
tracks_df.fillna('', inplace=True)
# Human-readable "<name>, <artist>, <year>" key used as the model's item id.
tracks_df["entry"] = tracks_df["name"] + ", " + tracks_df["artist"] + ", " + tracks_df["year"].astype(str)

# User/track interaction pairs from the training set.
track_interactions_df = pd.read_csv('data/model.csv')[['user_id', 'track_id']]

# Join track metadata onto every interaction, stringify all columns, and
# index the resulting listening history by user for fast lookups.
dataframe = tracks_df.merge(track_interactions_df, on='track_id', how='left')
dataframe = dataframe.astype(str)
user_to_track_history_dict = {user_id: group.drop('user_id', axis=1).to_dict('records')
                              for user_id, group in dataframe.groupby('user_id')}
16
+
17
def get_users_with_track_interactions(ascending=False, limit=10):
    """List users ranked by how many track interactions they have.

    Args:
        ascending: Sort order; False puts the most active users first.
        limit: Keep at most this many users; None returns everybody.

    Returns:
        List of {'user_id', 'track_interactions'} records.
    """
    summary = (track_interactions_df
               .groupby('user_id')
               .size()
               .reset_index(name='track_interactions')
               .sort_values(by='track_interactions', ascending=ascending))
    if limit is not None:
        summary = summary.head(limit)
    return summary.to_dict(orient='records')
23
+
24
def get_top_tracks_for_user(user_id: str, limit=10):
    """Return the user's listening history, most-played tracks first.

    Unknown users yield an empty list. Tracks whose 'playcount' is missing
    or not a plain digit string sort as if their playcount were 0.

    Args:
        user_id: Id of the user whose history to return.
        limit: Keep at most this many tracks; None returns the full history.
    """
    def _playcount(track):
        value = track.get('playcount', '')
        return int(value) if value.isdigit() else 0

    history = sorted(user_to_track_history_dict.get(user_id, []), key=_playcount, reverse=True)
    return history if limit is None else history[:limit]
30
+
31
def get_unlistened_tracks_for_user(user_id: str):
    """Return every unique track entry the user has not listened to.

    Args:
        user_id: Id of the user whose listening history should be excluded.

    Returns:
        De-duplicated track entries, in the catalog's original order.
    """
    listened = {track['entry'] for track in user_to_track_history_dict.get(user_id, [])}
    # Fix: the previous `list(set(all) - set(listened))` returned tracks in an
    # order that changes between runs (hash randomization), which made the
    # candidate order fed to the model — and tie-broken recommendations —
    # non-reproducible. dict.fromkeys de-duplicates while preserving order.
    return [entry for entry in dict.fromkeys(tracks_df['entry'].tolist()) if entry not in listened]
35
+
36
def predictions_to_tracks(entries_and_predictions):
    """Resolve (entry, score) pairs back into full track records.

    Entries with no matching row in the track catalog are silently skipped;
    each returned record carries its model score under the 'score' key.

    Args:
        entries_and_predictions: Iterable of (entry string, numpy score) pairs.

    Returns:
        List of track record dicts augmented with a 'score' field.
    """
    resolved = []
    for entry, score in entries_and_predictions:
        matches = tracks_df[tracks_df['entry'] == entry]
        if matches.empty:
            continue
        record = matches.to_dict('records')[0]
        record['score'] = score.astype(str)
        resolved.append(record)
    return resolved