Collaborative Filtering¶
Quickstart¶
Load the Data¶
import pandas as pd
import numpy as np
from resype.collab_filtering import CollabFilteringModel
%load_ext autoreload
%autoreload 2
# Build the transaction list in one pass: read the ratings file, keep the
# user / movie / rating columns, draw 20 random rows, and rename the columns
# to the user_id / item_id / rating names used in the rest of this walkthrough.
# NOTE: the sample is unseeded, so the rows (and every output shown below)
# differ from run to run.
transaction_list = (
    pd.read_csv("sample_data/ratings.csv")[['userId', 'movieId', 'rating']]
    .sample(20)
    .rename(columns={'userId': "user_id", 'movieId': 'item_id'})
)
Preprocess¶
# Wrap the sampled transactions in the collaborative-filtering model.
# NOTE(review): the name `re` shadows the standard-library `re` module for the
# rest of the notebook; if it is renamed, every later cell must change with it.
re = CollabFilteringModel(transaction_list)
# Preview the first three transactions held by the model.
re.transaction_list.head(3)
| | user_id | item_id | rating |
|---|---|---|---|
| 58565 | 381 | 78266 | 3.0 |
| 18628 | 119 | 54272 | 4.5 |
| 35027 | 234 | 2123 | 3.0 |
# construct utility matrix
# (as displayed below: one row per user_id, one column per item_id,
# NaN where the user has not rated the item)
re.construct_utility_matrix()
re.utility_matrix.head(3)
| item_id | 196 | 208 | 266 | 355 | 434 | 586 | 588 | 1701 | 1856 | 2123 | 2455 | 2502 | 2867 | 4025 | 6188 | 54272 | 78266 | 80463 | 106100 | 157108 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| user_id | ||||||||||||||||||||
| 18 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 4.5 |
| 107 | NaN | 3.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 111 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 4.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
Train on Unclustered Matrix¶
Iterative Approach¶
# import the scikit-learn regressor used as the underlying model
from sklearn.ensemble import RandomForestRegressor
# default hyperparameters; no random_state is set, so fits are not reproducible
model = RandomForestRegressor()
# fit with the 'iterative' method, then preview the predicted utility matrix
re.fit(model, method='iterative')
re.utility_matrix_preds.head(3)
| item_id | 196 | 208 | 266 | 355 | 434 | 586 | 588 | 1701 | 1856 | 2123 | 2455 | 2502 | 2867 | 4025 | 6188 | 54272 | 78266 | 80463 | 106100 | 157108 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| user_id | ||||||||||||||||||||
| 18 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 107 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 111 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
# recommend
# user_list holds row positions, not user_id values: in the output below,
# positions 0, 1, 2 correspond to user_id 18, 107, 111.
user_list = [0, 1, 2] # indices
top_n = 10 # number of ranked items per user (rank_1 .. rank_10 below)
re.get_rec(user_list, top_n)
# display the recommendation table
# (presumably populated by get_rec above; confirm against the class)
re.df_rec
| | user_id | rank_1 | rank_2 | rank_3 | rank_4 | rank_5 | rank_6 | rank_7 | rank_8 | rank_9 | rank_10 |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 18 | 2123.0 | 106100.0 | 208.0 | 266.0 | 355.0 | 434.0 | 586.0 | 588.0 | 1701.0 | 1856.0 |
| 1 | 107 | 157108.0 | 106100.0 | 266.0 | 355.0 | 434.0 | 586.0 | 588.0 | 1701.0 | 1856.0 | 2123.0 |
| 2 | 111 | 157108.0 | 106100.0 | 208.0 | 266.0 | 355.0 | 434.0 | 586.0 | 588.0 | 1701.0 | 2123.0 |
Train on Clustered Matrix¶
from sklearn.cluster import KMeans
# one KMeans instance per axis, 10 clusters each
# (no random_state is set, so cluster assignments vary between runs)
km_users = KMeans(n_clusters=10)
km_items = KMeans(n_clusters=10)
# each clustering call returns three values, unpacked here as
# (model, cluster map, utility matrix with cluster labels)
# NOTE(review): contents of the returned objects are inferred from the names; confirm
user_model, user_cluster_map, util_matrix_w_users = re.cluster_users(km_users)
item_model, item_cluster_map, util_matrix_w_items = re.cluster_items(km_items)
re.cluster_assignment()
# aggregate with "mean" on both the user and the item axis; afterwards
# re.utility_matrix is displayed with u_cluster rows and i_cluster columns
re.utility_matrix_agg(u_agg="mean", i_agg="mean")
re.utility_matrix.head(3)
| i_cluster | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |
|---|---|---|---|---|---|---|---|---|---|---|
| u_cluster | ||||||||||
| 0 | 0.000000 | 0.000000 | 0.0 | 5.0 | 0.00000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 |
| 1 | 0.363636 | 0.272727 | 0.0 | 0.0 | 0.18595 | 0.0 | 0.0 | 0.0 | 0.0 | 0.363636 |
| 2 | 0.000000 | 0.000000 | 5.0 | 0.0 | 0.00000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 |
# repeated import; already imported in the unclustered section above
from sklearn.ensemble import RandomForestRegressor
# a fresh regressor for the clustered utility matrix
model1 = RandomForestRegressor()
# NOTE(review): the meaning of n_synth_data and p is not shown here;
# confirm against CollabFilteringModel.fit before changing them
re.fit(model_object=model1, method='iterative', n_synth_data=5, p=0.1)
# preview the predictions on the u_cluster x i_cluster matrix
re.utility_matrix_preds.head(3)
| i_cluster | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |
|---|---|---|---|---|---|---|---|---|---|---|
| u_cluster | ||||||||||
| 0 | -0.430173 | -0.417114 | -0.458333 | 3.538939 | -0.458333 | -0.458333 | -0.458333 | 0.209448 | -0.230966 | -0.417366 |
| 1 | 0.225689 | 0.134780 | -0.080534 | -0.088732 | 0.067354 | -0.099382 | -0.114118 | -0.137948 | -0.116099 | 0.225689 |
| 2 | -0.486881 | -0.522222 | 4.477778 | -0.522222 | -0.522222 | 0.130154 | -0.522222 | -0.522222 | -0.522222 | -0.522222 |
# predict top item clusters per user index
user_list = [0, 1, 2] # index
top_n = 5 # top n clusters
# same call as in the unclustered section, with re.user_assignment passed as a third argument
re.get_rec(user_list, top_n, re.user_assignment)
| | user_id | rank_1 | rank_2 | rank_3 | rank_4 | rank_5 |
|---|---|---|---|---|---|---|
| 0 | 0 | 2.0 | 9.0 | 7.0 | 6.0 | 5.0 |
| 1 | 1 | 2.0 | 3.0 | 5.0 | 6.0 | 8.0 |
| 2 | 2 | 5.0 | 9.0 | 0.0 | 1.0 | 3.0 |
# predict top items per user_id
top_n = 5 # top n items per user
re.get_rec_item(top_n)
| | user_id | rank_1 | rank_2 | rank_3 | rank_4 | rank_5 |
|---|---|---|---|---|---|---|
| 0 | 18 | 2502.0 | 588.0 | 54272.0 | 106100.0 | 2455.0 |
| 1 | 107 | 2502.0 | 2867.0 | 2455.0 | 106100.0 | 157108.0 |
| 2 | 111 | 2455.0 | 588.0 | 1701.0 | 78266.0 | 2867.0 |