Content-based Filtering

Quickstart

# transaction list
transaction_list.head()
userId movieId rating timestamp
0 1 1 4.0 964982703
1 1 3 4.0 964981247
2 1 6 4.0 964982224
3 1 47 5.0 964983815
4 1 50 5.0 964982931
# item ids and features
item_df.head()
movieId i_1 i_2 i_3 i_4 i_5 i_6 i_7 i_8 i_9 ... i_291 i_292 i_293 i_294 i_295 i_296 i_297 i_298 i_299 i_300
0 1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 6 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.513025 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 47 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 50 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 301 columns

# user ids and features
user_df.head()
userId u_1 u_2 u_3 u_4 u_5 u_6 u_7 u_8 u_9 ... u_11 u_12 u_13 u_14 u_15 u_16 u_17 u_18 u_19 u_20
0 1 85 29 42 83 47 26 90 45 55 ... 17 68 22 7 40 22 1 0 0 0
1 2 3 0 0 7 0 1 11 10 10 ... 1 17 1 1 4 0 0 4 3 0
2 3 11 4 5 9 4 5 14 2 7 ... 8 16 5 0 15 1 0 0 0 0
3 4 29 6 10 104 19 58 25 27 38 ... 4 120 7 10 12 16 4 1 2 0
4 5 8 6 9 15 7 11 9 12 9 ... 1 25 3 2 2 5 0 3 0 0

5 rows × 21 columns

Load ContentBasedModel

from resype.content_based import ContentBasedModel
cb = ContentBasedModel(user_df,
                        item_df,
                        transaction_list,
                        item_id_name='movieId',
                        user_id_name='userId',
                        target_name='rating',
                        timestamp_name='timestamp')

Train-test Split

# Split the transactions into train and test sets
cb.split_train_test(train_ratio = 0.7) #To split the data
# Alternative: chronological train/test split (presumably replaces the split made above; run only the one you need)
cb.split_train_test_chronological(train_ratio = 0.7) #To split the data
cb.df_test
userId movieId rating timestamp u_1 u_2 u_3 u_4 u_5 u_6 ... i_291 i_292 i_293 i_294 i_295 i_296 i_297 i_298 i_299 i_300
5 1 70 3.0 964982400 85 29 42 83 47 26 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
8 1 151 5.0 964984041 85 29 42 83 47 26 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
9 1 157 5.0 964984100 85 29 42 83 47 26 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
16 1 296 3.0 964982967 85 29 42 83 47 26 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
18 1 333 5.0 964981179 85 29 42 83 47 26 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
980 7 8665 3.5 1108602755 54 14 15 49 23 30 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
981 7 8666 1.0 1106779625 54 14 15 49 23 30 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
983 7 8798 4.5 1106636602 54 14 15 49 23 30 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
984 7 8808 1.5 1109746594 54 14 15 49 23 30 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
989 7 8949 4.0 1110757890 54 14 15 49 23 30 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

301 rows × 324 columns

cb.df_train
userId movieId rating timestamp u_1 u_2 u_3 u_4 u_5 u_6 ... i_291 i_292 i_293 i_294 i_295 i_296 i_297 i_298 i_299 i_300
206 1 3243 3.0 964981093 85 29 42 83 47 26 ... 0.0 0.00000 0.0 0.000000 0.0 0.000000 0.0 0.000000 0.0 0.0
207 1 3247 3.0 964983108 85 29 42 83 47 26 ... 0.0 0.60429 0.0 0.000000 0.0 0.000000 0.0 0.000000 0.0 0.0
225 1 3729 5.0 964982363 85 29 42 83 47 26 ... 0.0 0.00000 0.0 0.000000 0.0 0.000000 0.0 0.000000 0.0 0.0
201 1 3053 5.0 964984086 85 29 42 83 47 26 ... 0.0 0.00000 0.0 0.000000 0.0 0.202585 0.0 0.000000 0.0 0.0
82 1 1256 5.0 964981442 85 29 42 83 47 26 ... 0.0 0.00000 0.0 0.000000 0.0 0.000000 0.0 0.000000 0.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
972 7 7155 3.0 1173051472 54 14 15 49 23 30 ... 0.0 0.00000 0.0 0.590782 0.0 0.000000 0.0 0.000000 0.0 0.0
921 7 2683 2.0 1106635420 54 14 15 49 23 30 ... 0.0 0.00000 0.0 0.000000 0.0 0.171763 0.0 0.000000 0.0 0.0
944 7 4700 1.5 1109746591 54 14 15 49 23 30 ... 0.0 0.00000 0.0 0.000000 0.0 0.000000 0.0 0.478203 0.0 0.0
897 7 1196 4.0 1106635996 54 14 15 49 23 30 ... 0.0 0.00000 0.0 0.000000 0.0 0.000000 0.0 0.000000 0.0 0.0
968 7 6863 4.5 1106712827 54 14 15 49 23 30 ... 0.0 0.00000 0.0 0.000000 0.0 0.000000 0.0 0.000000 0.0 0.0

699 rows × 324 columns

Fit Model

from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(random_state=202109, n_jobs=-1)
cb.fit_ml_cb(model)
RandomForestRegressor(n_jobs=-1, random_state=202109)
# show trained model
cb.model
RandomForestRegressor(n_jobs=-1, random_state=202109)

Predict on Test Set

#Predict on the test data
preds_array = cb.reco_ml_cb_tt() #To make predictions as an array
print(len(preds_array))
preds_array
301
array([4.07 , 4.18 , 4.26 , 4.15 , 4.24 , 4.45 , 4.26 , 4.53 , 4.69 ,
       3.72 , 4.68 , 4.3  , 4.97 , 4.14 , 3.45 , 4.83 , 4.56 , 4.86 ,
       4.18 , 4.82 , 4.73 , 3.6  , 4.96 , 4.9  , 4.89 , 4.48 , 4.7  ,
       4.59 , 4.74 , 4.2  , 4.76 , 4.9  , 4.17 , 4.39 , 4.61 , 4.27 ,
       4.05 , 4.46 , 4.26 , 4.88 , 4.6  , 4.45 , 4.39 , 4.49 , 3.23 ,
       4.61 , 4.98 , 4.85 , 4.82 , 4.5  , 4.14 , 4.57 , 4.2  , 4.45 ,
       4.52 , 4.32 , 4.7  , 3.74 , 4.45 , 4.87 , 4.6  , 4.29 , 4.51 ,
       4.43 , 4.71 , 4.75 , 4.55 , 4.41 , 4.64 , 4.74 , 3.495, 2.17 ,
       3.86 , 3.25 , 4.225, 3.5  , 4.41 , 3.175, 3.76 , 4.13 , 3.275,
       3.575, 3.585, 3.36 , 3.84 , 3.625, 4.05 , 3.265, 3.5  , 4.175,
       3.795, 3.62 , 3.19 , 3.9  , 4.23 , 3.99 , 3.315, 3.975, 3.66 ,
       3.44 , 4.09 , 2.875, 3.35 , 3.015, 3.88 , 3.92 , 3.23 , 3.955,
       4.1  , 3.5  , 2.895, 3.51 , 3.285, 3.4  , 3.285, 3.555, 4.02 ,
       3.335, 4.17 , 3.435, 4.16 , 3.9  , 4.2  , 3.54 , 4.075, 3.36 ,
       3.65 , 3.965, 3.615, 3.86 , 3.17 , 3.11 , 4.175, 3.02 , 4.08 ,
       3.975, 3.24 , 4.105, 3.745, 3.67 , 4.18 , 4.14 , 4.2  , 3.87 ,
       3.75 , 3.625, 3.63 , 4.08 , 3.835, 3.92 , 3.645, 4.275, 3.69 ,
       3.285, 4.045, 3.015, 3.675, 3.63 , 3.705, 3.06 , 3.335, 3.57 ,
       3.99 , 2.195, 3.14 , 3.285, 3.265, 3.95 , 3.07 , 3.49 , 3.48 ,
       3.5  , 3.45 , 3.285, 3.53 , 3.14 , 3.22 , 3.38 , 3.49 , 3.17 ,
       3.54 , 3.4  , 3.565, 4.   , 3.345, 3.78 , 2.795, 3.46 , 3.71 ,
       3.505, 3.675, 3.9  , 2.99 , 3.2  , 3.345, 3.325, 3.64 , 3.28 ,
       3.615, 3.935, 3.06 , 3.445, 4.085, 3.87 , 3.86 , 2.91 , 3.32 ,
       3.82 , 3.79 , 3.225, 3.285, 3.56 , 2.895, 3.5  , 2.935, 3.31 ,
       2.965, 2.55 , 3.735, 3.91 , 3.08 , 3.63 , 3.525, 3.175, 3.8  ,
       3.64 , 3.645, 3.405, 2.49 , 3.37 , 3.24 , 3.175, 3.45 , 3.615,
       3.585, 3.335, 3.81 , 3.875, 3.69 , 4.145, 3.21 , 3.825, 2.285,
       3.19 , 2.655, 3.645, 3.715, 3.62 , 3.585, 3.5  , 3.355, 4.025,
       3.   , 3.535, 3.645, 3.36 , 3.44 , 3.375, 3.63 , 2.815, 4.195,
       3.56 , 3.51 , 4.15 , 4.305, 4.19 , 3.645, 3.96 , 3.605, 3.475,
       3.04 , 3.96 , 3.975, 3.29 , 2.715, 3.695, 3.36 , 4.28 , 3.95 ,
       3.105, 3.14 , 3.725, 3.725, 3.19 , 3.16 , 3.885, 3.88 , 3.405,
       3.925, 3.73 , 3.77 , 3.715, 1.995, 2.65 , 3.77 , 3.87 , 3.045,
       3.785, 3.775, 3.62 , 2.835])

Get Recommendations

cb.get_rec(user_list=[0, 1, 3], top_n=3)
Predicting utility matrix: 7 out of 7
user_id rank_1 rank_2 rank_3
0 0 163981.0 52042.0 117368.0
1 1 122882.0 596.0 7381.0
2 3 2858.0 125.0 4967.0

Evaluate on Test Set

mse, mae = cb.evaluate_test_set() #MSE and MAE
mse, mae
proceed
(1.4762376245847177, 0.9313455149501663)

Cross Validation

mse, mae = cb.cross_val(model, k=3)
0
Starting splitting
Finished splitting
Starting training
Finished training
Starting computing MAE and MSE
proceed
Finished computing MAE and MSE
1
Starting splitting
Finished splitting
Starting training
Finished training
Starting computing MAE and MSE
proceed
Finished computing MAE and MSE
2
Starting splitting
Finished splitting
Starting training
Finished training
Starting computing MAE and MSE
proceed
Finished computing MAE and MSE
mse, mae
([1.4898759966777408, 1.221593899192318, 1.5457277466777408],
 [0.9358637873754153, 0.8729953725676317, 0.9878654485049833])