Demo: ML-based Collaborative Filtering on Utility Matrix with Reduced Dimensions¶
import pandas as pd
import numpy as np
import sys
sys.path.insert(1, '../resype')
%load_ext autoreload
%autoreload 2
Prepare data¶
# Fix the global NumPy seed so the synthetic ratings are reproducible.
np.random.seed(202109)

# Possible outcomes of one draw: a rating from 1 to 5, or NaN for "not rated".
rating_vals = np.append(np.arange(1, 6), np.nan)
userids = np.arange(10)
itemids = np.arange(100)

# One random draw for every (user, item) pair.
n_users, n_items = len(userids), len(itemids)
random_ratings = np.random.choice(rating_vals, size=n_users * n_items)

# Long-format table ordered user by user: each user id fills n_items
# consecutive rows and is paired with the complete item id sequence.
transactions = pd.DataFrame({
    'user_id': np.repeat(userids, n_items),
    'item_id': np.tile(itemids, n_users),
    'rating': random_ratings,
})
transactions = transactions.drop_duplicates()
transactions
| user_id | item_id | rating | |
|---|---|---|---|
| 0 | 0 | 0 | 2.0 |
| 1 | 0 | 1 | NaN |
| 2 | 0 | 2 | NaN |
| 3 | 0 | 3 | 5.0 |
| 4 | 0 | 4 | 4.0 |
| ... | ... | ... | ... |
| 995 | 9 | 95 | NaN |
| 996 | 9 | 96 | 2.0 |
| 997 | 9 | 97 | NaN |
| 998 | 9 | 98 | 5.0 |
| 999 | 9 | 99 | 2.0 |
1000 rows × 3 columns
Load resype¶
# Import the recommender class; the module is presumably found through the
# '../resype' entry added to sys.path at the top of this notebook.
from collab_filtering import CollabFilteringModel
# NOTE(review): the variable name `re` is also the name of the standard-library
# regex module; it is kept because later cells refer to the model by this name.
re = CollabFilteringModel(transactions)
# Build the utility matrix from the transactions. In the displayed result the
# index is user_id and the columns are item_id; unrated pairs show as NaN.
utility_matrix = re.construct_utility_matrix()
utility_matrix
| item_id | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| user_id | |||||||||||||||||||||
| 0 | 2.0 | NaN | NaN | 5.0 | 4.0 | 4.0 | 3.0 | 4.0 | 4.0 | 3.0 | ... | 4.0 | NaN | 1.0 | 1.0 | NaN | 4.0 | 5.0 | 5.0 | 1.0 | 5.0 |
| 1 | 1.0 | 3.0 | 5.0 | 2.0 | NaN | 5.0 | 3.0 | 3.0 | 1.0 | NaN | ... | 4.0 | 4.0 | 3.0 | NaN | 2.0 | 1.0 | 4.0 | 1.0 | 3.0 | 4.0 |
| 2 | 2.0 | 1.0 | 2.0 | 4.0 | 5.0 | 2.0 | 2.0 | 1.0 | 1.0 | 5.0 | ... | 5.0 | 2.0 | 5.0 | 4.0 | 2.0 | 4.0 | 3.0 | NaN | 2.0 | 2.0 |
| 3 | 5.0 | 5.0 | 4.0 | 5.0 | 2.0 | 4.0 | 1.0 | 3.0 | 1.0 | 4.0 | ... | 4.0 | 2.0 | 3.0 | 2.0 | 4.0 | 3.0 | 4.0 | 4.0 | 1.0 | 5.0 |
| 4 | 4.0 | 1.0 | 2.0 | 3.0 | 2.0 | 2.0 | NaN | 5.0 | NaN | 4.0 | ... | 1.0 | NaN | NaN | 4.0 | NaN | 3.0 | 3.0 | 2.0 | NaN | NaN |
| 5 | 1.0 | 5.0 | NaN | 4.0 | 2.0 | 4.0 | 3.0 | NaN | 2.0 | NaN | ... | 2.0 | 4.0 | 1.0 | 1.0 | 1.0 | 5.0 | 4.0 | 1.0 | 3.0 | 3.0 |
| 6 | 4.0 | 2.0 | 2.0 | 4.0 | 1.0 | 5.0 | 1.0 | 3.0 | 3.0 | NaN | ... | 1.0 | 5.0 | 5.0 | 3.0 | 2.0 | 2.0 | 4.0 | 3.0 | 2.0 | NaN |
| 7 | NaN | 3.0 | 2.0 | 2.0 | NaN | 5.0 | 5.0 | 5.0 | 4.0 | 5.0 | ... | 1.0 | 2.0 | 4.0 | 3.0 | 2.0 | 3.0 | 1.0 | 1.0 | 2.0 | NaN |
| 8 | 1.0 | NaN | 2.0 | 2.0 | 1.0 | 2.0 | 2.0 | 2.0 | NaN | 3.0 | ... | 5.0 | 2.0 | 3.0 | 2.0 | 2.0 | 4.0 | 3.0 | 1.0 | 1.0 | NaN |
| 9 | 2.0 | 1.0 | 3.0 | 3.0 | 4.0 | 2.0 | NaN | NaN | 2.0 | 2.0 | ... | 5.0 | 1.0 | 4.0 | 2.0 | 3.0 | NaN | 2.0 | NaN | 5.0 | 2.0 |
10 rows × 100 columns
Train model using train_model_svd¶
Create model object (load from sklearn)¶
from sklearn.ensemble import RandomForestRegressor
# Regressor handed to the recommender below; random_state is fixed so the
# forest itself is reproducible across runs.
rs_model1 = RandomForestRegressor(random_state=202109)
Train model¶
# Re-seed NumPy's global RNG right before training for reproducibility.
np.random.seed(202109)
# With return_models=False the call yields only the imputed utility matrix
# (displayed in the next cell). d=10 is presumably the number of reduced
# dimensions kept by the SVD step -- TODO confirm against train_model_svd.
utility_matrix_imputed = re.train_model_svd(
re.utility_matrix, rs_model1, d=10, return_models=False)
Done training 100 out of 100
Prediction¶
# In the displayed rows, entries that were NaN in utility_matrix now hold
# predicted ratings, while the observed ratings appear unchanged.
utility_matrix_imputed
| item_id | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| user_id | |||||||||||||||||||||
| 0 | 2.00 | 2.99 | 3.01 | 5.0 | 4.00 | 4.0 | 3.00 | 4.00 | 4.00 | 3.00 | ... | 4.0 | 2.50 | 1.00 | 1.00 | 2.81 | 4.00 | 5.0 | 5.0 | 1.00 | 5.00 |
| 1 | 1.00 | 3.00 | 5.00 | 2.0 | 2.31 | 5.0 | 3.00 | 3.00 | 1.00 | 3.82 | ... | 4.0 | 4.00 | 3.00 | 2.07 | 2.00 | 1.00 | 4.0 | 1.0 | 3.00 | 4.00 |
| 2 | 2.00 | 1.00 | 2.00 | 4.0 | 5.00 | 2.0 | 2.00 | 1.00 | 1.00 | 5.00 | ... | 5.0 | 2.00 | 5.00 | 4.00 | 2.00 | 4.00 | 3.0 | 1.9 | 2.00 | 2.00 |
| 3 | 5.00 | 5.00 | 4.00 | 5.0 | 2.00 | 4.0 | 1.00 | 3.00 | 1.00 | 4.00 | ... | 4.0 | 2.00 | 3.00 | 2.00 | 4.00 | 3.00 | 4.0 | 4.0 | 1.00 | 5.00 |
| 4 | 4.00 | 1.00 | 2.00 | 3.0 | 2.00 | 2.0 | 2.33 | 5.00 | 2.44 | 4.00 | ... | 1.0 | 2.76 | 2.63 | 4.00 | 2.18 | 3.00 | 3.0 | 2.0 | 1.75 | 3.61 |
| 5 | 1.00 | 5.00 | 3.09 | 4.0 | 2.00 | 4.0 | 3.00 | 2.86 | 2.00 | 3.55 | ... | 2.0 | 4.00 | 1.00 | 1.00 | 1.00 | 5.00 | 4.0 | 1.0 | 3.00 | 3.00 |
| 6 | 4.00 | 2.00 | 2.00 | 4.0 | 1.00 | 5.0 | 1.00 | 3.00 | 3.00 | 3.21 | ... | 1.0 | 5.00 | 5.00 | 3.00 | 2.00 | 2.00 | 4.0 | 3.0 | 2.00 | 3.61 |
| 7 | 2.71 | 3.00 | 2.00 | 2.0 | 2.91 | 5.0 | 5.00 | 5.00 | 4.00 | 5.00 | ... | 1.0 | 2.00 | 4.00 | 3.00 | 2.00 | 3.00 | 1.0 | 1.0 | 2.00 | 3.66 |
| 8 | 1.00 | 2.91 | 2.00 | 2.0 | 1.00 | 2.0 | 2.00 | 2.00 | 2.23 | 3.00 | ... | 5.0 | 2.00 | 3.00 | 2.00 | 2.00 | 4.00 | 3.0 | 1.0 | 1.00 | 3.05 |
| 9 | 2.00 | 1.00 | 3.00 | 3.0 | 4.00 | 2.0 | 2.56 | 3.32 | 2.00 | 2.00 | ... | 5.0 | 1.00 | 4.00 | 2.00 | 3.00 | 3.53 | 2.0 | 2.3 | 5.00 | 2.00 |
10 rows × 100 columns
Train iterative model using fit¶
Create model object (load from sklearn)¶
Train model¶
# A second regressor configured with the same seed as rs_model1.
rs_model2 = RandomForestRegressor(random_state=202109)
# Train through the fit() entry point; the predictions are then read from
# re.utility_matrix_preds in the next cell. d=10 presumably has the same
# meaning as in train_model_svd -- TODO confirm.
re.fit(rs_model2, method='svd', d=10)
Done training 100 out of 100
Prediction¶
# Predicted utility matrix read back from the model object after fit();
# the displayed rows contain no NaN entries.
re.utility_matrix_preds
| item_id | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| user_id | |||||||||||||||||||||
| 0 | 2.00 | 3.05 | 3.00 | 5.0 | 4.00 | 4.0 | 3.00 | 4.00 | 4.00 | 3.00 | ... | 4.0 | 2.54 | 1.00 | 1.00 | 2.80 | 4.00 | 5.0 | 5.00 | 1.00 | 5.00 |
| 1 | 1.00 | 3.00 | 5.00 | 2.0 | 2.41 | 5.0 | 3.00 | 3.00 | 1.00 | 3.83 | ... | 4.0 | 4.00 | 3.00 | 2.06 | 2.00 | 1.00 | 4.0 | 1.00 | 3.00 | 4.00 |
| 2 | 2.00 | 1.00 | 2.00 | 4.0 | 5.00 | 2.0 | 2.00 | 1.00 | 1.00 | 5.00 | ... | 5.0 | 2.00 | 5.00 | 4.00 | 2.00 | 4.00 | 3.0 | 1.93 | 2.00 | 2.00 |
| 3 | 5.00 | 5.00 | 4.00 | 5.0 | 2.00 | 4.0 | 1.00 | 3.00 | 1.00 | 4.00 | ... | 4.0 | 2.00 | 3.00 | 2.00 | 4.00 | 3.00 | 4.0 | 4.00 | 1.00 | 5.00 |
| 4 | 4.00 | 1.00 | 2.00 | 3.0 | 2.00 | 2.0 | 2.35 | 5.00 | 2.40 | 4.00 | ... | 1.0 | 2.73 | 2.63 | 4.00 | 2.18 | 3.00 | 3.0 | 2.00 | 1.75 | 3.58 |
| 5 | 1.00 | 5.00 | 3.09 | 4.0 | 2.00 | 4.0 | 3.00 | 2.84 | 2.00 | 3.54 | ... | 2.0 | 4.00 | 1.00 | 1.00 | 1.00 | 5.00 | 4.0 | 1.00 | 3.00 | 3.00 |
| 6 | 4.00 | 2.00 | 2.00 | 4.0 | 1.00 | 5.0 | 1.00 | 3.00 | 3.00 | 3.19 | ... | 1.0 | 5.00 | 5.00 | 3.00 | 2.00 | 2.00 | 4.0 | 3.00 | 2.00 | 3.62 |
| 7 | 2.63 | 3.00 | 2.00 | 2.0 | 2.94 | 5.0 | 5.00 | 5.00 | 4.00 | 5.00 | ... | 1.0 | 2.00 | 4.00 | 3.00 | 2.00 | 3.00 | 1.0 | 1.00 | 2.00 | 3.67 |
| 8 | 1.00 | 2.88 | 2.00 | 2.0 | 1.00 | 2.0 | 2.00 | 2.00 | 2.23 | 3.00 | ... | 5.0 | 2.00 | 3.00 | 2.00 | 2.00 | 4.00 | 3.0 | 1.00 | 1.00 | 3.12 |
| 9 | 2.00 | 1.00 | 3.00 | 3.0 | 4.00 | 2.0 | 2.56 | 3.33 | 2.00 | 2.00 | ... | 5.0 | 1.00 | 4.00 | 2.00 | 3.00 | 3.51 | 2.0 | 2.35 | 5.00 | 2.00 |
10 rows × 100 columns
Unit test¶
import unittest
import pandas as pd
from pandas._testing import assert_index_equal
from pandas._testing import assert_frame_equal
class TestTrainSVDModel(unittest.TestCase):
    """Smoke tests for ``CollabFilteringModel.train_model_svd`` and ``fit``.

    Both tests run on the same small 6-user x 4-movie rating table. The
    fixture is rebuilt in ``setUp`` for every test so the tests stay
    independent of each other.
    """

    @staticmethod
    def _movie_transactions():
        """Return the toy ratings as ``user_id`` / ``item_id`` / ``rating`` rows.

        The wide user-by-movie table is melted into long format and the
        unrated (NaN) cells are dropped, so only observed ratings remain.
        """
        movies_df = pd.DataFrame(
            {'M1': [2, None, 3, None, 1, 5],
             'M2': [None, 1, None, 5, 1, None],
             'M3': [5, 4, 1, 1, 4, None],
             'M4': [5, 4, None, None, None, 1]},
            index=[f'U{i+1}' for i in range(6)])
        movie_transactions = pd.melt(
            movies_df.reset_index(), id_vars=['index'],
            value_vars=movies_df.columns).dropna().reset_index(drop=True)
        movie_transactions.columns = ["user_id", 'item_id', 'rating']
        return movie_transactions

    @staticmethod
    def _new_regressor():
        """Return a fresh, seeded random forest regressor."""
        # Imported locally so the test class does not rely on a name that
        # happens to be defined by an earlier notebook cell.
        from sklearn.ensemble import RandomForestRegressor
        return RandomForestRegressor(random_state=202109)

    def setUp(self):
        """Create the model under test and its utility matrix."""
        from collab_filtering import CollabFilteringModel
        self.re_test = CollabFilteringModel(self._movie_transactions())
        self.um = self.re_test.construct_utility_matrix()

    def test_train_model_svd(self):
        """Check the size of the result for each ``return_models`` setting."""
        re_test, um = self.re_test, self.um
        rf = self._new_regressor()
        # Default call and return_models=True both yield a result of length 2
        # (presumably the imputed matrix plus the trained models).
        self.assertEqual(len(re_test.train_model_svd(um, rf)), 2)
        self.assertEqual(len(re_test.train_model_svd(
            um, rf, return_models=True)), 2)
        # With return_models=False the result has as many rows as the
        # utility matrix.
        self.assertEqual(len(re_test.train_model_svd(
            um, rf, return_models=False)), len(um))

    def test_fit(self):
        """Check that ``fit`` leaves no missing values in the predictions."""
        re_test, um = self.re_test, self.um

        rf = self._new_regressor()
        re_test.fit(rf, method='svd', d=2)
        um_preds = re_test.utility_matrix_preds
        self.assertFalse(um_preds.isnull().any().any())

        # Fit again on the same model object, this time keeping the models:
        # one trained model is expected per utility matrix column.
        rf = self._new_regressor()
        re_test.fit(rf, method='svd', d=2, return_models=True)
        um_preds = re_test.utility_matrix_preds
        self.assertFalse(um_preds.isnull().any().any())
        self.assertEqual(len(re_test.trained_models), um.shape[1])
# argv=[''] keeps unittest from parsing the notebook kernel's own command-line
# arguments, and exit=False stops it from calling sys.exit() after the run.
unittest.main(argv=[''], verbosity=2, exit=False)
test_fit (__main__.TestTrainSVDModel) ...
Done training 4 out of 4
ok
test_train_model_svd (__main__.TestTrainSVDModel) ...
Done training 4 out of 4
Done training 4 out of 4
Done training 4 out of 4
Done training 4 out of 4
ok
----------------------------------------------------------------------
Ran 2 tests in 2.111s
OK
<unittest.main.TestProgram at 0x7f06b4368c40>