Demo: ML-based Collaborative Filtering on Utility Matrix with Reduced Dimensions

import pandas as pd
import numpy as np
import sys
sys.path.insert(1, '../resype')
%load_ext autoreload
%autoreload 2 

Prepare data

# Build a reproducible synthetic ratings table: every (user, item) pair gets
# one rating drawn at random from {1, 2, 3, 4, 5, NaN}.
np.random.seed(202109)
rating_vals = np.append(np.arange(1, 6), np.nan)
userids = np.arange(10)
itemids = np.arange(100)
n_users, n_items = len(userids), len(itemids)
random_ratings = np.random.choice(rating_vals, size=n_users * n_items)

# Rows are laid out user-major: each user id is repeated once per item while
# the full range of item ids is cycled once per user.
transactions = pd.DataFrame({
    'user_id': np.repeat(userids, n_items),
    'item_id': np.tile(itemids, n_users),
    'rating': random_ratings,
})
transactions = transactions.drop_duplicates()
transactions
user_id item_id rating
0 0 0 2.0
1 0 1 NaN
2 0 2 NaN
3 0 3 5.0
4 0 4 4.0
... ... ... ...
995 9 95 NaN
996 9 96 2.0
997 9 97 NaN
998 9 98 5.0
999 9 99 2.0

1000 rows × 3 columns

Load resype

# CollabFilteringModel is imported from the local resype sources that were
# put on sys.path at the top of the notebook.
from collab_filtering import CollabFilteringModel
# NOTE(review): the name `re` would shadow the standard-library `re` module if
# that module were ever imported here; it is kept because later cells use it.
re = CollabFilteringModel(transactions)
# Build the user-by-item utility matrix from the transactions table; the same
# matrix is read back later through `re.utility_matrix`.
utility_matrix = re.construct_utility_matrix()
utility_matrix
item_id 0 1 2 3 4 5 6 7 8 9 ... 90 91 92 93 94 95 96 97 98 99
user_id
0 2.0 NaN NaN 5.0 4.0 4.0 3.0 4.0 4.0 3.0 ... 4.0 NaN 1.0 1.0 NaN 4.0 5.0 5.0 1.0 5.0
1 1.0 3.0 5.0 2.0 NaN 5.0 3.0 3.0 1.0 NaN ... 4.0 4.0 3.0 NaN 2.0 1.0 4.0 1.0 3.0 4.0
2 2.0 1.0 2.0 4.0 5.0 2.0 2.0 1.0 1.0 5.0 ... 5.0 2.0 5.0 4.0 2.0 4.0 3.0 NaN 2.0 2.0
3 5.0 5.0 4.0 5.0 2.0 4.0 1.0 3.0 1.0 4.0 ... 4.0 2.0 3.0 2.0 4.0 3.0 4.0 4.0 1.0 5.0
4 4.0 1.0 2.0 3.0 2.0 2.0 NaN 5.0 NaN 4.0 ... 1.0 NaN NaN 4.0 NaN 3.0 3.0 2.0 NaN NaN
5 1.0 5.0 NaN 4.0 2.0 4.0 3.0 NaN 2.0 NaN ... 2.0 4.0 1.0 1.0 1.0 5.0 4.0 1.0 3.0 3.0
6 4.0 2.0 2.0 4.0 1.0 5.0 1.0 3.0 3.0 NaN ... 1.0 5.0 5.0 3.0 2.0 2.0 4.0 3.0 2.0 NaN
7 NaN 3.0 2.0 2.0 NaN 5.0 5.0 5.0 4.0 5.0 ... 1.0 2.0 4.0 3.0 2.0 3.0 1.0 1.0 2.0 NaN
8 1.0 NaN 2.0 2.0 1.0 2.0 2.0 2.0 NaN 3.0 ... 5.0 2.0 3.0 2.0 2.0 4.0 3.0 1.0 1.0 NaN
9 2.0 1.0 3.0 3.0 4.0 2.0 NaN NaN 2.0 2.0 ... 5.0 1.0 4.0 2.0 3.0 NaN 2.0 NaN 5.0 2.0

10 rows × 100 columns

Train model using train_model_svd

Create model object (load from sklearn)

from sklearn.ensemble import RandomForestRegressor
# A fixed random_state makes the forest's own randomness repeatable.
rs_model1 = RandomForestRegressor(random_state=202109)

Train model

# Re-seed NumPy's global RNG right before training (presumably so that any
# NumPy-based randomness inside train_model_svd is repeatable -- TODO confirm).
np.random.seed(202109)
# With return_models=False the call's result is used directly as the imputed
# utility matrix. d=10 is presumably the number of reduced (SVD) dimensions --
# confirm against CollabFilteringModel.train_model_svd.
utility_matrix_imputed = re.train_model_svd(
    re.utility_matrix, rs_model1, d=10, return_models=False)
Done training 100 out of 100

Prediction

# Show the matrix returned by train_model_svd in the previous cell.
utility_matrix_imputed
item_id 0 1 2 3 4 5 6 7 8 9 ... 90 91 92 93 94 95 96 97 98 99
user_id
0 2.00 2.99 3.01 5.0 4.00 4.0 3.00 4.00 4.00 3.00 ... 4.0 2.50 1.00 1.00 2.81 4.00 5.0 5.0 1.00 5.00
1 1.00 3.00 5.00 2.0 2.31 5.0 3.00 3.00 1.00 3.82 ... 4.0 4.00 3.00 2.07 2.00 1.00 4.0 1.0 3.00 4.00
2 2.00 1.00 2.00 4.0 5.00 2.0 2.00 1.00 1.00 5.00 ... 5.0 2.00 5.00 4.00 2.00 4.00 3.0 1.9 2.00 2.00
3 5.00 5.00 4.00 5.0 2.00 4.0 1.00 3.00 1.00 4.00 ... 4.0 2.00 3.00 2.00 4.00 3.00 4.0 4.0 1.00 5.00
4 4.00 1.00 2.00 3.0 2.00 2.0 2.33 5.00 2.44 4.00 ... 1.0 2.76 2.63 4.00 2.18 3.00 3.0 2.0 1.75 3.61
5 1.00 5.00 3.09 4.0 2.00 4.0 3.00 2.86 2.00 3.55 ... 2.0 4.00 1.00 1.00 1.00 5.00 4.0 1.0 3.00 3.00
6 4.00 2.00 2.00 4.0 1.00 5.0 1.00 3.00 3.00 3.21 ... 1.0 5.00 5.00 3.00 2.00 2.00 4.0 3.0 2.00 3.61
7 2.71 3.00 2.00 2.0 2.91 5.0 5.00 5.00 4.00 5.00 ... 1.0 2.00 4.00 3.00 2.00 3.00 1.0 1.0 2.00 3.66
8 1.00 2.91 2.00 2.0 1.00 2.0 2.00 2.00 2.23 3.00 ... 5.0 2.00 3.00 2.00 2.00 4.00 3.0 1.0 1.00 3.05
9 2.00 1.00 3.00 3.0 4.00 2.0 2.56 3.32 2.00 2.00 ... 5.0 1.00 4.00 2.00 3.00 3.53 2.0 2.3 5.00 2.00

10 rows × 100 columns

Train iterative model using fit

Create model object (load from sklearn)

Train model

# A second, untrained regressor with the same seed as rs_model1.
rs_model2 = RandomForestRegressor(random_state=202109)
# The result of fit() is not captured here; the predictions are read back from
# `re.utility_matrix_preds` in the next cell.
re.fit(rs_model2, method='svd', d=10)
Done training 100 out of 100

Prediction

# Show the predictions stored on the model by the fit() call above.
re.utility_matrix_preds
item_id 0 1 2 3 4 5 6 7 8 9 ... 90 91 92 93 94 95 96 97 98 99
user_id
0 2.00 3.05 3.00 5.0 4.00 4.0 3.00 4.00 4.00 3.00 ... 4.0 2.54 1.00 1.00 2.80 4.00 5.0 5.00 1.00 5.00
1 1.00 3.00 5.00 2.0 2.41 5.0 3.00 3.00 1.00 3.83 ... 4.0 4.00 3.00 2.06 2.00 1.00 4.0 1.00 3.00 4.00
2 2.00 1.00 2.00 4.0 5.00 2.0 2.00 1.00 1.00 5.00 ... 5.0 2.00 5.00 4.00 2.00 4.00 3.0 1.93 2.00 2.00
3 5.00 5.00 4.00 5.0 2.00 4.0 1.00 3.00 1.00 4.00 ... 4.0 2.00 3.00 2.00 4.00 3.00 4.0 4.00 1.00 5.00
4 4.00 1.00 2.00 3.0 2.00 2.0 2.35 5.00 2.40 4.00 ... 1.0 2.73 2.63 4.00 2.18 3.00 3.0 2.00 1.75 3.58
5 1.00 5.00 3.09 4.0 2.00 4.0 3.00 2.84 2.00 3.54 ... 2.0 4.00 1.00 1.00 1.00 5.00 4.0 1.00 3.00 3.00
6 4.00 2.00 2.00 4.0 1.00 5.0 1.00 3.00 3.00 3.19 ... 1.0 5.00 5.00 3.00 2.00 2.00 4.0 3.00 2.00 3.62
7 2.63 3.00 2.00 2.0 2.94 5.0 5.00 5.00 4.00 5.00 ... 1.0 2.00 4.00 3.00 2.00 3.00 1.0 1.00 2.00 3.67
8 1.00 2.88 2.00 2.0 1.00 2.0 2.00 2.00 2.23 3.00 ... 5.0 2.00 3.00 2.00 2.00 4.00 3.0 1.00 1.00 3.12
9 2.00 1.00 3.00 3.0 4.00 2.0 2.56 3.33 2.00 2.00 ... 5.0 1.00 4.00 2.00 3.00 3.51 2.0 2.35 5.00 2.00

10 rows × 100 columns

Unit test

import unittest
import pandas as pd
from pandas._testing import assert_index_equal
from pandas._testing import assert_frame_equal


class TestTrainSVDModel(unittest.TestCase):
    """Smoke tests for the SVD-based training entry points.

    Both tests run against the same 6-user x 4-movie ratings table, which
    is rebuilt in ``setUp`` so that every test starts from a fresh
    ``CollabFilteringModel``.

    The test methods carry ``#`` comments rather than docstrings on
    purpose: at ``verbosity=2`` unittest prints the first docstring line
    in place of the test name, which would change the reported output.
    """

    # Seed shared by every regressor created in these tests.
    RANDOM_STATE = 202109

    def setUp(self):
        """Create the ratings fixture, the model and its utility matrix."""
        # Imported lazily so that merely defining this class does not
        # require the project module to be importable.
        from collab_filtering import CollabFilteringModel

        movies_df = pd.DataFrame({'M1': [2, None, 3, None, 1, 5],
                                  'M2': [None, 1, None, 5, 1, None],
                                  'M3': [5, 4, 1, 1, 4, None],
                                  'M4': [5, 4, None, None, None, 1]},
                                 index=[f'U{i+1}' for i in range(6)])
        # Wide -> long: one row per (user, movie) pair that has a rating.
        movie_transactions = pd.melt(
            movies_df.reset_index(), id_vars=['index'],
            value_vars=movies_df.columns).dropna().reset_index(drop=True)
        movie_transactions.columns = ["user_id", 'item_id', 'rating']

        self.re_test = CollabFilteringModel(movie_transactions)
        self.um = self.re_test.construct_utility_matrix()

    def _new_regressor(self):
        """Return an untrained random forest seeded with RANDOM_STATE."""
        # Imported here so the tests do not depend on an import that
        # happens to have been executed in another notebook cell.
        from sklearn.ensemble import RandomForestRegressor
        return RandomForestRegressor(random_state=self.RANDOM_STATE)

    def test_train_model_svd(self):
        # Only the length of the returned object is checked for each
        # setting of `return_models`.
        re_test, um = self.re_test, self.um
        rf = self._new_regressor()

        # The default call and return_models=True must both give a
        # 2-element result (presumably predictions plus trained models --
        # confirm against CollabFilteringModel.train_model_svd).
        self.assertEqual(len(re_test.train_model_svd(um, rf)), 2)
        self.assertEqual(len(re_test.train_model_svd(
            um, rf, return_models=True)), 2)
        # With return_models=False the result has one entry per row of
        # the utility matrix.
        self.assertEqual(len(re_test.train_model_svd(
            um, rf, return_models=False)), len(um))

    def test_fit(self):
        # fit() must leave no missing values in utility_matrix_preds and,
        # when asked to keep models, store one per utility-matrix column.
        re_test, um = self.re_test, self.um

        rf = self._new_regressor()
        re_test.fit(rf, method='svd', d=2)
        um_preds = re_test.utility_matrix_preds
        self.assertFalse(um_preds.isnull().any().any())

        # Second run with a fresh regressor, this time keeping the models.
        rf = self._new_regressor()
        re_test.fit(rf, method='svd', d=2, return_models=True)
        um_preds = re_test.utility_matrix_preds
        self.assertFalse(um_preds.isnull().any().any())
        self.assertEqual(len(re_test.trained_models), um.shape[1])


# Run the tests inside the notebook: argv=[''] stops unittest from parsing the
# kernel's own command-line arguments, and exit=False stops it from calling
# sys.exit() once the run finishes.
unittest.main(argv=[''], verbosity=2, exit=False)
test_fit (__main__.TestTrainSVDModel) ... 
Done training 4 out of 4
ok
test_train_model_svd (__main__.TestTrainSVDModel) ... 
Done training 4 out of 4
Done training 4 out of 4
Done training 4 out of 4
Done training 4 out of 4
ok

----------------------------------------------------------------------
Ran 2 tests in 2.111s

OK
<unittest.main.TestProgram at 0x7f06b4368c40>