Demo: Iterative Approach to ML-based Item-wise Collaborative Filtering

import pandas as pd
import numpy as np
import sys
sys.path.insert(1, '../resype')
%load_ext autoreload
%autoreload 2 

Prepare data

# Toy ratings table: one row per user (U1..U6), one column per movie
# (M1..M4). None marks a movie the user has not rated.
movies_df = pd.DataFrame(
    data={
        'M1': [2, None, 3, None, 1, 5],
        'M2': [None, 1, None, 5, 1, None],
        'M3': [5, 4, 1, 1, 4, None],
        'M4': [5, 4, None, None, None, 1],
    },
    index=[f'U{user_no}' for user_no in range(1, 7)],
)
movies_df
M1 M2 M3 M4
U1 2.0 NaN 5.0 5.0
U2 NaN 1.0 4.0 4.0
U3 3.0 NaN 1.0 NaN
U4 NaN 5.0 1.0 NaN
U5 1.0 1.0 4.0 NaN
U6 5.0 NaN NaN 1.0
# Reshape the wide ratings table into long (user, item, rating) records,
# discarding the user/movie pairs that have no rating.
movie_transactions = (
    movies_df.reset_index()
    .melt(id_vars=['index'], value_vars=movies_df.columns)
    .dropna()
    .reset_index(drop=True)
)
movie_transactions.columns = ["user_id", 'item_id', 'rating']
movie_transactions
user_id item_id rating
0 U1 M1 2.0
1 U3 M1 3.0
2 U5 M1 1.0
3 U6 M1 5.0
4 U2 M2 1.0
5 U4 M2 5.0
6 U5 M2 1.0
7 U1 M3 5.0
8 U2 M3 4.0
9 U3 M3 1.0
10 U4 M3 1.0
11 U5 M3 4.0
12 U1 M4 5.0
13 U2 M4 4.0
14 U6 M4 1.0

Load resype

# NOTE(review): collab_filtering is presumably found through the
# '../resype' entry inserted into sys.path at the top of the notebook.
from collab_filtering import CollabFilteringModel
# NOTE(review): the name `re` shadows the standard-library `re` module for
# the remainder of the notebook.
re = CollabFilteringModel(movie_transactions)
# Build the user-by-item utility matrix from the long-format transactions.
utility_matrix = re.construct_utility_matrix()
utility_matrix
item_id M1 M2 M3 M4
user_id
U1 2.0 NaN 5.0 5.0
U2 NaN 1.0 4.0 4.0
U3 3.0 NaN 1.0 NaN
U4 NaN 5.0 1.0 NaN
U5 1.0 1.0 4.0 NaN
U6 5.0 NaN NaN 1.0

Train iterative model using train_model_iterative

Create model object (load from sklearn)

from sklearn.ensemble import RandomForestRegressor
# A fixed random_state keeps the forest's randomness repeatable across runs.
rs_model1 = RandomForestRegressor(random_state=202109)

Train model

# Display the utility matrix stored on the model; it is the training input
# passed to train_model_iterative below.
re.utility_matrix
item_id M1 M2 M3 M4
user_id
U1 2.0 NaN 5.0 5.0
U2 NaN 1.0 4.0 4.0
U3 3.0 NaN 1.0 NaN
U4 NaN 5.0 1.0 NaN
U5 1.0 1.0 4.0 NaN
U6 5.0 NaN NaN 1.0
# With return_models=True the call is unpacked into three values: the
# imputed utility matrix, the training metrics and the trained models.
# NOTE(review): the contents of `metrics` are not shown in this notebook.
utility_matrix_imputed, metrics, trained_model = re.train_model_iterative(
    re.utility_matrix, rs_model1, return_models=True)

Prediction

# Display the imputed matrix. The values shown are offsets from each user's
# mean rating rather than ratings on the original 1-5 scale.
utility_matrix_imputed
item_id M1 M2 M3 M4
user_id
U1 -2.00 -1.45 1.00 1.00
U2 -1.45 -2.00 1.00 1.00
U3 1.00 1.14 -1.00 -1.16
U4 1.22 2.00 -2.00 -1.16
U5 -1.00 -1.00 2.00 0.91
U6 2.00 1.14 -1.15 -2.00

Convert to scale of original ratings

# Add each user's mean observed rating (row mean of the original utility
# matrix; pandas skips NaN by default) back onto that user's row. The
# result is only displayed, not stored.
utility_matrix_imputed.add(re.utility_matrix.mean(axis=1), axis=0)
item_id M1 M2 M3 M4
user_id
U1 2.00 2.55 5.00 5.00
U2 1.55 1.00 4.00 4.00
U3 3.00 3.14 1.00 0.84
U4 4.22 5.00 1.00 1.84
U5 1.00 1.00 4.00 2.91
U6 5.00 4.14 1.85 1.00

Train iterative model using fit

Create model object (load from sklearn) — this section reuses `rs_model1` created above

Train model

# Same iterative training through the fit() entry point, reusing rs_model1.
# The results are read from re.utility_matrix_preds and re.trained_models
# in the cells below.
re.fit(rs_model1, method='iterative', return_models=True)

Prediction

# Display the predictions stored on the model by fit().
re.utility_matrix_preds
item_id M1 M2 M3 M4
user_id
U1 -2.00 -1.45 1.00 1.00
U2 -1.45 -2.00 1.00 1.00
U3 1.00 1.14 -1.00 -1.16
U4 1.22 2.00 -2.00 -1.16
U5 -1.00 -1.00 2.00 0.91
U6 2.00 1.14 -1.15 -2.00

Trained models

# Display the fitted estimators stored by fit(): a dict with one regressor
# per item id.
re.trained_models
{'M1': RandomForestRegressor(random_state=202109),
 'M2': RandomForestRegressor(random_state=202109),
 'M3': RandomForestRegressor(random_state=202109),
 'M4': RandomForestRegressor(random_state=202109)}

Unit test

import unittest
import pandas as pd
from pandas._testing import assert_index_equal
from pandas._testing import assert_frame_equal


class TestTrainIterativeModel(unittest.TestCase):
    """Unit tests for the iterative-training API of ``CollabFilteringModel``.

    Every test builds its own model from the class-level fixture below, so
    the suite does not rely on objects (``re``, ``movie_transactions``,
    ``RandomForestRegressor``) that happen to exist in the notebook's
    global namespace.
    """

    from collab_filtering import CollabFilteringModel

    # Toy user-by-movie ratings; None marks a movie the user has not rated.
    movies_df = pd.DataFrame({'M1': [2, None, 3, None, 1, 5],
                              'M2': [None, 1, None, 5, 1, None],
                              'M3': [5, 4, 1, 1, 4, None],
                              'M4': [5, 4, None, None, None, 1]},
                             index=[f'U{i+1}' for i in range(6)])
    # Long-format (user_id, item_id, rating) records without missing ratings.
    movie_transactions = pd.melt(
        movies_df.reset_index(), id_vars=['index'],
        value_vars=movies_df.columns).dropna().reset_index(drop=True)
    movie_transactions.columns = ["user_id", 'item_id', 'rating']

    def _new_recommender(self):
        """Return a fresh model and its utility matrix built from the fixture."""
        # Hand over a copy so no test can alter the shared class fixture.
        model = self.CollabFilteringModel(self.movie_transactions.copy())
        return model, model.construct_utility_matrix()

    @staticmethod
    def _new_regressor():
        """Return an unfitted random forest with the seed used in the demo."""
        from sklearn.ensemble import RandomForestRegressor
        return RandomForestRegressor(random_state=202109)

    def test_initialize_models_itemwise(self):
        """One model per item; keys are item ids plus the suffix."""
        re_test, um = self._new_recommender()
        rf = self._new_regressor()
        default_models = re_test.initialize_models_itemwise(um, rf)
        bare_models = re_test.initialize_models_itemwise(um, rf, suffix="")

        self.assertEqual(len(default_models), um.shape[1])
        self.assertListEqual(sorted(um.columns), sorted(bare_models.keys()))
        self.assertEqual(sorted(um.columns)[0] + 'model',
                         sorted(default_models.keys())[0])

    def test_initialize_models_userwise(self):
        """One model per user; keys are user ids plus the suffix."""
        re_test, um = self._new_recommender()
        rf = self._new_regressor()
        default_models = re_test.initialize_models_userwise(um, rf)
        bare_models = re_test.initialize_models_userwise(um, rf, suffix="")

        self.assertEqual(len(default_models), um.shape[0])
        self.assertListEqual(sorted(um.index), sorted(bare_models.keys()))
        self.assertEqual(sorted(um.index)[0] + 'model',
                         sorted(default_models.keys())[0])

    def test_eval_convergence_criterion(self):
        """Check the returned metric ([0]) and the stop flag ([1]).

        The assertions run against the instance created in this test, not
        against a model object left over in the notebook namespace.
        """
        from sklearn.metrics import mean_squared_error
        re_test, _ = self._new_recommender()
        check = re_test.eval_convergence_criterion
        pred_curr = [0, 0, 1]
        pred_prev = [1, 0, 1]
        pred_curr2 = [0, 0, 0.5]

        # Mean-squared-error criterion.
        self.assertAlmostEqual(
            mean_squared_error(pred_curr, pred_prev),
            check(pred_curr, pred_prev, stopping_criterion='mse')[0])
        self.assertFalse(check(
            pred_curr, pred_prev, stopping_criterion='mse')[1])
        self.assertFalse(check(
            pred_curr, pred_prev, stopping_criterion='mse',
            mse_threshold=0.1)[1])
        self.assertTrue(check(
            pred_curr, pred_prev, stopping_criterion='mse',
            mse_threshold=0.4)[1])
        self.assertTrue(check(
            pred_curr, pred_curr2, stopping_criterion='mse',
            mse_threshold=0.1)[1])
        self.assertTrue(check(
            pred_curr, pred_curr2, stopping_criterion='mse')[1])

        # Scaled variant of the default criterion.
        self.assertFalse(check(
            pred_curr, pred_prev, scaled=True, rating_max=1)[1])
        self.assertTrue(check(
            pred_curr, pred_prev, scaled=True, rating_max=5)[1])

        # 'stdev_abs' criterion.
        self.assertEqual(0, check(
            pred_curr, pred_curr, stopping_criterion='stdev_abs',
            stdev_threshold=0.5)[0])
        self.assertTrue(check(
            pred_curr, pred_prev, stopping_criterion='stdev_abs',
            stdev_threshold=0.5)[1])
        self.assertFalse(check(
            pred_curr, pred_prev, stopping_criterion='stdev_abs',
            stdev_threshold=0.4)[1])
        self.assertFalse(check(
            pred_curr, pred_prev, stopping_criterion='stdev_abs',
            stdev_threshold=0.1, scaled=True,
            scaling_method='max', rating_max=1)[1])

    def test_train_model_iterative(self):
        """Check the size of the returned tuple and of its members."""
        re_test, um = self._new_recommender()
        rf = self._new_regressor()

        # Train once per calling convention and reuse the results; each
        # call refits one forest per item, so repeats are expensive.
        default_out = re_test.train_model_iterative(um, rf)
        with_models = re_test.train_model_iterative(
            um, rf, return_models=True)
        without_models = re_test.train_model_iterative(
            um, rf, return_models=False)

        self.assertEqual(len(default_out), 3)
        self.assertEqual(len(with_models), 3)
        self.assertEqual(len(with_models[2]), um.shape[1])
        self.assertEqual(len(without_models), 2)
        self.assertEqual(um.shape, without_models[0].shape)
        self.assertEqual(um.shape, with_models[0].shape)

    def test_fit(self):
        """fit() stores predictions without gaps and, on request, the models."""
        re_test, um = self._new_recommender()

        re_test.fit(self._new_regressor(), method='iterative')
        um_preds = re_test.utility_matrix_preds
        self.assertFalse(um_preds.isnull().any().any())

        re_test.fit(self._new_regressor(), method='iterative',
                    return_models=True)
        um_preds = re_test.utility_matrix_preds
        self.assertFalse(um_preds.isnull().any().any())
        self.assertEqual(len(re_test.trained_models), um.shape[1])

# Run the tests inside the notebook: argv=[''] stops unittest from parsing
# the kernel's own command-line arguments, and exit=False prevents it from
# calling sys.exit() when the run finishes.
unittest.main(argv=[''], verbosity=2, exit=False)
test_eval_convergence_criterion (__main__.TestTrainIterativeModel) ... ok
test_fit (__main__.TestTrainIterativeModel) ... ok
test_initialize_models_itemwise (__main__.TestTrainIterativeModel) ... ok
test_initialize_models_userwise (__main__.TestTrainIterativeModel) ... ok
test_train_model_iterative (__main__.TestTrainIterativeModel) ... ok

----------------------------------------------------------------------
Ran 5 tests in 7.376s

OK
<unittest.main.TestProgram at 0x7f26f01024f0>