Demo: Iterative Approach to ML-based Item-wise Collaborative Filtering¶
import pandas as pd
import numpy as np
import sys
sys.path.insert(1, '../resype')
%load_ext autoreload
%autoreload 2
Prepare data¶
# Toy ratings table: one row per user (U1..U6), one column per movie
# (M1..M4). None marks a movie the user has not rated.
movies_df = pd.DataFrame(
    data={
        'M1': [2, None, 3, None, 1, 5],
        'M2': [None, 1, None, 5, 1, None],
        'M3': [5, 4, 1, 1, 4, None],
        'M4': [5, 4, None, None, None, 1],
    },
    index=[f'U{n}' for n in range(1, 7)],
)
movies_df
| M1 | M2 | M3 | M4 | |
|---|---|---|---|---|
| U1 | 2.0 | NaN | 5.0 | 5.0 |
| U2 | NaN | 1.0 | 4.0 | 4.0 |
| U3 | 3.0 | NaN | 1.0 | NaN |
| U4 | NaN | 5.0 | 1.0 | NaN |
| U5 | 1.0 | 1.0 | 4.0 | NaN |
| U6 | 5.0 | NaN | NaN | 1.0 |
# Reshape the wide rating table into one (user, item, rating) row per
# observed rating: melt to long form, drop the unrated pairs, then
# renumber the rows from zero.
movie_transactions = (
    movies_df.reset_index()
    .melt(id_vars=['index'], value_vars=movies_df.columns)
    .dropna()
    .reset_index(drop=True)
)
movie_transactions.columns = ["user_id", 'item_id', 'rating']
movie_transactions
| user_id | item_id | rating | |
|---|---|---|---|
| 0 | U1 | M1 | 2.0 |
| 1 | U3 | M1 | 3.0 |
| 2 | U5 | M1 | 1.0 |
| 3 | U6 | M1 | 5.0 |
| 4 | U2 | M2 | 1.0 |
| 5 | U4 | M2 | 5.0 |
| 6 | U5 | M2 | 1.0 |
| 7 | U1 | M3 | 5.0 |
| 8 | U2 | M3 | 4.0 |
| 9 | U3 | M3 | 1.0 |
| 10 | U4 | M3 | 1.0 |
| 11 | U5 | M3 | 4.0 |
| 12 | U1 | M4 | 5.0 |
| 13 | U2 | M4 | 4.0 |
| 14 | U6 | M4 | 1.0 |
Load resype¶
# CollabFilteringModel is presumably provided by the '../resype' directory
# that was put on sys.path earlier in this notebook.
from collab_filtering import CollabFilteringModel
# NOTE(review): the name `re` shadows the stdlib `re` module name; it is
# kept because later cells refer to this variable.
re = CollabFilteringModel(movie_transactions)
# Build the utility matrix from the transactions; the displayed result has
# users as rows and items as columns.
utility_matrix = re.construct_utility_matrix()
utility_matrix
| item_id | M1 | M2 | M3 | M4 |
|---|---|---|---|---|
| user_id | ||||
| U1 | 2.0 | NaN | 5.0 | 5.0 |
| U2 | NaN | 1.0 | 4.0 | 4.0 |
| U3 | 3.0 | NaN | 1.0 | NaN |
| U4 | NaN | 5.0 | 1.0 | NaN |
| U5 | 1.0 | 1.0 | 4.0 | NaN |
| U6 | 5.0 | NaN | NaN | 1.0 |
Train iterative model using train_model_iterative¶
Create model object (load from sklearn)¶
# Estimator passed to the training calls below; the fixed random_state
# keeps the forest reproducible between runs.
from sklearn.ensemble import RandomForestRegressor
rs_model1 = RandomForestRegressor(random_state=202109)
Train model¶
# Inspect the utility matrix stored on the model before training.
re.utility_matrix
| item_id | M1 | M2 | M3 | M4 |
|---|---|---|---|---|
| user_id | ||||
| U1 | 2.0 | NaN | 5.0 | 5.0 |
| U2 | NaN | 1.0 | 4.0 | 4.0 |
| U3 | 3.0 | NaN | 1.0 | NaN |
| U4 | NaN | 5.0 | 1.0 | NaN |
| U5 | 1.0 | 1.0 | 4.0 | NaN |
| U6 | 5.0 | NaN | NaN | 1.0 |
# Train on the stored utility matrix with rs_model1 as the estimator.
# With return_models=True the call is unpacked into three values: the
# imputed utility matrix, the metrics, and the trained model(s).
utility_matrix_imputed, metrics, trained_model = re.train_model_iterative(
re.utility_matrix, rs_model1, return_models=True)
Prediction¶
# Imputed matrix returned by the training call. Judging by the displayed
# output, the values are centered on each user's mean, not on the original
# rating scale.
utility_matrix_imputed
| item_id | M1 | M2 | M3 | M4 |
|---|---|---|---|---|
| user_id | ||||
| U1 | -2.00 | -1.45 | 1.00 | 1.00 |
| U2 | -1.45 | -2.00 | 1.00 | 1.00 |
| U3 | 1.00 | 1.14 | -1.00 | -1.16 |
| U4 | 1.22 | 2.00 | -2.00 | -1.16 |
| U5 | -1.00 | -1.00 | 2.00 | 0.91 |
| U6 | 2.00 | 1.14 | -1.15 | -2.00 |
Convert back to the scale of the original ratings (add each user's mean rating)¶
# Add each row's (user's) mean observed rating back to that row of the
# imputed matrix; mean(axis=1) is the per-row mean and axis=0 aligns it
# on the row index.
utility_matrix_imputed.add(re.utility_matrix.mean(axis=1), axis=0)
| item_id | M1 | M2 | M3 | M4 |
|---|---|---|---|---|
| user_id | ||||
| U1 | 2.00 | 2.55 | 5.00 | 5.00 |
| U2 | 1.55 | 1.00 | 4.00 | 4.00 |
| U3 | 3.00 | 3.14 | 1.00 | 0.84 |
| U4 | 4.22 | 5.00 | 1.00 | 1.84 |
| U5 | 1.00 | 1.00 | 4.00 | 2.91 |
| U6 | 5.00 | 4.14 | 1.85 | 1.00 |
Train iterative model using fit¶
Create model object (reuse rs_model1 created above)¶
Train model¶
# Same estimator (rs_model1), trained through the fit() entry point with
# method='iterative'. The cells that follow read the results from
# attributes on `re` (utility_matrix_preds, trained_models).
re.fit(rs_model1, method='iterative', return_models=True)
Prediction¶
# Predicted utility matrix read back from the model after fit().
re.utility_matrix_preds
| item_id | M1 | M2 | M3 | M4 |
|---|---|---|---|---|
| user_id | ||||
| U1 | -2.00 | -1.45 | 1.00 | 1.00 |
| U2 | -1.45 | -2.00 | 1.00 | 1.00 |
| U3 | 1.00 | 1.14 | -1.00 | -1.16 |
| U4 | 1.22 | 2.00 | -2.00 | -1.16 |
| U5 | -1.00 | -1.00 | 2.00 | 0.91 |
| U6 | 2.00 | 1.14 | -1.15 | -2.00 |
Trained models¶
# Trained models read back from the model after fit(); the displayed
# result is a dict with one estimator per item column.
re.trained_models
{'M1': RandomForestRegressor(random_state=202109),
'M2': RandomForestRegressor(random_state=202109),
'M3': RandomForestRegressor(random_state=202109),
'M4': RandomForestRegressor(random_state=202109)}
Unit test¶
import unittest
import pandas as pd
from pandas._testing import assert_index_equal
from pandas._testing import assert_frame_equal
class TestTrainIterativeModel(unittest.TestCase):
    """Tests for the iterative-training API of ``CollabFilteringModel``.

    Every test works on a fresh model built in ``setUp`` from the small
    ratings fixture defined on this class, so the tests do not rely on
    variables that happen to exist in the surrounding notebook session.
    """

    # Imported in the class body, so it is reachable from the test methods
    # as ``self.CollabFilteringModel``.
    from collab_filtering import CollabFilteringModel

    # Wide ratings table (users x movies); None marks an unrated movie.
    movies_df = pd.DataFrame(
        {'M1': [2, None, 3, None, 1, 5],
         'M2': [None, 1, None, 5, 1, None],
         'M3': [5, 4, 1, 1, 4, None],
         'M4': [5, 4, None, None, None, 1]},
        index=[f'U{i+1}' for i in range(6)])

    # Long-form (user_id, item_id, rating) rows for the observed ratings.
    movie_transactions = pd.melt(
        movies_df.reset_index(), id_vars=['index'],
        value_vars=movies_df.columns).dropna().reset_index(drop=True)
    movie_transactions.columns = ["user_id", 'item_id', 'rating']

    @staticmethod
    def _make_forest():
        """Return a fresh random forest with the fixed seed used here."""
        # Local import keeps the test class independent of earlier cells.
        from sklearn.ensemble import RandomForestRegressor
        return RandomForestRegressor(random_state=202109)

    def setUp(self):
        """Build a fresh model, its utility matrix and an estimator."""
        self.re_test = self.CollabFilteringModel(self.movie_transactions)
        self.um = self.re_test.construct_utility_matrix()
        self.rf = self._make_forest()

    def test_initialize_models_itemwise(self):
        """One model per item column; keys are item names plus a suffix."""
        re_test, um, rf = self.re_test, self.um, self.rf
        default_models = re_test.initialize_models_itemwise(um, rf)
        bare_models = re_test.initialize_models_itemwise(um, rf, suffix="")
        self.assertEqual(len(default_models), um.shape[1])
        self.assertListEqual(sorted(um.columns), sorted(bare_models.keys()))
        self.assertEqual(sorted(um.columns)[0] + 'model',
                         sorted(default_models.keys())[0])

    def test_initialize_models_userwise(self):
        """One model per user row; keys are user names plus a suffix."""
        re_test, um, rf = self.re_test, self.um, self.rf
        default_models = re_test.initialize_models_userwise(um, rf)
        bare_models = re_test.initialize_models_userwise(um, rf, suffix="")
        self.assertEqual(len(default_models), um.shape[0])
        self.assertListEqual(sorted(um.index), sorted(bare_models.keys()))
        self.assertEqual(sorted(um.index)[0] + 'model',
                         sorted(default_models.keys())[0])

    def test_eval_convergence_criterion(self):
        """Check the returned metric ([0]) and the stop flag ([1])."""
        from sklearn.metrics import mean_squared_error
        # Called on the instance built in setUp rather than on a
        # notebook-level variable, so the test does not depend on
        # whatever state the kernel currently holds.
        re_test = self.re_test
        pred_curr = [0, 0, 1]
        pred_prev = [1, 0, 1]
        pred_curr2 = [0, 0, 0.5]
        self.assertAlmostEqual(
            mean_squared_error(pred_curr, pred_prev),
            re_test.eval_convergence_criterion(
                pred_curr, pred_prev, stopping_criterion='mse')[0])
        self.assertFalse(re_test.eval_convergence_criterion(
            pred_curr, pred_prev, stopping_criterion='mse')[1])
        self.assertFalse(re_test.eval_convergence_criterion(
            pred_curr, pred_prev, stopping_criterion='mse',
            mse_threshold=0.1)[1])
        self.assertTrue(re_test.eval_convergence_criterion(
            pred_curr, pred_prev, stopping_criterion='mse',
            mse_threshold=0.4)[1])
        self.assertTrue(re_test.eval_convergence_criterion(
            pred_curr, pred_curr2, stopping_criterion='mse',
            mse_threshold=0.1)[1])
        self.assertTrue(re_test.eval_convergence_criterion(
            pred_curr, pred_curr2, stopping_criterion='mse')[1])
        self.assertFalse(re_test.eval_convergence_criterion(
            pred_curr, pred_prev, scaled=True, rating_max=1)[1])
        self.assertTrue(re_test.eval_convergence_criterion(
            pred_curr, pred_prev, scaled=True, rating_max=5)[1])
        self.assertEqual(0, re_test.eval_convergence_criterion(
            pred_curr, pred_curr, stopping_criterion='stdev_abs',
            stdev_threshold=0.5)[0])
        self.assertTrue(re_test.eval_convergence_criterion(
            pred_curr, pred_prev, stopping_criterion='stdev_abs',
            stdev_threshold=0.5)[1])
        self.assertFalse(re_test.eval_convergence_criterion(
            pred_curr, pred_prev, stopping_criterion='stdev_abs',
            stdev_threshold=0.4)[1])
        self.assertFalse(re_test.eval_convergence_criterion(
            pred_curr, pred_prev, stopping_criterion='stdev_abs',
            stdev_threshold=0.1, scaled=True,
            scaling_method='max', rating_max=1)[1])

    def test_train_model_iterative(self):
        """Check result arity and shapes for each ``return_models`` mode."""
        re_test, um, rf = self.re_test, self.um, self.rf
        # Train once per calling convention and reuse the result for all
        # assertions about it; training is the slow part of this suite.
        default_out = re_test.train_model_iterative(um, rf)
        with_models = re_test.train_model_iterative(
            um, rf, return_models=True)
        without_models = re_test.train_model_iterative(
            um, rf, return_models=False)

        self.assertEqual(len(default_out), 3)
        self.assertEqual(len(with_models), 3)
        self.assertEqual(len(with_models[2]), um.shape[1])
        self.assertEqual(len(without_models), 2)
        self.assertEqual(um.shape, without_models[0].shape)
        self.assertEqual(um.shape, with_models[0].shape)

    def test_fit(self):
        """``fit`` fills every cell and can keep one model per item."""
        re_test, um = self.re_test, self.um
        re_test.fit(self.rf, method='iterative')
        um_preds = re_test.utility_matrix_preds
        self.assertFalse(um_preds.isnull().any().any())

        # Second run with a fresh estimator, this time keeping the models.
        re_test.fit(self._make_forest(), method='iterative',
                    return_models=True)
        um_preds = re_test.utility_matrix_preds
        self.assertFalse(um_preds.isnull().any().any())
        self.assertEqual(len(re_test.trained_models), um.shape[1])
# Run the tests inside the notebook: argv=[''] stops unittest from parsing
# the kernel's own command-line arguments, and exit=False prevents it from
# calling sys.exit() when the run finishes.
unittest.main(argv=[''], verbosity=2, exit=False)
test_eval_convergence_criterion (__main__.TestTrainIterativeModel) ... ok
test_fit (__main__.TestTrainIterativeModel) ... ok
test_initialize_models_itemwise (__main__.TestTrainIterativeModel) ... ok
test_initialize_models_userwise (__main__.TestTrainIterativeModel) ... ok
test_train_model_iterative (__main__.TestTrainIterativeModel) ... ok
----------------------------------------------------------------------
Ran 5 tests in 7.376s
OK
<unittest.main.TestProgram at 0x7f26f01024f0>