Demo: Iterative Approach to ML-based Item-wise Collaborative Filtering Applied to Clustered Data

import pandas as pd
import numpy as np
import sys
# Add the sibling ``resype`` directory to the module search path so that
# ``collab_filtering`` can be imported below. Inserting at index 1 leaves the
# existing first entry of sys.path in front of it.
sys.path.insert(1, '../resype')
# IPython line magics: load the autoreload extension and select mode 2
# (per the IPython docs, mode 2 reloads all modules before running code).
%load_ext autoreload
%autoreload 2 

Prepare data

# Seed NumPy's global random state so the random draws made further down
# come out the same on every top-to-bottom run.
np.random.seed(202109)

# Pool of values to sample from: the integers 1 through 5 followed by NaN
# (presumably NaN stands for "no rating given" -- confirm with the model code).
# Mixing ints with NaN promotes the whole array to float.
rating_vals = np.concatenate((np.arange(1, 6), [np.nan]))
rating_vals
array([ 1.,  2.,  3.,  4.,  5., nan])
# Synthetic id ranges: 1000 users and 1000 items, both numbered from 0.
userids = np.arange(1000)
itemids = np.arange(1000)

# One random draw from rating_vals for every (user, item) combination.
random_ratings = np.random.choice(rating_vals, size=userids.size * itemids.size)

# Long-format table with one row per (user, item) pair. Each user id is
# repeated once per item, while the full item id sequence is laid end to end
# once per user, so the rows enumerate user 0 x all items, user 1 x all
# items, and so on.
transactions = pd.DataFrame({
    'user_id': np.repeat(userids, itemids.size),
    'item_id': np.tile(itemids, userids.size),
    'rating': random_ratings,
}).drop_duplicates()
transactions
user_id item_id rating
0 0 0 2.0
1 0 1 NaN
2 0 2 NaN
3 0 3 5.0
4 0 4 4.0
... ... ... ...
999995 999 995 1.0
999996 999 996 3.0
999997 999 997 NaN
999998 999 998 2.0
999999 999 999 1.0

1000000 rows × 3 columns

Load resype

from collab_filtering import CollabFilteringModel
# Build the recommender object from the long-format ratings table.
# NOTE(review): `re` is also the name of the stdlib regex module; an
# `import re` later in this session would rebind this variable.
re = CollabFilteringModel(transactions)
# Build the utility matrix from the transactions; the displayed result has
# user_id as the row index and item_id as the columns.
utility_matrix = re.construct_utility_matrix()
utility_matrix
item_id 0 1 2 3 4 5 6 7 8 9 ... 990 991 992 993 994 995 996 997 998 999
user_id
0 2.0 NaN NaN 5.0 4.0 4.0 3.0 4.0 4.0 3.0 ... 5.0 1.0 4.0 2.0 3.0 NaN 2.0 NaN 5.0 2.0
1 1.0 NaN 4.0 5.0 3.0 2.0 1.0 3.0 1.0 NaN ... 3.0 3.0 2.0 4.0 4.0 3.0 4.0 4.0 3.0 4.0
2 3.0 4.0 4.0 4.0 2.0 4.0 2.0 4.0 1.0 4.0 ... 5.0 4.0 3.0 1.0 NaN 5.0 2.0 2.0 NaN 5.0
3 5.0 2.0 1.0 NaN 2.0 4.0 3.0 3.0 NaN 1.0 ... 2.0 NaN 2.0 3.0 5.0 2.0 NaN 5.0 NaN 1.0
4 2.0 1.0 3.0 1.0 2.0 2.0 3.0 1.0 3.0 5.0 ... NaN 3.0 1.0 4.0 4.0 1.0 2.0 1.0 2.0 3.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
995 5.0 NaN NaN 2.0 1.0 5.0 1.0 2.0 2.0 3.0 ... NaN 1.0 5.0 1.0 5.0 2.0 2.0 NaN 3.0 5.0
996 1.0 5.0 5.0 2.0 5.0 4.0 1.0 NaN 1.0 5.0 ... NaN NaN 1.0 NaN NaN NaN 4.0 5.0 1.0 2.0
997 2.0 5.0 NaN 4.0 5.0 4.0 NaN 3.0 NaN 3.0 ... 1.0 4.0 1.0 5.0 4.0 3.0 NaN 5.0 2.0 2.0
998 3.0 2.0 2.0 1.0 1.0 4.0 2.0 1.0 5.0 1.0 ... 3.0 4.0 4.0 5.0 5.0 NaN 4.0 5.0 4.0 4.0
999 1.0 4.0 NaN NaN 4.0 3.0 1.0 5.0 4.0 5.0 ... 2.0 NaN 3.0 4.0 5.0 1.0 3.0 NaN 2.0 1.0

1000 rows × 1000 columns

Cluster data

from sklearn.cluster import (KMeans, SpectralClustering,
                             AgglomerativeClustering, DBSCAN, OPTICS,
                             cluster_optics_dbscan, Birch)

# Two K-means estimators: 15 clusters for the user axis, 20 for the item axis.
model1 = KMeans(n_clusters=15)
model2 = KMeans(n_clusters=20)

# Cluster users with the first estimator and items with the second. Each call
# returns three values, stored under the matching _u / _i suffixed names.
x_u, y_u, df_u = re.cluster_users(model1)
x_i, y_i, df_i = re.cluster_items(model2)

Generate new utility matrix based on clusters

# Build the cluster-level utility matrix; the displayed result is indexed by
# u_cluster (rows) and i_cluster (columns).
# Running this overwrites the original utility matrix: re.utility_matrix
# shows the same cluster-level table when it is displayed further down.
Uc_df = re.utility_matrix_agg()
Uc_df
i_cluster 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
u_cluster
0 2.459586 2.481728 2.564732 2.470130 2.536458 2.578073 2.524068 2.572545 2.442857 2.510445 2.545788 2.487155 2.446429 2.475377 2.557738 2.484524 2.554825 2.430019 2.555314 2.518707
1 2.500000 2.507558 2.525000 2.506061 2.488542 2.527132 2.555072 2.523958 2.441333 2.530818 2.499145 2.557018 2.528169 2.490683 2.646111 2.494815 2.439474 2.545495 2.464634 2.428571
2 2.433293 2.496349 2.452762 2.513531 2.518169 2.505408 2.470172 2.569767 2.399070 2.502852 2.440370 2.495104 2.552571 2.503972 2.544574 2.552972 2.469400 2.496543 2.489790 2.526578
3 2.568279 2.499686 2.589105 2.468796 2.570101 2.454431 2.534665 2.518581 2.505405 2.542325 2.509356 2.512802 2.528169 2.517962 2.538739 2.550150 2.511380 2.466764 2.512690 2.629987
4 2.411483 2.465513 2.500355 2.459711 2.457623 2.450317 2.639328 2.383523 2.505455 2.520583 2.511364 2.522329 2.480154 2.499859 2.484848 2.448232 2.560805 2.512285 2.497506 2.438853
5 2.541963 2.543055 2.515203 2.497297 2.502252 2.468887 2.491187 2.687500 2.551351 2.518103 2.429660 2.491228 2.529882 2.491858 2.447748 2.513514 2.534851 2.523009 2.520435 2.384813
6 2.456672 2.542929 2.479167 2.492195 2.523464 2.436105 2.465086 2.517992 2.533737 2.504955 2.518778 2.503456 2.518993 2.497522 2.476263 2.497755 2.544568 2.507917 2.492547 2.458393
7 2.406534 2.475742 2.579203 2.514420 2.538793 2.508821 2.500750 2.508621 2.340690 2.531555 2.459770 2.457955 2.511413 2.479439 2.364368 2.542146 2.480036 2.439888 2.526493 2.537767
8 2.560272 2.483121 2.601815 2.490909 2.476478 2.405101 2.677419 2.375000 2.412903 2.469872 2.391232 2.460668 2.517038 2.544781 2.444086 2.445878 2.579513 2.521360 2.406373 2.503840
9 2.433114 2.475048 2.561849 2.493182 2.433594 2.453973 2.483696 2.516927 2.479167 2.475236 2.602030 2.474415 2.466549 2.566253 2.493750 2.451852 2.461988 2.490991 2.504065 2.648810
10 2.547635 2.533559 2.546282 2.425316 2.529272 2.563438 2.472207 2.303006 2.459747 2.571053 2.525479 2.484344 2.492423 2.460964 2.543460 2.528833 2.546525 2.528909 2.509262 2.420133
11 2.632554 2.507967 2.461227 2.489226 2.537037 2.490095 2.512882 2.606481 2.495556 2.504542 2.485755 2.523717 2.454356 2.555095 2.459259 2.520988 2.507147 2.405405 2.511066 2.485891
12 2.397946 2.564095 2.464939 2.454102 2.474593 2.461146 2.388123 2.481707 2.622439 2.431201 2.520325 2.454429 2.407077 2.492047 2.435772 2.478049 2.521181 2.549769 2.516359 2.526132
13 2.483175 2.485322 2.566086 2.457824 2.450478 2.565002 2.467569 2.592213 2.495082 2.500773 2.618327 2.551913 2.502424 2.550046 2.618033 2.516211 2.505321 2.522375 2.500800 2.620609
14 2.640653 2.481155 2.380388 2.559875 2.426724 2.416199 2.541229 2.303879 2.628966 2.491217 2.506631 2.552329 2.386595 2.514885 2.474713 2.482759 2.529341 2.547996 2.553406 2.509031

Train iterative model using train_model_iterative_cluster

Create the model object (a regressor imported from scikit-learn)

from sklearn.ensemble import RandomForestRegressor
# Random forest regressor that is passed to the training calls below; the
# random_state is pinned to the same constant used for the NumPy seed.
rs_model1 = RandomForestRegressor(random_state=202109)

Train model

# Show the matrix currently stored on the model; at this point it is the
# cluster-level (u_cluster x i_cluster) table, not the per-user one.
re.utility_matrix
i_cluster 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
u_cluster
0 2.459586 2.481728 2.564732 2.470130 2.536458 2.578073 2.524068 2.572545 2.442857 2.510445 2.545788 2.487155 2.446429 2.475377 2.557738 2.484524 2.554825 2.430019 2.555314 2.518707
1 2.500000 2.507558 2.525000 2.506061 2.488542 2.527132 2.555072 2.523958 2.441333 2.530818 2.499145 2.557018 2.528169 2.490683 2.646111 2.494815 2.439474 2.545495 2.464634 2.428571
2 2.433293 2.496349 2.452762 2.513531 2.518169 2.505408 2.470172 2.569767 2.399070 2.502852 2.440370 2.495104 2.552571 2.503972 2.544574 2.552972 2.469400 2.496543 2.489790 2.526578
3 2.568279 2.499686 2.589105 2.468796 2.570101 2.454431 2.534665 2.518581 2.505405 2.542325 2.509356 2.512802 2.528169 2.517962 2.538739 2.550150 2.511380 2.466764 2.512690 2.629987
4 2.411483 2.465513 2.500355 2.459711 2.457623 2.450317 2.639328 2.383523 2.505455 2.520583 2.511364 2.522329 2.480154 2.499859 2.484848 2.448232 2.560805 2.512285 2.497506 2.438853
5 2.541963 2.543055 2.515203 2.497297 2.502252 2.468887 2.491187 2.687500 2.551351 2.518103 2.429660 2.491228 2.529882 2.491858 2.447748 2.513514 2.534851 2.523009 2.520435 2.384813
6 2.456672 2.542929 2.479167 2.492195 2.523464 2.436105 2.465086 2.517992 2.533737 2.504955 2.518778 2.503456 2.518993 2.497522 2.476263 2.497755 2.544568 2.507917 2.492547 2.458393
7 2.406534 2.475742 2.579203 2.514420 2.538793 2.508821 2.500750 2.508621 2.340690 2.531555 2.459770 2.457955 2.511413 2.479439 2.364368 2.542146 2.480036 2.439888 2.526493 2.537767
8 2.560272 2.483121 2.601815 2.490909 2.476478 2.405101 2.677419 2.375000 2.412903 2.469872 2.391232 2.460668 2.517038 2.544781 2.444086 2.445878 2.579513 2.521360 2.406373 2.503840
9 2.433114 2.475048 2.561849 2.493182 2.433594 2.453973 2.483696 2.516927 2.479167 2.475236 2.602030 2.474415 2.466549 2.566253 2.493750 2.451852 2.461988 2.490991 2.504065 2.648810
10 2.547635 2.533559 2.546282 2.425316 2.529272 2.563438 2.472207 2.303006 2.459747 2.571053 2.525479 2.484344 2.492423 2.460964 2.543460 2.528833 2.546525 2.528909 2.509262 2.420133
11 2.632554 2.507967 2.461227 2.489226 2.537037 2.490095 2.512882 2.606481 2.495556 2.504542 2.485755 2.523717 2.454356 2.555095 2.459259 2.520988 2.507147 2.405405 2.511066 2.485891
12 2.397946 2.564095 2.464939 2.454102 2.474593 2.461146 2.388123 2.481707 2.622439 2.431201 2.520325 2.454429 2.407077 2.492047 2.435772 2.478049 2.521181 2.549769 2.516359 2.526132
13 2.483175 2.485322 2.566086 2.457824 2.450478 2.565002 2.467569 2.592213 2.495082 2.500773 2.618327 2.551913 2.502424 2.550046 2.618033 2.516211 2.505321 2.522375 2.500800 2.620609
14 2.640653 2.481155 2.380388 2.559875 2.426724 2.416199 2.541229 2.303879 2.628966 2.491217 2.506631 2.552329 2.386595 2.514885 2.474713 2.482759 2.529341 2.547996 2.553406 2.509031
%%time
# Call the iterative cluster trainer directly, passing the cluster-level
# utility matrix and the random forest, and keep the matrix it returns.
utility_matrix_imputed = re.train_model_iterative_cluster(
    re.utility_matrix, rs_model1)
CPU times: user 3min 35s, sys: 1.55 s, total: 3min 36s
Wall time: 3min 36s

Prediction

# Display the matrix returned by the trainer.
# NOTE(review): the printed values sit close to zero rather than on the 1-5
# rating scale, so they look like mean-centred scores -- confirm against the
# resype implementation before interpreting them as ratings.
utility_matrix_imputed
i_cluster 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
u_cluster
0 -0.039038 -0.022122 0.045300 -0.036716 0.019990 0.044360 0.011957 0.042639 -0.052608 0.004387 0.029200 -0.014744 -0.051624 -0.023198 0.040038 -0.016478 0.034931 -0.063235 0.039611 0.012240
1 -0.007077 -0.003058 0.016287 -0.007953 -0.015383 0.010219 0.033615 0.009101 -0.053635 0.017727 -0.010577 0.034276 0.012579 -0.012927 0.117389 -0.010746 -0.055454 0.025589 -0.032478 -0.067923
2 -0.048111 0.000198 -0.027908 0.011536 0.017701 0.004306 -0.019295 0.064164 -0.078891 0.006888 -0.042252 -0.001617 0.042408 0.006379 0.037838 0.042446 -0.019840 -0.003636 -0.006375 0.021758
3 0.026788 -0.022419 0.053440 -0.048951 0.034686 -0.054147 0.006116 -0.002173 -0.022897 0.015019 -0.009705 -0.011122 -0.000280 -0.005460 0.009860 0.016758 -0.010572 -0.051794 -0.008496 0.078387
4 -0.052601 -0.014234 0.012786 -0.024935 -0.024088 -0.035817 0.121849 -0.085713 0.012799 0.027142 0.021823 0.026909 -0.009737 0.010896 -0.003600 -0.033763 0.064394 0.020455 0.006621 -0.041351
5 0.024410 0.028842 0.006173 -0.012920 -0.004719 -0.036184 -0.015279 0.145937 0.031671 0.008260 -0.065516 -0.015894 0.013434 -0.012665 -0.047536 0.003986 0.022734 0.008966 0.006743 -0.100565
6 -0.031623 0.033249 -0.013314 -0.007262 0.019462 -0.054230 -0.024824 0.017850 0.027609 0.007455 0.011738 0.003573 0.010688 0.000969 -0.017861 0.000470 0.035631 0.007821 -0.002139 -0.033589
7 -0.057189 -0.008005 0.076034 0.019468 0.041875 0.012856 0.015624 0.025557 -0.117945 0.036005 -0.020845 -0.020936 0.016381 -0.001789 -0.082199 0.044610 -0.003032 -0.040903 0.032167 0.041692
8 0.060031 -0.002638 0.086681 0.000781 -0.011971 -0.071955 0.151291 -0.091279 -0.058280 -0.011058 -0.072192 -0.017735 0.022200 0.046403 -0.031919 -0.031753 0.070165 0.023076 -0.069526 0.012452
9 -0.053259 -0.017612 0.052396 -0.009449 -0.052178 -0.038003 -0.002272 0.014980 -0.017043 -0.017520 0.085133 -0.014214 -0.026713 0.052898 -0.003127 -0.038636 -0.025355 -0.006227 0.004296 0.121400
10 0.035220 0.026184 0.040166 -0.062073 0.026685 0.036351 -0.015575 -0.154792 -0.031215 0.059267 0.020816 -0.010240 -0.006108 -0.030463 0.036381 0.022484 0.037949 0.013978 0.007617 -0.059875
11 0.088317 0.000967 -0.029412 -0.016902 0.025702 -0.019834 0.004387 0.079266 -0.010484 -0.000263 -0.015513 0.012812 -0.040942 0.037483 -0.039904 0.012268 0.003010 -0.084412 0.004226 -0.015135
12 -0.064228 0.066135 -0.009591 -0.023483 -0.008685 -0.024945 -0.071904 0.000257 0.121190 -0.041886 0.029853 -0.023077 -0.062418 0.010695 -0.038319 -0.005437 0.033066 0.049959 0.027057 0.033350
13 -0.039317 -0.032254 0.033219 -0.061696 -0.055319 0.028940 -0.048530 0.056392 -0.032667 -0.021616 0.079176 0.016763 -0.024667 0.019457 0.073731 -0.011877 -0.018808 -0.009121 -0.020752 0.070925
14 0.112393 -0.011318 -0.075956 0.045967 -0.051294 -0.074172 0.036376 -0.158417 0.109504 -0.004869 0.007857 0.044951 -0.089166 0.016220 -0.020207 -0.014569 0.028038 0.041441 0.045100 0.006899

Train iterative model using fit

Train model

# Alternative entry point: train through fit(), passing the same regressor
# and selecting the 'iterative' method by name.
re.fit(rs_model1, method='iterative')

Prediction

# Display the predictions attribute of the model (presumably populated by the
# fit() call above -- confirm in resype); it has the same
# u_cluster x i_cluster layout as the cluster-level utility matrix.
re.utility_matrix_preds
i_cluster 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
u_cluster
0 -0.049105 -0.020916 0.056040 -0.038562 0.027767 0.069381 0.015377 0.063853 -0.056062 0.001142 0.033964 -0.015928 -0.062263 -0.021516 0.048859 -0.024168 0.022215 -0.078672 0.028942 0.010016
1 -0.008513 -0.003207 0.014235 -0.004705 -0.022224 0.016366 0.027618 0.013193 -0.069432 0.020052 -0.011072 0.035971 0.017404 -0.009562 0.135346 -0.015950 -0.071292 0.034730 -0.036040 -0.066478
2 -0.049866 0.004650 -0.045232 0.015537 0.020175 0.007415 -0.018152 0.069430 -0.079609 0.008455 -0.057624 -0.002889 0.054578 0.005979 0.046580 0.054978 -0.028593 -0.001450 -0.006199 0.021468
3 0.026739 -0.025368 0.054310 -0.056257 0.045048 -0.070622 0.009612 -0.006472 -0.019648 0.017272 -0.008279 -0.012251 -0.000887 -0.007091 0.002625 0.018397 -0.009745 -0.058289 -0.008653 0.104934
4 -0.026869 -0.022401 0.019945 -0.028203 -0.030291 -0.037597 0.151414 -0.104391 0.011025 0.018639 0.023140 0.028802 -0.008212 0.011735 -0.002689 -0.033195 0.072892 0.024371 0.009592 -0.049061
5 0.007887 0.031554 0.003702 -0.011881 -0.009248 -0.036155 -0.020314 0.175999 0.039851 0.006602 -0.081840 -0.020273 0.018381 -0.010763 -0.051397 0.002013 0.023350 0.011509 0.008934 -0.099627
6 -0.031917 0.034755 -0.019988 -0.006960 0.024309 -0.052428 -0.034069 0.018838 0.034583 0.005801 0.006499 0.004301 0.019838 0.001013 -0.019262 -0.001399 0.045414 0.008762 -0.006607 -0.040761
7 -0.083518 -0.012468 0.089151 0.024368 0.048741 0.018769 0.011622 0.016399 -0.123056 0.032265 -0.030282 -0.032097 0.021361 -0.010613 -0.018579 0.052094 -0.010016 -0.043031 0.036441 0.034407
8 0.069397 -0.007754 0.110940 -0.002517 -0.017619 -0.085774 0.161858 -0.091702 -0.054989 -0.021003 -0.099643 -0.022135 0.026163 0.053906 -0.033030 -0.036014 0.088638 0.018683 -0.046843 0.012965
9 -0.064271 -0.022337 0.064464 -0.015816 -0.063792 -0.041631 -0.013690 0.019542 -0.009724 -0.014233 0.104645 -0.022970 -0.030836 0.056294 -0.006586 -0.045533 -0.035397 -0.006394 0.006680 0.119648
10 0.015086 0.033412 0.036267 -0.072428 0.031528 0.065694 -0.009944 -0.194738 -0.037997 0.059090 0.027734 -0.013400 -0.005321 -0.022760 0.045716 0.025255 0.048780 0.031164 0.011518 -0.077612
11 0.093381 0.002512 -0.034227 -0.015722 0.031581 -0.015361 0.007427 0.066311 -0.009900 -0.000913 -0.019701 0.014123 -0.051100 0.039053 -0.038617 0.015532 0.006834 -0.100050 0.005610 -0.019565
12 -0.085716 0.080433 -0.010337 -0.023645 -0.008587 -0.022760 -0.071804 -0.001955 0.138777 -0.038864 0.035142 -0.029233 -0.061797 0.008385 -0.047890 -0.005613 0.037519 0.052100 0.022397 0.042470
13 -0.039563 -0.028122 0.035565 -0.071662 -0.079008 0.035516 -0.061917 0.062727 -0.027841 -0.028713 0.088841 0.022427 -0.027076 0.012156 0.088547 -0.014347 -0.024165 -0.007111 -0.020267 0.074330
14 0.110571 -0.003885 -0.082236 0.065300 -0.055131 -0.078376 0.045332 -0.190696 0.105328 -0.003358 0.013032 0.057754 -0.107980 0.020310 -0.016107 -0.012720 0.026770 0.053421 0.058831 0.003484