Get Item Recommendations from Clusters
import numpy as np
import pandas as pd
from random_gen import *
from get_rec import *
def get_rec_item(df_rec, top_k, ic_assignment):
    """Returns the top K item recommendations for each user in the user list.

    Item clusters are consumed in ranked order. While a cluster holds fewer
    not-yet-recommended items than are still needed, all of them are taken
    (in index order) and the next ranked cluster is tried. The first cluster
    that can cover the remainder is sampled at random without replacement.
    If the ranked clusters run out first, the remainder is sampled at random
    from every item that has not been recommended yet.

    Sampling uses the global NumPy random state (``np.random.choice``).

    Parameters:
        df_rec (pandas.DataFrame): Table containing the top N item cluster
            recommendations for each user in the user list. Must have a
            'user_id' column; column 0 is skipped and columns 1.. are read
            as the ranked cluster ids.
        top_k (int): Number of items to recommend per user
        ic_assignment (array-like): Cluster assignment of each item. A pandas
            Series/DataFrame is used as-is and its index supplies the item
            ids; any other array-like is wrapped in a Series so that item
            ids are the positions 0..n-1.

    Returns:
        df_rec_item (pandas.DataFrame): Table containing the top K item
            recommendations for each user in the user list, with columns
            'user_id', 'rank_1', ..., 'rank_<top_k>'.

    Raises:
        ValueError: If top_k is larger than the number of distinct items.
    """
    # Plain lists / ndarrays have no .index; give them a positional one.
    if not isinstance(ic_assignment, (pd.Series, pd.DataFrame)):
        ic_assignment = pd.Series(np.asarray(ic_assignment).ravel())
    item_index = ic_assignment.index

    # Fail early with a readable message instead of deep inside
    # np.random.choice ("Cannot take a larger sample than population").
    n_items = len(set(item_index))
    if top_k > n_items:
        raise ValueError(
            'top_k ({}) exceeds the number of available items ({})'.format(top_k, n_items)
        )

    # Create recommendation table (rank columns start as float zeros, so the
    # item ids written below are stored as floats).
    df_rec_item = pd.DataFrame()
    df_rec_item['user_id'] = df_rec['user_id']
    for i in range(top_k):
        df_rec_item['rank_' + str(i + 1)] = np.zeros(df_rec_item.shape[0])

    n_rec_cols = df_rec.shape[1]

    # Get items
    for j in range(df_rec_item.shape[0]):
        item_rec = []
        rank = 0
        while len(item_rec) < top_k:
            n_missing = top_k - len(item_rec)

            # Ranked clusters exhausted: draw the rest from all unused items.
            if rank + 1 >= n_rec_cols:
                leftovers = list(set(item_index) - set(item_rec))
                item_rec = item_rec + list(
                    np.random.choice(leftovers, size=n_missing, replace=False)
                )
                break

            cluster = df_rec.iloc[j, rank + 1]
            candidates = item_index[np.where(ic_assignment == cluster)[0]]
            # Guard against the same cluster being ranked twice for a user:
            # never offer an item that has already been recommended.
            if item_rec:
                candidates = candidates[~candidates.isin(item_rec)]

            if n_missing > len(candidates):
                # Cluster too small to finish the list: take it whole, move on.
                item_rec = item_rec + list(candidates)
                rank += 1
            else:
                # Cluster can cover the remainder: sample and finish.
                item_rec = item_rec + list(
                    np.random.choice(candidates, size=n_missing, replace=False)
                )
        df_rec_item.iloc[j, 1:] = item_rec

    return df_rec_item
Example
# Example configuration fed to the synthetic-data helpers that the wildcard
# imports at the top of the file bring into scope.
n_user = 100
n_item = 50
sample_size = 10
n_user_cluster = 5
n_item_cluster = 5
top_n = 3  # passed to get_rec below; the displayed df_rec has rank_1..rank_3
random_seed = 1
# NOTE(review): user_id_list is not read by any later statement visible here.
user_id_list = list(range(n_user))
user_list = random_user_list(n_user, sample_size, random_seed)
uc_assignment = random_user_cluster(n_user, n_user_cluster, random_seed)
utility_matrix_o, utility_matrix = random_utility_matrix(n_user_cluster, n_item_cluster, random_seed)
df_rec = get_rec(utility_matrix, utility_matrix_o, user_list, top_n, uc_assignment)
# Bare expression: shows df_rec as the cell output.
df_rec
| | user_id | rank_1 | rank_2 | rank_3 |
|---|---|---|---|---|
| 0 | 80 | 1.0 | 2.0 | 4.0 |
| 1 | 84 | 3.0 | 4.0 | 1.0 |
| 2 | 33 | 3.0 | 4.0 | 1.0 |
| 3 | 81 | 0.0 | 3.0 | 1.0 |
| 4 | 93 | 3.0 | 1.0 | 0.0 |
| 5 | 17 | 0.0 | 3.0 | 1.0 |
| 6 | 36 | 3.0 | 1.0 | 0.0 |
| 7 | 82 | 1.0 | 2.0 | 4.0 |
| 8 | 69 | 4.0 | 2.0 | 0.0 |
| 9 | 65 | 3.0 | 1.0 | 0.0 |
# Item-cluster assignment for the example: random_user_cluster is reused with
# the item counts and its own seed.
ic_assignment = random_user_cluster(n_item, n_item_cluster, random_seed=2)
# Bare expression: shows ic_assignment as the cell output.
ic_assignment
array([0, 0, 3, 2, 3, 0, 2, 1, 3, 2, 4, 4, 4, 3, 4, 2, 3, 3, 2, 1, 2, 4,
3, 0, 4, 3, 1, 2, 0, 4, 4, 2, 4, 2, 1, 0, 2, 2, 1, 0, 1, 0, 2, 1,
1, 1, 4, 2, 3, 0])
top_k = 10  # number of items to recommend per user
# The array is wrapped in a DataFrame so that it carries an index, which
# get_rec_item uses as the item ids (the default index is 0..n_item-1).
df_rec_item = get_rec_item(df_rec, top_k, pd.DataFrame(ic_assignment))
# Bare expression: shows df_rec_item as the cell output.
df_rec_item
| | user_id | rank_1 | rank_2 | rank_3 | rank_4 | rank_5 | rank_6 | rank_7 | rank_8 | rank_9 | rank_10 |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 80 | 7.0 | 19.0 | 26.0 | 34.0 | 38.0 | 40.0 | 43.0 | 44.0 | 45.0 | 15.0 |
| 1 | 84 | 2.0 | 4.0 | 8.0 | 13.0 | 16.0 | 17.0 | 22.0 | 25.0 | 48.0 | 32.0 |
| 2 | 33 | 2.0 | 4.0 | 8.0 | 13.0 | 16.0 | 17.0 | 22.0 | 25.0 | 48.0 | 21.0 |
| 3 | 81 | 0.0 | 1.0 | 5.0 | 23.0 | 28.0 | 35.0 | 39.0 | 41.0 | 49.0 | 48.0 |
| 4 | 93 | 2.0 | 4.0 | 8.0 | 13.0 | 16.0 | 17.0 | 22.0 | 25.0 | 48.0 | 38.0 |
| 5 | 17 | 0.0 | 1.0 | 5.0 | 23.0 | 28.0 | 35.0 | 39.0 | 41.0 | 49.0 | 2.0 |
| 6 | 36 | 2.0 | 4.0 | 8.0 | 13.0 | 16.0 | 17.0 | 22.0 | 25.0 | 48.0 | 43.0 |
| 7 | 82 | 7.0 | 19.0 | 26.0 | 34.0 | 38.0 | 40.0 | 43.0 | 44.0 | 45.0 | 47.0 |
| 8 | 69 | 21.0 | 46.0 | 29.0 | 24.0 | 32.0 | 10.0 | 14.0 | 30.0 | 11.0 | 12.0 |
| 9 | 65 | 2.0 | 4.0 | 8.0 | 13.0 | 16.0 | 17.0 | 22.0 | 25.0 | 48.0 | 43.0 |
Unit Test
import unittest
class TestGetRecItem(unittest.TestCase):
    """Tests for get_rec_item on synthetic data built with the random_* helpers."""

    def _run_pipeline(self, random_seed, item_seed, top_k):
        """Build the synthetic inputs and run get_rec_item on them.

        The helper calls are made in a fixed order so that any global random
        state they touch evolves the same way on every run.

        Returns:
            (df_rec, ic_assignment, df_rec_item)
        """
        n_user = 100
        n_item = 50
        sample_size = 10
        n_user_cluster = 5
        n_item_cluster = 5
        top_n = 3
        user_list = random_user_list(n_user, sample_size, random_seed)
        uc_assignment = random_user_cluster(n_user, n_user_cluster, random_seed)
        utility_matrix_o, utility_matrix = random_utility_matrix(n_user_cluster, n_item_cluster, random_seed)
        df_rec = get_rec(utility_matrix, utility_matrix_o, user_list, top_n, uc_assignment)
        ic_assignment = random_user_cluster(n_item, n_item_cluster, random_seed=item_seed)
        df_rec_item = get_rec_item(df_rec, top_k, pd.DataFrame(ic_assignment))
        return df_rec, ic_assignment, df_rec_item

    def test_1(self):
        """Regression test: exact output for random_seed=1 / item seed 2."""
        _, _, df_rec_item = self._run_pipeline(random_seed=1, item_seed=2, top_k=10)
        test_case = np.array([
            [80., 7., 19., 26., 34., 38., 40., 43., 44., 45., 15.],
            [84., 2., 4., 8., 13., 16., 17., 22., 25., 48., 32.],
            [33., 2., 4., 8., 13., 16., 17., 22., 25., 48., 21.],
            [81., 0., 1., 5., 23., 28., 35., 39., 41., 49., 48.],
            [93., 2., 4., 8., 13., 16., 17., 22., 25., 48., 38.],
            [17., 0., 1., 5., 23., 28., 35., 39., 41., 49., 2.],
            [36., 2., 4., 8., 13., 16., 17., 22., 25., 48., 43.],
            [82., 7., 19., 26., 34., 38., 40., 43., 44., 45., 47.],
            [69., 21., 46., 29., 24., 32., 10., 14., 30., 11., 12.],
            [65., 2., 4., 8., 13., 16., 17., 22., 25., 48., 43.],
        ])
        self.assertEqual(df_rec_item.to_numpy().tolist(), test_case.tolist())

    def test_2(self):
        """Structural test for random_seed=2 / item seed 3.

        The previous version compared against the expected matrix of test_1,
        which was generated with different seeds and therefore could never
        match. Instead of pinning seed-dependent values, this checks the
        properties every valid result must have.
        """
        top_k = 10
        df_rec, ic_assignment, df_rec_item = self._run_pipeline(
            random_seed=2, item_seed=3, top_k=top_k
        )
        # assumes random_user_cluster returns a 1-D array-like of cluster ids
        clusters = np.asarray(ic_assignment)
        n_item = len(clusters)

        expected_columns = ['user_id'] + ['rank_' + str(i + 1) for i in range(top_k)]
        self.assertEqual(list(df_rec_item.columns), expected_columns)
        self.assertEqual(df_rec_item.shape, (df_rec.shape[0], top_k + 1))
        self.assertEqual(df_rec_item['user_id'].tolist(), df_rec['user_id'].tolist())

        for j in range(df_rec_item.shape[0]):
            items = df_rec_item.iloc[j, 1:].tolist()
            # Every recommendation is a distinct, valid item id.
            self.assertEqual(len(set(items)), top_k)
            self.assertTrue(all(0 <= item < n_item for item in items))
            # The top-ranked cluster is used first: either all of its items
            # lead the list, or the whole list comes from it.
            top_cluster_items = set(np.where(clusters == df_rec.iloc[j, 1])[0].tolist())
            n_from_top = min(top_k, len(top_cluster_items))
            self.assertTrue(set(items[:n_from_top]) <= top_cluster_items)
# argv=[''] stops unittest from parsing the host process's command-line
# arguments, and exit=False keeps it from calling sys.exit() afterwards, so
# the tests can be run from inside a notebook cell.
unittest.main(argv=[''], verbosity=2, exit=False)
test_1 (__main__.TestGetRecItem) ... ok
test_2 (__main__.TestGetRecItem) ... FAIL
======================================================================
FAIL: test_2 (__main__.TestGetRecItem)
----------------------------------------------------------------------
Traceback (most recent call last):
File "<ipython-input-8-280d80a78120>", line 77, in test_2
self.assertEqual(df_rec_item.to_numpy().tolist(), test_case.tolist())
AssertionError: Lists differ: [[83.0, 21.0, 8.0, 28.0, 0.0, 19.0, 17.0, 12[617 chars]2.0]] != [[80.0, 7.0, 19.0, 26.0, 34.0, 38.0, 40.0, 4[607 chars]3.0]]
First differing element 0:
[83.0, 21.0, 8.0, 28.0, 0.0, 19.0, 17.0, 12.0, 25.0, 43.0, 20.0]
[80.0, 7.0, 19.0, 26.0, 34.0, 38.0, 40.0, 43.0, 44.0, 45.0, 15.0]
Diff is 1513 characters long. Set self.maxDiff to None to see it.
----------------------------------------------------------------------
Ran 2 tests in 0.088s
FAILED (failures=1)
<unittest.main.TestProgram at 0x1c8aa1c0a90>