Item Clustering
# Settings shared by the item-clustering code below.
drop_cols = ["userId"]    # default columns i_cluster drops before clustering
state = 1337              # integer constant; looks like a random seed (TODO confirm where it is consumed)
fname = "user_movie.csv"  # CSV file that i_cluster reads when given a string
from sklearn.cluster import (KMeans, SpectralClustering,
AgglomerativeClustering, DBSCAN, OPTICS,
cluster_optics_dbscan, Birch)
# Seed the estimator with the module-level `state` so that repeated runs
# produce the same cluster assignments.
model2 = KMeans(n_clusters=35, random_state=state)
def i_cluster(fname, model_fname, model, drop_cols=drop_cols):
    """
    Perform item-wise clustering and assign each item to a cluster of
    similar items based on the ratings the users gave it.

    The utility matrix is transposed so that every row describes one item,
    the model is fitted on those rows, and the fitted model is pickled to
    ``model_fname``.

    Parameters
    ----------
    fname : pandas DataFrame, string or path-like
        The initial utility matrix with each row corresponding
        to a user and the columns as their rating of each item.
        A string or path-like value is read with ``pandas.read_csv``.
    model_fname : string
        The name of the model's pickle file when saved.
    model : an sklearn model object
        An object with a fit_predict method. Used to cluster the
        items into groups that were rated similarly by the users.
    drop_cols : list or None
        Columns to be dropped in fname before clustering. Pass None to
        keep every column.

    Returns
    -------
    model : an sklearn model object
        The same object passed in as ``model``, after ``fit_predict`` has
        been called on the items from fname
    result : dict
        A mapping of each item's cluster with the keys being the
        item_id and the values their cluster membership
    df_items : pandas DataFrame
        Transposed utility matrix derived from fname (one row per item)
        with the final column, ``i_cluster``, corresponding to the
        cluster membership of that item
    """
    import os
    import pickle

    import pandas as pd

    # A path (plain string or os.PathLike) is loaded from disk; anything
    # else is used as-is and is expected to behave like a DataFrame.
    if isinstance(fname, (str, os.PathLike)):
        df = pd.read_csv(fname)
    else:
        df = fname

    if drop_cols is not None:
        # drop() returns a new frame, so a caller-supplied DataFrame is
        # not modified.
        df = df.drop(columns=drop_cols)

    # Transpose: rows become items, columns become the users' ratings.
    df_items = df.T

    # The labels are attached only after fitting, so the cluster column is
    # never used as a feature.
    i_predict = model.fit_predict(df_items)
    df_items['i_cluster'] = i_predict
    result = dict(df_items['i_cluster'])

    with open(model_fname, 'wb') as f:
        pickle.dump(model, f)

    return model, result, df_items
# Cluster the items of `fname` with `model2`; i_cluster also pickles the
# model to 'i_cluster.pkl'.
x_i, y_i, df_items = i_cluster(
    fname,
    model_fname='i_cluster.pkl',
    model=model2,
)
Unit Test
import unittest
import pandas as pd
import os
class TestGetRec(unittest.TestCase):
    """Checks for the item-clustering helper ``i_cluster``."""

    def test_i_c(self):
        """i_cluster labels every item, uses all clusters and saves the model."""
        # Imported here because names bound in a class body are not visible
        # from inside its methods.
        import tempfile
        from sklearn.cluster import KMeans

        n_clusters = 35
        # Seeded so the test result does not depend on random initialisation.
        model2 = KMeans(n_clusters=n_clusters, random_state=state)

        # Write the pickle into a fresh directory: a file left over from an
        # earlier run would otherwise make the existence check pass
        # regardless of what i_cluster does.
        with tempfile.TemporaryDirectory() as tmp_dir:
            model_path = os.path.join(tmp_dir, 'i_cluster.pkl')
            _, _, df_items = i_cluster(fname, model_path, model2)
            self.assertTrue(os.path.exists(model_path))

        self.assertEqual(df_items['i_cluster'].nunique(), n_clusters)

        # One row per item, i.e. per column left after dropping drop_cols.
        expected_items = pd.read_csv(fname).drop(columns=drop_cols).columns
        self.assertEqual(len(df_items), len(expected_items))
# Run the tests in-process: the dummy argv makes unittest ignore the real
# command-line arguments, and exit=False keeps it from calling sys.exit().
unittest.main(
    argv=[''],
    exit=False,
    verbosity=2,
)
test_i_c (__main__.TestGetRec) ... ok
----------------------------------------------------------------------
Ran 1 test in 22.761s
OK
<unittest.main.TestProgram at 0x7f4b8fa714f0>