# User Clustering
from sklearn.cluster import (KMeans, SpectralClustering,
                             AgglomerativeClustering, DBSCAN, OPTICS,
                             cluster_optics_dbscan, Birch)

# Configuration for the user-clustering step.
fname = 'user_movie.csv'   # CSV file holding the user-by-item utility matrix
state = 1337               # random seed, so clustering runs are reproducible
drop_cols = ['userId']     # columns removed before the matrix is clustered

# The seed is passed explicitly: without random_state KMeans picks its
# initial centroids at random, so cluster labels would change between runs.
model1 = KMeans(n_clusters=20, random_state=state)
def u_cluster(fname, model_fname, model, drop_cols=drop_cols):
    """
    Perform user-wise clustering and assign each user to a cluster.

    The utility matrix is loaded (or taken as given), the columns named
    in ``drop_cols`` are removed, ``model.fit_predict`` is run on what
    remains, and the fitted model is pickled to ``model_fname``.

    Parameters
    ----------
    fname : pandas DataFrame or string
        The initial utility matrix with each row corresponding
        to a user and the columns as their rating of each item.
        A string is read with ``pandas.read_csv``. A DataFrame is used
        as given and is not modified by this function.
    model_fname : string
        The name of the model's pickle file when saved.
    model : an sklearn model object
        An object with a fit_predict method. Used to cluster the
        users into groups with similar ratings of items.
    drop_cols : list or None
        Columns to be dropped in fname before clustering. ``None``
        skips the drop step. The default is the module-level
        ``drop_cols`` object as bound when this function was defined.

    Returns
    -------
    model : an sklearn model object
        The object passed in as ``model``, after ``fit_predict`` has
        been called on it.
    result : dict
        A mapping from each row's index label in ``df`` to that row's
        cluster membership. NOTE(review): the keys are user ids only if
        the index of the utility matrix carries them; a dropped
        ``userId`` column is not used for the keys.
    df : pandas DataFrame
        Utility matrix derived from fname (after dropping
        ``drop_cols``) with the final column, ``'u_cluster'``,
        corresponding to the cluster membership of that user.
    """
    import pickle

    import pandas as pd

    if isinstance(fname, str):
        df = pd.read_csv(fname)
    elif drop_cols is None:
        # No drop() will happen below, so no new frame would be created;
        # copy so the 'u_cluster' column is not written into the
        # caller's DataFrame.
        df = fname.copy()
    else:
        df = fname

    if drop_cols is not None:
        # drop() returns a new DataFrame, leaving the input untouched.
        df = df.drop(columns=drop_cols)

    # Fit on the remaining columns first, then attach the labels, so the
    # label column itself is never part of the clustering input.
    df['u_cluster'] = model.fit_predict(df)
    result = dict(df['u_cluster'])

    with open(model_fname, 'wb') as f:
        pickle.dump(model, f)

    return model, result, df
# Cluster the users in the CSV utility matrix with model1.
# x_u is the fitted model, y_u the row-to-cluster dict, and df the matrix
# with its 'u_cluster' column; the fitted model is also pickled to disk.
x_u, y_u, df = u_cluster(fname, model_fname='u_cluster.pkl', model=model1)
# Unit Test
import unittest
import pandas as pd
import os
class TestGetRec(unittest.TestCase):
    """Checks u_cluster against the CSV utility matrix named by ``fname``."""

    def test_u_c(self):
        """u_cluster labels every row, uses every cluster and saves the model."""
        # Imported here so the names the test uses are resolved in the
        # method itself rather than through a class-body import.
        import tempfile
        from sklearn.cluster import KMeans

        n_clusters = 20
        model1 = KMeans(n_clusters=n_clusters)

        # Pickle into a fresh temporary directory: the existence check then
        # proves this call wrote the file, and a previously saved
        # 'u_cluster.pkl' in the working directory is not overwritten.
        with tempfile.TemporaryDirectory() as tmp_dir:
            model_path = os.path.join(tmp_dir, 'u_cluster.pkl')
            x_u, y_u, df = u_cluster(fname, model_path, model1)
            self.assertTrue(os.path.exists(model_path))

        # The same model object is handed back after fitting.
        self.assertIs(x_u, model1)
        # Every requested cluster received at least one user.
        self.assertEqual(df['u_cluster'].nunique(), n_clusters)
        # No rows were lost or added, and each row has a dict entry.
        self.assertEqual(len(df), len(pd.read_csv(fname)))
        self.assertEqual(len(y_u), len(df))
# Run the tests in-process: argv=[''] stops unittest from parsing the host
# process's own command line, and exit=False keeps it from calling
# sys.exit() when the run finishes.
unittest.main(exit=False, verbosity=2, argv=[''])
test_u_c (__main__.TestGetRec) ... ok
----------------------------------------------------------------------
Ran 1 test in 13.403s
OK
<unittest.main.TestProgram at 0x7f608f14a610>