User Clustering

# Path of the CSV holding the user-by-movie utility matrix.
fname = "user_movie.csv"
# Presumably a random seed; it is not referenced in the visible code
# (TODO confirm where it is meant to be consumed).
state = 1337
# Columns removed from the matrix before it is handed to the clusterer.
drop_cols = ["userId"]
from sklearn.cluster import (KMeans, SpectralClustering,
                             AgglomerativeClustering, DBSCAN, OPTICS,
                             cluster_optics_dbscan, Birch)

# Seed the clusterer with the notebook-wide `state` so that repeated runs
# produce the same cluster assignments.
model1 = KMeans(n_clusters=20, random_state=state)
def u_cluster(fname, model_fname, model, drop_cols=drop_cols):
    """
    Perform user-wise clustering and assign each user to a cluster.

    Parameters
    ----------
    fname        : pandas DataFrame or string
                   The initial utility matrix with each row corresponding
                   to a user and the columns as their rating of each item.
                   A string is read with ``pandas.read_csv``; a DataFrame
                   is used as given and is never modified.

    model_fname  : string
                   Path of the pickle file the fitted model is written to.

    model        : an sklearn model object
                   An object with a fit_predict method. Used to cluster the
                   users into groups with similar ratings of items.

    drop_cols    : list or None
                   Columns to be dropped from fname before fitting. Pass
                   None to keep every column.

    Returns
    -------
    model         : an sklearn model object
                    The same object passed in as ``model``, after
                    ``fit_predict`` has been called on it.

    result        : dict
                    A mapping of each row's cluster, keyed by the index
                    label of that row in ``df`` (NOT by a user id column:
                    with a freshly read CSV the keys are row positions).

    df            : pandas DataFrame
                    Utility matrix derived from fname (minus ``drop_cols``)
                    with the final column, ``'u_cluster'``, holding the
                    cluster membership of that row.
    """
    import pickle

    import pandas as pd

    df = pd.read_csv(fname) if isinstance(fname, str) else fname

    if drop_cols is not None:
        # drop() returns a new frame, so the caller's data stays untouched.
        df = df.drop(columns=drop_cols)
    elif df is fname:
        # Nothing was dropped and the caller handed us their own DataFrame:
        # copy it so the 'u_cluster' column added below does not leak back.
        df = df.copy()

    df['u_cluster'] = model.fit_predict(df)
    result = dict(df['u_cluster'])

    # Persist the fitted model so it can be reloaded without refitting.
    with open(model_fname, 'wb') as f:
        pickle.dump(model, f)
    return model, result, df
# Cluster the users in `fname` with the KMeans model defined above; the
# fitted model is also pickled to 'u_cluster.pkl'.
x_u, y_u, df = u_cluster(fname, model_fname='u_cluster.pkl', model=model1)

Unit Test

import unittest
import pandas as pd
import os


class TestGetRec(unittest.TestCase):
    """Smoke test for ``u_cluster`` run against the CSV named by ``fname``."""

    def test_u_c(self):
        """Check cluster count, row count, return values and pickle output."""
        # Imported inside the method so the names used below are really in
        # scope; an import in the class body only creates a class attribute.
        import tempfile
        from sklearn.cluster import KMeans

        n_clusters = 20
        # Seeded so the test outcome does not depend on random initialisation.
        model1 = KMeans(n_clusters=n_clusters, random_state=state)

        # Write the pickle into a throwaway directory: this keeps the test
        # from overwriting a real 'u_cluster.pkl' in the working directory
        # and makes the existence check meaningful (the file cannot already
        # be there from an earlier run).
        with tempfile.TemporaryDirectory() as tmp_dir:
            model_path = os.path.join(tmp_dir, 'u_cluster.pkl')
            fitted, mapping, df = u_cluster(fname, model_path, model1)
            self.assertTrue(os.path.exists(model_path))

        self.assertIs(fitted, model1)
        self.assertEqual(len(mapping), len(df))
        self.assertEqual(df['u_cluster'].nunique(), n_clusters)
        self.assertEqual(len(df), len(pd.read_csv(fname)))
        
# argv=[''] stops unittest from parsing the host process's command-line
# arguments; exit=False stops it from calling sys.exit() once the run ends.
unittest.main(argv=[''], exit=False, verbosity=2)
test_u_c (__main__.TestGetRec) ... ok

----------------------------------------------------------------------
Ran 1 test in 13.403s

OK
<unittest.main.TestProgram at 0x7f608f14a610>