Item Clustering

# Columns of the utility matrix that are not item ratings and are removed
# before clustering.
drop_cols = ['userId']
# CSV file read as the utility matrix (one row per user, one column per item).
fname = 'user_movie.csv'
# NOTE(review): presumably the random seed for the clustering models - confirm.
state = 1337
from sklearn.cluster import (KMeans, SpectralClustering,
                             AgglomerativeClustering, DBSCAN, OPTICS,
                             cluster_optics_dbscan, Birch)

# Item clusterer: 35 k-means clusters. Seeded with the module-level `state`
# so that repeated runs produce the same cluster assignments.
model2 = KMeans(n_clusters=35, random_state=state)
def i_cluster(fname, model_fname, model, drop_cols=drop_cols):
    """
    Perform item-wise clustering and assign each item to a cluster of similar
    items based on the ratings the users gave to it.

    The utility matrix is transposed so that every row is an item, the model
    is fitted on those rows, and the fitted model is pickled to model_fname.

    Parameters
    ---------
    fname        : pandas DataFrame, string or path-like
                   The initial utility matrix with each row corresponding
                   to a user and the columns as their rating of each item.
                   A string or path-like value is read with pandas.read_csv.

    model_fname  : string
                   The name of the model's pickle file when saved.

    model        : an sklearn model object
                   An object with a fit_predict method. Used to cluster the
                   items into groups that were rated similarly by the users.

    drop_cols    : list or None
                   Columns to be dropped in fname before clustering. Pass
                   None to keep every column.

    Returns
    -------
    model         : an sklearn model object
                    The fitted version of the model input used to predict the
                    clusters of items from fname

    result        : dict
                    A mapping of each item's cluster with the keys being the
                    item_id and the values their cluster membership

    df_items      : pandas DataFrame
                    Transposed utility matrix derived from fname (one row per
                    item) with the final column 'i_cluster' corresponding to
                    the cluster membership of that item
    """
    import os
    import pickle

    import pandas as pd

    # Accept either a path to a CSV file or an already loaded DataFrame.
    if isinstance(fname, (str, os.PathLike)):
        df = pd.read_csv(fname)
    else:
        df = fname

    # Identity check: `!= None` would be evaluated element-wise (and then
    # fail in the `if`) when an array or Index is passed.
    if drop_cols is not None:
        df = df.drop(columns=drop_cols)

    # Items are the columns of the utility matrix; transpose so that each
    # item becomes one sample (row) for the clusterer.
    df_items = df.T

    # Labels are computed on the rating columns only and appended afterwards
    # as the last column.
    df_items['i_cluster'] = model.fit_predict(df_items)
    result = dict(df_items['i_cluster'])

    # Persist the fitted model so it can be reloaded later.
    with open(model_fname, 'wb') as f:
        pickle.dump(model, f)

    return model, result, df_items
# Cluster the items of the utility matrix with the k-means model above and
# keep the fitted model, the item -> cluster mapping and the labelled matrix.
x_i, y_i, df_items = i_cluster(
    fname,
    model_fname='i_cluster.pkl',
    model=model2,
)

Unit Test

import unittest
import pandas as pd
import os


class TestGetRec(unittest.TestCase):
    """Checks of i_cluster run against the utility matrix in `fname`."""

    def test_i_c(self):
        """i_cluster labels every item, uses all clusters and saves the model."""
        # Imported here because names bound in a class body are not visible
        # from inside its methods.
        import tempfile

        from sklearn.cluster import KMeans

        n_clusters = 35
        # Seeded so that the test outcome does not vary between runs.
        model2 = KMeans(n_clusters=n_clusters, random_state=state)

        # Write the pickle into a fresh temporary directory: a file left
        # behind by an earlier run would otherwise make the existence check
        # pass even if i_cluster never saved anything.
        with tempfile.TemporaryDirectory() as tmp_dir:
            model_path = os.path.join(tmp_dir, 'i_cluster.pkl')
            fitted, mapping, df_items = i_cluster(fname, model_path, model2)
            self.assertTrue(os.path.exists(model_path))

        # Every requested cluster must receive at least one item.
        self.assertEqual(df_items['i_cluster'].nunique(), n_clusters)

        # One row per item column of the source file; the header alone is
        # enough to count them.
        item_columns = (pd.read_csv(fname, nrows=0)
                        .drop(columns=drop_cols)
                        .columns)
        self.assertEqual(len(df_items), len(item_columns))
        
# Run the tests in-process: the placeholder argv stops unittest from parsing
# the host process's command line, and exit=False stops it from calling
# sys.exit() once the run is finished.
unittest.main(
    argv=[''],
    exit=False,
    verbosity=2,
)
test_i_c (__main__.TestGetRec) ... ok

----------------------------------------------------------------------
Ran 1 test in 22.761s

OK
<unittest.main.TestProgram at 0x7f4b8fa714f0>