Content Based Model

This notebook uses code from the cross_val, split_train_test, and evaluate pipeline functions.

import pandas as pd
import numpy as np
from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor

def load_data(aug_tt, item_tt, user_tt):
    """
    Load the data from the transaction tables.

    Parameters
    ----------
    aug_tt       : str
                   File name of the parquet file with each row corresponding
                   to a user's features, an item's features, and the user's
                   rating for that item

    item_tt      : str
                   File name of the parquet file with each row corresponding
                   to an item's features

    user_tt      : str
                   File name of the parquet file with each row corresponding
                   to a user's features

    Returns
    -------
    df            : pandas DataFrame
                    The augmented transaction table (rows with any NaN dropped)

    item_df       : pandas DataFrame
                    The item features as a transaction table (movieId removed)

    user_df       : pandas DataFrame
                    The user features as a transaction table (userId removed)

    item_ids      : numpy array
                    All unique item ids, aligned row-wise with item_df

    user_ids      : numpy array
                    All unique user ids present in the augmented table
    """
    # Drop incomplete transactions up front so downstream fits never see NaNs.
    df = pd.read_parquet(aug_tt).dropna()
    item_df = pd.read_parquet(item_tt)
    # Capture the ids BEFORE dropping the column so item_ids[i] corresponds
    # to item_df row i.
    item_ids = item_df['movieId'].unique()
    item_df = item_df.drop(columns=['movieId'])
    user_df = pd.read_parquet(user_tt).drop(columns=['userId'])
    # NOTE(review): user ids come from the transaction table, not user_tt, so
    # users with no transactions are excluded — confirm this is intended.
    user_ids = df['userId'].unique()
    return df, item_df, user_df, item_ids, user_ids


def fit_ml_cb(train_df, model, target_col='rating', drop_cols=None):
    """
    Fit a content-based regression model on the training transaction table.

    Parameters
    ----------
    train_df     : pandas DataFrame
                   The training set as a transaction table. Each row
                   corresponds to a user's features and an item's features
                   along with the user's rating for that item.

    model        : an sklearn regressor object
                   An object with a fit and predict method that outputs a
                   float. The input model itself is not modified; a clone
                   is fitted and returned.

    target_col   : str
                   The column corresponding to the rating.

    drop_cols    : list or None
                   Columns to be dropped in train_df. Defaults to
                   ['userId', 'movieId', 'timestamp'].

    Returns
    -------
    rs_model      : an sklearn model object
                    The fitted clone of the model input, used to predict the
                    rating of a user for an item given the user's features
                    and the item's features.
    """
    # Avoid a mutable default argument while keeping the documented default.
    if drop_cols is None:
        drop_cols = ['userId', 'movieId', 'timestamp']
    rs_model = clone(model)
    # Drop rows with a missing target from BOTH X and y so they stay aligned
    # (previously NaNs were dropped from y only, which could misalign them).
    valid = train_df[target_col].notna()
    target = train_df.loc[valid, target_col].values.ravel()
    features = train_df.loc[valid].drop(columns=[target_col] + drop_cols)
    # BUG FIX: fit the clone, not the caller's model — the original called
    # model.fit(...), mutating the input and making the clone pointless.
    rs_model.fit(features, target)
    return rs_model


def reco_ml_cb(user_df, item_df, item_ids, model_fitted):
    """
    Completes the entire utility matrix based on the fitted model passed.

    Parameters
    ----------
    user_df      : pandas DataFrame
                   One row of user features per user; the index is used as
                   the user id in the returned matrix.

    item_df      : pandas DataFrame
                   One row of item features per item, positionally aligned
                   with item_ids (its index should be 0..len(item_ids)-1 so
                   the join below lines up).

    item_ids     : list-like
                   Item ids used as the columns of the returned matrix.

    model_fitted : an sklearn regressor object
                   A fitted object with a predict method that outputs a
                   float, trained on user features joined with item features.

    Returns
    -------
    full_matrix  : a pandas DataFrame
                   The completed utility matrix: one row per user, one
                   column per item id, filled with predicted ratings.
    """
    recos = {}
    n_users = len(user_df)
    for c, (u, u_feats) in enumerate(user_df.iterrows(), start=1):
        # Lightweight progress indicator; '\r' keeps it on one console line.
        print(c, 'out of', n_users, end='\r')
        # Replicate this user's feature row once per item...
        u_feats = pd.concat([pd.DataFrame(u_feats).T] *
                            len(item_ids)).reset_index(drop=True)
        # ...then pair it with every item's features (positional join).
        a_feats = u_feats.join(item_df)
        recos[u] = pd.Series(model_fitted.predict(a_feats), index=item_ids)
    full_matrix = pd.DataFrame.from_dict(recos, orient='index')
    return full_matrix


def reco_ml_cb_tt(df_test, model_fitted, target='rating', drop_cols=None):
    """
    Make predictions on the test set and output an array of the predicted
    values.

    Parameters
    ----------
    df_test      : pandas DataFrame
                   The test set as a transaction table. Each row
                   corresponds to a user's features and an item's features
                   along with the user's rating for that item.

    model_fitted : an sklearn regressor object
                   An object with a fit and predict method that outputs a
                   float. Must be fitted already.

    target       : str
                   The column corresponding to the rating.

    drop_cols    : list or None
                   Columns to be dropped in df_test. Defaults to
                   ['userId', 'movieId', 'timestamp'].

    Returns
    -------
    result       : numpy array
                   The model's predictions for df_test's feature rows.
    """
    # Avoid a mutable default argument while keeping the documented default.
    if drop_cols is None:
        drop_cols = ['userId', 'movieId', 'timestamp']
    # Keep feature columns only: the target and id/bookkeeping columns were
    # not part of the training features.
    features = df_test.drop(columns=[target] + drop_cols)
    return model_fitted.predict(features)
def split_train_test(data, train_ratio=0.7, uid='userId', iid='movieId', rid='rating', random_state=None):
    """
    Split the transaction data into train and test sets, stratified per user.

    Parameters
    ----------
    data         : pandas DataFrame
                   Transaction table containing user, item, and ratings; the
                   FIRST column is assumed to be the user id.

    train_ratio  : float
                   Desired fraction of each user's rows for training;
                   1 - train_ratio goes to the test set.

    uid, iid, rid : str
                   Column names used to pivot the test set into a utility
                   matrix (index=uid, columns=iid, values=rid).

    random_state : int or None
                   Seed forwarded to DataFrame.sample for reproducible
                   splits. None (the default) keeps the original fully
                   random behavior.

    Returns
    -------
    df_train_fin : pandas DataFrame, the training rows.

    df_test_fin  : pandas DataFrame, the test rows.

    df_test_um   : pandas DataFrame, the test set pivoted into a utility
                   matrix ready as recsys input.

    indx_train   : index of the training rows within `data`.

    indx_test    : index of the test rows within `data`.
    """
    list_df_train = []
    list_df_test = []

    # 1. Group the transactions by user (assumes column 0 is the user id).
    d = dict(tuple(data.groupby(data.columns[0])))

    # Split each user's transactions independently so every user with enough
    # ratings is represented on both sides of the split.
    for user_rows in d.values():
        if len(user_rows) < 2:
            # A user with a single rating cannot be split; send it to test.
            list_df_test.append(user_rows)
        else:
            df_train = user_rows.sample(frac=train_ratio, random_state=random_state)
            df_test = user_rows.drop(df_train.index)
            list_df_train.append(df_train)
            list_df_test.append(df_test)

    # 2. Merge the per-user pieces back into single frames.
    df_train_fin = pd.concat(list_df_train)
    df_test_fin = pd.concat(list_df_test)

    # 3. Pivot the test set into the utility-matrix layout.
    df_test_um = df_test_fin.pivot(index=uid, columns=iid, values=rid)

    # 4. Return the row indices too, so ground truth can be looked up later.
    return df_train_fin, df_test_fin, df_test_um, df_train_fin.index, df_test_fin.index
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

def evaluate(df_test_result, df_test_data):
    """
    Calculate per-user MSE and MAE of the recommender-system results
    against a pivoted ground-truth test matrix.

    Parameters
    ----------
    df_test_result   : utility matrix containing the result of the
                       recommender system

    df_test_data     : pivoted test data generated from splitting the
                       transaction table

    Returns
    -------
    mse_list         : list of mean squared error for each user

    mae_list         : list of mean absolute error for each user
    """
    mse_list, mae_list = [], []

    # Every user in the ground-truth matrix must also appear in the result
    # matrix; otherwise no per-user comparison is possible.
    missing_users = df_test_data.index.difference(df_test_result.index)

    if len(missing_users) == 0:
        print('proceed')
        for uid in df_test_result.index:
            pred_row = df_test_result[df_test_result.index == uid].fillna(0)
            true_row = df_test_data[df_test_data.index == uid].fillna(0)
            # Restrict predictions to the ground-truth columns, in order.
            pred_row = pred_row[true_row.columns]
            mse_list.append(mean_squared_error(true_row, pred_row))
            mae_list.append(mean_absolute_error(true_row, pred_row))
    else:
        # Mismatched indices: report and fall through with empty lists.
        print('error')

    return mse_list, mae_list

import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

def evaluate_arrays(model_result_arr, df_data, indx_test):
    """
    Calculate the MSE and MAE of the recommender system for a given result
    array and test set.

    Parameters
    ----------
    model_result_arr   : array-like
                         Predicted ratings produced by the recommender
                         system on the test set, in test-index order.

    df_data            : pandas DataFrame
                         The original transaction table from before the
                         split; ground-truth ratings are extracted from its
                         THIRD column using indx_test.

    indx_test          : index of the test rows produced by the split.

    Returns
    -------
    mse                : float, mean squared error (sklearn).

    mae                : float, mean absolute error (sklearn).

    Raises
    ------
    ValueError         : if the prediction and truth arrays differ in length.
    """
    # Ground truth: the third column of the transaction table (the rating
    # column in the uid/iid/rating/timestamp layout), at the test indices.
    test_arr = df_data.loc[pd.Index(indx_test), df_data.columns[2]].values

    # The predictions must align one-to-one with the ground truth.
    if len(model_result_arr) != len(test_arr):
        raise ValueError('the arrays are of different lengths: %s vs %s'
                         % (len(model_result_arr), len(test_arr)))

    print('proceed')
    mse = mean_squared_error(test_arr, model_result_arr)
    mae = mean_absolute_error(test_arr, model_result_arr)
    return mse, mae
def cross_val(df, k, model, split_method='random', train_sample_size=100):
    """
    Perform cross-validation over repeated train/test splits.

    Parameters
    ----------
    df                 : pandas DataFrame
                         The data to be split, in the form of a
                         vanilla/transaction++ table (uid, iid, rating,
                         timestamp).

    k                  : int
                         Number of split-and-fit rounds. Only the 'random'
                         method repeats k times; 'chronological' runs once
                         because its split is deterministic.

    model              : an unfitted sklearn model. A fresh clone is fitted
                         on every round, so the input is never mutated.

    split_method       : 'random' or 'chronological' splitting of the data.

    train_sample_size  : int
                         Number of training rows sampled to fit the model
                         each round (kept small for speed; was hard-coded
                         to 100).

    Returns
    -------
    mse, mae           : lists of error metrics (sklearn), one entry per
                         round.
    """
    mse = []
    mae = []

    def _run_round(split_fn):
        # One round: split, fit a fresh clone, predict on test, score.
        print('Starting splitting')
        df_train, df_test, df_test_um, indx_train, indx_test = split_fn(
            df, 0.7)
        print('Finished splitting')
        model_clone = clone(model)
        print('Starting training')
        model_clone_fit = fit_ml_cb(df_train.sample(train_sample_size),
                                    model_clone)
        print('Finished training')
        print('Starting completing matrix')
        # BUG FIX: predict with the model fitted THIS round
        # (model_clone_fit), not the stale notebook global `model_fit`.
        result = reco_ml_cb_tt(df_test, model_clone_fit)
        print('Finished completing matrix')
        print('Starting computing MAE and MSE')
        mse_i, mae_i = evaluate_arrays(result, df, indx_test)
        print('Finished computing MAE and MSE')
        mse.append(mse_i)
        mae.append(mae_i)

    if split_method == 'random':
        for i in range(k):
            print(i)
            _run_round(split_train_test)
    elif split_method == 'chronological':
        # NOTE(review): split_train_test_chronological is not defined in
        # this file — it must be provided elsewhere before using this path.
        _run_round(split_train_test_chronological)

    return mse, mae

Model Pipeline

# Declare the model (fixed random_state for reproducibility; n_jobs=-1 uses all cores)
rs_model1 = RandomForestRegressor(random_state=202109, n_jobs=-1)
# Load the augmented transaction table plus per-item / per-user feature tables
df, item_df, user_df, item_ids, user_ids = load_data('augmented_transaction_table.parquet',
                                                     'item_feature.parquet',
                                                     'user_feature.parquet')
# Random per-user train/test split (70% train)
df_train, df_test, df_test_um, indx_train, indx_test = split_train_test(df, 0.7) #To split the data
# Fit the model on a 100-row sample of the training set (kept small for speed)
model_fit = fit_ml_cb(df_train.sample(100), rs_model1) #To fit the model
# Predict ratings for every test-set row
preds_array = reco_ml_cb_tt(df_test, model_fit) #To make predictions as an array
# NOTE(review): this passes the already-fitted model_fit; cross_val clones and
# refits it each round, so passing the unfitted rs_model1 would be clearer — confirm.
mse, mae = cross_val(df, 5, model_fit, split_method='random')
0
Starting splitting
Finished splitting
Starting training
Finished training
Starting completing matrix
Finished completing matrix
Starting computing MAE and MSE
proceed
Finished computing MAE and MSE
1
Starting splitting
Finished splitting
Starting training
Finished training
Starting completing matrix
Finished completing matrix
Starting computing MAE and MSE
proceed
Finished computing MAE and MSE
2
Starting splitting
Finished splitting
Starting training
Finished training
Starting completing matrix
Finished completing matrix
Starting computing MAE and MSE
proceed
Finished computing MAE and MSE
3
Starting splitting
Finished splitting
Starting training
Finished training
Starting completing matrix
Finished completing matrix
Starting computing MAE and MSE
proceed
Finished computing MAE and MSE
4
Starting splitting
Finished splitting
Starting training
Finished training
Starting completing matrix
Finished completing matrix
Starting computing MAE and MSE
proceed
Finished computing MAE and MSE
evaluate_arrays(preds_array, df, indx_test) #MSE and MAE
proceed
(0.8669795716126364, 0.7143198938393699)
import unittest


class TestGetRec(unittest.TestCase):
    """Smoke tests for the content-based recommender pipeline.

    NOTE(review): these read the parquet files from disk and rely on the
    module-level `rs_model1`, so they are integration tests, not unit tests.
    """
    # The class-body imports that used to live here were removed: they only
    # created unused class attributes, and the module-level imports already
    # provide pandas, numpy, clone, and RandomForestRegressor.

    def test_matrix_shape(self):
        # The completed utility matrix must have one row per user and one
        # column per item.
        df, item_df, user_df, item_ids, user_ids = load_data('augmented_transaction_table.parquet',
                                                     'item_feature.parquet',
                                                     'user_feature.parquet')
        df_train, df_test, df_test_um, indx_train, indx_test = split_train_test(df, 0.7)
        model_fit = fit_ml_cb(df_train.sample(100), rs_model1)
        matrix_result = reco_ml_cb(user_df, item_df, item_ids, model_fit)
        self.assertEqual(matrix_result.shape[0], len(user_ids))
        self.assertEqual(matrix_result.shape[1], len(item_ids))

    def test_array_pred(self):
        # Transaction-table prediction must yield one value per test row.
        df, item_df, user_df, item_ids, user_ids = load_data('augmented_transaction_table.parquet',
                                                     'item_feature.parquet',
                                                     'user_feature.parquet')
        df_train, df_test, df_test_um, indx_train, indx_test = split_train_test(df, 0.7)
        model_fit = fit_ml_cb(df_train.sample(100), rs_model1)
        array_result = reco_ml_cb_tt(df_test, model_fit)
        self.assertEqual(len(array_result), len(df_test))
        
# Run the suite in-notebook: argv=[''] stops unittest from parsing Jupyter's
# kernel argv, and exit=False keeps the kernel alive after the run.
unittest.main(argv=[''], verbosity=2, exit=False)
test_array_pred (__main__.TestGetRec) ... 
userId movieId rating timestamp u_1 u_2 u_3 u_4 u_5 u_6 ... i_291 i_292 i_293 i_294 i_295 i_296 i_297 i_298 i_299 i_300
1 1 3 4.0 964981247 85 29 42 83 47 26 ... 0.0 0.0 0.000000 0.0 0.0 0.000000 0.0 0.000000 0.0 0.000000
2 1 6 4.0 964982224 85 29 42 83 47 26 ... 0.0 0.0 0.000000 0.0 0.0 0.000000 0.0 0.000000 0.0 0.000000
5 1 70 3.0 964982400 85 29 42 83 47 26 ... 0.0 0.0 0.000000 0.0 0.0 0.000000 0.0 0.000000 0.0 0.000000
11 1 216 5.0 964981208 85 29 42 83 47 26 ... 0.0 0.0 0.000000 0.0 0.0 0.000000 0.0 0.448884 0.0 0.000000
18 1 333 5.0 964981179 85 29 42 83 47 26 ... 0.0 0.0 0.000000 0.0 0.0 0.000000 0.0 0.000000 0.0 0.000000
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
100824 610 161582 4.0 1493847759 267 66 56 411 151 119 ... 0.0 0.0 0.000000 0.0 0.0 0.000000 0.0 0.000000 0.0 0.000000
100826 610 162350 3.5 1493849971 267 66 56 411 151 119 ... 0.0 0.0 0.000000 0.0 0.0 0.000000 0.0 0.000000 0.0 0.000000
100829 610 164179 5.0 1493845631 267 66 56 411 151 119 ... 0.0 0.0 0.000000 0.0 0.0 0.296870 0.0 0.000000 0.0 0.000000
100834 610 168252 5.0 1493846352 267 66 56 411 151 119 ... 0.0 0.0 0.000000 0.0 0.0 0.291784 0.0 0.000000 0.0 0.282198
100835 610 170875 3.0 1493846415 267 66 56 411 151 119 ... 0.0 0.0 0.365896 0.0 0.0 0.313337 0.0 0.000000 0.0 0.000000

30101 rows × 324 columns

ok
test_matrix_shape (__main__.TestGetRec) ... 
userId movieId rating timestamp u_1 u_2 u_3 u_4 u_5 u_6 ... i_291 i_292 i_293 i_294 i_295 i_296 i_297 i_298 i_299 i_300
0 1 1 4.0 964982703 85 29 42 83 47 26 ... 0.0 0.0 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0
4 1 50 5.0 964982931 85 29 42 83 47 26 ... 0.0 0.0 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0
5 1 70 3.0 964982400 85 29 42 83 47 26 ... 0.0 0.0 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0
6 1 101 5.0 964980868 85 29 42 83 47 26 ... 0.0 0.0 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0
8 1 151 5.0 964984041 85 29 42 83 47 26 ... 0.0 0.0 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
100817 610 158956 3.0 1493848947 267 66 56 411 151 119 ... 0.0 0.0 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0
100819 610 160080 3.0 1493848031 267 66 56 411 151 119 ... 0.0 0.0 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0
100827 610 163937 3.5 1493848789 267 66 56 411 151 119 ... 0.0 0.0 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0
100828 610 163981 3.5 1493850155 267 66 56 411 151 119 ... 0.0 0.0 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0
100832 610 168248 5.0 1493850091 267 66 56 411 151 119 ... 0.0 0.0 0.0 0.0 0.0 0.24502 0.0 0.0 0.0 0.0

30101 rows × 324 columns

130 out of 610