Content Based Model

This notebook uses code from the cross_val, split_train_test, and evaluate pipeline functions.

import pandas as pd
import numpy as np
from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor

def load_data(aug_tt, item_tt, user_tt):
    """
    Load the data from the transaction tables.

    Parameters
    ----------
    aug_tt       : str
                   File name of the parquet file with each row corresponding
                   to a user's features, an item's features, and the user's
                   rating for that item

    item_tt      : str
                   File name of the parquet file with each row corresponding
                   to an item's features

    user_tt      : str
                   File name of the parquet file with each row corresponding
                   to a user's features

    Returns
    -------
    df            : pandas DataFrame
                    The augmented transaction table (rows with any NaN dropped)

    item_df       : pandas DataFrame
                    The item features as a transaction table (movieId removed)

    user_df       : pandas DataFrame
                    The user features as a transaction table (userId removed)

    item_ids      : numpy array
                    All unique item ids, aligned row-wise with item_df

    user_ids      : numpy array
                    All unique user ids present in the augmented table
    """
    # Drop incomplete transactions up front so downstream fits never see NaNs.
    df = pd.read_parquet(aug_tt).dropna()
    item_df = pd.read_parquet(item_tt)
    # Capture the ids BEFORE dropping the column so item_ids[i] corresponds
    # to item_df row i.
    item_ids = item_df['movieId'].unique()
    item_df = item_df.drop(columns=['movieId'])
    user_df = pd.read_parquet(user_tt).drop(columns=['userId'])
    # NOTE(review): user ids come from the transaction table, not user_tt, so
    # users with no transactions are excluded — confirm this is intended.
    user_ids = df['userId'].unique()
    return df, item_df, user_df, item_ids, user_ids


def fit_ml_cb(train_df, model, target_col='rating', drop_cols=None):
    """
    Fit a content-based regression model on the training transaction table.

    Parameters
    ----------
    train_df     : pandas DataFrame
                   The training set as a transaction table. Each row
                   corresponds to a user's features and an item's features
                   along with the user's rating for that item.

    model        : an sklearn regressor object
                   An object with a fit and predict method that outputs a
                   float. The input model itself is not modified; a clone
                   is fitted and returned.

    target_col   : str
                   The column corresponding to the rating.

    drop_cols    : list or None
                   Columns to be dropped in train_df. Defaults to
                   ['userId', 'movieId', 'timestamp'].

    Returns
    -------
    rs_model      : an sklearn model object
                    The fitted clone of the model input, used to predict the
                    rating of a user for an item given the user's features
                    and the item's features.
    """
    # Avoid a mutable default argument while keeping the documented default.
    if drop_cols is None:
        drop_cols = ['userId', 'movieId', 'timestamp']
    rs_model = clone(model)
    # Drop rows with a missing target from BOTH X and y so they stay aligned
    # (previously NaNs were dropped from y only, which could misalign them).
    valid = train_df[target_col].notna()
    target = train_df.loc[valid, target_col].values.ravel()
    features = train_df.loc[valid].drop(columns=[target_col] + drop_cols)
    # BUG FIX: fit the clone, not the caller's model — the original called
    # model.fit(...), mutating the input and making the clone pointless.
    rs_model.fit(features, target)
    return rs_model


def reco_ml_cb(user_df, item_df, item_ids, model_fitted):
    """
    Completes the entire utility matrix based on the fitted model passed.

    Parameters
    ----------
    user_df      : pandas DataFrame
                   One row of user features per user; the index is used as
                   the user id in the returned matrix.

    item_df      : pandas DataFrame
                   One row of item features per item, positionally aligned
                   with item_ids (its index should be 0..len(item_ids)-1 so
                   the join below lines up).

    item_ids     : list-like
                   Item ids used as the columns of the returned matrix.

    model_fitted : an sklearn regressor object
                   A fitted object with a predict method that outputs a
                   float, trained on user features joined with item features.

    Returns
    -------
    full_matrix  : a pandas DataFrame
                   The completed utility matrix: one row per user, one
                   column per item id, filled with predicted ratings.
    """
    recos = {}
    n_users = len(user_df)
    for c, (u, u_feats) in enumerate(user_df.iterrows(), start=1):
        # Lightweight progress indicator; '\r' keeps it on one console line.
        print(c, 'out of', n_users, end='\r')
        # Replicate this user's feature row once per item...
        u_feats = pd.concat([pd.DataFrame(u_feats).T] *
                            len(item_ids)).reset_index(drop=True)
        # ...then pair it with every item's features (positional join).
        a_feats = u_feats.join(item_df)
        recos[u] = pd.Series(model_fitted.predict(a_feats), index=item_ids)
    full_matrix = pd.DataFrame.from_dict(recos, orient='index')
    return full_matrix


def reco_ml_cb_tt(df_test, model_fitted, target='rating', drop_cols=None):
    """
    Make predictions on the test set and output an array of the predicted
    values.

    Parameters
    ----------
    df_test      : pandas DataFrame
                   The test set as a transaction table. Each row
                   corresponds to a user's features and an item's features
                   along with the user's rating for that item.

    model_fitted : an sklearn regressor object
                   An object with a fit and predict method that outputs a
                   float. Must be fitted already.

    target       : str
                   The column corresponding to the rating.

    drop_cols    : list or None
                   Columns to be dropped in df_test. Defaults to
                   ['userId', 'movieId', 'timestamp'].

    Returns
    -------
    result       : numpy array
                   The model's predictions for df_test's feature rows.
    """
    # Avoid a mutable default argument while keeping the documented default.
    if drop_cols is None:
        drop_cols = ['userId', 'movieId', 'timestamp']
    # Keep feature columns only: the target and id/bookkeeping columns were
    # not part of the training features.
    features = df_test.drop(columns=[target] + drop_cols)
    return model_fitted.predict(features)
def split_train_test(data, train_ratio=0.7, uid='userId', iid='movieId', rid='rating', random_state=None):
    """
    Split the transaction data into train and test sets, stratified per user.

    Parameters
    ----------
    data         : pandas DataFrame
                   Transaction table containing user, item, and ratings; the
                   FIRST column is assumed to be the user id.

    train_ratio  : float
                   Desired fraction of each user's rows for training;
                   1 - train_ratio goes to the test set.

    uid, iid, rid : str
                   Column names used to pivot the test set into a utility
                   matrix (index=uid, columns=iid, values=rid).

    random_state : int or None
                   Seed forwarded to DataFrame.sample for reproducible
                   splits. None (the default) keeps the original fully
                   random behavior.

    Returns
    -------
    df_train_fin : pandas DataFrame, the training rows.

    df_test_fin  : pandas DataFrame, the test rows.

    df_test_um   : pandas DataFrame, the test set pivoted into a utility
                   matrix ready as recsys input.

    indx_train   : index of the training rows within `data`.

    indx_test    : index of the test rows within `data`.
    """
    list_df_train = []
    list_df_test = []

    # 1. Group the transactions by user (assumes column 0 is the user id).
    d = dict(tuple(data.groupby(data.columns[0])))

    # Split each user's transactions independently so every user with enough
    # ratings is represented on both sides of the split.
    for user_rows in d.values():
        if len(user_rows) < 2:
            # A user with a single rating cannot be split; send it to test.
            list_df_test.append(user_rows)
        else:
            df_train = user_rows.sample(frac=train_ratio, random_state=random_state)
            df_test = user_rows.drop(df_train.index)
            list_df_train.append(df_train)
            list_df_test.append(df_test)

    # 2. Merge the per-user pieces back into single frames.
    df_train_fin = pd.concat(list_df_train)
    df_test_fin = pd.concat(list_df_test)

    # 3. Pivot the test set into the utility-matrix layout.
    df_test_um = df_test_fin.pivot(index=uid, columns=iid, values=rid)

    # 4. Return the row indices too, so ground truth can be looked up later.
    return df_train_fin, df_test_fin, df_test_um, df_train_fin.index, df_test_fin.index
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

def evaluate(df_test_result, df_test_data):
    """
    Calculate per-user MSE and MAE of the recommender-system results
    against a pivoted ground-truth test matrix.

    Parameters
    ----------
    df_test_result   : utility matrix containing the result of the
                       recommender system

    df_test_data     : pivoted test data generated from splitting the
                       transaction table

    Returns
    -------
    mse_list         : list of mean squared error for each user

    mae_list         : list of mean absolute error for each user
    """
    mse_list, mae_list = [], []

    # Every user in the ground-truth matrix must also appear in the result
    # matrix; otherwise no per-user comparison is possible.
    missing_users = df_test_data.index.difference(df_test_result.index)

    if len(missing_users) == 0:
        print('proceed')
        for uid in df_test_result.index:
            pred_row = df_test_result[df_test_result.index == uid].fillna(0)
            true_row = df_test_data[df_test_data.index == uid].fillna(0)
            # Restrict predictions to the ground-truth columns, in order.
            pred_row = pred_row[true_row.columns]
            mse_list.append(mean_squared_error(true_row, pred_row))
            mae_list.append(mean_absolute_error(true_row, pred_row))
    else:
        # Mismatched indices: report and fall through with empty lists.
        print('error')

    return mse_list, mae_list

import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

def evaluate_arrays(model_result_arr, df_data, indx_test):
    """
    Calculate the MSE and MAE of the recommender system for a given result
    array and test set.

    Parameters
    ----------
    model_result_arr   : array-like
                         Predicted ratings produced by the recommender
                         system on the test set, in test-index order.

    df_data            : pandas DataFrame
                         The original transaction table from before the
                         split; ground-truth ratings are extracted from its
                         THIRD column using indx_test.

    indx_test          : index of the test rows produced by the split.

    Returns
    -------
    mse                : float, mean squared error (sklearn).

    mae                : float, mean absolute error (sklearn).

    Raises
    ------
    ValueError         : if the prediction and truth arrays differ in length.
    """
    # Ground truth: the third column of the transaction table (the rating
    # column in the uid/iid/rating/timestamp layout), at the test indices.
    test_arr = df_data.loc[pd.Index(indx_test), df_data.columns[2]].values

    # The predictions must align one-to-one with the ground truth.
    if len(model_result_arr) != len(test_arr):
        raise ValueError('the arrays are of different lengths: %s vs %s'
                         % (len(model_result_arr), len(test_arr)))

    print('proceed')
    mse = mean_squared_error(test_arr, model_result_arr)
    mae = mean_absolute_error(test_arr, model_result_arr)
    return mse, mae
def cross_val(df, k, model, split_method='random', train_sample_size=100):
    """
    Perform cross-validation over repeated train/test splits.

    Parameters
    ----------
    df                 : pandas DataFrame
                         The data to be split, in the form of a
                         vanilla/transaction++ table (uid, iid, rating,
                         timestamp).

    k                  : int
                         Number of split-and-fit rounds. Only the 'random'
                         method repeats k times; 'chronological' runs once
                         because its split is deterministic.

    model              : an unfitted sklearn model. A fresh clone is fitted
                         on every round, so the input is never mutated.

    split_method       : 'random' or 'chronological' splitting of the data.

    train_sample_size  : int
                         Number of training rows sampled to fit the model
                         each round (kept small for speed; was hard-coded
                         to 100).

    Returns
    -------
    mse, mae           : lists of error metrics (sklearn), one entry per
                         round.
    """
    mse = []
    mae = []

    def _run_round(split_fn):
        # One round: split, fit a fresh clone, predict on test, score.
        print('Starting splitting')
        df_train, df_test, df_test_um, indx_train, indx_test = split_fn(
            df, 0.7)
        print('Finished splitting')
        model_clone = clone(model)
        print('Starting training')
        model_clone_fit = fit_ml_cb(df_train.sample(train_sample_size),
                                    model_clone)
        print('Finished training')
        print('Starting completing matrix')
        # BUG FIX: predict with the model fitted THIS round
        # (model_clone_fit), not the stale notebook global `model_fit`.
        result = reco_ml_cb_tt(df_test, model_clone_fit)
        print('Finished completing matrix')
        print('Starting computing MAE and MSE')
        mse_i, mae_i = evaluate_arrays(result, df, indx_test)
        print('Finished computing MAE and MSE')
        mse.append(mse_i)
        mae.append(mae_i)

    if split_method == 'random':
        for i in range(k):
            print(i)
            _run_round(split_train_test)
    elif split_method == 'chronological':
        # NOTE(review): split_train_test_chronological is not defined in
        # this file — it must be provided elsewhere before using this path.
        _run_round(split_train_test_chronological)

    return mse, mae

Model Pipeline

# Declare the model (fixed random_state for reproducibility; n_jobs=-1 uses all cores)
rs_model1 = RandomForestRegressor(random_state=202109, n_jobs=-1)
# Load the augmented transaction table plus per-item / per-user feature tables
df, item_df, user_df, item_ids, user_ids = load_data('augmented_transaction_table.parquet',
                                                     'item_feature.parquet',
                                                     'user_feature.parquet')
# Random per-user train/test split (70% train)
df_train, df_test, df_test_um, indx_train, indx_test = split_train_test(df, 0.7) #To split the data
# Fit the model on a 100-row sample of the training set (kept small for speed)
model_fit = fit_ml_cb(df_train.sample(100), rs_model1) #To fit the model
# Predict ratings for every test-set row
preds_array = reco_ml_cb_tt(df_test, model_fit) #To make predictions as an array
# NOTE(review): this passes the already-fitted model_fit; cross_val clones and
# refits it each round, so passing the unfitted rs_model1 would be clearer — confirm.
mse, mae = cross_val(df, 5, model_fit, split_method='random')
0
Starting splitting
Finished splitting
Starting training
Finished training
Starting completing matrix
Finished completing matrix
Starting computing MAE and MSE
proceed
Finished computing MAE and MSE
1
Starting splitting
Finished splitting
Starting training
Finished training
Starting completing matrix
Finished completing matrix
Starting computing MAE and MSE
proceed
Finished computing MAE and MSE
2
Starting splitting
Finished splitting
Starting training
Finished training
Starting completing matrix
Finished completing matrix
Starting computing MAE and MSE
proceed
Finished computing MAE and MSE
3
Starting splitting
Finished splitting
Starting training
Finished training
Starting completing matrix
Finished completing matrix
Starting computing MAE and MSE
proceed
Finished computing MAE and MSE
4
Starting splitting
Finished splitting
Starting training
Finished training
Starting completing matrix
Finished completing matrix
Starting computing MAE and MSE
proceed
Finished computing MAE and MSE
evaluate_arrays(preds_array, df, indx_test) #MSE and MAE
proceed
(0.8669795716126364, 0.7143198938393699)
import unittest


class TestGetRec(unittest.TestCase):
    """Smoke tests for the content-based recommender pipeline.

    NOTE(review): these read the parquet files from disk and rely on the
    module-level `rs_model1`, so they are integration tests, not unit tests.
    """
    # The class-body imports that used to live here were removed: they only
    # created unused class attributes, and the module-level imports already
    # provide pandas, numpy, clone, and RandomForestRegressor.

    def test_matrix_shape(self):
        # The completed utility matrix must have one row per user and one
        # column per item.
        df, item_df, user_df, item_ids, user_ids = load_data('augmented_transaction_table.parquet',
                                                     'item_feature.parquet',
                                                     'user_feature.parquet')
        df_train, df_test, df_test_um, indx_train, indx_test = split_train_test(df, 0.7)
        model_fit = fit_ml_cb(df_train.sample(100), rs_model1)
        matrix_result = reco_ml_cb(user_df, item_df, item_ids, model_fit)
        self.assertEqual(matrix_result.shape[0], len(user_ids))
        self.assertEqual(matrix_result.shape[1], len(item_ids))

    def test_array_pred(self):
        # Transaction-table prediction must yield one value per test row.
        df, item_df, user_df, item_ids, user_ids = load_data('augmented_transaction_table.parquet',
                                                     'item_feature.parquet',
                                                     'user_feature.parquet')
        df_train, df_test, df_test_um, indx_train, indx_test = split_train_test(df, 0.7)
        model_fit = fit_ml_cb(df_train.sample(100), rs_model1)
        array_result = reco_ml_cb_tt(df_test, model_fit)
        self.assertEqual(len(array_result), len(df_test))
        
# Run the suite in-notebook: argv=[''] stops unittest from parsing Jupyter's
# kernel argv, and exit=False keeps the kernel alive after the run.
unittest.main(argv=[''], verbosity=2, exit=False)
test_array_pred (__main__.TestGetRec) ... 
userId movieId rating timestamp u_1 u_2 u_3 u_4 u_5 u_6 ... i_291 i_292 i_293 i_294 i_295 i_296 i_297 i_298 i_299 i_300
1 1 3 4.0 964981247 85 29 42 83 47 26 ... 0.0 0.0 0.000000 0.0 0.0 0.000000 0.0 0.000000 0.0 0.000000
2 1 6 4.0 964982224 85 29 42 83 47 26 ... 0.0 0.0 0.000000 0.0 0.0 0.000000 0.0 0.000000 0.0 0.000000
5 1 70 3.0 964982400 85 29 42 83 47 26 ... 0.0 0.0 0.000000 0.0 0.0 0.000000 0.0 0.000000 0.0 0.000000
11 1 216 5.0 964981208 85 29 42 83 47 26 ... 0.0 0.0 0.000000 0.0 0.0 0.000000 0.0 0.448884 0.0 0.000000
18 1 333 5.0 964981179 85 29 42 83 47 26 ... 0.0 0.0 0.000000 0.0 0.0 0.000000 0.0 0.000000 0.0 0.000000
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
100824 610 161582 4.0 1493847759 267 66 56 411 151 119 ... 0.0 0.0 0.000000 0.0 0.0 0.000000 0.0 0.000000 0.0 0.000000
100826 610 162350 3.5 1493849971 267 66 56 411 151 119 ... 0.0 0.0 0.000000 0.0 0.0 0.000000 0.0 0.000000 0.0 0.000000
100829 610 164179 5.0 1493845631 267 66 56 411 151 119 ... 0.0 0.0 0.000000 0.0 0.0 0.296870 0.0 0.000000 0.0 0.000000
100834 610 168252 5.0 1493846352 267 66 56 411 151 119 ... 0.0 0.0 0.000000 0.0 0.0 0.291784 0.0 0.000000 0.0 0.282198
100835 610 170875 3.0 1493846415 267 66 56 411 151 119 ... 0.0 0.0 0.365896 0.0 0.0 0.313337 0.0 0.000000 0.0 0.000000

30101 rows × 324 columns

ok
test_matrix_shape (__main__.TestGetRec) ... 
userId movieId rating timestamp u_1 u_2 u_3 u_4 u_5 u_6 ... i_291 i_292 i_293 i_294 i_295 i_296 i_297 i_298 i_299 i_300
0 1 1 4.0 964982703 85 29 42 83 47 26 ... 0.0 0.0 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0
4 1 50 5.0 964982931 85 29 42 83 47 26 ... 0.0 0.0 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0
5 1 70 3.0 964982400 85 29 42 83 47 26 ... 0.0 0.0 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0
6 1 101 5.0 964980868 85 29 42 83 47 26 ... 0.0 0.0 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0
8 1 151 5.0 964984041 85 29 42 83 47 26 ... 0.0 0.0 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
100817 610 158956 3.0 1493848947 267 66 56 411 151 119 ... 0.0 0.0 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0
100819 610 160080 3.0 1493848031 267 66 56 411 151 119 ... 0.0 0.0 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0
100827 610 163937 3.5 1493848789 267 66 56 411 151 119 ... 0.0 0.0 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0
100828 610 163981 3.5 1493850155 267 66 56 411 151 119 ... 0.0 0.0 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0
100832 610 168248 5.0 1493850091 267 66 56 411 151 119 ... 0.0 0.0 0.0 0.0 0.0 0.24502 0.0 0.0 0.0 0.0

30101 rows × 324 columns

130 out of 610