Splitting transaction table randomly to create train and test sets

This notebook contains the following functions:

  1. split_train_test: creates train and test sets by splitting the raw data ‘user_feature.csv’.

  2. evaluate: calculates the MSE and MAE of the final recommendations against the actual ratings in the test set.

  3. append_error_to_df: for visualization purposes and for further exploration of the errors.

The split function returns the train set, the test set, the utility-matrix form of the test set, and the row indices of the train and test sets, which serve as inputs to the content-based recsys pipeline.

# import pandas as pd
# import numpy as np
# data = pd.read_csv('user_feature.csv')
# features = ['userId', 'movieId', 'rating']
# # data
# new_data=data[features]
# new_data
userId movieId rating
0 1 1 4.0
1 1 3 4.0
2 1 6 4.0
3 1 47 5.0
4 1 50 5.0
... ... ... ...
100831 610 166534 4.0
100832 610 168248 5.0
100833 610 168250 5.0
100834 610 168252 5.0
100835 610 170875 3.0

100836 rows × 3 columns

Splitting

import pandas as pd

def split_train_test(data, train_ratio=0.7, random_state=None):
    """
    Split a transaction table into train and test sets, stratified per user.

    Each user's transactions are sampled independently so that roughly
    ``train_ratio`` of every user's rows land in the training set. Users
    with fewer than two transactions cannot be split and are placed
    entirely in the test set.

    Parameters
    ----------
    data : pandas.DataFrame
        Transaction table whose first column is the user id, second column
        is the item id, and third column is the rating.
    train_ratio : float, default 0.7
        Desired fraction of each user's transactions assigned to the
        training set; the remaining ``1 - train_ratio`` goes to the test
        set. Must be strictly between 0 and 1.
    random_state : int or None, default None
        Seed forwarded to ``DataFrame.sample`` for reproducible splits.
        ``None`` (the default) keeps the original non-deterministic
        behavior.

    Returns
    -------
    df_train_fin : pandas.DataFrame
        Training subset of ``data``.
    df_test_fin : pandas.DataFrame
        Test subset of ``data``.
    df_test_um : pandas.DataFrame
        Test set pivoted into utility-matrix form
        (index=user column, columns=item column, values=rating column).
    indx_train : pandas.Index
        Row labels of the training subset in the original frame.
    indx_test : pandas.Index
        Row labels of the test subset in the original frame.

    Raises
    ------
    ValueError
        If ``train_ratio`` is not strictly between 0 and 1.
    """
    if not 0 < train_ratio < 1:
        raise ValueError("train_ratio must be strictly between 0 and 1")

    # Generalize a 3-column transaction table: (user, item, rating).
    user_col, item_col, rating_col = data.columns[:3]

    list_df_train = []
    list_df_test = []

    # 1. Split each user's transactions independently (stratified by user).
    for _, user_df in data.groupby(user_col):
        if len(user_df) < 2:
            # A single transaction cannot be split between the two sets;
            # keep it in the test set (matches the original behavior).
            list_df_test.append(user_df)
        else:
            df_train = user_df.sample(frac=train_ratio,
                                      random_state=random_state)
            df_test = user_df.drop(df_train.index)
            list_df_train.append(df_train)
            list_df_test.append(df_test)

    # 2. Merge the per-user pieces into single train/test dataframes.
    df_train_fin = pd.concat(list_df_train)
    df_test_fin = pd.concat(list_df_test)

    # 3. Pivot the test set into the utility matrix expected by the recsys.
    df_test_um = df_test_fin.pivot(index=user_col, columns=item_col,
                                   values=rating_col)

    # 4. Row indices of the two subsets, for downstream bookkeeping.
    indx_train = df_train_fin.index
    indx_test = df_test_fin.index

    return df_train_fin, df_test_fin, df_test_um, indx_train, indx_test
# Perform the 70/30 per-user split on the transaction table loaded above;
# keep the utility-matrix form of the test set and the row indices for the
# downstream content-based recsys pipeline.
df_train, df_test, df_test_um, ind_train, ind_test = split_train_test(new_data, 0.70)

Unittest

import unittest
import pandas as pd

class Test_split(unittest.TestCase):
    """Sanity checks for split_train_test."""

    def test_shape(self):
        """Train and test rows together must reconstruct the input shape."""
        frame = pd.DataFrame({
            'u': [1, 1, 2, 2, 3, 3, 3, 5, 5, 6],
            'i': [3, 4, 5, 6, 7, 1, 2, 3, 1, 0],
            'r': [5, 6, 7, 8, 9, 3, 2, 1, 0, 9],
        })
        train_part, test_part, _, _, _ = split_train_test(frame, 0.70)
        recombined = pd.concat([train_part, test_part])

        self.assertEqual(frame.shape, recombined.shape)
    

# Run the test suite in-notebook: argv=[''] keeps unittest from parsing the
# kernel's command line, and exit=False prevents it from killing the kernel.
unittest.main(argv=[''], verbosity=2, exit=False)
    
test_shape (__main__.Test_split) ... 
1
ok

----------------------------------------------------------------------
Ran 1 test in 0.021s

OK
<unittest.main.TestProgram at 0x7fcb61ec35e0>