Splitting the transaction table chronologically into train and test sets

This notebook contains the first of the following functions, modified to take the timestamp of the ratings into account:

  1. split_train_test: creates train and test sets by splitting the raw data ‘user_feature.csv’.

  2. evaluate: calculates the MSE and MAE of the final recommendations against the actual recommendations based on the test set.

  3. append_error_to_df: for visualization purposes and for further exploration of the errors.

The current function now returns the train set, the test set, the utility-matrix form of the test set, and the indices of the train and test sets, which will be used as input for the content-based recsys pipeline.

# import pandas as pd
# import numpy as np
# data = pd.read_csv('user_feature.csv')
# features = ['userId', 'movieId', 'rating', 'timestamp']
# # data
# new_data=data[features]
# new_data
userId movieId rating timestamp
0 1 1 4.0 964982703
1 1 3 4.0 964981247
2 1 6 4.0 964982224
3 1 47 5.0 964983815
4 1 50 5.0 964982931
... ... ... ... ...
100831 610 166534 4.0 1493848402
100832 610 168248 5.0 1493850091
100833 610 168250 5.0 1494273047
100834 610 168252 5.0 1493846352
100835 610 170875 3.0 1493846415

100836 rows × 4 columns

Splitting chronologically

import pandas as pd

def _concat_parts(parts, template):
    """Concatenate the non-empty frames in ``parts``; return an empty slice of ``template`` if there are none."""
    non_empty = [part for part in parts if len(part)]
    if not non_empty:
        # pd.concat raises on an empty list; an empty slice keeps columns and dtypes.
        return template.iloc[0:0]
    return pd.concat(non_empty)


def split_train_test_chronological(data, train_ratio=0.7):
    """
    Splits the transaction data into train and test sets based on chronological order.

    For every user the ratings are sorted by ``timestamp`` (ties keep their input
    order). The earliest ``int(train_ratio * n)`` of the user's ``n`` ratings go to
    the train set and the remaining, most recent ones go to the test set. A user
    with a single rating is placed entirely in the test set. ``data`` itself is
    not modified.

    Parameters
    ----------
    data         : pandas DataFrame for transaction table containing user, item, and ratings.
                   Column 0 is used as the user id, column 1 as the item id and column 2 as
                   the rating; a column named 'timestamp' must be present.

    train_ratio  : the desired ratio of training set (between 0 and 1), while 1-train ratio
                   is automatically set for the test set


    Returns
    ---------
    df_train_fin : dataframe for the training set

    df_test_fin  : dataframe for the test set

    df_test_um   : df_test_fin pivoted into a utility matrix ready as input of the recsys,
                   with index=column[0] (user), columns=column[1] (item), values=column[2]
                   (rating). In our case index='userId', columns='movieId', values='rating'.

    indx_train   : index of df_train_fin (row labels of ``data`` that went to the train set)

    indx_test    : index of df_test_fin (row labels of ``data`` that went to the test set)

    Raises
    ------
    ValueError   : if ``data`` has no 'timestamp' column or ``train_ratio`` is outside [0, 1].
                   The pivot step is also expected to fail if the test set holds the same
                   (user, item) pair more than once.
    """

    # 1. validate the inputs
    col = 'timestamp'
    if col not in data.columns:
        raise ValueError('could not find %s in %s' % (col, list(data.columns)))
    if not 0 <= train_ratio <= 1:
        raise ValueError('train_ratio must be between 0 and 1, got %r' % (train_ratio,))

    # 2. split each user's ratings: earliest part -> train, latest part -> test
    user_col = data.columns[0]  # assuming column[0] is the userId
    train_parts = []
    test_parts = []

    for _, user_rows in data.groupby(user_col):
        if len(user_rows) < 2:
            # A single rating cannot be split; keep it for evaluation.
            test_parts.append(user_rows)
            continue

        # Stable sort so that equal timestamps are split reproducibly.
        ordered = user_rows.sort_values(col, kind='mergesort')
        cutoff = int(train_ratio * len(ordered))

        # Positional slicing stays correct even when the index has duplicate labels.
        train_parts.append(ordered.iloc[:cutoff])
        test_parts.append(ordered.iloc[cutoff:])

    # 3. merge the per-user pieces into single dataframes
    df_train_fin = _concat_parts(train_parts, data)
    df_test_fin = _concat_parts(test_parts, data)

    # 4. pivot the test set to create the utility matrix ready as input for recsys
    df_test_um = df_test_fin.pivot(
        index=df_test_fin.columns[0],
        columns=df_test_fin.columns[1],
        values=df_test_fin.columns[2],
    )

    # 5. get indices of train and test sets
    indx_train = df_train_fin.index
    indx_test = df_test_fin.index

    return df_train_fin, df_test_fin, df_test_um, indx_train, indx_test
# Run the chronological split with 70% of each user's ratings in the train set.
# `new_data` is the userId/movieId/rating/timestamp table built in the loading cell above.
(df_train, df_test, df_test_um, ind_train, ind_test) = split_train_test_chronological(
    new_data, train_ratio=0.70
)
df_train
userId movieId rating timestamp
73 1 1210 5.0 964980499
43 1 804 4.0 964980499
120 1 2018 5.0 964980523
171 1 2628 4.0 964980523
183 1 2826 4.0 964980523
... ... ... ... ...
100610 610 101112 3.0 1493847967
100022 610 7369 2.5 1493847980
100603 610 100163 3.0 1493847984
99808 610 4153 3.0 1493847986
100565 610 95558 3.5 1493847991

70312 rows × 4 columns

Unittest

import unittest
import pandas as pd

class Test_split(unittest.TestCase):
    """Unit tests for ``split_train_test_chronological`` on a small hand-made table."""

    def setUp(self):
        # Toy transaction table: users 1, 2, 3 and 5 have several ratings, user 6 has one.
        self.df = pd.DataFrame({'u': [1,1,2,2,3,3,3,5,5,6], 'i': [3,4,5,6,7,1,2,3,1,0], 'r':[5,6,7,8,9,3,2,1,0,9], 'timestamp':[0,1,3,9,3,2,1,4,6,5]})

    def test_shape(self):
        """Train and test stacked together have the same shape as the input."""
        df_train1, df_test1, _, _, _ = split_train_test_chronological(self.df, 0.70)
        cdf = pd.concat([df_train1, df_test1])

        self.assertEqual(self.df.shape, cdf.shape)

    def test_partition(self):
        """Every input row lands in exactly one of the two sets."""
        _, _, _, ind_train, ind_test = split_train_test_chronological(self.df, 0.70)

        self.assertFalse(set(ind_train) & set(ind_test))
        self.assertEqual(sorted(list(ind_train) + list(ind_test)), sorted(self.df.index))

    def test_chronological_order(self):
        """For each user, no train rating is more recent than any test rating."""
        df_train1, df_test1, _, _, _ = split_train_test_chronological(self.df, 0.70)
        latest_train = df_train1.groupby('u')['timestamp'].max()
        earliest_test = df_test1.groupby('u')['timestamp'].min()

        for user, newest in latest_train.items():
            self.assertLessEqual(newest, earliest_test[user])

    def test_missing_timestamp(self):
        """A table without a 'timestamp' column is rejected."""
        with self.assertRaises(ValueError):
            split_train_test_chronological(self.df.drop(columns='timestamp'), 0.70)
    

# Run the tests inside the notebook: the dummy argv keeps unittest from parsing the
# kernel's command-line arguments, and exit=False keeps the kernel alive afterwards.
unittest.main(
    argv=[''],
    exit=False,
    verbosity=2,
)
    
test_shape (__main__.Test_split) ... 
1
ok

----------------------------------------------------------------------
Ran 1 test in 0.011s

OK
<unittest.main.TestProgram at 0x7f9e61734220>