User Feature Table

def create_user_feature(num_transactions=100_836,
                        ratings_path='ratings.csv',
                        movies_path='movies.csv'):
    '''
    Build a per-user genre-count feature table from MovieLens ratings.

    Reads the first `num_transactions` ratings, looks up the genres of
    every rated movie and counts, for each user, how many rated movies
    fall into each genre (a movie with several genres counts once per
    genre).

    possible genres include the following:
    'IMAX', 'Adventure', 'Mystery', 'Animation', 'Documentary', 'Comedy',
       'Western', 'War', 'Film-Noir', 'Crime', 'Drama', 'Thriller', 'Fantasy',
       'Action', 'Sci-Fi', 'Children', 'Romance', 'Horror', 'Musical',
       '(no genres listed)'

    Input
    ---------
    num_transactions (int): number of rating rows to read from the
                            ratings file (default: 100_836)
    ratings_path (str): CSV file with at least the columns
                        'userId' and 'movieId' (default: 'ratings.csv')
    movies_path (str): CSV file with at least the columns 'movieId' and
                       'genres', genres being '|'-separated
                       (default: 'movies.csv')


    Output
    ---------
    raw_transaction_list (pd.DataFrame): the ratings exactly as read from
                                         `ratings_path` (unmodified)
    user_feature (pd.DataFrame): one row per user, sorted by userId
                                 - column 'userId' comes first
                                 - columns 'u_1' ... 'u_k' hold the genre
                                   counts, one column per genre that occurs
                                   in the transactions that were read
                                 - NOTE: the u_* columns are numbered in the
                                   order pandas first encounters each genre,
                                   so the genre -> column mapping (and k)
                                   depends on the data that was read


    Raises
    ---------
    KeyError: if a rated movieId is missing from the movies file
    '''
    import pandas as pd
    from collections import Counter

    raw_transaction_list = pd.read_csv(ratings_path, nrows=num_transactions)

    # Work on a copy so the frame handed back to the caller stays untouched;
    # only userId and movieId are needed for the genre counts.
    transactions = raw_transaction_list[['userId', 'movieId']].copy()

    # Downcasting userId keeps the dtype of the resulting 'userId' column
    # as compact as possible (it becomes the groupby index below).
    if transactions['userId'].dtype == 'int64':
        transactions['userId'] = pd.to_numeric(transactions['userId'],
                                               downcast='integer')

    # movieId -> list of genre names
    genres_by_movie = (pd.read_csv(movies_path)
                         .set_index('movieId')['genres']
                         .str.split('|'))

    # genre list of the movie behind every transaction/rating
    transactions['genre'] = list(genres_by_movie.loc[transactions['movieId']])

    # per user: flatten the genre lists of all rated movies and count them
    genre_count = (transactions.groupby('userId')['genre']
                     .apply(list)
                     .apply(lambda movies: Counter(genre
                                                   for genres in movies
                                                   for genre in genres)))

    # one column per genre; genres a user never rated show up as NaN -> 0
    user_feature = pd.DataFrame(list(genre_count)).fillna(0)
    for col in user_feature:
        user_feature[col] = pd.to_numeric(user_feature[col], downcast='integer')

    # anonymise the genre columns as u_1 ... u_k and put userId in front
    user_feature.columns = [f'u_{idx}'
                            for idx in range(1, user_feature.shape[1] + 1)]
    user_feature.insert(0, 'userId', genre_count.index)

    return raw_transaction_list, user_feature
    
    
# Build the feature table from the first 5000 ratings only, keeping the raw
# ratings frame alongside it, then preview the first 10 raw transactions.
vanilla_transaction_list, user_feature_table = create_user_feature(num_transactions = 5000)
vanilla_transaction_list.head(10)
userId movieId rating timestamp
0 1 1 4.0 964982703
1 1 3 4.0 964981247
2 1 6 4.0 964982224
3 1 47 5.0 964983815
4 1 50 5.0 964982931
5 1 70 3.0 964982400
6 1 101 5.0 964980868
7 1 110 4.0 964982176
8 1 151 5.0 964984041
9 1 157 5.0 964984100
# Preview the first rows of the per-user genre-count table.
user_feature_table.head()
userId u_1 u_2 u_3 u_4 u_5 u_6 u_7 u_8 u_9 ... u_11 u_12 u_13 u_14 u_15 u_16 u_17 u_18 u_19 u_20
0 1 85 29 42 83 47 26 90 45 55 ... 17 68 22 7 40 22 1 0 0 0
1 2 3 0 0 7 0 1 11 10 10 ... 1 17 1 1 4 0 0 4 3 0
2 3 11 4 5 9 4 5 14 2 7 ... 8 16 5 0 15 1 0 0 0 0
3 4 29 6 10 104 19 58 25 27 38 ... 4 120 7 10 12 16 4 1 2 0
4 5 8 6 9 15 7 11 9 12 9 ... 1 25 3 2 2 5 0 3 0 0

5 rows × 21 columns

# (rows, columns) of the feature table: one row per user, userId + u_* columns.
user_feature_table.shape
(32, 21)