def create_user_feature(num_transactions=100_836):
    """
    Return the raw transaction list and a per-user genre-count matrix

    Reads the rating transactions from 'ratings.csv' and the movie
    descriptions from 'movies.csv' (both resolved relative to the current
    working directory; the files are expected to follow the Movielens 100k
    layout) and counts, for every user, how many rated movies fall into each
    genre.  A movie may carry several genres ('|'-separated in the 'genres'
    column of movies.csv); each of them is counted once per rating.

    Possible genres in the Movielens 100k dataset include the following:
    'IMAX', 'Adventure', 'Mystery', 'Animation', 'Documentary', 'Comedy',
    'Western', 'War', 'Film-Noir', 'Crime', 'Drama', 'Thriller', 'Fantasy',
    'Action', 'Sci-Fi', 'Children', 'Romance', 'Horror', 'Musical',
    '(no genres listed)'

    Input
    ---------
    num_transactions (int or None): number of rows to read from the top of
        ratings.csv (passed to pandas as ``nrows``); None reads the whole
        file.  Defaults to 100_836.

    Output
    ---------
    raw_transaction_list (pd.DataFrame): the rows read from ratings.csv,
        unmodified and with all of their columns
    user_feature (pd.DataFrame): one row per distinct userId (ascending),
        with columns 'userId', 'u_1', ..., 'u_n'
        - each 'u_k' column holds the number of ratings the user gave to
          movies of one genre (0 when the user rated none)
        - genre columns are numbered in the order the genres are first
          encountered while scanning the users in ascending userId order,
          so the genre behind 'u_k' depends on the data that was read

    Raises
    ---------
    KeyError: if ratings.csv lacks a 'userId'/'movieId' column, movies.csv
        lacks a 'movieId'/'genres' column, or a rated movieId is missing
        from movies.csv
    """
    from collections import Counter
    from itertools import chain

    import pandas as pd

    raw_transaction_list = pd.read_csv('ratings.csv', nrows=num_transactions)

    # Only the ids are needed for counting.  They are downcast to the
    # smallest fitting dtype to keep the working copy small; this also
    # determines the dtype of the 'userId' column that is returned.
    transactions = raw_transaction_list[['userId', 'movieId']].copy()
    for col in transactions:
        if transactions[col].dtype == 'int64':
            transactions[col] = pd.to_numeric(transactions[col],
                                              downcast='integer')
        elif transactions[col].dtype == 'float64':
            transactions[col] = pd.to_numeric(transactions[col],
                                              downcast='float')

    # movieId -> list of genre names
    genres_by_movie = (pd.read_csv('movies.csv')
                       .set_index('movieId')['genres']
                       .str.split('|'))

    # Label-based lookup of the genres of every rated movie; the result is
    # attached positionally, one genre list per transaction.
    transactions['genre'] = genres_by_movie.loc[transactions['movieId']].tolist()

    # Collect the genre lists per user first and count afterwards: returning
    # a dict-like Counter straight from the groupby would make pandas expand
    # it instead of storing one Counter per user.
    genre_count = (transactions.groupby('userId')['genre']
                   .apply(list)
                   .apply(lambda genre_lists:
                          Counter(chain.from_iterable(genre_lists))))

    # One row per user, one column per genre; a genre a user never rated is
    # missing from that user's Counter and therefore becomes 0.
    user_feature = pd.DataFrame(list(genre_count)).fillna(0)
    for col in user_feature:
        user_feature[col] = pd.to_numeric(user_feature[col],
                                          downcast='integer')

    # Replace the genre names by positional feature names and put the user
    # id in front.
    user_feature.columns = [f'u_{idx}'
                            for idx in range(1, user_feature.shape[1] + 1)]
    user_feature.insert(0, 'userId', genre_count.index)

    return raw_transaction_list, user_feature