Content Based Model¶
This notebook uses code from the cross_val, split_train_test, and evaluate pipeline.
import pandas as pd
import numpy as np
from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor
def load_data(aug_tt, item_tt, user_tt):
    """
    Load the augmented, item, and user transaction tables from parquet files.

    Parameters
    ----------
    aug_tt : str
        File name of the parquet file with each row corresponding
        to a user's features, an item's features, and the user's
        rating for that item.
    item_tt : str
        File name of the parquet file with each row corresponding
        to an item's features. Must contain a 'movieId' column.
    user_tt : str
        File name of the parquet file with each row corresponding
        to a user's features. Must contain a 'userId' column.

    Returns
    -------
    df : pandas DataFrame
        The augmented transaction table with rows containing NaN dropped.
    item_df : pandas DataFrame
        The item features as a transaction table ('movieId' removed).
    user_df : pandas DataFrame
        The user features as a transaction table ('userId' removed).
    item_ids : numpy array
        All unique item ids, taken from the item table.
    user_ids : numpy array
        All unique user ids, taken from the augmented table.
    """
    # Drop incomplete rows so downstream model fitting never sees NaNs.
    df = pd.read_parquet(aug_tt).dropna()
    item_df = pd.read_parquet(item_tt)
    item_ids = item_df['movieId'].unique()
    item_df = item_df.drop(columns=['movieId'])
    user_df = pd.read_parquet(user_tt).drop(columns=['userId'])
    # NOTE(review): user ids come from the augmented table, not the user
    # table — presumably every rated user appears in df; verify against data.
    user_ids = df['userId'].unique()
    return df, item_df, user_df, item_ids, user_ids
def fit_ml_cb(train_df, model, target_col='rating', drop_cols=None):
    """
    Fit a clone of the given regressor on the training transaction table.

    Parameters
    ----------
    train_df : pandas DataFrame
        The training set as a transaction table. Each row
        corresponds to a user's features and an item's features
        along with the user's rating for that item.
    model : an sklearn regressor object
        An object with a fit and predict method that outputs a
        float. The input model is cloned and never mutated.
    target_col : str
        The column corresponding to the rating.
    drop_cols : list, optional
        Columns to be dropped from train_df before fitting.
        Defaults to ['userId', 'movieId', 'timestamp'].

    Returns
    -------
    rs_model : an sklearn model object
        A fitted clone of the input model, used to predict the
        rating of a user for an item given the user's features
        and the item's features.
    """
    if drop_cols is None:
        # Mutable default avoided: build the list per call.
        drop_cols = ['userId', 'movieId', 'timestamp']
    rs_model = clone(model)
    # Drop rows whose target is NaN from BOTH target and features so they
    # stay aligned (the original dropped NaNs from the target only, which
    # could misalign the two arrays).
    train_df = train_df.dropna(subset=[target_col])
    target = train_df[target_col].values.ravel()
    features = train_df.drop(columns=[target_col] + drop_cols)
    # Bug fix: fit the clone, not the caller's model — previously the clone
    # was dead code and the caller's model object was mutated by fit().
    rs_model.fit(features, target)
    return rs_model
def reco_ml_cb(user_df, item_df, item_ids, model_fitted):
    """
    Complete the entire utility matrix using the fitted model.

    Parameters
    ----------
    user_df : pandas DataFrame
        One row of features per user, indexed by user id.
    item_df : pandas DataFrame
        One row of features per item, aligned row-for-row with
        item_ids (and indexed 0..len-1 so it joins the repeated
        user rows).
    item_ids : array-like
        The item ids corresponding to the rows of item_df.
    model_fitted : an sklearn regressor object
        A fitted model whose predict method accepts the
        concatenated user and item features.

    Returns
    -------
    full_matrix : pandas DataFrame
        The completed utility matrix — users as rows, items as
        columns, predicted ratings as values.
    """
    recos = {}
    n_users = len(user_df)
    # enumerate replaces the manual counter used only for progress output.
    for c, (u, u_feats) in enumerate(user_df.iterrows(), start=1):
        print(c, 'out of', n_users, end='\r')
        # Repeat this user's features once per item so they can be joined
        # column-wise with every item's feature row.
        u_block = pd.concat([pd.DataFrame(u_feats).T] *
                            len(item_ids)).reset_index(drop=True)
        a_feats = u_block.join(item_df)
        recos[u] = pd.Series(model_fitted.predict(a_feats), index=item_ids)
    full_matrix = pd.DataFrame.from_dict(recos, orient='index')
    return full_matrix
def reco_ml_cb_tt(df_test, model_fitted, target='rating', drop_cols=None):
    """
    Predict ratings for every row of the test transaction table.

    Parameters
    ----------
    df_test : pandas DataFrame
        The test set as a transaction table. Each row corresponds
        to a user's features and an item's features along with the
        user's rating for that item.
    model_fitted : an sklearn regressor object
        An object with a predict method that outputs a float.
        Must be fitted already.
    target : str
        The column corresponding to the rating; dropped before
        predicting.
    drop_cols : list, optional
        Columns to be dropped from df_test before predicting.
        Defaults to ['userId', 'movieId', 'timestamp'].

    Returns
    -------
    result : numpy array
        The model's predictions for every row of df_test.
    """
    if drop_cols is None:
        # Mutable default avoided: build the list per call.
        drop_cols = ['userId', 'movieId', 'timestamp']
    features = df_test.drop(columns=[target] + drop_cols)
    return model_fitted.predict(features)
def split_train_test(data, train_ratio=0.7, uid='userId', iid='movieId', rid='rating'):
    """
    Split a transaction table into per-user train and test sets.

    Each user's rows are split randomly: a fraction `train_ratio`
    goes to the training set and the remainder to the test set.
    Users with fewer than two rows go entirely to the test set.

    Parameters
    ----------
    data : pandas DataFrame
        Transaction table whose FIRST column holds the user id.
    train_ratio : float
        Fraction of each user's rows assigned to the training set.
    uid, iid, rid : str
        Column names used to pivot the test set into a utility
        matrix (index=uid, columns=iid, values=rid).

    Returns
    -------
    df_train_fin : pandas DataFrame
        Training rows pooled across all users.
    df_test_fin : pandas DataFrame
        Test rows pooled across all users.
    df_test_um : pandas DataFrame
        The test set pivoted into a recsys-ready utility matrix.
    indx_train, indx_test : pandas Index
        Original row indices of the train and test sets.
    """
    train_parts, test_parts = [], []
    # Group by the first column, assumed to be the user id.
    for _, user_rows in data.groupby(data.columns[0]):
        if len(user_rows) < 2:
            # Too few interactions to split — keep them all for testing.
            test_parts.append(user_rows)
            continue
        sampled = user_rows.sample(frac=train_ratio)
        train_parts.append(sampled)
        test_parts.append(user_rows.drop(sampled.index))
    df_train_fin = pd.concat(train_parts)
    df_test_fin = pd.concat(test_parts)
    # Pivot the test set into a utility matrix ready for the recsys.
    df_test_um = df_test_fin.pivot(index=uid, columns=iid, values=rid)
    return (df_train_fin, df_test_fin, df_test_um,
            df_train_fin.index, df_test_fin.index)
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
def evaluate(df_test_result, df_test_data):
    """
    Compute per-user MSE and MAE of a recommender's utility matrix
    against the pivoted test data.

    Parameters
    ----------
    df_test_result : pandas DataFrame
        Utility matrix produced by the recommender (users as rows).
    df_test_data : pandas DataFrame
        Pivoted test set (users as rows, items as columns).

    Returns
    -------
    mse_list : list
        Mean squared error for each user in the result matrix.
    mae_list : list
        Mean absolute error for each user in the result matrix.
        Both lists are empty (and 'error' is printed) when some
        test users are missing from the result matrix.
    """
    mse_list, mae_list = [], []
    # Every user in the test data must also appear in the result matrix.
    missing = df_test_data.index.difference(df_test_result.index)
    if len(missing) != 0:
        print('error')
        return mse_list, mae_list
    print('proceed')
    for user in df_test_result.index:
        pred_row = df_test_result[df_test_result.index == user].fillna(0)
        true_row = df_test_data[df_test_data.index == user].fillna(0)
        # Align predictions to the columns present in the test data.
        pred_row = pred_row[true_row.columns]
        mse_list.append(mean_squared_error(true_row, pred_row))
        mae_list.append(mean_absolute_error(true_row, pred_row))
    return mse_list, mae_list
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
def evaluate_arrays(model_result_arr, df_data, indx_test):
    """
    Compute MSE and MAE of predicted ratings against ground truth.

    Parameters
    ----------
    model_result_arr : array-like
        Predicted ratings for the test rows.
    df_data : pandas DataFrame
        The original transaction table before splitting. The ground
        truth is extracted from it using the test indices.
    indx_test : pandas Index or array-like
        Row indices of the test set produced by the splitter.

    Returns
    -------
    mse : float
        Mean squared error (sklearn).
    mae : float
        Mean absolute error (sklearn).

    Raises
    ------
    ValueError
        If the prediction and ground-truth arrays differ in length.
    """
    # NOTE(review): df_data.columns[2] assumes the rating is the third
    # column of the transaction table — confirm for new data layouts.
    test_arr = df_data.loc[pd.Index(indx_test), df_data.columns[2]].values
    result_len = len(model_result_arr)
    test_len = len(test_arr)
    # Lengths must match before element-wise scoring.
    if result_len != test_len:
        raise ValueError('the arrays are of different lengths %s vs %s'
                         % (result_len, test_len))
    print('proceed')
    mse = mean_squared_error(test_arr, model_result_arr)
    mae = mean_absolute_error(test_arr, model_result_arr)
    return mse, mae
def cross_val(df, k, model, split_method='random', train_sample=100):
    """
    Run repeated train/test splits and evaluate the model on each.

    Parameters
    ----------
    df : pandas DataFrame
        The data to be split, as a vanilla/transaction++ table
        (uid, iid, rating, timestamp).
    k : int
        Number of random splits to evaluate. Ignored for the
        chronological method, which runs a single split.
    model : an unfitted sklearn regressor
        Cloned before each fit, so the input model is not mutated.
    split_method : str
        'random' for per-user random splits, or 'chronological' for
        a single time-ordered split (requires a
        split_train_test_chronological function to be defined).
    train_sample : int
        Number of training rows sampled before fitting — the
        original pipeline trains on a small subsample for speed.

    Returns
    -------
    mse, mae : list
        Error metrics (sklearn) for each evaluated split.
    """
    def _run_once(split_fn):
        # One split -> fit -> predict -> score round.
        print('Starting splitting')
        df_train, df_test, _, _, indx_test = split_fn(df, 0.7)
        print('Finished splitting')
        model_clone = clone(model)
        print('Starting training')
        model_clone_fit = fit_ml_cb(df_train.sample(train_sample), model_clone)
        print('Finished training')
        print('Starting completing matrix')
        # Bug fix: predict with the freshly fitted clone. Previously this
        # used the notebook-global `model_fit`, silently discarding the
        # per-fold training above.
        result = reco_ml_cb_tt(df_test, model_clone_fit)
        print('Finished completing matrix')
        print('Starting computing MAE and MSE')
        mse_i, mae_i = evaluate_arrays(result, df, indx_test)
        print('Finished computing MAE and MSE')
        return mse_i, mae_i

    mse, mae = [], []
    if split_method == 'random':
        for i in range(k):
            print(i)
            mse_i, mae_i = _run_once(split_train_test)
            mse.append(mse_i)
            mae.append(mae_i)
    elif split_method == 'chronological':
        mse_i, mae_i = _run_once(split_train_test_chronological)
        mse.append(mse_i)
        mae.append(mae_i)
    return mse, mae
Model Pipeline¶
# --- Model pipeline: declare, load, split, fit, predict, cross-validate ---

# Declare your model.
rs_model1 = RandomForestRegressor(random_state=202109, n_jobs=-1)
# Load the data from the three parquet tables.
df, item_df, user_df, item_ids, user_ids = load_data('augmented_transaction_table.parquet',
'item_feature.parquet',
'user_feature.parquet')
# Do your per-user random train/test split.
df_train, df_test, df_test_um, indx_train, indx_test = split_train_test(df, 0.7) #To split the data
# Fit your model to (a 100-row sample of) the train data.
model_fit = fit_ml_cb(df_train.sample(100), rs_model1) #To fit the model
# Predict on the test data.
preds_array = reco_ml_cb_tt(df_test, model_fit) #To make predictions as an array
# NOTE(review): cross_val clones its model argument before fitting, so
# passing the already-fitted model_fit works, but passing the unfitted
# rs_model1 would express the intent more clearly — confirm.
mse, mae = cross_val(df, 5, model_fit, split_method='random')
0
Starting splitting
Finished splitting
Starting training
Finished training
Starting completing matrix
Finished completing matrix
Starting computing MAE and MSE
proceed
Finished computing MAE and MSE
1
Starting splitting
Finished splitting
Starting training
Finished training
Starting completing matrix
Finished completing matrix
Starting computing MAE and MSE
proceed
Finished computing MAE and MSE
2
Starting splitting
Finished splitting
Starting training
Finished training
Starting completing matrix
Finished completing matrix
Starting computing MAE and MSE
proceed
Finished computing MAE and MSE
3
Starting splitting
Finished splitting
Starting training
Finished training
Starting completing matrix
Finished completing matrix
Starting computing MAE and MSE
proceed
Finished computing MAE and MSE
4
Starting splitting
Finished splitting
Starting training
Finished training
Starting completing matrix
Finished completing matrix
Starting computing MAE and MSE
proceed
Finished computing MAE and MSE
evaluate_arrays(preds_array, df, indx_test) #MSE and MAE
proceed
(0.8669795716126364, 0.7143198938393699)
import unittest


class TestGetRec(unittest.TestCase):
    """Smoke tests for the content-based recommender pipeline."""

    # NOTE(review): imports inside a class body become class attributes —
    # they work, but are unconventional; consider moving to module level.
    import pandas as pd
    import numpy as np
    from sklearn.base import clone
    from sklearn.ensemble import RandomForestRegressor

    def test_matrix_shape(self):
        # The completed utility matrix must have one row per user and one
        # column per item. Depends on the parquet files and the notebook
        # global rs_model1 being available.
        df, item_df, user_df, item_ids, user_ids = load_data('augmented_transaction_table.parquet',
                                                             'item_feature.parquet',
                                                             'user_feature.parquet')
        df_train, df_test, df_test_um, indx_train, indx_test = split_train_test(df, 0.7) #To split the data
        model_fit = fit_ml_cb(df_train.sample(100), rs_model1)
        matrix_result = reco_ml_cb(user_df, item_df, item_ids, model_fit)
        self.assertEqual(matrix_result.shape[0], len(user_ids))
        self.assertEqual(matrix_result.shape[1], len(item_ids))

    def test_array_pred(self):
        # The prediction array must have one entry per test row.
        df, item_df, user_df, item_ids, user_ids = load_data('augmented_transaction_table.parquet',
                                                             'item_feature.parquet',
                                                             'user_feature.parquet')
        df_train, df_test, df_test_um, indx_train, indx_test = split_train_test(df, 0.7) #To split the data
        model_fit = fit_ml_cb(df_train.sample(100), rs_model1)
        array_result = reco_ml_cb_tt(df_test, model_fit)
        self.assertEqual(len(array_result), len(df_test))


# Run the suite inside the notebook (exit=False keeps the kernel alive).
unittest.main(argv=[''], verbosity=2, exit=False)
test_array_pred (__main__.TestGetRec) ...
| userId | movieId | rating | timestamp | u_1 | u_2 | u_3 | u_4 | u_5 | u_6 | ... | i_291 | i_292 | i_293 | i_294 | i_295 | i_296 | i_297 | i_298 | i_299 | i_300 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 1 | 3 | 4.0 | 964981247 | 85 | 29 | 42 | 83 | 47 | 26 | ... | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.000000 |
| 2 | 1 | 6 | 4.0 | 964982224 | 85 | 29 | 42 | 83 | 47 | 26 | ... | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.000000 |
| 5 | 1 | 70 | 3.0 | 964982400 | 85 | 29 | 42 | 83 | 47 | 26 | ... | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.000000 |
| 11 | 1 | 216 | 5.0 | 964981208 | 85 | 29 | 42 | 83 | 47 | 26 | ... | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.448884 | 0.0 | 0.000000 |
| 18 | 1 | 333 | 5.0 | 964981179 | 85 | 29 | 42 | 83 | 47 | 26 | ... | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 100824 | 610 | 161582 | 4.0 | 1493847759 | 267 | 66 | 56 | 411 | 151 | 119 | ... | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.000000 |
| 100826 | 610 | 162350 | 3.5 | 1493849971 | 267 | 66 | 56 | 411 | 151 | 119 | ... | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.000000 |
| 100829 | 610 | 164179 | 5.0 | 1493845631 | 267 | 66 | 56 | 411 | 151 | 119 | ... | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.296870 | 0.0 | 0.000000 | 0.0 | 0.000000 |
| 100834 | 610 | 168252 | 5.0 | 1493846352 | 267 | 66 | 56 | 411 | 151 | 119 | ... | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.291784 | 0.0 | 0.000000 | 0.0 | 0.282198 |
| 100835 | 610 | 170875 | 3.0 | 1493846415 | 267 | 66 | 56 | 411 | 151 | 119 | ... | 0.0 | 0.0 | 0.365896 | 0.0 | 0.0 | 0.313337 | 0.0 | 0.000000 | 0.0 | 0.000000 |
30101 rows × 324 columns
ok
test_matrix_shape (__main__.TestGetRec) ...
| userId | movieId | rating | timestamp | u_1 | u_2 | u_3 | u_4 | u_5 | u_6 | ... | i_291 | i_292 | i_293 | i_294 | i_295 | i_296 | i_297 | i_298 | i_299 | i_300 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1 | 4.0 | 964982703 | 85 | 29 | 42 | 83 | 47 | 26 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00000 | 0.0 | 0.0 | 0.0 | 0.0 |
| 4 | 1 | 50 | 5.0 | 964982931 | 85 | 29 | 42 | 83 | 47 | 26 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00000 | 0.0 | 0.0 | 0.0 | 0.0 |
| 5 | 1 | 70 | 3.0 | 964982400 | 85 | 29 | 42 | 83 | 47 | 26 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00000 | 0.0 | 0.0 | 0.0 | 0.0 |
| 6 | 1 | 101 | 5.0 | 964980868 | 85 | 29 | 42 | 83 | 47 | 26 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00000 | 0.0 | 0.0 | 0.0 | 0.0 |
| 8 | 1 | 151 | 5.0 | 964984041 | 85 | 29 | 42 | 83 | 47 | 26 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00000 | 0.0 | 0.0 | 0.0 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 100817 | 610 | 158956 | 3.0 | 1493848947 | 267 | 66 | 56 | 411 | 151 | 119 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00000 | 0.0 | 0.0 | 0.0 | 0.0 |
| 100819 | 610 | 160080 | 3.0 | 1493848031 | 267 | 66 | 56 | 411 | 151 | 119 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00000 | 0.0 | 0.0 | 0.0 | 0.0 |
| 100827 | 610 | 163937 | 3.5 | 1493848789 | 267 | 66 | 56 | 411 | 151 | 119 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00000 | 0.0 | 0.0 | 0.0 | 0.0 |
| 100828 | 610 | 163981 | 3.5 | 1493850155 | 267 | 66 | 56 | 411 | 151 | 119 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00000 | 0.0 | 0.0 | 0.0 | 0.0 |
| 100832 | 610 | 168248 | 5.0 | 1493850091 | 267 | 66 | 56 | 411 | 151 | 119 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.24502 | 0.0 | 0.0 | 0.0 | 0.0 |
30101 rows × 324 columns
130 out of 610