Evaluation¶
This notebook contains the 2nd and 3rd of three functions (the 1st is shown below in commented-out form):
split_train_test: creates train and test sets by splitting the raw data 'user_feature.csv'.
evaluate: calculates the mse and mae of the final recommendations against the actual ratings in the test set.
append_error_to_df: for visualization purposes and for further exploration of the errors.
Generating input data for unittesting purposes.¶
The commented cells are for the purpose of testing the function and unittest only.
# import pandas as pd
# data = pd.read_csv('user_feature.csv')
# features = ['userId', 'movieId', 'rating']
# # data
# new_data=data[features]
# new_data
Splitting¶
# import pandas as pd
# def split_train_test(data, train_ratio=0.7):
# """
# Splits the transaction data into train and test sets.
# Parameters
# ----------
# data : pandas DataFrame for transaction table containing user, item, and ratings
# train_ratio : the desired ratio of training set, while 1-train ratio is automatically set for the test set
# Returns
# ---------
# df_train_fin : dataframe for the training set
# df_test_fin : dataframe for the test set
# """
# list_df_train = []
# list_df_test = []
# #group by user id
# d = dict(tuple(data.groupby(data.userId)))
# #splitting randomly per user
# for i in (d):
# df_train = d[i].sample(frac=train_ratio)
# ind = df_train.index
# df_test = d[i].drop(ind)
# list_df_train.append(df_train)
# list_df_test.append(df_test)
# # 2. merge selected train set per user to a single dataframe
# df_train_fin = pd.concat(list_df_train)
# df_test_fin = pd.concat(list_df_test)
# return df_train_fin, df_test_fin
# df_train, df_test = split_train_test(new_data, 0.70)
# df_test
# df_test.pivot(index='userId', columns='movieId', values='rating')
Metrics for the output of recommender system¶
A sample test is created using a subset of the test set, while the synthetic result is created by introducing a few modifications to the test set.
# sample_test = df_test[(df_test.userId>= 2) & (df_test.userId<=4)].pivot(index='userId', columns='movieId', values='rating')
# sample_test
| movieId | 58 | 106 | 222 | 342 | 417 | 441 | 450 | 492 | 553 | 593 | ... | 5764 | 6874 | 8798 | 46970 | 58559 | 60756 | 70946 | 86345 | 106782 | 131724 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| userId | |||||||||||||||||||||
| 2 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | 4.0 | 3.5 | 4.0 | 4.5 | 5.0 | NaN | 4.0 | 5.0 | 5.0 |
| 3 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 4.5 | NaN | NaN | NaN | NaN | NaN | 5.0 | NaN | NaN | NaN |
| 4 | 3.0 | 4.0 | 1.0 | 5.0 | 2.0 | 1.0 | 2.0 | 5.0 | 2.0 | 5.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3 rows × 86 columns
# # random change in the data for measurement of accuracy
# synthetic_result=sample_test-0.5
# synthetic_result.iloc[0,1] = 5.0
# synthetic_result.iloc[0,5] = 2.0
# synthetic_result.iloc[2,0] = 3.0
# synthetic_result
| movieId | 58 | 106 | 222 | 342 | 417 | 441 | 450 | 492 | 553 | 593 | ... | 5764 | 6874 | 8798 | 46970 | 58559 | 60756 | 70946 | 86345 | 106782 | 131724 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| userId | |||||||||||||||||||||
| 2 | NaN | 5.0 | NaN | NaN | NaN | 2.0 | NaN | NaN | NaN | NaN | ... | NaN | 3.5 | 3.0 | 3.5 | 4.0 | 4.5 | NaN | 3.5 | 4.5 | 4.5 |
| 3 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 4.0 | NaN | NaN | NaN | NaN | NaN | 4.5 | NaN | NaN | NaN |
| 4 | 3.0 | 3.5 | 0.5 | 4.5 | 1.5 | 0.5 | 1.5 | 4.5 | 1.5 | 4.5 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3 rows × 86 columns
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
def evaluate(df_test_result, df_test_data):
    """
    Calculates the mse and mae per user of the results of the recommender
    system for a given test set.

    Parameters
    ----------
    df_test_result : pandas DataFrame
        Utility matrix (users x items) containing the result of the
        recommender system.
    df_test_data : pandas DataFrame
        Pivoted test data generated from splitting the transaction table
        and tested on the recommender system.

    Returns
    -------
    mse_list : list of mean squared error for each user
    mae_list : list of mean absolute error for each user

    Raises
    ------
    ValueError
        If any user in the test data is missing from the result matrix.
    """
    mse_list = []
    mae_list = []
    # Every userId in the test matrix must be present in the result matrix,
    # otherwise per-user errors cannot be computed.
    missing = df_test_data.index.difference(df_test_result.index)
    if len(missing) != 0:
        # The original code executed `print(error)` here with `error`
        # undefined, crashing with a NameError; raise an informative
        # exception instead.
        raise ValueError(f'users missing from result matrix: {list(missing)}')
    print('proceed')
    for user_id in df_test_result.index:
        # Single-row frames for this user; unrated items (NaN) count as 0.
        y_pred = df_test_result.loc[[user_id]].fillna(0)
        y = df_test_data.loc[[user_id]].fillna(0)
        # Align the prediction's column order with the test data.
        y_pred = y_pred[y.columns]
        diff = (y - y_pred).to_numpy()
        # Equivalent to sklearn's mean_squared_error / mean_absolute_error
        # for a single sample with uniform output averaging.
        mse_list.append(float((diff ** 2).mean()))
        mae_list.append(float(abs(diff).mean()))
    return mse_list, mae_list
# Compare the synthetic predictions with the sample test set.
# NOTE(review): sample_test is passed as df_test_result and
# synthetic_result as df_test_data — order looks swapped relative to the
# signature; mse/mae are symmetric so the values match either way here,
# but confirm the intended order.
mse, mae = evaluate(sample_test, synthetic_result)
print(mse)
print(mae)
proceed
[0.3633720930232558, 0.03488372093023256, 0.18604651162790697]
[0.13372093023255813, 0.06976744186046512, 0.37209302325581395]
def append_error_to_df(test_result, mse, mae):
    """
    Prepends the per-user error values as the first two columns of the
    prediction matrix for easy visualization and further computations.

    Parameters
    ----------
    test_result : utility matrix for the result of the recommender systems
        on the test set (modified in place)
    mse : mse computed from function evaluate
    mae : mae computed from function evaluate

    Returns
    -------
    test_result : modified utility matrix with errors
    """
    # Insert mse first, then mae, so the final column order is
    # mae_u, mse_u, <original columns>.
    for column, values in (('mse_u', mse), ('mae_u', mae)):
        test_result.insert(0, column, values)
    return test_result
# Attach the per-user errors to the synthetic result matrix.
# NOTE: append_error_to_df mutates synthetic_result in place; df_error
# and synthetic_result are the same object afterwards.
df_error = append_error_to_df(synthetic_result, mse, mae)
df_error
| movieId | mae_u | mse_u | 58 | 106 | 222 | 342 | 417 | 441 | 450 | 492 | ... | 5764 | 6874 | 8798 | 46970 | 58559 | 60756 | 70946 | 86345 | 106782 | 131724 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| userId | |||||||||||||||||||||
| 2 | 0.133721 | 0.363372 | NaN | 5.0 | NaN | NaN | NaN | 2.0 | NaN | NaN | ... | NaN | 3.5 | 3.0 | 3.5 | 4.0 | 4.5 | NaN | 3.5 | 4.5 | 4.5 |
| 3 | 0.069767 | 0.034884 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 4.0 | NaN | NaN | NaN | NaN | NaN | 4.5 | NaN | NaN | NaN |
| 4 | 0.372093 | 0.186047 | 3.0 | 3.5 | 0.5 | 4.5 | 1.5 | 0.5 | 1.5 | 4.5 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3 rows × 88 columns
Unittest¶
import unittest
import pandas as pd
from pandas._testing import assert_index_equal
from pandas._testing import assert_frame_equal
class Test_evaluate(unittest.TestCase):
    """Unit tests for the evaluate function."""

    @staticmethod
    def _make_pivot():
        """Build the small pivoted rating matrix shared by all tests."""
        df = pd.DataFrame({'u': [1, 1, 2, 2, 3, 3, 3, 5, 5, 6],
                           'i': [3, 4, 5, 6, 7, 1, 2, 3, 1, 0],
                           'r': [5, 6, 7, 8, 9, 3, 2, 1, 0, 9]})
        return df.pivot(index=df.columns[0], columns=df.columns[1],
                        values=df.columns[2])

    def test_index(self):
        # Indices (userIds) should all be represented in the result matrix.
        df_test1 = self._make_pivot()
        df_result = df_test1 - 0.5
        assert_index_equal(df_test1.index, df_result.index)

    def test_type_error(self):
        # evaluate returns one error value per user, and the values sum.
        df_test1 = self._make_pivot()
        df_result1 = df_test1 - 0.5
        mse, mae = evaluate(df_result1, df_test1)
        length = len(df_result1)
        self.assertEqual(len(mse), length)
        self.assertEqual(len(mae), length)
        self.assertIsNotNone(sum(mae))
        self.assertIsNotNone(sum(mse))

    def test_same_df_shape(self):
        # Shifting every rating by a constant must not change the shape.
        df_test1 = self._make_pivot()
        df_result2 = df_test1 - 0.5
        self.assertEqual(df_result2.shape, df_test1.shape)
# Run the suite from inside the notebook: argv=[''] keeps unittest from
# parsing notebook arguments, exit=False keeps it from killing the kernel.
unittest.main(argv=[''], verbosity=2, exit=False)
test_index (__main__.Test_evaluate) ... ok
test_same_df_shape (__main__.Test_evaluate) ... ok
test_type_error (__main__.Test_evaluate) ...
proceed
ok
----------------------------------------------------------------------
Ran 3 tests in 0.057s
OK
<unittest.main.TestProgram at 0x7f851a21bac0>