Item Feature Table
def preprocess_string(text):
    '''Preprocess text for TF-IDF.

    Transforms the text into lowercase, removes punctuation symbols,
    removes English stopwords (NLTK), and stems each remaining word
    with SnowballStemmer (NLTK). Note: SnowballStemmer performs
    stemming, not lemmatization.

    Input
    --------
    text (string) : string from the MovieLens synopsis dataset

    Output
    --------
    new_text (string) : preprocessed text for further TF-IDF processing
                        (each kept token is prefixed by a single space,
                        matching the original concatenation behavior)
    '''
    import string
    from nltk.corpus import stopwords
    from nltk.stem.snowball import SnowballStemmer
    from nltk.tokenize import word_tokenize
    # Use a set for O(1) membership tests instead of scanning a list per word.
    stop_words = set(stopwords.words('english'))
    stemmer = SnowballStemmer(language='english')
    text = text.lower()
    text = ''.join(char for char in text if char not in string.punctuation)
    words = word_tokenize(text)
    # join() avoids quadratic string concatenation; the " " prefix per token
    # reproduces the original output exactly (leading space included).
    new_text = ''.join(
        ' ' + stemmer.stem(word)
        for word in words
        if word not in stop_words and len(word) > 2
    )
    return new_text
def create_item_feature(num_features=300,
                        ratings_path='ratings.csv',
                        links_path='links.csv',
                        metadata_path='movies_metadata.csv'):
    '''Return an item_feature matrix based on TF-IDF of movie synopses.

    Takes the list of movies rated in the MovieLens 100k dataset and
    fetches the respective synopses for TF-IDF computation.

    Input
    ---------
    num_features  : number of features for the TF-IDF extraction
                    (default 300, ~sqrt[100k rows])
    ratings_path  : path to the MovieLens ratings CSV (default 'ratings.csv')
    links_path    : path to the MovieLens/TMDB id-mapping CSV (default 'links.csv')
    metadata_path : path to the TMDB movie-metadata CSV (default 'movies_metadata.csv')

    Output
    ---------
    item_feature (pd.DataFrame): feature vector from TF-IDF extracted
        from movie synopses in the TheMovieDB dataset; first column is
        'movieId', remaining columns are named 'i_1' ... f'i_{num_features}'.
    '''
    import pandas as pd
    from sklearn.feature_extraction.text import TfidfVectorizer

    transaction_list = pd.read_csv(ratings_path, usecols=['movieId'])
    # filter the unique movie IDs (list, not set: column order is meaningful)
    seen_movies = pd.DataFrame(transaction_list['movieId'].unique(),
                               columns=['movieId'])
    # the synopsis is based on the "The Movie DB" Id system;
    # links.csv maps between MovieLens IDs and The MovieDB IDs
    movie_id_links = pd.read_csv(links_path, usecols=['movieId', 'tmdbId'])
    movie_id_links = movie_id_links.dropna()
    # get mapping between MovieLens IDs and TMDB IDs
    seen_movies = seen_movies.merge(movie_id_links, on='movieId', how='inner')

    # read metadata CSV file with movie plots/synopses
    metadata = pd.read_csv(metadata_path, usecols=['id', 'overview'])
    metadata = metadata.rename(columns={'id': 'tmdbId'})
    # drop movies with invalid tmdbId (e.g., date string instead of integer)
    bad_ids = pd.to_numeric(metadata['tmdbId'], errors='coerce').isna()
    metadata = metadata.drop(metadata[bad_ids].index)
    # drop movies with NaN synopsis
    metadata = metadata.dropna()
    # links.csv tmdbId is float (NaN-capable), so cast to float before merging
    metadata['tmdbId'] = metadata['tmdbId'].astype(float)
    metadata = metadata.drop_duplicates(subset=['tmdbId'])

    # keep only synopses for movies present in the transaction list
    synopsis_set = seen_movies.merge(metadata, on='tmdbId', how='inner')
    # preprocess synopsis strings
    synopsis_set['overview'] = synopsis_set['overview'].apply(preprocess_string)

    # TF-IDF processing (raw string avoids invalid-escape warning for \w)
    tfidfvectorizer = TfidfVectorizer(analyzer='word',
                                      token_pattern=r'[a-z]+\w*',
                                      stop_words='english',
                                      max_features=num_features)
    tfidf_vector = tfidfvectorizer.fit_transform(synopsis_set['overview'])
    tfidf_df = pd.DataFrame(tfidf_vector.toarray(),
                            index=synopsis_set['movieId'],
                            columns=tfidfvectorizer.get_feature_names_out())
    # min-max normalization per column (word)
    # NOTE(review): a constant column would yield NaN here (max == min);
    # unlikely for TF-IDF features selected by max_features, but unguarded.
    tfidf_df = tfidf_df.apply(lambda x: (x - x.min()) / (x.max() - x.min()))
    tfidf_df = tfidf_df.reset_index()

    # rename feature columns to i_1 ... i_N, keeping 'movieId' first
    old_cols = tfidf_df.columns
    new_cols = [old_cols[0]] + [f'i_{idx}' for idx in range(1, len(old_cols))]
    tfidf_df.rename(columns=dict(zip(old_cols, new_cols)), inplace=True)
    return tfidf_df
# Build the item-feature table (TF-IDF over movie synopses) and preview it.
item_feature_table = create_item_feature(num_features = 300)
item_feature_table.head()
| movieId | i_1 | i_2 | i_3 | i_4 | i_5 | i_6 | i_7 | i_8 | i_9 | ... | i_291 | i_292 | i_293 | i_294 | i_295 | i_296 | i_297 | i_298 | i_299 | i_300 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1 | 3 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2 | 6 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.513025 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 3 | 47 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 4 | 50 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 rows × 301 columns
# Expect (n_movies, num_features + 1): one 'movieId' column plus 300 features.
item_feature_table.shape
(9508, 301)