Item Feature Table

def preprocess_string(text):
    ''' Preprocess text for tf-idf

    Transforms the text into lowercase and removes symbols
    and punctuations.
    Removes stopwords using the NLTK library.
    Stems words using SnowballStemmer (NLTK library) — note this is
    stemming, not lemmatization.

    Input
    --------
    text (string) :  string from the Movielens synopsis dataset


    Output
    --------
    new_text (string)  : preprocessed text for further tf-idf processing
                         (retains a single leading space when non-empty,
                         matching the original concatenation behaviour)

    '''
    import string
    from nltk.corpus import stopwords
    from nltk.stem.snowball import SnowballStemmer
    from nltk.tokenize import word_tokenize

    # A set gives O(1) membership tests; the raw list would be O(n) per word.
    stop_words = set(stopwords.words('english'))
    stemmer = SnowballStemmer(language='english')

    text = text.lower()

    # Strip every punctuation character before tokenizing.
    text = ''.join(char for char in text if char not in string.punctuation)

    # Keep stems of tokens that are not stopwords and are > 2 chars long.
    stems = [stemmer.stem(word) for word in word_tokenize(text)
             if word not in stop_words and len(word) > 2]

    # join is O(n) overall, unlike repeated string concatenation;
    # prepend a space to reproduce the original output exactly.
    return ' ' + ' '.join(stems) if stems else ''
def create_item_feature(num_features = 300):
    '''
    Return item_feature matrix based on TF-IDF of Movie Synopsis

    Takes in the list of movies that has been rated in the MovieLens 100k
    dataset and fetches the respective synopsis for TF-IDF computation

    Reads 'ratings.csv', 'links.csv' and 'movies_metadata.csv' from the
    current working directory.

    Input
    ---------
    num_features : number of features to be used for the TF-IDF extraction
                 : default value 300 (~sqrt[100k rows])


    Output
    ---------
    item_feature (pd.DataFrame): feature_vector from TF-IDF extracted
                            from movie synopses the TheMovieDB dataset;
                            columns are 'movieId', 'i_1' ... f'i_{num_features}'

    '''

    import pandas as pd
    from sklearn.feature_extraction.text import TfidfVectorizer

    transaction_list = pd.read_csv('ratings.csv', usecols=['movieId'])

    # filter the unique movie IDs (columns must be a list, not a set)
    seen_movies = pd.DataFrame(transaction_list['movieId'].unique(),
                               columns=['movieId'])

    # the synopsis is based on the "The Movie DB" Id system
    # links.csv has a mapping between MovieLens ID and The MovieDB Id
    movie_id_links = pd.read_csv('links.csv', usecols=['movieId', 'tmdbId'])
    movie_id_links = movie_id_links.dropna()

    # get mapping between MovieLens IDs and TMDB IDs
    seen_movies = seen_movies.merge(movie_id_links, on='movieId', how='inner')

    # Read MetaData CSV file with movie plots/synopsis
    metadata = pd.read_csv('movies_metadata.csv', usecols=['id', 'overview'])
    metadata = metadata.rename(columns={'id': 'tmdbId'})

    # drop movies with invalid tmbdId (e.g., date string instead of integer)
    bad_ids = pd.to_numeric(metadata['tmdbId'], errors='coerce').isna()
    metadata = metadata.drop(metadata[bad_ids].index)

    # drop movies with NaN synopsis
    metadata = metadata.dropna()
    # cast to float so the dtype matches links.csv's tmdbId (NaN-capable float)
    metadata['tmdbId'] = metadata['tmdbId'].astype(float)
    metadata = metadata.drop_duplicates(subset=['tmdbId'])

    # get only synopsis for movies in the transaction list
    synopsis_set = seen_movies.merge(metadata, on='tmdbId', how='inner')

    # preprocess synopsis strings
    synopsis_set['overview'] = synopsis_set['overview'].apply(preprocess_string)

    # TF-IDF processing (raw string avoids the invalid '\w' escape warning)
    tfidfvectorizer = TfidfVectorizer(analyzer='word',
                                      token_pattern=r'[a-z]+\w*',
                                      stop_words='english',
                                      max_features=num_features)
    tfidf_vector = tfidfvectorizer.fit_transform(synopsis_set['overview'])
    tfidf_df = pd.DataFrame(tfidf_vector.toarray(),
                            index=synopsis_set['movieId'],
                            columns=tfidfvectorizer.get_feature_names_out())

    # normalization per column (word)
    # NOTE(review): a constant column (max == min) divides by zero and
    # yields NaN — confirm this cannot occur with real data
    tfidf_df = tfidf_df.apply(lambda x: (x - x.min()) / (x.max() - x.min()))
    tfidf_df = tfidf_df.reset_index()

    # rename feature columns to i_1 ... i_num_features, keep 'movieId' first
    old_cols = tfidf_df.columns
    new_cols = [old_cols[0]] + [f'i_{idx}' for idx in range(1, len(old_cols))]
    tfidf_df.columns = new_cols

    return tfidf_df
    
    
# Build the item-feature table (300 TF-IDF features) and preview it
item_feature_table = create_item_feature(num_features = 300)
item_feature_table.head()
movieId i_1 i_2 i_3 i_4 i_5 i_6 i_7 i_8 i_9 ... i_291 i_292 i_293 i_294 i_295 i_296 i_297 i_298 i_299 i_300
0 1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 6 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.513025 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 47 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 50 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 301 columns

item_feature_table.shape
(9508, 301)