GitHub Repository: DataScienceUWL/DS775
Path: blob/main/Lessons/Lesson 13 - RecSys 1/Chapter_Notebooks/Content Based Recommenders.ipynb
Kernel: Python 3 (system-wide)

Plot Description Based Recommender

import pandas as pd
import numpy as np

# Read the CSV file into df
# Note: we have truncated the dataset to 5000 rows for illustration; the actual data has over 40000 rows.
# The full dataset is available on Kaggle here:
# https://www.kaggle.com/rounakbanik/the-movies-dataset/downloads/the-movies-dataset.zip/7
# The recommenders work better with more data, of course.

# Import data from the clean file (created in the Knowledge Recommender.ipynb file)
df = pd.read_csv('./data/metadata_clean.csv')

# Print the head of the cleaned DataFrame
df.head()
# Import the original file
orig_df = pd.read_csv('./data/movies_metadata.csv', low_memory=False)

# Add the useful features into the cleaned dataframe
df['overview'], df['id'] = orig_df['overview'], orig_df['id']
df.head()
# Import TfidfVectorizer from the scikit-learn library
from sklearn.feature_extraction.text import TfidfVectorizer

# Define a TF-IDF Vectorizer object. Remove all English stop words.
tfidf = TfidfVectorizer(stop_words='english')

# Replace NaN with an empty string
df['overview'] = df['overview'].fillna('')

# Construct the required TF-IDF matrix by applying the fit_transform method on the overview feature
tfidf_matrix = tfidf.fit_transform(df['overview'])

# Output the shape of tfidf_matrix
tfidf_matrix.shape
(5000, 22304)
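As a quick sanity check, you can peek at a few learned vocabulary terms and at the sparsity of a single overview vector. This is a minimal sketch reusing the tfidf and tfidf_matrix objects from the cell above; get_feature_names_out assumes scikit-learn 1.0 or newer (older versions expose get_feature_names instead).

# Peek at a handful of vocabulary terms learned from the overviews
# (assumes scikit-learn >= 1.0; older versions use tfidf.get_feature_names())
terms = tfidf.get_feature_names_out()
print(terms[5000:5010])

# Each row of tfidf_matrix is a sparse vector; count the non-zero weights for the first overview
print(tfidf_matrix[0].nnz, "non-zero TF-IDF weights for the first movie's overview")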
# Import linear_kernel to compute the dot product
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
# Construct a reverse mapping of movie titles to row indices, and drop duplicate titles, if any
indices = pd.Series(df.index, index=df['title']).drop_duplicates()
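To see how this reverse mapping is used, look up a title to get its row position. A minimal sketch; based on the outputs later in this notebook, Toy Story appears to be the first row of this dataset.

# Look up the row index for a title; for this dataset the result should be 0 (Toy Story is the first row)
indices['Toy Story']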
###############################
# This is Banik's approach. Don't use this approach. Use the approach from the lesson.
###############################

# Function that takes in a movie title as input and gives recommendations
def content_recommender(title, cosine_sim=cosine_sim, df=df, indices=indices):
    # Obtain the index of the movie that matches the title
    idx = indices[title]

    # Get the pairwise similarity scores of all movies with that movie
    # and convert them into a list of (index, score) tuples
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the cosine similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies. Ignore the first movie (the movie itself).
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return df['title'].iloc[movie_indices]
# Get recommendations for The Lion King
# Some of these will seem inappropriate with our smaller dataset; grab the full data from Kaggle for better results.
content_recommender('The Lion King')
3203         The Waiting Game
2779    Napoleon and Samantha
892          The Wizard of Oz
3293                 The Bear
1741           Prince Valiant
2094        Shadow of a Doubt
3695              Pot o' Gold
2960                    42 Up
2253                King Kong
1783         A Perfect Murder
Name: title, dtype: object
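For comparison only, here is one way to get essentially the same top-10 list using numpy's argsort instead of enumerate/sorted. This is a sketch, not the approach from the lesson referenced in the banner above.

# Sketch: same top-N lookup using np.argsort instead of enumerate/sorted.
# This is NOT the lesson's approach; it is shown only for comparison with Banik's version.
def content_recommender_np(title, cosine_sim=cosine_sim, df=df, indices=indices, n=10):
    idx = indices[title]
    scores = cosine_sim[idx]
    # Sort descending and skip position 0, which is the movie itself
    top_idx = np.argsort(scores)[::-1][1:n + 1]
    return df['title'].iloc[top_idx]

content_recommender_np('The Lion King')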

Metadata Based Recommender

# Load the keywords and credits files
cred_df = pd.read_csv('./data/credits.csv', low_memory=False, usecols=[0, 1, 2])
key_df = pd.read_csv('./data/keywords.csv')

# Print the head of the credits dataframe
cred_df.head()
# Print the head of the keywords dataframe
key_df.head()
# Convert the IDs of df into int (yes, this may throw an error; we'll do something different below)
df['id'] = df['id'].astype('int')
# Function to convert all non-integer IDs to NaN
def clean_ids(x):
    try:
        return int(x)
    except:
        return np.nan
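A couple of throwaway calls show what clean_ids does with good and bad id values. This is a minimal sketch; the date string below just mimics the kind of malformed id that shows up in the raw metadata file.

# A valid id string becomes an integer; a malformed id (e.g. a stray date string) becomes NaN
print(clean_ids('862'))          # 862
print(clean_ids('1997-08-20'))   # nan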
# Clean the ids of df
df['id'] = df['id'].apply(clean_ids)

# Filter out all rows that have a null ID
df = df[df['id'].notnull()]
# Convert IDs into integers
df['id'] = df['id'].astype('int')
key_df['id'] = key_df['id'].astype('int')
cred_df['id'] = cred_df['id'].astype('int')

# Merge keywords and credits into your main metadata dataframe
df = df.merge(cred_df, on='id')
df = df.merge(key_df, on='id')

# Display the head of df
df.head()
# Convert the stringified objects into native Python objects
from ast import literal_eval

features = ['cast', 'crew', 'keywords', 'genres']
for feature in features:
    df[feature] = df[feature].apply(literal_eval)
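If you're unsure what literal_eval is doing here: the merged columns arrive as strings that merely look like Python lists of dicts, and literal_eval parses them into real lists. A toy illustration with made-up ids follows.

# Toy example (made-up ids): literal_eval turns a stringified list of dicts
# into an actual Python list of dicts, which is what the cells below iterate over.
from ast import literal_eval
literal_eval("[{'id': 1, 'name': 'jealousy'}, {'id': 2, 'name': 'toy'}]")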
# Print the first crew member of the first movie in df
df.iloc[0]['crew'][0]
{'credit_id': '52fe4284c3a36847f8024f49',
 'department': 'Directing',
 'gender': 2,
 'id': 7879,
 'job': 'Director',
 'name': 'John Lasseter',
 'profile_path': '/7EdqiNbr4FRjIhKHyPPdFfEEEFG.jpg'}
# Extract the director's name. If the director is not listed, return NaN
def get_director(x):
    for crew_member in x:
        if crew_member['job'] == 'Director':
            return crew_member['name']
    return np.nan
# Define the new director feature
df['director'] = df['crew'].apply(get_director)

# Print the directors of the first five movies
df['director'].head()
0      John Lasseter
1       Joe Johnston
2      Howard Deutch
3    Forest Whitaker
4      Charles Shyer
Name: director, dtype: object
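A tiny hand-built crew list shows both branches of get_director. This is just a sketch; the names and the Editor job below are placeholders, although the 'job'/'name' keys match the real data.

# Sketch with a hand-built crew list: returns the director's name if present, NaN otherwise
toy_crew = [{'job': 'Editor', 'name': 'Some Editor'},
            {'job': 'Director', 'name': 'Some Director'}]
print(get_director(toy_crew))   # Some Director
print(get_director([]))         # nan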
# Return the top 3 elements of the list, or the entire list if it has fewer than 3 elements
def generate_list(x):
    if isinstance(x, list):
        names = [i['name'] for i in x]
        # Check if more than 3 elements exist. If yes, return only the first three. If not, return the entire list.
        if len(names) > 3:
            names = names[:3]
        return names

    # Return an empty list in case of missing/malformed data
    return []
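A quick check on a made-up list of name dicts confirms the truncation behavior before we apply it below. The names here are placeholders.

# Sketch: more than 3 entries are truncated to the first 3; non-list input yields []
toy_cast = [{'name': 'A'}, {'name': 'B'}, {'name': 'C'}, {'name': 'D'}]
print(generate_list(toy_cast))   # ['A', 'B', 'C']
print(generate_list(np.nan))     # []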
# Apply the generate_list function to cast and keywords
df['cast'] = df['cast'].apply(generate_list)
df['keywords'] = df['keywords'].apply(generate_list)
# Only consider a maximum of 3 genres
df['genres'] = df['genres'].apply(lambda x: x[:3])
# Print the new features of the first 5 movies along with the title
df[['title', 'cast', 'director', 'keywords', 'genres']].head()
# Function to sanitize data to prevent ambiguity. It removes spaces and converts to lowercase.
def sanitize(x):
    if isinstance(x, list):
        # Strip spaces and convert to lowercase
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        # Check if director exists. If not, return an empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''
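Sanitize is doing two jobs, one for the list features and one for the director string; a quick sketch shows both branches (the names are taken from the outputs elsewhere in this notebook and are only illustrative here).

# Sketch: list branch vs. string branch of sanitize
print(sanitize(['Tom Hanks', 'Tim Allen']))   # ['tomhanks', 'timallen']
print(sanitize('John Lasseter'))              # 'johnlasseter'
print(sanitize(np.nan))                       # ''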
# Apply the sanitize function to cast, keywords, director and genres
for feature in ['cast', 'director', 'genres', 'keywords']:
    df[feature] = df[feature].apply(sanitize)
# Function that creates a "soup" out of the desired metadata
def create_soup(x):
    return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['director'] + ' ' + ' '.join(x['genres'])
# Create the new soup feature
df['soup'] = df.apply(create_soup, axis=1)
# Display the soup of the first movie
df.iloc[0]['soup']
'jealousy toy boy tomhanks timallen donrickles johnlasseter animation comedy family'
# Import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Define a new CountVectorizer object and create vectors for the soup
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df['soup'])
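Since sanitize glued multi-word names into single tokens, you can verify that those tokens survived vectorization by checking the CountVectorizer vocabulary. A minimal sketch reusing the count object above; 'tomhanks' should be present given the first movie's soup shown earlier.

# The concatenated name tokens from the soup should show up as single vocabulary entries
print('tomhanks' in count.vocabulary_)
print(count_matrix.shape)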
# Import the cosine_similarity function
from sklearn.metrics.pairwise import cosine_similarity

# Compute the cosine similarity matrix. (For the L2-normalized TF-IDF vectors earlier, cosine
# similarity reduces to the dot product; count vectors are not normalized, so here we use
# cosine_similarity rather than linear_kernel.)
cosine_sim2 = cosine_similarity(count_matrix, count_matrix)
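To see why cosine_similarity rather than linear_kernel is used here, compare the two on a small block of the count matrix: raw dot products of unnormalized count vectors are not bounded the way the cosine scores are. A sketch; the slice size is arbitrary.

# Sketch: on unnormalized count vectors the raw dot product (linear_kernel) and the cosine
# similarity disagree; for the L2-normalized TF-IDF vectors earlier they coincide.
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
block = count_matrix[:5]
print(linear_kernel(block, block).round(2))
print(cosine_similarity(block, block).round(2))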
# Reset the index of df and construct the reverse mapping again
df = df.reset_index()
indices2 = pd.Series(df.index, index=df['title'])
content_recommender('The Lion King', cosine_sim2, df, indices2)
3315                Creature Comforts
3476                     Time Masters
3703    Thomas and the Magic Railroad
1004             So Dear to My Heart
2766                       Thumbelina
4914           The Flight of Dragons
1634                 Ill Gotten Gains
3466       Jails, Hospitals & Hip-Hop
651         James and the Giant Peach
770      The Hunchback of Notre Dame
Name: title, dtype: object