GitHub Repository: DataScienceUWL/DS775
Path: blob/main/Lessons/Lesson 13 - RecSys 1/Chapter_Notebooks/Content Based Recommenders.ipynb
Kernel: Python 3 (system-wide)

Plot Description Based Recommender

import pandas as pd
import numpy as np

# Read the CSV file into df
# Note: we have truncated the dataset to 5000 rows for illustration; the actual data has over 40000 rows.
# The full dataset is available on Kaggle here:
# https://www.kaggle.com/rounakbanik/the-movies-dataset/downloads/the-movies-dataset.zip/7
# The recommenders work better with more data, of course.

# Import data from the clean file (created in the Knowledge Recommender.ipynb file)
df = pd.read_csv('./data/metadata_clean.csv')

# Print the head of the cleaned DataFrame
df.head()
# Import the original file
orig_df = pd.read_csv('./data/movies_metadata.csv', low_memory=False)

# Add the useful features into the cleaned dataframe
df['overview'], df['id'] = orig_df['overview'], orig_df['id']
df.head()
# Import TfidfVectorizer from the scikit-learn library
from sklearn.feature_extraction.text import TfidfVectorizer

# Define a TF-IDF Vectorizer object. Remove all English stop words.
tfidf = TfidfVectorizer(stop_words='english')

# Replace NaN with an empty string
df['overview'] = df['overview'].fillna('')

# Construct the required TF-IDF matrix by applying the fit_transform method on the overview feature
tfidf_matrix = tfidf.fit_transform(df['overview'])

# Output the shape of tfidf_matrix
tfidf_matrix.shape
(5000, 22304)
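As a quick sanity check, you can peek at a few learned vocabulary terms and at the sparsity of a single overview vector. This is a minimal sketch reusing the tfidf and tfidf_matrix objects from the cell above; get_feature_names_out assumes scikit-learn 1.0 or newer (older versions expose get_feature_names instead).

# Peek at a handful of vocabulary terms learned from the overviews
# (assumes scikit-learn >= 1.0; older versions use tfidf.get_feature_names())
terms = tfidf.get_feature_names_out()
print(terms[5000:5010])

# Each row of tfidf_matrix is a sparse vector; count the non-zero weights for the first overview
print(tfidf_matrix[0].nnz, "non-zero TF-IDF weights for the first movie's overview")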
# Import linear_kernel to compute the dot product
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
# Construct a reverse mapping of movie titles to row indices, and drop duplicate titles, if any
indices = pd.Series(df.index, index=df['title']).drop_duplicates()
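To see how this reverse mapping is used, look up a title to get its row position. A minimal sketch; based on the outputs later in this notebook, Toy Story appears to be the first row of this dataset.

# Look up the row index for a title; for this dataset the result should be 0 (Toy Story is the first row)
indices['Toy Story']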
###############################
# This is Banik's approach. Don't use this approach. Use the approach from the lesson.
###############################

# Function that takes in a movie title as input and gives recommendations
def content_recommender(title, cosine_sim=cosine_sim, df=df, indices=indices):
    # Obtain the index of the movie that matches the title
    idx = indices[title]

    # Get the pairwise similarity scores of all movies with that movie
    # and convert them into a list of (index, score) tuples
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the cosine similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies. Ignore the first movie (the movie itself).
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return df['title'].iloc[movie_indices]
# Get recommendations for The Lion King
# Some of these will seem inappropriate with our smaller dataset; grab the full data from Kaggle for better results.
content_recommender('The Lion King')
3203         The Waiting Game
2779    Napoleon and Samantha
892          The Wizard of Oz
3293                 The Bear
1741           Prince Valiant
2094        Shadow of a Doubt
3695              Pot o' Gold
2960                    42 Up
2253                King Kong
1783         A Perfect Murder
Name: title, dtype: object
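For comparison only, here is one way to get essentially the same top-10 list using numpy's argsort instead of enumerate/sorted. This is a sketch, not the approach from the lesson referenced in the banner above.

# Sketch: same top-N lookup using np.argsort instead of enumerate/sorted.
# This is NOT the lesson's approach; it is shown only for comparison with Banik's version.
def content_recommender_np(title, cosine_sim=cosine_sim, df=df, indices=indices, n=10):
    idx = indices[title]
    scores = cosine_sim[idx]
    # Sort descending and skip position 0, which is the movie itself
    top_idx = np.argsort(scores)[::-1][1:n + 1]
    return df['title'].iloc[top_idx]

content_recommender_np('The Lion King')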

Metadata Based Recommender

# Load the keywords and credits files
cred_df = pd.read_csv('./data/credits.csv', low_memory=False, usecols=[0, 1, 2])
key_df = pd.read_csv('./data/keywords.csv')

# Print the head of the credits dataframe
cred_df.head()
# Print the head of the keywords dataframe
key_df.head()
# Convert the IDs of df into int (yes, this may throw an error; we'll do something different below)
df['id'] = df['id'].astype('int')
# Function to convert all non-integer IDs to NaN
def clean_ids(x):
    try:
        return int(x)
    except:
        return np.nan
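A couple of throwaway calls show what clean_ids does with good and bad id values. This is a minimal sketch; the date string below just mimics the kind of malformed id that shows up in the raw metadata file.

# A valid id string becomes an integer; a malformed id (e.g. a stray date string) becomes NaN
print(clean_ids('862'))          # 862
print(clean_ids('1997-08-20'))   # nan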
# Clean the ids of df
df['id'] = df['id'].apply(clean_ids)

# Filter out all rows that have a null ID
df = df[df['id'].notnull()]
# Convert IDs into integers
df['id'] = df['id'].astype('int')
key_df['id'] = key_df['id'].astype('int')
cred_df['id'] = cred_df['id'].astype('int')

# Merge keywords and credits into your main metadata dataframe
df = df.merge(cred_df, on='id')
df = df.merge(key_df, on='id')

# Display the head of df
df.head()
# Convert the stringified objects into native Python objects
from ast import literal_eval

features = ['cast', 'crew', 'keywords', 'genres']
for feature in features:
    df[feature] = df[feature].apply(literal_eval)
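If you're unsure what literal_eval is doing here: the merged columns arrive as strings that merely look like Python lists of dicts, and literal_eval parses them into real lists. A toy illustration with made-up ids follows.

# Toy example (made-up ids): literal_eval turns a stringified list of dicts
# into an actual Python list of dicts, which is what the cells below iterate over.
from ast import literal_eval
literal_eval("[{'id': 1, 'name': 'jealousy'}, {'id': 2, 'name': 'toy'}]")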
# Print the first crew member of the first movie in df
df.iloc[0]['crew'][0]
{'credit_id': '52fe4284c3a36847f8024f49',
 'department': 'Directing',
 'gender': 2,
 'id': 7879,
 'job': 'Director',
 'name': 'John Lasseter',
 'profile_path': '/7EdqiNbr4FRjIhKHyPPdFfEEEFG.jpg'}
# Extract the director's name. If the director is not listed, return NaN
def get_director(x):
    for crew_member in x:
        if crew_member['job'] == 'Director':
            return crew_member['name']
    return np.nan
# Define the new director feature
df['director'] = df['crew'].apply(get_director)

# Print the directors of the first five movies
df['director'].head()
0      John Lasseter
1       Joe Johnston
2      Howard Deutch
3    Forest Whitaker
4      Charles Shyer
Name: director, dtype: object
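A tiny hand-built crew list shows both branches of get_director. This is just a sketch; the names and the Editor job below are placeholders, although the 'job'/'name' keys match the real data.

# Sketch with a hand-built crew list: returns the director's name if present, NaN otherwise
toy_crew = [{'job': 'Editor', 'name': 'Some Editor'},
            {'job': 'Director', 'name': 'Some Director'}]
print(get_director(toy_crew))   # Some Director
print(get_director([]))         # nan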
# Return the top 3 elements of the list, or the entire list if it has fewer than 3 elements
def generate_list(x):
    if isinstance(x, list):
        names = [i['name'] for i in x]
        # Check if more than 3 elements exist. If yes, return only the first three. If not, return the entire list.
        if len(names) > 3:
            names = names[:3]
        return names

    # Return an empty list in case of missing/malformed data
    return []
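A quick check on a made-up list of name dicts confirms the truncation behavior before we apply it below. The names here are placeholders.

# Sketch: more than 3 entries are truncated to the first 3; non-list input yields []
toy_cast = [{'name': 'A'}, {'name': 'B'}, {'name': 'C'}, {'name': 'D'}]
print(generate_list(toy_cast))   # ['A', 'B', 'C']
print(generate_list(np.nan))     # []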
# Apply the generate_list function to cast and keywords
df['cast'] = df['cast'].apply(generate_list)
df['keywords'] = df['keywords'].apply(generate_list)
# Only consider a maximum of 3 genres
df['genres'] = df['genres'].apply(lambda x: x[:3])
# Print the new features of the first 5 movies along with the title
df[['title', 'cast', 'director', 'keywords', 'genres']].head()
# Function to sanitize data to prevent ambiguity. It removes spaces and converts to lowercase.
def sanitize(x):
    if isinstance(x, list):
        # Strip spaces and convert to lowercase
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        # Check if director exists. If not, return an empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''
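Sanitize is doing two jobs, one for the list features and one for the director string; a quick sketch shows both branches (the names are taken from the outputs elsewhere in this notebook and are only illustrative here).

# Sketch: list branch vs. string branch of sanitize
print(sanitize(['Tom Hanks', 'Tim Allen']))   # ['tomhanks', 'timallen']
print(sanitize('John Lasseter'))              # 'johnlasseter'
print(sanitize(np.nan))                       # ''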
# Apply the sanitize function to cast, keywords, director and genres
for feature in ['cast', 'director', 'genres', 'keywords']:
    df[feature] = df[feature].apply(sanitize)
# Function that creates a "soup" out of the desired metadata
def create_soup(x):
    return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['director'] + ' ' + ' '.join(x['genres'])
# Create the new soup feature
df['soup'] = df.apply(create_soup, axis=1)
# Display the soup of the first movie
df.iloc[0]['soup']
'jealousy toy boy tomhanks timallen donrickles johnlasseter animation comedy family'
# Import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Define a new CountVectorizer object and create vectors for the soup
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df['soup'])
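Since sanitize glued multi-word names into single tokens, you can verify that those tokens survived vectorization by checking the CountVectorizer vocabulary. A minimal sketch reusing the count object above; 'tomhanks' should be present given the first movie's soup shown earlier.

# The concatenated name tokens from the soup should show up as single vocabulary entries
print('tomhanks' in count.vocabulary_)
print(count_matrix.shape)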
# Import the cosine_similarity function
from sklearn.metrics.pairwise import cosine_similarity

# Compute the cosine similarity matrix. (For the L2-normalized TF-IDF vectors earlier, cosine
# similarity reduces to the dot product; count vectors are not normalized, so here we use
# cosine_similarity rather than linear_kernel.)
cosine_sim2 = cosine_similarity(count_matrix, count_matrix)
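To see why cosine_similarity rather than linear_kernel is used here, compare the two on a small block of the count matrix: raw dot products of unnormalized count vectors are not bounded the way the cosine scores are. A sketch; the slice size is arbitrary.

# Sketch: on unnormalized count vectors the raw dot product (linear_kernel) and the cosine
# similarity disagree; for the L2-normalized TF-IDF vectors earlier they coincide.
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
block = count_matrix[:5]
print(linear_kernel(block, block).round(2))
print(cosine_similarity(block, block).round(2))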
# Reset the index of df and construct the reverse mapping again
df = df.reset_index()
indices2 = pd.Series(df.index, index=df['title'])
content_recommender('The Lion King', cosine_sim2, df, indices2)
3315                Creature Comforts
3476                     Time Masters
3703    Thomas and the Magic Railroad
1004             So Dear to My Heart
2766                       Thumbelina
4914           The Flight of Dragons
1634                 Ill Gotten Gains
3466       Jails, Hospitals & Hip-Hop
651         James and the Giant Peach
770      The Hunchback of Notre Dame
Name: title, dtype: object