Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
DataScienceUWL
GitHub Repository: DataScienceUWL/DS775
Path: blob/main/Lessons/Lesson 14 - RecSys 2/Self_Assess_Solns_14.ipynb
871 views
Kernel: Python 3 (system-wide)
# EXECUTE FIRST

# computational imports
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from surprise import Reader, Dataset, KNNBasic, NormalPredictor,BaselineOnly,KNNWithMeans,KNNBaseline
from surprise import SVD, SVDpp, NMF, SlopeOne, CoClustering
from surprise.model_selection import cross_validate
from surprise.model_selection import GridSearchCV
from surprise import accuracy
import random
from ast import literal_eval
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# plotting imports
# `import matplotlib.pyplot as plt` only binds `plt`; the bare package name
# must be imported explicitly for the matplotlib.style.use() call below.
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")
matplotlib.style.use('ggplot')

# for reading files from urls
import urllib.request

# display imports
from IPython.display import display, IFrame
from IPython.core.display import HTML

Lesson 14 - Self-Assessment Solutions

Self-Assessment: Setting up the File

# Read the Book-Crossing ratings sample into a dataframe and preview it.
import pandas as pd
import numpy as np

bx = pd.read_csv('./data/BX-Book-Ratings-3000.csv')
bx.head()

# Report the average rating over every row, formatted to two decimals.
mean_rating = bx['Book-Rating'].mean()
print("Mean book rating: ", '%.2f' % mean_rating)
Mean book rating: 2.63
# Hold out 30% of the ratings for testing. Stratifying on User-ID keeps each
# user's ratings spread across both the training and the test portion.
from sklearn.model_selection import train_test_split

X = bx.copy()       # features: the full ratings dataframe
y = bx['User-ID']   # stratification labels: the user behind each rating

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    stratify=y,
    random_state=42,
)

Self-Assessment: Baseline RMSE to Assess Model Performance

# Verify the median of the integer rating scale spanned by the data
# (every integer from the smallest to the largest observed rating).
lowest_rating = np.min(bx['Book-Rating'])
highest_rating = np.max(bx['Book-Rating'])
print(f"The median of this rating range is {np.median(np.arange(lowest_rating, (highest_rating + 1)))}")


def baseline(user_id, item_id, scale_median, *args):
    """Baseline model: always predict the median of the rating scale.

    ``user_id``, ``item_id`` and any extra positional arguments are accepted
    only so the function has the same call shape as the other models passed
    to ``score``; they are ignored.
    """
    return scale_median
The median of this rating range is 6.0
def score(cf_model, X_test, *args):
    """Return the RMSE a model achieves on a held-out ratings dataframe.

    Parameters
    ----------
    cf_model : callable
        Invoked as ``cf_model(user, item, *args)`` and expected to return a
        predicted rating.
    X_test : pandas.DataFrame
        Columns are read by position: first = user id, second = item id,
        third = the actual rating.
    *args
        Forwarded unchanged to every ``cf_model`` call.

    Returns
    -------
    The value of ``root_mean_squared_error(actual, predicted)``.
    """
    users = X_test[X_test.columns[0]]
    items = X_test[X_test.columns[1]]

    # One prediction per (user, item) row of the test set
    predictions = []
    for user, item in zip(users, items):
        predictions.append(cf_model(user, item, *args))
    y_pred = np.array(predictions)

    # Ratings the users actually gave
    y_true = np.array(X_test[X_test.columns[2]])

    return root_mean_squared_error(y_true, y_pred)


# RMSE of the constant baseline (always predicts the scale median, 6)
score(baseline, X_test, 6)
4.703780985075257

Self-Assessment: Weighted Mean User-Based Filter

# Ratings matrix: one row per user, one column per book (ISBN); a cell is NaN
# when that user has no training rating for that book.
# pivot() is used instead of pivot_table(): it does not aggregate, so a
# repeated (User-ID, ISBN) pair would raise rather than be averaged.
r_matrix = X_train.pivot(index='User-ID', columns='ISBN', values='Book-Rating')
r_matrix.head()

# Zero-filled version of the matrix, needed because cosine similarity cannot
# be computed on NaN entries.
r_matrix_dummy = r_matrix.fillna(0)
r_matrix_dummy.head()

# User-to-user cosine similarity, labelled on both axes with the user ids so
# it can be looked up by User-ID.
cosine_sim = pd.DataFrame(
    cosine_similarity(r_matrix_dummy, r_matrix_dummy),
    index=r_matrix.index,
    columns=r_matrix.index,
)
cosine_sim.head()
def cf_wmean(user_id, item_id, ratings_matrix, c_sim_matrix, median_rating):
    """User-based collaborative filter using a similarity-weighted mean.

    Predicts ``user_id``'s rating of ``item_id`` as the mean of the ratings
    other users gave that item, weighted by each rater's similarity to
    ``user_id``.

    Parameters
    ----------
    user_id
        Label of the user to predict for (a column of ``c_sim_matrix``).
    item_id
        Label of the item to predict (a column of ``ratings_matrix``).
    ratings_matrix : pandas.DataFrame
        One row per user, one column per item, NaN where a user has not
        rated an item. Its row labels must match those of ``c_sim_matrix``.
    c_sim_matrix : pandas.DataFrame
        Square user-by-user similarity matrix.
    median_rating
        Fallback prediction returned whenever no usable information exists.

    Returns
    -------
    The weighted-mean rating, or ``median_rating`` when the item is unknown,
    the user is unknown, or the similarities to all raters sum to zero or
    less.
    """
    # Cold start: an item nobody rated in training, or a user without a
    # similarity column, cannot be predicted (the latter used to raise
    # KeyError).
    if item_id not in ratings_matrix or user_id not in c_sim_matrix:
        return median_rating

    # Ratings of this item by every user, and the users who did not rate it
    item_column = ratings_matrix[item_id]
    non_raters = item_column[item_column.isnull()].index
    i_ratings = item_column.dropna()

    # Keep only the similarity scores of users who actually rated the item
    sim_scores = c_sim_matrix[user_id].drop(non_raters)

    total_similarity = sim_scores.sum()
    if total_similarity > 0:
        return np.dot(sim_scores, i_ratings) / total_similarity

    # The user has zero similarity with everyone who rated the item
    return median_rating
# RMSE of the user-based weighted-mean filter on the held-out ratings,
# using the scale median (6) as the fallback prediction.
score(
    cf_wmean,
    X_test,
    r_matrix,
    cosine_sim,
    6,
)
3.607093266358255

The RMSE with the user-based collaborative filter is 3.61 compared to 4.70 for the baseline model, so its predicted ratings are more accurate.

Self-Assessment: Weighted Mean Item-Based Filter - Solution

# Item-oriented ratings matrix: one row per book (ISBN), one column per user;
# a cell is NaN when that user has no training rating for that book.
# pivot() is used instead of pivot_table(): it does not aggregate, so a
# repeated (ISBN, User-ID) pair would raise rather than be averaged.
r_matrix_item = X_train.pivot(index='ISBN', columns='User-ID', values='Book-Rating')
r_matrix_item.head()

# Zero-filled version of the matrix, needed because cosine similarity cannot
# be computed on NaN entries.
r_matrix_item_dummy = r_matrix_item.fillna(0)

from sklearn.metrics.pairwise import cosine_similarity

# Book-to-book cosine similarity, labelled on both axes with the ISBNs so it
# can be looked up by ISBN.
cosine_sim_item = pd.DataFrame(
    cosine_similarity(r_matrix_item_dummy, r_matrix_item_dummy),
    index=r_matrix_item.index,
    columns=r_matrix_item.index,
)
cosine_sim_item.head(10)
def cf_item_wmean(user_id, item_id, ratings_matrix, c_sim_matrix, median_rating):
    """Item-based collaborative filter using a similarity-weighted mean.

    Predicts ``user_id``'s rating of ``item_id`` as the mean of the ratings
    that user gave to other items, weighted by each rated item's similarity
    to ``item_id``.

    Parameters
    ----------
    user_id
        Label of the user to predict for (a column of ``ratings_matrix``).
    item_id
        Label of the item to predict (a column of ``c_sim_matrix``).
    ratings_matrix : pandas.DataFrame
        One row per item, one column per user, NaN where the user has not
        rated the item. Its row labels must match those of ``c_sim_matrix``.
    c_sim_matrix : pandas.DataFrame
        Square item-by-item similarity matrix.
    median_rating
        Fallback prediction returned whenever no usable information exists.

    Returns
    -------
    The weighted-mean rating, or ``median_rating`` when the user is unknown,
    the item is unknown, or the similarities to all of the user's rated
    items sum to zero or less.
    """
    # Cold start: a user with no training ratings, or an item without a
    # similarity column (e.g. a book seen only in the test split), cannot be
    # predicted (the latter used to raise KeyError).
    if user_id not in ratings_matrix or item_id not in c_sim_matrix:
        return median_rating

    # This user's rating of every item, and the items they did not rate
    user_column = ratings_matrix[user_id]
    unrated_items = user_column[user_column.isnull()].index
    u_ratings = user_column.dropna()

    # Keep only the similarity scores of items the user actually rated
    sim_scores = c_sim_matrix[item_id].drop(unrated_items)

    total_similarity = sim_scores.sum()
    if total_similarity > 0:
        return np.dot(sim_scores, u_ratings) / total_similarity

    # The item has zero similarity with everything the user rated
    return median_rating
# RMSE of the item-based weighted-mean filter on the held-out ratings,
# using the scale median (6) as the fallback prediction.
score(
    cf_item_wmean,
    X_test,
    r_matrix_item,
    cosine_sim_item,
    6,
)
3.4119539180908327

The weighted-mean item-based collaborative filter is the best so far at RMSE = 3.41. The weighted-mean user-based collaborative filter had RMSE = 3.61 and the baseline model had RMSE = 4.70.

Self-Assessment: kNN-Based Collaborative Filter - Solution

bx.head()

# The Reader tells Surprise how to interpret the ratings dataframe; here the
# rating scale is declared as 1 through 11.
reader = Reader(rating_scale=(1,11))

# Wrap the ratings dataframe as a Surprise dataset
data = Dataset.load_from_df(bx, reader)

# Seed both random number generators before building and evaluating the model
random.seed(1)
np.random.seed(1)

# kNN collaborative filter with 5 neighbours
knn = KNNBasic(k=5, verbose=False)

# 5-fold cross-validation, scored by RMSE
from surprise.model_selection import cross_validate
knn_cv = cross_validate(
    knn,
    data,
    measures=['RMSE'],
    cv=5,
    verbose=True,
)

# Average the per-fold test RMSE values into one summary figure
knn_RMSE = np.mean(knn_cv['test_rmse'])
print(f'\nThe RMSE across five folds was {knn_RMSE}')
Evaluating RMSE of algorithm KNNBasic on 5 split(s). Fold 1 Fold 2 Fold 3 Fold 4 Fold 5 Mean Std RMSE (testset) 3.6956 3.7194 3.8178 3.7849 3.4263 3.6888 0.1384 Fit time 0.00 0.00 0.00 0.00 0.00 0.00 0.00 Test time 0.01 0.01 0.01 0.01 0.01 0.01 0.00 The RMSE across five folds was 3.688801997695312

The RMSE values for the models used so far are listed below, ranked from best to worst:

  • weighted-mean item-based collaborative filter: RMSE = 3.41

  • weighted-mean user-based collaborative filter: RMSE = 3.61

  • kNN-based collaborative filter: (average) RMSE = 3.69 (note that this one will vary slightly if you didn't set a seed or if you use a different seed)

  • baseline model: RMSE = 4.70.

Self-Assessment: kNNBasic Item-based Collaborative Filter - Solution

# The Reader tells Surprise how to interpret the ratings dataframe; here the
# rating scale is declared as 1 through 11.
reader = Reader(rating_scale=(1,11))

# Wrap the ratings dataframe as a Surprise dataset
data = Dataset.load_from_df(bx, reader)

# Compute similarities between items rather than between users
sim_options = {'user_based': False}

# Seed both random number generators before building and evaluating the model
random.seed(1)
np.random.seed(1)

# Item-based kNN collaborative filter with 5 neighbours
knn = KNNBasic(k=5, sim_options=sim_options)

# 5-fold cross-validation, scored by RMSE
from surprise.model_selection import cross_validate
knn_cv = cross_validate(
    knn,
    data,
    measures=['RMSE'],
    cv=5,
    verbose=True,
)

# Average the per-fold test RMSE values into one summary figure
knn_RMSE = np.mean(knn_cv['test_rmse'])
print(f'\nThe RMSE across five folds was {knn_RMSE}')

# Refit on every available rating now that evaluation is finished
trainset = data.build_full_trainset()
knn.fit(trainset)
Computing the msd similarity matrix... Done computing similarity matrix. Computing the msd similarity matrix... Done computing similarity matrix. Computing the msd similarity matrix... Done computing similarity matrix. Computing the msd similarity matrix... Done computing similarity matrix. Computing the msd similarity matrix... Done computing similarity matrix. Evaluating RMSE of algorithm KNNBasic on 5 split(s). Fold 1 Fold 2 Fold 3 Fold 4 Fold 5 Mean Std RMSE (testset) 3.4648 3.4779 3.4987 3.4872 3.2932 3.4444 0.0764 Fit time 0.00 0.00 0.00 0.00 0.00 0.00 0.00 Test time 0.01 0.01 0.01 0.01 0.01 0.01 0.00 The RMSE across five folds was 3.444369689909382 Computing the msd similarity matrix... Done computing similarity matrix.
<surprise.prediction_algorithms.knns.KNNBasic at 0x7f4b4af9b2b0>

The RMSE values for the models used so far are listed below, ranked from best to worst:

  • weighted-mean item-based collaborative filter: RMSE = 3.41

  • kNN-based item-based collaborative filter: (average) RMSE = 3.44

  • weighted-mean user-based collaborative filter: RMSE = 3.61

  • kNN-based user-based collaborative filter: (average) RMSE = 3.69 (note that this one will vary slightly if you didn't set a seed or if you use a different seed)

  • baseline model: RMSE = 4.70.

Self-Assessment: Hybrid Recommender

# Reload the ratings so this section can run on its own.
import pandas as pd
import numpy as np

bx = pd.read_csv('./data/BX-Book-Ratings-3000.csv')
bx.head(5)

# The Reader tells Surprise how to interpret the ratings dataframe; here the
# rating scale is declared as 1 through 11.
reader = Reader(rating_scale=(1,11))

# Wrap the ratings dataframe as a Surprise dataset
data = Dataset.load_from_df(bx, reader)

# Item-based similarities; k is deliberately left at the library default
sim_options = {'user_based': False}

# Both models are trained on every available rating
trainset = data.build_full_trainset()

# Model 1: item-based kNNBasic
random.seed(1)
np.random.seed(1)
knn = KNNBasic(sim_options=sim_options)
knn.fit(trainset)

# Model 2: SVD collaborative filter (seeded after construction, before fit)
svd = SVD()
random.seed(1)
np.random.seed(1)
svd.fit(trainset)

# Spot-check one prediction from each model (note that ISBN is a string)
print(knn.predict(31315, '446606189'))
print(svd.predict(31315, '446606189'))
Computing the msd similarity matrix... Done computing similarity matrix. user: 31315 item: 446606189 r_ui = None est = 8.84 {'actual_k': 30, 'was_impossible': False} user: 31315 item: 446606189 r_ui = None est = 7.39 {'was_impossible': False}
def hybrid(ratings, userid, item_algo, user_algo, item_weight, N):
    """Recommend books by blending the predictions of two trained models.

    Parameters
    ----------
    ratings : pandas.DataFrame
        The ratings dataframe; must contain the columns 'User-ID', 'ISBN'
        and 'Book-Rating'.
    userid
        The user for whom predictions are made.
    item_algo
        First trained model; only ``predict(userid, isbn).est`` is used.
        Its estimate is weighted by ``item_weight``.
    user_algo
        Second trained model, used the same way; its estimate is weighted
        by ``1 - item_weight``.
    item_weight : float
        Share of the blended score given to ``item_algo``.
        NOTE(review): presumably expected to lie in [0, 1]; not enforced.
    N : int
        Number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        The ``N`` rows with the highest 'finalPrediction', sorted in
        descending order, with the extra columns 'iPrediction',
        'uPrediction' and 'finalPrediction'.
    """
    # One row per distinct book. drop() already returns a new frame, so the
    # caller's dataframe is never modified.
    # NOTE(review): books the user has already rated are not filtered out.
    sim_items = ratings.drop(columns=['User-ID', 'Book-Rating']).drop_duplicates()

    # Estimated rating of every book under each model. Building plain lists
    # (instead of a row-wise apply) also behaves sensibly on an empty frame.
    isbns = sim_items['ISBN']
    sim_items['iPrediction'] = [item_algo.predict(userid, isbn).est for isbn in isbns]
    sim_items['uPrediction'] = [user_algo.predict(userid, isbn).est for isbn in isbns]

    # Weighted blend of the two estimates, computed column-wise
    sim_items['finalPrediction'] = (
        sim_items['iPrediction'] * item_weight
        + sim_items['uPrediction'] * (1 - item_weight)
    )

    # Highest blended prediction first; keep the top N books
    sim_items = sim_items.sort_values('finalPrediction', ascending=False)
    return sim_items.head(N)


hybrid(bx, 31315, knn, svd, .6, 10)