GitHub Repository: DataScienceUWL/DS775
Path: blob/main/Lessons/Lesson 14 - RecSys 2/Self_Assess_Solns_14.ipynb
⁸⁷¹ views

Kernel: Python 3 (system-wide)

In [3]:

# EXECUTE FIRST

# computational imports
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from surprise import Reader, Dataset, KNNBasic, NormalPredictor,BaselineOnly,KNNWithMeans,KNNBaseline
from surprise import SVD, SVDpp, NMF, SlopeOne, CoClustering
from surprise.model_selection import cross_validate
from surprise.model_selection import GridSearchCV
from surprise import accuracy

import random
from ast import literal_eval
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# plotting imports
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")
# `import matplotlib.pyplot as plt` binds only the name `plt`, so the bare name
# `matplotlib` is undefined on a fresh kernel; reach the style API through `plt`.
plt.style.use('ggplot')
# for reading files from urls
import urllib.request
# display imports
from IPython.display import display, IFrame
from IPython.core.display import HTML

Lesson 14 - Self-Assessment Solutions

Self-Assessment: Setting up the File

In [4]:

# Load the ratings CSV into `bx` and preview its first rows.
# pandas/numpy are already imported by the setup cell; repeating the imports
# here keeps this cell runnable on its own.
import pandas as pd
import numpy as np
bx = pd.read_csv('./data/BX-Book-Ratings-3000.csv')
bx.head()

Out[4]:

In [5]:

# Print the mean of the 'Book-Rating' column, formatted to two decimal places.
print("Mean book rating:     ", '%.2f' % bx['Book-Rating'].mean())

Out[5]:

Mean book rating:      2.63

In [6]:

#Import the train_test_split function (also imported in the setup cell)
from sklearn.model_selection import train_test_split

#X is a copy of the full ratings dataframe; y is its 'User-ID' column and is
#used only as the stratification label.
X = bx.copy()
y = bx['User-ID']

#70/30 split stratified on User-ID, so each user's ratings are divided between
#the two sets in roughly that proportion; random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, stratify=y, random_state=42)

Self-Assessment: Baseline RMSE to Assess Model Performance

In [7]:

#Midpoint of the rating scale: the median of every integer from the smallest to
#the largest observed 'Book-Rating' (this is not the median of the ratings themselves).
print(f"The median of this rating range is {np.median(np.arange(np.min(bx['Book-Rating']), (np.max(bx['Book-Rating']) + 1)))}")

#Baseline model: always predict the scale median, whatever the user or item.
def baseline(user_id, item_id, scale_median,  *args):
    """Return ``scale_median`` unchanged as the predicted rating.

    ``user_id``, ``item_id`` and any extra positional arguments are accepted
    only so the function has the same call shape as the other models passed
    to ``score``; none of them influences the result.
    """
    predicted_rating = scale_median
    return predicted_rating

Out[7]:

The median of this rating range is 6.0

In [9]:

#Compute the RMSE a model achieves on the testing set
def score(cf_model, X_test, *args):
    """Return the RMSE of ``cf_model`` over the rows of ``X_test``.

    The first three columns of ``X_test`` are read positionally as user id,
    item id and true rating. ``cf_model`` is called once per row as
    ``cf_model(user, item, *args)``.
    """
    #User and item ids come from the first and second columns
    users = X_test[X_test.columns[0]]
    items = X_test[X_test.columns[1]]

    #One prediction per (user, item) pair, in row order
    predictions = []
    for user, item in zip(users, items):
        predictions.append(cf_model(user, item, *args))
    y_pred = np.array(predictions)

    #True ratings come from the third column
    y_true = np.array(X_test[X_test.columns[2]])

    return root_mean_squared_error(y_true, y_pred)

In [10]:

# RMSE of the constant baseline on the test split; 6 is forwarded as scale_median.
score(baseline, X_test, 6)

Out[10]:

4.703780985075257

Self-Assessment: Weighted Mean User-Based Filter

In [11]:

#Build the ratings matrix from the training split: one row per User-ID, one
#column per ISBN, NaN where a user has not rated a book.
#pivot (unlike pivot_table, which would aggregate) raises an error if the same
#(User-ID, ISBN) pair appears more than once.
r_matrix = X_train.pivot(values='Book-Rating', index='User-ID', columns='ISBN')

r_matrix.head()

Out[11]:

In [12]:

#Zero-filled copy of the ratings matrix for the similarity computation;
#r_matrix itself keeps NaN as its "not rated" marker.
r_matrix_dummy = r_matrix.copy().fillna(0)

In [13]:

# Preview the zero-filled ratings matrix.
r_matrix_dummy.head()

Out[13]:

In [14]:

#Pairwise cosine similarity between the rows (users) of the zero-filled matrix
cosine_sim = cosine_similarity(r_matrix_dummy, r_matrix_dummy)

In [15]:

#Wrap the similarity array in a DataFrame labelled by User-ID on both axes,
#so a user's similarity scores can be looked up by id
cosine_sim = pd.DataFrame(cosine_sim, index=r_matrix.index, columns=r_matrix.index)

cosine_sim.head()

Out[15]:

In [16]:

#User Based Collaborative Filter using Weighted Mean Ratings
def cf_wmean(user_id, item_id, ratings_matrix, c_sim_matrix, median_rating):
    """Predict a rating as the similarity-weighted mean of other users' ratings.

    Parameters
    ----------
    user_id : label of the user to predict for (a column of ``c_sim_matrix``).
    item_id : label of the item to predict for (a column of ``ratings_matrix``).
    ratings_matrix : DataFrame with one row per user and one column per item,
        NaN where a user has not rated the item.
    c_sim_matrix : DataFrame of user-to-user similarities, labelled by user on
        both axes.
    median_rating : fallback prediction.

    Returns
    -------
    The weighted mean of the ratings given to ``item_id``, weighted by each
    rater's similarity to ``user_id``. ``median_rating`` is returned when the
    item or the user is unknown, or when the similarities of the raters sum to
    zero (or less).
    """
    #Cold start: no column for this item, or no similarity scores for this
    #user (previously a KeyError) -> nothing to base a prediction on
    if item_id not in ratings_matrix or user_id not in c_sim_matrix:
        return median_rating

    #Ratings actually given to the item (users who did not rate it are dropped)
    i_ratings = ratings_matrix[item_id].dropna()

    #Similarity of the target user to exactly those raters, matched by label
    #so the result does not depend on the two matrices sharing row order
    sim_scores = c_sim_matrix[user_id].loc[i_ratings.index]

    total_similarity = sim_scores.sum()
    if total_similarity > 0:
        return np.dot(sim_scores, i_ratings) / total_similarity

    #The user has zero cosine similarity with every rater of this item
    return median_rating

In [17]:

# RMSE of the user-based weighted-mean filter on the test split; 6 is the fallback rating.
score(cf_wmean, X_test, r_matrix, cosine_sim, 6)

Out[17]:

3.607093266358255

The RMSE with the user-based collaborative filter is 3.61 compared to 4.70 for the baseline model, so its predicted ratings are more accurate.

Self-Assessment: Weighted Mean Item-Based Filter - Solution

In [18]:

#Build the item-oriented ratings matrix from the training split: one row per
#ISBN, one column per User-ID, NaN where the user has not rated the book.
#pivot (unlike pivot_table, which would aggregate) raises an error if the same
#(ISBN, User-ID) pair appears more than once.
r_matrix_item = X_train.pivot(values='Book-Rating', index='ISBN', columns='User-ID')

r_matrix_item.head()

Out[18]:

In [19]:

#Zero-filled copy of the item-oriented matrix for the similarity computation;
#r_matrix_item itself keeps NaN as its "not rated" marker.
r_matrix_item_dummy = r_matrix_item.copy().fillna(0)

In [20]:

# Import cosine_similarity (also imported in the setup cell)
from sklearn.metrics.pairwise import cosine_similarity

#Pairwise cosine similarity between the rows (books) of the zero-filled matrix
cosine_sim_item = cosine_similarity(r_matrix_item_dummy, r_matrix_item_dummy)

In [21]:

#Wrap the similarity array in a DataFrame labelled by ISBN on both axes,
#so a book's similarity scores can be looked up by ISBN
cosine_sim_item = pd.DataFrame(cosine_sim_item, index=r_matrix_item.index, columns=r_matrix_item.index)

cosine_sim_item.head(10)

Out[21]:

In [22]:

#Item-Based Collaborative Filter using Weighted Mean Ratings
def cf_item_wmean(user_id, item_id, ratings_matrix, c_sim_matrix, median_rating):
    """Predict a rating as the similarity-weighted mean of the user's own ratings.

    Parameters
    ----------
    user_id : label of the user to predict for (a column of ``ratings_matrix``).
    item_id : label of the item to predict for (a column of ``c_sim_matrix``).
    ratings_matrix : DataFrame with one row per item and one column per user,
        NaN where the user has not rated the item.
    c_sim_matrix : DataFrame of item-to-item similarities, labelled by item on
        both axes.
    median_rating : fallback prediction.

    Returns
    -------
    The weighted mean of the ratings ``user_id`` gave to other items, weighted
    by each item's similarity to ``item_id``. ``median_rating`` is returned
    when the user or the item is unknown, or when the similarities of the
    rated items sum to zero (or less).
    """
    #Cold start: no column for this user, or no similarity scores for this
    #item (previously a KeyError) -> nothing to base a prediction on
    if user_id not in ratings_matrix or item_id not in c_sim_matrix:
        return median_rating

    #Ratings the user actually gave (items the user did not rate are dropped)
    u_ratings = ratings_matrix[user_id].dropna()

    #Similarity of the target item to exactly those rated items, matched by
    #label so the result does not depend on the two matrices sharing row order
    sim_scores = c_sim_matrix[item_id].loc[u_ratings.index]

    total_similarity = sim_scores.sum()
    if total_similarity > 0:
        return np.dot(sim_scores, u_ratings) / total_similarity

    #The item has zero cosine similarity with everything this user rated
    return median_rating

In [23]:

# RMSE of the item-based weighted-mean filter on the test split; 6 is the fallback rating.
score(cf_item_wmean, X_test, r_matrix_item, cosine_sim_item, 6)

Out[23]:

3.4119539180908327

The weighted-mean item-based collaborative filter is the best so far at RMSE = 3.41. The weighted-mean user-based collaborative filter had RMSE = 3.61 and the baseline model had RMSE = 4.70.

Self-Assessment: kNN-Based Collaborative Filter - Solution

In [24]:

# Preview the full ratings dataframe again before handing it to Surprise.
bx.head()

Out[24]:

In [25]:

#Define a Reader object
#The Reader object tells Surprise the rating scale to expect when parsing the dataframe
#NOTE(review): (1, 11) presumably matches the rating range in the CSV; the scale
#midpoint of 6.0 printed earlier is consistent with it - confirm against the data.
reader = Reader(rating_scale=(1,11))

#Create the dataset to be used for building the filter
#(the whole of bx is used here, not the X_train/X_test split from earlier)
data = Dataset.load_from_df(bx, reader)

#Define the algorithm object; in this case kNN with 5 neighbours.
#Both RNGs are seeded first so that the cross-validation result is repeatable.
random.seed(1)
np.random.seed(1)
knn = KNNBasic(k=5, verbose=False)

#Evaluate the performance in terms of RMSE with 5-fold cross-validation
#(cross_validate is also imported in the setup cell)
from surprise.model_selection import cross_validate
knn_cv = cross_validate(knn, data, measures=['RMSE'], cv=5, verbose=True)
#'test_rmse' holds one RMSE per fold; average them for a single summary figure
knn_RMSE = np.mean(knn_cv['test_rmse'])
print(f'\nThe RMSE across five folds was {knn_RMSE}')

Out[25]:

Evaluating RMSE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    3.6956  3.7194  3.8178  3.7849  3.4263  3.6888  0.1384  
Fit time          0.00    0.00    0.00    0.00    0.00    0.00    0.00    
Test time         0.01    0.01    0.01    0.01    0.01    0.01    0.00    

The RMSE across five folds was 3.688801997695312

The RMSE of each model used so far is listed below, ranked from best to worst:

weighted-mean item-based collaborative filter: RMSE = 3.41
weighted-mean user-based collaborative filter: RMSE = 3.61
kNN-based collaborative filter: (average) RMSE = 3.69 (note that this one will vary slightly if you didn't set a seed or if you use a different seed)
baseline model: RMSE = 4.70.

Self-Assessment: kNNBasic Item-based Collaborative Filter - Solution

In [26]:

#Define a Reader object
#The Reader object tells Surprise the rating scale to expect when parsing the dataframe
reader = Reader(rating_scale=(1,11))

#Create the dataset to be used for building the filter
#(the whole of bx is used here, not the X_train/X_test split from earlier)
data = Dataset.load_from_df(bx, reader)


sim_options = {'user_based': False  # compute  similarities between items
               }

#Define the algorithm object; in this case item-based kNN with 5 neighbours.
#Both RNGs are seeded first so that the cross-validation result is repeatable.
#verbose is not switched off here, so a similarity-matrix message is printed on every fit.
random.seed(1)
np.random.seed(1)
knn = KNNBasic(k=5, sim_options=sim_options)

#Evaluate the performance in terms of RMSE with 5-fold cross-validation
#(cross_validate is also imported in the setup cell)
from surprise.model_selection import cross_validate
knn_cv = cross_validate(knn, data, measures=['RMSE'], cv=5, verbose=True)
#'test_rmse' holds one RMSE per fold; average them for a single summary figure
knn_RMSE = np.mean(knn_cv['test_rmse'])
print(f'\nThe RMSE across five folds was {knn_RMSE}')

#re-train on the whole dataset; as the cell's last expression, the value
#returned by fit is echoed as the cell output
trainset = data.build_full_trainset()
knn.fit(trainset)

Out[26]:

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    3.4648  3.4779  3.4987  3.4872  3.2932  3.4444  0.0764  
Fit time          0.00    0.00    0.00    0.00    0.00    0.00    0.00    
Test time         0.01    0.01    0.01    0.01    0.01    0.01    0.00    

The RMSE across five folds was 3.444369689909382
Computing the msd similarity matrix...
Done computing similarity matrix.

<surprise.prediction_algorithms.knns.KNNBasic at 0x7f4b4af9b2b0>

The RMSE of each model used so far is listed below, ranked from best to worst:

weighted-mean item-based collaborative filter: RMSE = 3.41
kNN-based item-based collaborative filter: (average) RMSE = 3.44
weighted-mean user-based collaborative filter: RMSE = 3.61
kNN-based user-based collaborative filter: (average) RMSE = 3.69 (note that this one will vary slightly if you didn't set a seed or if you use a different seed)
baseline model: RMSE = 4.70.

Self-Assessment: Hybrid Recommender

In [27]:

# Reload the ratings CSV into `bx` (same file as at the top of the notebook),
# so this section starts from a fresh copy of the data.
import pandas as pd
import numpy as np
bx = pd.read_csv('./data/BX-Book-Ratings-3000.csv')
bx.head(5)

Out[27]:

In [28]:

#Define a Reader object
#The Reader object tells Surprise the rating scale to expect when parsing the dataframe
reader = Reader(rating_scale=(1,11))

#Create the dataset to be used for building the filter
data = Dataset.load_from_df(bx, reader)

#Options for a kNN item-based collaborative filter; k is not set, so the
#library default is used
sim_options = {'user_based': False  # compute  similarities between items
               }

# Build a trainset from every rating in `data` (no hold-out split).
trainset = data.build_full_trainset()

#Define the algorithm object; in this case item-based kNNBasic
random.seed(1)
np.random.seed(1)
knn = KNNBasic(sim_options=sim_options)

#fit the data
knn.fit(trainset)


#Build the SVD based Collaborative filter
#The seeds are reset after constructing SVD but before fit.
#NOTE(review): this relies on Surprise drawing its random numbers from the
#global numpy RNG at fit time when no random_state is given - confirm in the docs.
svd = SVD()
random.seed(1)
np.random.seed(1)

#fit the data
svd.fit(trainset)

#test a couple of predictions; the ISBN is passed as a string here
print(knn.predict(31315, '446606189'))
print(svd.predict(31315, '446606189'))

Out[28]:

Computing the msd similarity matrix...
Done computing similarity matrix.
user: 31315      item: 446606189  r_ui = None   est = 8.84   {'actual_k': 30, 'was_impossible': False}
user: 31315      item: 446606189  r_ui = None   est = 7.39   {'was_impossible': False}

In [29]:

#build the hybrid function
def hybrid(ratings, userid, item_algo, user_algo, item_weight, N):
    '''
    Blend the estimates of two trained models into one top-N recommendation list.

    Parameters
    ratings: the ratings dataframe we're working with; must contain the
        columns 'User-ID', 'ISBN' and 'Book-Rating'
    userid: the user for whom we are making predictions
    item_algo: first trained model; any object whose predict(userid, isbn)
        returns something with an .est attribute (e.g. a Surprise algorithm)
    user_algo: second trained model, same interface as item_algo
    item_weight: weight applied to item_algo's estimate; user_algo's estimate
        is weighted by 1 - item_weight
    N: the number of predictions to return
    returns
    a dataframe with one row per recommended book, holding the columns left
    over from ratings plus 'iPrediction', 'uPrediction' and 'finalPrediction',
    sorted by 'finalPrediction' in descending order and cut to the first N rows
    '''

    #one row per unique book: drop the per-rating columns, then de-duplicate
    #(drop already returns a new frame, so the caller's dataframe is untouched)
    sim_items = ratings.drop(columns=['User-ID', 'Book-Rating']).drop_duplicates()

    def _estimates(algo):
        #this user's predicted rating for every book, according to one model;
        #mapping over the ISBN column also works when there are no books at all
        return sim_items['ISBN'].map(lambda isbn: algo.predict(userid, isbn).est).astype(float)

    #predictions from the first model, then from the second
    sim_items['iPrediction'] = _estimates(item_algo)
    sim_items['uPrediction'] = _estimates(user_algo)

    #weighted blend of the two estimates, computed on whole columns at once
    sim_items['finalPrediction'] = (sim_items['iPrediction'] * item_weight) + (sim_items['uPrediction'] * (1-item_weight))

    #keep the N books with the highest blended prediction
    sim_items = sim_items.sort_values('finalPrediction', ascending=False)
    return sim_items.head(N)

# Top 10 books for user 31315, weighting the kNN estimate by 0.6 and the SVD estimate by 0.4.
hybrid(bx, 31315, knn, svd, .6, 10)

Out[29]:

In [0]: