Path: blob/main/C3 - Unsupervised Learning, Recommenders, Reinforcement Learning/week2/C3W2/C3W2A2/recsysNN_utils.py
import pickle5 as pickle
import numpy as np
from numpy import genfromtxt
from collections import defaultdict
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Model
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
import csv
import re
import tabulate


def load_data():
    """ Load the training arrays, feature name lists, item vectors, movie
        dictionary, and per-user genre ratings used by the notebook """
    item_train = genfromtxt('./data/content_item_train.csv', delimiter=',')
    user_train = genfromtxt('./data/content_user_train.csv', delimiter=',')
    y_train = genfromtxt('./data/content_y_train.csv', delimiter=',')
    with open('./data/content_item_train_header.txt', newline='') as f:  # csv reader handles quoted strings better
        item_features = list(csv.reader(f))[0]
    with open('./data/content_user_train_header.txt', newline='') as f:
        user_features = list(csv.reader(f))[0]
    item_vecs = genfromtxt('./data/content_item_vecs.csv', delimiter=',')

    movie_dict = defaultdict(dict)
    count = 0
    with open('./data/content_movie_list.csv', newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        for line in reader:
            if count == 0:
                count += 1  # skip header
            else:
                count += 1
                movie_id = int(line[0])
                movie_dict[movie_id]["title"] = line[1]
                movie_dict[movie_id]["genres"] = line[2]

    with open('./data/content_user_to_genre.pickle', 'rb') as f:
        user_to_genre = pickle.load(f)

    return (item_train, user_train, y_train, item_features, user_features, item_vecs, movie_dict, user_to_genre)


def pprint_train(x_train, features, vs, u_s, maxcount=5, user=True):
    """ Prints user_train or item_train nicely """
    if user:
        flist = [".0f", ".0f", ".1f",
                 ".1f", ".1f", ".1f", ".1f", ".1f", ".1f", ".1f", ".1f", ".1f", ".1f", ".1f", ".1f", ".1f", ".1f"]
    else:
        flist = [".0f", ".0f", ".1f",
                 ".0f", ".0f", ".0f", ".0f", ".0f", ".0f", ".0f", ".0f", ".0f", ".0f", ".0f", ".0f", ".0f", ".0f"]

    head = features[:vs]
    if vs < u_s:
        print(f"error, vector start {vs} should be greater than user start {u_s}")
    for i in range(u_s):
        head[i] = "[" + head[i] + "]"
    genres = features[vs:]
    hdr = head + genres
    disp = [split_str(hdr, 5)]
    count = 0
    for i in range(0, x_train.shape[0]):
        if count == maxcount:
            break
        count += 1
        disp.append([
            x_train[i, 0].astype(int),
            x_train[i, 1].astype(int),
            x_train[i, 2].astype(float),
            *x_train[i, 3:].astype(float)
        ])
    table = tabulate.tabulate(disp, tablefmt='html', headers="firstrow", floatfmt=flist, numalign='center')
    return table


def pprint_data(y_p, y, user_train, item_train, uvs, ivs, printfull=False):
    """ Debug print of predictions and targets alongside selected user/item columns """
    np.set_printoptions(precision=1)

    for i in range(min(1000, y_p.shape[0])):
        print(f"{y_p[i, 0]: 0.2f}, {y[i]: 0.2f}, ", end='')
        print(f"{user_train[i, 0].astype(int):d}, ", end='')        # user id
        print(f"{user_train[i, 1].astype(int):d}, ", end='')        # rating cnt
        print(f"{user_train[i, 2].astype(float): 0.2f}, ", end='')  # rating ave
        print(": ", end='')
        print(f"{item_train[i, 0].astype(int):d}, ", end='')        # movie id
        print(f"{item_train[i, 2].astype(float):0.1f}, ", end='')   # ave movie rating
        if printfull:
            for j in range(uvs, user_train.shape[1]):
                print(f"{user_train[i, j].astype(float):0.1f}, ", end='')  # per-genre user rating ave
            print(":", end='')
            for j in range(ivs, item_train.shape[1]):
                print(f"{item_train[i, j].astype(int):d}, ", end='')       # genre flag
            print()
        else:
            a = user_train[i, uvs:user_train.shape[1]]
            b = item_train[i, ivs:item_train.shape[1]]
            c = np.multiply(a, b)
            print(c)

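# ---------------------------------------------------------------------------
# Illustrative calls (sketch only): the course notebook renders tables like
# the following from the loaded data. The column offsets (u_s/i_s mark where
# the columns fed to the model start, uvs/ivs mark where the genre columns
# start) are defined in the notebook, not in this module, and are assumed here:
#
#   pprint_train(user_train, user_features, uvs, u_s, maxcount=5, user=True)
#   pprint_train(item_train, item_features, ivs, i_s, maxcount=5, user=False)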
def split_str(ifeatures, smax):
    """ Insert a space in the middle of long feature names so table headers can wrap """
    ofeatures = []
    for s in ifeatures:
        if ' ' not in s:  # skip strings that already contain a space
            if len(s) > smax:
                mid = int(len(s)/2)
                s = s[:mid] + " " + s[mid:]
        ofeatures.append(s)
    return ofeatures


def pprint_data_tab(y_p, y, user_train, item_train, uvs, ivs, user_features, item_features, maxcount=20, printfull=False):
    """ Build an HTML table of predictions, targets, and user/item features """
    flist = [".1f", ".1f", ".0f", ".1f", ".0f", ".0f", ".0f",
             ".1f", ".1f", ".1f", ".1f", ".1f", ".1f", ".1f", ".1f", ".1f", ".1f", ".1f", ".1f", ".1f", ".1f"]
    user_head = user_features[:uvs]
    genres = user_features[uvs:]
    item_head = item_features[:ivs]
    hdr = ["y_p", "y"] + user_head + item_head + genres
    disp = [split_str(hdr, 5)]
    count = 0
    for i in range(0, y_p.shape[0]):
        if count == maxcount:
            break
        count += 1
        a = user_train[i, uvs:user_train.shape[1]]
        b = item_train[i, ivs:item_train.shape[1]]
        c = np.multiply(a, b)

        disp.append([y_p[i, 0], y[i],
                     user_train[i, 0].astype(int),    # user id
                     user_train[i, 1].astype(int),    # rating cnt
                     user_train[i, 2].astype(float),  # user rating ave
                     item_train[i, 0].astype(int),    # movie id
                     item_train[i, 1].astype(int),    # year
                     item_train[i, 2].astype(float),  # ave movie rating
                     *c
                     ])
    table = tabulate.tabulate(disp, tablefmt='html', headers="firstrow", floatfmt=flist, numalign='center')
    return table


def print_pred_movies(y_p, user, item, movie_dict, maxcount=10):
    """ print results of prediction for a new user. inputs are expected to be in
        sorted order, unscaled. """
    count = 0
    movies_listed = defaultdict(int)
    disp = [["y_p", "movie id", "rating ave", "title", "genres"]]

    for i in range(0, y_p.shape[0]):
        if count == maxcount:
            break
        count += 1
        movie_id = item[i, 0].astype(int)
        if movie_id in movies_listed:
            continue
        movies_listed[movie_id] = 1
        disp.append([y_p[i, 0], item[i, 0].astype(int), item[i, 2].astype(float),
                     movie_dict[movie_id]['title'], movie_dict[movie_id]['genres']])

    table = tabulate.tabulate(disp, tablefmt='html', headers="firstrow")
    return table


def gen_user_vecs(user_vec, num_items):
    """ given a user vector, return a user prediction matrix
        sized to match item_vecs """
    user_vecs = np.tile(user_vec, (num_items, 1))
    return user_vecs


# predict on everything, filter on print/use
def predict_uservec(user_vecs, item_vecs, model, u_s, i_s, scaler, ScalerUser, ScalerItem, scaledata=False):
    """ given a user vector, predict ratings for all movies in item_vecs. Returns
        the sorting index, an array of predictions sorted by predicted rating,
        and the item and user arrays sorted the same way.
    """
    if scaledata:
        scaled_user_vecs = ScalerUser.transform(user_vecs)
        scaled_item_vecs = ScalerItem.transform(item_vecs)
        y_p = model.predict([scaled_user_vecs[:, u_s:], scaled_item_vecs[:, i_s:]])
    else:
        y_p = model.predict([user_vecs[:, u_s:], item_vecs[:, i_s:]])
    y_pu = scaler.inverse_transform(y_p)

    if np.any(y_pu < 0):
        print("Error, expected all positive predictions")
    sorted_index = np.argsort(-y_pu, axis=0).reshape(-1).tolist()  # negate to get largest rating first
    sorted_ypu = y_pu[sorted_index]
    sorted_items = item_vecs[sorted_index]
    sorted_user = user_vecs[sorted_index]
    return (sorted_index, sorted_ypu, sorted_items, sorted_user)

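# ---------------------------------------------------------------------------
# Illustrative sketch of how the helpers above are meant to be chained for a
# new user. `item_vecs` and `movie_dict` come from load_data; the `model`,
# `scalerUser`, `scalerItem`, `scalerTarget`, `u_s`, `i_s`, and `user_vec`
# names are assumed to be supplied by the calling notebook and are not
# defined in this module:
#
#   user_vecs = gen_user_vecs(user_vec, len(item_vecs))
#   sorted_index, sorted_ypu, sorted_items, sorted_user = predict_uservec(
#       user_vecs, item_vecs, model, u_s, i_s,
#       scalerTarget, scalerUser, scalerItem, scaledata=True)
#   table = print_pred_movies(sorted_ypu, sorted_user, sorted_items, movie_dict, maxcount=10)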
def print_pred_debug(y_p, y, user, item, uvs, ivs, maxcount=10, onlyrating=False, printfull=False):
    """ hopefully reusable print. Keep for debug """
    count = 0
    for i in range(0, y_p.shape[0]):
        if not onlyrating or y[i, 0] != 0:
            if count == maxcount:
                break
            count += 1
            print(f"{y_p[i, 0]: 0.2f}, {y[i, 0]: 0.2f}, ", end='')
            print(f"{user[i, 0].astype(int):d}, ", end='')        # user id
            print(f"{user[i, 1].astype(int):d}, ", end='')        # rating cnt
            print(f"{user[i, 2].astype(float):0.1f}, ", end='')   # rating ave
            print(": ", end='')
            print(f"{item[i, 0].astype(int):d}, ", end='')        # movie id
            print(f"{item[i, 2].astype(float):0.1f}, ", end='')   # ave movie rating
            print(": ", end='')
            if printfull:
                for j in range(uvs, user.shape[1]):
                    print(f"{user[i, j].astype(float):0.1f}, ", end='')  # per-genre user rating ave
                print(":", end='')
                for j in range(ivs, item.shape[1]):
                    print(f"{item[i, j].astype(int):d}, ", end='')       # genre flag
                print()
            else:
                a = user[i, uvs:user.shape[1]]
                b = item[i, ivs:item.shape[1]]
                c = np.multiply(a, b)
                print(c)


def get_user_vecs(user_id, user_train, item_vecs, user_to_genre):
    """ given a user_id, return:
        a user train/predict matrix sized to match item_vecs
        a y vector with the user's rating for each rated movie and 0 for the others """

    if user_id not in user_to_genre:
        print("error: unknown user id")
        return None
    else:
        user_vec_found = False
        for i in range(len(user_train)):
            if user_train[i, 0] == user_id:
                user_vec = user_train[i]
                user_vec_found = True
                break
        if not user_vec_found:
            print("error in get_user_vecs, did not find uid in user_train")
        num_items = len(item_vecs)
        user_vecs = np.tile(user_vec, (num_items, 1))

        y = np.zeros(num_items)
        for i in range(num_items):  # walk through the movies in item_vecs and see if the user has rated them
            movie_id = item_vecs[i, 0]
            if movie_id in user_to_genre[user_id]['movies']:
                rating = user_to_genre[user_id]['movies'][movie_id]
            else:
                rating = 0
            y[i] = rating
    return (user_vecs, y)


def get_item_genre(item, ivs, item_features):
    """ return the genre name and column offset flagged in an item vector """
    offset = np.where(item[ivs:] == 1)[0][0]
    genre = item_features[ivs + offset]
    return (genre, offset)


def print_existing_user(y_p, y, user, items, item_features, ivs, uvs, movie_dict, maxcount=10):
    """ print results of prediction for a user who was in the database. inputs are expected to be in sorted order, unscaled. """
    count = 0
    movies_listed = defaultdict(int)
    disp = [["y_p", "y", "user", "user genre ave", "movie rating ave", "title", "genres"]]
    for i in range(0, y.shape[0]):
        if y[i, 0] != 0:
            if count == maxcount:
                break
            count += 1
            movie_id = items[i, 0].astype(int)

            offset = np.where(items[i, ivs:] == 1)[0][0]
            genre_rating = user[i, uvs + offset]
            genre = item_features[ivs + offset]
            disp.append([y_p[i, 0], y[i, 0],
                         user[i, 0].astype(int),      # user id
                         genre_rating.astype(float),  # user's average rating for this genre
                         items[i, 2].astype(float),   # movie average rating
                         movie_dict[movie_id]['title'], genre])

    table = tabulate.tabulate(disp, tablefmt='html', headers="firstrow", floatfmt=[".1f", ".1f", ".0f", ".2f", ".2f"])
    return table

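# ---------------------------------------------------------------------------
# Minimal smoke test (illustrative sketch, not part of the assignment flow).
# split_str needs no data; load_data assumes the ./data/*.csv, *.txt, and
# *.pickle files referenced above sit next to this module.
if __name__ == "__main__":
    # long, space-free names get a space inserted so table headers can wrap
    print(split_str(["Action", "Adventure", "Sci-Fi"], 5))

    (item_train, user_train, y_train, item_features, user_features,
     item_vecs, movie_dict, user_to_genre) = load_data()
    print(f"item_train: {item_train.shape}, user_train: {user_train.shape}, y_train: {y_train.shape}")
    print(f"{len(movie_dict)} movies, {len(user_to_genre)} users with per-genre ratings")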