Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
greyhatguy007
GitHub Repository: greyhatguy007/Machine-Learning-Specialization-Coursera
Path: blob/main/C3 - Unsupervised Learning, Recommenders, Reinforcement Learning/week2/C3W2/C3W2A2/recsysNN_utils.py
3565 views
1
import pickle5 as pickle
2
import numpy as np
3
from numpy import genfromtxt
4
from collections import defaultdict
5
import pandas as pd
6
import tensorflow as tf
7
from tensorflow.keras.models import Model
8
from sklearn.preprocessing import StandardScaler, MinMaxScaler
9
from sklearn.model_selection import train_test_split
10
import csv
11
import re
12
import tabulate
13
14
15
def load_data():
    """Load the content-based filtering data set from the ``./data`` directory.

    Returns:
        A tuple ``(item_train, user_train, y_train, item_features,
        user_features, item_vecs, movie_dict, user_to_genre)`` where the
        ``*_train`` / ``item_vecs`` entries are arrays parsed from CSV files,
        the ``*_features`` entries are the first row of the matching header
        files, ``movie_dict`` maps an integer movie id to a dict with
        ``"title"`` and ``"genres"`` strings, and ``user_to_genre`` is
        whatever object the pickle file contains.
    """

    def read_matrix(path):
        # All numeric tables are plain comma-separated files.
        return genfromtxt(path, delimiter=',')

    def read_header(path):
        # csv.reader handles quoted feature names better than a plain split.
        with open(path, newline='') as handle:
            return list(csv.reader(handle))[0]

    item_train = read_matrix('./data/content_item_train.csv')
    user_train = read_matrix('./data/content_user_train.csv')
    y_train = read_matrix('./data/content_y_train.csv')
    item_features = read_header('./data/content_item_train_header.txt')
    user_features = read_header('./data/content_user_train_header.txt')
    item_vecs = read_matrix('./data/content_item_vecs.csv')

    movie_dict = defaultdict(dict)
    with open('./data/content_movie_list.csv', newline='') as csvfile:
        rows = csv.reader(csvfile, delimiter=',', quotechar='"')
        next(rows, None)  # the first row is the column header
        for row in rows:
            movie_id = int(row[0])
            movie_dict[movie_id]["title"] = row[1]
            movie_dict[movie_id]["genres"] = row[2]

    # NOTE(review): unpickling executes arbitrary code; only safe because the
    # file ships with the project data.
    with open('./data/content_user_to_genre.pickle', 'rb') as f:
        user_to_genre = pickle.load(f)

    return (item_train, user_train, y_train, item_features, user_features,
            item_vecs, movie_dict, user_to_genre)
44
45
46
def pprint_train(x_train, features, vs, u_s, maxcount=5, user=True):
    """Render the first rows of user_train or item_train as an HTML table.

    Args:
        x_train: 2-D array. Columns 0 and 1 are shown as integers, column 2
            and all remaining columns as floats.
        features: list of column names for ``x_train``.
        vs: index in ``features`` where the vector columns start.
        u_s: number of leading column names to wrap in square brackets.
        maxcount: maximum number of data rows to include.
        user: selects the float formats; True uses one decimal for the vector
            columns, False uses none.

    Returns:
        The HTML table produced by ``tabulate``.
    """
    vector_fmt = ".1f" if user else ".0f"
    flist = [".0f", ".0f", ".1f"] + [vector_fmt] * 14

    head = features[:vs]
    if vs < u_s:
        # Fixed: the message was missing the f-prefix and printed the literal
        # placeholders instead of the values.
        print(f"error, vector start {vs} should be greater then user start {u_s}")
    for i in range(u_s):
        head[i] = "[" + head[i] + "]"
    hdr = head + features[vs:]

    disp = [split_str(hdr, 5)]
    for shown, row in enumerate(x_train):
        if shown == maxcount:
            break
        disp.append([
            row[0].astype(int),
            row[1].astype(int),
            row[2].astype(float),
            *row[3:].astype(float),
        ])
    table = tabulate.tabulate(disp, tablefmt='html', headers="firstrow",
                              floatfmt=flist, numalign='center')
    return table
74
75
76
def pprint_data(y_p, user_train, item_train, printfull=False, y_train=None,
                uvs=3, ivs=3, maxcount=1000):
    """Print predictions next to the user and item rows they were made from.

    Each output line has the form
    ``y_p, [y,] user id, rating count, rating ave, : movie id, movie ave, <vectors>``.

    The original body read ``y_pu``, ``y_train``, ``uvs`` and ``ivs`` as
    globals that this module never defines, and always looped over 1000 rows.
    They are now explicit, optional parameters and the loop is bounded by the
    number of predictions.

    Args:
        y_p: 2-D array of predictions; column 0 is printed.
        user_train: 2-D array; columns 0, 1, 2 are printed as id, count, average.
        item_train: 2-D array; columns 0 and 2 are printed as id and average.
        printfull: when True print the raw user/item vector columns, otherwise
            print the element-wise product of the two vectors.
        y_train: optional target values indexed like ``y_p`` rows; the column
            is omitted when None.
        uvs: first vector column of ``user_train`` (used when not printfull).
        ivs: first vector column of ``item_train`` (used when not printfull).
        maxcount: maximum number of rows to print.
    """
    np.set_printoptions(precision=1)

    for i in range(min(maxcount, y_p.shape[0])):
        target = "" if y_train is None else f"{y_train[i]: 0.2f}, "
        print(f"{y_p[i, 0]: 0.2f}, {target}", end='')
        print(f"{user_train[i, 0].astype(int):d}, ", end='')      # user id
        print(f"{user_train[i, 1].astype(int):d}, ", end='')      # rating count
        print(f"{user_train[i, 2].astype(float): 0.2f}, ", end='')  # rating average
        print(": ", end='')
        print(f"{item_train[i, 0].astype(int):d}, ", end='')      # movie id
        print(f"{item_train[i, 2].astype(float):0.1f}, ", end='')  # average movie rating
        if printfull:
            # NOTE(review): the start columns 8 and 3 are hard-coded in the
            # original and are kept as-is; confirm whether uvs/ivs were meant.
            for j in range(8, user_train.shape[1]):
                print(f"{user_train[i, j].astype(float):0.1f}, ", end='')
            print(":", end='')
            for j in range(3, item_train.shape[1]):
                print(f"{item_train[i, j].astype(int):d}, ", end='')
            print()
        else:
            user_vec = user_train[i, uvs:user_train.shape[1]]
            item_vec = item_train[i, ivs:item_train.shape[1]]
            print(np.multiply(user_vec, item_vec))
100
101
def split_str(ifeatures, smax):
    """Insert a space into the middle of long, space-free strings.

    Strings that already contain a space, or whose length does not exceed
    ``smax``, are passed through unchanged. Returns a new list in the same
    order as ``ifeatures``.
    """

    def with_break(text):
        too_long_without_space = ' ' not in text and len(text) > smax
        if not too_long_without_space:
            return text
        half = len(text) // 2
        return text[:half] + " " + text[half:]

    return [with_break(text) for text in ifeatures]
110
111
def pprint_data_tab(y_p, user_train, item_train, uvs, ivs, user_features, item_features,
                    maxcount=20, printfull=False, y_train=None):
    """Build an HTML table of predictions alongside their user and item rows.

    The original body read ``y_train`` as a global that this module never
    defines; it is now an optional parameter. When it is None the "y" cell of
    every row is None, which tabulate renders as a missing value.

    Args:
        y_p: 2-D array of predictions; column 0 is shown.
        user_train: 2-D array; columns 0-2 are shown, columns from ``uvs`` on
            are the user vector.
        item_train: 2-D array; columns 0-2 are shown, columns from ``ivs`` on
            are the item vector.
        uvs: first vector column of ``user_train`` / ``user_features``.
        ivs: first vector column of ``item_train`` / ``item_features``.
        user_features: list of user column names.
        item_features: list of item column names.
        maxcount: maximum number of data rows.
        printfull: unused; kept for interface compatibility.
        y_train: optional target values indexed like the rows of ``y_p``.

    Returns:
        The HTML table produced by ``tabulate``. The trailing columns hold the
        element-wise product of the user and item vectors.
    """
    flist = [".1f", ".1f", ".0f", ".1f", ".0f", ".0f", ".0f",
             ".1f", ".1f", ".1f", ".1f", ".1f", ".1f", ".1f",
             ".1f", ".1f", ".1f", ".1f", ".1f", ".1f", ".1f"]
    user_head = user_features[:uvs]
    genres = user_features[uvs:]
    item_head = item_features[:ivs]
    hdr = ["y_p", "y"] + user_head + item_head + genres
    disp = [split_str(hdr, 5)]

    for i in range(y_p.shape[0]):
        if i == maxcount:
            break
        user_vec = user_train[i, uvs:user_train.shape[1]]
        item_vec = item_train[i, ivs:item_train.shape[1]]
        product = np.multiply(user_vec, item_vec)
        target = None if y_train is None else y_train[i]

        disp.append([
            y_p[i, 0], target,
            user_train[i, 0].astype(int),    # user id
            user_train[i, 1].astype(int),    # rating count
            user_train[i, 2].astype(float),  # user rating average
            item_train[i, 0].astype(int),    # movie id
            item_train[i, 1].astype(int),    # year
            item_train[i, 2].astype(float),  # average movie rating
            *product,
        ])
    table = tabulate.tabulate(disp, tablefmt='html', headers="firstrow",
                              floatfmt=flist, numalign='center')
    return table
138
139
140
141
142
def print_pred_movies(y_p, user, item, movie_dict, maxcount=10):
    """Tabulate predictions for a new user as HTML.

    Inputs are expected to be unscaled and already sorted. Only the first
    ``maxcount`` rows are examined; a row whose movie id was already listed is
    skipped but still counts toward that limit. ``user`` is not used.
    """
    seen_ids = set()
    rows = [["y_p", "movie id", "rating ave", "title", "genres"]]

    for idx in range(y_p.shape[0]):
        if idx == maxcount:
            break
        movie_id = item[idx, 0].astype(int)
        if movie_id in seen_ids:
            continue
        seen_ids.add(movie_id)
        entry = movie_dict[movie_id]
        rows.append([
            y_p[idx, 0],
            movie_id,
            item[idx, 2].astype(float),
            entry['title'],
            entry['genres'],
        ])

    return tabulate.tabulate(rows, tablefmt='html', headers="firstrow")
162
163
def gen_user_vecs(user_vec, num_items):
    """Repeat ``user_vec`` ``num_items`` times along the first axis.

    The result is the user matrix used for prediction, with one copy of the
    user vector per item row.
    """
    return np.tile(user_vec, reps=(num_items, 1))
168
169
# predict on everything, filter on print/use
170
def predict_uservec(user_vecs, item_vecs, model, u_s, i_s, scaler, ScalerUser, ScalerItem, scaledata=False):
    """Predict a rating for every row of ``item_vecs`` and sort by prediction.

    When ``scaledata`` is set, both inputs are passed through ``ScalerUser`` /
    ``ScalerItem`` before prediction. Columns before ``u_s`` / ``i_s`` are not
    fed to the model. Predictions are mapped back with
    ``scaler.inverse_transform``.

    Returns:
        ``(sorted_index, sorted_ypu, sorted_items, sorted_user)``: the row
        order as a list (largest prediction first) and the predictions,
        unscaled item rows and unscaled user rows arranged in that order.
    """
    if scaledata:
        user_input = ScalerUser.transform(user_vecs)
        item_input = ScalerItem.transform(item_vecs)
    else:
        user_input, item_input = user_vecs, item_vecs

    y_p = model.predict([user_input[:, u_s:], item_input[:, i_s:]])
    y_pu = scaler.inverse_transform(y_p)

    if np.any(y_pu < 0):
        print("Error, expected all positive predictions")

    # Negating the predictions makes argsort put the largest rating first.
    order = np.argsort(-y_pu, axis=0).reshape(-1).tolist()
    return (order, y_pu[order], item_vecs[order], user_vecs[order])
190
191
192
def print_pred_debug(y_p, y, user, item, maxcount=10, onlyrating=False, printfull=False,
                     uvs=3, ivs=3):
    """Print predictions with their user and item rows; intended for debugging.

    Each line has the form
    ``y_p, y, user id, rating count, rating ave, : movie id, movie ave, : <vectors>``.

    The original body read ``uvs`` and ``ivs`` as globals that this module
    never defines; they are now optional parameters.

    Args:
        y_p: 2-D array of predictions; column 0 is printed.
        y: 2-D array of targets; column 0 is printed.
        user: 2-D array; columns 0-2 are printed as id, count, average.
        item: 2-D array; columns 0 and 2 are printed as id and average.
        maxcount: maximum number of lines to print.
        onlyrating: when true, rows whose target is 0 are skipped.
        printfull: when True print the raw user and item vector columns,
            otherwise print their element-wise product.
        uvs: first vector column of ``user``.
        ivs: first vector column of ``item``.
    """
    count = 0
    for i in range(y_p.shape[0]):
        if onlyrating and y[i, 0] == 0:
            continue
        if count == maxcount:
            break
        count += 1

        print(f"{y_p[i, 0]: 0.2f}, {y[i, 0]: 0.2f}, ", end='')
        print(f"{user[i, 0].astype(int):d}, ", end='')       # user id
        print(f"{user[i, 1].astype(int):d}, ", end='')       # rating count
        print(f"{user[i, 2].astype(float):0.1f}, ", end='')  # rating average
        print(": ", end='')
        print(f"{item[i, 0].astype(int):d}, ", end='')       # movie id
        print(f"{item[i, 2].astype(float):0.1f}, ", end='')  # average movie rating
        print(": ", end='')
        if printfull:
            for j in range(uvs, user.shape[1]):
                print(f"{user[i, j].astype(float):0.1f}, ", end='')
            print(":", end='')
            for j in range(ivs, item.shape[1]):
                print(f"{item[i, j].astype(int):d}, ", end='')
            print()
        else:
            user_vec = user[i, uvs:user.shape[1]]
            item_vec = item[i, ivs:item.shape[1]]
            print(np.multiply(user_vec, item_vec))
219
220
221
def get_user_vecs(user_id, user_train, item_vecs, user_to_genre):
    """Build the prediction inputs for a user already in the data set.

    Args:
        user_id: id to look up; compared against column 0 of ``user_train``.
        user_train: 2-D array of user rows.
        item_vecs: 2-D array of item rows; column 0 is the movie id.
        user_to_genre: mapping of user id to a dict whose ``'movies'`` entry
            maps movie id to that user's rating.

    Returns:
        ``(user_vecs, y)`` where ``user_vecs`` is the user's row repeated once
        per row of ``item_vecs`` and ``y`` holds the user's rating for each
        item (0 where the movie is unrated). Returns None, after printing an
        error, when the user is unknown to ``user_to_genre`` or has no row in
        ``user_train``.
    """
    if user_id not in user_to_genre:
        print("error: unknown user id")
        return None

    # The first matching row wins.
    user_vec = next((row for row in user_train if row[0] == user_id), None)
    if user_vec is None:
        print("error in get_user_vecs, did not find uid in user_train")
        # Fixed: the original fell through and hit an unbound ``user_vec``.
        return None

    num_items = len(item_vecs)
    user_vecs = np.tile(user_vec, (num_items, 1))

    rated = user_to_genre[user_id]['movies']
    y = np.zeros(num_items)
    for i, item_row in enumerate(item_vecs):
        y[i] = rated.get(item_row[0], 0)
    return (user_vecs, y)
250
251
252
def get_item_genre(item, ivs, item_features):
    """Return the name and offset of the first genre flag set on ``item``.

    ``item`` is a single item row; entries from ``ivs`` on are searched for
    the first value equal to 1. Returns ``(genre, offset)`` where ``offset``
    is relative to ``ivs`` and ``genre`` is ``item_features[ivs + offset]``.
    Raises IndexError when no entry equals 1.
    """
    flagged = np.nonzero(item[ivs:] == 1)[0]
    first = flagged[0]
    return (item_features[ivs + first], first)
256
257
258
def print_existing_user(y_p, y, user, items, item_features, ivs, uvs, movie_dict, maxcount=10):
    """Tabulate predictions for a user already in the database as HTML.

    Inputs are expected to be unscaled and already sorted. Only rows with a
    non-zero entry in column 0 of ``y`` are listed, up to ``maxcount`` of
    them. For each row the first item column from ``ivs`` on that equals 1
    selects the genre name and the matching user column (offset from ``uvs``).
    """
    rows = [["y_p", "y", "user", "user genre ave", "movie rating ave", "title", "genres"]]
    shown = 0

    for i in range(y.shape[0]):
        if y[i, 0] == 0:
            continue
        if shown == maxcount:
            break
        shown += 1

        movie_id = items[i, 0].astype(int)
        offset = np.where(items[i, ivs:] == 1)[0][0]
        genre_rating = user[i, uvs + offset]
        genre = item_features[ivs + offset]
        rows.append([
            y_p[i, 0],
            y[i, 0],
            user[i, 0].astype(int),      # user id
            genre_rating.astype(float),
            items[i, 2].astype(float),   # movie average rating
            movie_dict[movie_id]['title'],
            genre,
        ])

    return tabulate.tabulate(rows, tablefmt='html', headers="firstrow",
                             floatfmt=[".1f", ".1f", ".0f", ".2f", ".2f"])
283