CoCalc -- assigment

GitHub Repository: greyhatguy007/Machine-Learning-Specialization-Coursera
Path: blob/main/C2 - Advanced Learning Algorithms/week3/C2W3A1/assigment_utils.py
³⁵²⁰ views
1
"""
2
assignment_utils.py
3
contains routines used by C2_W3 Assignments 
4
"""
5
import copy
6
import math
7
import numpy as np
8
import matplotlib.pyplot as plt
9
import matplotlib as mpl
10
from matplotlib.patches import FancyArrowPatch
11
from matplotlib.colors import ListedColormap, LinearSegmentedColormap
12
from matplotlib.widgets import Button, CheckButtons
13
from sklearn.linear_model import LinearRegression, Ridge
14
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
15
from sklearn.metrics import mean_squared_error
16
from sklearn.model_selection import train_test_split
17
from sklearn.datasets import make_blobs
18

19
from ipywidgets import Output
20
np.set_printoptions(precision=2)
21

22
dlc = dict(dlblue = '#0096ff', dlorange = '#FF9300', dldarkred='#C00000', dlmagenta='#FF40FF', dlpurple='#7030A0', dldarkblue =  '#0D5BDC')
23
dlblue = '#0096ff'; dlorange = '#FF9300'; dldarkred='#C00000'; dlmagenta='#FF40FF'; dlpurple='#7030A0'; dldarkblue =  '#0D5BDC'
24
dlcolors = [dlblue, dlorange, dldarkred, dlmagenta, dlpurple]
25
plt.style.use('./deeplearning.mplstyle')
26

27
# --- Assignment ----------------------------------------
28
def gen_data(m, seed=1, scale=0.7):
29
    """ generate a data set based on a x^2 with added noise """
30
    c = 0
31
    x_train = np.linspace(0,49,m)
32
    np.random.seed(seed)
33
    y_ideal = x_train**2 + c
34
    y_train = y_ideal + scale * y_ideal*(np.random.sample((m,))-0.5)
35
    x_ideal = x_train #for redraw when new data included in X
36
    return x_train, y_train, x_ideal, y_ideal
37

38
def gen_blobs():
39
    classes = 6
40
    m = 800
41
    std = 0.4
42
    centers = np.array([[-1, 0], [1, 0], [0, 1], [0, -1],  [-2,1],[-2,-1]])
43
    X, y = make_blobs(n_samples=m, centers=centers, cluster_std=std, random_state=2, n_features=2)
44
    return (X, y, centers, classes, std)
45

46
class lin_model:
47
    def __init__(self, degree, regularization = False, lambda_=0):
48
        if regularization:
49
            self.linear_model = Ridge(alpha=lambda_)
50
        else:
51
            self.linear_model = LinearRegression()
52
        self.poly = PolynomialFeatures(degree, include_bias=False)
53
        self.scaler = StandardScaler()
54
        
55
    def fit(self, X_train,y_train):
56
        ''' just fits the data. mapping and scaling are not repeated '''
57
        X_train_mapped = self.poly.fit_transform(X_train.reshape(-1,1))
58
        X_train_mapped_scaled = self.scaler.fit_transform(X_train_mapped)
59
        self.linear_model.fit(X_train_mapped_scaled, y_train )
60

61
    def predict(self, X):
62
        X_mapped = self.poly.transform(X.reshape(-1,1))
63
        X_mapped_scaled = self.scaler.transform(X_mapped)
64
        yhat = self.linear_model.predict(X_mapped_scaled)
65
        return(yhat)
66
    
67
    def mse(self, y, yhat):
68
        err = mean_squared_error(y,yhat)/2   #sklean doesn't have div by 2
69
        return (err)
70
     
71
def plt_train_test(X_train, y_train, X_test, y_test, x, y_pred, x_ideal, y_ideal, degree):
72
    fig, ax = plt.subplots(1,1, figsize=(4,4))
73
    fig.canvas.toolbar_visible = False
74
    fig.canvas.header_visible = False
75
    fig.canvas.footer_visible = False
76

77
    ax.set_title("Poor Performance on Test Data",fontsize = 12)
78
    ax.set_xlabel("x")
79
    ax.set_ylabel("y")
80

81
    ax.scatter(X_train, y_train, color = "red",           label="train")
82
    ax.scatter(X_test, y_test,       color = dlc["dlblue"], label="test")
83
    ax.set_xlim(ax.get_xlim())
84
    ax.set_ylim(ax.get_ylim())
85
    ax.plot(x, y_pred,  lw=0.5, label=f"predicted, degree={degree}")
86
    ax.plot(x_ideal, y_ideal, "--", color = "orangered", label="y_ideal", lw=1)
87
    ax.legend(loc='upper left')
88
    plt.tight_layout()
89
    plt.show()
90

91
def plt_optimal_degree(X_train, y_train, X_cv, y_cv, x, y_pred, x_ideal, y_ideal, err_train, err_cv, optimal_degree, max_degree):
92
    fig, ax = plt.subplots(1,2,figsize=(8,4))
93
    fig.canvas.toolbar_visible = False
94
    fig.canvas.header_visible = False
95
    fig.canvas.footer_visible = False
96

97
    ax[0].set_title("predictions vs data",fontsize = 12)
98
    ax[0].set_xlabel("x")
99
    ax[0].set_ylabel("y")
100

101
    ax[0].plot(x_ideal, y_ideal, "--", color = "orangered", label="y_ideal", lw=1)
102
    ax[0].scatter(X_train, y_train, color = "red",           label="train")
103
    ax[0].scatter(X_cv, y_cv,       color = dlc["dlorange"], label="cv")
104
    ax[0].set_xlim(ax[0].get_xlim())
105
    ax[0].set_ylim(ax[0].get_ylim())
106
    for i in range(0,max_degree):
107
        ax[0].plot(x, y_pred[:,i],  lw=0.5, label=f"{i+1}")
108
    ax[0].legend(loc='upper left')
109

110
    ax[1].set_title("error vs degree",fontsize = 12)
111
    cpts = list(range(1, max_degree+1))
112
    ax[1].plot(cpts, err_train[0:], marker='o',label="train error", lw=2,  color = dlc["dlblue"])
113
    ax[1].plot(cpts, err_cv[0:],    marker='o',label="cv error",  lw=2, color = dlc["dlorange"])
114
    ax[1].set_ylim(*ax[1].get_ylim())
115
    ax[1].axvline(optimal_degree, lw=1, color = dlc["dlmagenta"])
116
    ax[1].annotate("optimal degree", xy=(optimal_degree,80000),xycoords='data',
117
                xytext=(0.3, 0.8), textcoords='axes fraction', fontsize=10,
118
                   arrowprops=dict(arrowstyle="->", connectionstyle="arc3", 
119
                                   color=dlc['dldarkred'], lw=1))
120
    ax[1].set_xlabel("degree")
121
    ax[1].set_ylabel("error")
122
    ax[1].legend()
123
    fig.suptitle("Find Optimal Degree",fontsize = 12)
124
    plt.tight_layout()
125

126
    plt.show()
127
    
128
def plt_tune_regularization(X_train, y_train, X_cv, y_cv, x, y_pred, err_train, err_cv, optimal_reg_idx, lambda_range):
129
    fig, ax = plt.subplots(1,2,figsize=(8,4))
130
    fig.canvas.toolbar_visible = False
131
    fig.canvas.header_visible = False
132
    fig.canvas.footer_visible = False
133

134
    ax[0].set_title("predictions vs data",fontsize = 12)
135
    ax[0].set_xlabel("x")
136
    ax[0].set_ylabel("y")
137

138
    ax[0].scatter(X_train, y_train, color = "red",           label="train")
139
    ax[0].scatter(X_cv, y_cv,       color = dlc["dlorange"], label="cv")
140
    ax[0].set_xlim(ax[0].get_xlim())
141
    ax[0].set_ylim(ax[0].get_ylim())
142
#   ax[0].plot(x, y_pred[:,:],  lw=0.5, label=[f"$\lambda =${i}" for i in lambda_range])
143
    for i in (0,3,7,9):
144
        ax[0].plot(x, y_pred[:,i],  lw=0.5, label=f"$\lambda =${lambda_range[i]}")
145
    ax[0].legend()
146

147
    ax[1].set_title("error vs regularization",fontsize = 12)
148
    ax[1].plot(lambda_range, err_train[:], label="train error", color = dlc["dlblue"])
149
    ax[1].plot(lambda_range, err_cv[:],    label="cv error",    color = dlc["dlorange"])
150
    ax[1].set_xscale('log')
151
    ax[1].set_ylim(*ax[1].get_ylim())
152
    opt_x = lambda_range[optimal_reg_idx]
153
    ax[1].vlines(opt_x, *ax[1].get_ylim(), color = "black", lw=1)
154
    ax[1].annotate("optimal lambda", (opt_x,150000), xytext=(-80,10), textcoords="offset points",
155
                  arrowprops={'arrowstyle':'simple'})
156
    ax[1].set_xlabel("regularization (lambda)")
157
    ax[1].set_ylabel("error")
158
    fig.suptitle("Tuning Regularization",fontsize = 12)
159
    ax[1].text(0.05,0.44,"High\nVariance",fontsize=12, ha='left',transform=ax[1].transAxes,color = dlc["dlblue"])
160
    ax[1].text(0.95,0.44,"High\nBias",    fontsize=12, ha='right',transform=ax[1].transAxes,color = dlc["dlblue"])
161
    ax[1].legend(loc='upper left')
162
    plt.tight_layout()
163
    plt.show()
164

165
def tune_m():
166
    """ tune the number of examples to reduce overfitting """
167
    m = 50
168
    m_range = np.array(m*np.arange(1,16))
169
    num_steps = m_range.shape[0]
170
    degree = 16
171
    err_train = np.zeros(num_steps)     
172
    err_cv = np.zeros(num_steps)        
173
    y_pred = np.zeros((100,num_steps))     
174
    
175
    for i in range(num_steps):
176
        X, y, y_ideal, x_ideal = gen_data(m_range[i],5,0.7)
177
        x = np.linspace(0,int(X.max()),100)  
178
        X_train, X_, y_train, y_ = train_test_split(X,y,test_size=0.40, random_state=1)
179
        X_cv, X_test, y_cv, y_test = train_test_split(X_,y_,test_size=0.50, random_state=1)
180

181
        lmodel = lin_model(degree)  # no regularization
182
        lmodel.fit(X_train, y_train)
183
        yhat = lmodel.predict(X_train)
184
        err_train[i] = lmodel.mse(y_train, yhat)
185
        yhat = lmodel.predict(X_cv)
186
        err_cv[i] = lmodel.mse(y_cv, yhat)
187
        y_pred[:,i] = lmodel.predict(x)
188
    return(X_train, y_train, X_cv, y_cv, x, y_pred, err_train, err_cv, m_range,degree)
189

190
def plt_tune_m(X_train, y_train, X_cv, y_cv, x, y_pred, err_train, err_cv, m_range, degree):
191
    
192
    fig, ax = plt.subplots(1,2,figsize=(8,4))
193
    fig.canvas.toolbar_visible = False
194
    fig.canvas.header_visible = False
195
    fig.canvas.footer_visible = False
196

197
    ax[0].set_title("predictions vs data",fontsize = 12)
198
    ax[0].set_xlabel("x")
199
    ax[0].set_ylabel("y")
200

201
    ax[0].scatter(X_train, y_train, color = "red",           s=3, label="train", alpha=0.4)
202
    ax[0].scatter(X_cv, y_cv,       color = dlc["dlorange"], s=3, label="cv",    alpha=0.4)
203
    ax[0].set_xlim(ax[0].get_xlim())
204
    ax[0].set_ylim(ax[0].get_ylim())
205
    for i in range(0,len(m_range),3):
206
        ax[0].plot(x, y_pred[:,i],  lw=1, label=f"$m =${m_range[i]}")
207
    ax[0].legend(loc='upper left')
208
    ax[0].text(0.05,0.5,f"degree = {degree}", fontsize=10, ha='left',transform=ax[0].transAxes,color = dlc["dlblue"])
209

210
    ax[1].set_title("error vs number of examples",fontsize = 12)
211
    ax[1].plot(m_range, err_train[:], label="train error", color = dlc["dlblue"])
212
    ax[1].plot(m_range, err_cv[:],    label="cv error",    color = dlc["dlorange"])
213
    ax[1].set_xlabel("Number of Examples (m)")
214
    ax[1].set_ylabel("error")
215
    fig.suptitle("Tuning number of examples",fontsize = 12)
216
    ax[1].text(0.05,0.5,"High\nVariance",        fontsize=12, ha='left',transform=ax[1].transAxes,color = dlc["dlblue"])
217
    ax[1].text(0.95,0.5,"Good \nGeneralization", fontsize=12, ha='right',transform=ax[1].transAxes,color = dlc["dlblue"])
218
    ax[1].legend()
219
    plt.tight_layout()
220
    plt.show()  
221
    
222
dkcolors = plt.cm.Paired((1,3,7,9,5,11))
223
ltcolors = plt.cm.Paired((0,2,6,8,4,10))
224
dkcolors_map = mpl.colors.ListedColormap(dkcolors)
225
ltcolors_map = mpl.colors.ListedColormap(ltcolors)
226

227
def plt_mc_data(ax, X, y, classes,  class_labels=None, map=plt.cm.Paired, legend=False,size=50, m='o'):
228
    for i in range(classes):
229
        idx = np.where(y == i)
230
        col = len(idx[0])*[i]
231
        label = class_labels[i] if class_labels else "c{}".format(i)
232
        ax.scatter(X[idx, 0], X[idx, 1],  marker=m,
233
                    c=col, vmin=0, vmax=map.N, cmap=map,
234
                    s=size, label=label)
235
    if legend: ax.legend()
236
    ax.axis('equal')
237

238

239
#Plot a multi-class categorical decision boundary
240
# This version handles a non-vector prediction (adds a for-loop over points)
241
def plot_cat_decision_boundary(ax, X,predict , class_labels=None, legend=False, vector=True, color='g', lw = 1):
242

243
    # create a mesh to points to plot
244
    pad = 0.5
245
    x_min, x_max = X[:, 0].min() - pad, X[:, 0].max() + pad
246
    y_min, y_max = X[:, 1].min() - pad, X[:, 1].max() + pad
247
    h = max(x_max-x_min, y_max-y_min)/200
248
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
249
                         np.arange(y_min, y_max, h))
250
    points = np.c_[xx.ravel(), yy.ravel()]
251
    #print("points", points.shape)
252
    #make predictions for each point in mesh
253
    if vector:
254
        Z = predict(points)
255
    else:
256
        Z = np.zeros((len(points),))
257
        for i in range(len(points)):
258
            Z[i] = predict(points[i].reshape(1,2))
259
    Z = Z.reshape(xx.shape)
260

261
    #contour plot highlights boundaries between values - classes in this case
262
    ax.contour(xx, yy, Z, colors=color, linewidths=lw) 
263
    ax.axis('tight')
264

265
def recat(pt, origins):
266
    """ categorize a point based on distance from origin of clusters """
267
    nclusters = len(origins)
268
    min_dist = 10000
269
    y_new = None
270
    for j in range(nclusters):
271
        temp = origins[j] - pt.reshape(2,)
272
        #print(temp.shape,origins[j].shape)
273
        dist = np.sqrt(np.dot(temp.T, temp))
274
        if dist < min_dist:
275
            y_new = j
276
            min_dist = dist
277
    return(y_new)
278

279
def plt_train_eq_dist(X_train,y_train,classes, X_cv,   y_cv, centers, std):
280
    css = np.unique(y_train)
281
    fig,ax = plt.subplots(1,2,figsize=(8,4))
282
    fig.canvas.toolbar_visible = False
283
    fig.canvas.header_visible = False
284
    fig.canvas.footer_visible = False
285
    plt_mc_data(ax[0], X_train,y_train,classes, map=dkcolors_map, legend=True, size=50)
286
    plt_mc_data(ax[0], X_cv,   y_cv,   classes, map=ltcolors_map, legend=True, m="<")
287
    ax[0].set_title("Training, CV Data")
288
    for c in css:
289
        circ = plt.Circle(centers[c], 2*std, color=dkcolors_map(c), clip_on=False, fill=False, lw=0.5)
290
        ax[0].add_patch(circ)
291

292

293
    #make a model for plotting routines to call
294
    cat_predict = lambda pt: recat(pt.reshape(1,2), centers)
295
    plot_cat_decision_boundary(ax[1], X_train, cat_predict,  vector=False, color = dlc["dlmagenta"], lw=0.75)
296
    ax[1].set_title("ideal performance", fontsize=14)
297

298
    #add the original data to the decison boundary
299
    plt_mc_data(ax[1], X_train,y_train, classes, map=dkcolors_map, legend=True, size=50)
300
    ax[1].set_xlabel('x0') ; ax[1].set_ylabel("x1");
301
    plt.show()
302
    
303
    
304
def plt_nn(model_predict,X_train,y_train, classes, X_cv, y_cv, suptitle=""):
305
    #plot the decison boundary.
306
    fig,ax = plt.subplots(1,2, figsize=(8,4))
307
    fig.canvas.toolbar_visible = False
308
    fig.canvas.header_visible = False
309
    fig.canvas.footer_visible = False
310
    plot_cat_decision_boundary(ax[0], X_train, model_predict,  vector=True)
311
    ax[0].set_title("training data", fontsize=14)
312

313
    #add the original data to the decison boundary
314
    plt_mc_data(ax[0], X_train,y_train, classes, map=dkcolors_map, legend=True, size=75)
315
    ax[0].set_xlabel('x0') ; ax[0].set_ylabel("x1");
316

317
    plot_cat_decision_boundary(ax[1], X_train, model_predict,  vector=True)
318
    ax[1].set_title("cross-validation data", fontsize=14)
319
    plt_mc_data(ax[1], X_cv,y_cv, classes, 
320
                map=ltcolors_map, legend=True, size=100, m='<')
321
    ax[1].set_xlabel('x0') ; ax[1].set_ylabel("x1"); 
322
    fig.suptitle(suptitle,fontsize = 12)
323
    plt.show()
324

325

326
def eval_cat_err(y, yhat):
327
    """ 
328
    Calculate the categorization error
329
    Args:
330
      y    : (ndarray  Shape (m,) or (m,1))  target value of each example
331
      yhat : (ndarray  Shape (m,) or (m,1))  predicted value of each example
332
    Returns:|
333
      err: (scalar)             
334
    """
335
    m = len(y)
336
    incorrect = 0
337
    for i in range(m):
338
        if yhat[i] != y[i]:
339
            incorrect += 1
340
    err = incorrect/m
341
    return(err)
342

343
def plot_iterate(lambdas, models, X_train, y_train, X_cv, y_cv):
344
    err_train = np.zeros(len(lambdas))
345
    err_cv = np.zeros(len(lambdas))
346
    for i in range(len(models)):
347
        err_train[i] = eval_cat_err(y_train,np.argmax( models[i](X_train), axis=1))
348
        err_cv[i] = eval_cat_err(y_cv, np.argmax( models[i](X_cv), axis=1))
349

350
    fig, ax = plt.subplots(1,1,figsize=(6,4))
351
    fig.canvas.toolbar_visible = False
352
    fig.canvas.header_visible = False
353
    fig.canvas.footer_visible = False
354
    ax.set_title("error vs regularization",fontsize = 12)
355
    ax.plot(lambdas, err_train, marker='o', label="train error", color = dlc["dlblue"])
356
    ax.plot(lambdas, err_cv,    marker='o', label="cv error",    color = dlc["dlorange"])
357
    ax.set_xscale('log')
358
    ax.set_ylim(*ax.get_ylim())
359
    ax.set_xlabel("Regularization (lambda)",fontsize = 14)
360
    ax.set_ylabel("Error",fontsize = 14)
361
    ax.legend()
362
    fig.suptitle("Tuning Regularization",fontsize = 14)
363
    ax.text(0.05,0.14,"Training Error\nlower than CV",fontsize=12, ha='left',transform=ax.transAxes,color = dlc["dlblue"])
364
    ax.text(0.95,0.14,"Similar\nTraining, CV",    fontsize=12, ha='right',transform=ax.transAxes,color = dlc["dlblue"])
365
    plt.show()
366
 
367
# not used but will calculate the erro assuming an equal distance
368
def err_all_equal(X_train,X_cv,X_test, y_train,y_cv,y_test, centers):
369
    X_all = np.concatenate((X_train,X_cv,X_test), axis=0)
370
    y_all = np.concatenate((y_train,y_cv,y_test), axis=0)
371
    m = len(X_all)
372
    y_eq  = np.zeros(m)
373
    for i in range(m):
374
        y_eq[i] = recat(X_all[i], centers)
375
    err_all = eval_cat_err(y_all, y_eq)
376
    return(err_all)
377

378
def plt_compare(X,y, classes, simple, regularized, centers):
379
    plt.close("all")
380
    fig,ax = plt.subplots(1,3, figsize=(8,3))
381
    fig.canvas.toolbar_visible = False
382
    fig.canvas.header_visible = False
383
    fig.canvas.footer_visible = False
384

385
  #plt simple   
386
    plot_cat_decision_boundary(ax[0], X, simple,  vector=True)
387
    ax[0].set_title("Simple Model", fontsize=14)
388
    plt_mc_data(ax[0], X,y, classes, map=dkcolors_map, legend=True, size=75)
389
    ax[0].set_xlabel('x0') ; ax[0].set_ylabel("x1");
390

391
  #plt regularized   
392
    plot_cat_decision_boundary(ax[1], X, regularized,  vector=True)
393
    ax[1].set_title("Regularized Model", fontsize=14)
394
    plt_mc_data(ax[1], X,y, classes, map=dkcolors_map, legend=True, size=75)
395
    ax[1].set_xlabel('x0') ; ax[0].set_ylabel("x1");
396

397
  #plt ideal
398
    cat_predict = lambda pt: recat(pt.reshape(1,2), centers)
399
    plot_cat_decision_boundary(ax[2], X, cat_predict,  vector=False)
400
    ax[2].set_title("Ideal Model", fontsize=14)
401
    plt_mc_data(ax[2], X,y, classes, map=dkcolors_map, legend=True, size=75)
402
    ax[2].set_xlabel('x0') ; ax[0].set_ylabel("x1");
403

404
    err_s = eval_cat_err(y, simple(X))
405
    err_r = eval_cat_err(y, regularized(X))
406
    ax[0].text(-2.75,3,f"err_test={err_s:0.2f}", fontsize=12)
407
    ax[1].text(-2.75,3,f"err_test={err_r:0.2f}", fontsize=12)
408
    m = len(X)
409
    y_eq  = np.zeros(m)
410
    for i in range(m):
411
        y_eq[i] = recat(X[i], centers)
412
    err_eq = eval_cat_err(y, y_eq)
413
    ax[2].text(-2.75,3,f"err_test={err_eq:0.2f}", fontsize=12)
414
    plt.show()
415

416
# --- End Assignment ----------------------------------------
417

418
Product

Resources

Company