Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
probml
GitHub Repository: probml/pyprobml
Path: blob/master/deprecated/scripts/ard_vb_linreg_logreg.py
1192 views
1
# Bayesian Linear and logistic Regression with ARD prior (fitted with Variational Bayes)
2
3
#https://github.com/AmazaspShumik/sklearn-bayes/blob/master/skbayes/rvm_ard_models/vrvm.py
4
5
import superimport
6
7
import numpy as np
8
from sklearn.externals import six
9
from scipy.special import expit
10
from scipy.linalg import solve_triangular
11
from sklearn.linear_model.base import LinearModel, LinearClassifierMixin
12
from sklearn.base import RegressorMixin, BaseEstimator
13
from sklearn.utils import check_X_y,check_array
14
from sklearn.metrics.pairwise import pairwise_kernels
15
from sklearn.utils.validation import check_is_fitted
16
from sklearn.utils.multiclass import check_classification_targets
17
from sklearn.utils import as_float_array
18
import warnings
19
20
21
class VBRegressionARD(LinearModel,RegressorMixin):
22
'''
23
Bayesian Linear Regression with ARD prior (fitted with Variational Bayes)
24
25
Parameters
26
----------
27
n_iter: int, optional (DEFAULT = 100)
28
Maximum number of iterations
29
fit_intercept : boolean, optional (DEFAULT = True)
30
If True, intercept will be used in computation
31
32
tol: float, optional (DEFAULT = 1e-3)
33
If absolute change in precision parameter for weights is below threshold
34
algorithm terminates.
35
36
copy_X : boolean, optional (DEFAULT = True)
37
If True, X will be copied, otherwise it will be overwritten.
38
39
verbose : boolean, optional (DEFAULT = True)
40
Verbose mode when fitting the model
41
42
a: float, optional, (DEFAULT = 1e-5)
43
Shape parameters for Gamma distributed precision of weights
44
45
b: float, optional, (DEFAULT = 1e-5)
46
Rate parameter for Gamma distributed precision of weights
47
48
c: float, optional, (DEFAULT = 1e-5)
49
Shape parameter for Gamma distributed precision of noise
50
51
d: float, optional, (DEFAULT = 1e-5)
52
Rate parameter for Gamma distributed precision of noise
53
54
prune_thresh: float, ( DEFAULT = 1e-3 )
55
Threshold for pruning out variable (applied after model is fitted)
56
57
58
Attributes
59
----------
60
coef_ : array, shape = (n_features)
61
Coefficients of the regression model (mean of posterior distribution)
62
active_ : array, dtype = np.bool, shape = (n_features)
63
True for non-zero coefficients, False otherwise
64
sigma_ : array, shape = (n_features, n_features)
65
Estimated covariance matrix of the weights, computed only
66
for non-zero coefficients
67
68
Reference:
69
----------
70
[1] Bishop & Tipping (2000), Variational Relevance Vector Machine
71
[2] Jan Drugowitch (2014), Variational Bayesian Inference for Bayesian Linear
72
and Logistic Regression
73
[3] Bishop (2006) Pattern Recognition and Machine Learning (ch. 7)
74
'''
75
def __init__(self, n_iter = 100, tol = 1e-3, fit_intercept = True,
76
a = 1e-5, b = 1e-5, c = 1e-5, d = 1e-5, copy_X = True,
77
prune_thresh = 1e-3, verbose = False):
78
self.n_iter = n_iter
79
self.tol = tol
80
self.fit_intercept = fit_intercept
81
self.a,self.b = a,b
82
self.c,self.d = c,d
83
self.copy_X = copy_X
84
self.verbose = verbose
85
self.prune_thresh = prune_thresh
86
87
88
def _center_data(self,X,y):
89
''' Centers data'''
90
X = as_float_array(X,self.copy_X)
91
# normalisation should be done in preprocessing!
92
X_std = np.ones(X.shape[1], dtype = X.dtype)
93
if self.fit_intercept:
94
X_mean = np.average(X,axis = 0)
95
y_mean = np.average(y,axis = 0)
96
X -= X_mean
97
y = y - y_mean
98
else:
99
X_mean = np.zeros(X.shape[1],dtype = X.dtype)
100
y_mean = 0. if y.ndim == 1 else np.zeros(y.shape[1], dtype=X.dtype)
101
return X,y, X_mean, y_mean, X_std
102
103
104
def fit(self,X,y):
105
'''
106
Fits variational relevance ARD regression
107
108
Parameters
109
-----------
110
X: array-like of size [n_samples, n_features]
111
Training data, matrix of explanatory variables
112
113
y: array-like of size [n_samples, n_features]
114
Target values
115
116
Returns
117
-------
118
self : object
119
Returns self.
120
'''
121
# precompute some values for faster iterations
122
X, y = check_X_y(X, y, dtype=np.float64, y_numeric=True)
123
n_samples, n_features = X.shape
124
X, y, X_mean, y_mean, X_std = self._center_data(X, y)
125
XX = np.dot(X.T,X)
126
XY = np.dot(X.T,y)
127
Y2 = np.sum(y**2)
128
129
# final update for a and c
130
a = (self.a + 0.5) * np.ones(n_features, dtype = np.float)
131
c = (self.c + 0.5 * n_samples) #* np.ones(n_features, dtype = np.float)
132
# initial values of b,d before mean field approximation
133
d = self.d #* np.ones(n_features, dtype = np.float)
134
b = self.b * np.ones(n_features, dtype = np.float)
135
active = np.ones(n_features, dtype = np.bool)
136
w0 = np.zeros(n_features)
137
w = np.copy(w0)
138
139
for i in range(self.n_iter):
140
# ---------------------- update q(w) -----------------------
141
142
# calculate expectations for precision of noise & precision of weights
143
e_tau = c / d
144
e_A = a / b
145
XXa = XX[active,:][:,active]
146
XYa = XY[active]
147
Xa = X[:,active]
148
# parameters of updated posterior distribution
149
w[active],Ri = self._posterior_weights(XXa,XYa,e_tau,e_A[active])
150
151
# --------------------- update q(tau) ------------------------
152
# update rate parameter for Gamma distributed precision of noise
153
XSX = np.sum( np.dot(Xa,Ri.T)**2)
154
XMw = np.sum( np.dot(Xa,w[active])**2 )
155
XYw = np.sum( w[active]*XYa )
156
d = self.d + 0.5*(Y2 + XMw + XSX) - XYw
157
158
# -------------------- update q(alpha(j)) for each j ----------
159
# update rate parameter for Gamma distributed precision of weights
160
b[active] = self.b + 0.5*(w[active]**2 + np.sum(Ri**2,axis = 1))
161
162
# -------------------- check convergence ----------------------
163
# determine relevant vector as is described in Bishop & Tipping
164
# (i.e. using mean of posterior distribution)
165
active = np.abs(w) > self.prune_thresh
166
167
# make sure there is at least one relevant feature
168
if np.sum(active) == 0:
169
active[np.argmax(np.abs(w))] = True
170
# all irrelevant features are forced to zero
171
w[~active] = 0
172
# check convergence
173
if np.sum(abs(w-w0) > self.tol) == 0 or i==self.n_iter-1:
174
break
175
w0 = np.copy(w)
176
# if only one relevant feature => terminate
177
if np.sum(active)== 1:
178
if X.shape[1] > 3 and self.prune_thresh > 1e-1:
179
warnings.warn(("Only one relevant feature found! it can be useful to decrease"
180
"value for parameter prune_thresh"))
181
break
182
183
# update parameters after last update
184
e_tau = c / d
185
e_A = a / b
186
XXa = XX[active,:][:,active]
187
XYa = XY[active]
188
w[active], self.sigma_ = self._posterior_weights(XXa,XYa,e_tau,e_A[active],True)
189
self._e_tau_ = e_tau
190
self.coef_ = w
191
self._set_intercept(X_mean,y_mean,X_std)
192
self.active_ = active
193
return self
194
195
196
197
def predict_dist(self,X):
198
'''
199
Computes predictive distribution for test set.
200
Predictive distribution for each data point is one dimensional
201
Gaussian and therefore is characterised by mean and standard
202
deviation.
203
204
Parameters
205
-----------
206
X: {array-like, sparse} [n_samples_test, n_features]
207
Test data, matrix of explanatory variables
208
209
Returns
210
-------
211
y_hat: numpy array of size (n_samples_test,)
212
Estimated values of targets on test set (Mean of predictive distribution)
213
214
var_hat: numpy array of size (n_samples_test,)
215
Error bounds (Standard deviation of predictive distribution)
216
'''
217
y_hat = self._decision_function(X)
218
data_noise = 1./self._e_tau_
219
model_noise = np.sum( np.dot(X[:,self.active_],self.sigma_) * X[:,self.active_], axis = 1)
220
var_hat = data_noise + model_noise
221
return y_hat, var_hat
222
223
224
225
def _posterior_weights(self, XX, XY, exp_tau, exp_A, full_covar = False):
226
'''
227
Calculates parameters of posterior distribution of weights
228
229
Parameters:
230
-----------
231
X: numpy array of size n_features
232
Matrix of active features (changes at each iteration)
233
234
XY: numpy array of size [n_features]
235
Dot product of X and Y (for faster computations)
236
exp_tau: float
237
Mean of precision parameter of noise
238
239
exp_A: numpy array of size n_features
240
Vector of precisions for weights
241
242
Returns:
243
--------
244
[Mw, Sigma]: list of two numpy arrays
245
246
Mw: mean of posterior distribution
247
Sigma: covariance matrix
248
'''
249
# compute precision parameter
250
S = exp_tau*XX
251
np.fill_diagonal(S, np.diag(S) + exp_A)
252
253
# cholesky decomposition
254
R = np.linalg.cholesky(S)
255
256
# find mean of posterior distribution
257
RtMw = solve_triangular(R, exp_tau*XY, lower = True, check_finite = False)
258
Mw = solve_triangular(R.T, RtMw, lower = False, check_finite = False)
259
260
# use cholesky decomposition of S to find inverse ( or diagonal of inverse)
261
Ri = solve_triangular(R, np.eye(R.shape[1]), lower = True, check_finite = False)
262
if full_covar:
263
Sigma = np.dot(Ri.T,Ri)
264
return [Mw,Sigma]
265
else:
266
return [Mw,Ri]
267
268
269
#---------------------- Classification ---------------------------------------------
270
271
272
273
def lam(eps):
274
'''
275
Calculates lambda eps [part of local variational approximation
276
to sigmoid function]
277
'''
278
return 0.5 / eps * ( expit(eps) - 0.5 )
279
280
281
class VBClassificationARD(LinearClassifierMixin, BaseEstimator):
282
'''
283
Variational Bayesian Logistic Regression with local variational approximation.
284
285
286
Parameters:
287
-----------
288
n_iter: int, optional (DEFAULT = 50 )
289
Maximum number of iterations
290
291
tol: float, optional (DEFAULT = 1e-3)
292
Convergence threshold, if cange in coefficients is less than threshold
293
algorithm is terminated
294
295
fit_intercept: bool, optinal ( DEFAULT = True )
296
If True uses bias term in model fitting
297
298
a: float, optional (DEFAULT = 1e-6)
299
Rate parameter for Gamma prior on precision parameter of coefficients
300
301
b: float, optional (DEFAULT = 1e-6)
302
Shape parameter for Gamma prior on precision parameter of coefficients
303
304
prune_thresh: float, optional (DEFAULT = 1e-4)
305
Threshold for pruning out variable (applied after model is fitted)
306
307
verbose: bool, optional (DEFAULT = False)
308
Verbose mode
309
310
311
Attributes
312
----------
313
coef_ : array, shape = (n_features)
314
Coefficients of the regression model (mean of posterior distribution)
315
sigma_ : array, shape = (n_features, n_features)
316
estimated covariance matrix of the weights, computed only
317
for non-zero coefficients
318
319
intercept_: array, shape = (n_features)
320
intercepts
321
322
active_ : array, dtype = np.bool, shape = (n_features)
323
True for non-zero coefficients, False otherwise
324
References:
325
-----------
326
[1] Bishop 2006, Pattern Recognition and Machine Learning ( Chapter 7,10 )
327
[2] Murphy 2012, Machine Learning A Probabilistic Perspective ( Chapter 14,21 )
328
[3] Bishop & Tipping 2000, Variational Relevance Vector Machine
329
'''
330
def __init__(self, n_iter = 100, tol = 1e-3, fit_intercept = True,
331
a = 1e-4, b = 1e-4, prune_thresh = 1e-4, verbose = True):
332
self.n_iter = n_iter
333
self.tol = tol
334
self.verbose = verbose
335
self.prune_thresh = prune_thresh
336
self.fit_intercept = fit_intercept
337
self.a = a
338
self.b = b
339
340
341
def fit(self,X,y):
342
'''
343
Fits variational Bayesian Logistic Regression with local variational bound
344
345
Parameters
346
----------
347
X: array-like of size (n_samples, n_features)
348
Matrix of explanatory variables
349
350
y: array-like of size (n_samples,)
351
Vector of dependent variables
352
Returns
353
-------
354
self: object
355
self
356
'''
357
# preprocess data
358
X,y = check_X_y( X, y , dtype = np.float64)
359
check_classification_targets(y)
360
self.classes_ = np.unique(y)
361
n_classes = len(self.classes_)
362
363
# take into account bias term if required
364
n_samples, n_features = X.shape
365
n_features = n_features + int(self.fit_intercept)
366
if self.fit_intercept:
367
X = np.hstack( (np.ones([n_samples,1]),X))
368
369
# handle multiclass problems using One-vs-Rest
370
if n_classes < 2:
371
raise ValueError("Need samples of at least 2 classes")
372
if n_classes > 2:
373
self.coef_, self.sigma_ = [0]*n_classes,[0]*n_classes
374
self.intercept_ = [0]*n_classes
375
self.active_ = [0]*n_classes
376
else:
377
self.coef_, self.sigma_, self.intercept_ = [0],[0],[0]
378
self.active_ = [0]
379
380
# hyperparameters of precision for weights
381
a = self.a + 0.5 * np.ones(n_features)
382
b = self.b * np.ones(n_features)
383
384
for i in range(len(self.coef_)):
385
if n_classes == 2:
386
pos_class = self.classes_[1]
387
else:
388
pos_class = self.classes_[i]
389
mask = (y == pos_class)
390
y_bin = np.ones(y.shape)
391
y_bin[~mask] = 0
392
coef_, sigma_, intercept_, active_ = self._fit(X,y_bin,a,b)
393
self.coef_[i] = coef_
394
self.intercept_[i] = intercept_
395
self.sigma_[i] = sigma_
396
self.active_[i] = active_
397
398
self.coef_ = np.asarray(self.coef_)
399
return self
400
401
402
def predict_proba(self,x):
403
'''
404
Predicts probabilities of targets for test set
405
406
Parameters
407
----------
408
X: array-like of size [n_samples_test,n_features]
409
Matrix of explanatory variables (test set)
410
411
Returns
412
-------
413
probs: numpy array of size [n_samples_test]
414
Estimated probabilities of target classes
415
'''
416
scores = self.decision_function(x)
417
if self.fit_intercept:
418
x = np.hstack( (np.ones([x.shape[0],1]),x))
419
var = [np.sum(np.dot(x[:,a],s)*x[:,a],axis = 1) for a,s in zip(self.active_,self.sigma_)]
420
sigma = np.asarray(var)
421
ks = 1. / ( 1. + np.pi*sigma / 8)**0.5
422
probs = expit(scores.T*ks).T
423
if probs.shape[1] == 1:
424
probs = np.hstack([1 - probs, probs])
425
else:
426
probs /= np.reshape(np.sum(probs, axis = 1), (probs.shape[0],1))
427
return probs
428
429
430
def _fit(self,X,y,a,b):
431
'''
432
Fits single classifier for each class (for OVR framework)
433
'''
434
eps = 1 # default starting parameter for Jaakola Jordan bound
435
w0 = np.zeros(X.shape[1])
436
w = np.copy(w0)
437
active = np.ones(X.shape[1], dtype = np.bool)
438
XY = np.dot(X.T, y - 0.5)
439
440
for i in range(self.n_iter):
441
# In the E-step we update approximation of
442
# posterior distribution q(w,alpha) = q(w)*q(alpha)
443
444
# --------- update q(w) ------------------
445
l = lam(eps)
446
Xa = X[:,active]
447
XYa = XY[active] #np.dot(Xa.T,(y-0.5))
448
w[active],Ri = u,v = self._posterior_dist(Xa,l,a[active],b[active],XYa)
449
450
# -------- update q(alpha) ---------------
451
b[active] = self.b + 0.5*(w[active]**2 + np.sum(Ri**2,1))
452
453
# -------- update eps ------------
454
# In the M-step we update parameter eps which controls
455
# accuracy of local variational approximation to lower bound
456
XMX = np.dot(Xa,w[active])**2
457
XSX = np.sum( np.dot(Xa,Ri.T)**2, axis = 1)
458
eps = np.sqrt( XMX + XSX )
459
460
# determine relevant vector as is described in Bishop & Tipping
461
# (i.e. using mean of posterior distribution)
462
active = np.abs(w) > self.prune_thresh
463
464
# do not prune intercept & make sure there is at least one 'relevant feature'.
465
# If only one relevant feature , then choose rv with largest posterior mean
466
if self.fit_intercept:
467
active[0] = True
468
if np.sum(active[1:]) == 0:
469
active[np.argmax(np.abs(w[1:]))] = True
470
else:
471
if np.sum(active) == 0:
472
active[np.argmax(np.abs(w))] = True
473
# all irrelevant features are forced to zero
474
w[~active] = 0
475
# check convergence
476
if np.sum(abs(w-w0) > self.tol) == 0 or i==self.n_iter-1:
477
break
478
w0 = np.copy(w)
479
# if only one relevant feature => terminate
480
if np.sum(active) - 1*self.fit_intercept == 1:
481
if X.shape[1] > 3 and self.prune_thresh > 1e-1:
482
warnings.warn(("Only one relevant feature found! it can be useful to decrease"
483
"value for parameter prune_thresh"))
484
break
485
486
l = lam(eps)
487
Xa = X[:,active]
488
XYa = np.dot(Xa.T,(y-0.5))
489
w[active] , sigma_ = self._posterior_dist(Xa,l,a[active],b[active],XYa,True)
490
491
# separate intercept & coefficients
492
intercept_ = 0
493
if self.fit_intercept:
494
intercept_ = w[0]
495
coef_ = np.copy(w[1:])
496
else:
497
coef_ = w
498
return coef_, sigma_ , intercept_, active
499
500
501
def _posterior_dist(self,X,l,a,b,XY,full_covar = False):
502
'''
503
Finds gaussian approximation to posterior of coefficients
504
'''
505
sigma_inv = 2*np.dot(X.T*l,X)
506
alpha_vec = a / b
507
if self.fit_intercept:
508
alpha_vec[0] = np.finfo(np.float64).eps
509
np.fill_diagonal(sigma_inv, np.diag(sigma_inv) + alpha_vec)
510
R = np.linalg.cholesky(sigma_inv)
511
Z = solve_triangular(R,XY, lower = True)
512
mean_ = solve_triangular(R.T,Z,lower = False)
513
Ri = solve_triangular(R,np.eye(X.shape[1]), lower = True)
514
if full_covar:
515
sigma_ = np.dot(Ri.T,Ri)
516
return mean_ , sigma_
517
else:
518
return mean_ , Ri
519
520
521
522