# GitHub Repository: probml/pyprobml
# Path: blob/master/deprecated/scripts/ard_linreg_logreg.py

# Linear and Logistic Regression with Automatic Relevance Determination
# (fast version, uses Sparse Bayesian Learning)

# Adapted from
# https://github.com/AmazaspShumik/sklearn-bayes/blob/master/skbayes/rvm_ard_models/fast_rvm.py

import superimport

import numpy as np
from sklearn.base import RegressorMixin, BaseEstimator
#from sklearn.externals import six
try:
    from sklearn.linear_model.base import LinearModel, LinearClassifierMixin
except ImportError:
    # newer versions of scikit-learn moved these base classes to a private module
    from sklearn.linear_model._base import LinearModel, LinearClassifierMixin
from sklearn.utils import check_X_y, check_array, as_float_array
from sklearn.utils.multiclass import check_classification_targets
from sklearn.utils.extmath import log_logistic, safe_sparse_dot
from sklearn.metrics.pairwise import pairwise_kernels
from sklearn.utils.validation import check_is_fitted
from scipy.special import expit
from scipy.optimize import fmin_l_bfgs_b
from scipy.linalg import solve_triangular, pinvh  # pinvh is no longer exported by sklearn.utils.extmath
from scipy.stats import logistic
from numpy.linalg import LinAlgError
import scipy.sparse
import warnings

# TODO: predict_proba for RVC with Laplace Approximation


def update_precisions(Q, S, q, s, A, active, tol, n_samples, clf_bias):
    '''
    Selects one feature to be added to / recomputed in / deleted from the model,
    based on the effect this will have on the log marginal likelihood.
    '''
    # initialise vector holding changes in log marginal likelihood
    deltaL = np.zeros(Q.shape[0])

    # identify features that can be added, recomputed or deleted
    theta     = q**2 - s
    add       = (theta > 0) * (active == False)
    recompute = (theta > 0) * (active == True)
    delete    = ~(add + recompute)

    # compute sparsity & quality parameters corresponding to features in
    # the three groups identified above
    Qadd, Sadd       = Q[add], S[add]
    Qrec, Srec, Arec = Q[recompute], S[recompute], A[recompute]
    Qdel, Sdel, Adel = Q[delete], S[delete], A[delete]

    # compute new alpha's (precision parameters) for features that are
    # currently in the model and will be recomputed
    Anew        = s[recompute]**2 / (theta[recompute] + np.finfo(np.float32).eps)
    delta_alpha = (1. / Anew - 1. / Arec)

    # compute change in log marginal likelihood
    deltaL[add]       = (Qadd**2 - Sadd) / Sadd + np.log(Sadd / Qadd**2)
    denom             = np.maximum(1e-5, 1 + Srec * delta_alpha)  # Kevin Murphy hack
    deltaL[recompute] = Qrec**2 / (Srec + 1. / delta_alpha) - np.log(denom)
    deltaL[delete]    = Qdel**2 / (Sdel - Adel) - np.log(1 - Sdel / Adel)
    deltaL            = deltaL / n_samples

    # find feature which caused largest change in likelihood
    feature_index = np.argmax(deltaL)

    # no deletions or additions
    same_features = np.sum(theta[~recompute] > 0) == 0

    # changes in precision for features already in the model are below threshold
    no_delta = np.sum(abs(Anew - Arec) > tol) == 0

    # check convergence: if there are no features to add or delete and the change
    # in precision for current features is small, then terminate
    converged = False
    if same_features and no_delta:
        converged = True
        return [A, converged]

    # if not converged update precision parameter of weights and return
    if theta[feature_index] > 0:
        A[feature_index] = s[feature_index]**2 / theta[feature_index]
        if active[feature_index] == False:
            active[feature_index] = True
    else:
        # at least two active features
        if active[feature_index] == True and np.sum(active) >= 2:
            # do not remove bias term in classification
            # (in regression it is factored in through centering)
            if not (feature_index == 0 and clf_bias):
                active[feature_index] = False
                A[feature_index] = np.inf

    return [A, converged]

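
# Illustrative sketch (added, not part of the original script): the selection
# rule above is driven by theta = q**2 - s.  A candidate feature is worth having
# in the model when theta > 0 and is pruned otherwise (Tipping & Faul 2003).
# The toy numbers below are made-up values used only for demonstration.
def _demo_update_precisions_groups():
    q      = np.array([3.0, 0.4, 2.0])            # quality parameters
    s      = np.array([1.0, 1.0, 1.0])            # sparsity parameters
    active = np.array([False, True, True])        # current membership in the model
    theta  = q**2 - s
    add       = (theta > 0) * (active == False)   # [True, False, False] -> add feature 0
    recompute = (theta > 0) * (active == True)    # [False, False, True] -> recompute feature 2
    delete    = ~(add + recompute)                # [False, True, False] -> delete feature 1
    return add, recompute, delete
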

###############################################################################
# ARD REGRESSION AND CLASSIFICATION
###############################################################################


#-------------------------- Regression ARD ------------------------------------

class RegressionARD(LinearModel, RegressorMixin):
    '''
    Regression with Automatic Relevance Determination (Fast Version uses
    Sparse Bayesian Learning)

    Parameters
    ----------
    n_iter: int, optional (DEFAULT = 300)
        Maximum number of iterations

    tol: float, optional (DEFAULT = 1e-3)
        If absolute change in precision parameter for weights is below threshold
        algorithm terminates.

    fit_intercept : boolean, optional (DEFAULT = True)
        whether to calculate the intercept for this model. If set
        to false, no intercept will be used in calculations
        (e.g. data is expected to be already centered).

    copy_X : boolean, optional (DEFAULT = True)
        If True, X will be copied; else, it may be overwritten.

    verbose : boolean, optional (DEFAULT = False)
        Verbose mode when fitting the model


    Attributes
    ----------
    coef_ : array, shape = (n_features)
        Coefficients of the regression model (mean of posterior distribution)

    alpha_ : float
        estimated precision of the noise

    active_ : array, dtype = bool, shape = (n_features)
        True for non-zero coefficients, False otherwise

    lambda_ : array, shape = (n_features)
        estimated precisions of the coefficients

    sigma_ : array, shape = (n_features, n_features)
        estimated covariance matrix of the weights, computed only
        for non-zero coefficients


    References
    ----------
    [1] Fast marginal likelihood maximisation for sparse Bayesian models (Tipping & Faul 2003)
        (http://www.miketipping.com/papers/met-fastsbl.pdf)
    [2] Analysis of sparse Bayesian learning (Tipping & Faul 2001)
        (http://www.miketipping.com/abstracts.htm#Faul:NIPS01)
    '''

    def __init__(self, n_iter=300, tol=1e-3, fit_intercept=True,
                 copy_X=True, verbose=False):
        self.n_iter        = n_iter
        self.tol           = tol
        self.scores_       = list()
        self.fit_intercept = fit_intercept
        self.copy_X        = copy_X
        self.verbose       = verbose


    def _center_data(self, X, y):
        ''' Centers data '''
        X = as_float_array(X, copy=self.copy_X)
        # normalisation should be done in preprocessing!
        X_std = np.ones(X.shape[1], dtype=X.dtype)
        if self.fit_intercept:
            X_mean = np.average(X, axis=0)
            y_mean = np.average(y, axis=0)
            X     -= X_mean
            y      = y - y_mean
        else:
            X_mean = np.zeros(X.shape[1], dtype=X.dtype)
            y_mean = 0. if y.ndim == 1 else np.zeros(y.shape[1], dtype=X.dtype)
        return X, y, X_mean, y_mean, X_std


    def fit(self, X, y):
        '''
        Fits ARD Regression with Sequential Sparse Bayes Algorithm.

        Parameters
        -----------
        X: {array-like, sparse matrix} of size (n_samples, n_features)
           Training data, matrix of explanatory variables

        y: array-like of size [n_samples]
           Target values

        Returns
        -------
        self : object
            Returns self.
        '''
        X, y = check_X_y(X, y, dtype=np.float64, y_numeric=True)
        X, y, X_mean, y_mean, X_std = self._center_data(X, y)
        n_samples, n_features = X.shape

        # precompute X'*Y , X'*X for faster iterations & allocate memory for
        # sparsity & quality vectors
        XY  = np.dot(X.T, y)
        XX  = np.dot(X.T, X)
        XXd = np.diag(XX)

        # initialise precision of noise and coefficients
        var_y = np.var(y)

        # check that variance is non zero !!!
        if var_y == 0:
            beta = 1e-2
        else:
            beta = 1. / np.var(y)

        A      = np.inf * np.ones(n_features)
        active = np.zeros(n_features, dtype=bool)

        # in case of almost perfect multicollinearity between some features
        # start from feature 0
        if np.sum(XXd - X_mean**2 < np.finfo(np.float32).eps) > 0:
            A[0]      = np.finfo(np.float16).eps
            active[0] = True
        else:
            # start from a single basis vector with largest projection on targets
            proj          = XY**2 / XXd
            start         = np.argmax(proj)
            active[start] = True
            A[start]      = XXd[start] / (proj[start] - var_y)

        warning_flag = 0
        for i in range(self.n_iter):
            XXa = XX[active, :][:, active]
            XYa = XY[active]
            Aa  = A[active]

            # mean & covariance of posterior distribution
            Mn, Ri, cholesky = self._posterior_dist(Aa, beta, XXa, XYa)
            if cholesky:
                Sdiag = np.sum(Ri**2, 0)
            else:
                Sdiag = np.copy(np.diag(Ri))
                warning_flag += 1

            # raise warning in case cholesky fails
            if warning_flag == 1:
                warnings.warn(("Cholesky decomposition failed! Algorithm uses pinvh, "
                               "which is significantly slower. If you use RVR it "
                               "is advised to change the parameters of the kernel."))

            # compute quality & sparsity parameters
            s, q, S, Q = self._sparsity_quality(XX, XXd, XY, XYa, Aa, Ri, active, beta, cholesky)

            # update precision parameter for noise distribution
            rss   = np.sum((y - np.dot(X[:, active], Mn))**2)
            beta  = n_samples - np.sum(active) + np.sum(Aa * Sdiag)
            beta /= (rss + np.finfo(np.float32).eps)

            # update precision parameters of coefficients
            A, converged = update_precisions(Q, S, q, s, A, active, self.tol,
                                             n_samples, False)
            if self.verbose:
                print(('Iteration: {0}, number of features '
                       'in the model: {1}').format(i, np.sum(active)))
            if converged or i == self.n_iter - 1:
                if converged and self.verbose:
                    print('Algorithm converged!')
                break

        # after the last update of alpha & beta, update parameters
        # of the posterior distribution
        XXa, XYa, Aa = XX[active, :][:, active], XY[active], A[active]
        Mn, Sn, cholesky = self._posterior_dist(Aa, beta, XXa, XYa, True)
        self.coef_         = np.zeros(n_features)
        self.coef_[active] = Mn
        self.sigma_        = Sn
        self.active_       = active
        self.lambda_       = A
        self.alpha_        = beta
        self._set_intercept(X_mean, y_mean, X_std)
        return self


    def predict_dist(self, X):
        '''
        Computes predictive distribution for test set.
        Predictive distribution for each data point is one dimensional
        Gaussian and therefore is characterised by mean and variance.

        Parameters
        -----------
        X: {array-like, sparse} (n_samples_test, n_features)
           Test data, matrix of explanatory variables

        Returns
        -------
        : list of length two [y_hat, var_hat]

             y_hat: numpy array of size (n_samples_test,)
                    Estimated values of targets on test set (i.e. mean of predictive
                    distribution)

             var_hat: numpy array of size (n_samples_test,)
                    Variance of predictive distribution
        '''
        y_hat    = self._decision_function(X)
        var_hat  = 1. / self.alpha_
        var_hat += np.sum(np.dot(X[:, self.active_], self.sigma_) * X[:, self.active_], axis=1)
        return y_hat, var_hat


    def _posterior_dist(self, A, beta, XX, XY, full_covar=False):
        '''
        Calculates mean and covariance matrix of posterior distribution
        of coefficients.
        '''
        # compute precision matrix for active features
        Sinv = beta * XX
        np.fill_diagonal(Sinv, np.diag(Sinv) + A)
        cholesky = True
        # try cholesky, if it fails fall back on pinvh
        try:
            # find posterior mean : R*R.T*mean = beta*X.T*Y
            # solve(R*z = beta*X.T*Y) => find z => solve(R.T*mean = z) => find mean
            R  = np.linalg.cholesky(Sinv)
            Z  = solve_triangular(R, beta * XY, check_finite=False, lower=True)
            Mn = solve_triangular(R.T, Z, check_finite=False, lower=False)

            # invert lower triangular matrix from cholesky decomposition
            Ri = solve_triangular(R, np.eye(A.shape[0]), check_finite=False, lower=True)
            if full_covar:
                Sn = np.dot(Ri.T, Ri)
                return Mn, Sn, cholesky
            else:
                return Mn, Ri, cholesky
        except LinAlgError:
            cholesky = False
            Sn = pinvh(Sinv)
            # posterior mean uses the covariance Sn (not the precision Sinv)
            Mn = beta * np.dot(Sn, XY)
            return Mn, Sn, cholesky


    def _sparsity_quality(self, XX, XXd, XY, XYa, Aa, Ri, active, beta, cholesky):
        '''
        Calculates sparsity and quality parameters for each feature

        Theoretical Note:
        -----------------
        Here we use the Woodbury identity for inverting the covariance matrix
        of the target distribution
        C    = 1/beta * I + X * diag(1/alpha) * X'
        C^-1 = beta * I - beta^2 * X * Sn * X'
        '''
        bxy = beta * XY
        bxx = beta * XXd
        if cholesky:
            # here Ri is the inverse of the lower triangular matrix obtained from the cholesky decomposition
            xxr = np.dot(XX[:, active], Ri.T)
            rxy = np.dot(Ri, XYa)
            S   = bxx - beta**2 * np.sum(xxr**2, axis=1)
            Q   = bxy - beta**2 * np.dot(xxr, rxy)
        else:
            # here Ri is the covariance matrix
            XXa = XX[:, active]
            XS  = np.dot(XXa, Ri)
            S   = bxx - beta**2 * np.sum(XS * XXa, 1)
            Q   = bxy - beta**2 * np.dot(XS, XYa)
        # Use the following:
        # (EQ 1) q = A*Q/(A - S) ; s = A*S/(A - S), so if A = np.inf then q = Q and s = S
        qi = np.copy(Q)
        si = np.copy(S)
        # If A is not np.inf, then it is an 'active' feature => use (EQ 1)
        Qa, Sa     = Q[active], S[active]
        qi[active] = Aa * Qa / (Aa - Sa)
        si[active] = Aa * Sa / (Aa - Sa)
        return [si, qi, S, Q]

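
# Sanity-check sketch (added, not part of the original script): numerically
# verifies the Woodbury-style identity quoted in the docstring of
# _sparsity_quality above,
#   C    = 1/beta * I + X * diag(1/alpha) * X'
#   C^-1 = beta * I - beta^2 * X * Sn * X',  with  Sn = (diag(alpha) + beta * X'X)^-1.
# The sizes and random values below are arbitrary illustrative assumptions.
def _check_woodbury_identity(beta=2.0, n_samples=6, n_features=3, seed=0):
    rng   = np.random.RandomState(seed)
    X     = rng.randn(n_samples, n_features)
    alpha = rng.uniform(0.5, 2.0, size=n_features)                # prior precisions
    C     = np.eye(n_samples) / beta + np.dot(X / alpha, X.T)     # X / alpha == X * diag(1/alpha)
    Sn    = np.linalg.inv(np.diag(alpha) + beta * np.dot(X.T, X))
    C_inv = beta * np.eye(n_samples) - beta**2 * np.dot(np.dot(X, Sn), X.T)
    return np.allclose(np.linalg.inv(C), C_inv)                   # expected: True
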
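
# A minimal usage sketch (added, not part of the original script): fits
# RegressionARD on synthetic sparse-regression data and inspects the posterior
# summaries.  The sizes, seed and true weights below are illustrative assumptions.
def _demo_regression_ard():
    rng = np.random.RandomState(0)
    n_samples, n_features = 200, 20
    X = rng.randn(n_samples, n_features)
    w = np.zeros(n_features)
    w[:3] = [1.5, -2.0, 3.0]                       # only the first 3 features are relevant
    y = np.dot(X, w) + 0.1 * rng.randn(n_samples)
    reg = RegressionARD(n_iter=300, tol=1e-3).fit(X, y)
    y_hat, var_hat = reg.predict_dist(X)           # predictive mean and variance
    print('active features :', np.where(reg.active_)[0])
    print('noise precision :', reg.alpha_)
    return reg, y_hat, var_hat
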

#----------------------- Classification ARD -----------------------------------

def _logistic_cost_grad(X, Y, w, diagA):
    '''
    Calculates cost and gradient for logistic regression
    '''
    n      = X.shape[0]
    Xw     = np.dot(X, w)
    s      = expit(Xw)
    wdA    = w * diagA
    wdA[0] = 1e-3  # broad prior for bias term => almost no regularization
    cost   = np.sum(Xw * (1 - Y) - log_logistic(Xw)) + np.sum(w * wdA) / 2
    grad   = np.dot(X.T, s - Y) + wdA
    return [cost / n, grad / n]

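
# Sanity-check sketch (added, not part of the original script): compares the
# analytic gradient returned by _logistic_cost_grad with a finite-difference
# estimate via scipy.optimize.check_grad.  The toy data below are illustrative
# assumptions; the bias entry of the penalty uses a deliberate shortcut above,
# so only approximate agreement is expected.
def _check_logistic_cost_grad(seed=1):
    from scipy.optimize import check_grad
    rng   = np.random.RandomState(seed)
    X     = np.concatenate([np.ones([30, 1]), rng.randn(30, 3)], axis=1)  # bias + 3 features
    Y     = (rng.rand(30) > 0.5).astype(np.float64)
    diagA = np.ones(X.shape[1])
    cost  = lambda w: _logistic_cost_grad(X, Y, w, diagA)[0]
    grad  = lambda w: _logistic_cost_grad(X, Y, w, diagA)[1]
    return check_grad(cost, grad, rng.randn(X.shape[1]))   # small value => gradients agree
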
class ClassificationARD(BaseEstimator, LinearClassifierMixin):
    '''
    Logistic Regression with Automatic Relevance Determination (Fast Version uses
    Sparse Bayesian Learning)

    Parameters
    ----------
    n_iter: int, optional (DEFAULT = 100)
        Maximum number of iterations before termination

    tol: float, optional (DEFAULT = 1e-4)
        If absolute change in precision parameter for weights is below threshold
        algorithm terminates.

    normalize: bool, optional (DEFAULT = True)
        If True normalizes features

    n_iter_solver: int, optional (DEFAULT = 15)
        Maximum number of iterations before termination of solver

    tol_solver: float, optional (DEFAULT = 1e-4)
        Convergence threshold for solver (it is used in estimating posterior
        distribution)

    fit_intercept : bool, optional ( DEFAULT = True )
        If True will use intercept in the model. If set
        to false, no intercept will be used in calculations

    verbose : boolean, optional (DEFAULT = False)
        Verbose mode when fitting the model


    Attributes
    ----------
    coef_ : array, shape = (n_features)
        Coefficients of the regression model (mean of posterior distribution)

    lambda_ : array, shape = (n_features)
        estimated precisions of weights

    active_ : array, dtype = bool, shape = (n_features)
        True for non-zero coefficients, False otherwise

    sigma_ : array, shape = (n_features, n_features)
        estimated covariance matrix of the weights, computed only
        for non-zero coefficients


    References
    ----------
    [1] Fast marginal likelihood maximisation for sparse Bayesian models (Tipping & Faul 2003)
        (http://www.miketipping.com/papers/met-fastsbl.pdf)
    [2] Analysis of sparse Bayesian learning (Tipping & Faul 2001)
        (http://www.miketipping.com/abstracts.htm#Faul:NIPS01)
    '''
    def __init__(self, n_iter=100, tol=1e-4, n_iter_solver=15, normalize=True,
                 tol_solver=1e-4, fit_intercept=True, verbose=False):
        self.n_iter        = n_iter
        self.tol           = tol
        self.n_iter_solver = n_iter_solver
        self.normalize     = normalize
        self.tol_solver    = tol_solver
        self.fit_intercept = fit_intercept
        self.verbose       = verbose


    def fit(self, X, y):
        '''
        Fits Logistic Regression with ARD

        Parameters
        ----------
        X: array-like of size [n_samples, n_features]
           Training data, matrix of explanatory variables

        y: array-like of size [n_samples]
           Target values

        Returns
        -------
        self : object
            Returns self.
        '''
        X, y = check_X_y(X, y, accept_sparse=None, dtype=np.float64)

        # normalize, if required
        if self.normalize:
            self._x_mean = np.mean(X, 0)
            self._x_std  = np.std(X, 0)
            X            = (X - self._x_mean) / self._x_std

        # add bias term if required
        if self.fit_intercept:
            X = np.concatenate((np.ones([X.shape[0], 1]), X), 1)

        # preprocess targets
        check_classification_targets(y)
        self.classes_ = np.unique(y)
        n_classes = len(self.classes_)
        if n_classes < 2:
            raise ValueError("Need samples of at least 2 classes"
                             " in the data, but the data contains only one"
                             " class: %r" % self.classes_[0])

        # if multiclass use OVR (i.e. fit one classifier for each class)
        if n_classes > 2:
            self.coef_, self.sigma_        = [0]*n_classes, [0]*n_classes
            self.intercept_, self.active_  = [0]*n_classes, [0]*n_classes
            self.lambda_                   = [0]*n_classes
        else:
            self.coef_, self.sigma_, self.intercept_, self.active_ = [0], [0], [0], [0]
            self.lambda_ = [0]

        for i in range(len(self.classes_)):
            if n_classes == 2:
                pos_class = self.classes_[1]
            else:
                pos_class = self.classes_[i]
            mask = (y == pos_class)
            y_bin = np.zeros(y.shape, dtype=np.float64)
            y_bin[mask] = 1
            coef, bias, active, sigma, lambda_ = self._fit(X, y_bin)
            self.coef_[i], self.intercept_[i], self.sigma_[i] = coef, bias, sigma
            self.active_[i], self.lambda_[i] = active, lambda_
            # in case of binary classification fit only one classifier
            if n_classes == 2:
                break
        self.coef_      = np.asarray(self.coef_)
        self.intercept_ = np.asarray(self.intercept_)
        return self


    def _fit(self, X, y):
        '''
        Fits binary classification
        '''
        n_samples, n_features = X.shape
        A      = np.inf * np.ones(n_features)
        active = np.zeros(n_features, dtype=bool)

        # if we fit an intercept, make it active from the beginning
        if self.fit_intercept:
            active[0] = True
            A[0]      = np.finfo(np.float16).eps

        warning_flag = 0
        for i in range(self.n_iter):
            Xa = X[:, active]
            Aa = A[active]

            # mean & precision of posterior distribution
            Mn, Sn, B, t_hat, cholesky = self._posterior_dist(Xa, y, Aa)
            if not cholesky:
                warning_flag += 1

            # raise warning in case cholesky fails (but only once)
            if warning_flag == 1:
                warnings.warn(("Cholesky decomposition failed! Algorithm uses pinvh, "
                               "which is significantly slower. If you use RVC it "
                               "is advised to change the parameters of the kernel."))

            # compute quality & sparsity parameters
            s, q, S, Q = self._sparsity_quality(X, Xa, t_hat, B, A, Aa, active, Sn, cholesky)

            # update precision parameters of coefficients
            A, converged = update_precisions(Q, S, q, s, A, active, self.tol, n_samples, self.fit_intercept)

            # terminate if converged
            if converged or i == self.n_iter - 1:
                break

        Xa, Aa = X[:, active], A[active]
        Mn, Sn, B, t_hat, cholesky = self._posterior_dist(Xa, y, Aa)
        # in case Sn is the inverse of the lower triangular matrix of the Cholesky decomposition,
        # compute the covariance using the formula Sn = np.dot(Rinverse.T , Rinverse)
        if cholesky:
            Sn = np.dot(Sn.T, Sn)
        intercept_ = 0
        if self.fit_intercept:
            n_features -= 1
            if active[0] == True:
                intercept_ = Mn[0]
                Mn         = Mn[1:]
            active = active[1:]
        coef_ = np.zeros([1, n_features])
        coef_[0, active] = Mn
        return coef_.squeeze(), intercept_, active, Sn, A


    def predict(self, X):
        '''
        Estimates target values on test set

        Parameters
        ----------
        X: array-like of size (n_samples_test, n_features)
           Matrix of explanatory variables

        Returns
        -------
        y_pred: numpy array of size (n_samples_test,)
           Predicted values of targets
        '''
        probs   = self.predict_proba(X)
        indices = np.argmax(probs, axis=1)
        y_pred  = self.classes_[indices]
        return y_pred


    def _decision_function_active(self, X, coef_, active_, intercept_):
        ''' Constructs decision function using only relevant features '''
        if self.normalize:
            X = (X - self._x_mean[active_]) / self._x_std[active_]
        decision = safe_sparse_dot(X, coef_[active_]) + intercept_
        return decision


    def decision_function(self, X):
        '''
        Computes distance to the separating hyperplane between classes. The larger
        the absolute value of the decision function, the further the data point
        is from the decision boundary.

        Parameters
        ----------
        X: array-like of size (n_samples_test, n_features)
           Matrix of explanatory variables

        Returns
        -------
        decision: numpy array of size (n_samples_test,)
           Distance to decision boundary
        '''
        check_is_fitted(self, 'coef_')
        X = check_array(X, accept_sparse=None, dtype=np.float64)
        n_features = self.coef_.shape[1]
        if X.shape[1] != n_features:
            raise ValueError("X has %d features per sample; expecting %d"
                             % (X.shape[1], n_features))
        decision = [self._decision_function_active(X[:, active], coef, active, bias) for
                    coef, active, bias in zip(self.coef_, self.active_, self.intercept_)]
        decision = np.asarray(decision).squeeze().T
        return decision


    def predict_proba(self, X):
        '''
        Predicts probabilities of targets for test set using the probit
        function to approximate the convolution of sigmoid and Gaussian.

        Parameters
        ----------
        X: array-like of size (n_samples_test, n_features)
           Matrix of explanatory variables

        Returns
        -------
        probs: numpy array of size (n_samples_test, n_classes)
           Estimated probabilities of target classes
        '''
        y_hat = self.decision_function(X)
        X = check_array(X, accept_sparse=None, dtype=np.float64)
        if self.normalize:
            X = (X - self._x_mean) / self._x_std
        if self.fit_intercept:
            X = np.concatenate((np.ones([X.shape[0], 1]), X), 1)
        if y_hat.ndim == 1:
            pr = self._convo_approx(X[:, self.lambda_[0] != np.inf],
                                    y_hat, self.sigma_[0])
            prob = np.vstack([1 - pr, pr]).T
        else:
            pr = [self._convo_approx(X[:, idx != np.inf], y_hat[:, i],
                                     self.sigma_[i]) for i, idx in enumerate(self.lambda_)]
            pr = np.asarray(pr).T
            prob = pr / np.reshape(np.sum(pr, axis=1), (pr.shape[0], 1))
        return prob


    def _convo_approx(self, X, y_hat, sigma):
        ''' Computes approximation to the convolution of sigmoid and gaussian '''
        var = np.sum(np.dot(X, sigma) * X, 1)
        ks  = 1. / (1. + np.pi * var / 8)**0.5
        pr  = expit(y_hat * ks)
        return pr


    def _sparsity_quality(self, X, Xa, y, B, A, Aa, active, Sn, cholesky):
        '''
        Calculates sparsity & quality parameters for each feature
        '''
        XB  = X.T * B
        bxx = np.dot(B, X**2)
        Q   = np.dot(X.T, y)
        if cholesky:
            # here Sn is the inverse of the lower triangular matrix obtained from
            # the cholesky decomposition
            XBX = np.dot(XB, Xa)
            XBX = np.dot(XBX, Sn, out=XBX)
            S   = bxx - np.sum(XBX**2, 1)
        else:
            XSX = np.dot(np.dot(Xa, Sn), Xa.T)
            S   = bxx - np.sum(np.dot(XB, XSX) * XB, 1)
        qi = np.copy(Q)
        si = np.copy(S)
        Qa, Sa     = Q[active], S[active]
        qi[active] = Aa * Qa / (Aa - Sa)
        si[active] = Aa * Sa / (Aa - Sa)
        return [si, qi, S, Q]


    def _posterior_dist(self, X, y, A):
        '''
        Uses Laplace approximation for calculating posterior distribution
        '''
        f = lambda w: _logistic_cost_grad(X, y, w, A)
        w_init = np.random.random(X.shape[1])
        Mn = fmin_l_bfgs_b(f, x0=w_init, pgtol=self.tol_solver,
                           maxiter=self.n_iter_solver)[0]
        Xm = np.dot(X, Mn)
        s  = expit(Xm)
        B  = logistic._pdf(Xm)  # avoids underflow
        S  = np.dot(X.T * B, X)
        np.fill_diagonal(S, np.diag(S) + A)
        t_hat = y - s
        cholesky = True
        # try using Cholesky, if it fails then fall back on pinvh
        try:
            R  = np.linalg.cholesky(S)
            Sn = solve_triangular(R, np.eye(A.shape[0]),
                                  check_finite=False, lower=True)
        except LinAlgError:
            Sn = pinvh(S)
            cholesky = False
        return [Mn, Sn, B, t_hat, cholesky]

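
# A minimal usage sketch (added, not part of the original script): fits
# ClassificationARD on synthetic binary data.  The sizes, seed and class
# separation below are illustrative assumptions.
def _demo_classification_ard():
    rng = np.random.RandomState(0)
    n_per_class, n_features = 100, 5
    X = np.vstack([rng.randn(n_per_class, n_features) - 1,
                   rng.randn(n_per_class, n_features) + 1])
    y = np.hstack([np.zeros(n_per_class), np.ones(n_per_class)])
    clf = ClassificationARD(n_iter=50).fit(X, y)
    probs = clf.predict_proba(X)                   # probit approximation to class probabilities
    print('accuracy         :', np.mean(clf.predict(X) == y))
    print('active per class :', [np.sum(a) for a in clf.active_])
    return clf, probs
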

###############################################################################
# Relevance Vector Machine: RVR and RVC
###############################################################################

def get_kernel(X, Y, gamma, degree, coef0, kernel, kernel_params):
    '''
    Calculates kernelised features for RVR and RVC
    '''
    if callable(kernel):
        params = kernel_params or {}
    else:
        params = {"gamma": gamma,
                  "degree": degree,
                  "coef0": coef0}
    return pairwise_kernels(X, Y, metric=kernel,
                            filter_params=True, **params)


class RVR(RegressionARD):
    '''
    Relevance Vector Regression (Fast Version uses Sparse Bayesian Learning)

    Parameters
    ----------
    n_iter: int, optional (DEFAULT = 300)
        Maximum number of iterations

    fit_intercept : boolean, optional (DEFAULT = True)
        whether to calculate the intercept for this model

    tol: float, optional (DEFAULT = 1e-3)
        If absolute change in precision parameter for weights is below tol
        algorithm terminates.

    copy_X : boolean, optional (DEFAULT = True)
        If True, X will be copied; else, it may be overwritten.

    verbose : boolean, optional (DEFAULT = False)
        Verbose mode when fitting the model

    kernel: str, optional (DEFAULT = 'poly')
        Type of kernel to be used (all kernels: ['rbf' | 'poly' | 'sigmoid' | 'linear'])

    degree : int, (DEFAULT = 3)
        Degree for poly kernels. Ignored by other kernels.

    gamma : float, optional (DEFAULT = 1/n_features)
        Kernel coefficient for rbf and poly kernels, ignored by other kernels

    coef0 : float, optional (DEFAULT = 1)
        Independent term in poly and sigmoid kernels, ignored by other kernels

    kernel_params : mapping of string to any, optional
        Parameters (keyword arguments) and values for kernel passed as
        callable object, ignored by other kernels


    Attributes
    ----------
    coef_ : array, shape = (n_features)
        Coefficients of the regression model (mean of posterior distribution)

    alpha_ : float
        estimated precision of the noise

    active_ : array, dtype = bool, shape = (n_features)
        True for non-zero coefficients, False otherwise

    lambda_ : array, shape = (n_features)
        estimated precisions of the coefficients

    sigma_ : array, shape = (n_features, n_features)
        estimated covariance matrix of the weights, computed only
        for non-zero coefficients

    relevant_vectors_ : array
        Relevant Vectors

    References
    ----------
    [1] Fast marginal likelihood maximisation for sparse Bayesian models (Tipping & Faul 2003)
        (http://www.miketipping.com/papers/met-fastsbl.pdf)
    [2] Analysis of sparse Bayesian learning (Tipping & Faul 2001)
        (http://www.miketipping.com/abstracts.htm#Faul:NIPS01)
    '''
    def __init__(self, n_iter=300, tol=1e-3, fit_intercept=True, copy_X=True,
                 verbose=False, kernel='poly', degree=3, gamma=None,
                 coef0=1, kernel_params=None):
        super(RVR, self).__init__(n_iter, tol, fit_intercept, copy_X, verbose)
        self.kernel        = kernel
        self.degree        = degree
        self.gamma         = gamma
        self.coef0         = coef0
        self.kernel_params = kernel_params


    def fit(self, X, y):
        '''
        Fit Relevance Vector Regression Model

        Parameters
        -----------
        X: {array-like, sparse matrix} of size (n_samples, n_features)
           Training data, matrix of explanatory variables

        y: array-like of size (n_samples, )
           Target values

        Returns
        -------
        self: object
           self
        '''
        X, y = check_X_y(X, y, accept_sparse=['csr', 'coo', 'bsr'], dtype=np.float64)
        # kernelise features
        K = get_kernel(X, X, self.gamma, self.degree, self.coef0,
                       self.kernel, self.kernel_params)

        # use fit method of RegressionARD
        _ = super(RVR, self).fit(K, y)

        # convert to csr (need to use __getitem__)
        convert_tocsr = [scipy.sparse.coo_matrix,
                         scipy.sparse.dia_matrix,
                         scipy.sparse.bsr_matrix]
        if type(X) in convert_tocsr:
            X = X.tocsr()
        self.relevant_ = np.where(self.active_ == True)[0]
        if X.ndim == 1:
            self.relevant_vectors_ = X[self.relevant_]
        else:
            self.relevant_vectors_ = X[self.relevant_, :]
        return self


    def _decision_function(self, X):
        ''' Decision function '''
        _, predict_vals = self._kernel_decision_function(X)
        return predict_vals


    def _kernel_decision_function(self, X):
        ''' Computes kernel and decision function based on kernel '''
        check_is_fitted(self, 'coef_')
        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])
        K = get_kernel(X, self.relevant_vectors_, self.gamma, self.degree,
                       self.coef0, self.kernel, self.kernel_params)
        return K, np.dot(K, self.coef_[self.active_]) + self.intercept_


    def predict_dist(self, X):
        '''
        Computes predictive distribution for test set. Predictive distribution
        for each data point is one dimensional Gaussian and therefore is
        characterised by mean and variance.

        Parameters
        ----------
        X: {array-like, sparse matrix} of size (n_samples_test, n_features)
           Matrix of explanatory variables

        Returns
        -------
        : list of length two [y_hat, var_hat]

             y_hat: numpy array of size (n_samples_test,)
                    Estimated values of targets on test set (i.e. mean of predictive
                    distribution)

             var_hat: numpy array of size (n_samples_test,)
                    Variance of predictive distribution
        '''
        # kernel matrix and mean of predictive distribution
        K, y_hat = self._kernel_decision_function(X)
        var_hat  = 1. / self.alpha_
        var_hat += np.sum(np.dot(K, self.sigma_) * K, axis=1)
        return y_hat, var_hat

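
# A minimal usage sketch (added, not part of the original script): fits a
# Relevance Vector Regression model to a noisy 1-d function with an RBF kernel.
# The target function, noise level and kernel width are illustrative assumptions.
def _demo_rvr():
    rng = np.random.RandomState(0)
    X = np.linspace(-5, 5, 200)[:, None]
    y = np.sinc(X[:, 0]) + 0.1 * rng.randn(200)
    rvr = RVR(kernel='rbf', gamma=0.5).fit(X, y)
    y_hat, var_hat = rvr.predict_dist(X)            # predictive mean and variance
    print('number of relevance vectors:', rvr.relevant_vectors_.shape[0])
    return rvr, y_hat, var_hat
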
class RVC(ClassificationARD):
    '''
    Relevance Vector Classifier (Fast Version, uses Sparse Bayesian Learning)

    Parameters
    ----------
    n_iter: int, optional (DEFAULT = 100)
        Maximum number of iterations before termination

    tol: float, optional (DEFAULT = 1e-4)
        If absolute change in precision parameter for weights is below tol, then
        the algorithm terminates.

    n_iter_solver: int, optional (DEFAULT = 15)
        Maximum number of iterations before termination of solver

    tol_solver: float, optional (DEFAULT = 1e-4)
        Convergence threshold for solver (it is used in estimating posterior
        distribution)

    fit_intercept : bool, optional ( DEFAULT = True )
        If True will use intercept in the model

    verbose : boolean, optional (DEFAULT = False)
        Verbose mode when fitting the model

    kernel: str, optional (DEFAULT = 'rbf')
        Type of kernel to be used (all kernels: ['rbf' | 'poly' | 'sigmoid'])

    degree : int, (DEFAULT = 2)
        Degree for poly kernels. Ignored by other kernels.

    gamma : float, optional (DEFAULT = 1/n_features)
        Kernel coefficient for rbf and poly kernels, ignored by other kernels

    coef0 : float, optional (DEFAULT = 1)
        Independent term in poly and sigmoid kernels, ignored by other kernels

    kernel_params : mapping of string to any, optional
        Parameters (keyword arguments) and values for kernel passed as
        callable object, ignored by other kernels


    Attributes
    ----------
    coef_ : array, shape = (n_features)
        Coefficients of the model (mean of posterior distribution)

    lambda_ : array, shape = (n_features)
        Estimated precisions of weights

    active_ : array, dtype = bool, shape = (n_features)
        True for non-zero coefficients, False otherwise

    sigma_ : array, shape = (n_features, n_features)
        Estimated covariance matrix of the weights, computed only for non-zero
        coefficients


    References
    ----------
    [1] Fast marginal likelihood maximisation for sparse Bayesian models (Tipping & Faul 2003)
        (http://www.miketipping.com/papers/met-fastsbl.pdf)
    [2] Analysis of sparse Bayesian learning (Tipping & Faul 2001)
        (http://www.miketipping.com/abstracts.htm#Faul:NIPS01)
    '''

    def __init__(self, n_iter=100, tol=1e-4, n_iter_solver=15, tol_solver=1e-4,
                 fit_intercept=True, verbose=False, kernel='rbf', degree=2,
                 gamma=None, coef0=1, kernel_params=None):
        super(RVC, self).__init__(n_iter, tol, n_iter_solver, True, tol_solver,
                                  fit_intercept, verbose)
        self.kernel        = kernel
        self.degree        = degree
        self.gamma         = gamma
        self.coef0         = coef0
        self.kernel_params = kernel_params


    def fit(self, X, y):
        '''
        Fit Relevance Vector Classifier

        Parameters
        -----------
        X: array-like of size [n_samples, n_features]
           Training data, matrix of explanatory variables

        y: array-like of size [n_samples]
           Target values

        Returns
        -------
        self: object
           self
        '''
        X, y = check_X_y(X, y, accept_sparse=None, dtype=np.float64)
        # kernelise features
        K = get_kernel(X, X, self.gamma, self.degree, self.coef0,
                       self.kernel, self.kernel_params)
        # use fit method of ClassificationARD
        _ = super(RVC, self).fit(K, y)
        self.relevant_ = [np.where(active == True)[0] for active in self.active_]
        if X.ndim == 1:
            self.relevant_vectors_ = [X[relevant_] for relevant_ in self.relevant_]
        else:
            self.relevant_vectors_ = [X[relevant_, :] for relevant_ in self.relevant_]
        return self


    def decision_function(self, X):
        '''
        Computes distance to the separating hyperplane between classes. The larger
        the absolute value of the decision function, the further the data point
        is from the decision boundary.

        Parameters
        ----------
        X: array-like of size (n_samples_test, n_features)
           Matrix of explanatory variables

        Returns
        -------
        decision: numpy array of size (n_samples_test,)
           Distance to decision boundary
        '''
        check_is_fitted(self, 'coef_')
        X = check_array(X, accept_sparse=None, dtype=np.float64)
        n_features = self.relevant_vectors_[0].shape[1]
        if X.shape[1] != n_features:
            raise ValueError("X has %d features per sample; expecting %d"
                             % (X.shape[1], n_features))
        kernel = lambda rvs: get_kernel(X, rvs, self.gamma, self.degree,
                                        self.coef0, self.kernel, self.kernel_params)
        decision = []
        for rv, cf, act, b in zip(self.relevant_vectors_, self.coef_, self.active_,
                                  self.intercept_):
            # if there are no relevant vectors => use intercept only
            if rv.shape[0] == 0:
                decision.append(np.ones(X.shape[0]) * b)
            else:
                decision.append(self._decision_function_active(kernel(rv), cf, act, b))
        decision = np.asarray(decision).squeeze().T
        return decision


    def predict_proba(self, X):
        '''
        Predicts probabilities of targets.

        Theoretical Note
        ================
        The current version of this method does not use MacKay's approximation
        to the convolution of Gaussian and sigmoid. This results in a less accurate
        estimation of class probabilities and therefore a possible increase
        in misclassification error for multiclass problems (prediction accuracy
        for binary classification problems is not changed).

        Parameters
        ----------
        X: array-like of size (n_samples_test, n_features)
           Matrix of explanatory variables

        Returns
        -------
        probs: numpy array of size (n_samples_test, n_classes)
           Estimated probabilities of target classes
        '''
        prob = expit(self.decision_function(X))
        if prob.ndim == 1:
            prob = np.vstack([1 - prob, prob]).T
        prob = prob / np.reshape(np.sum(prob, axis=1), (prob.shape[0], 1))
        return prob

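
# A minimal usage sketch (added, not part of the original script): fits a
# Relevance Vector Classifier on synthetic two-class data with an RBF kernel.
# The sizes, seed and kernel parameters below are illustrative assumptions.
def _demo_rvc():
    rng = np.random.RandomState(0)
    n_per_class = 100
    X = np.vstack([rng.randn(n_per_class, 2) - 1.5,
                   rng.randn(n_per_class, 2) + 1.5])
    y = np.hstack([np.zeros(n_per_class), np.ones(n_per_class)])
    rvc = RVC(kernel='rbf', gamma=1.0).fit(X, y)
    print('accuracy                    :', np.mean(rvc.predict(X) == y))
    print('relevance vectors per class :', [rv.shape[0] for rv in rvc.relevant_vectors_])
    return rvc


if __name__ == '__main__':
    # The demo/check helpers above are optional additions; running the script
    # exercises the estimators end to end on the toy problems defined above.
    print('Woodbury identity holds     :', _check_woodbury_identity())
    print('logistic gradient-check err :', _check_logistic_cost_grad())
    _demo_regression_ard()
    _demo_classification_ard()
    _demo_rvr()
    _demo_rvc()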