# GitHub Repository: probml/pyprobml
# Path: blob/master/deprecated/scripts/ard_linreg_logreg.py

# Linear and Logistic Regression with Automatic Relevance Determination
# (fast version, uses Sparse Bayesian Learning)

# Adapted from
# https://github.com/AmazaspShumik/sklearn-bayes/blob/master/skbayes/rvm_ard_models/fast_rvm.py

import superimport

import numpy as np
from sklearn.base import RegressorMixin, BaseEstimator
#from sklearn.externals import six
try:
    from sklearn.linear_model.base import LinearModel, LinearClassifierMixin
except ImportError:
    # newer versions of scikit-learn moved these base classes to a private module
    from sklearn.linear_model._base import LinearModel, LinearClassifierMixin
from sklearn.utils import check_X_y, check_array, as_float_array
from sklearn.utils.multiclass import check_classification_targets
from sklearn.utils.extmath import log_logistic, safe_sparse_dot
from sklearn.metrics.pairwise import pairwise_kernels
from sklearn.utils.validation import check_is_fitted
from scipy.special import expit
from scipy.optimize import fmin_l_bfgs_b
from scipy.linalg import solve_triangular, pinvh  # pinvh is no longer exported by sklearn.utils.extmath
from scipy.stats import logistic
from numpy.linalg import LinAlgError
import scipy.sparse
import warnings

# TODO: predict_proba for RVC with Laplace Approximation


def update_precisions(Q, S, q, s, A, active, tol, n_samples, clf_bias):
    '''
    Selects one feature to be added to / recomputed in / deleted from the model,
    based on the effect this will have on the log marginal likelihood.
    '''
    # initialise vector holding changes in log marginal likelihood
    deltaL = np.zeros(Q.shape[0])

    # identify features that can be added, recomputed or deleted
    theta     = q**2 - s
    add       = (theta > 0) * (active == False)
    recompute = (theta > 0) * (active == True)
    delete    = ~(add + recompute)

    # compute sparsity & quality parameters corresponding to features in
    # the three groups identified above
    Qadd, Sadd       = Q[add], S[add]
    Qrec, Srec, Arec = Q[recompute], S[recompute], A[recompute]
    Qdel, Sdel, Adel = Q[delete], S[delete], A[delete]

    # compute new alpha's (precision parameters) for features that are
    # currently in the model and will be recomputed
    Anew        = s[recompute]**2 / (theta[recompute] + np.finfo(np.float32).eps)
    delta_alpha = (1. / Anew - 1. / Arec)

    # compute change in log marginal likelihood
    deltaL[add]       = (Qadd**2 - Sadd) / Sadd + np.log(Sadd / Qadd**2)
    denom             = np.maximum(1e-5, 1 + Srec * delta_alpha)  # Kevin Murphy hack
    deltaL[recompute] = Qrec**2 / (Srec + 1. / delta_alpha) - np.log(denom)
    deltaL[delete]    = Qdel**2 / (Sdel - Adel) - np.log(1 - Sdel / Adel)
    deltaL            = deltaL / n_samples

    # find feature which caused largest change in likelihood
    feature_index = np.argmax(deltaL)

    # no deletions or additions
    same_features = np.sum(theta[~recompute] > 0) == 0

    # changes in precision for features already in the model are below threshold
    no_delta = np.sum(abs(Anew - Arec) > tol) == 0

    # check convergence: if there are no features to add or delete and the change
    # in precision for current features is small, then terminate
    converged = False
    if same_features and no_delta:
        converged = True
        return [A, converged]

    # if not converged update precision parameter of weights and return
    if theta[feature_index] > 0:
        A[feature_index] = s[feature_index]**2 / theta[feature_index]
        if active[feature_index] == False:
            active[feature_index] = True
    else:
        # at least two active features
        if active[feature_index] == True and np.sum(active) >= 2:
            # do not remove bias term in classification
            # (in regression it is factored in through centering)
            if not (feature_index == 0 and clf_bias):
                active[feature_index] = False
                A[feature_index] = np.inf

    return [A, converged]

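
# Illustrative sketch (added, not part of the original script): the selection
# rule above is driven by theta = q**2 - s.  A candidate feature is worth having
# in the model when theta > 0 and is pruned otherwise (Tipping & Faul 2003).
# The toy numbers below are made-up values used only for demonstration.
def _demo_update_precisions_groups():
    q      = np.array([3.0, 0.4, 2.0])            # quality parameters
    s      = np.array([1.0, 1.0, 1.0])            # sparsity parameters
    active = np.array([False, True, True])        # current membership in the model
    theta  = q**2 - s
    add       = (theta > 0) * (active == False)   # [True, False, False] -> add feature 0
    recompute = (theta > 0) * (active == True)    # [False, False, True] -> recompute feature 2
    delete    = ~(add + recompute)                # [False, True, False] -> delete feature 1
    return add, recompute, delete
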

###############################################################################
# ARD REGRESSION AND CLASSIFICATION
###############################################################################


#-------------------------- Regression ARD ------------------------------------

class RegressionARD(LinearModel, RegressorMixin):
    '''
    Regression with Automatic Relevance Determination (Fast Version uses
    Sparse Bayesian Learning)

    Parameters
    ----------
    n_iter: int, optional (DEFAULT = 300)
        Maximum number of iterations

    tol: float, optional (DEFAULT = 1e-3)
        If absolute change in precision parameter for weights is below threshold
        algorithm terminates.

    fit_intercept : boolean, optional (DEFAULT = True)
        whether to calculate the intercept for this model. If set
        to false, no intercept will be used in calculations
        (e.g. data is expected to be already centered).

    copy_X : boolean, optional (DEFAULT = True)
        If True, X will be copied; else, it may be overwritten.

    verbose : boolean, optional (DEFAULT = False)
        Verbose mode when fitting the model


    Attributes
    ----------
    coef_ : array, shape = (n_features)
        Coefficients of the regression model (mean of posterior distribution)

    alpha_ : float
        estimated precision of the noise

    active_ : array, dtype = bool, shape = (n_features)
        True for non-zero coefficients, False otherwise

    lambda_ : array, shape = (n_features)
        estimated precisions of the coefficients

    sigma_ : array, shape = (n_features, n_features)
        estimated covariance matrix of the weights, computed only
        for non-zero coefficients


    References
    ----------
    [1] Fast marginal likelihood maximisation for sparse Bayesian models (Tipping & Faul 2003)
        (http://www.miketipping.com/papers/met-fastsbl.pdf)
    [2] Analysis of sparse Bayesian learning (Tipping & Faul 2001)
        (http://www.miketipping.com/abstracts.htm#Faul:NIPS01)
    '''

    def __init__(self, n_iter=300, tol=1e-3, fit_intercept=True,
                 copy_X=True, verbose=False):
        self.n_iter        = n_iter
        self.tol           = tol
        self.scores_       = list()
        self.fit_intercept = fit_intercept
        self.copy_X        = copy_X
        self.verbose       = verbose


    def _center_data(self, X, y):
        ''' Centers data '''
        X = as_float_array(X, copy=self.copy_X)
        # normalisation should be done in preprocessing!
        X_std = np.ones(X.shape[1], dtype=X.dtype)
        if self.fit_intercept:
            X_mean = np.average(X, axis=0)
            y_mean = np.average(y, axis=0)
            X     -= X_mean
            y      = y - y_mean
        else:
            X_mean = np.zeros(X.shape[1], dtype=X.dtype)
            y_mean = 0. if y.ndim == 1 else np.zeros(y.shape[1], dtype=X.dtype)
        return X, y, X_mean, y_mean, X_std


    def fit(self, X, y):
        '''
        Fits ARD Regression with Sequential Sparse Bayes Algorithm.

        Parameters
        -----------
        X: {array-like, sparse matrix} of size (n_samples, n_features)
           Training data, matrix of explanatory variables

        y: array-like of size [n_samples]
           Target values

        Returns
        -------
        self : object
            Returns self.
        '''
        X, y = check_X_y(X, y, dtype=np.float64, y_numeric=True)
        X, y, X_mean, y_mean, X_std = self._center_data(X, y)
        n_samples, n_features = X.shape

        # precompute X'*Y , X'*X for faster iterations & allocate memory for
        # sparsity & quality vectors
        XY  = np.dot(X.T, y)
        XX  = np.dot(X.T, X)
        XXd = np.diag(XX)

        # initialise precision of noise and coefficients
        var_y = np.var(y)

        # check that variance is non zero !!!
        if var_y == 0:
            beta = 1e-2
        else:
            beta = 1. / np.var(y)

        A      = np.inf * np.ones(n_features)
        active = np.zeros(n_features, dtype=bool)

        # in case of almost perfect multicollinearity between some features
        # start from feature 0
        if np.sum(XXd - X_mean**2 < np.finfo(np.float32).eps) > 0:
            A[0]      = np.finfo(np.float16).eps
            active[0] = True
        else:
            # start from a single basis vector with largest projection on targets
            proj          = XY**2 / XXd
            start         = np.argmax(proj)
            active[start] = True
            A[start]      = XXd[start] / (proj[start] - var_y)

        warning_flag = 0
        for i in range(self.n_iter):
            XXa = XX[active, :][:, active]
            XYa = XY[active]
            Aa  = A[active]

            # mean & covariance of posterior distribution
            Mn, Ri, cholesky = self._posterior_dist(Aa, beta, XXa, XYa)
            if cholesky:
                Sdiag = np.sum(Ri**2, 0)
            else:
                Sdiag = np.copy(np.diag(Ri))
                warning_flag += 1

            # raise warning in case cholesky fails
            if warning_flag == 1:
                warnings.warn(("Cholesky decomposition failed! Algorithm uses pinvh, "
                               "which is significantly slower. If you use RVR it "
                               "is advised to change the parameters of the kernel."))

            # compute quality & sparsity parameters
            s, q, S, Q = self._sparsity_quality(XX, XXd, XY, XYa, Aa, Ri, active, beta, cholesky)

            # update precision parameter for noise distribution
            rss   = np.sum((y - np.dot(X[:, active], Mn))**2)
            beta  = n_samples - np.sum(active) + np.sum(Aa * Sdiag)
            beta /= (rss + np.finfo(np.float32).eps)

            # update precision parameters of coefficients
            A, converged = update_precisions(Q, S, q, s, A, active, self.tol,
                                             n_samples, False)
            if self.verbose:
                print(('Iteration: {0}, number of features '
                       'in the model: {1}').format(i, np.sum(active)))
            if converged or i == self.n_iter - 1:
                if converged and self.verbose:
                    print('Algorithm converged!')
                break

        # after the last update of alpha & beta, update parameters
        # of the posterior distribution
        XXa, XYa, Aa = XX[active, :][:, active], XY[active], A[active]
        Mn, Sn, cholesky = self._posterior_dist(Aa, beta, XXa, XYa, True)
        self.coef_         = np.zeros(n_features)
        self.coef_[active] = Mn
        self.sigma_        = Sn
        self.active_       = active
        self.lambda_       = A
        self.alpha_        = beta
        self._set_intercept(X_mean, y_mean, X_std)
        return self


    def predict_dist(self, X):
        '''
        Computes predictive distribution for test set.
        Predictive distribution for each data point is one dimensional
        Gaussian and therefore is characterised by mean and variance.

        Parameters
        -----------
        X: {array-like, sparse} (n_samples_test, n_features)
           Test data, matrix of explanatory variables

        Returns
        -------
        : list of length two [y_hat, var_hat]

             y_hat: numpy array of size (n_samples_test,)
                    Estimated values of targets on test set (i.e. mean of predictive
                    distribution)

             var_hat: numpy array of size (n_samples_test,)
                    Variance of predictive distribution
        '''
        y_hat    = self._decision_function(X)
        var_hat  = 1. / self.alpha_
        var_hat += np.sum(np.dot(X[:, self.active_], self.sigma_) * X[:, self.active_], axis=1)
        return y_hat, var_hat


    def _posterior_dist(self, A, beta, XX, XY, full_covar=False):
        '''
        Calculates mean and covariance matrix of posterior distribution
        of coefficients.
        '''
        # compute precision matrix for active features
        Sinv = beta * XX
        np.fill_diagonal(Sinv, np.diag(Sinv) + A)
        cholesky = True
        # try cholesky, if it fails fall back on pinvh
        try:
            # find posterior mean : R*R.T*mean = beta*X.T*Y
            # solve(R*z = beta*X.T*Y) => find z => solve(R.T*mean = z) => find mean
            R  = np.linalg.cholesky(Sinv)
            Z  = solve_triangular(R, beta * XY, check_finite=False, lower=True)
            Mn = solve_triangular(R.T, Z, check_finite=False, lower=False)

            # invert lower triangular matrix from cholesky decomposition
            Ri = solve_triangular(R, np.eye(A.shape[0]), check_finite=False, lower=True)
            if full_covar:
                Sn = np.dot(Ri.T, Ri)
                return Mn, Sn, cholesky
            else:
                return Mn, Ri, cholesky
        except LinAlgError:
            cholesky = False
            Sn = pinvh(Sinv)
            # posterior mean uses the covariance Sn (not the precision Sinv)
            Mn = beta * np.dot(Sn, XY)
            return Mn, Sn, cholesky


    def _sparsity_quality(self, XX, XXd, XY, XYa, Aa, Ri, active, beta, cholesky):
        '''
        Calculates sparsity and quality parameters for each feature

        Theoretical Note:
        -----------------
        Here we use the Woodbury identity for inverting the covariance matrix
        of the target distribution
        C    = 1/beta * I + X * diag(1/alpha) * X'
        C^-1 = beta * I - beta^2 * X * Sn * X'
        '''
        bxy = beta * XY
        bxx = beta * XXd
        if cholesky:
            # here Ri is the inverse of the lower triangular matrix obtained from the cholesky decomposition
            xxr = np.dot(XX[:, active], Ri.T)
            rxy = np.dot(Ri, XYa)
            S   = bxx - beta**2 * np.sum(xxr**2, axis=1)
            Q   = bxy - beta**2 * np.dot(xxr, rxy)
        else:
            # here Ri is the covariance matrix
            XXa = XX[:, active]
            XS  = np.dot(XXa, Ri)
            S   = bxx - beta**2 * np.sum(XS * XXa, 1)
            Q   = bxy - beta**2 * np.dot(XS, XYa)
        # Use the following:
        # (EQ 1) q = A*Q/(A - S) ; s = A*S/(A - S), so if A = np.inf then q = Q and s = S
        qi = np.copy(Q)
        si = np.copy(S)
        # If A is not np.inf, then it is an 'active' feature => use (EQ 1)
        Qa, Sa     = Q[active], S[active]
        qi[active] = Aa * Qa / (Aa - Sa)
        si[active] = Aa * Sa / (Aa - Sa)
        return [si, qi, S, Q]

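
# Sanity-check sketch (added, not part of the original script): numerically
# verifies the Woodbury-style identity quoted in the docstring of
# _sparsity_quality above,
#   C    = 1/beta * I + X * diag(1/alpha) * X'
#   C^-1 = beta * I - beta^2 * X * Sn * X',  with  Sn = (diag(alpha) + beta * X'X)^-1.
# The sizes and random values below are arbitrary illustrative assumptions.
def _check_woodbury_identity(beta=2.0, n_samples=6, n_features=3, seed=0):
    rng   = np.random.RandomState(seed)
    X     = rng.randn(n_samples, n_features)
    alpha = rng.uniform(0.5, 2.0, size=n_features)                # prior precisions
    C     = np.eye(n_samples) / beta + np.dot(X / alpha, X.T)     # X / alpha == X * diag(1/alpha)
    Sn    = np.linalg.inv(np.diag(alpha) + beta * np.dot(X.T, X))
    C_inv = beta * np.eye(n_samples) - beta**2 * np.dot(np.dot(X, Sn), X.T)
    return np.allclose(np.linalg.inv(C), C_inv)                   # expected: True
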
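
# A minimal usage sketch (added, not part of the original script): fits
# RegressionARD on synthetic sparse-regression data and inspects the posterior
# summaries.  The sizes, seed and true weights below are illustrative assumptions.
def _demo_regression_ard():
    rng = np.random.RandomState(0)
    n_samples, n_features = 200, 20
    X = rng.randn(n_samples, n_features)
    w = np.zeros(n_features)
    w[:3] = [1.5, -2.0, 3.0]                       # only the first 3 features are relevant
    y = np.dot(X, w) + 0.1 * rng.randn(n_samples)
    reg = RegressionARD(n_iter=300, tol=1e-3).fit(X, y)
    y_hat, var_hat = reg.predict_dist(X)           # predictive mean and variance
    print('active features :', np.where(reg.active_)[0])
    print('noise precision :', reg.alpha_)
    return reg, y_hat, var_hat
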

#----------------------- Classification ARD -----------------------------------

def _logistic_cost_grad(X, Y, w, diagA):
    '''
    Calculates cost and gradient for logistic regression
    '''
    n      = X.shape[0]
    Xw     = np.dot(X, w)
    s      = expit(Xw)
    wdA    = w * diagA
    wdA[0] = 1e-3  # broad prior for bias term => almost no regularization
    cost   = np.sum(Xw * (1 - Y) - log_logistic(Xw)) + np.sum(w * wdA) / 2
    grad   = np.dot(X.T, s - Y) + wdA
    return [cost / n, grad / n]

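
# Sanity-check sketch (added, not part of the original script): compares the
# analytic gradient returned by _logistic_cost_grad with a finite-difference
# estimate via scipy.optimize.check_grad.  The toy data below are illustrative
# assumptions; the bias entry of the penalty uses a deliberate shortcut above,
# so only approximate agreement is expected.
def _check_logistic_cost_grad(seed=1):
    from scipy.optimize import check_grad
    rng   = np.random.RandomState(seed)
    X     = np.concatenate([np.ones([30, 1]), rng.randn(30, 3)], axis=1)  # bias + 3 features
    Y     = (rng.rand(30) > 0.5).astype(np.float64)
    diagA = np.ones(X.shape[1])
    cost  = lambda w: _logistic_cost_grad(X, Y, w, diagA)[0]
    grad  = lambda w: _logistic_cost_grad(X, Y, w, diagA)[1]
    return check_grad(cost, grad, rng.randn(X.shape[1]))   # small value => gradients agree
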
class ClassificationARD(BaseEstimator, LinearClassifierMixin):
    '''
    Logistic Regression with Automatic Relevance Determination (Fast Version uses
    Sparse Bayesian Learning)

    Parameters
    ----------
    n_iter: int, optional (DEFAULT = 100)
        Maximum number of iterations before termination

    tol: float, optional (DEFAULT = 1e-4)
        If absolute change in precision parameter for weights is below threshold
        algorithm terminates.

    normalize: bool, optional (DEFAULT = True)
        If True normalizes features

    n_iter_solver: int, optional (DEFAULT = 15)
        Maximum number of iterations before termination of solver

    tol_solver: float, optional (DEFAULT = 1e-4)
        Convergence threshold for solver (it is used in estimating posterior
        distribution)

    fit_intercept : bool, optional ( DEFAULT = True )
        If True will use intercept in the model. If set
        to false, no intercept will be used in calculations

    verbose : boolean, optional (DEFAULT = False)
        Verbose mode when fitting the model


    Attributes
    ----------
    coef_ : array, shape = (n_features)
        Coefficients of the regression model (mean of posterior distribution)

    lambda_ : array, shape = (n_features)
        estimated precisions of weights

    active_ : array, dtype = bool, shape = (n_features)
        True for non-zero coefficients, False otherwise

    sigma_ : array, shape = (n_features, n_features)
        estimated covariance matrix of the weights, computed only
        for non-zero coefficients


    References
    ----------
    [1] Fast marginal likelihood maximisation for sparse Bayesian models (Tipping & Faul 2003)
        (http://www.miketipping.com/papers/met-fastsbl.pdf)
    [2] Analysis of sparse Bayesian learning (Tipping & Faul 2001)
        (http://www.miketipping.com/abstracts.htm#Faul:NIPS01)
    '''
    def __init__(self, n_iter=100, tol=1e-4, n_iter_solver=15, normalize=True,
                 tol_solver=1e-4, fit_intercept=True, verbose=False):
        self.n_iter        = n_iter
        self.tol           = tol
        self.n_iter_solver = n_iter_solver
        self.normalize     = normalize
        self.tol_solver    = tol_solver
        self.fit_intercept = fit_intercept
        self.verbose       = verbose


    def fit(self, X, y):
        '''
        Fits Logistic Regression with ARD

        Parameters
        ----------
        X: array-like of size [n_samples, n_features]
           Training data, matrix of explanatory variables

        y: array-like of size [n_samples]
           Target values

        Returns
        -------
        self : object
            Returns self.
        '''
        X, y = check_X_y(X, y, accept_sparse=None, dtype=np.float64)

        # normalize, if required
        if self.normalize:
            self._x_mean = np.mean(X, 0)
            self._x_std  = np.std(X, 0)
            X            = (X - self._x_mean) / self._x_std

        # add bias term if required
        if self.fit_intercept:
            X = np.concatenate((np.ones([X.shape[0], 1]), X), 1)

        # preprocess targets
        check_classification_targets(y)
        self.classes_ = np.unique(y)
        n_classes = len(self.classes_)
        if n_classes < 2:
            raise ValueError("Need samples of at least 2 classes"
                             " in the data, but the data contains only one"
                             " class: %r" % self.classes_[0])

        # if multiclass use OVR (i.e. fit one classifier for each class)
        if n_classes > 2:
            self.coef_, self.sigma_        = [0]*n_classes, [0]*n_classes
            self.intercept_, self.active_  = [0]*n_classes, [0]*n_classes
            self.lambda_                   = [0]*n_classes
        else:
            self.coef_, self.sigma_, self.intercept_, self.active_ = [0], [0], [0], [0]
            self.lambda_ = [0]

        for i in range(len(self.classes_)):
            if n_classes == 2:
                pos_class = self.classes_[1]
            else:
                pos_class = self.classes_[i]
            mask = (y == pos_class)
            y_bin = np.zeros(y.shape, dtype=np.float64)
            y_bin[mask] = 1
            coef, bias, active, sigma, lambda_ = self._fit(X, y_bin)
            self.coef_[i], self.intercept_[i], self.sigma_[i] = coef, bias, sigma
            self.active_[i], self.lambda_[i] = active, lambda_
            # in case of binary classification fit only one classifier
            if n_classes == 2:
                break
        self.coef_      = np.asarray(self.coef_)
        self.intercept_ = np.asarray(self.intercept_)
        return self


    def _fit(self, X, y):
        '''
        Fits binary classification
        '''
        n_samples, n_features = X.shape
        A      = np.inf * np.ones(n_features)
        active = np.zeros(n_features, dtype=bool)

        # if we fit an intercept, make it active from the beginning
        if self.fit_intercept:
            active[0] = True
            A[0]      = np.finfo(np.float16).eps

        warning_flag = 0
        for i in range(self.n_iter):
            Xa = X[:, active]
            Aa = A[active]

            # mean & precision of posterior distribution
            Mn, Sn, B, t_hat, cholesky = self._posterior_dist(Xa, y, Aa)
            if not cholesky:
                warning_flag += 1

            # raise warning in case cholesky fails (but only once)
            if warning_flag == 1:
                warnings.warn(("Cholesky decomposition failed! Algorithm uses pinvh, "
                               "which is significantly slower. If you use RVC it "
                               "is advised to change the parameters of the kernel."))

            # compute quality & sparsity parameters
            s, q, S, Q = self._sparsity_quality(X, Xa, t_hat, B, A, Aa, active, Sn, cholesky)

            # update precision parameters of coefficients
            A, converged = update_precisions(Q, S, q, s, A, active, self.tol, n_samples, self.fit_intercept)

            # terminate if converged
            if converged or i == self.n_iter - 1:
                break

        Xa, Aa = X[:, active], A[active]
        Mn, Sn, B, t_hat, cholesky = self._posterior_dist(Xa, y, Aa)
        # in case Sn is the inverse of the lower triangular matrix of the Cholesky decomposition,
        # compute the covariance using the formula Sn = np.dot(Rinverse.T , Rinverse)
        if cholesky:
            Sn = np.dot(Sn.T, Sn)
        intercept_ = 0
        if self.fit_intercept:
            n_features -= 1
            if active[0] == True:
                intercept_ = Mn[0]
                Mn         = Mn[1:]
            active = active[1:]
        coef_ = np.zeros([1, n_features])
        coef_[0, active] = Mn
        return coef_.squeeze(), intercept_, active, Sn, A


    def predict(self, X):
        '''
        Estimates target values on test set

        Parameters
        ----------
        X: array-like of size (n_samples_test, n_features)
           Matrix of explanatory variables

        Returns
        -------
        y_pred: numpy array of size (n_samples_test,)
           Predicted values of targets
        '''
        probs   = self.predict_proba(X)
        indices = np.argmax(probs, axis=1)
        y_pred  = self.classes_[indices]
        return y_pred


    def _decision_function_active(self, X, coef_, active_, intercept_):
        ''' Constructs decision function using only relevant features '''
        if self.normalize:
            X = (X - self._x_mean[active_]) / self._x_std[active_]
        decision = safe_sparse_dot(X, coef_[active_]) + intercept_
        return decision


    def decision_function(self, X):
        '''
        Computes distance to the separating hyperplane between classes. The larger
        the absolute value of the decision function, the further the data point
        is from the decision boundary.

        Parameters
        ----------
        X: array-like of size (n_samples_test, n_features)
           Matrix of explanatory variables

        Returns
        -------
        decision: numpy array of size (n_samples_test,)
           Distance to decision boundary
        '''
        check_is_fitted(self, 'coef_')
        X = check_array(X, accept_sparse=None, dtype=np.float64)
        n_features = self.coef_.shape[1]
        if X.shape[1] != n_features:
            raise ValueError("X has %d features per sample; expecting %d"
                             % (X.shape[1], n_features))
        decision = [self._decision_function_active(X[:, active], coef, active, bias) for
                    coef, active, bias in zip(self.coef_, self.active_, self.intercept_)]
        decision = np.asarray(decision).squeeze().T
        return decision


    def predict_proba(self, X):
        '''
        Predicts probabilities of targets for test set using the probit
        function to approximate the convolution of sigmoid and Gaussian.

        Parameters
        ----------
        X: array-like of size (n_samples_test, n_features)
           Matrix of explanatory variables

        Returns
        -------
        probs: numpy array of size (n_samples_test, n_classes)
           Estimated probabilities of target classes
        '''
        y_hat = self.decision_function(X)
        X = check_array(X, accept_sparse=None, dtype=np.float64)
        if self.normalize:
            X = (X - self._x_mean) / self._x_std
        if self.fit_intercept:
            X = np.concatenate((np.ones([X.shape[0], 1]), X), 1)
        if y_hat.ndim == 1:
            pr = self._convo_approx(X[:, self.lambda_[0] != np.inf],
                                    y_hat, self.sigma_[0])
            prob = np.vstack([1 - pr, pr]).T
        else:
            pr = [self._convo_approx(X[:, idx != np.inf], y_hat[:, i],
                                     self.sigma_[i]) for i, idx in enumerate(self.lambda_)]
            pr = np.asarray(pr).T
            prob = pr / np.reshape(np.sum(pr, axis=1), (pr.shape[0], 1))
        return prob


    def _convo_approx(self, X, y_hat, sigma):
        ''' Computes approximation to the convolution of sigmoid and gaussian '''
        var = np.sum(np.dot(X, sigma) * X, 1)
        ks  = 1. / (1. + np.pi * var / 8)**0.5
        pr  = expit(y_hat * ks)
        return pr


    def _sparsity_quality(self, X, Xa, y, B, A, Aa, active, Sn, cholesky):
        '''
        Calculates sparsity & quality parameters for each feature
        '''
        XB  = X.T * B
        bxx = np.dot(B, X**2)
        Q   = np.dot(X.T, y)
        if cholesky:
            # here Sn is the inverse of the lower triangular matrix obtained from
            # the cholesky decomposition
            XBX = np.dot(XB, Xa)
            XBX = np.dot(XBX, Sn, out=XBX)
            S   = bxx - np.sum(XBX**2, 1)
        else:
            XSX = np.dot(np.dot(Xa, Sn), Xa.T)
            S   = bxx - np.sum(np.dot(XB, XSX) * XB, 1)
        qi = np.copy(Q)
        si = np.copy(S)
        Qa, Sa     = Q[active], S[active]
        qi[active] = Aa * Qa / (Aa - Sa)
        si[active] = Aa * Sa / (Aa - Sa)
        return [si, qi, S, Q]


    def _posterior_dist(self, X, y, A):
        '''
        Uses Laplace approximation for calculating posterior distribution
        '''
        f = lambda w: _logistic_cost_grad(X, y, w, A)
        w_init = np.random.random(X.shape[1])
        Mn = fmin_l_bfgs_b(f, x0=w_init, pgtol=self.tol_solver,
                           maxiter=self.n_iter_solver)[0]
        Xm = np.dot(X, Mn)
        s  = expit(Xm)
        B  = logistic._pdf(Xm)  # avoids underflow
        S  = np.dot(X.T * B, X)
        np.fill_diagonal(S, np.diag(S) + A)
        t_hat = y - s
        cholesky = True
        # try using Cholesky, if it fails then fall back on pinvh
        try:
            R  = np.linalg.cholesky(S)
            Sn = solve_triangular(R, np.eye(A.shape[0]),
                                  check_finite=False, lower=True)
        except LinAlgError:
            Sn = pinvh(S)
            cholesky = False
        return [Mn, Sn, B, t_hat, cholesky]

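
# A minimal usage sketch (added, not part of the original script): fits
# ClassificationARD on synthetic binary data.  The sizes, seed and class
# separation below are illustrative assumptions.
def _demo_classification_ard():
    rng = np.random.RandomState(0)
    n_per_class, n_features = 100, 5
    X = np.vstack([rng.randn(n_per_class, n_features) - 1,
                   rng.randn(n_per_class, n_features) + 1])
    y = np.hstack([np.zeros(n_per_class), np.ones(n_per_class)])
    clf = ClassificationARD(n_iter=50).fit(X, y)
    probs = clf.predict_proba(X)                   # probit approximation to class probabilities
    print('accuracy         :', np.mean(clf.predict(X) == y))
    print('active per class :', [np.sum(a) for a in clf.active_])
    return clf, probs
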

###############################################################################
# Relevance Vector Machine: RVR and RVC
###############################################################################

def get_kernel(X, Y, gamma, degree, coef0, kernel, kernel_params):
    '''
    Calculates kernelised features for RVR and RVC
    '''
    if callable(kernel):
        params = kernel_params or {}
    else:
        params = {"gamma": gamma,
                  "degree": degree,
                  "coef0": coef0}
    return pairwise_kernels(X, Y, metric=kernel,
                            filter_params=True, **params)


class RVR(RegressionARD):
    '''
    Relevance Vector Regression (Fast Version uses Sparse Bayesian Learning)

    Parameters
    ----------
    n_iter: int, optional (DEFAULT = 300)
        Maximum number of iterations

    fit_intercept : boolean, optional (DEFAULT = True)
        whether to calculate the intercept for this model

    tol: float, optional (DEFAULT = 1e-3)
        If absolute change in precision parameter for weights is below tol
        algorithm terminates.

    copy_X : boolean, optional (DEFAULT = True)
        If True, X will be copied; else, it may be overwritten.

    verbose : boolean, optional (DEFAULT = False)
        Verbose mode when fitting the model

    kernel: str, optional (DEFAULT = 'poly')
        Type of kernel to be used (all kernels: ['rbf' | 'poly' | 'sigmoid' | 'linear'])

    degree : int, (DEFAULT = 3)
        Degree for poly kernels. Ignored by other kernels.

    gamma : float, optional (DEFAULT = 1/n_features)
        Kernel coefficient for rbf and poly kernels, ignored by other kernels

    coef0 : float, optional (DEFAULT = 1)
        Independent term in poly and sigmoid kernels, ignored by other kernels

    kernel_params : mapping of string to any, optional
        Parameters (keyword arguments) and values for kernel passed as
        callable object, ignored by other kernels


    Attributes
    ----------
    coef_ : array, shape = (n_features)
        Coefficients of the regression model (mean of posterior distribution)

    alpha_ : float
        estimated precision of the noise

    active_ : array, dtype = bool, shape = (n_features)
        True for non-zero coefficients, False otherwise

    lambda_ : array, shape = (n_features)
        estimated precisions of the coefficients

    sigma_ : array, shape = (n_features, n_features)
        estimated covariance matrix of the weights, computed only
        for non-zero coefficients

    relevant_vectors_ : array
        Relevant Vectors

    References
    ----------
    [1] Fast marginal likelihood maximisation for sparse Bayesian models (Tipping & Faul 2003)
        (http://www.miketipping.com/papers/met-fastsbl.pdf)
    [2] Analysis of sparse Bayesian learning (Tipping & Faul 2001)
        (http://www.miketipping.com/abstracts.htm#Faul:NIPS01)
    '''
    def __init__(self, n_iter=300, tol=1e-3, fit_intercept=True, copy_X=True,
                 verbose=False, kernel='poly', degree=3, gamma=None,
                 coef0=1, kernel_params=None):
        super(RVR, self).__init__(n_iter, tol, fit_intercept, copy_X, verbose)
        self.kernel        = kernel
        self.degree        = degree
        self.gamma         = gamma
        self.coef0         = coef0
        self.kernel_params = kernel_params


    def fit(self, X, y):
        '''
        Fit Relevance Vector Regression Model

        Parameters
        -----------
        X: {array-like, sparse matrix} of size (n_samples, n_features)
           Training data, matrix of explanatory variables

        y: array-like of size (n_samples, )
           Target values

        Returns
        -------
        self: object
           self
        '''
        X, y = check_X_y(X, y, accept_sparse=['csr', 'coo', 'bsr'], dtype=np.float64)
        # kernelise features
        K = get_kernel(X, X, self.gamma, self.degree, self.coef0,
                       self.kernel, self.kernel_params)

        # use fit method of RegressionARD
        _ = super(RVR, self).fit(K, y)

        # convert to csr (need to use __getitem__)
        convert_tocsr = [scipy.sparse.coo_matrix,
                         scipy.sparse.dia_matrix,
                         scipy.sparse.bsr_matrix]
        if type(X) in convert_tocsr:
            X = X.tocsr()
        self.relevant_ = np.where(self.active_ == True)[0]
        if X.ndim == 1:
            self.relevant_vectors_ = X[self.relevant_]
        else:
            self.relevant_vectors_ = X[self.relevant_, :]
        return self


    def _decision_function(self, X):
        ''' Decision function '''
        _, predict_vals = self._kernel_decision_function(X)
        return predict_vals


    def _kernel_decision_function(self, X):
        ''' Computes kernel and decision function based on kernel '''
        check_is_fitted(self, 'coef_')
        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])
        K = get_kernel(X, self.relevant_vectors_, self.gamma, self.degree,
                       self.coef0, self.kernel, self.kernel_params)
        return K, np.dot(K, self.coef_[self.active_]) + self.intercept_


    def predict_dist(self, X):
        '''
        Computes predictive distribution for test set. Predictive distribution
        for each data point is one dimensional Gaussian and therefore is
        characterised by mean and variance.

        Parameters
        ----------
        X: {array-like, sparse matrix} of size (n_samples_test, n_features)
           Matrix of explanatory variables

        Returns
        -------
        : list of length two [y_hat, var_hat]

             y_hat: numpy array of size (n_samples_test,)
                    Estimated values of targets on test set (i.e. mean of predictive
                    distribution)

             var_hat: numpy array of size (n_samples_test,)
                    Variance of predictive distribution
        '''
        # kernel matrix and mean of predictive distribution
        K, y_hat = self._kernel_decision_function(X)
        var_hat  = 1. / self.alpha_
        var_hat += np.sum(np.dot(K, self.sigma_) * K, axis=1)
        return y_hat, var_hat

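
# A minimal usage sketch (added, not part of the original script): fits a
# Relevance Vector Regression model to a noisy 1-d function with an RBF kernel.
# The target function, noise level and kernel width are illustrative assumptions.
def _demo_rvr():
    rng = np.random.RandomState(0)
    X = np.linspace(-5, 5, 200)[:, None]
    y = np.sinc(X[:, 0]) + 0.1 * rng.randn(200)
    rvr = RVR(kernel='rbf', gamma=0.5).fit(X, y)
    y_hat, var_hat = rvr.predict_dist(X)            # predictive mean and variance
    print('number of relevance vectors:', rvr.relevant_vectors_.shape[0])
    return rvr, y_hat, var_hat
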
class RVC(ClassificationARD):
    '''
    Relevance Vector Classifier (Fast Version, uses Sparse Bayesian Learning)

    Parameters
    ----------
    n_iter: int, optional (DEFAULT = 100)
        Maximum number of iterations before termination

    tol: float, optional (DEFAULT = 1e-4)
        If absolute change in precision parameter for weights is below tol, then
        the algorithm terminates.

    n_iter_solver: int, optional (DEFAULT = 15)
        Maximum number of iterations before termination of solver

    tol_solver: float, optional (DEFAULT = 1e-4)
        Convergence threshold for solver (it is used in estimating posterior
        distribution)

    fit_intercept : bool, optional ( DEFAULT = True )
        If True will use intercept in the model

    verbose : boolean, optional (DEFAULT = False)
        Verbose mode when fitting the model

    kernel: str, optional (DEFAULT = 'rbf')
        Type of kernel to be used (all kernels: ['rbf' | 'poly' | 'sigmoid'])

    degree : int, (DEFAULT = 2)
        Degree for poly kernels. Ignored by other kernels.

    gamma : float, optional (DEFAULT = 1/n_features)
        Kernel coefficient for rbf and poly kernels, ignored by other kernels

    coef0 : float, optional (DEFAULT = 1)
        Independent term in poly and sigmoid kernels, ignored by other kernels

    kernel_params : mapping of string to any, optional
        Parameters (keyword arguments) and values for kernel passed as
        callable object, ignored by other kernels


    Attributes
    ----------
    coef_ : array, shape = (n_features)
        Coefficients of the model (mean of posterior distribution)

    lambda_ : array, shape = (n_features)
        Estimated precisions of weights

    active_ : array, dtype = bool, shape = (n_features)
        True for non-zero coefficients, False otherwise

    sigma_ : array, shape = (n_features, n_features)
        Estimated covariance matrix of the weights, computed only for non-zero
        coefficients


    References
    ----------
    [1] Fast marginal likelihood maximisation for sparse Bayesian models (Tipping & Faul 2003)
        (http://www.miketipping.com/papers/met-fastsbl.pdf)
    [2] Analysis of sparse Bayesian learning (Tipping & Faul 2001)
        (http://www.miketipping.com/abstracts.htm#Faul:NIPS01)
    '''

    def __init__(self, n_iter=100, tol=1e-4, n_iter_solver=15, tol_solver=1e-4,
                 fit_intercept=True, verbose=False, kernel='rbf', degree=2,
                 gamma=None, coef0=1, kernel_params=None):
        super(RVC, self).__init__(n_iter, tol, n_iter_solver, True, tol_solver,
                                  fit_intercept, verbose)
        self.kernel        = kernel
        self.degree        = degree
        self.gamma         = gamma
        self.coef0         = coef0
        self.kernel_params = kernel_params


    def fit(self, X, y):
        '''
        Fit Relevance Vector Classifier

        Parameters
        -----------
        X: array-like of size [n_samples, n_features]
           Training data, matrix of explanatory variables

        y: array-like of size [n_samples]
           Target values

        Returns
        -------
        self: object
           self
        '''
        X, y = check_X_y(X, y, accept_sparse=None, dtype=np.float64)
        # kernelise features
        K = get_kernel(X, X, self.gamma, self.degree, self.coef0,
                       self.kernel, self.kernel_params)
        # use fit method of ClassificationARD
        _ = super(RVC, self).fit(K, y)
        self.relevant_ = [np.where(active == True)[0] for active in self.active_]
        if X.ndim == 1:
            self.relevant_vectors_ = [X[relevant_] for relevant_ in self.relevant_]
        else:
            self.relevant_vectors_ = [X[relevant_, :] for relevant_ in self.relevant_]
        return self


    def decision_function(self, X):
        '''
        Computes distance to the separating hyperplane between classes. The larger
        the absolute value of the decision function, the further the data point
        is from the decision boundary.

        Parameters
        ----------
        X: array-like of size (n_samples_test, n_features)
           Matrix of explanatory variables

        Returns
        -------
        decision: numpy array of size (n_samples_test,)
           Distance to decision boundary
        '''
        check_is_fitted(self, 'coef_')
        X = check_array(X, accept_sparse=None, dtype=np.float64)
        n_features = self.relevant_vectors_[0].shape[1]
        if X.shape[1] != n_features:
            raise ValueError("X has %d features per sample; expecting %d"
                             % (X.shape[1], n_features))
        kernel = lambda rvs: get_kernel(X, rvs, self.gamma, self.degree,
                                        self.coef0, self.kernel, self.kernel_params)
        decision = []
        for rv, cf, act, b in zip(self.relevant_vectors_, self.coef_, self.active_,
                                  self.intercept_):
            # if there are no relevant vectors => use intercept only
            if rv.shape[0] == 0:
                decision.append(np.ones(X.shape[0]) * b)
            else:
                decision.append(self._decision_function_active(kernel(rv), cf, act, b))
        decision = np.asarray(decision).squeeze().T
        return decision


    def predict_proba(self, X):
        '''
        Predicts probabilities of targets.

        Theoretical Note
        ================
        The current version of this method does not use MacKay's approximation
        to the convolution of Gaussian and sigmoid. This results in a less accurate
        estimation of class probabilities and therefore a possible increase
        in misclassification error for multiclass problems (prediction accuracy
        for binary classification problems is not changed).

        Parameters
        ----------
        X: array-like of size (n_samples_test, n_features)
           Matrix of explanatory variables

        Returns
        -------
        probs: numpy array of size (n_samples_test, n_classes)
           Estimated probabilities of target classes
        '''
        prob = expit(self.decision_function(X))
        if prob.ndim == 1:
            prob = np.vstack([1 - prob, prob]).T
        prob = prob / np.reshape(np.sum(prob, axis=1), (prob.shape[0], 1))
        return prob

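
# A minimal usage sketch (added, not part of the original script): fits a
# Relevance Vector Classifier on synthetic two-class data with an RBF kernel.
# The sizes, seed and kernel parameters below are illustrative assumptions.
def _demo_rvc():
    rng = np.random.RandomState(0)
    n_per_class = 100
    X = np.vstack([rng.randn(n_per_class, 2) - 1.5,
                   rng.randn(n_per_class, 2) + 1.5])
    y = np.hstack([np.zeros(n_per_class), np.ones(n_per_class)])
    rvc = RVC(kernel='rbf', gamma=1.0).fit(X, y)
    print('accuracy                    :', np.mean(rvc.predict(X) == y))
    print('relevance vectors per class :', [rv.shape[0] for rv in rvc.relevant_vectors_])
    return rvc


if __name__ == '__main__':
    # The demo/check helpers above are optional additions; running the script
    # exercises the estimators end to end on the toy problems defined above.
    print('Woodbury identity holds     :', _check_woodbury_identity())
    print('logistic gradient-check err :', _check_logistic_cost_grad())
    _demo_regression_ard()
    _demo_classification_ard()
    _demo_rvr()
    _demo_rvc()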