Path: blob/master/deprecated/scripts/ard_vb_linreg_logreg.py
# Bayesian Linear and logistic Regression with ARD prior (fitted with Variational Bayes)
# https://github.com/AmazaspShumik/sklearn-bayes/blob/master/skbayes/rvm_ard_models/vrvm.py

import superimport

import numpy as np
from scipy.special import expit
from scipy.linalg import solve_triangular
try:
    from sklearn.linear_model._base import LinearModel, LinearClassifierMixin
except ImportError:  # older scikit-learn releases expose these in the public module
    from sklearn.linear_model.base import LinearModel, LinearClassifierMixin
from sklearn.base import RegressorMixin, BaseEstimator
from sklearn.utils import check_X_y, check_array
from sklearn.metrics.pairwise import pairwise_kernels
from sklearn.utils.validation import check_is_fitted
from sklearn.utils.multiclass import check_classification_targets
from sklearn.utils import as_float_array
import warnings


class VBRegressionARD(LinearModel, RegressorMixin):
    '''
    Bayesian Linear Regression with ARD prior (fitted with Variational Bayes)

    Parameters
    ----------
    n_iter: int, optional (DEFAULT = 100)
        Maximum number of iterations

    fit_intercept: boolean, optional (DEFAULT = True)
        If True, intercept will be used in computation

    tol: float, optional (DEFAULT = 1e-3)
        If the absolute change in the precision parameters for the weights is
        below this threshold, the algorithm terminates.

    copy_X: boolean, optional (DEFAULT = True)
        If True, X will be copied, otherwise it may be overwritten.

    verbose: boolean, optional (DEFAULT = False)
        Verbose mode when fitting the model

    a: float, optional (DEFAULT = 1e-5)
        Shape parameter for Gamma distributed precision of weights

    b: float, optional (DEFAULT = 1e-5)
        Rate parameter for Gamma distributed precision of weights

    c: float, optional (DEFAULT = 1e-5)
        Shape parameter for Gamma distributed precision of noise

    d: float, optional (DEFAULT = 1e-5)
        Rate parameter for Gamma distributed precision of noise

    prune_thresh: float, optional (DEFAULT = 1e-3)
        Threshold for pruning out variables (applied after model is fitted)

    Attributes
    ----------
    coef_ : array, shape = (n_features,)
        Coefficients of the regression model (mean of posterior distribution)

    active_ : array, dtype = bool, shape = (n_features,)
        True for non-zero coefficients, False otherwise

    sigma_ : array, shape = (n_active_features, n_active_features)
        Estimated covariance matrix of the weights, computed only
        for non-zero coefficients

    References
    ----------
    [1] Bishop & Tipping (2000), Variational Relevance Vector Machine
    [2] Jan Drugowitsch (2014), Variational Bayesian Inference for Bayesian Linear
        and Logistic Regression
    [3] Bishop (2006), Pattern Recognition and Machine Learning (ch. 7)
    '''

    def __init__(self, n_iter=100, tol=1e-3, fit_intercept=True,
                 a=1e-5, b=1e-5, c=1e-5, d=1e-5, copy_X=True,
                 prune_thresh=1e-3, verbose=False):
        self.n_iter = n_iter
        self.tol = tol
        self.fit_intercept = fit_intercept
        self.a, self.b = a, b
        self.c, self.d = c, d
        self.copy_X = copy_X
        self.verbose = verbose
        self.prune_thresh = prune_thresh

    def _center_data(self, X, y):
        ''' Centers data '''
        X = as_float_array(X, copy=self.copy_X)
        # normalisation should be done in preprocessing!
        X_std = np.ones(X.shape[1], dtype=X.dtype)
        if self.fit_intercept:
            X_mean = np.average(X, axis=0)
            y_mean = np.average(y, axis=0)
            X -= X_mean
            y = y - y_mean
        else:
            X_mean = np.zeros(X.shape[1], dtype=X.dtype)
            y_mean = 0. if y.ndim == 1 else np.zeros(y.shape[1], dtype=X.dtype)
        return X, y, X_mean, y_mean, X_std
    def fit(self, X, y):
        '''
        Fits variational Bayesian linear regression with ARD prior

        Parameters
        ----------
        X: array-like of size [n_samples, n_features]
           Training data, matrix of explanatory variables

        y: array-like of size (n_samples,)
           Target values

        Returns
        -------
        self : object
            Returns self.
        '''
        # precompute some values for faster iterations
        X, y = check_X_y(X, y, dtype=np.float64, y_numeric=True)
        n_samples, n_features = X.shape
        X, y, X_mean, y_mean, X_std = self._center_data(X, y)
        XX = np.dot(X.T, X)
        XY = np.dot(X.T, y)
        Y2 = np.sum(y ** 2)

        # shape parameters of q(alpha) and q(tau); these do not change during iterations
        a = (self.a + 0.5) * np.ones(n_features)
        c = self.c + 0.5 * n_samples
        # initial values of rate parameters b, d before mean field updates
        d = self.d
        b = self.b * np.ones(n_features)
        active = np.ones(n_features, dtype=bool)
        w0 = np.zeros(n_features)
        w = np.copy(w0)

        for i in range(self.n_iter):
            # ---------------------- update q(w) -----------------------
            # expectations for precision of noise & precision of weights
            e_tau = c / d
            e_A = a / b
            XXa = XX[active, :][:, active]
            XYa = XY[active]
            Xa = X[:, active]
            # parameters of updated posterior distribution
            w[active], Ri = self._posterior_weights(XXa, XYa, e_tau, e_A[active])

            # --------------------- update q(tau) ------------------------
            # update rate parameter for Gamma distributed precision of noise
            XSX = np.sum(np.dot(Xa, Ri.T) ** 2)
            XMw = np.sum(np.dot(Xa, w[active]) ** 2)
            XYw = np.sum(w[active] * XYa)
            d = self.d + 0.5 * (Y2 + XMw + XSX) - XYw

            # -------------------- update q(alpha(j)) for each j ----------
            # update rate parameter for Gamma distributed precision of weights;
            # np.sum(Ri ** 2, axis=0) is the diagonal of the posterior covariance Ri.T @ Ri
            b[active] = self.b + 0.5 * (w[active] ** 2 + np.sum(Ri ** 2, axis=0))

            # -------------------- check convergence ----------------------
            # determine relevant features as described in Bishop & Tipping
            # (i.e. using mean of posterior distribution)
            active = np.abs(w) > self.prune_thresh

            # make sure there is at least one relevant feature
            if np.sum(active) == 0:
                active[np.argmax(np.abs(w))] = True
            # all irrelevant features are forced to zero
            w[~active] = 0
            # check convergence
            if np.sum(abs(w - w0) > self.tol) == 0 or i == self.n_iter - 1:
                break
            w0 = np.copy(w)
            # if only one relevant feature => terminate
            if np.sum(active) == 1:
                if X.shape[1] > 3 and self.prune_thresh > 1e-1:
                    warnings.warn("Only one relevant feature found! It can be useful "
                                  "to decrease the value of parameter prune_thresh")
                break

        # update parameters after the last iteration
        e_tau = c / d
        e_A = a / b
        XXa = XX[active, :][:, active]
        XYa = XY[active]
        w[active], self.sigma_ = self._posterior_weights(XXa, XYa, e_tau, e_A[active], True)
        self._e_tau_ = e_tau
        self.coef_ = w
        self._set_intercept(X_mean, y_mean, X_std)
        self.active_ = active
        return self

    def predict_dist(self, X):
        '''
        Computes the predictive distribution for a test set.
        The predictive distribution for each data point is a one dimensional
        Gaussian and is therefore characterised by its mean and variance.

        Parameters
        ----------
        X: {array-like, sparse} [n_samples_test, n_features]
           Test data, matrix of explanatory variables

        Returns
        -------
        y_hat: numpy array of size (n_samples_test,)
           Estimated values of targets on test set (mean of predictive distribution)

        var_hat: numpy array of size (n_samples_test,)
           Variance of predictive distribution (noise variance + model uncertainty)
        '''
        y_hat = self._decision_function(X)
        data_noise = 1. / self._e_tau_
        model_noise = np.sum(np.dot(X[:, self.active_], self.sigma_) * X[:, self.active_], axis=1)
        var_hat = data_noise + model_noise
        return y_hat, var_hat

    def _posterior_weights(self, XX, XY, exp_tau, exp_A, full_covar=False):
        '''
        Calculates parameters of the posterior distribution of weights

        Parameters
        ----------
        XX: numpy array of size [n_active_features, n_active_features]
            X.T @ X restricted to active features (changes at each iteration)

        XY: numpy array of size (n_active_features,)
            X.T @ y restricted to active features (for faster computations)

        exp_tau: float
            Mean of precision parameter of noise

        exp_A: numpy array of size (n_active_features,)
            Vector of precisions for weights

        Returns
        -------
        [Mw, Sigma]: list of two numpy arrays
            Mw: mean of posterior distribution
            Sigma: covariance matrix if full_covar is True, otherwise the inverse
            Cholesky factor Ri of the posterior precision (Sigma = Ri.T @ Ri)
        '''
        # compute posterior precision matrix
        S = exp_tau * XX
        np.fill_diagonal(S, np.diag(S) + exp_A)

        # cholesky decomposition
        R = np.linalg.cholesky(S)

        # find mean of posterior distribution
        RtMw = solve_triangular(R, exp_tau * XY, lower=True, check_finite=False)
        Mw = solve_triangular(R.T, RtMw, lower=False, check_finite=False)

        # use cholesky decomposition of S to find inverse (or diagonal of inverse)
        Ri = solve_triangular(R, np.eye(R.shape[1]), lower=True, check_finite=False)
        if full_covar:
            Sigma = np.dot(Ri.T, Ri)
            return [Mw, Sigma]
        else:
            return [Mw, Ri]
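
# -----------------------------------------------------------------------------
# Minimal usage sketch for VBRegressionARD (illustrative only, not part of the
# original module; the helper name and the synthetic data are made up).  It fits
# the model on data where only the first three features are relevant and shows
# how the ARD prior prunes the rest.
def _demo_vb_regression_ard(n_samples=200, n_features=20, seed=0):
    rng = np.random.RandomState(seed)
    X = rng.randn(n_samples, n_features)
    w_true = np.zeros(n_features)
    w_true[:3] = [2.0, -3.0, 1.5]               # only three relevant features
    y = X @ w_true + 0.1 * rng.randn(n_samples)
    model = VBRegressionARD(n_iter=100, prune_thresh=1e-2).fit(X, y)
    y_hat, var_hat = model.predict_dist(X)      # predictive mean and variance
    print("active features:", np.where(model.active_)[0])
    print("coefficients   :", np.round(model.coef_[model.active_], 2))
    print("mean pred. var :", float(np.mean(var_hat)))
    return model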
# ---------------------------- Classification ----------------------------------


def lam(eps):
    '''
    Calculates lambda(eps), part of the local variational approximation
    to the sigmoid function
    '''
    return 0.5 / eps * (expit(eps) - 0.5)
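
# Note: lam(eps) is the lambda(xi) function of the Jaakkola & Jordan local
# variational (quadratic) bound on the logistic sigmoid (Bishop 2006, sec. 10.6):
#
#     sigmoid(x) >= sigmoid(eps) * exp((x - eps) / 2 - lam(eps) * (x**2 - eps**2))
#
# with lam(eps) = (sigmoid(eps) - 0.5) / (2 * eps) = tanh(eps / 2) / (4 * eps).
# The bound is exact at x = +/- eps, which is why eps is re-estimated in every
# iteration of VBClassificationARD._fit below.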
class VBClassificationARD(LinearClassifierMixin, BaseEstimator):
    '''
    Variational Bayesian Logistic Regression with local variational approximation
    and ARD prior.

    Parameters
    ----------
    n_iter: int, optional (DEFAULT = 100)
        Maximum number of iterations

    tol: float, optional (DEFAULT = 1e-3)
        Convergence threshold; if the change in coefficients is less than the
        threshold the algorithm is terminated

    fit_intercept: bool, optional (DEFAULT = True)
        If True, uses a bias term in model fitting

    a: float, optional (DEFAULT = 1e-4)
        Shape parameter for Gamma prior on precision parameter of coefficients

    b: float, optional (DEFAULT = 1e-4)
        Rate parameter for Gamma prior on precision parameter of coefficients

    prune_thresh: float, optional (DEFAULT = 1e-4)
        Threshold for pruning out variables (applied after model is fitted)

    verbose: bool, optional (DEFAULT = True)
        Verbose mode

    Attributes
    ----------
    coef_ : array, shape = (n_classes, n_features)
        Coefficients of the regression model (mean of posterior distribution)

    sigma_ : list of arrays, one per class
        Estimated covariance matrix of the weights, computed only
        for non-zero coefficients

    intercept_ : list of length n_classes (one entry for binary problems)
        Intercepts

    active_ : list of boolean arrays, one per class
        True for non-zero coefficients, False otherwise

    References
    ----------
    [1] Bishop (2006), Pattern Recognition and Machine Learning (chapters 7, 10)
    [2] Murphy (2012), Machine Learning: A Probabilistic Perspective (chapters 14, 21)
    [3] Bishop & Tipping (2000), Variational Relevance Vector Machine
    '''

    def __init__(self, n_iter=100, tol=1e-3, fit_intercept=True,
                 a=1e-4, b=1e-4, prune_thresh=1e-4, verbose=True):
        self.n_iter = n_iter
        self.tol = tol
        self.verbose = verbose
        self.prune_thresh = prune_thresh
        self.fit_intercept = fit_intercept
        self.a = a
        self.b = b

    def fit(self, X, y):
        '''
        Fits variational Bayesian Logistic Regression with local variational bound

        Parameters
        ----------
        X: array-like of size (n_samples, n_features)
           Matrix of explanatory variables

        y: array-like of size (n_samples,)
           Vector of dependent variables

        Returns
        -------
        self: object
           self
        '''
        # preprocess data
        X, y = check_X_y(X, y, dtype=np.float64)
        check_classification_targets(y)
        self.classes_ = np.unique(y)
        n_classes = len(self.classes_)

        # take into account bias term if required
        n_samples, n_features = X.shape
        n_features = n_features + int(self.fit_intercept)
        if self.fit_intercept:
            X = np.hstack((np.ones([n_samples, 1]), X))

        # handle multiclass problems using One-vs-Rest
        if n_classes < 2:
            raise ValueError("Need samples of at least 2 classes")
        if n_classes > 2:
            self.coef_, self.sigma_ = [0] * n_classes, [0] * n_classes
            self.intercept_ = [0] * n_classes
            self.active_ = [0] * n_classes
        else:
            self.coef_, self.sigma_, self.intercept_ = [0], [0], [0]
            self.active_ = [0]

        # hyperparameters of Gamma distributed precision of weights
        a = self.a + 0.5 * np.ones(n_features)
        b = self.b * np.ones(n_features)

        for i in range(len(self.coef_)):
            if n_classes == 2:
                pos_class = self.classes_[1]
            else:
                pos_class = self.classes_[i]
            mask = (y == pos_class)
            y_bin = np.ones(y.shape)
            y_bin[~mask] = 0
            # pass a copy of b so each one-vs-rest subproblem starts from the prior
            coef_, sigma_, intercept_, active_ = self._fit(X, y_bin, a, np.copy(b))
            self.coef_[i] = coef_
            self.intercept_[i] = intercept_
            self.sigma_[i] = sigma_
            self.active_[i] = active_

        self.coef_ = np.asarray(self.coef_)
        return self

    def predict_proba(self, X):
        '''
        Predicts probabilities of targets for test set

        Parameters
        ----------
        X: array-like of size [n_samples_test, n_features]
           Matrix of explanatory variables (test set)

        Returns
        -------
        probs: numpy array of size [n_samples_test, n_classes]
           Estimated probabilities of target classes
        '''
        scores = self.decision_function(X)
        if self.fit_intercept:
            X = np.hstack((np.ones([X.shape[0], 1]), X))
        # moderated output: scale the scores by the posterior variance of the weights
        var = [np.sum(np.dot(X[:, act], s) * X[:, act], axis=1)
               for act, s in zip(self.active_, self.sigma_)]
        sigma = np.asarray(var)
        ks = 1. / (1. + np.pi * sigma / 8) ** 0.5
        probs = expit(scores.T * ks).T
        if probs.shape[1] == 1:
            probs = np.hstack([1 - probs, probs])
        else:
            probs /= np.reshape(np.sum(probs, axis=1), (probs.shape[0], 1))
        return probs
    def _fit(self, X, y, a, b):
        '''
        Fits a single binary classifier (used in the one-vs-rest loop of `fit`)
        '''
        eps = 1  # default starting parameter for the Jaakkola-Jordan bound
        w0 = np.zeros(X.shape[1])
        w = np.copy(w0)
        active = np.ones(X.shape[1], dtype=bool)
        XY = np.dot(X.T, y - 0.5)

        for i in range(self.n_iter):
            # In the E-step we update the approximation of the
            # posterior distribution q(w, alpha) = q(w) * q(alpha)

            # --------- update q(w) ------------------
            l = lam(eps)
            Xa = X[:, active]
            XYa = XY[active]  # equals np.dot(Xa.T, (y - 0.5))
            w[active], Ri = self._posterior_dist(Xa, l, a[active], b[active], XYa)

            # -------- update q(alpha) ---------------
            # np.sum(Ri ** 2, axis=0) is the diagonal of the posterior covariance Ri.T @ Ri
            b[active] = self.b + 0.5 * (w[active] ** 2 + np.sum(Ri ** 2, axis=0))

            # -------- update eps --------------------
            # In the M-step we update parameter eps which controls the
            # accuracy of the local variational approximation to the lower bound
            XMX = np.dot(Xa, w[active]) ** 2
            XSX = np.sum(np.dot(Xa, Ri.T) ** 2, axis=1)
            eps = np.sqrt(XMX + XSX)

            # determine relevant features as described in Bishop & Tipping
            # (i.e. using mean of posterior distribution)
            active = np.abs(w) > self.prune_thresh

            # do not prune the intercept & make sure there is at least one relevant
            # feature; if everything was pruned, keep the feature with the largest
            # posterior mean
            if self.fit_intercept:
                active[0] = True
                if np.sum(active[1:]) == 0:
                    active[np.argmax(np.abs(w[1:])) + 1] = True  # +1 skips the intercept column
            else:
                if np.sum(active) == 0:
                    active[np.argmax(np.abs(w))] = True
            # all irrelevant features are forced to zero
            w[~active] = 0
            # check convergence
            if np.sum(abs(w - w0) > self.tol) == 0 or i == self.n_iter - 1:
                break
            w0 = np.copy(w)
            # if only one relevant feature => terminate
            if np.sum(active) - 1 * self.fit_intercept == 1:
                if X.shape[1] > 3 and self.prune_thresh > 1e-1:
                    warnings.warn("Only one relevant feature found! It can be useful "
                                  "to decrease the value of parameter prune_thresh")
                break

        # final update with the full covariance of the active weights
        l = lam(eps)
        Xa = X[:, active]
        XYa = np.dot(Xa.T, (y - 0.5))
        w[active], sigma_ = self._posterior_dist(Xa, l, a[active], b[active], XYa, True)

        # separate intercept & coefficients
        intercept_ = 0
        if self.fit_intercept:
            intercept_ = w[0]
            coef_ = np.copy(w[1:])
        else:
            coef_ = w
        return coef_, sigma_, intercept_, active

    def _posterior_dist(self, X, l, a, b, XY, full_covar=False):
        '''
        Finds Gaussian approximation to the posterior of the coefficients
        '''
        # posterior precision under the Jaakkola-Jordan bound: 2 * X.T diag(l) X + diag(alpha)
        sigma_inv = 2 * np.dot(X.T * l, X)
        alpha_vec = a / b
        if self.fit_intercept:
            alpha_vec[0] = np.finfo(np.float64).eps  # essentially flat prior on the intercept
        np.fill_diagonal(sigma_inv, np.diag(sigma_inv) + alpha_vec)
        R = np.linalg.cholesky(sigma_inv)
        Z = solve_triangular(R, XY, lower=True)
        mean_ = solve_triangular(R.T, Z, lower=False)

        # use cholesky decomposition of the precision to find the covariance (or its factor)
        Ri = solve_triangular(R, np.eye(X.shape[1]), lower=True)
        if full_covar:
            sigma_ = np.dot(Ri.T, Ri)
            return mean_, sigma_
        else:
            return mean_, Ri
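
# -----------------------------------------------------------------------------
# Minimal usage sketch for VBClassificationARD (illustrative only, not part of
# the original module; the helper name and the synthetic data are made up).  It
# fits the model on a binary problem where only two features carry signal.
def _demo_vb_classification_ard(n_samples=300, n_features=10, seed=0):
    rng = np.random.RandomState(seed)
    X = rng.randn(n_samples, n_features)
    logits = 3.0 * X[:, 0] - 2.0 * X[:, 1]          # only two relevant features
    y = (logits + rng.logistic(size=n_samples) > 0).astype(int)
    clf = VBClassificationARD(n_iter=100, prune_thresh=1e-3).fit(X, y)
    probs = clf.predict_proba(X)                    # moderated class probabilities
    print("active columns (0 = intercept):", np.where(clf.active_[0])[0])
    print("train accuracy:", float(np.mean(clf.predict(X) == y)))
    print("first probabilities:\n", np.round(probs[:3], 3))
    return clf


if __name__ == "__main__":
    # run both demo sketches when the script is executed directly
    _demo_vb_regression_ard()
    _demo_vb_classification_ard()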