// Origin: OpenCV ML module, modules/ml/src/gbt.cpp (gradient boosted trees),
// from the Tetragramm/opencv fork. Web-viewer navigation text removed.
#include "precomp.hpp"
3
#include <time.h>
4
5
#if 0
6
7
#define pCvSeq CvSeq*
8
#define pCvDTreeNode CvDTreeNode*
9
10
//===========================================================================
11
//----------------------------- CvGBTreesParams -----------------------------
12
//===========================================================================
13
14
// Default gradient-boosting parameters: squared loss, 200 weak learners,
// an 80% subsample portion and a 0.01 learning rate, on top of the base
// decision-tree settings passed to CvDTreeParams.
CvGBTreesParams::CvGBTreesParams()
            : CvDTreeParams( 3, 10, 0, false, 10, 0, false, false, 0 )
{
    loss_function_type = CvGBTrees::SQUARED_LOSS;
    shrinkage = 0.01f;
    subsample_portion = 0.8f;
    weak_count = 200;
}
22
23
//===========================================================================
24
25
// Explicit-parameter constructor. Tree settings not listed here keep the
// CvDTreeParams base defaults used by the default constructor.
CvGBTreesParams::CvGBTreesParams( int _loss_function_type, int _weak_count,
                         float _shrinkage, float _subsample_portion,
                         int _max_depth, bool _use_surrogates )
            : CvDTreeParams( 3, 10, 0, false, 10, 0, false, false, 0 )
{
    weak_count = _weak_count;
    loss_function_type = _loss_function_type;
    subsample_portion = _subsample_portion;
    shrinkage = _shrinkage;
    use_surrogates = _use_surrogates;
    max_depth = _max_depth;
}
37
38
//===========================================================================
39
//------------------------------- CvGBTrees ---------------------------------
40
//===========================================================================
41
42
// Construct an empty (untrained) model: null every owned pointer, then let
// clear() establish the canonical reset state.
CvGBTrees::CvGBTrees()
{
    weak = 0;
    data = 0;
    default_model_name = "my_boost_tree";
    orig_response = 0;
    sum_response = 0;
    sum_response_tmp = 0;
    subsample_train = 0;
    subsample_test = 0;
    sample_idx = 0;
    missing = 0;
    class_labels = 0;
    class_count = 1;
    delta = 0.0f;

    clear();
}
56
57
//===========================================================================
58
59
int CvGBTrees::get_len(const CvMat* mat) const
60
{
61
return (mat->cols > mat->rows) ? mat->cols : mat->rows;
62
}
63
64
//===========================================================================
65
66
void CvGBTrees::clear()
67
{
68
if( weak )
69
{
70
CvSeqReader reader;
71
CvSlice slice = CV_WHOLE_SEQ;
72
CvDTree* tree;
73
74
//data->shared = false;
75
for (int i=0; i<class_count; ++i)
76
{
77
int weak_count = cvSliceLength( slice, weak[i] );
78
if ((weak[i]) && (weak_count))
79
{
80
cvStartReadSeq( weak[i], &reader );
81
cvSetSeqReaderPos( &reader, slice.start_index );
82
for (int j=0; j<weak_count; ++j)
83
{
84
CV_READ_SEQ_ELEM( tree, reader );
85
//tree->clear();
86
delete tree;
87
tree = 0;
88
}
89
}
90
}
91
for (int i=0; i<class_count; ++i)
92
if (weak[i]) cvReleaseMemStorage( &(weak[i]->storage) );
93
delete[] weak;
94
}
95
if (data)
96
{
97
data->shared = false;
98
delete data;
99
}
100
weak = 0;
101
data = 0;
102
delta = 0.0f;
103
cvReleaseMat( &orig_response );
104
cvReleaseMat( &sum_response );
105
cvReleaseMat( &sum_response_tmp );
106
cvReleaseMat( &subsample_train );
107
cvReleaseMat( &subsample_test );
108
cvReleaseMat( &sample_idx );
109
cvReleaseMat( &missing );
110
cvReleaseMat( &class_labels );
111
}
112
113
//===========================================================================
114
115
// Destructor: all cleanup is centralized in clear().
CvGBTrees::~CvGBTrees()
{
    clear();
}
119
120
//===========================================================================
121
122
// Convenience constructor: zero-initialize all state exactly as the default
// constructor does, then train on the supplied data immediately.
CvGBTrees::CvGBTrees( const CvMat* _train_data, int _tflag,
          const CvMat* _responses, const CvMat* _var_idx,
          const CvMat* _sample_idx, const CvMat* _var_type,
          const CvMat* _missing_mask, CvGBTreesParams _params )
{
    data = 0;
    weak = 0;
    default_model_name = "my_boost_tree";
    orig_response = 0;
    sum_response = 0;
    sum_response_tmp = 0;
    subsample_train = 0;
    subsample_test = 0;
    sample_idx = 0;
    missing = 0;
    class_labels = 0;
    class_count = 1;
    delta = 0.0f;

    train( _train_data, _tflag, _responses, _var_idx, _sample_idx,
           _var_type, _missing_mask, _params );
}
140
141
//===========================================================================
142
143
bool CvGBTrees::problem_type() const
144
{
145
switch (params.loss_function_type)
146
{
147
case DEVIANCE_LOSS: return false;
148
default: return true;
149
}
150
}
151
152
//===========================================================================
153
154
bool
155
CvGBTrees::train( CvMLData* _data, CvGBTreesParams _params, bool update )
156
{
157
bool result;
158
result = train ( _data->get_values(), CV_ROW_SAMPLE,
159
_data->get_responses(), _data->get_var_idx(),
160
_data->get_train_sample_idx(), _data->get_var_types(),
161
_data->get_missing(), _params, update);
162
//update is not supported
163
return result;
164
}
165
166
//===========================================================================
167
168
169
// Full training entry point. Runs params.weak_count boosting iterations;
// for classification (deviance loss) one regression tree per class is grown
// each iteration. Incremental update is not supported.
bool
CvGBTrees::train( const CvMat* _train_data, int _tflag,
              const CvMat* _responses, const CvMat* _var_idx,
              const CvMat* _sample_idx, const CvMat* _var_type,
              const CvMat* _missing_mask,
              CvGBTreesParams _params, bool /*_update*/ ) //update is not supported
{
    CvMemStorage* storage = 0;

    params = _params;
    bool is_regression = problem_type();

    clear();
    /*
      n - count of samples
      m - count of variables
    */
    int n = _train_data->rows;
    int m = _train_data->cols;
    if (_tflag != CV_ROW_SAMPLE)
    {
        // samples are stored as columns: swap the dimensions
        int tmp;
        CV_SWAP(n,m,tmp);
    }

    // data->responses will hold the per-iteration pseudo-residuals; the
    // true responses are copied into orig_response below.
    CvMat* new_responses = cvCreateMat( n, 1, CV_32F);
    cvZero(new_responses);

    data = new CvDTreeTrainData( _train_data, _tflag, new_responses, _var_idx,
        _sample_idx, _var_type, _missing_mask, _params, true, true );
    if (_missing_mask)
    {
        missing = cvCreateMat(_missing_mask->rows, _missing_mask->cols,
                              _missing_mask->type);
        cvCopy( _missing_mask, missing);
    }

    // Copy the responses into a dense float row vector.
    orig_response = cvCreateMat( 1, n, CV_32F );
    int step = (_responses->cols > _responses->rows) ? 1 : _responses->step / CV_ELEM_SIZE(_responses->type);
    switch (CV_MAT_TYPE(_responses->type))
    {
        case CV_32FC1:
        {
            for (int i=0; i<n; ++i)
                orig_response->data.fl[i] = _responses->data.fl[i*step];
        }; break;
        case CV_32SC1:
        {
            for (int i=0; i<n; ++i)
                orig_response->data.fl[i] = (float) _responses->data.i[i*step];
        }; break;
        default:
            CV_Error(CV_StsUnmatchedFormats, "Response should be a 32fC1 or 32sC1 vector.");
    }

    // Classification: count the distinct labels and record them once each,
    // in order of first appearance, into class_labels.
    if (!is_regression)
    {
        class_count = 0;
        unsigned char * mask = new unsigned char[n];
        memset(mask, 0, n);
        // compute the count of different output classes
        for (int i=0; i<n; ++i)
            if (!mask[i])
            {
                class_count++;
                for (int j=i; j<n; ++j)
                    if (int(orig_response->data.fl[j]) == int(orig_response->data.fl[i]))
                        mask[j] = 1;
            }
        delete[] mask;

        class_labels = cvCreateMat(1, class_count, CV_32S);
        class_labels->data.i[0] = int(orig_response->data.fl[0]);
        int j = 1;
        for (int i=1; i<n; ++i)
        {
            int k = 0;
            while ((k<j) && (int(orig_response->data.fl[i]) - class_labels->data.i[k]))
                k++;
            if (k == j)
            {
                class_labels->data.i[k] = int(orig_response->data.fl[i]);
                j++;
            }
        }
    }

    // inside gbt learning process only regression decision trees are built
    data->is_classifier = false;

    // preproccessing sample indices
    if (_sample_idx)
    {
        int sample_idx_len = get_len(_sample_idx);

        switch (CV_MAT_TYPE(_sample_idx->type))
        {
            // explicit index list: copy it and sort ascending
            case CV_32SC1:
            {
                sample_idx = cvCreateMat( 1, sample_idx_len, CV_32S );
                for (int i=0; i<sample_idx_len; ++i)
                    sample_idx->data.i[i] = _sample_idx->data.i[i];
                std::sort(sample_idx->data.i, sample_idx->data.i + sample_idx_len);
            } break;
            // byte mask: convert to an index list of the non-zero entries
            case CV_8S:
            case CV_8U:
            {
                int active_samples_count = 0;
                for (int i=0; i<sample_idx_len; ++i)
                    active_samples_count += int( _sample_idx->data.ptr[i] );
                sample_idx = cvCreateMat( 1, active_samples_count, CV_32S );
                active_samples_count = 0;
                for (int i=0; i<sample_idx_len; ++i)
                    if (int( _sample_idx->data.ptr[i] ))
                        sample_idx->data.i[active_samples_count++] = i;

            } break;
            default: CV_Error(CV_StsUnmatchedFormats, "_sample_idx should be a 32sC1, 8sC1 or 8uC1 vector.");
        }
    }
    else
    {
        // no subset given: use every sample
        sample_idx = cvCreateMat( 1, n, CV_32S );
        for (int i=0; i<n; ++i)
            sample_idx->data.i[i] = i;
    }

    // one row of accumulated predictions per class (double-buffered with
    // sum_response_tmp, swapped after every boosting iteration)
    sum_response = cvCreateMat(class_count, n, CV_32F);
    sum_response_tmp = cvCreateMat(class_count, n, CV_32F);
    cvZero(sum_response);

    delta = 0.0f;
    /*
      in the case of a regression problem the initial guess (the zero term
      in the sum) is set to the mean of all the training responses, that is
      the best constant model
    */
    if (is_regression) base_value = find_optimal_value(sample_idx);
    /*
      in the case of a classification problem the initial guess (the zero term
      in the sum) is set to zero for all the trees sequences
    */
    else base_value = 0.0f;
    /*
      current predicition on all training samples is set to be
      equal to the base_value
    */
    cvSet( sum_response, cvScalar(base_value) );

    // one tree sequence per class, each in its own memory storage
    weak = new pCvSeq[class_count];
    for (int i=0; i<class_count; ++i)
    {
        storage = cvCreateMemStorage();
        weak[i] = cvCreateSeq( 0, sizeof(CvSeq), sizeof(CvDTree*), storage );
        storage = 0;
    }

    // subsample params and data
    rng = &cv::theRNG();

    int samples_count = get_len(sample_idx);

    // degenerate portions (~0 or ~1) disable subsampling entirely
    params.subsample_portion = params.subsample_portion <= FLT_EPSILON ||
        1 - params.subsample_portion <= FLT_EPSILON
        ? 1 : params.subsample_portion;
    int train_sample_count = cvFloor(params.subsample_portion * samples_count);
    if (train_sample_count == 0)
        train_sample_count = samples_count;
    int test_sample_count = samples_count - train_sample_count;
    // subsample_train and subsample_test are two views into one shared
    // index buffer (idx_data), reshuffled each iteration by do_subsample()
    int* idx_data = new int[samples_count];
    subsample_train = cvCreateMatHeader( 1, train_sample_count, CV_32SC1 );
    *subsample_train = cvMat( 1, train_sample_count, CV_32SC1, idx_data );
    if (test_sample_count)
    {
        subsample_test  = cvCreateMatHeader( 1, test_sample_count, CV_32SC1 );
        *subsample_test = cvMat( 1, test_sample_count, CV_32SC1,
                                 idx_data + train_sample_count );
    }

    // training procedure

    for ( int i=0; i < params.weak_count; ++i )
    {
        do_subsample();
        for ( int k=0; k < class_count; ++k )
        {
            // fit a regression tree to the pseudo-residuals of class k
            find_gradient(k);
            CvDTree* tree = new CvDTree;
            tree->train( data, subsample_train );
            // change_values() re-optimizes the leaves and updates
            // sum_response_tmp for the training part of the subsample
            change_values(tree, k);

            // propagate the new tree's shrunken output to held-out samples
            if (subsample_test)
            {
                CvMat x;
                CvMat x_miss;
                int* sample_data = sample_idx->data.i;
                int* subsample_data = subsample_test->data.i;
                int s_step = (sample_idx->cols > sample_idx->rows) ? 1
                             : sample_idx->step/CV_ELEM_SIZE(sample_idx->type);
                for (int j=0; j<get_len(subsample_test); ++j)
                {
                    int idx = *(sample_data + subsample_data[j]*s_step);
                    float res = 0.0f;
                    if (_tflag == CV_ROW_SAMPLE)
                        cvGetRow( data->train_data, &x, idx);
                    else
                        cvGetCol( data->train_data, &x, idx);

                    if (missing)
                    {
                        if (_tflag == CV_ROW_SAMPLE)
                            cvGetRow( missing, &x_miss, idx);
                        else
                            cvGetCol( missing, &x_miss, idx);

                        res = (float)tree->predict(&x, &x_miss)->value;
                    }
                    else
                    {
                        res = (float)tree->predict(&x)->value;
                    }
                    sum_response_tmp->data.fl[idx + k*n] =
                                  sum_response->data.fl[idx + k*n] +
                                  params.shrinkage * res;
                }
            }

            cvSeqPush( weak[k], &tree );
            tree = 0;
        } // k=0..class_count
        // swap the double-buffered prediction matrices
        CvMat* tmp;
        tmp = sum_response_tmp;
        sum_response_tmp = sum_response;
        sum_response = tmp;
        tmp = 0;
    } // i=0..params.weak_count

    delete[] idx_data;
    cvReleaseMat(&new_responses);
    data->free_train_data();

    return true;

} // CvGBTrees::train(...)
413
414
//===========================================================================
415
416
// Mathematical sign of x: -1, 0 or +1, returned as a float.
inline float Sign(float x)
{
    return (x > 0.0f) ? 1.0f : ((x < 0.0f) ? -1.0f : 0.0f);
}
422
423
//===========================================================================
424
425
void CvGBTrees::find_gradient(const int k)
426
{
427
int* sample_data = sample_idx->data.i;
428
int* subsample_data = subsample_train->data.i;
429
float* grad_data = data->responses->data.fl;
430
float* resp_data = orig_response->data.fl;
431
float* current_data = sum_response->data.fl;
432
433
switch (params.loss_function_type)
434
// loss_function_type in
435
// {SQUARED_LOSS, ABSOLUTE_LOSS, HUBER_LOSS, DEVIANCE_LOSS}
436
{
437
case SQUARED_LOSS:
438
{
439
for (int i=0; i<get_len(subsample_train); ++i)
440
{
441
int s_step = (sample_idx->cols > sample_idx->rows) ? 1
442
: sample_idx->step/CV_ELEM_SIZE(sample_idx->type);
443
int idx = *(sample_data + subsample_data[i]*s_step);
444
grad_data[idx] = resp_data[idx] - current_data[idx];
445
}
446
}; break;
447
448
case ABSOLUTE_LOSS:
449
{
450
for (int i=0; i<get_len(subsample_train); ++i)
451
{
452
int s_step = (sample_idx->cols > sample_idx->rows) ? 1
453
: sample_idx->step/CV_ELEM_SIZE(sample_idx->type);
454
int idx = *(sample_data + subsample_data[i]*s_step);
455
grad_data[idx] = Sign(resp_data[idx] - current_data[idx]);
456
}
457
}; break;
458
459
case HUBER_LOSS:
460
{
461
float alpha = 0.2f;
462
int n = get_len(subsample_train);
463
int s_step = (sample_idx->cols > sample_idx->rows) ? 1
464
: sample_idx->step/CV_ELEM_SIZE(sample_idx->type);
465
466
float* residuals = new float[n];
467
for (int i=0; i<n; ++i)
468
{
469
int idx = *(sample_data + subsample_data[i]*s_step);
470
residuals[i] = fabs(resp_data[idx] - current_data[idx]);
471
}
472
std::sort(residuals, residuals + n);
473
474
delta = residuals[int(ceil(n*alpha))];
475
476
for (int i=0; i<n; ++i)
477
{
478
int idx = *(sample_data + subsample_data[i]*s_step);
479
float r = resp_data[idx] - current_data[idx];
480
grad_data[idx] = (fabs(r) > delta) ? delta*Sign(r) : r;
481
}
482
delete[] residuals;
483
484
}; break;
485
486
case DEVIANCE_LOSS:
487
{
488
for (int i=0; i<get_len(subsample_train); ++i)
489
{
490
double exp_fk = 0;
491
double exp_sfi = 0;
492
int s_step = (sample_idx->cols > sample_idx->rows) ? 1
493
: sample_idx->step/CV_ELEM_SIZE(sample_idx->type);
494
int idx = *(sample_data + subsample_data[i]*s_step);
495
496
for (int j=0; j<class_count; ++j)
497
{
498
double res;
499
res = current_data[idx + j*sum_response->cols];
500
res = exp(res);
501
if (j == k) exp_fk = res;
502
exp_sfi += res;
503
}
504
int orig_label = int(resp_data[idx]);
505
/*
506
grad_data[idx] = (float)(!(k-class_labels->data.i[orig_label]+1)) -
507
(float)(exp_fk / exp_sfi);
508
*/
509
int ensemble_label = 0;
510
while (class_labels->data.i[ensemble_label] - orig_label)
511
ensemble_label++;
512
513
grad_data[idx] = (float)(!(k-ensemble_label)) -
514
(float)(exp_fk / exp_sfi);
515
}
516
}; break;
517
518
default: break;
519
}
520
521
} // CvGBTrees::find_gradient(...)
522
523
//===========================================================================
524
525
// After a tree has been fitted to the pseudo-residuals, replace each leaf's
// value with the loss-optimal constant for the samples that fell into it,
// and accumulate the shrunken leaf contributions into sum_response_tmp.
// _k is the class index (row of sum_response affected).
void CvGBTrees::change_values(CvDTree* tree, const int _k)
{
    // predictions[i] = leaf node hit by sample i of the training subsample
    CvDTreeNode** predictions = new pCvDTreeNode[get_len(subsample_train)];

    int* sample_data = sample_idx->data.i;
    int* subsample_data = subsample_train->data.i;
    // element stride of sample_idx (1 for a row vector)
    int s_step = (sample_idx->cols > sample_idx->rows) ? 1
                 : sample_idx->step/CV_ELEM_SIZE(sample_idx->type);

    CvMat x;
    CvMat miss_x;

    // Route every subsample member through the tree to find its leaf.
    for (int i=0; i<get_len(subsample_train); ++i)
    {
        int idx = *(sample_data + subsample_data[i]*s_step);
        if (data->tflag == CV_ROW_SAMPLE)
            cvGetRow( data->train_data, &x, idx);
        else
            cvGetCol( data->train_data, &x, idx);

        if (missing)
        {
            if (data->tflag == CV_ROW_SAMPLE)
                cvGetRow( missing, &miss_x, idx);
            else
                cvGetCol( missing, &miss_x, idx);

            predictions[i] = tree->predict(&x, &miss_x);
        }
        else
            predictions[i] = tree->predict(&x);
    }


    CvDTreeNode** leaves;
    int leaves_count = 0;
    leaves = GetLeaves( tree, leaves_count);

    for (int i=0; i<leaves_count; ++i)
    {
        // Count the subsample members that landed in this leaf.
        int samples_in_leaf = 0;
        for (int j=0; j<get_len(subsample_train); ++j)
        {
            if (leaves[i] == predictions[j]) samples_in_leaf++;
        }

        if (!samples_in_leaf) // It should not be done anyways! but...
        {
            leaves[i]->value = 0.0;
            continue;
        }

        // Gather the original-sample indices belonging to this leaf.
        CvMat* leaf_idx = cvCreateMat(1, samples_in_leaf, CV_32S);
        int* leaf_idx_data = leaf_idx->data.i;

        for (int j=0; j<get_len(subsample_train); ++j)
        {
            int idx = *(sample_data + subsample_data[j]*s_step);
            if (leaves[i] == predictions[j])
                *leaf_idx_data++ = idx;
        }

        // Loss-optimal constant for this leaf (depends on the loss type).
        float value = find_optimal_value(leaf_idx);
        leaves[i]->value = value;

        leaf_idx_data = leaf_idx->data.i;

        // Accumulate the shrunken update into the scratch response matrix.
        int len = sum_response_tmp->cols;
        for (int j=0; j<get_len(leaf_idx); ++j)
        {
            int idx = leaf_idx_data[j];
            sum_response_tmp->data.fl[idx + _k*len] =
                             sum_response->data.fl[idx + _k*len] +
                             params.shrinkage * value;
        }
        leaf_idx_data = 0;
        cvReleaseMat(&leaf_idx);
    }

    // releasing the memory
    for (int i=0; i<get_len(subsample_train); ++i)
    {
        predictions[i] = 0;
    }
    delete[] predictions;

    for (int i=0; i<leaves_count; ++i)
    {
        leaves[i] = 0;
    }
    delete[] leaves;

}
618
619
//===========================================================================
620
/*
621
void CvGBTrees::change_values(CvDTree* tree, const int _k)
622
{
623
624
CvDTreeNode** leaves;
625
int leaves_count = 0;
626
int offset = _k*sum_response_tmp->cols;
627
CvMat leaf_idx;
628
leaf_idx.rows = 1;
629
630
leaves = GetLeaves( tree, leaves_count);
631
632
for (int i=0; i<leaves_count; ++i)
633
{
634
int n = leaves[i]->sample_count;
635
int* leaf_idx_data = new int[n];
636
data->get_sample_indices(leaves[i], leaf_idx_data);
637
//CvMat* leaf_idx = new CvMat();
638
//cvInitMatHeader(leaf_idx, n, 1, CV_32S, leaf_idx_data);
639
leaf_idx.cols = n;
640
leaf_idx.data.i = leaf_idx_data;
641
642
float value = find_optimal_value(&leaf_idx);
643
leaves[i]->value = value;
644
float val = params.shrinkage * value;
645
646
647
for (int j=0; j<n; ++j)
648
{
649
int idx = leaf_idx_data[j] + offset;
650
sum_response_tmp->data.fl[idx] = sum_response->data.fl[idx] + val;
651
}
652
//leaf_idx_data = 0;
653
//cvReleaseMat(&leaf_idx);
654
leaf_idx.data.i = 0;
655
//delete leaf_idx;
656
delete[] leaf_idx_data;
657
}
658
659
// releasing the memory
660
for (int i=0; i<leaves_count; ++i)
661
{
662
leaves[i] = 0;
663
}
664
delete[] leaves;
665
666
} //change_values(...);
667
*/
668
//===========================================================================
669
670
// Returns the constant gamma minimizing the configured loss over the samples
// whose indices are listed in _Idx (indices into orig_response/sum_response):
//   squared loss  -> mean residual,
//   absolute loss -> median residual,
//   Huber loss    -> one-step robust estimate around the median,
//   deviance loss -> Newton step (K-1)/K * sum(g) / sum(|g|(1-|g|)).
float CvGBTrees::find_optimal_value( const CvMat* _Idx )
{

    double gamma = (double)0.0;

    int* idx = _Idx->data.i;
    float* resp_data = orig_response->data.fl;
    float* cur_data = sum_response->data.fl;
    int n = get_len(_Idx);

    switch (params.loss_function_type)
    // SQUARED_LOSS=0, ABSOLUTE_LOSS=1, HUBER_LOSS=3, DEVIANCE_LOSS=4
    {
    case SQUARED_LOSS:
        {
            // mean of the residuals
            for (int i=0; i<n; ++i)
                gamma += resp_data[idx[i]] - cur_data[idx[i]];
            gamma /= (double)n;
        }; break;

    case ABSOLUTE_LOSS:
        {
            // median of the residuals
            float* residuals = new float[n];
            for (int i=0; i<n; ++i, ++idx)
                residuals[i] = (resp_data[*idx] - cur_data[*idx]);
            std::sort(residuals, residuals + n);
            if (n % 2)
                gamma = residuals[n/2];
            else gamma = (residuals[n/2-1] + residuals[n/2]) / 2.0f;
            delete[] residuals;
        }; break;

    case HUBER_LOSS:
        {
            float* residuals = new float[n];
            for (int i=0; i<n; ++i, ++idx)
                residuals[i] = (resp_data[*idx] - cur_data[*idx]);
            std::sort(residuals, residuals + n);

            // median, then a delta-clipped mean correction (delta was set
            // by find_gradient as the alpha-quantile of |residuals|)
            int n_half = n >> 1;
            float r_median = (n == n_half<<1) ?
                        (residuals[n_half-1] + residuals[n_half]) / 2.0f :
                        residuals[n_half];

            for (int i=0; i<n; ++i)
            {
                float dif = residuals[i] - r_median;
                gamma += (delta < fabs(dif)) ? Sign(dif)*delta : dif;
            }
            gamma /= (double)n;
            gamma += r_median;
            delete[] residuals;

        }; break;

    case DEVIANCE_LOSS:
        {
            // data->responses currently holds the pseudo-gradient values
            // written by find_gradient().
            float* grad_data = data->responses->data.fl;
            double tmp1 = 0;
            double tmp2 = 0;
            double tmp  = 0;
            for (int i=0; i<n; ++i)
            {
                tmp = grad_data[idx[i]];
                tmp1 += tmp;
                tmp2 += fabs(tmp)*(1-fabs(tmp));
            };
            // guard against a zero denominator
            if (tmp2 == 0)
            {
                tmp2 = 1;
            }

            gamma = ((double)(class_count-1)) / (double)class_count * (tmp1/tmp2);
        }; break;

    default: break;
    }

    return float(gamma);

} // CvGBTrees::find_optimal_value
751
752
//===========================================================================
753
754
755
// Depth-first collection of the terminal nodes of a subtree into 'leaves',
// advancing 'count' past each appended node (left subtree first).
void CvGBTrees::leaves_get( CvDTreeNode** leaves, int& count, CvDTreeNode* node )
{
    const bool is_terminal = (node->left == NULL) && (node->right == NULL);
    if (is_terminal)
    {
        leaves[count++] = node;
        return;
    }
    if (node->left != NULL)
        leaves_get(leaves, count, node->left);
    if (node->right != NULL)
        leaves_get(leaves, count, node->right);
}
762
763
//---------------------------------------------------------------------------
764
765
// Collects the leaf nodes of a tree; 'len' receives the leaf count. The
// buffer is sized for the deepest allowed tree (2^max_depth leaves) and is
// owned by the caller (delete[]).
CvDTreeNode** CvGBTrees::GetLeaves( const CvDTree* dtree, int& len )
{
    const size_t max_leaves = (size_t)1 << params.max_depth;
    CvDTreeNode** leaves = new pCvDTreeNode[max_leaves];
    len = 0;
    leaves_get(leaves, len, const_cast<pCvDTreeNode>(dtree->get_root()));
    return leaves;
}
772
773
//===========================================================================
774
775
void CvGBTrees::do_subsample()
776
{
777
778
int n = get_len(sample_idx);
779
int* idx = subsample_train->data.i;
780
781
for (int i = 0; i < n; i++ )
782
idx[i] = i;
783
784
if (subsample_test)
785
for (int i = 0; i < n; i++)
786
{
787
int a = (*rng)(n);
788
int b = (*rng)(n);
789
int t;
790
CV_SWAP( idx[a], idx[b], t );
791
}
792
793
/*
794
int n = get_len(sample_idx);
795
if (subsample_train == 0)
796
subsample_train = cvCreateMat(1, n, CV_32S);
797
int* subsample_data = subsample_train->data.i;
798
for (int i=0; i<n; ++i)
799
subsample_data[i] = i;
800
subsample_test = 0;
801
*/
802
}
803
804
//===========================================================================
805
806
// Single-threaded prediction on one sample.
// 'slice' selects which trees of each per-class ensemble participate; 'k'
// selects a single class score (0 <= k < class_count) or, for k == -1, the
// arg-max class label. If weak_responses is given (32F; class_count x
// weak_count rows, or 1 x weak_count when k is fixed) the individual tree
// responses are stored into it. Returns 0 on an untrained model or on a
// badly-shaped weak_responses matrix.
float CvGBTrees::predict_serial( const CvMat* _sample, const CvMat* _missing,
        CvMat* weak_responses, CvSlice slice, int k) const
{
    float result = 0.0f;

    if (!weak) return 0.0f;   // model not trained

    CvSeqReader reader;
    int weak_count = cvSliceLength( slice, weak[class_count-1] );
    CvDTree* tree;

    // Validate the shape/type of the optional per-tree output matrix.
    if (weak_responses)
    {
        if (CV_MAT_TYPE(weak_responses->type) != CV_32F)
            return 0.0f;
        if ((k >= 0) && (k<class_count) && (weak_responses->rows != 1))
            return 0.0f;
        if ((k == -1) && (weak_responses->rows != class_count))
            return 0.0f;
        if (weak_responses->cols != weak_count)
            return 0.0f;
    }

    // Accumulate the shrunken tree outputs, one sum per class.
    float* sum = new float[class_count];
    memset(sum, 0, class_count*sizeof(float));

    for (int i=0; i<class_count; ++i)
    {
        if ((weak[i]) && (weak_count))
        {
            cvStartReadSeq( weak[i], &reader );
            cvSetSeqReaderPos( &reader, slice.start_index );
            for (int j=0; j<weak_count; ++j)
            {
                CV_READ_SEQ_ELEM( tree, reader );
                float p = (float)(tree->predict(_sample, _missing)->value);
                sum[i] += params.shrinkage * p;
                if (weak_responses)
                    weak_responses->data.fl[i*weak_count+j] = p;
            }
        }
    }

    // Add the constant zero-term of the boosted sum.
    for (int i=0; i<class_count; ++i)
        sum[i] += base_value;

    if (class_count == 1)   // regression: the single accumulated value
    {
        result = sum[0];
        delete[] sum;
        return result;
    }

    if ((k>=0) && (k<class_count))   // score of the one requested class
    {
        result = sum[k];
        delete[] sum;
        return result;
    }

    // Multi-class: take the class with the maximal score ...
    float max = sum[0];
    int class_label = 0;
    for (int i=1; i<class_count; ++i)
        if (sum[i] > max)
        {
            max = sum[i];
            class_label = i;
        }

    delete[] sum;

    /*
    int orig_class_label = -1;
    for (int i=0; i<get_len(class_labels); ++i)
        if (class_labels->data.i[i] == class_label+1)
            orig_class_label = i;
    */
    // ... and map the ensemble index back to the original label value.
    int orig_class_label = class_labels->data.i[class_label];

    return float(orig_class_label);
}
887
888
889
// Parallel helper for CvGBTrees::predict: each invocation sums the shrunken
// responses of a contiguous range of trees (tree indices within every
// per-class sequence) into the shared per-class accumulator 'sum'.
class Tree_predictor : public cv::ParallelLoopBody
{
private:
    pCvSeq* weak;           // per-class tree sequences (not owned)
    float* sum;             // shared per-class accumulator (not owned)
    const int k;            // number of classes to process
    const CvMat* sample;    // sample to predict (not owned)
    const CvMat* missing;   // optional missing-value mask (not owned)
    const float shrinkage;  // per-tree learning-rate multiplier

    // Serializes concurrent updates of 'sum' across worker threads.
    static cv::Mutex SumMutex;


public:
    Tree_predictor() : weak(0), sum(0), k(0), sample(0), missing(0), shrinkage(1.0f) {}
    Tree_predictor(pCvSeq* _weak, const int _k, const float _shrinkage,
                   const CvMat* _sample, const CvMat* _missing, float* _sum ) :
        weak(_weak), sum(_sum), k(_k), sample(_sample),
        missing(_missing), shrinkage(_shrinkage)
    {}

    // Split constructor: workers share the same pointers/accumulator.
    Tree_predictor( const Tree_predictor& p, cv::Split ) :
        weak(p.weak), sum(p.sum), k(p.k), sample(p.sample),
        missing(p.missing), shrinkage(p.shrinkage)
    {}

    // Assignment intentionally does nothing (members are const/shared).
    Tree_predictor& operator=( const Tree_predictor& )
    { return *this; }

    // Accumulate shrunken responses of trees [range.start, range.end) for
    // every class into sum[0..k).
    virtual void operator()(const cv::Range& range) const
    {
        CvSeqReader reader;
        int begin = range.start;
        int end = range.end;

        int weak_count = end - begin;
        CvDTree* tree;

        for (int i=0; i<k; ++i)
        {
            // accumulate locally first, publish under the lock once
            float tmp_sum = 0.0f;
            if ((weak[i]) && (weak_count))
            {
                cvStartReadSeq( weak[i], &reader );
                cvSetSeqReaderPos( &reader, begin );
                for (int j=0; j<weak_count; ++j)
                {
                    CV_READ_SEQ_ELEM( tree, reader );
                    tmp_sum += shrinkage*(float)(tree->predict(sample, missing)->value);
                }
            }

            {
                cv::AutoLock lock(SumMutex);
                sum[i] += tmp_sum;
            }
        }
    } // Tree_predictor::operator()

    virtual ~Tree_predictor() {}

}; // class Tree_predictor

cv::Mutex Tree_predictor::SumMutex;
953
954
955
// Parallel prediction on one sample: the trees of the selected slice are
// distributed over threads via Tree_predictor, then the per-class sums are
// resolved exactly as in predict_serial. The weak_responses output is not
// supported by this parallel variant (the argument is ignored).
float CvGBTrees::predict( const CvMat* _sample, const CvMat* _missing,
            CvMat* /*weak_responses*/, CvSlice slice, int k) const
{
    float result = 0.0f;
    if (!weak) return 0.0f;   // model not trained
    float* sum = new float[class_count];
    for (int i=0; i<class_count; ++i)
        sum[i] = 0.0f;
    // tree-index range selected by the slice
    int begin = slice.start_index;
    int end = begin + cvSliceLength( slice, weak[0] );

    pCvSeq* weak_seq = weak;
    Tree_predictor predictor = Tree_predictor(weak_seq, class_count,
                                    params.shrinkage, _sample, _missing, sum);

    cv::parallel_for_(cv::Range(begin, end), predictor);

    // add the constant zero-term (shrinkage already applied per tree)
    for (int i=0; i<class_count; ++i)
        sum[i] = sum[i] /** params.shrinkage*/ + base_value;

    if (class_count == 1)   // regression: the single accumulated value
    {
        result = sum[0];
        delete[] sum;
        return result;
    }

    if ((k>=0) && (k<class_count))   // score of the one requested class
    {
        result = sum[k];
        delete[] sum;
        return result;
    }

    // arg-max over class scores ...
    float max = sum[0];
    int class_label = 0;
    for (int i=1; i<class_count; ++i)
        if (sum[i] > max)
        {
            max = sum[i];
            class_label = i;
        }

    delete[] sum;
    // ... mapped back to the original label value
    int orig_class_label = class_labels->data.i[class_label];

    return float(orig_class_label);
}
1003
1004
1005
//===========================================================================
1006
1007
// Serialize the boosting parameters (and the class-label mapping, if any);
// the tree-construction parameters are written by the shared train data.
void CvGBTrees::write_params( CvFileStorage* fs ) const
{
    const char* loss_function_type_str =
        params.loss_function_type == SQUARED_LOSS ? "SquaredLoss" :
        params.loss_function_type == ABSOLUTE_LOSS ? "AbsoluteLoss" :
        params.loss_function_type == HUBER_LOSS ? "HuberLoss" :
        params.loss_function_type == DEVIANCE_LOSS ? "DevianceLoss" : 0;


    if( loss_function_type_str )
        cvWriteString( fs, "loss_function", loss_function_type_str );
    else
        // unknown code: fall back to storing the raw integer
        cvWriteInt( fs, "loss_function", params.loss_function_type );

    cvWriteInt( fs, "ensemble_length", params.weak_count );
    cvWriteReal( fs, "shrinkage", params.shrinkage );
    cvWriteReal( fs, "subsample_portion", params.subsample_portion );
    //cvWriteInt( fs, "max_tree_depth", params.max_depth );
    //cvWriteString( fs, "use_surrogate_splits", params.use_surrogates ? "true" : "false");
    if (class_labels) cvWrite( fs, "class_labels", class_labels);

    // Temporarily restore the classifier flag so the tree-data parameters
    // are written with the correct problem type, then reset it (training
    // internally always uses regression trees).
    data->is_classifier = !problem_type();
    data->write_params( fs );
    data->is_classifier = 0;
}
1032
1033
1034
//===========================================================================
1035
1036
// Deserialize the parameters written by write_params(); also creates the
// shared CvDTreeTrainData and restores the class-label matrix (if the
// stored model is a classifier).
void CvGBTrees::read_params( CvFileStorage* fs, CvFileNode* fnode )
{
    CV_FUNCNAME( "CvGBTrees::read_params" );
    __BEGIN__;


    CvFileNode* temp;

    if( !fnode || !CV_NODE_IS_MAP(fnode->tag) )
        return;

    data = new CvDTreeTrainData();
    CV_CALL( data->read_params(fs, fnode));
    data->shared = true;   // trees read later will share this train data

    // Mirror the tree-construction settings into our own params.
    params.max_depth = data->params.max_depth;
    params.min_sample_count = data->params.min_sample_count;
    params.max_categories = data->params.max_categories;
    params.priors = data->params.priors;
    params.regression_accuracy = data->params.regression_accuracy;
    params.use_surrogates = data->params.use_surrogates;

    temp = cvGetFileNodeByName( fs, fnode, "loss_function" );
    if( !temp )
        EXIT;

    // The loss may be stored as a symbolic name or as a raw integer code.
    if( temp && CV_NODE_IS_STRING(temp->tag) )
    {
        const char* loss_function_type_str = cvReadString( temp, "" );
        params.loss_function_type = strcmp( loss_function_type_str, "SquaredLoss" ) == 0 ? SQUARED_LOSS :
                            strcmp( loss_function_type_str, "AbsoluteLoss" ) == 0 ? ABSOLUTE_LOSS :
                            strcmp( loss_function_type_str, "HuberLoss" ) == 0 ? HUBER_LOSS :
                            strcmp( loss_function_type_str, "DevianceLoss" ) == 0 ? DEVIANCE_LOSS : -1;
    }
    else
        params.loss_function_type = cvReadInt( temp, -1 );


    // Valid codes: SQUARED_LOSS=0, ABSOLUTE_LOSS=1, HUBER_LOSS=3,
    // DEVIANCE_LOSS=4 -- the value 2 is unused.
    if( params.loss_function_type < SQUARED_LOSS || params.loss_function_type > DEVIANCE_LOSS || params.loss_function_type == 2)
        CV_ERROR( CV_StsBadArg, "Unknown loss function" );

    params.weak_count = cvReadIntByName( fs, fnode, "ensemble_length" );
    params.shrinkage = (float)cvReadRealByName( fs, fnode, "shrinkage", 0.1 );
    params.subsample_portion = (float)cvReadRealByName( fs, fnode, "subsample_portion", 1.0 );

    if (data->is_classifier)
    {
        class_labels = (CvMat*)cvReadByName( fs, fnode, "class_labels" );
        if( class_labels && !CV_IS_MAT(class_labels))
            CV_ERROR( CV_StsParseError, "class_labels must stored as a matrix");
    }
    // training internally always uses regression trees
    data->is_classifier = 0;

    __END__;
}
1091
1092
1093
1094
1095
// Serialize the whole model under 'name': parameters, base value, class
// count, and every tree of every per-class ensemble as nested
// "trees_<j>" sequences.
void CvGBTrees::write( CvFileStorage* fs, const char* name ) const
{
    CV_FUNCNAME( "CvGBTrees::write" );

    __BEGIN__;

    CvSeqReader reader;
    int i;
    cv::String s;

    cvStartWriteStruct( fs, name, CV_NODE_MAP, CV_TYPE_NAME_ML_GBT );

    if( !weak )
        CV_ERROR( CV_StsBadArg, "The model has not been trained yet" );

    write_params( fs );
    cvWriteReal( fs, "base_value", base_value);
    cvWriteInt( fs, "class_count", class_count);

    for ( int j=0; j < class_count; ++j )
    {
        s = cv::format("trees_%d", j);
        cvStartWriteStruct( fs, s.c_str(), CV_NODE_SEQ );

        cvStartReadSeq( weak[j], &reader );

        // write each tree of class j's ensemble as its own map node
        for( i = 0; i < weak[j]->total; i++ )
        {
            CvDTree* tree;
            CV_READ_SEQ_ELEM( tree, reader );
            cvStartWriteStruct( fs, 0, CV_NODE_MAP );
            tree->write( fs );
            cvEndWriteStruct( fs );
        }

        cvEndWriteStruct( fs );
    }

    cvEndWriteStruct( fs );

    __END__;
}
1137
1138
1139
//===========================================================================
1140
1141
1142
// Deserialize a model stored by write(): reads the parameters, then
// reconstructs each per-class tree sequence from its "trees_<j>" node.
void CvGBTrees::read( CvFileStorage* fs, CvFileNode* node )
{

    CV_FUNCNAME( "CvGBTrees::read" );

    __BEGIN__;

    CvSeqReader reader;
    CvFileNode* trees_fnode;
    CvMemStorage* storage;
    int i, ntrees;
    cv::String s;

    clear();
    read_params( fs, node );

    // read_params() leaves 'data' null on a malformed node
    if( !data )
        EXIT;

    base_value = (float)cvReadRealByName( fs, node, "base_value", 0.0 );
    class_count = cvReadIntByName( fs, node, "class_count", 1 );

    weak = new pCvSeq[class_count];


    for (int j=0; j<class_count; ++j)
    {
        s = cv::format("trees_%d", j);

        trees_fnode = cvGetFileNodeByName( fs, node, s.c_str() );
        if( !trees_fnode || !CV_NODE_IS_SEQ(trees_fnode->tag) )
            CV_ERROR( CV_StsParseError, "<trees_x> tag is missing" );

        cvStartReadSeq( trees_fnode->data.seq, &reader );
        ntrees = trees_fnode->data.seq->total;

        // the stored ensemble size must match the declared weak_count
        if( ntrees != params.weak_count )
            CV_ERROR( CV_StsUnmatchedSizes,
            "The number of trees stored does not match <ntrees> tag value" );

        CV_CALL( storage = cvCreateMemStorage() );
        weak[j] = cvCreateSeq( 0, sizeof(CvSeq), sizeof(CvDTree*), storage );

        for( i = 0; i < ntrees; i++ )
        {
            CvDTree* tree = new CvDTree();
            // trees share the train data created by read_params()
            CV_CALL(tree->read( fs, (CvFileNode*)reader.ptr, data ));
            CV_NEXT_SEQ_ELEM( reader.seq->elem_size, reader );
            cvSeqPush( weak[j], &tree );
        }
    }

    __END__;
}
1196
1197
//===========================================================================
1198
1199
class Sample_predictor : public cv::ParallelLoopBody
1200
{
1201
private:
1202
const CvGBTrees* gbt;
1203
float* predictions;
1204
const CvMat* samples;
1205
const CvMat* missing;
1206
const CvMat* idx;
1207
CvSlice slice;
1208
1209
public:
1210
Sample_predictor() : gbt(0), predictions(0), samples(0), missing(0),
1211
idx(0), slice(CV_WHOLE_SEQ)
1212
{}
1213
1214
Sample_predictor(const CvGBTrees* _gbt, float* _predictions,
1215
const CvMat* _samples, const CvMat* _missing,
1216
const CvMat* _idx, CvSlice _slice=CV_WHOLE_SEQ) :
1217
gbt(_gbt), predictions(_predictions), samples(_samples),
1218
missing(_missing), idx(_idx), slice(_slice)
1219
{}
1220
1221
1222
Sample_predictor( const Sample_predictor& p, cv::Split ) :
1223
gbt(p.gbt), predictions(p.predictions),
1224
samples(p.samples), missing(p.missing), idx(p.idx),
1225
slice(p.slice)
1226
{}
1227
1228
1229
virtual void operator()(const cv::Range& range) const
1230
{
1231
int begin = range.start;
1232
int end = range.end;
1233
1234
CvMat x;
1235
CvMat miss;
1236
1237
for (int i=begin; i<end; ++i)
1238
{
1239
int j = idx ? idx->data.i[i] : i;
1240
cvGetRow(samples, &x, j);
1241
if (!missing)
1242
{
1243
predictions[i] = gbt->predict_serial(&x,0,0,slice);
1244
}
1245
else
1246
{
1247
cvGetRow(missing, &miss, j);
1248
predictions[i] = gbt->predict_serial(&x,&miss,0,slice);
1249
}
1250
}
1251
} // Sample_predictor::operator()
1252
1253
virtual ~Sample_predictor() {}
1254
1255
}; // class Sample_predictor
1256
1257
1258
1259
// type in {CV_TRAIN_ERROR, CV_TEST_ERROR}
1260
float
1261
CvGBTrees::calc_error( CvMLData* _data, int type, std::vector<float> *resp )
1262
{
1263
1264
float err = 0.0f;
1265
const CvMat* _sample_idx = (type == CV_TRAIN_ERROR) ?
1266
_data->get_train_sample_idx() :
1267
_data->get_test_sample_idx();
1268
const CvMat* response = _data->get_responses();
1269
1270
int n = _sample_idx ? get_len(_sample_idx) : 0;
1271
n = (type == CV_TRAIN_ERROR && n == 0) ? _data->get_values()->rows : n;
1272
1273
if (!n)
1274
return -FLT_MAX;
1275
1276
float* pred_resp = 0;
1277
bool needsFreeing = false;
1278
1279
if (resp)
1280
{
1281
resp->resize(n);
1282
pred_resp = &((*resp)[0]);
1283
}
1284
else
1285
{
1286
pred_resp = new float[n];
1287
needsFreeing = true;
1288
}
1289
1290
Sample_predictor predictor = Sample_predictor(this, pred_resp, _data->get_values(),
1291
_data->get_missing(), _sample_idx);
1292
1293
cv::parallel_for_(cv::Range(0,n), predictor);
1294
1295
int* sidx = _sample_idx ? _sample_idx->data.i : 0;
1296
int r_step = CV_IS_MAT_CONT(response->type) ?
1297
1 : response->step / CV_ELEM_SIZE(response->type);
1298
1299
1300
if ( !problem_type() )
1301
{
1302
for( int i = 0; i < n; i++ )
1303
{
1304
int si = sidx ? sidx[i] : i;
1305
int d = fabs((double)pred_resp[i] - response->data.fl[si*r_step]) <= FLT_EPSILON ? 0 : 1;
1306
err += d;
1307
}
1308
err = err / (float)n * 100.0f;
1309
}
1310
else
1311
{
1312
for( int i = 0; i < n; i++ )
1313
{
1314
int si = sidx ? sidx[i] : i;
1315
float d = pred_resp[i] - response->data.fl[si*r_step];
1316
err += d*d;
1317
}
1318
err = err / (float)n;
1319
}
1320
1321
if (needsFreeing)
1322
delete[]pred_resp;
1323
1324
return err;
1325
}
1326
1327
1328
// Convenience constructor: zero-initializes all state and immediately
// trains on the supplied cv::Mat data.
CvGBTrees::CvGBTrees( const cv::Mat& trainData, int tflag,
          const cv::Mat& responses, const cv::Mat& varIdx,
          const cv::Mat& sampleIdx, const cv::Mat& varType,
          const cv::Mat& missingDataMask,
          CvGBTreesParams _params )
{
    // Put every pointer member into a well-defined null state before
    // clear() runs (clear() releases whatever these point to).
    weak = 0;
    data = 0;
    default_model_name = "my_boost_tree";
    orig_response = sum_response = sum_response_tmp = 0;
    subsample_train = subsample_test = 0;
    sample_idx = missing = 0;
    class_labels = 0;
    class_count = 1;
    delta = 0.0f;

    clear();

    // Delegate to the cv::Mat train(); update=false => train from scratch.
    train(trainData, tflag, responses, varIdx, sampleIdx, varType, missingDataMask, _params, false);
}
1348
1349
// cv::Mat front-end for train(): wraps the arguments as legacy CvMat
// headers (no data copy) and forwards to the CvMat implementation.
// Empty optional matrices are forwarded as null pointers.
bool CvGBTrees::train( const cv::Mat& trainData, int tflag,
           const cv::Mat& responses, const cv::Mat& varIdx,
           const cv::Mat& sampleIdx, const cv::Mat& varType,
           const cv::Mat& missingDataMask,
           CvGBTreesParams _params,
           bool update )
{
    CvMat tdata = trainData, resp = responses;
    CvMat vidx = varIdx, sample_sel = sampleIdx, vtype = varType;
    CvMat mmask = missingDataMask;

    return train( &tdata, tflag, &resp,
                  varIdx.empty() ? 0 : &vidx,
                  sampleIdx.empty() ? 0 : &sample_sel,
                  varType.empty() ? 0 : &vtype,
                  missingDataMask.empty() ? 0 : &mmask,
                  _params, update );
}
1364
1365
// cv::Mat front-end for predict(): converts the sample and optional
// missing-value mask to legacy CvMat headers and maps a full cv::Range
// onto CV_WHOLE_SEQ.
float CvGBTrees::predict( const cv::Mat& sample, const cv::Mat& _missing,
                          const cv::Range& slice, int k ) const
{
    CvMat s = sample, m = _missing;
    CvSlice sl = (slice == cv::Range::all()) ? CV_WHOLE_SEQ
                                             : cvSlice(slice.start, slice.end);
    return predict( &s, _missing.empty() ? 0 : &m, 0, sl, k );
}
1372
1373
#endif
1374
1375