// Origin: OpenCV ML module, modules/ml/src/gbt.cpp (gradient boosted trees),
// from the Tetragramm/opencv fork. Web-viewer navigation text removed.
#include "precomp.hpp"
3
#include <time.h>
4
5
#if 0
6
7
#define pCvSeq CvSeq*
8
#define pCvDTreeNode CvDTreeNode*
9
10
//===========================================================================
11
//----------------------------- CvGBTreesParams -----------------------------
12
//===========================================================================
13
14
// Default gradient-boosting parameters: squared loss, 200 weak learners,
// an 80% subsample portion and a 0.01 learning rate, on top of the base
// decision-tree settings passed to CvDTreeParams.
CvGBTreesParams::CvGBTreesParams()
            : CvDTreeParams( 3, 10, 0, false, 10, 0, false, false, 0 )
{
    loss_function_type = CvGBTrees::SQUARED_LOSS;
    shrinkage = 0.01f;
    subsample_portion = 0.8f;
    weak_count = 200;
}
22
23
//===========================================================================
24
25
// Explicit-parameter constructor. Tree settings not listed here keep the
// CvDTreeParams base defaults used by the default constructor.
CvGBTreesParams::CvGBTreesParams( int _loss_function_type, int _weak_count,
                         float _shrinkage, float _subsample_portion,
                         int _max_depth, bool _use_surrogates )
            : CvDTreeParams( 3, 10, 0, false, 10, 0, false, false, 0 )
{
    weak_count = _weak_count;
    loss_function_type = _loss_function_type;
    subsample_portion = _subsample_portion;
    shrinkage = _shrinkage;
    use_surrogates = _use_surrogates;
    max_depth = _max_depth;
}
37
38
//===========================================================================
39
//------------------------------- CvGBTrees ---------------------------------
40
//===========================================================================
41
42
// Construct an empty (untrained) model: null every owned pointer, then let
// clear() establish the canonical reset state.
CvGBTrees::CvGBTrees()
{
    weak = 0;
    data = 0;
    default_model_name = "my_boost_tree";
    orig_response = 0;
    sum_response = 0;
    sum_response_tmp = 0;
    subsample_train = 0;
    subsample_test = 0;
    sample_idx = 0;
    missing = 0;
    class_labels = 0;
    class_count = 1;
    delta = 0.0f;

    clear();
}
56
57
//===========================================================================
58
59
int CvGBTrees::get_len(const CvMat* mat) const
60
{
61
return (mat->cols > mat->rows) ? mat->cols : mat->rows;
62
}
63
64
//===========================================================================
65
66
void CvGBTrees::clear()
67
{
68
if( weak )
69
{
70
CvSeqReader reader;
71
CvSlice slice = CV_WHOLE_SEQ;
72
CvDTree* tree;
73
74
//data->shared = false;
75
for (int i=0; i<class_count; ++i)
76
{
77
int weak_count = cvSliceLength( slice, weak[i] );
78
if ((weak[i]) && (weak_count))
79
{
80
cvStartReadSeq( weak[i], &reader );
81
cvSetSeqReaderPos( &reader, slice.start_index );
82
for (int j=0; j<weak_count; ++j)
83
{
84
CV_READ_SEQ_ELEM( tree, reader );
85
//tree->clear();
86
delete tree;
87
tree = 0;
88
}
89
}
90
}
91
for (int i=0; i<class_count; ++i)
92
if (weak[i]) cvReleaseMemStorage( &(weak[i]->storage) );
93
delete[] weak;
94
}
95
if (data)
96
{
97
data->shared = false;
98
delete data;
99
}
100
weak = 0;
101
data = 0;
102
delta = 0.0f;
103
cvReleaseMat( &orig_response );
104
cvReleaseMat( &sum_response );
105
cvReleaseMat( &sum_response_tmp );
106
cvReleaseMat( &subsample_train );
107
cvReleaseMat( &subsample_test );
108
cvReleaseMat( &sample_idx );
109
cvReleaseMat( &missing );
110
cvReleaseMat( &class_labels );
111
}
112
113
//===========================================================================
114
115
// Destructor: all cleanup is centralized in clear().
CvGBTrees::~CvGBTrees()
{
    clear();
}
119
120
//===========================================================================
121
122
// Convenience constructor: zero-initialize all state exactly as the default
// constructor does, then train on the supplied data immediately.
CvGBTrees::CvGBTrees( const CvMat* _train_data, int _tflag,
          const CvMat* _responses, const CvMat* _var_idx,
          const CvMat* _sample_idx, const CvMat* _var_type,
          const CvMat* _missing_mask, CvGBTreesParams _params )
{
    data = 0;
    weak = 0;
    default_model_name = "my_boost_tree";
    orig_response = 0;
    sum_response = 0;
    sum_response_tmp = 0;
    subsample_train = 0;
    subsample_test = 0;
    sample_idx = 0;
    missing = 0;
    class_labels = 0;
    class_count = 1;
    delta = 0.0f;

    train( _train_data, _tflag, _responses, _var_idx, _sample_idx,
           _var_type, _missing_mask, _params );
}
140
141
//===========================================================================
142
143
bool CvGBTrees::problem_type() const
144
{
145
switch (params.loss_function_type)
146
{
147
case DEVIANCE_LOSS: return false;
148
default: return true;
149
}
150
}
151
152
//===========================================================================
153
154
bool
155
CvGBTrees::train( CvMLData* _data, CvGBTreesParams _params, bool update )
156
{
157
bool result;
158
result = train ( _data->get_values(), CV_ROW_SAMPLE,
159
_data->get_responses(), _data->get_var_idx(),
160
_data->get_train_sample_idx(), _data->get_var_types(),
161
_data->get_missing(), _params, update);
162
//update is not supported
163
return result;
164
}
165
166
//===========================================================================
167
168
169
// Full training entry point. Runs params.weak_count boosting iterations;
// for classification (deviance loss) one regression tree per class is grown
// each iteration. Incremental update is not supported.
bool
CvGBTrees::train( const CvMat* _train_data, int _tflag,
              const CvMat* _responses, const CvMat* _var_idx,
              const CvMat* _sample_idx, const CvMat* _var_type,
              const CvMat* _missing_mask,
              CvGBTreesParams _params, bool /*_update*/ ) //update is not supported
{
    CvMemStorage* storage = 0;

    params = _params;
    bool is_regression = problem_type();

    clear();
    /*
      n - count of samples
      m - count of variables
    */
    int n = _train_data->rows;
    int m = _train_data->cols;
    if (_tflag != CV_ROW_SAMPLE)
    {
        // samples are stored as columns: swap the dimensions
        int tmp;
        CV_SWAP(n,m,tmp);
    }

    // data->responses will hold the per-iteration pseudo-residuals; the
    // true responses are copied into orig_response below.
    CvMat* new_responses = cvCreateMat( n, 1, CV_32F);
    cvZero(new_responses);

    data = new CvDTreeTrainData( _train_data, _tflag, new_responses, _var_idx,
        _sample_idx, _var_type, _missing_mask, _params, true, true );
    if (_missing_mask)
    {
        missing = cvCreateMat(_missing_mask->rows, _missing_mask->cols,
                              _missing_mask->type);
        cvCopy( _missing_mask, missing);
    }

    // Copy the responses into a dense float row vector.
    orig_response = cvCreateMat( 1, n, CV_32F );
    int step = (_responses->cols > _responses->rows) ? 1 : _responses->step / CV_ELEM_SIZE(_responses->type);
    switch (CV_MAT_TYPE(_responses->type))
    {
        case CV_32FC1:
        {
            for (int i=0; i<n; ++i)
                orig_response->data.fl[i] = _responses->data.fl[i*step];
        }; break;
        case CV_32SC1:
        {
            for (int i=0; i<n; ++i)
                orig_response->data.fl[i] = (float) _responses->data.i[i*step];
        }; break;
        default:
            CV_Error(CV_StsUnmatchedFormats, "Response should be a 32fC1 or 32sC1 vector.");
    }

    // Classification: count the distinct labels and record them once each,
    // in order of first appearance, into class_labels.
    if (!is_regression)
    {
        class_count = 0;
        unsigned char * mask = new unsigned char[n];
        memset(mask, 0, n);
        // compute the count of different output classes
        for (int i=0; i<n; ++i)
            if (!mask[i])
            {
                class_count++;
                for (int j=i; j<n; ++j)
                    if (int(orig_response->data.fl[j]) == int(orig_response->data.fl[i]))
                        mask[j] = 1;
            }
        delete[] mask;

        class_labels = cvCreateMat(1, class_count, CV_32S);
        class_labels->data.i[0] = int(orig_response->data.fl[0]);
        int j = 1;
        for (int i=1; i<n; ++i)
        {
            int k = 0;
            while ((k<j) && (int(orig_response->data.fl[i]) - class_labels->data.i[k]))
                k++;
            if (k == j)
            {
                class_labels->data.i[k] = int(orig_response->data.fl[i]);
                j++;
            }
        }
    }

    // inside gbt learning process only regression decision trees are built
    data->is_classifier = false;

    // preproccessing sample indices
    if (_sample_idx)
    {
        int sample_idx_len = get_len(_sample_idx);

        switch (CV_MAT_TYPE(_sample_idx->type))
        {
            // explicit index list: copy it and sort ascending
            case CV_32SC1:
            {
                sample_idx = cvCreateMat( 1, sample_idx_len, CV_32S );
                for (int i=0; i<sample_idx_len; ++i)
                    sample_idx->data.i[i] = _sample_idx->data.i[i];
                std::sort(sample_idx->data.i, sample_idx->data.i + sample_idx_len);
            } break;
            // byte mask: convert to an index list of the non-zero entries
            case CV_8S:
            case CV_8U:
            {
                int active_samples_count = 0;
                for (int i=0; i<sample_idx_len; ++i)
                    active_samples_count += int( _sample_idx->data.ptr[i] );
                sample_idx = cvCreateMat( 1, active_samples_count, CV_32S );
                active_samples_count = 0;
                for (int i=0; i<sample_idx_len; ++i)
                    if (int( _sample_idx->data.ptr[i] ))
                        sample_idx->data.i[active_samples_count++] = i;

            } break;
            default: CV_Error(CV_StsUnmatchedFormats, "_sample_idx should be a 32sC1, 8sC1 or 8uC1 vector.");
        }
    }
    else
    {
        // no subset given: use every sample
        sample_idx = cvCreateMat( 1, n, CV_32S );
        for (int i=0; i<n; ++i)
            sample_idx->data.i[i] = i;
    }

    // one row of accumulated predictions per class (double-buffered with
    // sum_response_tmp, swapped after every boosting iteration)
    sum_response = cvCreateMat(class_count, n, CV_32F);
    sum_response_tmp = cvCreateMat(class_count, n, CV_32F);
    cvZero(sum_response);

    delta = 0.0f;
    /*
      in the case of a regression problem the initial guess (the zero term
      in the sum) is set to the mean of all the training responses, that is
      the best constant model
    */
    if (is_regression) base_value = find_optimal_value(sample_idx);
    /*
      in the case of a classification problem the initial guess (the zero term
      in the sum) is set to zero for all the trees sequences
    */
    else base_value = 0.0f;
    /*
      current predicition on all training samples is set to be
      equal to the base_value
    */
    cvSet( sum_response, cvScalar(base_value) );

    // one tree sequence per class, each in its own memory storage
    weak = new pCvSeq[class_count];
    for (int i=0; i<class_count; ++i)
    {
        storage = cvCreateMemStorage();
        weak[i] = cvCreateSeq( 0, sizeof(CvSeq), sizeof(CvDTree*), storage );
        storage = 0;
    }

    // subsample params and data
    rng = &cv::theRNG();

    int samples_count = get_len(sample_idx);

    // degenerate portions (~0 or ~1) disable subsampling entirely
    params.subsample_portion = params.subsample_portion <= FLT_EPSILON ||
        1 - params.subsample_portion <= FLT_EPSILON
        ? 1 : params.subsample_portion;
    int train_sample_count = cvFloor(params.subsample_portion * samples_count);
    if (train_sample_count == 0)
        train_sample_count = samples_count;
    int test_sample_count = samples_count - train_sample_count;
    // subsample_train and subsample_test are two views into one shared
    // index buffer (idx_data), reshuffled each iteration by do_subsample()
    int* idx_data = new int[samples_count];
    subsample_train = cvCreateMatHeader( 1, train_sample_count, CV_32SC1 );
    *subsample_train = cvMat( 1, train_sample_count, CV_32SC1, idx_data );
    if (test_sample_count)
    {
        subsample_test  = cvCreateMatHeader( 1, test_sample_count, CV_32SC1 );
        *subsample_test = cvMat( 1, test_sample_count, CV_32SC1,
                                 idx_data + train_sample_count );
    }

    // training procedure

    for ( int i=0; i < params.weak_count; ++i )
    {
        do_subsample();
        for ( int k=0; k < class_count; ++k )
        {
            // fit a regression tree to the pseudo-residuals of class k
            find_gradient(k);
            CvDTree* tree = new CvDTree;
            tree->train( data, subsample_train );
            // change_values() re-optimizes the leaves and updates
            // sum_response_tmp for the training part of the subsample
            change_values(tree, k);

            // propagate the new tree's shrunken output to held-out samples
            if (subsample_test)
            {
                CvMat x;
                CvMat x_miss;
                int* sample_data = sample_idx->data.i;
                int* subsample_data = subsample_test->data.i;
                int s_step = (sample_idx->cols > sample_idx->rows) ? 1
                             : sample_idx->step/CV_ELEM_SIZE(sample_idx->type);
                for (int j=0; j<get_len(subsample_test); ++j)
                {
                    int idx = *(sample_data + subsample_data[j]*s_step);
                    float res = 0.0f;
                    if (_tflag == CV_ROW_SAMPLE)
                        cvGetRow( data->train_data, &x, idx);
                    else
                        cvGetCol( data->train_data, &x, idx);

                    if (missing)
                    {
                        if (_tflag == CV_ROW_SAMPLE)
                            cvGetRow( missing, &x_miss, idx);
                        else
                            cvGetCol( missing, &x_miss, idx);

                        res = (float)tree->predict(&x, &x_miss)->value;
                    }
                    else
                    {
                        res = (float)tree->predict(&x)->value;
                    }
                    sum_response_tmp->data.fl[idx + k*n] =
                                  sum_response->data.fl[idx + k*n] +
                                  params.shrinkage * res;
                }
            }

            cvSeqPush( weak[k], &tree );
            tree = 0;
        } // k=0..class_count
        // swap the double-buffered prediction matrices
        CvMat* tmp;
        tmp = sum_response_tmp;
        sum_response_tmp = sum_response;
        sum_response = tmp;
        tmp = 0;
    } // i=0..params.weak_count

    delete[] idx_data;
    cvReleaseMat(&new_responses);
    data->free_train_data();

    return true;

} // CvGBTrees::train(...)
413
414
//===========================================================================
415
416
// Mathematical sign of x: -1, 0 or +1, returned as a float.
inline float Sign(float x)
{
    return (x > 0.0f) ? 1.0f : ((x < 0.0f) ? -1.0f : 0.0f);
}
422
423
//===========================================================================
424
425
void CvGBTrees::find_gradient(const int k)
426
{
427
int* sample_data = sample_idx->data.i;
428
int* subsample_data = subsample_train->data.i;
429
float* grad_data = data->responses->data.fl;
430
float* resp_data = orig_response->data.fl;
431
float* current_data = sum_response->data.fl;
432
433
switch (params.loss_function_type)
434
// loss_function_type in
435
// {SQUARED_LOSS, ABSOLUTE_LOSS, HUBER_LOSS, DEVIANCE_LOSS}
436
{
437
case SQUARED_LOSS:
438
{
439
for (int i=0; i<get_len(subsample_train); ++i)
440
{
441
int s_step = (sample_idx->cols > sample_idx->rows) ? 1
442
: sample_idx->step/CV_ELEM_SIZE(sample_idx->type);
443
int idx = *(sample_data + subsample_data[i]*s_step);
444
grad_data[idx] = resp_data[idx] - current_data[idx];
445
}
446
}; break;
447
448
case ABSOLUTE_LOSS:
449
{
450
for (int i=0; i<get_len(subsample_train); ++i)
451
{
452
int s_step = (sample_idx->cols > sample_idx->rows) ? 1
453
: sample_idx->step/CV_ELEM_SIZE(sample_idx->type);
454
int idx = *(sample_data + subsample_data[i]*s_step);
455
grad_data[idx] = Sign(resp_data[idx] - current_data[idx]);
456
}
457
}; break;
458
459
case HUBER_LOSS:
460
{
461
float alpha = 0.2f;
462
int n = get_len(subsample_train);
463
int s_step = (sample_idx->cols > sample_idx->rows) ? 1
464
: sample_idx->step/CV_ELEM_SIZE(sample_idx->type);
465
466
float* residuals = new float[n];
467
for (int i=0; i<n; ++i)
468
{
469
int idx = *(sample_data + subsample_data[i]*s_step);
470
residuals[i] = fabs(resp_data[idx] - current_data[idx]);
471
}
472
std::sort(residuals, residuals + n);
473
474
delta = residuals[int(ceil(n*alpha))];
475
476
for (int i=0; i<n; ++i)
477
{
478
int idx = *(sample_data + subsample_data[i]*s_step);
479
float r = resp_data[idx] - current_data[idx];
480
grad_data[idx] = (fabs(r) > delta) ? delta*Sign(r) : r;
481
}
482
delete[] residuals;
483
484
}; break;
485
486
case DEVIANCE_LOSS:
487
{
488
for (int i=0; i<get_len(subsample_train); ++i)
489
{
490
double exp_fk = 0;
491
double exp_sfi = 0;
492
int s_step = (sample_idx->cols > sample_idx->rows) ? 1
493
: sample_idx->step/CV_ELEM_SIZE(sample_idx->type);
494
int idx = *(sample_data + subsample_data[i]*s_step);
495
496
for (int j=0; j<class_count; ++j)
497
{
498
double res;
499
res = current_data[idx + j*sum_response->cols];
500
res = exp(res);
501
if (j == k) exp_fk = res;
502
exp_sfi += res;
503
}
504
int orig_label = int(resp_data[idx]);
505
/*
506
grad_data[idx] = (float)(!(k-class_labels->data.i[orig_label]+1)) -
507
(float)(exp_fk / exp_sfi);
508
*/
509
int ensemble_label = 0;
510
while (class_labels->data.i[ensemble_label] - orig_label)
511
ensemble_label++;
512
513
grad_data[idx] = (float)(!(k-ensemble_label)) -
514
(float)(exp_fk / exp_sfi);
515
}
516
}; break;
517
518
default: break;
519
}
520
521
} // CvGBTrees::find_gradient(...)
522
523
//===========================================================================
524
525
// After a tree has been fitted to the pseudo-residuals, replace each leaf's
// value with the loss-optimal constant for the samples that fell into it,
// and accumulate the shrunken leaf contributions into sum_response_tmp.
// _k is the class index (row of sum_response affected).
void CvGBTrees::change_values(CvDTree* tree, const int _k)
{
    // predictions[i] = leaf node hit by sample i of the training subsample
    CvDTreeNode** predictions = new pCvDTreeNode[get_len(subsample_train)];

    int* sample_data = sample_idx->data.i;
    int* subsample_data = subsample_train->data.i;
    // element stride of sample_idx (1 for a row vector)
    int s_step = (sample_idx->cols > sample_idx->rows) ? 1
                 : sample_idx->step/CV_ELEM_SIZE(sample_idx->type);

    CvMat x;
    CvMat miss_x;

    // Route every subsample member through the tree to find its leaf.
    for (int i=0; i<get_len(subsample_train); ++i)
    {
        int idx = *(sample_data + subsample_data[i]*s_step);
        if (data->tflag == CV_ROW_SAMPLE)
            cvGetRow( data->train_data, &x, idx);
        else
            cvGetCol( data->train_data, &x, idx);

        if (missing)
        {
            if (data->tflag == CV_ROW_SAMPLE)
                cvGetRow( missing, &miss_x, idx);
            else
                cvGetCol( missing, &miss_x, idx);

            predictions[i] = tree->predict(&x, &miss_x);
        }
        else
            predictions[i] = tree->predict(&x);
    }


    CvDTreeNode** leaves;
    int leaves_count = 0;
    leaves = GetLeaves( tree, leaves_count);

    for (int i=0; i<leaves_count; ++i)
    {
        // Count the subsample members that landed in this leaf.
        int samples_in_leaf = 0;
        for (int j=0; j<get_len(subsample_train); ++j)
        {
            if (leaves[i] == predictions[j]) samples_in_leaf++;
        }

        if (!samples_in_leaf) // It should not be done anyways! but...
        {
            leaves[i]->value = 0.0;
            continue;
        }

        // Gather the original-sample indices belonging to this leaf.
        CvMat* leaf_idx = cvCreateMat(1, samples_in_leaf, CV_32S);
        int* leaf_idx_data = leaf_idx->data.i;

        for (int j=0; j<get_len(subsample_train); ++j)
        {
            int idx = *(sample_data + subsample_data[j]*s_step);
            if (leaves[i] == predictions[j])
                *leaf_idx_data++ = idx;
        }

        // Loss-optimal constant for this leaf (depends on the loss type).
        float value = find_optimal_value(leaf_idx);
        leaves[i]->value = value;

        leaf_idx_data = leaf_idx->data.i;

        // Accumulate the shrunken update into the scratch response matrix.
        int len = sum_response_tmp->cols;
        for (int j=0; j<get_len(leaf_idx); ++j)
        {
            int idx = leaf_idx_data[j];
            sum_response_tmp->data.fl[idx + _k*len] =
                             sum_response->data.fl[idx + _k*len] +
                             params.shrinkage * value;
        }
        leaf_idx_data = 0;
        cvReleaseMat(&leaf_idx);
    }

    // releasing the memory
    for (int i=0; i<get_len(subsample_train); ++i)
    {
        predictions[i] = 0;
    }
    delete[] predictions;

    for (int i=0; i<leaves_count; ++i)
    {
        leaves[i] = 0;
    }
    delete[] leaves;

}
618
619
//===========================================================================
620
/*
621
void CvGBTrees::change_values(CvDTree* tree, const int _k)
622
{
623
624
CvDTreeNode** leaves;
625
int leaves_count = 0;
626
int offset = _k*sum_response_tmp->cols;
627
CvMat leaf_idx;
628
leaf_idx.rows = 1;
629
630
leaves = GetLeaves( tree, leaves_count);
631
632
for (int i=0; i<leaves_count; ++i)
633
{
634
int n = leaves[i]->sample_count;
635
int* leaf_idx_data = new int[n];
636
data->get_sample_indices(leaves[i], leaf_idx_data);
637
//CvMat* leaf_idx = new CvMat();
638
//cvInitMatHeader(leaf_idx, n, 1, CV_32S, leaf_idx_data);
639
leaf_idx.cols = n;
640
leaf_idx.data.i = leaf_idx_data;
641
642
float value = find_optimal_value(&leaf_idx);
643
leaves[i]->value = value;
644
float val = params.shrinkage * value;
645
646
647
for (int j=0; j<n; ++j)
648
{
649
int idx = leaf_idx_data[j] + offset;
650
sum_response_tmp->data.fl[idx] = sum_response->data.fl[idx] + val;
651
}
652
//leaf_idx_data = 0;
653
//cvReleaseMat(&leaf_idx);
654
leaf_idx.data.i = 0;
655
//delete leaf_idx;
656
delete[] leaf_idx_data;
657
}
658
659
// releasing the memory
660
for (int i=0; i<leaves_count; ++i)
661
{
662
leaves[i] = 0;
663
}
664
delete[] leaves;
665
666
} //change_values(...);
667
*/
668
//===========================================================================
669
670
// Returns the constant gamma minimizing the configured loss over the samples
// whose indices are listed in _Idx (indices into orig_response/sum_response):
//   squared loss  -> mean residual,
//   absolute loss -> median residual,
//   Huber loss    -> one-step robust estimate around the median,
//   deviance loss -> Newton step (K-1)/K * sum(g) / sum(|g|(1-|g|)).
float CvGBTrees::find_optimal_value( const CvMat* _Idx )
{

    double gamma = (double)0.0;

    int* idx = _Idx->data.i;
    float* resp_data = orig_response->data.fl;
    float* cur_data = sum_response->data.fl;
    int n = get_len(_Idx);

    switch (params.loss_function_type)
    // SQUARED_LOSS=0, ABSOLUTE_LOSS=1, HUBER_LOSS=3, DEVIANCE_LOSS=4
    {
    case SQUARED_LOSS:
        {
            // mean of the residuals
            for (int i=0; i<n; ++i)
                gamma += resp_data[idx[i]] - cur_data[idx[i]];
            gamma /= (double)n;
        }; break;

    case ABSOLUTE_LOSS:
        {
            // median of the residuals
            float* residuals = new float[n];
            for (int i=0; i<n; ++i, ++idx)
                residuals[i] = (resp_data[*idx] - cur_data[*idx]);
            std::sort(residuals, residuals + n);
            if (n % 2)
                gamma = residuals[n/2];
            else gamma = (residuals[n/2-1] + residuals[n/2]) / 2.0f;
            delete[] residuals;
        }; break;

    case HUBER_LOSS:
        {
            float* residuals = new float[n];
            for (int i=0; i<n; ++i, ++idx)
                residuals[i] = (resp_data[*idx] - cur_data[*idx]);
            std::sort(residuals, residuals + n);

            // median, then a delta-clipped mean correction (delta was set
            // by find_gradient as the alpha-quantile of |residuals|)
            int n_half = n >> 1;
            float r_median = (n == n_half<<1) ?
                        (residuals[n_half-1] + residuals[n_half]) / 2.0f :
                        residuals[n_half];

            for (int i=0; i<n; ++i)
            {
                float dif = residuals[i] - r_median;
                gamma += (delta < fabs(dif)) ? Sign(dif)*delta : dif;
            }
            gamma /= (double)n;
            gamma += r_median;
            delete[] residuals;

        }; break;

    case DEVIANCE_LOSS:
        {
            // data->responses currently holds the pseudo-gradient values
            // written by find_gradient().
            float* grad_data = data->responses->data.fl;
            double tmp1 = 0;
            double tmp2 = 0;
            double tmp  = 0;
            for (int i=0; i<n; ++i)
            {
                tmp = grad_data[idx[i]];
                tmp1 += tmp;
                tmp2 += fabs(tmp)*(1-fabs(tmp));
            };
            // guard against a zero denominator
            if (tmp2 == 0)
            {
                tmp2 = 1;
            }

            gamma = ((double)(class_count-1)) / (double)class_count * (tmp1/tmp2);
        }; break;

    default: break;
    }

    return float(gamma);

} // CvGBTrees::find_optimal_value
751
752
//===========================================================================
753
754
755
// Depth-first collection of the terminal nodes of a subtree into 'leaves',
// advancing 'count' past each appended node (left subtree first).
void CvGBTrees::leaves_get( CvDTreeNode** leaves, int& count, CvDTreeNode* node )
{
    const bool is_terminal = (node->left == NULL) && (node->right == NULL);
    if (is_terminal)
    {
        leaves[count++] = node;
        return;
    }
    if (node->left != NULL)
        leaves_get(leaves, count, node->left);
    if (node->right != NULL)
        leaves_get(leaves, count, node->right);
}
762
763
//---------------------------------------------------------------------------
764
765
// Collects the leaf nodes of a tree; 'len' receives the leaf count. The
// buffer is sized for the deepest allowed tree (2^max_depth leaves) and is
// owned by the caller (delete[]).
CvDTreeNode** CvGBTrees::GetLeaves( const CvDTree* dtree, int& len )
{
    const size_t max_leaves = (size_t)1 << params.max_depth;
    CvDTreeNode** leaves = new pCvDTreeNode[max_leaves];
    len = 0;
    leaves_get(leaves, len, const_cast<pCvDTreeNode>(dtree->get_root()));
    return leaves;
}
772
773
//===========================================================================
774
775
void CvGBTrees::do_subsample()
776
{
777
778
int n = get_len(sample_idx);
779
int* idx = subsample_train->data.i;
780
781
for (int i = 0; i < n; i++ )
782
idx[i] = i;
783
784
if (subsample_test)
785
for (int i = 0; i < n; i++)
786
{
787
int a = (*rng)(n);
788
int b = (*rng)(n);
789
int t;
790
CV_SWAP( idx[a], idx[b], t );
791
}
792
793
/*
794
int n = get_len(sample_idx);
795
if (subsample_train == 0)
796
subsample_train = cvCreateMat(1, n, CV_32S);
797
int* subsample_data = subsample_train->data.i;
798
for (int i=0; i<n; ++i)
799
subsample_data[i] = i;
800
subsample_test = 0;
801
*/
802
}
803
804
//===========================================================================
805
806
// Single-threaded prediction on one sample.
// 'slice' selects which trees of each per-class ensemble participate; 'k'
// selects a single class score (0 <= k < class_count) or, for k == -1, the
// arg-max class label. If weak_responses is given (32F; class_count x
// weak_count rows, or 1 x weak_count when k is fixed) the individual tree
// responses are stored into it. Returns 0 on an untrained model or on a
// badly-shaped weak_responses matrix.
float CvGBTrees::predict_serial( const CvMat* _sample, const CvMat* _missing,
        CvMat* weak_responses, CvSlice slice, int k) const
{
    float result = 0.0f;

    if (!weak) return 0.0f;   // model not trained

    CvSeqReader reader;
    int weak_count = cvSliceLength( slice, weak[class_count-1] );
    CvDTree* tree;

    // Validate the shape/type of the optional per-tree output matrix.
    if (weak_responses)
    {
        if (CV_MAT_TYPE(weak_responses->type) != CV_32F)
            return 0.0f;
        if ((k >= 0) && (k<class_count) && (weak_responses->rows != 1))
            return 0.0f;
        if ((k == -1) && (weak_responses->rows != class_count))
            return 0.0f;
        if (weak_responses->cols != weak_count)
            return 0.0f;
    }

    // Accumulate the shrunken tree outputs, one sum per class.
    float* sum = new float[class_count];
    memset(sum, 0, class_count*sizeof(float));

    for (int i=0; i<class_count; ++i)
    {
        if ((weak[i]) && (weak_count))
        {
            cvStartReadSeq( weak[i], &reader );
            cvSetSeqReaderPos( &reader, slice.start_index );
            for (int j=0; j<weak_count; ++j)
            {
                CV_READ_SEQ_ELEM( tree, reader );
                float p = (float)(tree->predict(_sample, _missing)->value);
                sum[i] += params.shrinkage * p;
                if (weak_responses)
                    weak_responses->data.fl[i*weak_count+j] = p;
            }
        }
    }

    // Add the constant zero-term of the boosted sum.
    for (int i=0; i<class_count; ++i)
        sum[i] += base_value;

    if (class_count == 1)   // regression: the single accumulated value
    {
        result = sum[0];
        delete[] sum;
        return result;
    }

    if ((k>=0) && (k<class_count))   // score of the one requested class
    {
        result = sum[k];
        delete[] sum;
        return result;
    }

    // Multi-class: take the class with the maximal score ...
    float max = sum[0];
    int class_label = 0;
    for (int i=1; i<class_count; ++i)
        if (sum[i] > max)
        {
            max = sum[i];
            class_label = i;
        }

    delete[] sum;

    /*
    int orig_class_label = -1;
    for (int i=0; i<get_len(class_labels); ++i)
        if (class_labels->data.i[i] == class_label+1)
            orig_class_label = i;
    */
    // ... and map the ensemble index back to the original label value.
    int orig_class_label = class_labels->data.i[class_label];

    return float(orig_class_label);
}
887
888
889
// Parallel helper for CvGBTrees::predict: each invocation sums the shrunken
// responses of a contiguous range of trees (tree indices within every
// per-class sequence) into the shared per-class accumulator 'sum'.
class Tree_predictor : public cv::ParallelLoopBody
{
private:
    pCvSeq* weak;           // per-class tree sequences (not owned)
    float* sum;             // shared per-class accumulator (not owned)
    const int k;            // number of classes to process
    const CvMat* sample;    // sample to predict (not owned)
    const CvMat* missing;   // optional missing-value mask (not owned)
    const float shrinkage;  // per-tree learning-rate multiplier

    // Serializes concurrent updates of 'sum' across worker threads.
    static cv::Mutex SumMutex;


public:
    Tree_predictor() : weak(0), sum(0), k(0), sample(0), missing(0), shrinkage(1.0f) {}
    Tree_predictor(pCvSeq* _weak, const int _k, const float _shrinkage,
                   const CvMat* _sample, const CvMat* _missing, float* _sum ) :
        weak(_weak), sum(_sum), k(_k), sample(_sample),
        missing(_missing), shrinkage(_shrinkage)
    {}

    // Split constructor: workers share the same pointers/accumulator.
    Tree_predictor( const Tree_predictor& p, cv::Split ) :
        weak(p.weak), sum(p.sum), k(p.k), sample(p.sample),
        missing(p.missing), shrinkage(p.shrinkage)
    {}

    // Assignment intentionally does nothing (members are const/shared).
    Tree_predictor& operator=( const Tree_predictor& )
    { return *this; }

    // Accumulate shrunken responses of trees [range.start, range.end) for
    // every class into sum[0..k).
    virtual void operator()(const cv::Range& range) const
    {
        CvSeqReader reader;
        int begin = range.start;
        int end = range.end;

        int weak_count = end - begin;
        CvDTree* tree;

        for (int i=0; i<k; ++i)
        {
            // accumulate locally first, publish under the lock once
            float tmp_sum = 0.0f;
            if ((weak[i]) && (weak_count))
            {
                cvStartReadSeq( weak[i], &reader );
                cvSetSeqReaderPos( &reader, begin );
                for (int j=0; j<weak_count; ++j)
                {
                    CV_READ_SEQ_ELEM( tree, reader );
                    tmp_sum += shrinkage*(float)(tree->predict(sample, missing)->value);
                }
            }

            {
                cv::AutoLock lock(SumMutex);
                sum[i] += tmp_sum;
            }
        }
    } // Tree_predictor::operator()

    virtual ~Tree_predictor() {}

}; // class Tree_predictor

cv::Mutex Tree_predictor::SumMutex;
953
954
955
// Parallel prediction on one sample: the trees of the selected slice are
// distributed over threads via Tree_predictor, then the per-class sums are
// resolved exactly as in predict_serial. The weak_responses output is not
// supported by this parallel variant (the argument is ignored).
float CvGBTrees::predict( const CvMat* _sample, const CvMat* _missing,
            CvMat* /*weak_responses*/, CvSlice slice, int k) const
{
    float result = 0.0f;
    if (!weak) return 0.0f;   // model not trained
    float* sum = new float[class_count];
    for (int i=0; i<class_count; ++i)
        sum[i] = 0.0f;
    // tree-index range selected by the slice
    int begin = slice.start_index;
    int end = begin + cvSliceLength( slice, weak[0] );

    pCvSeq* weak_seq = weak;
    Tree_predictor predictor = Tree_predictor(weak_seq, class_count,
                                    params.shrinkage, _sample, _missing, sum);

    cv::parallel_for_(cv::Range(begin, end), predictor);

    // add the constant zero-term (shrinkage already applied per tree)
    for (int i=0; i<class_count; ++i)
        sum[i] = sum[i] /** params.shrinkage*/ + base_value;

    if (class_count == 1)   // regression: the single accumulated value
    {
        result = sum[0];
        delete[] sum;
        return result;
    }

    if ((k>=0) && (k<class_count))   // score of the one requested class
    {
        result = sum[k];
        delete[] sum;
        return result;
    }

    // arg-max over class scores ...
    float max = sum[0];
    int class_label = 0;
    for (int i=1; i<class_count; ++i)
        if (sum[i] > max)
        {
            max = sum[i];
            class_label = i;
        }

    delete[] sum;
    // ... mapped back to the original label value
    int orig_class_label = class_labels->data.i[class_label];

    return float(orig_class_label);
}
1003
1004
1005
//===========================================================================
1006
1007
// Serialize the boosting parameters (and the class-label mapping, if any);
// the tree-construction parameters are written by the shared train data.
void CvGBTrees::write_params( CvFileStorage* fs ) const
{
    const char* loss_function_type_str =
        params.loss_function_type == SQUARED_LOSS ? "SquaredLoss" :
        params.loss_function_type == ABSOLUTE_LOSS ? "AbsoluteLoss" :
        params.loss_function_type == HUBER_LOSS ? "HuberLoss" :
        params.loss_function_type == DEVIANCE_LOSS ? "DevianceLoss" : 0;


    if( loss_function_type_str )
        cvWriteString( fs, "loss_function", loss_function_type_str );
    else
        // unknown code: fall back to storing the raw integer
        cvWriteInt( fs, "loss_function", params.loss_function_type );

    cvWriteInt( fs, "ensemble_length", params.weak_count );
    cvWriteReal( fs, "shrinkage", params.shrinkage );
    cvWriteReal( fs, "subsample_portion", params.subsample_portion );
    //cvWriteInt( fs, "max_tree_depth", params.max_depth );
    //cvWriteString( fs, "use_surrogate_splits", params.use_surrogates ? "true" : "false");
    if (class_labels) cvWrite( fs, "class_labels", class_labels);

    // Temporarily restore the classifier flag so the tree-data parameters
    // are written with the correct problem type, then reset it (training
    // internally always uses regression trees).
    data->is_classifier = !problem_type();
    data->write_params( fs );
    data->is_classifier = 0;
}
1032
1033
1034
//===========================================================================
1035
1036
// Deserialize the parameters written by write_params(); also creates the
// shared CvDTreeTrainData and restores the class-label matrix (if the
// stored model is a classifier).
void CvGBTrees::read_params( CvFileStorage* fs, CvFileNode* fnode )
{
    CV_FUNCNAME( "CvGBTrees::read_params" );
    __BEGIN__;


    CvFileNode* temp;

    if( !fnode || !CV_NODE_IS_MAP(fnode->tag) )
        return;

    data = new CvDTreeTrainData();
    CV_CALL( data->read_params(fs, fnode));
    data->shared = true;   // trees read later will share this train data

    // Mirror the tree-construction settings into our own params.
    params.max_depth = data->params.max_depth;
    params.min_sample_count = data->params.min_sample_count;
    params.max_categories = data->params.max_categories;
    params.priors = data->params.priors;
    params.regression_accuracy = data->params.regression_accuracy;
    params.use_surrogates = data->params.use_surrogates;

    temp = cvGetFileNodeByName( fs, fnode, "loss_function" );
    if( !temp )
        EXIT;

    // The loss may be stored as a symbolic name or as a raw integer code.
    if( temp && CV_NODE_IS_STRING(temp->tag) )
    {
        const char* loss_function_type_str = cvReadString( temp, "" );
        params.loss_function_type = strcmp( loss_function_type_str, "SquaredLoss" ) == 0 ? SQUARED_LOSS :
                            strcmp( loss_function_type_str, "AbsoluteLoss" ) == 0 ? ABSOLUTE_LOSS :
                            strcmp( loss_function_type_str, "HuberLoss" ) == 0 ? HUBER_LOSS :
                            strcmp( loss_function_type_str, "DevianceLoss" ) == 0 ? DEVIANCE_LOSS : -1;
    }
    else
        params.loss_function_type = cvReadInt( temp, -1 );


    // Valid codes: SQUARED_LOSS=0, ABSOLUTE_LOSS=1, HUBER_LOSS=3,
    // DEVIANCE_LOSS=4 -- the value 2 is unused.
    if( params.loss_function_type < SQUARED_LOSS || params.loss_function_type > DEVIANCE_LOSS || params.loss_function_type == 2)
        CV_ERROR( CV_StsBadArg, "Unknown loss function" );

    params.weak_count = cvReadIntByName( fs, fnode, "ensemble_length" );
    params.shrinkage = (float)cvReadRealByName( fs, fnode, "shrinkage", 0.1 );
    params.subsample_portion = (float)cvReadRealByName( fs, fnode, "subsample_portion", 1.0 );

    if (data->is_classifier)
    {
        class_labels = (CvMat*)cvReadByName( fs, fnode, "class_labels" );
        if( class_labels && !CV_IS_MAT(class_labels))
            CV_ERROR( CV_StsParseError, "class_labels must stored as a matrix");
    }
    // training internally always uses regression trees
    data->is_classifier = 0;

    __END__;
}
1091
1092
1093
1094
1095
// Serialize the whole model under 'name': parameters, base value, class
// count, and every tree of every per-class ensemble as nested
// "trees_<j>" sequences.
void CvGBTrees::write( CvFileStorage* fs, const char* name ) const
{
    CV_FUNCNAME( "CvGBTrees::write" );

    __BEGIN__;

    CvSeqReader reader;
    int i;
    cv::String s;

    cvStartWriteStruct( fs, name, CV_NODE_MAP, CV_TYPE_NAME_ML_GBT );

    if( !weak )
        CV_ERROR( CV_StsBadArg, "The model has not been trained yet" );

    write_params( fs );
    cvWriteReal( fs, "base_value", base_value);
    cvWriteInt( fs, "class_count", class_count);

    for ( int j=0; j < class_count; ++j )
    {
        s = cv::format("trees_%d", j);
        cvStartWriteStruct( fs, s.c_str(), CV_NODE_SEQ );

        cvStartReadSeq( weak[j], &reader );

        // write each tree of class j's ensemble as its own map node
        for( i = 0; i < weak[j]->total; i++ )
        {
            CvDTree* tree;
            CV_READ_SEQ_ELEM( tree, reader );
            cvStartWriteStruct( fs, 0, CV_NODE_MAP );
            tree->write( fs );
            cvEndWriteStruct( fs );
        }

        cvEndWriteStruct( fs );
    }

    cvEndWriteStruct( fs );

    __END__;
}
1137
1138
1139
//===========================================================================
1140
1141
1142
// Deserialize a model stored by write(): reads the parameters, then
// reconstructs each per-class tree sequence from its "trees_<j>" node.
void CvGBTrees::read( CvFileStorage* fs, CvFileNode* node )
{

    CV_FUNCNAME( "CvGBTrees::read" );

    __BEGIN__;

    CvSeqReader reader;
    CvFileNode* trees_fnode;
    CvMemStorage* storage;
    int i, ntrees;
    cv::String s;

    clear();
    read_params( fs, node );

    // read_params() leaves 'data' null on a malformed node
    if( !data )
        EXIT;

    base_value = (float)cvReadRealByName( fs, node, "base_value", 0.0 );
    class_count = cvReadIntByName( fs, node, "class_count", 1 );

    weak = new pCvSeq[class_count];


    for (int j=0; j<class_count; ++j)
    {
        s = cv::format("trees_%d", j);

        trees_fnode = cvGetFileNodeByName( fs, node, s.c_str() );
        if( !trees_fnode || !CV_NODE_IS_SEQ(trees_fnode->tag) )
            CV_ERROR( CV_StsParseError, "<trees_x> tag is missing" );

        cvStartReadSeq( trees_fnode->data.seq, &reader );
        ntrees = trees_fnode->data.seq->total;

        // the stored ensemble size must match the declared weak_count
        if( ntrees != params.weak_count )
            CV_ERROR( CV_StsUnmatchedSizes,
            "The number of trees stored does not match <ntrees> tag value" );

        CV_CALL( storage = cvCreateMemStorage() );
        weak[j] = cvCreateSeq( 0, sizeof(CvSeq), sizeof(CvDTree*), storage );

        for( i = 0; i < ntrees; i++ )
        {
            CvDTree* tree = new CvDTree();
            // trees share the train data created by read_params()
            CV_CALL(tree->read( fs, (CvFileNode*)reader.ptr, data ));
            CV_NEXT_SEQ_ELEM( reader.seq->elem_size, reader );
            cvSeqPush( weak[j], &tree );
        }
    }

    __END__;
}
1196
1197
//===========================================================================
1198
1199
class Sample_predictor : public cv::ParallelLoopBody
1200
{
1201
private:
1202
const CvGBTrees* gbt;
1203
float* predictions;
1204
const CvMat* samples;
1205
const CvMat* missing;
1206
const CvMat* idx;
1207
CvSlice slice;
1208
1209
public:
1210
Sample_predictor() : gbt(0), predictions(0), samples(0), missing(0),
1211
idx(0), slice(CV_WHOLE_SEQ)
1212
{}
1213
1214
Sample_predictor(const CvGBTrees* _gbt, float* _predictions,
1215
const CvMat* _samples, const CvMat* _missing,
1216
const CvMat* _idx, CvSlice _slice=CV_WHOLE_SEQ) :
1217
gbt(_gbt), predictions(_predictions), samples(_samples),
1218
missing(_missing), idx(_idx), slice(_slice)
1219
{}
1220
1221
1222
Sample_predictor( const Sample_predictor& p, cv::Split ) :
1223
gbt(p.gbt), predictions(p.predictions),
1224
samples(p.samples), missing(p.missing), idx(p.idx),
1225
slice(p.slice)
1226
{}
1227
1228
1229
virtual void operator()(const cv::Range& range) const
1230
{
1231
int begin = range.start;
1232
int end = range.end;
1233
1234
CvMat x;
1235
CvMat miss;
1236
1237
for (int i=begin; i<end; ++i)
1238
{
1239
int j = idx ? idx->data.i[i] : i;
1240
cvGetRow(samples, &x, j);
1241
if (!missing)
1242
{
1243
predictions[i] = gbt->predict_serial(&x,0,0,slice);
1244
}
1245
else
1246
{
1247
cvGetRow(missing, &miss, j);
1248
predictions[i] = gbt->predict_serial(&x,&miss,0,slice);
1249
}
1250
}
1251
} // Sample_predictor::operator()
1252
1253
virtual ~Sample_predictor() {}
1254
1255
}; // class Sample_predictor
1256
1257
1258
1259
// type in {CV_TRAIN_ERROR, CV_TEST_ERROR}
1260
float
1261
CvGBTrees::calc_error( CvMLData* _data, int type, std::vector<float> *resp )
1262
{
1263
1264
float err = 0.0f;
1265
const CvMat* _sample_idx = (type == CV_TRAIN_ERROR) ?
1266
_data->get_train_sample_idx() :
1267
_data->get_test_sample_idx();
1268
const CvMat* response = _data->get_responses();
1269
1270
int n = _sample_idx ? get_len(_sample_idx) : 0;
1271
n = (type == CV_TRAIN_ERROR && n == 0) ? _data->get_values()->rows : n;
1272
1273
if (!n)
1274
return -FLT_MAX;
1275
1276
float* pred_resp = 0;
1277
bool needsFreeing = false;
1278
1279
if (resp)
1280
{
1281
resp->resize(n);
1282
pred_resp = &((*resp)[0]);
1283
}
1284
else
1285
{
1286
pred_resp = new float[n];
1287
needsFreeing = true;
1288
}
1289
1290
Sample_predictor predictor = Sample_predictor(this, pred_resp, _data->get_values(),
1291
_data->get_missing(), _sample_idx);
1292
1293
cv::parallel_for_(cv::Range(0,n), predictor);
1294
1295
int* sidx = _sample_idx ? _sample_idx->data.i : 0;
1296
int r_step = CV_IS_MAT_CONT(response->type) ?
1297
1 : response->step / CV_ELEM_SIZE(response->type);
1298
1299
1300
if ( !problem_type() )
1301
{
1302
for( int i = 0; i < n; i++ )
1303
{
1304
int si = sidx ? sidx[i] : i;
1305
int d = fabs((double)pred_resp[i] - response->data.fl[si*r_step]) <= FLT_EPSILON ? 0 : 1;
1306
err += d;
1307
}
1308
err = err / (float)n * 100.0f;
1309
}
1310
else
1311
{
1312
for( int i = 0; i < n; i++ )
1313
{
1314
int si = sidx ? sidx[i] : i;
1315
float d = pred_resp[i] - response->data.fl[si*r_step];
1316
err += d*d;
1317
}
1318
err = err / (float)n;
1319
}
1320
1321
if (needsFreeing)
1322
delete[]pred_resp;
1323
1324
return err;
1325
}
1326
1327
1328
// Convenience constructor: zero-initializes all state and immediately
// trains on the supplied cv::Mat data.
CvGBTrees::CvGBTrees( const cv::Mat& trainData, int tflag,
          const cv::Mat& responses, const cv::Mat& varIdx,
          const cv::Mat& sampleIdx, const cv::Mat& varType,
          const cv::Mat& missingDataMask,
          CvGBTreesParams _params )
{
    // Put every pointer member into a well-defined null state before
    // clear() runs (clear() releases whatever these point to).
    weak = 0;
    data = 0;
    default_model_name = "my_boost_tree";
    orig_response = sum_response = sum_response_tmp = 0;
    subsample_train = subsample_test = 0;
    sample_idx = missing = 0;
    class_labels = 0;
    class_count = 1;
    delta = 0.0f;

    clear();

    // Delegate to the cv::Mat train(); update=false => train from scratch.
    train(trainData, tflag, responses, varIdx, sampleIdx, varType, missingDataMask, _params, false);
}
1348
1349
// cv::Mat front-end for train(): wraps the arguments as legacy CvMat
// headers (no data copy) and forwards to the CvMat implementation.
// Empty optional matrices are forwarded as null pointers.
bool CvGBTrees::train( const cv::Mat& trainData, int tflag,
           const cv::Mat& responses, const cv::Mat& varIdx,
           const cv::Mat& sampleIdx, const cv::Mat& varType,
           const cv::Mat& missingDataMask,
           CvGBTreesParams _params,
           bool update )
{
    CvMat tdata = trainData, resp = responses;
    CvMat vidx = varIdx, sample_sel = sampleIdx, vtype = varType;
    CvMat mmask = missingDataMask;

    return train( &tdata, tflag, &resp,
                  varIdx.empty() ? 0 : &vidx,
                  sampleIdx.empty() ? 0 : &sample_sel,
                  varType.empty() ? 0 : &vtype,
                  missingDataMask.empty() ? 0 : &mmask,
                  _params, update );
}
1364
1365
// cv::Mat front-end for predict(): converts the sample and optional
// missing-value mask to legacy CvMat headers and maps a full cv::Range
// onto CV_WHOLE_SEQ.
float CvGBTrees::predict( const cv::Mat& sample, const cv::Mat& _missing,
                          const cv::Range& slice, int k ) const
{
    CvMat s = sample, m = _missing;
    CvSlice sl = (slice == cv::Range::all()) ? CV_WHOLE_SEQ
                                             : cvSlice(slice.start, slice.end);
    return predict( &s, _missing.empty() ? 0 : &m, 0, sl, k );
}
1372
1373
#endif
1374
1375