// Path: blob/master/apps/traincascade/old_ml_boost.cpp
// (16337 views)
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                        Intel License Agreement
//
// Copyright (C) 2000, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of Intel Corporation may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such
damage.37//38//M*/3940#include "old_ml_precomp.hpp"4142static inline double43log_ratio( double val )44{45const double eps = 1e-5;4647val = MAX( val, eps );48val = MIN( val, 1. - eps );49return log( val/(1. - val) );50}515253CvBoostParams::CvBoostParams()54{55boost_type = CvBoost::REAL;56weak_count = 100;57weight_trim_rate = 0.95;58cv_folds = 0;59max_depth = 1;60}616263CvBoostParams::CvBoostParams( int _boost_type, int _weak_count,64double _weight_trim_rate, int _max_depth,65bool _use_surrogates, const float* _priors )66{67boost_type = _boost_type;68weak_count = _weak_count;69weight_trim_rate = _weight_trim_rate;70split_criteria = CvBoost::DEFAULT;71cv_folds = 0;72max_depth = _max_depth;73use_surrogates = _use_surrogates;74priors = _priors;75}76777879///////////////////////////////// CvBoostTree ///////////////////////////////////8081CvBoostTree::CvBoostTree()82{83ensemble = 0;84}858687CvBoostTree::~CvBoostTree()88{89clear();90}919293void94CvBoostTree::clear()95{96CvDTree::clear();97ensemble = 0;98}99100101bool102CvBoostTree::train( CvDTreeTrainData* _train_data,103const CvMat* _subsample_idx, CvBoost* _ensemble )104{105clear();106ensemble = _ensemble;107data = _train_data;108data->shared = true;109return do_train( _subsample_idx );110}111112113bool114CvBoostTree::train( const CvMat*, int, const CvMat*, const CvMat*,115const CvMat*, const CvMat*, const CvMat*, CvDTreeParams )116{117assert(0);118return false;119}120121122bool123CvBoostTree::train( CvDTreeTrainData*, const CvMat* )124{125assert(0);126return false;127}128129130void131CvBoostTree::scale( double _scale )132{133CvDTreeNode* node = root;134135// traverse the tree and scale all the node values136for(;;)137{138CvDTreeNode* parent;139for(;;)140{141node->value *= _scale;142if( !node->left )143break;144node = node->left;145}146147for( parent = node->parent; parent && parent->right == node;148node = parent, parent = parent->parent )149;150151if( !parent )152break;153154node = 
parent->right;155}156}157158159void160CvBoostTree::try_split_node( CvDTreeNode* node )161{162CvDTree::try_split_node( node );163164if( !node->left )165{166// if the node has not been split,167// store the responses for the corresponding training samples168double* weak_eval = ensemble->get_weak_response()->data.db;169cv::AutoBuffer<int> inn_buf(node->sample_count);170const int* labels = data->get_cv_labels(node, inn_buf.data());171int i, count = node->sample_count;172double value = node->value;173174for( i = 0; i < count; i++ )175weak_eval[labels[i]] = value;176}177}178179180double181CvBoostTree::calc_node_dir( CvDTreeNode* node )182{183char* dir = (char*)data->direction->data.ptr;184const double* weights = ensemble->get_subtree_weights()->data.db;185int i, n = node->sample_count, vi = node->split->var_idx;186double L, R;187188assert( !node->split->inversed );189190if( data->get_var_type(vi) >= 0 ) // split on categorical var191{192cv::AutoBuffer<int> inn_buf(n);193const int* cat_labels = data->get_cat_var_data(node, vi, inn_buf.data());194const int* subset = node->split->subset;195double sum = 0, sum_abs = 0;196197for( i = 0; i < n; i++ )198{199int idx = ((cat_labels[i] == 65535) && data->is_buf_16u) ? -1 : cat_labels[i];200double w = weights[i];201int d = idx >= 0 ? 
CV_DTREE_CAT_DIR(idx,subset) : 0;202sum += d*w; sum_abs += (d & 1)*w;203dir[i] = (char)d;204}205206R = (sum_abs + sum) * 0.5;207L = (sum_abs - sum) * 0.5;208}209else // split on ordered var210{211cv::AutoBuffer<uchar> inn_buf(2*n*sizeof(int)+n*sizeof(float));212float* values_buf = (float*)inn_buf.data();213int* sorted_indices_buf = (int*)(values_buf + n);214int* sample_indices_buf = sorted_indices_buf + n;215const float* values = 0;216const int* sorted_indices = 0;217data->get_ord_var_data( node, vi, values_buf, sorted_indices_buf, &values, &sorted_indices, sample_indices_buf );218int split_point = node->split->ord.split_point;219int n1 = node->get_num_valid(vi);220221assert( 0 <= split_point && split_point < n1-1 );222L = R = 0;223224for( i = 0; i <= split_point; i++ )225{226int idx = sorted_indices[i];227double w = weights[idx];228dir[idx] = (char)-1;229L += w;230}231232for( ; i < n1; i++ )233{234int idx = sorted_indices[i];235double w = weights[idx];236dir[idx] = (char)1;237R += w;238}239240for( ; i < n; i++ )241dir[sorted_indices[i]] = (char)0;242}243244node->maxlr = MAX( L, R );245return node->split->quality/(L + R);246}247248249CvDTreeSplit*250CvBoostTree::find_split_ord_class( CvDTreeNode* node, int vi, float init_quality,251CvDTreeSplit* _split, uchar* _ext_buf )252{253const float epsilon = FLT_EPSILON*2;254255const double* weights = ensemble->get_subtree_weights()->data.db;256int n = node->sample_count;257int n1 = node->get_num_valid(vi);258259cv::AutoBuffer<uchar> inn_buf;260if( !_ext_buf )261inn_buf.allocate(n*(3*sizeof(int)+sizeof(float)));262uchar* ext_buf = _ext_buf ? 
_ext_buf : inn_buf.data();263float* values_buf = (float*)ext_buf;264int* sorted_indices_buf = (int*)(values_buf + n);265int* sample_indices_buf = sorted_indices_buf + n;266const float* values = 0;267const int* sorted_indices = 0;268data->get_ord_var_data( node, vi, values_buf, sorted_indices_buf, &values, &sorted_indices, sample_indices_buf );269int* responses_buf = sorted_indices_buf + n;270const int* responses = data->get_class_labels( node, responses_buf );271const double* rcw0 = weights + n;272double lcw[2] = {0,0}, rcw[2];273int i, best_i = -1;274double best_val = init_quality;275int boost_type = ensemble->get_params().boost_type;276int split_criteria = ensemble->get_params().split_criteria;277278rcw[0] = rcw0[0]; rcw[1] = rcw0[1];279for( i = n1; i < n; i++ )280{281int idx = sorted_indices[i];282double w = weights[idx];283rcw[responses[idx]] -= w;284}285286if( split_criteria != CvBoost::GINI && split_criteria != CvBoost::MISCLASS )287split_criteria = boost_type == CvBoost::DISCRETE ? 
CvBoost::MISCLASS : CvBoost::GINI;288289if( split_criteria == CvBoost::GINI )290{291double L = 0, R = rcw[0] + rcw[1];292double lsum2 = 0, rsum2 = rcw[0]*rcw[0] + rcw[1]*rcw[1];293294for( i = 0; i < n1 - 1; i++ )295{296int idx = sorted_indices[i];297double w = weights[idx], w2 = w*w;298double lv, rv;299idx = responses[idx];300L += w; R -= w;301lv = lcw[idx]; rv = rcw[idx];302lsum2 += 2*lv*w + w2;303rsum2 -= 2*rv*w - w2;304lcw[idx] = lv + w; rcw[idx] = rv - w;305306if( values[i] + epsilon < values[i+1] )307{308double val = (lsum2*R + rsum2*L)/(L*R);309if( best_val < val )310{311best_val = val;312best_i = i;313}314}315}316}317else318{319for( i = 0; i < n1 - 1; i++ )320{321int idx = sorted_indices[i];322double w = weights[idx];323idx = responses[idx];324lcw[idx] += w;325rcw[idx] -= w;326327if( values[i] + epsilon < values[i+1] )328{329double val = lcw[0] + rcw[1], val2 = lcw[1] + rcw[0];330val = MAX(val, val2);331if( best_val < val )332{333best_val = val;334best_i = i;335}336}337}338}339340CvDTreeSplit* split = 0;341if( best_i >= 0 )342{343split = _split ? _split : data->new_split_ord( 0, 0.0f, 0, 0, 0.0f );344split->var_idx = vi;345split->ord.c = (values[best_i] + values[best_i+1])*0.5f;346split->ord.split_point = best_i;347split->inversed = 0;348split->quality = (float)best_val;349}350return split;351}352353template<typename T>354class LessThanPtr355{356public:357bool operator()(T* a, T* b) const { return *a < *b; }358};359360CvDTreeSplit*361CvBoostTree::find_split_cat_class( CvDTreeNode* node, int vi, float init_quality, CvDTreeSplit* _split, uchar* _ext_buf )362{363int ci = data->get_var_type(vi);364int n = node->sample_count;365int mi = data->cat_count->data.i[ci];366367int base_size = (2*mi+3)*sizeof(double) + mi*sizeof(double*);368cv::AutoBuffer<uchar> inn_buf((2*mi+3)*sizeof(double) + mi*sizeof(double*));369if( !_ext_buf)370inn_buf.allocate( base_size + 2*n*sizeof(int) );371uchar* base_buf = inn_buf.data();372uchar* ext_buf = _ext_buf ? 
_ext_buf : base_buf + base_size;373374int* cat_labels_buf = (int*)ext_buf;375const int* cat_labels = data->get_cat_var_data(node, vi, cat_labels_buf);376int* responses_buf = cat_labels_buf + n;377const int* responses = data->get_class_labels(node, responses_buf);378double lcw[2]={0,0}, rcw[2]={0,0};379380double* cjk = (double*)cv::alignPtr(base_buf,sizeof(double))+2;381const double* weights = ensemble->get_subtree_weights()->data.db;382double** dbl_ptr = (double**)(cjk + 2*mi);383int i, j, k, idx;384double L = 0, R;385double best_val = init_quality;386int best_subset = -1, subset_i;387int boost_type = ensemble->get_params().boost_type;388int split_criteria = ensemble->get_params().split_criteria;389390// init array of counters:391// c_{jk} - number of samples that have vi-th input variable = j and response = k.392for( j = -1; j < mi; j++ )393cjk[j*2] = cjk[j*2+1] = 0;394395for( i = 0; i < n; i++ )396{397double w = weights[i];398j = ((cat_labels[i] == 65535) && data->is_buf_16u) ? -1 : cat_labels[i];399k = responses[i];400cjk[j*2 + k] += w;401}402403for( j = 0; j < mi; j++ )404{405rcw[0] += cjk[j*2];406rcw[1] += cjk[j*2+1];407dbl_ptr[j] = cjk + j*2 + 1;408}409410R = rcw[0] + rcw[1];411412if( split_criteria != CvBoost::GINI && split_criteria != CvBoost::MISCLASS )413split_criteria = boost_type == CvBoost::DISCRETE ? CvBoost::MISCLASS : CvBoost::GINI;414415// sort rows of c_jk by increasing c_j,1416// (i.e. 
by the weight of samples in j-th category that belong to class 1)417std::sort(dbl_ptr, dbl_ptr + mi, LessThanPtr<double>());418419for( subset_i = 0; subset_i < mi-1; subset_i++ )420{421idx = (int)(dbl_ptr[subset_i] - cjk)/2;422const double* crow = cjk + idx*2;423double w0 = crow[0], w1 = crow[1];424double weight = w0 + w1;425426if( weight < FLT_EPSILON )427continue;428429lcw[0] += w0; rcw[0] -= w0;430lcw[1] += w1; rcw[1] -= w1;431432if( split_criteria == CvBoost::GINI )433{434double lsum2 = lcw[0]*lcw[0] + lcw[1]*lcw[1];435double rsum2 = rcw[0]*rcw[0] + rcw[1]*rcw[1];436437L += weight;438R -= weight;439440if( L > FLT_EPSILON && R > FLT_EPSILON )441{442double val = (lsum2*R + rsum2*L)/(L*R);443if( best_val < val )444{445best_val = val;446best_subset = subset_i;447}448}449}450else451{452double val = lcw[0] + rcw[1];453double val2 = lcw[1] + rcw[0];454455val = MAX(val, val2);456if( best_val < val )457{458best_val = val;459best_subset = subset_i;460}461}462}463464CvDTreeSplit* split = 0;465if( best_subset >= 0 )466{467split = _split ? _split : data->new_split_cat( 0, -1.0f);468split->var_idx = vi;469split->quality = (float)best_val;470memset( split->subset, 0, (data->max_c_count + 31)/32 * sizeof(int));471for( i = 0; i <= best_subset; i++ )472{473idx = (int)(dbl_ptr[i] - cjk) >> 1;474split->subset[idx >> 5] |= 1 << (idx & 31);475}476}477return split;478}479480481CvDTreeSplit*482CvBoostTree::find_split_ord_reg( CvDTreeNode* node, int vi, float init_quality, CvDTreeSplit* _split, uchar* _ext_buf )483{484const float epsilon = FLT_EPSILON*2;485const double* weights = ensemble->get_subtree_weights()->data.db;486int n = node->sample_count;487int n1 = node->get_num_valid(vi);488489cv::AutoBuffer<uchar> inn_buf;490if( !_ext_buf )491inn_buf.allocate(2*n*(sizeof(int)+sizeof(float)));492uchar* ext_buf = _ext_buf ? 
_ext_buf : inn_buf.data();493494float* values_buf = (float*)ext_buf;495int* indices_buf = (int*)(values_buf + n);496int* sample_indices_buf = indices_buf + n;497const float* values = 0;498const int* indices = 0;499data->get_ord_var_data( node, vi, values_buf, indices_buf, &values, &indices, sample_indices_buf );500float* responses_buf = (float*)(indices_buf + n);501const float* responses = data->get_ord_responses( node, responses_buf, sample_indices_buf );502503int i, best_i = -1;504double L = 0, R = weights[n];505double best_val = init_quality, lsum = 0, rsum = node->value*R;506507// compensate for missing values508for( i = n1; i < n; i++ )509{510int idx = indices[i];511double w = weights[idx];512rsum -= responses[idx]*w;513R -= w;514}515516// find the optimal split517for( i = 0; i < n1 - 1; i++ )518{519int idx = indices[i];520double w = weights[idx];521double t = responses[idx]*w;522L += w; R -= w;523lsum += t; rsum -= t;524525if( values[i] + epsilon < values[i+1] )526{527double val = (lsum*lsum*R + rsum*rsum*L)/(L*R);528if( best_val < val )529{530best_val = val;531best_i = i;532}533}534}535536CvDTreeSplit* split = 0;537if( best_i >= 0 )538{539split = _split ? 
_split : data->new_split_ord( 0, 0.0f, 0, 0, 0.0f );540split->var_idx = vi;541split->ord.c = (values[best_i] + values[best_i+1])*0.5f;542split->ord.split_point = best_i;543split->inversed = 0;544split->quality = (float)best_val;545}546return split;547}548549550CvDTreeSplit*551CvBoostTree::find_split_cat_reg( CvDTreeNode* node, int vi, float init_quality, CvDTreeSplit* _split, uchar* _ext_buf )552{553const double* weights = ensemble->get_subtree_weights()->data.db;554int ci = data->get_var_type(vi);555int n = node->sample_count;556int mi = data->cat_count->data.i[ci];557int base_size = (2*mi+3)*sizeof(double) + mi*sizeof(double*);558cv::AutoBuffer<uchar> inn_buf(base_size);559if( !_ext_buf )560inn_buf.allocate(base_size + n*(2*sizeof(int) + sizeof(float)));561uchar* base_buf = inn_buf.data();562uchar* ext_buf = _ext_buf ? _ext_buf : base_buf + base_size;563564int* cat_labels_buf = (int*)ext_buf;565const int* cat_labels = data->get_cat_var_data(node, vi, cat_labels_buf);566float* responses_buf = (float*)(cat_labels_buf + n);567int* sample_indices_buf = (int*)(responses_buf + n);568const float* responses = data->get_ord_responses(node, responses_buf, sample_indices_buf);569570double* sum = (double*)cv::alignPtr(base_buf,sizeof(double)) + 1;571double* counts = sum + mi + 1;572double** sum_ptr = (double**)(counts + mi);573double L = 0, R = 0, best_val = init_quality, lsum = 0, rsum = 0;574int i, best_subset = -1, subset_i;575576for( i = -1; i < mi; i++ )577sum[i] = counts[i] = 0;578579// calculate sum response and weight of each category of the input var580for( i = 0; i < n; i++ )581{582int idx = ((cat_labels[i] == 65535) && data->is_buf_16u) ? -1 : cat_labels[i];583double w = weights[i];584double s = sum[idx] + responses[i]*w;585double nc = counts[idx] + w;586sum[idx] = s;587counts[idx] = nc;588}589590// calculate average response in each category591for( i = 0; i < mi; i++ )592{593R += counts[i];594rsum += sum[i];595sum[i] = fabs(counts[i]) > DBL_EPSILON ? 
sum[i]/counts[i] : 0;596sum_ptr[i] = sum + i;597}598599std::sort(sum_ptr, sum_ptr + mi, LessThanPtr<double>());600601// revert back to unnormalized sums602// (there should be a very little loss in accuracy)603for( i = 0; i < mi; i++ )604sum[i] *= counts[i];605606for( subset_i = 0; subset_i < mi-1; subset_i++ )607{608int idx = (int)(sum_ptr[subset_i] - sum);609double ni = counts[idx];610611if( ni > FLT_EPSILON )612{613double s = sum[idx];614lsum += s; L += ni;615rsum -= s; R -= ni;616617if( L > FLT_EPSILON && R > FLT_EPSILON )618{619double val = (lsum*lsum*R + rsum*rsum*L)/(L*R);620if( best_val < val )621{622best_val = val;623best_subset = subset_i;624}625}626}627}628629CvDTreeSplit* split = 0;630if( best_subset >= 0 )631{632split = _split ? _split : data->new_split_cat( 0, -1.0f);633split->var_idx = vi;634split->quality = (float)best_val;635memset( split->subset, 0, (data->max_c_count + 31)/32 * sizeof(int));636for( i = 0; i <= best_subset; i++ )637{638int idx = (int)(sum_ptr[i] - sum);639split->subset[idx >> 5] |= 1 << (idx & 31);640}641}642return split;643}644645646CvDTreeSplit*647CvBoostTree::find_surrogate_split_ord( CvDTreeNode* node, int vi, uchar* _ext_buf )648{649const float epsilon = FLT_EPSILON*2;650int n = node->sample_count;651cv::AutoBuffer<uchar> inn_buf;652if( !_ext_buf )653inn_buf.allocate(n*(2*sizeof(int)+sizeof(float)));654uchar* ext_buf = _ext_buf ? _ext_buf : inn_buf.data();655float* values_buf = (float*)ext_buf;656int* indices_buf = (int*)(values_buf + n);657int* sample_indices_buf = indices_buf + n;658const float* values = 0;659const int* indices = 0;660data->get_ord_var_data( node, vi, values_buf, indices_buf, &values, &indices, sample_indices_buf );661662const double* weights = ensemble->get_subtree_weights()->data.db;663const char* dir = (char*)data->direction->data.ptr;664int n1 = node->get_num_valid(vi);665// LL - number of samples that both the primary and the surrogate splits send to the left666// LR - ... 
primary split sends to the left and the surrogate split sends to the right667// RL - ... primary split sends to the right and the surrogate split sends to the left668// RR - ... both send to the right669int i, best_i = -1, best_inversed = 0;670double best_val;671double LL = 0, RL = 0, LR, RR;672double worst_val = node->maxlr;673double sum = 0, sum_abs = 0;674best_val = worst_val;675676for( i = 0; i < n1; i++ )677{678int idx = indices[i];679double w = weights[idx];680int d = dir[idx];681sum += d*w; sum_abs += (d & 1)*w;682}683684// sum_abs = R + L; sum = R - L685RR = (sum_abs + sum)*0.5;686LR = (sum_abs - sum)*0.5;687688// initially all the samples are sent to the right by the surrogate split,689// LR of them are sent to the left by primary split, and RR - to the right.690// now iteratively compute LL, LR, RL and RR for every possible surrogate split value.691for( i = 0; i < n1 - 1; i++ )692{693int idx = indices[i];694double w = weights[idx];695int d = dir[idx];696697if( d < 0 )698{699LL += w; LR -= w;700if( LL + RR > best_val && values[i] + epsilon < values[i+1] )701{702best_val = LL + RR;703best_i = i; best_inversed = 0;704}705}706else if( d > 0 )707{708RL += w; RR -= w;709if( RL + LR > best_val && values[i] + epsilon < values[i+1] )710{711best_val = RL + LR;712best_i = i; best_inversed = 1;713}714}715}716717return best_i >= 0 && best_val > node->maxlr ? 
data->new_split_ord( vi,718(values[best_i] + values[best_i+1])*0.5f, best_i,719best_inversed, (float)best_val ) : 0;720}721722723CvDTreeSplit*724CvBoostTree::find_surrogate_split_cat( CvDTreeNode* node, int vi, uchar* _ext_buf )725{726const char* dir = (char*)data->direction->data.ptr;727const double* weights = ensemble->get_subtree_weights()->data.db;728int n = node->sample_count;729int i, mi = data->cat_count->data.i[data->get_var_type(vi)];730731int base_size = (2*mi+3)*sizeof(double);732cv::AutoBuffer<uchar> inn_buf(base_size);733if( !_ext_buf )734inn_buf.allocate(base_size + n*sizeof(int));735uchar* ext_buf = _ext_buf ? _ext_buf : inn_buf.data();736int* cat_labels_buf = (int*)ext_buf;737const int* cat_labels = data->get_cat_var_data(node, vi, cat_labels_buf);738739// LL - number of samples that both the primary and the surrogate splits send to the left740// LR - ... primary split sends to the left and the surrogate split sends to the right741// RL - ... primary split sends to the right and the surrogate split sends to the left742// RR - ... both send to the right743CvDTreeSplit* split = data->new_split_cat( vi, 0 );744double best_val = 0;745double* lc = (double*)cv::alignPtr(cat_labels_buf + n, sizeof(double)) + 1;746double* rc = lc + mi + 1;747748for( i = -1; i < mi; i++ )749lc[i] = rc[i] = 0;750751// 1. for each category calculate the weight of samples752// sent to the left (lc) and to the right (rc) by the primary split753for( i = 0; i < n; i++ )754{755int idx = ((cat_labels[i] == 65535) && data->is_buf_16u) ? -1 : cat_labels[i];756double w = weights[i];757int d = dir[i];758double sum = lc[idx] + d*w;759double sum_abs = rc[idx] + (d & 1)*w;760lc[idx] = sum; rc[idx] = sum_abs;761}762763for( i = 0; i < mi; i++ )764{765double sum = lc[i];766double sum_abs = rc[i];767lc[i] = (sum_abs - sum) * 0.5;768rc[i] = (sum_abs + sum) * 0.5;769}770771// 2. 
now form the split.772// in each category send all the samples to the same direction as majority773for( i = 0; i < mi; i++ )774{775double lval = lc[i], rval = rc[i];776if( lval > rval )777{778split->subset[i >> 5] |= 1 << (i & 31);779best_val += lval;780}781else782best_val += rval;783}784785split->quality = (float)best_val;786if( split->quality <= node->maxlr )787cvSetRemoveByPtr( data->split_heap, split ), split = 0;788789return split;790}791792793void794CvBoostTree::calc_node_value( CvDTreeNode* node )795{796int i, n = node->sample_count;797const double* weights = ensemble->get_weights()->data.db;798cv::AutoBuffer<uchar> inn_buf(n*(sizeof(int) + ( data->is_classifier ? sizeof(int) : sizeof(int) + sizeof(float))));799int* labels_buf = (int*)inn_buf.data();800const int* labels = data->get_cv_labels(node, labels_buf);801double* subtree_weights = ensemble->get_subtree_weights()->data.db;802double rcw[2] = {0,0};803int boost_type = ensemble->get_params().boost_type;804805if( data->is_classifier )806{807int* _responses_buf = labels_buf + n;808const int* _responses = data->get_class_labels(node, _responses_buf);809int m = data->get_num_classes();810int* cls_count = data->counts->data.i;811for( int k = 0; k < m; k++ )812cls_count[k] = 0;813814for( i = 0; i < n; i++ )815{816int idx = labels[i];817double w = weights[idx];818int r = _responses[i];819rcw[r] += w;820cls_count[r]++;821subtree_weights[i] = w;822}823824node->class_idx = rcw[1] > rcw[0];825826if( boost_type == CvBoost::DISCRETE )827{828// ignore cat_map for responses, and use {-1,1},829// as the whole ensemble response is computes as sign(sum_i(weak_response_i)830node->value = node->class_idx*2 - 1;831}832else833{834double p = rcw[1]/(rcw[0] + rcw[1]);835assert( boost_type == CvBoost::REAL );836837// store log-ratio of the probability838node->value = 0.5*log_ratio(p);839}840}841else842{843// in case of regression tree:844// * node value is 1/n*sum_i(Y_i), where Y_i is i-th response,845// n is the number of 
samples in the node.846// * node risk is the sum of squared errors: sum_i((Y_i - <node_value>)^2)847double sum = 0, sum2 = 0, iw;848float* values_buf = (float*)(labels_buf + n);849int* sample_indices_buf = (int*)(values_buf + n);850const float* values = data->get_ord_responses(node, values_buf, sample_indices_buf);851852for( i = 0; i < n; i++ )853{854int idx = labels[i];855double w = weights[idx]/*priors[values[i] > 0]*/;856double t = values[i];857rcw[0] += w;858subtree_weights[i] = w;859sum += t*w;860sum2 += t*t*w;861}862863iw = 1./rcw[0];864node->value = sum*iw;865node->node_risk = sum2 - (sum*iw)*sum;866867// renormalize the risk, as in try_split_node the unweighted formula868// sqrt(risk)/n is used, rather than sqrt(risk)/sum(weights_i)869node->node_risk *= n*iw*n*iw;870}871872// store summary weights873subtree_weights[n] = rcw[0];874subtree_weights[n+1] = rcw[1];875}876877878void CvBoostTree::read( CvFileStorage* fs, CvFileNode* fnode, CvBoost* _ensemble, CvDTreeTrainData* _data )879{880CvDTree::read( fs, fnode, _data );881ensemble = _ensemble;882}883884void CvBoostTree::read( CvFileStorage*, CvFileNode* )885{886assert(0);887}888889void CvBoostTree::read( CvFileStorage* _fs, CvFileNode* _node,890CvDTreeTrainData* _data )891{892CvDTree::read( _fs, _node, _data );893}894895896/////////////////////////////////// CvBoost /////////////////////////////////////897898CvBoost::CvBoost()899{900data = 0;901weak = 0;902default_model_name = "my_boost_tree";903904active_vars = active_vars_abs = orig_response = sum_response = weak_eval =905subsample_mask = weights = subtree_weights = 0;906have_active_cat_vars = have_subsample = false;907908clear();909}910911912void CvBoost::prune( CvSlice slice )913{914if( weak && weak->total > 0 )915{916CvSeqReader reader;917int i, count = cvSliceLength( slice, weak );918919cvStartReadSeq( weak, &reader );920cvSetSeqReaderPos( &reader, slice.start_index );921922for( i = 0; i < count; i++ )923{924CvBoostTree* w;925CV_READ_SEQ_ELEM( w, reader 
);926delete w;927}928929cvSeqRemoveSlice( weak, slice );930}931}932933934void CvBoost::clear()935{936if( weak )937{938prune( CV_WHOLE_SEQ );939cvReleaseMemStorage( &weak->storage );940}941if( data )942delete data;943weak = 0;944data = 0;945cvReleaseMat( &active_vars );946cvReleaseMat( &active_vars_abs );947cvReleaseMat( &orig_response );948cvReleaseMat( &sum_response );949cvReleaseMat( &weak_eval );950cvReleaseMat( &subsample_mask );951cvReleaseMat( &weights );952cvReleaseMat( &subtree_weights );953954have_subsample = false;955}956957958CvBoost::~CvBoost()959{960clear();961}962963964CvBoost::CvBoost( const CvMat* _train_data, int _tflag,965const CvMat* _responses, const CvMat* _var_idx,966const CvMat* _sample_idx, const CvMat* _var_type,967const CvMat* _missing_mask, CvBoostParams _params )968{969weak = 0;970data = 0;971default_model_name = "my_boost_tree";972973active_vars = active_vars_abs = orig_response = sum_response = weak_eval =974subsample_mask = weights = subtree_weights = 0;975976train( _train_data, _tflag, _responses, _var_idx, _sample_idx,977_var_type, _missing_mask, _params );978}979980981bool982CvBoost::set_params( const CvBoostParams& _params )983{984bool ok = false;985986CV_FUNCNAME( "CvBoost::set_params" );987988__BEGIN__;989990params = _params;991if( params.boost_type != DISCRETE && params.boost_type != REAL &&992params.boost_type != LOGIT && params.boost_type != GENTLE )993CV_ERROR( CV_StsBadArg, "Unknown/unsupported boosting type" );994995params.weak_count = MAX( params.weak_count, 1 );996params.weight_trim_rate = MAX( params.weight_trim_rate, 0. );997params.weight_trim_rate = MIN( params.weight_trim_rate, 1. 
);998if( params.weight_trim_rate < FLT_EPSILON )999params.weight_trim_rate = 1.f;10001001if( params.boost_type == DISCRETE &&1002params.split_criteria != GINI && params.split_criteria != MISCLASS )1003params.split_criteria = MISCLASS;1004if( params.boost_type == REAL &&1005params.split_criteria != GINI && params.split_criteria != MISCLASS )1006params.split_criteria = GINI;1007if( (params.boost_type == LOGIT || params.boost_type == GENTLE) &&1008params.split_criteria != SQERR )1009params.split_criteria = SQERR;10101011ok = true;10121013__END__;10141015return ok;1016}101710181019bool1020CvBoost::train( const CvMat* _train_data, int _tflag,1021const CvMat* _responses, const CvMat* _var_idx,1022const CvMat* _sample_idx, const CvMat* _var_type,1023const CvMat* _missing_mask,1024CvBoostParams _params, bool _update )1025{1026bool ok = false;1027CvMemStorage* storage = 0;10281029CV_FUNCNAME( "CvBoost::train" );10301031__BEGIN__;10321033int i;10341035set_params( _params );10361037cvReleaseMat( &active_vars );1038cvReleaseMat( &active_vars_abs );10391040if( !_update || !data )1041{1042clear();1043data = new CvDTreeTrainData( _train_data, _tflag, _responses, _var_idx,1044_sample_idx, _var_type, _missing_mask, _params, true, true );10451046if( data->get_num_classes() != 2 )1047CV_ERROR( CV_StsNotImplemented,1048"Boosted trees can only be used for 2-class classification." 
);1049CV_CALL( storage = cvCreateMemStorage() );1050weak = cvCreateSeq( 0, sizeof(CvSeq), sizeof(CvBoostTree*), storage );1051storage = 0;1052}1053else1054{1055data->set_data( _train_data, _tflag, _responses, _var_idx,1056_sample_idx, _var_type, _missing_mask, _params, true, true, true );1057}10581059if ( (_params.boost_type == LOGIT) || (_params.boost_type == GENTLE) )1060data->do_responses_copy();10611062update_weights( 0 );10631064for( i = 0; i < params.weak_count; i++ )1065{1066CvBoostTree* tree = new CvBoostTree;1067if( !tree->train( data, subsample_mask, this ) )1068{1069delete tree;1070break;1071}1072//cvCheckArr( get_weak_response());1073cvSeqPush( weak, &tree );1074update_weights( tree );1075trim_weights();1076if( cvCountNonZero(subsample_mask) == 0 )1077break;1078}10791080if(weak->total > 0)1081{1082get_active_vars(); // recompute active_vars* maps and condensed_idx's in the splits.1083data->is_classifier = true;1084data->free_train_data();1085ok = true;1086}1087else1088clear();10891090__END__;10911092return ok;1093}10941095bool CvBoost::train( CvMLData* _data,1096CvBoostParams _params,1097bool update )1098{1099bool result = false;11001101CV_FUNCNAME( "CvBoost::train" );11021103__BEGIN__;11041105const CvMat* values = _data->get_values();1106const CvMat* response = _data->get_responses();1107const CvMat* missing = _data->get_missing();1108const CvMat* var_types = _data->get_var_types();1109const CvMat* train_sidx = _data->get_train_sample_idx();1110const CvMat* var_idx = _data->get_var_idx();11111112CV_CALL( result = train( values, CV_ROW_SAMPLE, response, var_idx,1113train_sidx, var_types, missing, _params, update ) );11141115__END__;11161117return result;1118}11191120void CvBoost::initialize_weights(double (&p)[2])1121{1122p[0] = 1.;1123p[1] = 1.;1124}11251126void1127CvBoost::update_weights( CvBoostTree* tree )1128{1129CV_FUNCNAME( "CvBoost::update_weights" );11301131__BEGIN__;11321133int i, n = data->sample_count;1134double sumw = 0.;1135int step = 
0;1136float* fdata = 0;1137int *sample_idx_buf;1138const int* sample_idx = 0;1139cv::AutoBuffer<uchar> inn_buf;1140size_t _buf_size = (params.boost_type == LOGIT) || (params.boost_type == GENTLE) ? (size_t)(data->sample_count)*sizeof(int) : 0;1141if( !tree )1142_buf_size += n*sizeof(int);1143else1144{1145if( have_subsample )1146_buf_size += data->get_length_subbuf()*(sizeof(float)+sizeof(uchar));1147}1148inn_buf.allocate(_buf_size);1149uchar* cur_buf_pos = inn_buf.data();11501151if ( (params.boost_type == LOGIT) || (params.boost_type == GENTLE) )1152{1153step = CV_IS_MAT_CONT(data->responses_copy->type) ?11541 : data->responses_copy->step / CV_ELEM_SIZE(data->responses_copy->type);1155fdata = data->responses_copy->data.fl;1156sample_idx_buf = (int*)cur_buf_pos;1157cur_buf_pos = (uchar*)(sample_idx_buf + data->sample_count);1158sample_idx = data->get_sample_indices( data->data_root, sample_idx_buf );1159}1160CvMat* dtree_data_buf = data->buf;1161size_t length_buf_row = data->get_length_subbuf();1162if( !tree ) // before training the first tree, initialize weights and other parameters1163{1164int* class_labels_buf = (int*)cur_buf_pos;1165cur_buf_pos = (uchar*)(class_labels_buf + n);1166const int* class_labels = data->get_class_labels(data->data_root, class_labels_buf);1167// in case of logitboost and gentle adaboost each weak tree is a regression tree,1168// so we need to convert class labels to floating-point values11691170double w0 = 1./ n;1171double p[2] = { 1., 1. 
};1172initialize_weights(p);11731174cvReleaseMat( &orig_response );1175cvReleaseMat( &sum_response );1176cvReleaseMat( &weak_eval );1177cvReleaseMat( &subsample_mask );1178cvReleaseMat( &weights );1179cvReleaseMat( &subtree_weights );11801181CV_CALL( orig_response = cvCreateMat( 1, n, CV_32S ));1182CV_CALL( weak_eval = cvCreateMat( 1, n, CV_64F ));1183CV_CALL( subsample_mask = cvCreateMat( 1, n, CV_8U ));1184CV_CALL( weights = cvCreateMat( 1, n, CV_64F ));1185CV_CALL( subtree_weights = cvCreateMat( 1, n + 2, CV_64F ));11861187if( data->have_priors )1188{1189// compute weight scale for each class from their prior probabilities1190int c1 = 0;1191for( i = 0; i < n; i++ )1192c1 += class_labels[i];1193p[0] = data->priors->data.db[0]*(c1 < n ? 1./(n - c1) : 0.);1194p[1] = data->priors->data.db[1]*(c1 > 0 ? 1./c1 : 0.);1195p[0] /= p[0] + p[1];1196p[1] = 1. - p[0];1197}11981199if (data->is_buf_16u)1200{1201unsigned short* labels = (unsigned short*)(dtree_data_buf->data.s + data->data_root->buf_idx*length_buf_row +1202data->data_root->offset + (size_t)(data->work_var_count-1)*data->sample_count);1203for( i = 0; i < n; i++ )1204{1205// save original categorical responses {0,1}, convert them to {-1,1}1206orig_response->data.i[i] = class_labels[i]*2 - 1;1207// make all the samples active at start.1208// later, in trim_weights() deactivate/reactive again some, if need1209subsample_mask->data.ptr[i] = (uchar)1;1210// make all the initial weights the same.1211weights->data.db[i] = w0*p[class_labels[i]];1212// set the labels to find (from within weak tree learning proc)1213// the particular sample weight, and where to store the response.1214labels[i] = (unsigned short)i;1215}1216}1217else1218{1219int* labels = dtree_data_buf->data.i + data->data_root->buf_idx*length_buf_row +1220data->data_root->offset + (size_t)(data->work_var_count-1)*data->sample_count;12211222for( i = 0; i < n; i++ )1223{1224// save original categorical responses {0,1}, convert them to 
{-1,1}1225orig_response->data.i[i] = class_labels[i]*2 - 1;1226// make all the samples active at start.1227// later, in trim_weights() deactivate/reactive again some, if need1228subsample_mask->data.ptr[i] = (uchar)1;1229// make all the initial weights the same.1230weights->data.db[i] = w0*p[class_labels[i]];1231// set the labels to find (from within weak tree learning proc)1232// the particular sample weight, and where to store the response.1233labels[i] = i;1234}1235}12361237if( params.boost_type == LOGIT )1238{1239CV_CALL( sum_response = cvCreateMat( 1, n, CV_64F ));12401241for( i = 0; i < n; i++ )1242{1243sum_response->data.db[i] = 0;1244fdata[sample_idx[i]*step] = orig_response->data.i[i] > 0 ? 2.f : -2.f;1245}12461247// in case of logitboost each weak tree is a regression tree.1248// the target function values are recalculated for each of the trees1249data->is_classifier = false;1250}1251else if( params.boost_type == GENTLE )1252{1253for( i = 0; i < n; i++ )1254fdata[sample_idx[i]*step] = (float)orig_response->data.i[i];12551256data->is_classifier = false;1257}1258}1259else1260{1261// at this moment, for all the samples that participated in the training of the most1262// recent weak classifier we know the responses. 
For other samples we need to compute them1263if( have_subsample )1264{1265float* values = (float*)cur_buf_pos;1266cur_buf_pos = (uchar*)(values + data->get_length_subbuf());1267uchar* missing = cur_buf_pos;1268cur_buf_pos = missing + data->get_length_subbuf() * (size_t)CV_ELEM_SIZE(data->buf->type);12691270CvMat _sample, _mask;12711272// invert the subsample mask1273cvXorS( subsample_mask, cvScalar(1.), subsample_mask );1274data->get_vectors( subsample_mask, values, missing, 0 );12751276_sample = cvMat( 1, data->var_count, CV_32F );1277_mask = cvMat( 1, data->var_count, CV_8U );12781279// run tree through all the non-processed samples1280for( i = 0; i < n; i++ )1281if( subsample_mask->data.ptr[i] )1282{1283_sample.data.fl = values;1284_mask.data.ptr = missing;1285values += _sample.cols;1286missing += _mask.cols;1287weak_eval->data.db[i] = tree->predict( &_sample, &_mask, true )->value;1288}1289}12901291// now update weights and other parameters for each type of boosting1292if( params.boost_type == DISCRETE )1293{1294// Discrete AdaBoost:1295// weak_eval[i] (=f(x_i)) is in {-1,1}1296// err = sum(w_i*(f(x_i) != y_i))/sum(w_i)1297// C = log((1-err)/err)1298// w_i *= exp(C*(f(x_i) != y_i))12991300double C, err = 0.;1301double scale[] = { 1., 0. 
};13021303for( i = 0; i < n; i++ )1304{1305double w = weights->data.db[i];1306sumw += w;1307err += w*(weak_eval->data.db[i] != orig_response->data.i[i]);1308}13091310if( sumw != 0 )1311err /= sumw;1312C = err = -log_ratio( err );1313scale[1] = exp(err);13141315sumw = 0;1316for( i = 0; i < n; i++ )1317{1318double w = weights->data.db[i]*1319scale[weak_eval->data.db[i] != orig_response->data.i[i]];1320sumw += w;1321weights->data.db[i] = w;1322}13231324tree->scale( C );1325}1326else if( params.boost_type == REAL )1327{1328// Real AdaBoost:1329// weak_eval[i] = f(x_i) = 0.5*log(p(x_i)/(1-p(x_i))), p(x_i)=P(y=1|x_i)1330// w_i *= exp(-y_i*f(x_i))13311332for( i = 0; i < n; i++ )1333weak_eval->data.db[i] *= -orig_response->data.i[i];13341335cvExp( weak_eval, weak_eval );13361337for( i = 0; i < n; i++ )1338{1339double w = weights->data.db[i]*weak_eval->data.db[i];1340sumw += w;1341weights->data.db[i] = w;1342}1343}1344else if( params.boost_type == LOGIT )1345{1346// LogitBoost:1347// weak_eval[i] = f(x_i) in [-z_max,z_max]1348// sum_response = F(x_i).1349// F(x_i) += 0.5*f(x_i)1350// p(x_i) = exp(F(x_i))/(exp(F(x_i)) + exp(-F(x_i))=1/(1+exp(-2*F(x_i)))1351// reuse weak_eval: weak_eval[i] <- p(x_i)1352// w_i = p(x_i)*1(1 - p(x_i))1353// z_i = ((y_i+1)/2 - p(x_i))/(p(x_i)*(1 - p(x_i)))1354// store z_i to the data->data_root as the new target responses13551356const double lb_weight_thresh = FLT_EPSILON;1357const double lb_z_max = 10.;1358/*float* responses_buf = data->get_resp_float_buf();1359const float* responses = 0;1360data->get_ord_responses(data->data_root, responses_buf, &responses);*/13611362/*if( weak->total == 7 )1363putchar('*');*/13641365for( i = 0; i < n; i++ )1366{1367double s = sum_response->data.db[i] + 0.5*weak_eval->data.db[i];1368sum_response->data.db[i] = s;1369weak_eval->data.db[i] = -2*s;1370}13711372cvExp( weak_eval, weak_eval );13731374for( i = 0; i < n; i++ )1375{1376double p = 1./(1. 
+ weak_eval->data.db[i]);1377double w = p*(1 - p), z;1378w = MAX( w, lb_weight_thresh );1379weights->data.db[i] = w;1380sumw += w;1381if( orig_response->data.i[i] > 0 )1382{1383z = 1./p;1384fdata[sample_idx[i]*step] = (float)MIN(z, lb_z_max);1385}1386else1387{1388z = 1./(1-p);1389fdata[sample_idx[i]*step] = (float)-MIN(z, lb_z_max);1390}1391}1392}1393else1394{1395// Gentle AdaBoost:1396// weak_eval[i] = f(x_i) in [-1,1]1397// w_i *= exp(-y_i*f(x_i))1398assert( params.boost_type == GENTLE );13991400for( i = 0; i < n; i++ )1401weak_eval->data.db[i] *= -orig_response->data.i[i];14021403cvExp( weak_eval, weak_eval );14041405for( i = 0; i < n; i++ )1406{1407double w = weights->data.db[i] * weak_eval->data.db[i];1408weights->data.db[i] = w;1409sumw += w;1410}1411}1412}14131414// renormalize weights1415if( sumw > FLT_EPSILON )1416{1417sumw = 1./sumw;1418for( i = 0; i < n; ++i )1419weights->data.db[i] *= sumw;1420}14211422__END__;1423}142414251426void1427CvBoost::trim_weights()1428{1429//CV_FUNCNAME( "CvBoost::trim_weights" );14301431__BEGIN__;14321433int i, count = data->sample_count, nz_count = 0;1434double sum, threshold;14351436if( params.weight_trim_rate <= 0. || params.weight_trim_rate >= 1. )1437EXIT;14381439// use weak_eval as temporary buffer for sorted weights1440cvCopy( weights, weak_eval );14411442std::sort(weak_eval->data.db, weak_eval->data.db + count);14431444// as weight trimming occurs immediately after updating the weights,1445// where they are renormalized, we assume that the weight sum = 1.1446sum = 1. - params.weight_trim_rate;14471448for( i = 0; i < count; i++ )1449{1450double w = weak_eval->data.db[i];1451if( sum <= 0 )1452break;1453sum -= w;1454}14551456threshold = i < count ? 
weak_eval->data.db[i] : DBL_MAX;14571458for( i = 0; i < count; i++ )1459{1460double w = weights->data.db[i];1461int f = w >= threshold;1462subsample_mask->data.ptr[i] = (uchar)f;1463nz_count += f;1464}14651466have_subsample = nz_count < count;14671468__END__;1469}147014711472const CvMat*1473CvBoost::get_active_vars( bool absolute_idx )1474{1475CvMat* mask = 0;1476CvMat* inv_map = 0;1477CvMat* result = 0;14781479CV_FUNCNAME( "CvBoost::get_active_vars" );14801481__BEGIN__;14821483if( !weak )1484CV_ERROR( CV_StsError, "The boosted tree ensemble has not been trained yet" );14851486if( !active_vars || !active_vars_abs )1487{1488CvSeqReader reader;1489int i, j, nactive_vars;1490CvBoostTree* wtree;1491const CvDTreeNode* node;14921493assert(!active_vars && !active_vars_abs);1494mask = cvCreateMat( 1, data->var_count, CV_8U );1495inv_map = cvCreateMat( 1, data->var_count, CV_32S );1496cvZero( mask );1497cvSet( inv_map, cvScalar(-1) );14981499// first pass: compute the mask of used variables1500cvStartReadSeq( weak, &reader );1501for( i = 0; i < weak->total; i++ )1502{1503CV_READ_SEQ_ELEM(wtree, reader);15041505node = wtree->get_root();1506assert( node != 0 );1507for(;;)1508{1509const CvDTreeNode* parent;1510for(;;)1511{1512CvDTreeSplit* split = node->split;1513for( ; split != 0; split = split->next )1514mask->data.ptr[split->var_idx] = 1;1515if( !node->left )1516break;1517node = node->left;1518}15191520for( parent = node->parent; parent && parent->right == node;1521node = parent, parent = parent->parent )1522;15231524if( !parent )1525break;15261527node = parent->right;1528}1529}15301531nactive_vars = cvCountNonZero(mask);15321533//if ( nactive_vars > 0 )1534{1535active_vars = cvCreateMat( 1, nactive_vars, CV_32S );1536active_vars_abs = cvCreateMat( 1, nactive_vars, CV_32S );15371538have_active_cat_vars = false;15391540for( i = j = 0; i < data->var_count; i++ )1541{1542if( mask->data.ptr[i] )1543{1544active_vars->data.i[j] = i;1545active_vars_abs->data.i[j] = data->var_idx ? 
data->var_idx->data.i[i] : i;1546inv_map->data.i[i] = j;1547if( data->var_type->data.i[i] >= 0 )1548have_active_cat_vars = true;1549j++;1550}1551}155215531554// second pass: now compute the condensed indices1555cvStartReadSeq( weak, &reader );1556for( i = 0; i < weak->total; i++ )1557{1558CV_READ_SEQ_ELEM(wtree, reader);1559node = wtree->get_root();1560for(;;)1561{1562const CvDTreeNode* parent;1563for(;;)1564{1565CvDTreeSplit* split = node->split;1566for( ; split != 0; split = split->next )1567{1568split->condensed_idx = inv_map->data.i[split->var_idx];1569assert( split->condensed_idx >= 0 );1570}15711572if( !node->left )1573break;1574node = node->left;1575}15761577for( parent = node->parent; parent && parent->right == node;1578node = parent, parent = parent->parent )1579;15801581if( !parent )1582break;15831584node = parent->right;1585}1586}1587}1588}15891590result = absolute_idx ? active_vars_abs : active_vars;15911592__END__;15931594cvReleaseMat( &mask );1595cvReleaseMat( &inv_map );15961597return result;1598}159916001601float1602CvBoost::predict( const CvMat* _sample, const CvMat* _missing,1603CvMat* weak_responses, CvSlice slice,1604bool raw_mode, bool return_sum ) const1605{1606float value = -FLT_MAX;16071608CvSeqReader reader;1609double sum = 0;1610int wstep = 0;1611const float* sample_data;16121613if( !weak )1614CV_Error( CV_StsError, "The boosted tree ensemble has not been trained yet" );16151616if( !CV_IS_MAT(_sample) || CV_MAT_TYPE(_sample->type) != CV_32FC1 ||1617(_sample->cols != 1 && _sample->rows != 1) ||1618(_sample->cols + _sample->rows - 1 != data->var_all && !raw_mode) ||1619(active_vars && _sample->cols + _sample->rows - 1 != active_vars->cols && raw_mode) )1620CV_Error( CV_StsBadArg,1621"the input sample must be 1d floating-point vector with the same "1622"number of elements as the total number of variables or "1623"as the number of variables used for training" );16241625if( _missing )1626{1627if( !CV_IS_MAT(_missing) || 
!CV_IS_MASK_ARR(_missing) ||1628!CV_ARE_SIZES_EQ(_missing, _sample) )1629CV_Error( CV_StsBadArg,1630"the missing data mask must be 8-bit vector of the same size as input sample" );1631}16321633int i, weak_count = cvSliceLength( slice, weak );1634if( weak_count >= weak->total )1635{1636weak_count = weak->total;1637slice.start_index = 0;1638}16391640if( weak_responses )1641{1642if( !CV_IS_MAT(weak_responses) ||1643CV_MAT_TYPE(weak_responses->type) != CV_32FC1 ||1644(weak_responses->cols != 1 && weak_responses->rows != 1) ||1645weak_responses->cols + weak_responses->rows - 1 != weak_count )1646CV_Error( CV_StsBadArg,1647"The output matrix of weak classifier responses must be valid "1648"floating-point vector of the same number of components as the length of input slice" );1649wstep = CV_IS_MAT_CONT(weak_responses->type) ? 1 : weak_responses->step/sizeof(float);1650}16511652int var_count = active_vars->cols;1653const int* vtype = data->var_type->data.i;1654const int* cmap = data->cat_map->data.i;1655const int* cofs = data->cat_ofs->data.i;16561657cv::Mat sample = cv::cvarrToMat(_sample);1658cv::Mat missing;1659if(!_missing)1660missing = cv::cvarrToMat(_missing);16611662// if need, preprocess the input vector1663if( !raw_mode )1664{1665int sstep, mstep = 0;1666const float* src_sample;1667const uchar* src_mask = 0;1668float* dst_sample;1669uchar* dst_mask;1670const int* vidx = active_vars->data.i;1671const int* vidx_abs = active_vars_abs->data.i;1672bool have_mask = _missing != 0;16731674sample = cv::Mat(1, var_count, CV_32FC1);1675missing = cv::Mat(1, var_count, CV_8UC1);16761677dst_sample = sample.ptr<float>();1678dst_mask = missing.ptr<uchar>();16791680src_sample = _sample->data.fl;1681sstep = CV_IS_MAT_CONT(_sample->type) ? 1 : _sample->step/sizeof(src_sample[0]);16821683if( _missing )1684{1685src_mask = _missing->data.ptr;1686mstep = CV_IS_MAT_CONT(_missing->type) ? 
1 : _missing->step;1687}16881689for( i = 0; i < var_count; i++ )1690{1691int idx = vidx[i], idx_abs = vidx_abs[i];1692float val = src_sample[idx_abs*sstep];1693int ci = vtype[idx];1694uchar m = src_mask ? src_mask[idx_abs*mstep] : (uchar)0;16951696if( ci >= 0 )1697{1698int a = cofs[ci], b = (ci+1 >= data->cat_ofs->cols) ? data->cat_map->cols : cofs[ci+1],1699c = a;1700int ival = cvRound(val);1701if ( (ival != val) && (!m) )1702CV_Error( CV_StsBadArg,1703"one of input categorical variable is not an integer" );17041705while( a < b )1706{1707c = (a + b) >> 1;1708if( ival < cmap[c] )1709b = c;1710else if( ival > cmap[c] )1711a = c+1;1712else1713break;1714}17151716if( c < 0 || ival != cmap[c] )1717{1718m = 1;1719have_mask = true;1720}1721else1722{1723val = (float)(c - cofs[ci]);1724}1725}17261727dst_sample[i] = val;1728dst_mask[i] = m;1729}17301731if( !have_mask )1732missing.release();1733}1734else1735{1736if( !CV_IS_MAT_CONT(_sample->type & (_missing ? _missing->type : -1)) )1737CV_Error( CV_StsBadArg, "In raw mode the input vectors must be continuous" );1738}17391740cvStartReadSeq( weak, &reader );1741cvSetSeqReaderPos( &reader, slice.start_index );17421743sample_data = sample.ptr<float>();17441745if( !have_active_cat_vars && missing.empty() && !weak_responses )1746{1747for( i = 0; i < weak_count; i++ )1748{1749CvBoostTree* wtree;1750const CvDTreeNode* node;1751CV_READ_SEQ_ELEM( wtree, reader );17521753node = wtree->get_root();1754while( node->left )1755{1756CvDTreeSplit* split = node->split;1757int vi = split->condensed_idx;1758float val = sample_data[vi];1759int dir = val <= split->ord.c ? -1 : 1;1760if( split->inversed )1761dir = -dir;1762node = dir < 0 ? node->left : node->right;1763}1764sum += node->value;1765}1766}1767else1768{1769const int* avars = active_vars->data.i;1770const uchar* m = !missing.empty() ? 
missing.ptr<uchar>() : 0;17711772// full-featured version1773for( i = 0; i < weak_count; i++ )1774{1775CvBoostTree* wtree;1776const CvDTreeNode* node;1777CV_READ_SEQ_ELEM( wtree, reader );17781779node = wtree->get_root();1780while( node->left )1781{1782const CvDTreeSplit* split = node->split;1783int dir = 0;1784for( ; !dir && split != 0; split = split->next )1785{1786int vi = split->condensed_idx;1787int ci = vtype[avars[vi]];1788float val = sample_data[vi];1789if( m && m[vi] )1790continue;1791if( ci < 0 ) // ordered1792dir = val <= split->ord.c ? -1 : 1;1793else // categorical1794{1795int c = cvRound(val);1796dir = CV_DTREE_CAT_DIR(c, split->subset);1797}1798if( split->inversed )1799dir = -dir;1800}18011802if( !dir )1803{1804int diff = node->right->sample_count - node->left->sample_count;1805dir = diff < 0 ? -1 : 1;1806}1807node = dir < 0 ? node->left : node->right;1808}1809if( weak_responses )1810weak_responses->data.fl[i*wstep] = (float)node->value;1811sum += node->value;1812}1813}18141815if( return_sum )1816value = (float)sum;1817else1818{1819int cls_idx = sum >= 0;1820if( raw_mode )1821value = (float)cls_idx;1822else1823value = (float)cmap[cofs[vtype[data->var_count]] + cls_idx];1824}18251826return value;1827}18281829float CvBoost::calc_error( CvMLData* _data, int type, std::vector<float> *resp )1830{1831float err = 0;1832const CvMat* values = _data->get_values();1833const CvMat* response = _data->get_responses();1834const CvMat* missing = _data->get_missing();1835const CvMat* sample_idx = (type == CV_TEST_ERROR) ? _data->get_test_sample_idx() : _data->get_train_sample_idx();1836const CvMat* var_types = _data->get_var_types();1837int* sidx = sample_idx ? sample_idx->data.i : 0;1838int r_step = CV_IS_MAT_CONT(response->type) ?18391 : response->step / CV_ELEM_SIZE(response->type);1840bool is_classifier = var_types->data.ptr[var_types->cols-1] == CV_VAR_CATEGORICAL;1841int sample_count = sample_idx ? 
sample_idx->cols : 0;1842sample_count = (type == CV_TRAIN_ERROR && sample_count == 0) ? values->rows : sample_count;1843float* pred_resp = 0;1844if( resp && (sample_count > 0) )1845{1846resp->resize( sample_count );1847pred_resp = &((*resp)[0]);1848}1849if ( is_classifier )1850{1851for( int i = 0; i < sample_count; i++ )1852{1853CvMat sample, miss;1854int si = sidx ? sidx[i] : i;1855cvGetRow( values, &sample, si );1856if( missing )1857cvGetRow( missing, &miss, si );1858float r = (float)predict( &sample, missing ? &miss : 0 );1859if( pred_resp )1860pred_resp[i] = r;1861int d = fabs((double)r - response->data.fl[si*r_step]) <= FLT_EPSILON ? 0 : 1;1862err += d;1863}1864err = sample_count ? err / (float)sample_count * 100 : -FLT_MAX;1865}1866else1867{1868for( int i = 0; i < sample_count; i++ )1869{1870CvMat sample, miss;1871int si = sidx ? sidx[i] : i;1872cvGetRow( values, &sample, si );1873if( missing )1874cvGetRow( missing, &miss, si );1875float r = (float)predict( &sample, missing ? &miss : 0 );1876if( pred_resp )1877pred_resp[i] = r;1878float d = r - response->data.fl[si*r_step];1879err += d*d;1880}1881err = sample_count ? err / (float)sample_count : -FLT_MAX;1882}1883return err;1884}18851886void CvBoost::write_params( CvFileStorage* fs ) const1887{1888const char* boost_type_str =1889params.boost_type == DISCRETE ? "DiscreteAdaboost" :1890params.boost_type == REAL ? "RealAdaboost" :1891params.boost_type == LOGIT ? "LogitBoost" :1892params.boost_type == GENTLE ? "GentleAdaboost" : 0;18931894const char* split_crit_str =1895params.split_criteria == DEFAULT ? "Default" :1896params.split_criteria == GINI ? "Gini" :1897params.boost_type == MISCLASS ? "Misclassification" :1898params.boost_type == SQERR ? 
"SquaredErr" : 0;18991900if( boost_type_str )1901cvWriteString( fs, "boosting_type", boost_type_str );1902else1903cvWriteInt( fs, "boosting_type", params.boost_type );19041905if( split_crit_str )1906cvWriteString( fs, "splitting_criteria", split_crit_str );1907else1908cvWriteInt( fs, "splitting_criteria", params.split_criteria );19091910cvWriteInt( fs, "ntrees", weak->total );1911cvWriteReal( fs, "weight_trimming_rate", params.weight_trim_rate );19121913data->write_params( fs );1914}191519161917void CvBoost::read_params( CvFileStorage* fs, CvFileNode* fnode )1918{1919CV_FUNCNAME( "CvBoost::read_params" );19201921__BEGIN__;19221923CvFileNode* temp;19241925if( !fnode || !CV_NODE_IS_MAP(fnode->tag) )1926return;19271928data = new CvDTreeTrainData();1929CV_CALL( data->read_params(fs, fnode));1930data->shared = true;19311932params.max_depth = data->params.max_depth;1933params.min_sample_count = data->params.min_sample_count;1934params.max_categories = data->params.max_categories;1935params.priors = data->params.priors;1936params.regression_accuracy = data->params.regression_accuracy;1937params.use_surrogates = data->params.use_surrogates;19381939temp = cvGetFileNodeByName( fs, fnode, "boosting_type" );1940if( !temp )1941return;19421943if( temp && CV_NODE_IS_STRING(temp->tag) )1944{1945const char* boost_type_str = cvReadString( temp, "" );1946params.boost_type = strcmp( boost_type_str, "DiscreteAdaboost" ) == 0 ? DISCRETE :1947strcmp( boost_type_str, "RealAdaboost" ) == 0 ? REAL :1948strcmp( boost_type_str, "LogitBoost" ) == 0 ? LOGIT :1949strcmp( boost_type_str, "GentleAdaboost" ) == 0 ? 
GENTLE : -1;1950}1951else1952params.boost_type = cvReadInt( temp, -1 );19531954if( params.boost_type < DISCRETE || params.boost_type > GENTLE )1955CV_ERROR( CV_StsBadArg, "Unknown boosting type" );19561957temp = cvGetFileNodeByName( fs, fnode, "splitting_criteria" );1958if( temp && CV_NODE_IS_STRING(temp->tag) )1959{1960const char* split_crit_str = cvReadString( temp, "" );1961params.split_criteria = strcmp( split_crit_str, "Default" ) == 0 ? DEFAULT :1962strcmp( split_crit_str, "Gini" ) == 0 ? GINI :1963strcmp( split_crit_str, "Misclassification" ) == 0 ? MISCLASS :1964strcmp( split_crit_str, "SquaredErr" ) == 0 ? SQERR : -1;1965}1966else1967params.split_criteria = cvReadInt( temp, -1 );19681969if( params.split_criteria < DEFAULT || params.boost_type > SQERR )1970CV_ERROR( CV_StsBadArg, "Unknown boosting type" );19711972params.weak_count = cvReadIntByName( fs, fnode, "ntrees" );1973params.weight_trim_rate = cvReadRealByName( fs, fnode, "weight_trimming_rate", 0. );19741975__END__;1976}1977197819791980void1981CvBoost::read( CvFileStorage* fs, CvFileNode* node )1982{1983CV_FUNCNAME( "CvBoost::read" );19841985__BEGIN__;19861987CvSeqReader reader;1988CvFileNode* trees_fnode;1989CvMemStorage* storage;1990int i, ntrees;19911992clear();1993read_params( fs, node );19941995if( !data )1996EXIT;19971998trees_fnode = cvGetFileNodeByName( fs, node, "trees" );1999if( !trees_fnode || !CV_NODE_IS_SEQ(trees_fnode->tag) )2000CV_ERROR( CV_StsParseError, "<trees> tag is missing" );20012002cvStartReadSeq( trees_fnode->data.seq, &reader );2003ntrees = trees_fnode->data.seq->total;20042005if( ntrees != params.weak_count )2006CV_ERROR( CV_StsUnmatchedSizes,2007"The number of trees stored does not match <ntrees> tag value" );20082009CV_CALL( storage = cvCreateMemStorage() );2010weak = cvCreateSeq( 0, sizeof(CvSeq), sizeof(CvBoostTree*), storage );20112012for( i = 0; i < ntrees; i++ )2013{2014CvBoostTree* tree = new CvBoostTree();2015CV_CALL(tree->read( fs, (CvFileNode*)reader.ptr, this, 
data ));2016CV_NEXT_SEQ_ELEM( reader.seq->elem_size, reader );2017cvSeqPush( weak, &tree );2018}2019get_active_vars();20202021__END__;2022}202320242025void2026CvBoost::write( CvFileStorage* fs, const char* name ) const2027{2028CV_FUNCNAME( "CvBoost::write" );20292030__BEGIN__;20312032CvSeqReader reader;2033int i;20342035cvStartWriteStruct( fs, name, CV_NODE_MAP, CV_TYPE_NAME_ML_BOOSTING );20362037if( !weak )2038CV_ERROR( CV_StsBadArg, "The classifier has not been trained yet" );20392040write_params( fs );2041cvStartWriteStruct( fs, "trees", CV_NODE_SEQ );20422043cvStartReadSeq( weak, &reader );20442045for( i = 0; i < weak->total; i++ )2046{2047CvBoostTree* tree;2048CV_READ_SEQ_ELEM( tree, reader );2049cvStartWriteStruct( fs, 0, CV_NODE_MAP );2050tree->write( fs );2051cvEndWriteStruct( fs );2052}20532054cvEndWriteStruct( fs );2055cvEndWriteStruct( fs );20562057__END__;2058}205920602061CvMat*2062CvBoost::get_weights()2063{2064return weights;2065}206620672068CvMat*2069CvBoost::get_subtree_weights()2070{2071return subtree_weights;2072}207320742075CvMat*2076CvBoost::get_weak_response()2077{2078return weak_eval;2079}208020812082const CvBoostParams&2083CvBoost::get_params() const2084{2085return params;2086}20872088CvSeq* CvBoost::get_weak_predictors()2089{2090return weak;2091}20922093const CvDTreeTrainData* CvBoost::get_data() const2094{2095return data;2096}20972098using namespace cv;20992100CvBoost::CvBoost( const Mat& _train_data, int _tflag,2101const Mat& _responses, const Mat& _var_idx,2102const Mat& _sample_idx, const Mat& _var_type,2103const Mat& _missing_mask,2104CvBoostParams _params )2105{2106weak = 0;2107data = 0;2108default_model_name = "my_boost_tree";2109active_vars = active_vars_abs = orig_response = sum_response = weak_eval =2110subsample_mask = weights = subtree_weights = 0;21112112train( _train_data, _tflag, _responses, _var_idx, _sample_idx,2113_var_type, _missing_mask, _params );2114}211521162117bool2118CvBoost::train( const Mat& _train_data, int 
_tflag,2119const Mat& _responses, const Mat& _var_idx,2120const Mat& _sample_idx, const Mat& _var_type,2121const Mat& _missing_mask,2122CvBoostParams _params, bool _update )2123{2124train_data_hdr = cvMat(_train_data);2125train_data_mat = _train_data;2126responses_hdr = cvMat(_responses);2127responses_mat = _responses;21282129CvMat vidx = cvMat(_var_idx), sidx = cvMat(_sample_idx), vtype = cvMat(_var_type), mmask = cvMat(_missing_mask);21302131return train(&train_data_hdr, _tflag, &responses_hdr, vidx.data.ptr ? &vidx : 0,2132sidx.data.ptr ? &sidx : 0, vtype.data.ptr ? &vtype : 0,2133mmask.data.ptr ? &mmask : 0, _params, _update);2134}21352136float2137CvBoost::predict( const Mat& _sample, const Mat& _missing,2138const Range& slice, bool raw_mode, bool return_sum ) const2139{2140CvMat sample = cvMat(_sample), mmask = cvMat(_missing);2141/*if( weak_responses )2142{2143int weak_count = cvSliceLength( slice, weak );2144if( weak_count >= weak->total )2145{2146weak_count = weak->total;2147slice.start_index = 0;2148}21492150if( !(weak_responses->data && weak_responses->type() == CV_32FC1 &&2151(weak_responses->cols == 1 || weak_responses->rows == 1) &&2152weak_responses->cols + weak_responses->rows - 1 == weak_count) )2153weak_responses->create(weak_count, 1, CV_32FC1);2154pwr = &(wr = *weak_responses);2155}*/2156return predict(&sample, _missing.empty() ? 0 : &mmask, 0,2157slice == Range::all() ? CV_WHOLE_SEQ : cvSlice(slice.start, slice.end),2158raw_mode, return_sum);2159}21602161/* End of file. */216221632164