GitHub Repository: Tetragramm/opencv
Path: blob/master/modules/core/src/matmul.cpp
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include <sstream>
#include "precomp.hpp"
#include "opencl_kernels_core.hpp"
#include "opencv2/core/opencl/runtime/opencl_clamdblas.hpp"
#include "opencv2/core/opencl/runtime/opencl_core.hpp"
#include "intel_gpu_gemm.inl.hpp"

namespace cv
{

/****************************************************************************************\
*                                         GEMM                                           *
\****************************************************************************************/

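// GEMM_CopyBlock copies a size.width x size.height block of pix_size-byte
// elements; pix_size is assumed to be a multiple of sizeof(int), so the copy
// runs over machine ints (unrolled four at a time when CV_ENABLE_UNROLLED is set).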
static void
GEMM_CopyBlock( const uchar* src, size_t src_step,
                uchar* dst, size_t dst_step,
                Size size, size_t pix_size )
{
    int j;
    size.width *= (int)(pix_size / sizeof(int));

    for( ; size.height--; src += src_step, dst += dst_step )
    {
        j = 0;
#if CV_ENABLE_UNROLLED
        for( ; j <= size.width - 4; j += 4 )
        {
            int t0 = ((const int*)src)[j];
            int t1 = ((const int*)src)[j+1];
            ((int*)dst)[j] = t0;
            ((int*)dst)[j+1] = t1;
            t0 = ((const int*)src)[j+2];
            t1 = ((const int*)src)[j+3];
            ((int*)dst)[j+2] = t0;
            ((int*)dst)[j+3] = t1;
        }
#endif
        for( ; j < size.width; j++ )
            ((int*)dst)[j] = ((const int*)src)[j];
    }
}

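// GEMM_TransposeBlock writes the transpose of a block of 1-int, 2-int or
// 4-int sized elements from src into dst; any other element size is a
// logic error (assert).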
static void
GEMM_TransposeBlock( const uchar* src, size_t src_step,
                     uchar* dst, size_t dst_step,
                     Size size, size_t pix_size )
{
    int i, j;
    for( i = 0; i < size.width; i++, dst += dst_step, src += pix_size )
    {
        const uchar* _src = src;
        switch( pix_size )
        {
        case sizeof(int):
            for( j = 0; j < size.height; j++, _src += src_step )
                ((int*)dst)[j] = ((int*)_src)[0];
            break;
        case sizeof(int)*2:
            for( j = 0; j < size.height*2; j += 2, _src += src_step )
            {
                int t0 = ((int*)_src)[0];
                int t1 = ((int*)_src)[1];
                ((int*)dst)[j] = t0;
                ((int*)dst)[j+1] = t1;
            }
            break;
        case sizeof(int)*4:
            for( j = 0; j < size.height*4; j += 4, _src += src_step )
            {
                int t0 = ((int*)_src)[0];
                int t1 = ((int*)_src)[1];
                ((int*)dst)[j] = t0;
                ((int*)dst)[j+1] = t1;
                t0 = ((int*)_src)[2];
                t1 = ((int*)_src)[3];
                ((int*)dst)[j+2] = t0;
                ((int*)dst)[j+3] = t1;
            }
            break;
        default:
            assert(0);
            return;
        }
    }
}

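// GEMMSingleMul computes D = alpha*op(A)*op(B) + beta*op(C) in a single pass.
// T is the element type and WT the wider accumulator type. Separate branches
// handle the rank-1 case (n == 1), A*B^t, narrow destinations (row fits in
// ~1600 bytes), and the general case accumulated through a row buffer.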
template<typename T, typename WT> static void
GEMMSingleMul( const T* a_data, size_t a_step,
               const T* b_data, size_t b_step,
               const T* c_data, size_t c_step,
               T* d_data, size_t d_step,
               Size a_size, Size d_size,
               double alpha, double beta, int flags )
{
    int i, j, k, n = a_size.width, m = d_size.width, drows = d_size.height;
    const T *_a_data = a_data, *_b_data = b_data, *_c_data = c_data;
    cv::AutoBuffer<T> _a_buf;
    T* a_buf = 0;
    size_t a_step0, a_step1, c_step0, c_step1, t_step;

    a_step /= sizeof(a_data[0]);
    b_step /= sizeof(b_data[0]);
    c_step /= sizeof(c_data[0]);
    d_step /= sizeof(d_data[0]);
    a_step0 = a_step;
    a_step1 = 1;

    if( !c_data )
        c_step0 = c_step1 = 0;
    else if( !(flags & GEMM_3_T) )
        c_step0 = c_step, c_step1 = 1;
    else
        c_step0 = 1, c_step1 = c_step;

    if( flags & GEMM_1_T )
    {
        CV_SWAP( a_step0, a_step1, t_step );
        n = a_size.height;
        if( a_step > 1 && n > 1 )
        {
            _a_buf.allocate(n);
            a_buf = _a_buf.data();
        }
    }

    if( n == 1 ) /* external product */
    {
        cv::AutoBuffer<T> _b_buf;
        T* b_buf = 0;

        if( a_step > 1 && a_size.height > 1 )
        {
            _a_buf.allocate(drows);
            a_buf = _a_buf.data();
            for( k = 0; k < drows; k++ )
                a_buf[k] = a_data[a_step*k];
            a_data = a_buf;
        }

        if( b_step > 1 )
        {
            _b_buf.allocate(d_size.width);
            b_buf = _b_buf.data();
            for( j = 0; j < d_size.width; j++ )
                b_buf[j] = b_data[j*b_step];
            b_data = b_buf;
        }

        for( i = 0; i < drows; i++, _c_data += c_step0, d_data += d_step )
        {
            WT al = WT(a_data[i])*alpha;
            c_data = _c_data;
            for( j = 0; j <= d_size.width - 2; j += 2, c_data += 2*c_step1 )
            {
                WT s0 = al*WT(b_data[j]);
                WT s1 = al*WT(b_data[j+1]);
                if( !c_data )
                {
                    d_data[j] = T(s0);
                    d_data[j+1] = T(s1);
                }
                else
                {
                    d_data[j] = T(s0 + WT(c_data[0])*beta);
                    d_data[j+1] = T(s1 + WT(c_data[c_step1])*beta);
                }
            }

            for( ; j < d_size.width; j++, c_data += c_step1 )
            {
                WT s0 = al*WT(b_data[j]);
                if( !c_data )
                    d_data[j] = T(s0);
                else
                    d_data[j] = T(s0 + WT(c_data[0])*beta);
            }
        }
    }
    else if( flags & GEMM_2_T ) /* A * Bt */
    {
        for( i = 0; i < drows; i++, _a_data += a_step0, _c_data += c_step0, d_data += d_step )
        {
            a_data = _a_data;
            b_data = _b_data;
            c_data = _c_data;

            if( a_buf )
            {
                for( k = 0; k < n; k++ )
                    a_buf[k] = a_data[a_step1*k];
                a_data = a_buf;
            }

            for( j = 0; j < d_size.width; j++, b_data += b_step,
                                               c_data += c_step1 )
            {
                WT s0(0), s1(0), s2(0), s3(0);
                k = 0;
#if CV_ENABLE_UNROLLED
                for( ; k <= n - 4; k += 4 )
                {
                    s0 += WT(a_data[k])*WT(b_data[k]);
                    s1 += WT(a_data[k+1])*WT(b_data[k+1]);
                    s2 += WT(a_data[k+2])*WT(b_data[k+2]);
                    s3 += WT(a_data[k+3])*WT(b_data[k+3]);
                }
#endif
                for( ; k < n; k++ )
                    s0 += WT(a_data[k])*WT(b_data[k]);
                s0 = (s0+s1+s2+s3)*alpha;

                if( !c_data )
                    d_data[j] = T(s0);
                else
                    d_data[j] = T(s0 + WT(c_data[0])*beta);
            }
        }
    }
    else if( d_size.width*sizeof(d_data[0]) <= 1600 )
    {
        for( i = 0; i < drows; i++, _a_data += a_step0,
                                    _c_data += c_step0,
                                    d_data += d_step )
        {
            a_data = _a_data, c_data = _c_data;

            if( a_buf )
            {
                for( k = 0; k < n; k++ )
                    a_buf[k] = a_data[a_step1*k];
                a_data = a_buf;
            }

            for( j = 0; j <= m - 4; j += 4, c_data += 4*c_step1 )
            {
                const T* b = _b_data + j;
                WT s0(0), s1(0), s2(0), s3(0);

                for( k = 0; k < n; k++, b += b_step )
                {
                    WT a(a_data[k]);
                    s0 += a * WT(b[0]); s1 += a * WT(b[1]);
                    s2 += a * WT(b[2]); s3 += a * WT(b[3]);
                }

                if( !c_data )
                {
                    d_data[j] = T(s0*alpha);
                    d_data[j+1] = T(s1*alpha);
                    d_data[j+2] = T(s2*alpha);
                    d_data[j+3] = T(s3*alpha);
                }
                else
                {
                    s0 = s0*alpha; s1 = s1*alpha;
                    s2 = s2*alpha; s3 = s3*alpha;
                    d_data[j] = T(s0 + WT(c_data[0])*beta);
                    d_data[j+1] = T(s1 + WT(c_data[c_step1])*beta);
                    d_data[j+2] = T(s2 + WT(c_data[c_step1*2])*beta);
                    d_data[j+3] = T(s3 + WT(c_data[c_step1*3])*beta);
                }
            }

            for( ; j < m; j++, c_data += c_step1 )
            {
                const T* b = _b_data + j;
                WT s0(0);

                for( k = 0; k < n; k++, b += b_step )
                    s0 += WT(a_data[k]) * WT(b[0]);

                s0 = s0*alpha;
                if( !c_data )
                    d_data[j] = T(s0);
                else
                    d_data[j] = T(s0 + WT(c_data[0])*beta);
            }
        }
    }
    else
    {
        cv::AutoBuffer<WT> _d_buf(m);
        WT* d_buf = _d_buf.data();

        for( i = 0; i < drows; i++, _a_data += a_step0, _c_data += c_step0, d_data += d_step )
        {
            a_data = _a_data;
            b_data = _b_data;
            c_data = _c_data;

            if( a_buf )
            {
                for( k = 0; k < n; k++ )
                    a_buf[k] = _a_data[a_step1*k];
                a_data = a_buf;
            }

            for( j = 0; j < m; j++ )
                d_buf[j] = WT(0);

            for( k = 0; k < n; k++, b_data += b_step )
            {
                WT al(a_data[k]);
                j = 0;
#if CV_ENABLE_UNROLLED
                for( ; j <= m - 4; j += 4 )
                {
                    WT t0 = d_buf[j] + WT(b_data[j])*al;
                    WT t1 = d_buf[j+1] + WT(b_data[j+1])*al;
                    d_buf[j] = t0;
                    d_buf[j+1] = t1;
                    t0 = d_buf[j+2] + WT(b_data[j+2])*al;
                    t1 = d_buf[j+3] + WT(b_data[j+3])*al;
                    d_buf[j+2] = t0;
                    d_buf[j+3] = t1;
                }
#endif
                for( ; j < m; j++ )
                    d_buf[j] += WT(b_data[j])*al;
            }

            if( !c_data )
                for( j = 0; j < m; j++ )
                    d_data[j] = T(d_buf[j]*alpha);
            else
                for( j = 0; j < m; j++, c_data += c_step1 )
                {
                    WT t = d_buf[j]*alpha;
                    d_data[j] = T(t + WT(c_data[0])*beta);
                }
        }
    }
}

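// GEMMBlockMul multiplies one block pair into the wider accumulator type WT.
// Bit 16 of `flags` requests accumulation into d_data instead of overwriting,
// so partial products over successive K-blocks can be summed.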
template<typename T, typename WT> static void
GEMMBlockMul( const T* a_data, size_t a_step,
              const T* b_data, size_t b_step,
              WT* d_data, size_t d_step,
              Size a_size, Size d_size, int flags )
{
    int i, j, k, n = a_size.width, m = d_size.width;
    const T *_a_data = a_data, *_b_data = b_data;
    cv::AutoBuffer<T> _a_buf;
    T* a_buf = 0;
    size_t a_step0, a_step1, t_step;
    int do_acc = flags & 16;

    a_step /= sizeof(a_data[0]);
    b_step /= sizeof(b_data[0]);
    d_step /= sizeof(d_data[0]);

    a_step0 = a_step;
    a_step1 = 1;

    if( flags & GEMM_1_T )
    {
        CV_SWAP( a_step0, a_step1, t_step );
        n = a_size.height;
        _a_buf.allocate(n);
        a_buf = _a_buf.data();
    }

    if( flags & GEMM_2_T )
    {
        /* second operand is transposed */
        for( i = 0; i < d_size.height; i++, _a_data += a_step0, d_data += d_step )
        {
            a_data = _a_data; b_data = _b_data;

            if( a_buf )
            {
                for( k = 0; k < n; k++ )
                    a_buf[k] = a_data[a_step1*k];
                a_data = a_buf;
            }

            for( j = 0; j < d_size.width; j++, b_data += b_step )
            {
                WT s0 = do_acc ? d_data[j] : WT(0), s1(0);
                for( k = 0; k <= n - 2; k += 2 )
                {
                    s0 += WT(a_data[k])*WT(b_data[k]);
                    s1 += WT(a_data[k+1])*WT(b_data[k+1]);
                }

                for( ; k < n; k++ )
                    s0 += WT(a_data[k])*WT(b_data[k]);

                d_data[j] = s0 + s1;
            }
        }
    }
    else
    {
        for( i = 0; i < d_size.height; i++, _a_data += a_step0, d_data += d_step )
        {
            a_data = _a_data, b_data = _b_data;

            if( a_buf )
            {
                for( k = 0; k < n; k++ )
                    a_buf[k] = a_data[a_step1*k];
                a_data = a_buf;
            }

            for( j = 0; j <= m - 4; j += 4 )
            {
                WT s0, s1, s2, s3;
                const T* b = b_data + j;

                if( do_acc )
                {
                    s0 = d_data[j]; s1 = d_data[j+1];
                    s2 = d_data[j+2]; s3 = d_data[j+3];
                }
                else
                    s0 = s1 = s2 = s3 = WT(0);

                for( k = 0; k < n; k++, b += b_step )
                {
                    WT a(a_data[k]);
                    s0 += a * WT(b[0]); s1 += a * WT(b[1]);
                    s2 += a * WT(b[2]); s3 += a * WT(b[3]);
                }

                d_data[j] = s0; d_data[j+1] = s1;
                d_data[j+2] = s2; d_data[j+3] = s3;
            }

            for( ; j < m; j++ )
            {
                const T* b = b_data + j;
                WT s0 = do_acc ? d_data[j] : WT(0);

                for( k = 0; k < n; k++, b += b_step )
                    s0 += WT(a_data[k]) * WT(b[0]);

                d_data[j] = s0;
            }
        }
    }
}

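// GEMMStore writes the final destination block: D = alpha*d_buf + beta*C
// (or just alpha*d_buf when no C is given), narrowing from WT back to T.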
template<typename T, typename WT> static void
GEMMStore( const T* c_data, size_t c_step,
           const WT* d_buf, size_t d_buf_step,
           T* d_data, size_t d_step, Size d_size,
           double alpha, double beta, int flags )
{
    const T* _c_data = c_data;
    int j;
    size_t c_step0, c_step1;

    c_step /= sizeof(c_data[0]);
    d_buf_step /= sizeof(d_buf[0]);
    d_step /= sizeof(d_data[0]);

    if( !c_data )
        c_step0 = c_step1 = 0;
    else if( !(flags & GEMM_3_T) )
        c_step0 = c_step, c_step1 = 1;
    else
        c_step0 = 1, c_step1 = c_step;

    for( ; d_size.height--; _c_data += c_step0, d_buf += d_buf_step, d_data += d_step )
    {
        if( _c_data )
        {
            c_data = _c_data;
            j = 0;
#if CV_ENABLE_UNROLLED
            for( ; j <= d_size.width - 4; j += 4, c_data += 4*c_step1 )
            {
                WT t0 = alpha*d_buf[j];
                WT t1 = alpha*d_buf[j+1];
                t0 += beta*WT(c_data[0]);
                t1 += beta*WT(c_data[c_step1]);
                d_data[j] = T(t0);
                d_data[j+1] = T(t1);
                t0 = alpha*d_buf[j+2];
                t1 = alpha*d_buf[j+3];
                t0 += beta*WT(c_data[c_step1*2]);
                t1 += beta*WT(c_data[c_step1*3]);
                d_data[j+2] = T(t0);
                d_data[j+3] = T(t1);
            }
#endif
            for( ; j < d_size.width; j++, c_data += c_step1 )
            {
                WT t0 = alpha*d_buf[j];
                d_data[j] = T(t0 + WT(c_data[0])*beta);
            }
        }
        else
        {
            j = 0;
#if CV_ENABLE_UNROLLED
            for( ; j <= d_size.width - 4; j += 4 )
            {
                WT t0 = alpha*d_buf[j];
                WT t1 = alpha*d_buf[j+1];
                d_data[j] = T(t0);
                d_data[j+1] = T(t1);
                t0 = alpha*d_buf[j+2];
                t1 = alpha*d_buf[j+3];
                d_data[j+2] = T(t0);
                d_data[j+3] = T(t1);
            }
#endif
            for( ; j < d_size.width; j++ )
                d_data[j] = T(alpha*d_buf[j]);
        }
    }
}

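// Type-erased function-pointer signatures used to dispatch the kernels above
// on the runtime element type (32f/64f/32fc/64fc).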
typedef void (*GEMMSingleMulFunc)( const void* src1, size_t step1,
    const void* src2, size_t step2, const void* src3, size_t step3,
    void* dst, size_t dststep, Size srcsize, Size dstsize,
    double alpha, double beta, int flags );

typedef void (*GEMMBlockMulFunc)( const void* src1, size_t step1,
    const void* src2, size_t step2, void* dst, size_t dststep,
    Size srcsize, Size dstsize, int flags );

typedef void (*GEMMStoreFunc)( const void* src1, size_t step1,
    const void* src2, size_t step2, void* dst, size_t dststep,
    Size dstsize, double alpha, double beta, int flags );

static void GEMMSingleMul_32f( const float* a_data, size_t a_step,
                               const float* b_data, size_t b_step,
                               const float* c_data, size_t c_step,
                               float* d_data, size_t d_step,
                               Size a_size, Size d_size,
                               double alpha, double beta, int flags )
{
    GEMMSingleMul<float,double>(a_data, a_step, b_data, b_step, c_data,
                                c_step, d_data, d_step, a_size, d_size,
                                alpha, beta, flags);
}

static void GEMMSingleMul_64f( const double* a_data, size_t a_step,
                               const double* b_data, size_t b_step,
                               const double* c_data, size_t c_step,
                               double* d_data, size_t d_step,
                               Size a_size, Size d_size,
                               double alpha, double beta, int flags )
{
    GEMMSingleMul<double,double>(a_data, a_step, b_data, b_step, c_data,
                                 c_step, d_data, d_step, a_size, d_size,
                                 alpha, beta, flags);
}


static void GEMMSingleMul_32fc( const Complexf* a_data, size_t a_step,
                                const Complexf* b_data, size_t b_step,
                                const Complexf* c_data, size_t c_step,
                                Complexf* d_data, size_t d_step,
                                Size a_size, Size d_size,
                                double alpha, double beta, int flags )
{
    GEMMSingleMul<Complexf,Complexd>(a_data, a_step, b_data, b_step, c_data,
                                     c_step, d_data, d_step, a_size, d_size,
                                     alpha, beta, flags);
}

static void GEMMSingleMul_64fc( const Complexd* a_data, size_t a_step,
                                const Complexd* b_data, size_t b_step,
                                const Complexd* c_data, size_t c_step,
                                Complexd* d_data, size_t d_step,
                                Size a_size, Size d_size,
                                double alpha, double beta, int flags )
{
    GEMMSingleMul<Complexd,Complexd>(a_data, a_step, b_data, b_step, c_data,
                                     c_step, d_data, d_step, a_size, d_size,
                                     alpha, beta, flags);
}

static void GEMMBlockMul_32f( const float* a_data, size_t a_step,
                              const float* b_data, size_t b_step,
                              double* d_data, size_t d_step,
                              Size a_size, Size d_size, int flags )
{
    GEMMBlockMul(a_data, a_step, b_data, b_step, d_data, d_step, a_size, d_size, flags);
}


static void GEMMBlockMul_64f( const double* a_data, size_t a_step,
                              const double* b_data, size_t b_step,
                              double* d_data, size_t d_step,
                              Size a_size, Size d_size, int flags )
{
    GEMMBlockMul(a_data, a_step, b_data, b_step, d_data, d_step, a_size, d_size, flags);
}


static void GEMMBlockMul_32fc( const Complexf* a_data, size_t a_step,
                               const Complexf* b_data, size_t b_step,
                               Complexd* d_data, size_t d_step,
                               Size a_size, Size d_size, int flags )
{
    GEMMBlockMul(a_data, a_step, b_data, b_step, d_data, d_step, a_size, d_size, flags);
}


static void GEMMBlockMul_64fc( const Complexd* a_data, size_t a_step,
                               const Complexd* b_data, size_t b_step,
                               Complexd* d_data, size_t d_step,
                               Size a_size, Size d_size, int flags )
{
    GEMMBlockMul(a_data, a_step, b_data, b_step, d_data, d_step, a_size, d_size, flags);
}


static void GEMMStore_32f( const float* c_data, size_t c_step,
                           const double* d_buf, size_t d_buf_step,
                           float* d_data, size_t d_step, Size d_size,
                           double alpha, double beta, int flags )
{
    GEMMStore(c_data, c_step, d_buf, d_buf_step, d_data, d_step, d_size, alpha, beta, flags);
}


static void GEMMStore_64f( const double* c_data, size_t c_step,
                           const double* d_buf, size_t d_buf_step,
                           double* d_data, size_t d_step, Size d_size,
                           double alpha, double beta, int flags )
{
    GEMMStore(c_data, c_step, d_buf, d_buf_step, d_data, d_step, d_size, alpha, beta, flags);
}


static void GEMMStore_32fc( const Complexf* c_data, size_t c_step,
                            const Complexd* d_buf, size_t d_buf_step,
                            Complexf* d_data, size_t d_step, Size d_size,
                            double alpha, double beta, int flags )
{
    GEMMStore(c_data, c_step, d_buf, d_buf_step, d_data, d_step, d_size, alpha, beta, flags);
}


static void GEMMStore_64fc( const Complexd* c_data, size_t c_step,
                            const Complexd* d_buf, size_t d_buf_step,
                            Complexd* d_data, size_t d_step, Size d_size,
                            double alpha, double beta, int flags )
{
    GEMMStore(c_data, c_step, d_buf, d_buf_step, d_data, d_step, d_size, alpha, beta, flags);
}

#ifdef HAVE_CLAMDBLAS

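// OpenCL path via clAmdBlas (AMD's OpenCL BLAS): D is pre-filled with C
// (transposed if GEMM_3_T) or zeros, then the type-matching
// clAmdBlas{S,D,C,Z}gemmEx call computes D = alpha*op(A)*op(B) + beta*D.
// Returns false to fall back to the CPU path when buffers are unsuitable.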
static bool ocl_gemm_amdblas( InputArray matA, InputArray matB, double alpha,
                              InputArray matC, double beta, OutputArray matD, int flags )
{
    int type = matA.type(), esz = CV_ELEM_SIZE(type);
    bool haveC = matC.kind() != cv::_InputArray::NONE;
    Size sizeA = matA.size(), sizeB = matB.size(), sizeC = haveC ? matC.size() : Size(0, 0);
    bool atrans = (flags & GEMM_1_T) != 0, btrans = (flags & GEMM_2_T) != 0, ctrans = (flags & GEMM_3_T) != 0;

    if (atrans)
        sizeA = Size(sizeA.height, sizeA.width);
    if (btrans)
        sizeB = Size(sizeB.height, sizeB.width);
    if (haveC && ctrans)
        sizeC = Size(sizeC.height, sizeC.width);

    Size sizeD(sizeB.width, sizeA.height);

    CV_Assert( matB.type() == type && (!haveC || matC.type() == type) );
    CV_Assert( sizeA.width == sizeB.height && (!haveC || sizeC == sizeD) );

    matD.create(sizeD, type);
    if ( matA.offset() % esz != 0 || matA.step() % esz != 0 ||
         matB.offset() % esz != 0 || matB.step() % esz != 0 ||
         (haveC && (matC.offset() % esz != 0 || matC.step() % esz != 0)) )
        return false;

    UMat A = matA.getUMat(), B = matB.getUMat(), D = matD.getUMat();
    if (!ocl::internal::isCLBuffer(A) || !ocl::internal::isCLBuffer(B) || !ocl::internal::isCLBuffer(D))
    {
        return false;
    }
    if (haveC)
    {
        UMat C = matC.getUMat();
        if (!ocl::internal::isCLBuffer(C))
            return false;
    }
    if (haveC)
        ctrans ? transpose(matC, D) : matC.copyTo(D);
    else
        D.setTo(Scalar::all(0));

    int M = sizeD.height, N = sizeD.width, K = sizeA.width;
    int lda = (int)A.step / esz, ldb = (int)B.step / esz, ldc = (int)D.step / esz;
    int offa = (int)A.offset / esz, offb = (int)B.offset / esz, offc = (int)D.offset / esz;

    cl_command_queue clq = (cl_command_queue)ocl::Queue::getDefault().ptr();
    clAmdBlasTranspose transA = atrans ? clAmdBlasTrans : clAmdBlasNoTrans;
    clAmdBlasTranspose transB = btrans ? clAmdBlasTrans : clAmdBlasNoTrans;
    clAmdBlasOrder order = clAmdBlasRowMajor;
    clAmdBlasStatus status = clAmdBlasSuccess;

    if (type == CV_32FC1)
        status = clAmdBlasSgemmEx(order, transA, transB, M, N, K,
                                  (cl_float)alpha, (const cl_mem)A.handle(ACCESS_READ), offa, lda,
                                  (const cl_mem)B.handle(ACCESS_READ), offb, ldb,
                                  (cl_float)beta, (cl_mem)D.handle(ACCESS_RW), offc, ldc,
                                  1, &clq, 0, NULL, NULL);
    else if (type == CV_64FC1)
        status = clAmdBlasDgemmEx(order, transA, transB, M, N, K,
                                  alpha, (const cl_mem)A.handle(ACCESS_READ), offa, lda,
                                  (const cl_mem)B.handle(ACCESS_READ), offb, ldb,
                                  beta, (cl_mem)D.handle(ACCESS_RW), offc, ldc,
                                  1, &clq, 0, NULL, NULL);
    else if (type == CV_32FC2)
    {
        cl_float2 alpha_2 = { { (cl_float)alpha, 0 } };
        cl_float2 beta_2  = { { (cl_float)beta, 0 } };
        status = clAmdBlasCgemmEx(order, transA, transB, M, N, K,
                                  alpha_2, (const cl_mem)A.handle(ACCESS_READ), offa, lda,
                                  (const cl_mem)B.handle(ACCESS_READ), offb, ldb,
                                  beta_2, (cl_mem)D.handle(ACCESS_RW), offc, ldc,
                                  1, &clq, 0, NULL, NULL);
    }
    else if (type == CV_64FC2)
    {
        cl_double2 alpha_2 = { { alpha, 0 } };
        cl_double2 beta_2  = { { beta, 0 } };
        status = clAmdBlasZgemmEx(order, transA, transB, M, N, K,
                                  alpha_2, (const cl_mem)A.handle(ACCESS_READ), offa, lda,
                                  (const cl_mem)B.handle(ACCESS_READ), offb, ldb,
                                  beta_2, (cl_mem)D.handle(ACCESS_RW), offc, ldc,
                                  1, &clq, 0, NULL, NULL);
    }
    else
        CV_Error(Error::StsUnsupportedFormat, "");

    return status == clAmdBlasSuccess;
}

#endif

#ifdef HAVE_OPENCL
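// Generic OpenCL path: either a portable "gemm" kernel built from the
// opencl_kernels_core source (with a block size tuned to the device's
// work-group limit), or the specialized intel_gpu_gemm path on single-channel
// float data when the device supports Intel subgroups.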
static bool ocl_gemm( InputArray matA, InputArray matB, double alpha,
                      InputArray matC, double beta, OutputArray matD, int flags )
{
    int depth = matA.depth(), cn = matA.channels();
    int type = CV_MAKETYPE(depth, cn);

    CV_Assert_N( type == matB.type(), (type == CV_32FC1 || type == CV_64FC1 || type == CV_32FC2 || type == CV_64FC2) );

    const ocl::Device & dev = ocl::Device::getDefault();
    bool doubleSupport = dev.doubleFPConfig() > 0;

    if (!doubleSupport && depth == CV_64F)
        return false;

    bool haveC = matC.kind() != cv::_InputArray::NONE;
    Size sizeA = matA.size(), sizeB = matB.size(), sizeC = haveC ? matC.size() : Size(0, 0);
    bool atrans = (flags & GEMM_1_T) != 0, btrans = (flags & GEMM_2_T) != 0, ctrans = (flags & GEMM_3_T) != 0;

    CV_Assert( !haveC || matC.type() == type );

    Size sizeD(((btrans)? sizeB.height : sizeB.width),
               ((atrans)? sizeA.width : sizeA.height));
    matD.create(sizeD, type);

    UMat A = matA.getUMat(), B = matB.getUMat(), D = matD.getUMat();


    if (!dev.intelSubgroupsSupport() || (depth == CV_64F) || cn != 1)
    {
        String opts;

        if (atrans)
            sizeA = Size(sizeA.height, sizeA.width);
        if (btrans)
            sizeB = Size(sizeB.height, sizeB.width);
        if (haveC && ctrans)
            sizeC = Size(sizeC.height, sizeC.width);

        CV_Assert( sizeA.width == sizeB.height && (!haveC || sizeC == sizeD) );

        int max_wg_size = (int)dev.maxWorkGroupSize();
        int block_size = (max_wg_size / (32*cn) < 32) ? (max_wg_size / (16*cn) < 16) ? (max_wg_size / (8*cn) < 8) ? 1 : 8 : 16 : 32;

        if (atrans)
            A = A.t();

        if (btrans)
            B = B.t();

        if (haveC)
            ctrans ? transpose(matC, D) : matC.copyTo(D);

        int vectorWidths[] = { 4, 4, 2, 2, 1, 4, cn, -1 };
        int kercn = ocl::checkOptimalVectorWidth(vectorWidths, B, D);

        opts += format(" -D T=%s -D T1=%s -D WT=%s -D cn=%d -D kercn=%d -D LOCAL_SIZE=%d%s%s%s",
                       ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(CV_MAKETYPE(depth, kercn)),
                       cn, kercn, block_size,
                       (sizeA.width % block_size !=0) ? " -D NO_MULT" : "",
                       haveC ? " -D HAVE_C" : "",
                       doubleSupport ? " -D DOUBLE_SUPPORT" : "");

        ocl::Kernel k("gemm", cv::ocl::core::gemm_oclsrc, opts);
        if (k.empty())
            return false;

        if (depth == CV_64F)
            k.args(ocl::KernelArg::ReadOnlyNoSize(A),
                   ocl::KernelArg::ReadOnlyNoSize(B, cn, kercn),
                   ocl::KernelArg::ReadWrite(D, cn, kercn),
                   sizeA.width, alpha, beta);
        else
            k.args(ocl::KernelArg::ReadOnlyNoSize(A),
                   ocl::KernelArg::ReadOnlyNoSize(B, cn, kercn),
                   ocl::KernelArg::ReadWrite(D, cn, kercn),
                   sizeA.width, (float)alpha, (float)beta);

        size_t globalsize[2] = { (size_t)sizeD.width * cn / kercn, (size_t)sizeD.height};
        size_t localsize[2] = { (size_t)block_size, (size_t)block_size};

        return k.run(2, globalsize, block_size!=1 ? localsize : NULL, false);
    }
    else
    {
        if (haveC && beta != 0.0)
        {
            ctrans ? transpose(matC, D) : matC.copyTo(D);
        }
        else
        {
            beta = 0.0;
        }

        return intel_gpu_gemm(A, sizeA,
                              B, sizeB,
                              D, sizeD,
                              alpha,
                              beta,
                              atrans, btrans);
    }
}
#endif

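// CPU GEMM driver. Very small operands (len 2..4) are expanded inline for
// CV_32F/CV_64F; moderate sizes go through a single singleMulFunc call; large
// matrices are processed in cache-sized blocks (block_lin_size^2 elements)
// with partial products accumulated in a temporary buffer and written back
// by storeFunc.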
static void gemmImpl( Mat A, Mat B, double alpha,
                      Mat C, double beta, Mat D, int flags )
{
    CV_INSTRUMENT_REGION();

    const int block_lin_size = 128;
    const int block_size = block_lin_size * block_lin_size;

    static double zero[] = {0,0,0,0};
    static float zerof[] = {0,0,0,0};

    Size a_size = A.size(), d_size;
    int i, len = 0, type = A.type();

    switch( flags & (GEMM_1_T|GEMM_2_T) )
    {
    case 0:
        d_size = Size( B.cols, a_size.height );
        len = B.rows;
        break;
    case 1:
        d_size = Size( B.cols, a_size.width );
        len = B.rows;
        break;
    case 2:
        d_size = Size( B.rows, a_size.height );
        len = B.cols;
        break;
    case 3:
        d_size = Size( B.rows, a_size.width );
        len = B.cols;
        break;
    }

    if( flags == 0 && 2 <= len && len <= 4 && (len == d_size.width || len == d_size.height) )
    {
        if( type == CV_32F )
        {
            float* d = D.ptr<float>();
            const float *a = A.ptr<float>(),
                        *b = B.ptr<float>(),
                        *c = (const float*)C.data;
            size_t d_step = D.step/sizeof(d[0]),
                   a_step = A.step/sizeof(a[0]),
                   b_step = B.step/sizeof(b[0]),
                   c_step = C.data ? C.step/sizeof(c[0]) : 0;

            if( !c )
                c = zerof;

            switch( len )
            {
            case 2:
                if( len == d_size.width && b != d )
                {
                    for( i = 0; i < d_size.height; i++, d += d_step, a += a_step, c += c_step )
                    {
                        float t0 = a[0]*b[0] + a[1]*b[b_step];
                        float t1 = a[0]*b[1] + a[1]*b[b_step+1];
                        d[0] = (float)(t0*alpha + c[0]*beta);
                        d[1] = (float)(t1*alpha + c[1]*beta);
                    }
                }
                else if( a != d )
                {
                    int c_step0 = 1;
                    if( c == zerof )
                    {
                        c_step0 = 0;
                        c_step = 1;
                    }

                    for( i = 0; i < d_size.width; i++, d++, b++, c += c_step0 )
                    {
                        float t0 = a[0]*b[0] + a[1]*b[b_step];
                        float t1 = a[a_step]*b[0] + a[a_step+1]*b[b_step];
                        d[0] = (float)(t0*alpha + c[0]*beta);
                        d[d_step] = (float)(t1*alpha + c[c_step]*beta);
                    }
                }
                else
                    break;
                return;
            case 3:
                if( len == d_size.width && b != d )
                {
                    for( i = 0; i < d_size.height; i++, d += d_step, a += a_step, c += c_step )
                    {
                        float t0 = a[0]*b[0] + a[1]*b[b_step] + a[2]*b[b_step*2];
                        float t1 = a[0]*b[1] + a[1]*b[b_step+1] + a[2]*b[b_step*2+1];
                        float t2 = a[0]*b[2] + a[1]*b[b_step+2] + a[2]*b[b_step*2+2];
                        d[0] = (float)(t0*alpha + c[0]*beta);
                        d[1] = (float)(t1*alpha + c[1]*beta);
                        d[2] = (float)(t2*alpha + c[2]*beta);
                    }
                }
                else if( a != d )
                {
                    int c_step0 = 1;
                    if( c == zerof )
                    {
                        c_step0 = 0;
                        c_step = 1;
                    }

                    for( i = 0; i < d_size.width; i++, d++, b++, c += c_step0 )
                    {
                        float t0 = a[0]*b[0] + a[1]*b[b_step] + a[2]*b[b_step*2];
                        float t1 = a[a_step]*b[0] + a[a_step+1]*b[b_step] + a[a_step+2]*b[b_step*2];
                        float t2 = a[a_step*2]*b[0] + a[a_step*2+1]*b[b_step] + a[a_step*2+2]*b[b_step*2];

                        d[0] = (float)(t0*alpha + c[0]*beta);
                        d[d_step] = (float)(t1*alpha + c[c_step]*beta);
                        d[d_step*2] = (float)(t2*alpha + c[c_step*2]*beta);
                    }
                }
                else
                    break;
                return;
            case 4:
                if( len == d_size.width && b != d )
                {
                    for( i = 0; i < d_size.height; i++, d += d_step, a += a_step, c += c_step )
                    {
                        float t0 = a[0]*b[0] + a[1]*b[b_step] + a[2]*b[b_step*2] + a[3]*b[b_step*3];
                        float t1 = a[0]*b[1] + a[1]*b[b_step+1] + a[2]*b[b_step*2+1] + a[3]*b[b_step*3+1];
                        float t2 = a[0]*b[2] + a[1]*b[b_step+2] + a[2]*b[b_step*2+2] + a[3]*b[b_step*3+2];
                        float t3 = a[0]*b[3] + a[1]*b[b_step+3] + a[2]*b[b_step*2+3] + a[3]*b[b_step*3+3];
                        d[0] = (float)(t0*alpha + c[0]*beta);
                        d[1] = (float)(t1*alpha + c[1]*beta);
                        d[2] = (float)(t2*alpha + c[2]*beta);
                        d[3] = (float)(t3*alpha + c[3]*beta);
                    }
                }
                else if( len <= 16 && a != d )
                {
                    int c_step0 = 1;
                    if( c == zerof )
                    {
                        c_step0 = 0;
                        c_step = 1;
                    }

                    for( i = 0; i < d_size.width; i++, d++, b++, c += c_step0 )
                    {
                        float t0 = a[0]*b[0] + a[1]*b[b_step] + a[2]*b[b_step*2] + a[3]*b[b_step*3];
                        float t1 = a[a_step]*b[0] + a[a_step+1]*b[b_step] +
                                   a[a_step+2]*b[b_step*2] + a[a_step+3]*b[b_step*3];
                        float t2 = a[a_step*2]*b[0] + a[a_step*2+1]*b[b_step] +
                                   a[a_step*2+2]*b[b_step*2] + a[a_step*2+3]*b[b_step*3];
                        float t3 = a[a_step*3]*b[0] + a[a_step*3+1]*b[b_step] +
                                   a[a_step*3+2]*b[b_step*2] + a[a_step*3+3]*b[b_step*3];
                        d[0] = (float)(t0*alpha + c[0]*beta);
                        d[d_step] = (float)(t1*alpha + c[c_step]*beta);
                        d[d_step*2] = (float)(t2*alpha + c[c_step*2]*beta);
                        d[d_step*3] = (float)(t3*alpha + c[c_step*3]*beta);
                    }
                }
                else
                    break;
                return;
            }
        }

        if( type == CV_64F )
        {
            double* d = D.ptr<double>();
            const double *a = A.ptr<double>(),
                         *b = B.ptr<double>(),
                         *c = (const double*)C.data;
            size_t d_step = D.step/sizeof(d[0]),
                   a_step = A.step/sizeof(a[0]),
                   b_step = B.step/sizeof(b[0]),
                   c_step = C.data ? C.step/sizeof(c[0]) : 0;
            if( !c )
                c = zero;

            switch( len )
            {
            case 2:
                if( len == d_size.width && b != d )
                {
                    for( i = 0; i < d_size.height; i++, d += d_step, a += a_step, c += c_step )
                    {
                        double t0 = a[0]*b[0] + a[1]*b[b_step];
                        double t1 = a[0]*b[1] + a[1]*b[b_step+1];
                        d[0] = t0*alpha + c[0]*beta;
                        d[1] = t1*alpha + c[1]*beta;
                    }
                }
                else if( a != d )
                {
                    int c_step0 = 1;
                    if( c == zero )
                    {
                        c_step0 = 0;
                        c_step = 1;
                    }

                    for( i = 0; i < d_size.width; i++, d++, b++, c += c_step0 )
                    {
                        double t0 = a[0]*b[0] + a[1]*b[b_step];
                        double t1 = a[a_step]*b[0] + a[a_step+1]*b[b_step];
                        d[0] = t0*alpha + c[0]*beta;
                        d[d_step] = t1*alpha + c[c_step]*beta;
                    }
                }
                else
                    break;
                return;
            case 3:
                if( len == d_size.width && b != d )
                {
                    for( i = 0; i < d_size.height; i++, d += d_step, a += a_step, c += c_step )
                    {
                        double t0 = a[0]*b[0] + a[1]*b[b_step] + a[2]*b[b_step*2];
                        double t1 = a[0]*b[1] + a[1]*b[b_step+1] + a[2]*b[b_step*2+1];
                        double t2 = a[0]*b[2] + a[1]*b[b_step+2] + a[2]*b[b_step*2+2];
                        d[0] = t0*alpha + c[0]*beta;
                        d[1] = t1*alpha + c[1]*beta;
                        d[2] = t2*alpha + c[2]*beta;
                    }
                }
                else if( a != d )
                {
                    int c_step0 = 1;
                    if( c == zero )
                    {
                        c_step0 = 0;
                        c_step = 1;
                    }

                    for( i = 0; i < d_size.width; i++, d++, b++, c += c_step0 )
                    {
                        double t0 = a[0]*b[0] + a[1]*b[b_step] + a[2]*b[b_step*2];
                        double t1 = a[a_step]*b[0] + a[a_step+1]*b[b_step] + a[a_step+2]*b[b_step*2];
                        double t2 = a[a_step*2]*b[0] + a[a_step*2+1]*b[b_step] + a[a_step*2+2]*b[b_step*2];

                        d[0] = t0*alpha + c[0]*beta;
                        d[d_step] = t1*alpha + c[c_step]*beta;
                        d[d_step*2] = t2*alpha + c[c_step*2]*beta;
                    }
                }
                else
                    break;
                return;
            case 4:
                if( len == d_size.width && b != d )
                {
                    for( i = 0; i < d_size.height; i++, d += d_step, a += a_step, c += c_step )
                    {
                        double t0 = a[0]*b[0] + a[1]*b[b_step] + a[2]*b[b_step*2] + a[3]*b[b_step*3];
                        double t1 = a[0]*b[1] + a[1]*b[b_step+1] + a[2]*b[b_step*2+1] + a[3]*b[b_step*3+1];
                        double t2 = a[0]*b[2] + a[1]*b[b_step+2] + a[2]*b[b_step*2+2] + a[3]*b[b_step*3+2];
                        double t3 = a[0]*b[3] + a[1]*b[b_step+3] + a[2]*b[b_step*2+3] + a[3]*b[b_step*3+3];
                        d[0] = t0*alpha + c[0]*beta;
                        d[1] = t1*alpha + c[1]*beta;
                        d[2] = t2*alpha + c[2]*beta;
                        d[3] = t3*alpha + c[3]*beta;
                    }
                }
                else if( d_size.width <= 16 && a != d )
                {
                    int c_step0 = 1;
                    if( c == zero )
                    {
                        c_step0 = 0;
                        c_step = 1;
                    }

                    for( i = 0; i < d_size.width; i++, d++, b++, c += c_step0 )
                    {
                        double t0 = a[0]*b[0] + a[1]*b[b_step] + a[2]*b[b_step*2] + a[3]*b[b_step*3];
                        double t1 = a[a_step]*b[0] + a[a_step+1]*b[b_step] +
                                    a[a_step+2]*b[b_step*2] + a[a_step+3]*b[b_step*3];
                        double t2 = a[a_step*2]*b[0] + a[a_step*2+1]*b[b_step] +
                                    a[a_step*2+2]*b[b_step*2] + a[a_step*2+3]*b[b_step*3];
                        double t3 = a[a_step*3]*b[0] + a[a_step*3+1]*b[b_step] +
                                    a[a_step*3+2]*b[b_step*2] + a[a_step*3+3]*b[b_step*3];
                        d[0] = t0*alpha + c[0]*beta;
                        d[d_step] = t1*alpha + c[c_step]*beta;
                        d[d_step*2] = t2*alpha + c[c_step*2]*beta;
                        d[d_step*3] = t3*alpha + c[c_step*3]*beta;
                    }
                }
                else
                    break;
                return;
            }
        }
    }

    {
        size_t b_step = B.step;
        GEMMSingleMulFunc singleMulFunc;
        GEMMBlockMulFunc blockMulFunc;
        GEMMStoreFunc storeFunc;
        Mat *matD = &D;
        const uchar* Cdata = C.data;
        size_t Cstep = C.data ? (size_t)C.step : 0;
        AutoBuffer<uchar> buf;

        if( type == CV_32FC1 )
        {
            singleMulFunc = (GEMMSingleMulFunc)GEMMSingleMul_32f;
            blockMulFunc = (GEMMBlockMulFunc)GEMMBlockMul_32f;
            storeFunc = (GEMMStoreFunc)GEMMStore_32f;
        }
        else if( type == CV_64FC1 )
        {
            singleMulFunc = (GEMMSingleMulFunc)GEMMSingleMul_64f;
            blockMulFunc = (GEMMBlockMulFunc)GEMMBlockMul_64f;
            storeFunc = (GEMMStoreFunc)GEMMStore_64f;
        }
        else if( type == CV_32FC2 )
        {
            singleMulFunc = (GEMMSingleMulFunc)GEMMSingleMul_32fc;
            blockMulFunc = (GEMMBlockMulFunc)GEMMBlockMul_32fc;
            storeFunc = (GEMMStoreFunc)GEMMStore_32fc;
        }
        else
        {
            CV_Assert( type == CV_64FC2 );
            singleMulFunc = (GEMMSingleMulFunc)GEMMSingleMul_64fc;
            blockMulFunc = (GEMMBlockMulFunc)GEMMBlockMul_64fc;
            storeFunc = (GEMMStoreFunc)GEMMStore_64fc;
        }

        if( (d_size.width == 1 || len == 1) && !(flags & GEMM_2_T) && B.isContinuous() )
        {
            b_step = d_size.width == 1 ? 0 : CV_ELEM_SIZE(type);
            flags |= GEMM_2_T;
        }

        /*if( (d_size.width | d_size.height | len) >= 16 && icvBLAS_GEMM_32f_p != 0 )
        {
            blas_func = type == CV_32FC1 ? (icvBLAS_GEMM_32f_t)icvBLAS_GEMM_32f_p :
                        type == CV_64FC1 ? (icvBLAS_GEMM_32f_t)icvBLAS_GEMM_64f_p :
                        type == CV_32FC2 ? (icvBLAS_GEMM_32f_t)icvBLAS_GEMM_32fc_p :
                        type == CV_64FC2 ? (icvBLAS_GEMM_32f_t)icvBLAS_GEMM_64fc_p : 0;
        }

        if( blas_func )
        {
            const char* transa = flags & GEMM_1_T ? "t" : "n";
            const char* transb = flags & GEMM_2_T ? "t" : "n";
            int lda, ldb, ldd;

            if( C->data.ptr )
            {
                if( C->data.ptr != D->data.ptr )
                {
                    if( !(flags & GEMM_3_T) )
                        cvCopy( C, D );
                    else
                        cvTranspose( C, D );
                }
            }

            if( CV_MAT_DEPTH(type) == CV_32F )
            {
                Complex32f _alpha, _beta;

                lda = A->step/sizeof(float);
                ldb = b_step/sizeof(float);
                ldd = D->step/sizeof(float);
                _alpha.re = (float)alpha;
                _alpha.im = 0;
                _beta.re = C->data.ptr ? (float)beta : 0;
                _beta.im = 0;
                if( CV_MAT_CN(type) == 2 )
                    lda /= 2, ldb /= 2, ldd /= 2;

                blas_func( transb, transa, &d_size.width, &d_size.height, &len,
                           &_alpha, B->data.ptr, &ldb, A->data.ptr, &lda,
                           &_beta, D->data.ptr, &ldd );
            }
            else
            {
                CvComplex64f _alpha, _beta;

                lda = A->step/sizeof(double);
                ldb = b_step/sizeof(double);
                ldd = D->step/sizeof(double);
                _alpha.re = alpha;
                _alpha.im = 0;
                _beta.re = C->data.ptr ? beta : 0;
                _beta.im = 0;
                if( CV_MAT_CN(type) == 2 )
                    lda /= 2, ldb /= 2, ldd /= 2;

                blas_func( transb, transa, &d_size.width, &d_size.height, &len,
                           &_alpha, B->data.ptr, &ldb, A->data.ptr, &lda,
                           &_beta, D->data.ptr, &ldd );
            }
        }
        else*/ if( ((d_size.height <= block_lin_size/2 || d_size.width <= block_lin_size/2) &&
                    len <= 10000) || len <= 10 ||
                   (d_size.width <= block_lin_size &&
                    d_size.height <= block_lin_size && len <= block_lin_size) )
        {
            singleMulFunc( A.ptr(), A.step, B.ptr(), b_step, Cdata, Cstep,
                           matD->ptr(), matD->step, a_size, d_size, alpha, beta, flags );
        }
        else
        {
            int is_a_t = flags & GEMM_1_T;
            int is_b_t = flags & GEMM_2_T;
            int elem_size = CV_ELEM_SIZE(type);
            int dk0_1, dk0_2;
            size_t a_buf_size = 0, b_buf_size, d_buf_size;
            uchar* a_buf = 0;
            uchar* b_buf = 0;
            uchar* d_buf = 0;
            int j, k, di = 0, dj = 0, dk = 0;
            int dm0, dn0, dk0;
            size_t a_step0, a_step1, b_step0, b_step1, c_step0, c_step1;
            int work_elem_size = elem_size << (CV_MAT_DEPTH(type) == CV_32F ? 1 : 0);

            if( !is_a_t )
                a_step0 = A.step, a_step1 = elem_size;
            else
                a_step0 = elem_size, a_step1 = A.step;

            if( !is_b_t )
                b_step0 = b_step, b_step1 = elem_size;
            else
                b_step0 = elem_size, b_step1 = b_step;

            if( C.empty() )
            {
                c_step0 = c_step1 = 0;
                flags &= ~GEMM_3_T;
            }
            else if( !(flags & GEMM_3_T) )
                c_step0 = C.step, c_step1 = elem_size;
            else
                c_step0 = elem_size, c_step1 = C.step;

            dm0 = std::min( block_lin_size, d_size.height );
            dn0 = std::min( block_lin_size, d_size.width );
            dk0_1 = block_size / dm0;
            dk0_2 = block_size / dn0;
            dk0 = std::min( dk0_1, dk0_2 );
            dk0 = std::min( dk0, len );
            if( dk0*dm0 > block_size )
                dm0 = block_size / dk0;
            if( dk0*dn0 > block_size )
                dn0 = block_size / dk0;

            dk0_1 = (dn0+dn0/8+2) & -2;
            b_buf_size = (size_t)(dk0+dk0/8+1)*dk0_1*elem_size;
            d_buf_size = (size_t)(dk0+dk0/8+1)*dk0_1*work_elem_size;

            if( is_a_t )
            {
                a_buf_size = (size_t)(dm0+dm0/8+1)*((dk0+dk0/8+2)&-2)*elem_size;
                flags &= ~GEMM_1_T;
            }

            buf.allocate(d_buf_size + b_buf_size + a_buf_size);
            d_buf = buf.data();
            b_buf = d_buf + d_buf_size;

            if( is_a_t )
                a_buf = b_buf + b_buf_size;

            for( i = 0; i < d_size.height; i += di )
            {
                di = dm0;
                if( i + di >= d_size.height || 8*(i + di) + di > 8*d_size.height )
                    di = d_size.height - i;

                for( j = 0; j < d_size.width; j += dj )
                {
                    uchar* _d = matD->ptr() + i*matD->step + j*elem_size;
                    const uchar* _c = Cdata + i*c_step0 + j*c_step1;
                    size_t _d_step = matD->step;
                    dj = dn0;

                    if( j + dj >= d_size.width || 8*(j + dj) + dj > 8*d_size.width )
                        dj = d_size.width - j;

                    flags &= 15;
                    if( dk0 < len )
                    {
                        _d = d_buf;
                        _d_step = dj*work_elem_size;
                    }

                    for( k = 0; k < len; k += dk )
                    {
                        const uchar* _a = A.ptr() + i*a_step0 + k*a_step1;
                        size_t _a_step = A.step;
                        const uchar* _b = B.ptr() + k*b_step0 + j*b_step1;
                        size_t _b_step = b_step;
                        Size a_bl_size;

                        dk = dk0;
                        if( k + dk >= len || 8*(k + dk) + dk > 8*len )
                            dk = len - k;

                        if( !is_a_t )
                            a_bl_size.width = dk, a_bl_size.height = di;
                        else
                            a_bl_size.width = di, a_bl_size.height = dk;

                        if( a_buf && is_a_t )
                        {
                            _a_step = dk*elem_size;
                            GEMM_TransposeBlock( _a, A.step, a_buf, _a_step, a_bl_size, elem_size );
                            std::swap( a_bl_size.width, a_bl_size.height );
                            _a = a_buf;
                        }

                        if( dj < d_size.width )
                        {
                            Size b_size;
                            if( !is_b_t )
                                b_size.width = dj, b_size.height = dk;
                            else
                                b_size.width = dk, b_size.height = dj;

                            _b_step = b_size.width*elem_size;
                            GEMM_CopyBlock( _b, b_step, b_buf, _b_step, b_size, elem_size );
                            _b = b_buf;
                        }

                        if( dk0 < len )
                            blockMulFunc( _a, _a_step, _b, _b_step, _d, _d_step,
                                          a_bl_size, Size(dj,di), flags );
                        else
                            singleMulFunc( _a, _a_step, _b, _b_step, _c, Cstep,
                                           _d, _d_step, a_bl_size, Size(dj,di), alpha, beta, flags );
                        flags |= 16;
                    }

                    if( dk0 < len )
                        storeFunc( _c, Cstep, _d, _d_step,
                                   matD->ptr(i) + j*elem_size,
                                   matD->step, Size(dj,di), alpha, beta, flags );
                }
            }
        }
    }
}

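// Shared wrapper for the HAL entry points below: reconstructs Mat headers of
// the correct shapes from the raw pointers/steps and the transpose flags,
// then forwards to gemmImpl.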
template <typename fptype> inline static void
callGemmImpl(const fptype *src1, size_t src1_step, const fptype *src2, size_t src2_step, fptype alpha,
             const fptype *src3, size_t src3_step, fptype beta, fptype *dst, size_t dst_step, int m_a, int n_a, int n_d, int flags, int type)
{
    CV_StaticAssert(GEMM_1_T == CV_HAL_GEMM_1_T, "Incompatible GEMM_1_T flag in HAL");
    CV_StaticAssert(GEMM_2_T == CV_HAL_GEMM_2_T, "Incompatible GEMM_2_T flag in HAL");
    CV_StaticAssert(GEMM_3_T == CV_HAL_GEMM_3_T, "Incompatible GEMM_3_T flag in HAL");

    int b_m, b_n, c_m, c_n, m_d;

    if(flags & GEMM_2_T)
    {
        b_m = n_d;
        if(flags & GEMM_1_T )
        {
            b_n = m_a;
            m_d = n_a;
        }
        else
        {
            b_n = n_a;
            m_d = m_a;
        }
    }
    else
    {
        b_n = n_d;
        if(flags & GEMM_1_T )
        {
            b_m = m_a;
            m_d = n_a;
        }
        else
        {
            m_d = m_a;
            b_m = n_a;
        }
    }

    if(flags & GEMM_3_T)
    {
        c_m = n_d;
        c_n = m_d;
    }
    else
    {
        c_m = m_d;
        c_n = n_d;
    }

    Mat A, B, C;
    if(src1 != NULL)
        A = Mat(m_a, n_a, type, (void*)src1, src1_step);
    if(src2 != NULL)
        B = Mat(b_m, b_n, type, (void*)src2, src2_step);
    if(src3 != NULL && beta != 0.0)
        C = Mat(c_m, c_n, type, (void*)src3, src3_step);
    Mat D(m_d, n_d, type, (void*)dst, dst_step);

    gemmImpl(A, B, alpha, C, beta, D, flags);
}

}

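// HAL entry points; CALL_HAL gives an external HAL implementation the first
// chance to handle the call before falling back to callGemmImpl.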
void cv::hal::gemm32f(const float* src1, size_t src1_step, const float* src2, size_t src2_step,
                      float alpha, const float* src3, size_t src3_step, float beta, float* dst, size_t dst_step,
                      int m_a, int n_a, int n_d, int flags)
{

    CALL_HAL(gemm32f, cv_hal_gemm32f, src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags)
    callGemmImpl(src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags, CV_32F);
}

void cv::hal::gemm64f(const double* src1, size_t src1_step, const double* src2, size_t src2_step,
                      double alpha, const double* src3, size_t src3_step, double beta, double* dst, size_t dst_step,
                      int m_a, int n_a, int n_d, int flags)
{
    CALL_HAL(gemm64f, cv_hal_gemm64f, src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags)
    callGemmImpl(src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags, CV_64F);
}

CV_EXPORTS void cv::hal::gemm32fc(const float* src1, size_t src1_step, const float* src2, size_t src2_step,
                                  float alpha, const float* src3, size_t src3_step, float beta, float* dst, size_t dst_step,
                                  int m_a, int n_a, int n_d, int flags)
{
    CALL_HAL(gemm32fc, cv_hal_gemm32fc, src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags)
    callGemmImpl(src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags, CV_32FC2);
}

CV_EXPORTS void cv::hal::gemm64fc(const double* src1, size_t src1_step, const double* src2, size_t src2_step,
                                  double alpha, const double* src3, size_t src3_step, double beta, double* dst, size_t dst_step,
                                  int m_a, int n_a, int n_d, int flags)
{
    CALL_HAL(gemm64fc, cv_hal_gemm64fc, src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags)
    callGemmImpl(src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags, CV_64FC2);
}

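// Public cv::gemm: tries the OpenCL paths first, validates shapes and types,
// untransposes C in place when it aliases D, and routes to the type-specific
// HAL function, using a temporary proxy when D aliases A or B.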
void cv::gemm( InputArray matA, InputArray matB, double alpha,
               InputArray matC, double beta, OutputArray _matD, int flags )
{
#ifdef HAVE_CLAMDBLAS
    CV_OCL_RUN(ocl::haveAmdBlas() && matA.dims() <= 2 && matB.dims() <= 2 && matC.dims() <= 2 && _matD.isUMat() &&
               matA.cols() > 20 && matA.rows() > 20 && matB.cols() > 20, // since it works incorrectly for small sizes
               ocl_gemm_amdblas(matA, matB, alpha, matC, beta, _matD, flags))
#endif

#ifdef HAVE_OPENCL
    CV_OCL_RUN(_matD.isUMat() && matA.dims() <= 2 && matB.dims() <= 2 && matC.dims() <= 2,
               ocl_gemm(matA, matB, alpha, matC, beta, _matD, flags))
#endif

    Mat A = matA.getMat(), B = matB.getMat(), C = beta != 0.0 ? matC.getMat() : Mat();
    Size a_size = A.size(), d_size;
    int len = 0, type = A.type();

    CV_Assert_N( type == B.type(), (type == CV_32FC1 || type == CV_64FC1 || type == CV_32FC2 || type == CV_64FC2) );

    switch( flags & (GEMM_1_T|GEMM_2_T) )
    {
    case 0:
        d_size = Size( B.cols, a_size.height );
        len = B.rows;
        CV_Assert( a_size.width == len );
        break;
    case 1:
        d_size = Size( B.cols, a_size.width );
        len = B.rows;
        CV_Assert( a_size.height == len );
        break;
    case 2:
        d_size = Size( B.rows, a_size.height );
        len = B.cols;
        CV_Assert( a_size.width == len );
        break;
    case 3:
        d_size = Size( B.rows, a_size.width );
        len = B.cols;
        CV_Assert( a_size.height == len );
        break;
    }

    if( !C.empty() )
    {
        CV_Assert_N( C.type() == type,
                     (((flags&GEMM_3_T) == 0 && C.rows == d_size.height && C.cols == d_size.width) ||
                      ((flags&GEMM_3_T) != 0 && C.rows == d_size.width && C.cols == d_size.height)));
    }

    _matD.create( d_size.height, d_size.width, type );
    Mat D = _matD.getMat();
    if( (flags & GEMM_3_T) != 0 && C.data == D.data )
    {
        transpose( C, C );
        flags &= ~GEMM_3_T;
    }

    Mat *DProxyPtr = &D, DProxy;
    if( D.data == A.data || D.data == B.data )
    {
        DProxy = Mat(d_size.height, d_size.width, D.type());
        DProxyPtr = &DProxy;
    }

    if( type == CV_32FC1 )
        hal::gemm32f(A.ptr<float>(), A.step, B.ptr<float>(), B.step, static_cast<float>(alpha),
                     C.ptr<float>(), C.step, static_cast<float>(beta),
                     DProxyPtr->ptr<float>(), DProxyPtr->step,
                     a_size.height, a_size.width, DProxyPtr->cols, flags);
    else if( type == CV_64FC1 )
        hal::gemm64f(A.ptr<double>(), A.step, B.ptr<double>(), B.step, alpha,
                     C.ptr<double>(), C.step, beta,
                     DProxyPtr->ptr<double>(), DProxyPtr->step,
                     a_size.height, a_size.width, DProxyPtr->cols, flags);
    else if( type == CV_32FC2 )
        hal::gemm32fc(A.ptr<float>(), A.step, B.ptr<float>(), B.step, static_cast<float>(alpha),
                      C.ptr<float>(), C.step, static_cast<float>(beta),
                      DProxyPtr->ptr<float>(), DProxyPtr->step,
                      a_size.height, a_size.width, DProxyPtr->cols, flags);
    else
    {
        CV_Assert( type == CV_64FC2 );
        hal::gemm64fc(A.ptr<double>(), A.step, B.ptr<double>(), B.step, alpha,
                      C.ptr<double>(), C.step, beta,
                      D.ptr<double>(), D.step,
                      a_size.height, a_size.width, DProxyPtr->cols, flags);
    }

    if(DProxyPtr != &D)
        DProxyPtr->copyTo(D);
}

/****************************************************************************************\
*                                        Transform                                       *
\****************************************************************************************/

namespace cv
{

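// Generic per-element affine transform: for each src pixel, dst = M*src + b,
// where m points to a dcn x (scn+1) row-major matrix. The common channel
// combinations are unrolled; the final branch handles arbitrary scn/dcn.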
template<typename T, typename WT> static void
transform_( const T* src, T* dst, const WT* m, int len, int scn, int dcn )
{
    int x;

    if( scn == 2 && dcn == 2 )
    {
        for( x = 0; x < len*2; x += 2 )
        {
            WT v0 = src[x], v1 = src[x+1];
            T t0 = saturate_cast<T>(m[0]*v0 + m[1]*v1 + m[2]);
            T t1 = saturate_cast<T>(m[3]*v0 + m[4]*v1 + m[5]);
            dst[x] = t0; dst[x+1] = t1;
        }
    }
    else if( scn == 3 && dcn == 3 )
    {
        for( x = 0; x < len*3; x += 3 )
        {
            WT v0 = src[x], v1 = src[x+1], v2 = src[x+2];
            T t0 = saturate_cast<T>(m[0]*v0 + m[1]*v1 + m[2]*v2 + m[3]);
            T t1 = saturate_cast<T>(m[4]*v0 + m[5]*v1 + m[6]*v2 + m[7]);
            T t2 = saturate_cast<T>(m[8]*v0 + m[9]*v1 + m[10]*v2 + m[11]);
            dst[x] = t0; dst[x+1] = t1; dst[x+2] = t2;
        }
    }
    else if( scn == 3 && dcn == 1 )
    {
        for( x = 0; x < len; x++, src += 3 )
            dst[x] = saturate_cast<T>(m[0]*src[0] + m[1]*src[1] + m[2]*src[2] + m[3]);
    }
    else if( scn == 4 && dcn == 4 )
    {
        for( x = 0; x < len*4; x += 4 )
        {
            WT v0 = src[x], v1 = src[x+1], v2 = src[x+2], v3 = src[x+3];
            T t0 = saturate_cast<T>(m[0]*v0 + m[1]*v1 + m[2]*v2 + m[3]*v3 + m[4]);
            T t1 = saturate_cast<T>(m[5]*v0 + m[6]*v1 + m[7]*v2 + m[8]*v3 + m[9]);
            dst[x] = t0; dst[x+1] = t1;
            t0 = saturate_cast<T>(m[10]*v0 + m[11]*v1 + m[12]*v2 + m[13]*v3 + m[14]);
            t1 = saturate_cast<T>(m[15]*v0 + m[16]*v1 + m[17]*v2 + m[18]*v3 + m[19]);
            dst[x+2] = t0; dst[x+3] = t1;
        }
    }
    else
    {
        for( x = 0; x < len; x++, src += scn, dst += dcn )
        {
            const WT* _m = m;
            int j, k;
            for( j = 0; j < dcn; j++, _m += scn + 1 )
            {
                WT s = _m[scn];
                for( k = 0; k < scn; k++ )
                    s += _m[k]*src[k];
                dst[j] = saturate_cast<T>(s);
            }
        }
    }
}

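// SIMD helpers: load3x3Matrix splits a 3x4 affine matrix into column vectors
// for v_matmuladd; v_matmulvec applies the fixed-point 3x3 matrix (plus
// offset m3) to two interleaved BGR pixels packed into one v_int16x8.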
#if CV_SIMD128
static inline void
load3x3Matrix(const float* m, v_float32x4& m0, v_float32x4& m1, v_float32x4& m2, v_float32x4& m3)
{
    m0 = v_float32x4(m[0], m[4], m[8], 0);
    m1 = v_float32x4(m[1], m[5], m[9], 0);
    m2 = v_float32x4(m[2], m[6], m[10], 0);
    m3 = v_float32x4(m[3], m[7], m[11], 0);
}

static inline v_int16x8
v_matmulvec(const v_int16x8 &v0, const v_int16x8 &m0, const v_int16x8 &m1, const v_int16x8 &m2, const v_int32x4 &m3, const int BITS)
{
    // v0 : 0 b0 g0 r0 b1 g1 r1 ?
    v_int32x4 t0 = v_dotprod(v0, m0); // a0 b0 a1 b1
    v_int32x4 t1 = v_dotprod(v0, m1); // c0 d0 c1 d1
    v_int32x4 t2 = v_dotprod(v0, m2); // e0 f0 e1 f1
    v_int32x4 t3 = v_setzero_s32();
    v_int32x4 s0, s1, s2, s3;
    v_transpose4x4(t0, t1, t2, t3, s0, s1, s2, s3);
    s0 = s0 + s1 + m3; // B0 G0 R0 ?
    s2 = s2 + s3 + m3; // B1 G1 R1 ?

    s0 = s0 >> BITS;
    s2 = s2 >> BITS;

    v_int16x8 result = v_pack(s0, v_setzero_s32()); // B0 G0 R0 0 0 0 0 0
    result = v_reinterpret_as_s16(v_reinterpret_as_s64(result) << 16); // 0 B0 G0 R0 0 0 0 0
    result = result | v_pack(v_setzero_s32(), s2); // 0 B0 G0 R0 B1 G1 R1 0
    return result;
}
#endif

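// 8-bit transform: when the 3x3 matrix fits the fixed-point range, eight
// pixels are processed per iteration with the Q10 (BITS = 10) integer path
// above; a scalar fixed-point loop handles the tail, and all other cases
// fall back to the float transform_.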
static void
transform_8u( const uchar* src, uchar* dst, const float* m, int len, int scn, int dcn )
{
#if CV_SIMD128
    const int BITS = 10, SCALE = 1 << BITS;
    const float MAX_M = (float)(1 << (15 - BITS));

    if( hasSIMD128() && scn == 3 && dcn == 3 &&
        std::abs(m[0]) < MAX_M && std::abs(m[1]) < MAX_M && std::abs(m[2]) < MAX_M && std::abs(m[3]) < MAX_M*256 &&
        std::abs(m[4]) < MAX_M && std::abs(m[5]) < MAX_M && std::abs(m[6]) < MAX_M && std::abs(m[7]) < MAX_M*256 &&
        std::abs(m[8]) < MAX_M && std::abs(m[9]) < MAX_M && std::abs(m[10]) < MAX_M && std::abs(m[11]) < MAX_M*256 )
    {
        const int nChannels = 3;
        const int cWidth = v_int16x8::nlanes;
        // faster fixed-point transformation
        short m00 = saturate_cast<short>(m[0]*SCALE), m01 = saturate_cast<short>(m[1]*SCALE),
              m02 = saturate_cast<short>(m[2]*SCALE), m10 = saturate_cast<short>(m[4]*SCALE),
              m11 = saturate_cast<short>(m[5]*SCALE), m12 = saturate_cast<short>(m[6]*SCALE),
              m20 = saturate_cast<short>(m[8]*SCALE), m21 = saturate_cast<short>(m[9]*SCALE),
              m22 = saturate_cast<short>(m[10]*SCALE);
        int m03 = saturate_cast<int>((m[3]+0.5f)*SCALE), m13 = saturate_cast<int>((m[7]+0.5f)*SCALE ),
            m23 = saturate_cast<int>((m[11]+0.5f)*SCALE);

        v_int16x8 m0 = v_int16x8(0, m00, m01, m02, m00, m01, m02, 0);
        v_int16x8 m1 = v_int16x8(0, m10, m11, m12, m10, m11, m12, 0);
        v_int16x8 m2 = v_int16x8(0, m20, m21, m22, m20, m21, m22, 0);
        v_int32x4 m3 = v_int32x4(m03, m13, m23, 0);
        int x = 0;

        for (; x <= (len - cWidth) * nChannels; x += cWidth * nChannels)
        {
            // load 8 pixels
            v_int16x8 v0 = v_reinterpret_as_s16(v_load_expand(src + x));
            v_int16x8 v1 = v_reinterpret_as_s16(v_load_expand(src + x + cWidth));
            v_int16x8 v2 = v_reinterpret_as_s16(v_load_expand(src + x + cWidth * 2));
            v_int16x8 v3;

            // rotate and pack
            v3 = v_rotate_right<1>(v2);     // 0 b6 g6 r6 b7 g7 r7 0
            v2 = v_rotate_left <5>(v2, v1); // 0 b4 g4 r4 b5 g5 r5 0
            v1 = v_rotate_left <3>(v1, v0); // 0 b2 g2 r2 b3 g3 r3 0
            v0 = v_rotate_left <1>(v0);     // 0 b0 g0 r0 b1 g1 r1 0

            // multiply with matrix and normalize
            v0 = v_matmulvec(v0, m0, m1, m2, m3, BITS); // 0 B0 G0 R0 B1 G1 R1 0
            v1 = v_matmulvec(v1, m0, m1, m2, m3, BITS); // 0 B2 G2 R2 B3 G3 R3 0
            v2 = v_matmulvec(v2, m0, m1, m2, m3, BITS); // 0 B4 G4 R4 B5 G5 R5 0
            v3 = v_matmulvec(v3, m0, m1, m2, m3, BITS); // 0 B6 G6 R6 B7 G7 R7 0

            // narrow down as uint8x16
            v_uint8x16 z0 = v_pack_u(v0, v_setzero_s16()); // 0 B0 G0 R0 B1 G1 R1 0 0 0 0 0 0 0 0 0
            v_uint8x16 z1 = v_pack_u(v1, v_setzero_s16()); // 0 B2 G2 R2 B3 G3 R3 0 0 0 0 0 0 0 0 0
            v_uint8x16 z2 = v_pack_u(v2, v_setzero_s16()); // 0 B4 G4 R4 B5 G5 R5 0 0 0 0 0 0 0 0 0
            v_uint8x16 z3 = v_pack_u(v3, v_setzero_s16()); // 0 B6 G6 R6 B7 G7 R7 0 0 0 0 0 0 0 0 0

            // rotate and pack
            z0 = v_reinterpret_as_u8(v_reinterpret_as_u64(z0) >> 8) | v_reinterpret_as_u8(v_reinterpret_as_u64(z1) << 40);  // B0 G0 R0 B1 G1 R1 B2 G2 0 0 0 0 0 0 0 0
            z1 = v_reinterpret_as_u8(v_reinterpret_as_u64(z1) >> 24) | v_reinterpret_as_u8(v_reinterpret_as_u64(z2) << 24); // R2 B3 G3 R3 B4 G4 R4 B5 0 0 0 0 0 0 0 0
            z2 = v_reinterpret_as_u8(v_reinterpret_as_u64(z2) >> 40) | v_reinterpret_as_u8(v_reinterpret_as_u64(z3) << 8);  // G5 R6 B6 G6 R6 B7 G7 R7 0 0 0 0 0 0 0 0

            // store on memory
            v_store_low(dst + x, z0);
            v_store_low(dst + x + cWidth, z1);
            v_store_low(dst + x + cWidth * 2, z2);
        }

        for( ; x < len * nChannels; x += nChannels )
        {
            int v0 = src[x], v1 = src[x+1], v2 = src[x+2];
            uchar t0 = saturate_cast<uchar>((m00*v0 + m01*v1 + m02*v2 + m03)>>BITS);
            uchar t1 = saturate_cast<uchar>((m10*v0 + m11*v1 + m12*v2 + m13)>>BITS);
            uchar t2 = saturate_cast<uchar>((m20*v0 + m21*v1 + m22*v2 + m23)>>BITS);
            dst[x] = t0; dst[x+1] = t1; dst[x+2] = t2;
        }
        return;
    }
#endif

    transform_(src, dst, m, len, scn, dcn);
}

1817
transform_16u( const ushort* src, ushort* dst, const float* m, int len, int scn, int dcn )
1818
{
1819
#if CV_SIMD128 && !defined(__aarch64__)
1820
if( hasSIMD128() && scn == 3 && dcn == 3 )
1821
{
1822
const int nChannels = 3;
1823
const int cWidth = v_float32x4::nlanes;
1824
v_int16x8 delta = v_int16x8(0, -32768, -32768, -32768, -32768, -32768, -32768, 0);
1825
v_float32x4 m0, m1, m2, m3;
1826
load3x3Matrix(m, m0, m1, m2, m3);
1827
m3 -= v_float32x4(32768.f, 32768.f, 32768.f, 0.f);
1828
1829
int x = 0;
1830
for( ; x <= (len - cWidth) * nChannels; x += cWidth * nChannels )
1831
{
1832
// load 4 pixels
1833
v_uint16x8 v0_16 = v_load(src + x); // b0 g0 r0 b1 g1 r1 b2 g2
1834
v_uint16x8 v2_16 = v_load_low(src + x + cWidth * 2); // r2 b3 g3 r3 ? ? ? ?
1835
1836
// expand to 4 vectors
1837
v_uint32x4 v0_32, v1_32, v2_32, v3_32, dummy_32;
1838
v_expand(v_rotate_right<3>(v0_16), v1_32, dummy_32); // b1 g1 r1
1839
v_expand(v_rotate_right<1>(v2_16), v3_32, dummy_32); // b3 g3 r3
1840
v_expand(v_rotate_right<6>(v0_16, v2_16), v2_32, dummy_32); // b2 g2 r2
1841
v_expand(v0_16, v0_32, dummy_32); // b0 g0 r0
1842
1843
// convert to float32x4
1844
v_float32x4 x0 = v_cvt_f32(v_reinterpret_as_s32(v0_32)); // b0 g0 r0
1845
v_float32x4 x1 = v_cvt_f32(v_reinterpret_as_s32(v1_32)); // b1 g1 r1
1846
v_float32x4 x2 = v_cvt_f32(v_reinterpret_as_s32(v2_32)); // b2 g2 r2
1847
v_float32x4 x3 = v_cvt_f32(v_reinterpret_as_s32(v3_32)); // b3 g3 r3
1848
1849
// multiply and convert back to int32x4
1850
v_int32x4 y0, y1, y2, y3;
1851
y0 = v_round(v_matmuladd(x0, m0, m1, m2, m3)); // B0 G0 R0
1852
y1 = v_round(v_matmuladd(x1, m0, m1, m2, m3)); // B1 G1 R1
1853
y2 = v_round(v_matmuladd(x2, m0, m1, m2, m3)); // B2 G2 R2
1854
y3 = v_round(v_matmuladd(x3, m0, m1, m2, m3)); // B3 G3 R3
1855
1856
// narrow down to int16x8
1857
v_int16x8 v0 = v_add_wrap(v_pack(v_rotate_left<1>(y0), y1), delta); // 0 B0 G0 R0 B1 G1 R1 0
1858
v_int16x8 v2 = v_add_wrap(v_pack(v_rotate_left<1>(y2), y3), delta); // 0 B2 G2 R2 B3 G3 R3 0
1859
1860
// rotate and pack
1861
v0 = v_rotate_right<1>(v0) | v_rotate_left<5>(v2); // B0 G0 R0 B1 G1 R1 B2 G2
1862
v2 = v_rotate_right<3>(v2); // R2 B3 G3 R3 0 0 0 0
1863
1864
// store 4 pixels
1865
v_store(dst + x, v_reinterpret_as_u16(v0));
1866
v_store_low(dst + x + cWidth * 2, v_reinterpret_as_u16(v2));
1867
}
1868
1869
for( ; x < len * nChannels; x += nChannels )
1870
{
1871
float v0 = src[x], v1 = src[x + 1], v2 = src[x + 2];
1872
ushort t0 = saturate_cast<ushort>(m[0] * v0 + m[1] * v1 + m[2] * v2 + m[3]);
1873
ushort t1 = saturate_cast<ushort>(m[4] * v0 + m[5] * v1 + m[6] * v2 + m[7]);
1874
ushort t2 = saturate_cast<ushort>(m[8] * v0 + m[9] * v1 + m[10] * v2 + m[11]);
1875
dst[x] = t0; dst[x + 1] = t1; dst[x + 2] = t2;
1876
}
1877
return;
1878
}
1879
#endif
1880
1881
transform_(src, dst, m, len, scn, dcn);
1882
}
1883
1884
static void
1885
transform_32f( const float* src, float* dst, const float* m, int len, int scn, int dcn )
1886
{
1887
#if CV_SIMD128 && !defined(__aarch64__)
1888
if( hasSIMD128() )
1889
{
1890
int x = 0;
1891
if( scn == 3 && dcn == 3 )
1892
{
1893
const int cWidth = 3;
1894
v_float32x4 m0, m1, m2, m3;
1895
load3x3Matrix(m, m0, m1, m2, m3);
1896
1897
for( ; x < (len - 1)*cWidth; x += cWidth )
1898
{
1899
v_float32x4 x0 = v_load(src + x);
1900
v_float32x4 y0 = v_matmuladd(x0, m0, m1, m2, m3);
1901
v_store_low(dst + x, y0);
1902
dst[x + 2] = v_combine_high(y0, y0).get0();
1903
}
1904
1905
for( ; x < len*cWidth; x += cWidth )
1906
{
1907
float v0 = src[x], v1 = src[x+1], v2 = src[x+2];
1908
float t0 = saturate_cast<float>(m[0]*v0 + m[1]*v1 + m[2]*v2 + m[3]);
1909
float t1 = saturate_cast<float>(m[4]*v0 + m[5]*v1 + m[6]*v2 + m[7]);
1910
float t2 = saturate_cast<float>(m[8]*v0 + m[9]*v1 + m[10]*v2 + m[11]);
1911
dst[x] = t0; dst[x+1] = t1; dst[x+2] = t2;
1912
}
1913
return;
1914
}
1915
1916
if( scn == 4 && dcn == 4 )
1917
{
1918
const int cWidth = 4;
1919
v_float32x4 m0 = v_float32x4(m[0], m[5], m[10], m[15]);
1920
v_float32x4 m1 = v_float32x4(m[1], m[6], m[11], m[16]);
1921
v_float32x4 m2 = v_float32x4(m[2], m[7], m[12], m[17]);
1922
v_float32x4 m3 = v_float32x4(m[3], m[8], m[13], m[18]);
1923
v_float32x4 m4 = v_float32x4(m[4], m[9], m[14], m[19]);
1924
1925
for( ; x < len*cWidth; x += cWidth )
1926
{
1927
v_float32x4 x0 = v_load(src + x);
1928
v_float32x4 y0 = v_matmul(x0, m0, m1, m2, m3) + m4;
1929
v_store(dst + x, y0);
1930
}
1931
return;
1932
}
1933
}
1934
#endif
1935
1936
transform_(src, dst, m, len, scn, dcn);
1937
}
1938
1939
1940
static void
1941
transform_8s(const schar* src, schar* dst, const float* m, int len, int scn, int dcn)
1942
{
1943
transform_(src, dst, m, len, scn, dcn);
1944
}
1945
1946
static void
1947
transform_16s(const short* src, short* dst, const float* m, int len, int scn, int dcn)
1948
{
1949
transform_(src, dst, m, len, scn, dcn);
1950
}
1951
1952
static void
1953
transform_32s(const int* src, int* dst, const double* m, int len, int scn, int dcn)
1954
{
1955
transform_(src, dst, m, len, scn, dcn);
1956
}
1957
1958
static void
1959
transform_64f(const double* src, double* dst, const double* m, int len, int scn, int dcn)
1960
{
1961
transform_(src, dst, m, len, scn, dcn);
1962
}
1963
1964
template<typename T, typename WT> static void
1965
diagtransform_( const T* src, T* dst, const WT* m, int len, int cn, int )
1966
{
1967
int x;
1968
1969
if( cn == 2 )
1970
{
1971
for( x = 0; x < len*2; x += 2 )
1972
{
1973
T t0 = saturate_cast<T>(m[0]*src[x] + m[2]);
1974
T t1 = saturate_cast<T>(m[4]*src[x+1] + m[5]);
1975
dst[x] = t0; dst[x+1] = t1;
1976
}
1977
}
1978
else if( cn == 3 )
1979
{
1980
for( x = 0; x < len*3; x += 3 )
1981
{
1982
T t0 = saturate_cast<T>(m[0]*src[x] + m[3]);
1983
T t1 = saturate_cast<T>(m[5]*src[x+1] + m[7]);
1984
T t2 = saturate_cast<T>(m[10]*src[x+2] + m[11]);
1985
dst[x] = t0; dst[x+1] = t1; dst[x+2] = t2;
1986
}
1987
}
1988
else if( cn == 4 )
1989
{
1990
for( x = 0; x < len*4; x += 4 )
1991
{
1992
T t0 = saturate_cast<T>(m[0]*src[x] + m[4]);
1993
T t1 = saturate_cast<T>(m[6]*src[x+1] + m[9]);
1994
dst[x] = t0; dst[x+1] = t1;
1995
t0 = saturate_cast<T>(m[12]*src[x+2] + m[14]);
1996
t1 = saturate_cast<T>(m[18]*src[x+3] + m[19]);
1997
dst[x+2] = t0; dst[x+3] = t1;
1998
}
1999
}
2000
else
2001
{
2002
for( x = 0; x < len; x++, src += cn, dst += cn )
2003
{
2004
const WT* _m = m;
2005
for( int j = 0; j < cn; j++, _m += cn + 1 )
2006
dst[j] = saturate_cast<T>(src[j]*_m[j] + _m[cn]);
2007
}
2008
}
2009
}
2010
2011
static void
2012
diagtransform_8u(const uchar* src, uchar* dst, const float* m, int len, int scn, int dcn)
2013
{
2014
diagtransform_(src, dst, m, len, scn, dcn);
2015
}
2016
2017
static void
2018
diagtransform_8s(const schar* src, schar* dst, const float* m, int len, int scn, int dcn)
2019
{
2020
diagtransform_(src, dst, m, len, scn, dcn);
2021
}
2022
2023
static void
2024
diagtransform_16u(const ushort* src, ushort* dst, const float* m, int len, int scn, int dcn)
2025
{
2026
diagtransform_(src, dst, m, len, scn, dcn);
2027
}
2028
2029
static void
2030
diagtransform_16s(const short* src, short* dst, const float* m, int len, int scn, int dcn)
2031
{
2032
diagtransform_(src, dst, m, len, scn, dcn);
2033
}
2034
2035
static void
2036
diagtransform_32s(const int* src, int* dst, const double* m, int len, int scn, int dcn)
2037
{
2038
diagtransform_(src, dst, m, len, scn, dcn);
2039
}
2040
2041
static void
2042
diagtransform_32f(const float* src, float* dst, const float* m, int len, int scn, int dcn)
2043
{
2044
diagtransform_(src, dst, m, len, scn, dcn);
2045
}
2046
2047
static void
2048
diagtransform_64f(const double* src, double* dst, const double* m, int len, int scn, int dcn)
2049
{
2050
diagtransform_(src, dst, m, len, scn, dcn);
2051
}
2052
2053
2054
typedef void (*TransformFunc)( const uchar* src, uchar* dst, const uchar* m, int, int, int );
2055
2056
static TransformFunc getTransformFunc(int depth)
2057
{
2058
static TransformFunc transformTab[] =
2059
{
2060
(TransformFunc)transform_8u, (TransformFunc)transform_8s, (TransformFunc)transform_16u,
2061
(TransformFunc)transform_16s, (TransformFunc)transform_32s, (TransformFunc)transform_32f,
2062
(TransformFunc)transform_64f, 0
2063
};
2064
2065
return transformTab[depth];
2066
}
2067
2068
static TransformFunc getDiagTransformFunc(int depth)
2069
{
2070
static TransformFunc diagTransformTab[] =
2071
{
2072
(TransformFunc)diagtransform_8u, (TransformFunc)diagtransform_8s, (TransformFunc)diagtransform_16u,
2073
(TransformFunc)diagtransform_16s, (TransformFunc)diagtransform_32s, (TransformFunc)diagtransform_32f,
2074
(TransformFunc)diagtransform_64f, 0
2075
};
2076
2077
return diagTransformTab[depth];
2078
}
2079
2080
}
2081
2082
void cv::transform( InputArray _src, OutputArray _dst, InputArray _mtx )
2083
{
2084
CV_INSTRUMENT_REGION();
2085
2086
Mat src = _src.getMat(), m = _mtx.getMat();
2087
int depth = src.depth(), scn = src.channels(), dcn = m.rows;
2088
CV_Assert( scn == m.cols || scn + 1 == m.cols );
2089
bool isDiag = false;
2090
2091
_dst.create( src.size(), CV_MAKETYPE(depth, dcn) );
2092
Mat dst = _dst.getMat();
2093
2094
int mtype = depth == CV_32S || depth == CV_64F ? CV_64F : CV_32F;
2095
AutoBuffer<double> _mbuf;
2096
double* mbuf;
2097
2098
if( !m.isContinuous() || m.type() != mtype || m.cols != scn + 1 )
2099
{
2100
_mbuf.allocate(dcn*(scn+1));
2101
mbuf = _mbuf.data();
2102
Mat tmp(dcn, scn+1, mtype, mbuf);
2103
memset(tmp.ptr(), 0, tmp.total()*tmp.elemSize());
2104
if( m.cols == scn+1 )
2105
m.convertTo(tmp, mtype);
2106
else
2107
{
2108
Mat tmppart = tmp.colRange(0, m.cols);
2109
m.convertTo(tmppart, mtype);
2110
}
2111
m = tmp;
2112
}
2113
else
2114
mbuf = m.ptr<double>();
2115
2116
if( scn == dcn )
2117
{
2118
int i, j;
2119
double eps = mtype == CV_32F ? FLT_EPSILON : DBL_EPSILON;
2120
2121
if( scn == 1 )
2122
{
2123
double alpha, beta;
2124
if( mtype == CV_32F )
2125
alpha = m.at<float>(0), beta = m.at<float>(1);
2126
else
2127
alpha = m.at<double>(0), beta = m.at<double>(1);
2128
src.convertTo(dst, dst.type(), alpha, beta);
2129
return;
2130
}
2131
2132
for( i = 0, isDiag = true; isDiag && i < scn; i++ )
2133
{
2134
for( j = 0; isDiag && j < scn; j++ )
2135
{
2136
double v = mtype == CV_32F ? m.at<float>(i, j) : m.at<double>(i, j);
2137
if( i != j && fabs(v) > eps )
2138
isDiag = false;
2139
}
2140
}
2141
}
2142
2143
TransformFunc func = isDiag ? getDiagTransformFunc(depth): getTransformFunc(depth);
2144
CV_Assert( func != 0 );
2145
2146
const Mat* arrays[] = {&src, &dst, 0};
2147
uchar* ptrs[2] = {};
2148
NAryMatIterator it(arrays, ptrs);
2149
size_t i, total = it.size;
2150
2151
for( i = 0; i < it.nplanes; i++, ++it )
2152
func( ptrs[0], ptrs[1], (uchar*)mbuf, (int)total, scn, dcn );
2153
}
2154
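
/* Usage sketch (illustrative only, not part of the build): cv::transform treats
   every pixel as an scn-element vector and multiplies it by the dcn x scn matrix
   `mtx` (or dcn x (scn+1), where the extra column is an additive offset).
   A minimal BGR -> single-channel luminance example, assuming an 8UC3 input
   (the file name is a hypothetical placeholder):

       cv::Mat bgr = cv::imread("input.png");        // hypothetical input path, CV_8UC3
       float coeffs[] = { 0.114f, 0.587f, 0.299f };  // gray = 0.114*B + 0.587*G + 0.299*R
       cv::Mat m(1, 3, CV_32F, coeffs);              // dcn = 1, scn = 3, no offset column
       cv::Mat gray;
       cv::transform(bgr, gray, m);                  // gray is CV_8UC1, same size as bgr
*/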

/****************************************************************************************\
*                                 Perspective Transform                                  *
\****************************************************************************************/

namespace cv
{

template<typename T> static void
perspectiveTransform_( const T* src, T* dst, const double* m, int len, int scn, int dcn )
{
    const double eps = FLT_EPSILON;
    int i;

    if( scn == 2 && dcn == 2 )
    {
        for( i = 0; i < len*2; i += 2 )
        {
            T x = src[i], y = src[i + 1];
            double w = x*m[6] + y*m[7] + m[8];

            if( fabs(w) > eps )
            {
                w = 1./w;
                dst[i] = (T)((x*m[0] + y*m[1] + m[2])*w);
                dst[i+1] = (T)((x*m[3] + y*m[4] + m[5])*w);
            }
            else
                dst[i] = dst[i+1] = (T)0;
        }
    }
    else if( scn == 3 && dcn == 3 )
    {
        for( i = 0; i < len*3; i += 3 )
        {
            T x = src[i], y = src[i + 1], z = src[i + 2];
            double w = x*m[12] + y*m[13] + z*m[14] + m[15];

            if( fabs(w) > eps )
            {
                w = 1./w;
                dst[i] = (T)((x*m[0] + y*m[1] + z*m[2] + m[3]) * w);
                dst[i+1] = (T)((x*m[4] + y*m[5] + z*m[6] + m[7]) * w);
                dst[i+2] = (T)((x*m[8] + y*m[9] + z*m[10] + m[11]) * w);
            }
            else
                dst[i] = dst[i+1] = dst[i+2] = (T)0;
        }
    }
    else if( scn == 3 && dcn == 2 )
    {
        for( i = 0; i < len; i++, src += 3, dst += 2 )
        {
            T x = src[0], y = src[1], z = src[2];
            double w = x*m[8] + y*m[9] + z*m[10] + m[11];

            if( fabs(w) > eps )
            {
                w = 1./w;
                dst[0] = (T)((x*m[0] + y*m[1] + z*m[2] + m[3])*w);
                dst[1] = (T)((x*m[4] + y*m[5] + z*m[6] + m[7])*w);
            }
            else
                dst[0] = dst[1] = (T)0;
        }
    }
    else
    {
        for( i = 0; i < len; i++, src += scn, dst += dcn )
        {
            const double* _m = m + dcn*(scn + 1);
            double w = _m[scn];
            int j, k;
            for( k = 0; k < scn; k++ )
                w += _m[k]*src[k];
            if( fabs(w) > eps )
            {
                w = 1./w;
                _m = m;
                for( j = 0; j < dcn; j++, _m += scn + 1 )
                {
                    double s = _m[scn];
                    for( k = 0; k < scn; k++ )
                        s += _m[k]*src[k];
                    dst[j] = (T)(s*w);
                }
            }
            else
                for( j = 0; j < dcn; j++ )
                    dst[j] = 0;
        }
    }
}


static void
perspectiveTransform_32f(const float* src, float* dst, const double* m, int len, int scn, int dcn)
{
    perspectiveTransform_(src, dst, m, len, scn, dcn);
}

static void
perspectiveTransform_64f(const double* src, double* dst, const double* m, int len, int scn, int dcn)
{
    perspectiveTransform_(src, dst, m, len, scn, dcn);
}

}

void cv::perspectiveTransform( InputArray _src, OutputArray _dst, InputArray _mtx )
{
    CV_INSTRUMENT_REGION();

    Mat src = _src.getMat(), m = _mtx.getMat();
    int depth = src.depth(), scn = src.channels(), dcn = m.rows-1;
    CV_Assert( scn + 1 == m.cols );
    CV_Assert( depth == CV_32F || depth == CV_64F );

    _dst.create( src.size(), CV_MAKETYPE(depth, dcn) );
    Mat dst = _dst.getMat();

    const int mtype = CV_64F;
    AutoBuffer<double> _mbuf;
    double* mbuf = m.ptr<double>();

    if( !m.isContinuous() || m.type() != mtype )
    {
        _mbuf.allocate((dcn+1)*(scn+1));
        mbuf = _mbuf.data();
        Mat tmp(dcn+1, scn+1, mtype, mbuf);
        m.convertTo(tmp, mtype);
        m = tmp;
    }

    TransformFunc func = depth == CV_32F ?
        (TransformFunc)perspectiveTransform_32f :
        (TransformFunc)perspectiveTransform_64f;
    CV_Assert( func != 0 );

    const Mat* arrays[] = {&src, &dst, 0};
    uchar* ptrs[2] = {};
    NAryMatIterator it(arrays, ptrs);
    size_t i, total = it.size;

    for( i = 0; i < it.nplanes; i++, ++it )
        func( ptrs[0], ptrs[1], (uchar*)mbuf, (int)total, scn, dcn );
}
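
/* Usage sketch (illustrative only): cv::perspectiveTransform maps vectors, not
   images - each element of the input array is treated as a point (x, y) or
   (x, y, z), extended with w = 1, multiplied by the (scn+1)-column matrix, and
   divided by the resulting homogeneous coordinate. Assuming H is a 3x3 homography
   (e.g. from cv::findHomography; an identity stand-in is used here):

       std::vector<cv::Point2f> srcPts = { {0.f, 0.f}, {10.f, 5.f} }, dstPts;
       cv::Mat H = cv::Mat::eye(3, 3, CV_64F);       // stand-in homography
       cv::perspectiveTransform(srcPts, dstPts, H);  // dstPts[i] = project(H * [x y 1]^T)

   To warp a whole image rather than a point set, use cv::warpPerspective. */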

/****************************************************************************************\
*                                        ScaleAdd                                        *
\****************************************************************************************/

namespace cv
{

static void scaleAdd_32f(const float* src1, const float* src2, float* dst,
                         int len, float* _alpha)
{
    float alpha = *_alpha;
    int i = 0;
#if CV_SIMD
    v_float32 v_alpha = vx_setall_f32(alpha);
    const int cWidth = v_float32::nlanes;
    for (; i <= len - cWidth; i += cWidth)
        v_store(dst + i, v_muladd(vx_load(src1 + i), v_alpha, vx_load(src2 + i)));
    vx_cleanup();
#endif
    for (; i < len; i++)
        dst[i] = src1[i] * alpha + src2[i];
}


static void scaleAdd_64f(const double* src1, const double* src2, double* dst,
                         int len, double* _alpha)
{
    double alpha = *_alpha;
    int i = 0;
#if CV_SIMD_64F
    v_float64 a2 = vx_setall_f64(alpha);
    const int cWidth = v_float64::nlanes;
    for (; i <= len - cWidth; i += cWidth)
        v_store(dst + i, v_muladd(vx_load(src1 + i), a2, vx_load(src2 + i)));
    vx_cleanup();
#endif
    for (; i < len; i++)
        dst[i] = src1[i] * alpha + src2[i];
}

typedef void (*ScaleAddFunc)(const uchar* src1, const uchar* src2, uchar* dst, int len, const void* alpha);

#ifdef HAVE_OPENCL

static bool ocl_scaleAdd( InputArray _src1, double alpha, InputArray _src2, OutputArray _dst, int type )
{
    const ocl::Device & d = ocl::Device::getDefault();

    bool doubleSupport = d.doubleFPConfig() > 0;
    Size size = _src1.size();
    int depth = CV_MAT_DEPTH(type);
    if ( (!doubleSupport && depth == CV_64F) || size != _src2.size() )
        return false;

    _dst.create(size, type);
    int cn = CV_MAT_CN(type), wdepth = std::max(depth, CV_32F);
    int kercn = ocl::predictOptimalVectorWidthMax(_src1, _src2, _dst),
        rowsPerWI = d.isIntel() ? 4 : 1;

    char cvt[2][50];
    ocl::Kernel k("KF", ocl::core::arithm_oclsrc,
                  format("-D OP_SCALE_ADD -D BINARY_OP -D dstT=%s -D DEPTH_dst=%d -D workT=%s -D convertToWT1=%s"
                         " -D srcT1=dstT -D srcT2=dstT -D convertToDT=%s -D workT1=%s"
                         " -D wdepth=%d%s -D rowsPerWI=%d",
                         ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)), depth,
                         ocl::typeToStr(CV_MAKE_TYPE(wdepth, kercn)),
                         ocl::convertTypeStr(depth, wdepth, kercn, cvt[0]),
                         ocl::convertTypeStr(wdepth, depth, kercn, cvt[1]),
                         ocl::typeToStr(wdepth), wdepth,
                         doubleSupport ? " -D DOUBLE_SUPPORT" : "", rowsPerWI));
    if (k.empty())
        return false;

    UMat src1 = _src1.getUMat(), src2 = _src2.getUMat(), dst = _dst.getUMat();

    ocl::KernelArg src1arg = ocl::KernelArg::ReadOnlyNoSize(src1),
            src2arg = ocl::KernelArg::ReadOnlyNoSize(src2),
            dstarg = ocl::KernelArg::WriteOnly(dst, cn, kercn);

    if (wdepth == CV_32F)
        k.args(src1arg, src2arg, dstarg, (float)alpha);
    else
        k.args(src1arg, src2arg, dstarg, alpha);

    size_t globalsize[2] = { (size_t)dst.cols * cn / kercn, ((size_t)dst.rows + rowsPerWI - 1) / rowsPerWI };
    return k.run(2, globalsize, NULL, false);
}

#endif

}

void cv::scaleAdd( InputArray _src1, double alpha, InputArray _src2, OutputArray _dst )
{
    CV_INSTRUMENT_REGION();

    int type = _src1.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
    CV_Assert( type == _src2.type() );

    CV_OCL_RUN(_src1.dims() <= 2 && _src2.dims() <= 2 && _dst.isUMat(),
               ocl_scaleAdd(_src1, alpha, _src2, _dst, type))

    if( depth < CV_32F )
    {
        addWeighted(_src1, alpha, _src2, 1, 0, _dst, depth);
        return;
    }

    Mat src1 = _src1.getMat(), src2 = _src2.getMat();
    CV_Assert(src1.size == src2.size);

    _dst.create(src1.dims, src1.size, type);
    Mat dst = _dst.getMat();

    float falpha = (float)alpha;
    void* palpha = depth == CV_32F ? (void*)&falpha : (void*)&alpha;

    ScaleAddFunc func = depth == CV_32F ? (ScaleAddFunc)scaleAdd_32f : (ScaleAddFunc)scaleAdd_64f;

    if (src1.isContinuous() && src2.isContinuous() && dst.isContinuous())
    {
        size_t len = src1.total()*cn;
        func(src1.ptr(), src2.ptr(), dst.ptr(), (int)len, palpha);
        return;
    }

    const Mat* arrays[] = {&src1, &src2, &dst, 0};
    uchar* ptrs[3] = {};
    NAryMatIterator it(arrays, ptrs);
    size_t i, len = it.size*cn;

    for( i = 0; i < it.nplanes; i++, ++it )
        func( ptrs[0], ptrs[1], ptrs[2], (int)len, palpha );
}
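
/* Usage sketch (illustrative only): scaleAdd computes dst = alpha*src1 + src2
   element-wise (the BLAS AXPY operation); both inputs must share size and type:

       cv::Mat a = cv::Mat::ones(4, 4, CV_32F);
       cv::Mat b = cv::Mat::ones(4, 4, CV_32F);
       cv::Mat d;
       cv::scaleAdd(a, 2.0, b, d);                   // every element of d is 2*1 + 1 = 3
*/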

/****************************************************************************************\
*                                   Covariation Matrix                                   *
\****************************************************************************************/

void cv::calcCovarMatrix( const Mat* data, int nsamples, Mat& covar, Mat& _mean, int flags, int ctype )
{
    CV_INSTRUMENT_REGION();

    CV_Assert_N( data, nsamples > 0 );
    Size size = data[0].size();
    int sz = size.width * size.height, esz = (int)data[0].elemSize();
    int type = data[0].type();
    Mat mean;
    ctype = std::max(std::max(CV_MAT_DEPTH(ctype >= 0 ? ctype : type), _mean.depth()), CV_32F);

    if( (flags & CV_COVAR_USE_AVG) != 0 )
    {
        CV_Assert( _mean.size() == size );
        if( _mean.isContinuous() && _mean.type() == ctype )
            mean = _mean.reshape(1, 1);
        else
        {
            _mean.convertTo(mean, ctype);
            mean = mean.reshape(1, 1);
        }
    }

    Mat _data(nsamples, sz, type);

    for( int i = 0; i < nsamples; i++ )
    {
        CV_Assert_N( data[i].size() == size, data[i].type() == type );
        if( data[i].isContinuous() )
            memcpy( _data.ptr(i), data[i].ptr(), sz*esz );
        else
        {
            Mat dataRow(size.height, size.width, type, _data.ptr(i));
            data[i].copyTo(dataRow);
        }
    }

    calcCovarMatrix( _data, covar, mean, (flags & ~(CV_COVAR_ROWS|CV_COVAR_COLS)) | CV_COVAR_ROWS, ctype );
    if( (flags & CV_COVAR_USE_AVG) == 0 )
        _mean = mean.reshape(1, size.height);
}

void cv::calcCovarMatrix( InputArray _src, OutputArray _covar, InputOutputArray _mean, int flags, int ctype )
{
    CV_INSTRUMENT_REGION();

    if(_src.kind() == _InputArray::STD_VECTOR_MAT || _src.kind() == _InputArray::STD_ARRAY_MAT)
    {
        std::vector<cv::Mat> src;
        _src.getMatVector(src);

        CV_Assert( src.size() > 0 );

        Size size = src[0].size();
        int type = src[0].type();

        ctype = std::max(std::max(CV_MAT_DEPTH(ctype >= 0 ? ctype : type), _mean.depth()), CV_32F);

        Mat _data(static_cast<int>(src.size()), size.area(), type);

        int i = 0;
        for(std::vector<cv::Mat>::iterator each = src.begin(); each != src.end(); ++each, ++i )
        {
            CV_Assert_N( (*each).size() == size, (*each).type() == type );
            Mat dataRow(size.height, size.width, type, _data.ptr(i));
            (*each).copyTo(dataRow);
        }

        Mat mean;
        if( (flags & CV_COVAR_USE_AVG) != 0 )
        {
            CV_Assert( _mean.size() == size );

            if( mean.type() != ctype )
            {
                mean = _mean.getMat();
                _mean.create(mean.size(), ctype);
                Mat tmp = _mean.getMat();
                mean.convertTo(tmp, ctype);
                mean = tmp;
            }

            mean = _mean.getMat().reshape(1, 1);
        }

        calcCovarMatrix( _data, _covar, mean, (flags & ~(CV_COVAR_ROWS|CV_COVAR_COLS)) | CV_COVAR_ROWS, ctype );

        if( (flags & CV_COVAR_USE_AVG) == 0 )
        {
            mean = mean.reshape(1, size.height);
            mean.copyTo(_mean);
        }
        return;
    }

    Mat data = _src.getMat(), mean;
    CV_Assert( ((flags & CV_COVAR_ROWS) != 0) ^ ((flags & CV_COVAR_COLS) != 0) );
    bool takeRows = (flags & CV_COVAR_ROWS) != 0;
    int type = data.type();
    int nsamples = takeRows ? data.rows : data.cols;
    CV_Assert( nsamples > 0 );
    Size size = takeRows ? Size(data.cols, 1) : Size(1, data.rows);

    if( (flags & CV_COVAR_USE_AVG) != 0 )
    {
        mean = _mean.getMat();
        ctype = std::max(std::max(CV_MAT_DEPTH(ctype >= 0 ? ctype : type), mean.depth()), CV_32F);
        CV_Assert( mean.size() == size );
        if( mean.type() != ctype )
        {
            _mean.create(mean.size(), ctype);
            Mat tmp = _mean.getMat();
            mean.convertTo(tmp, ctype);
            mean = tmp;
        }
    }
    else
    {
        ctype = std::max(CV_MAT_DEPTH(ctype >= 0 ? ctype : type), CV_32F);
        reduce( _src, _mean, takeRows ? 0 : 1, CV_REDUCE_AVG, ctype );
        mean = _mean.getMat();
    }

    mulTransposed( data, _covar, ((flags & CV_COVAR_NORMAL) == 0) ^ takeRows,
        mean, (flags & CV_COVAR_SCALE) != 0 ? 1./nsamples : 1, ctype );
}
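
/* Usage sketch (illustrative only): with CV_COVAR_ROWS each row of the sample
   matrix is one observation; CV_COVAR_NORMAL requests the full cols x cols
   covariance and CV_COVAR_SCALE divides it by the number of samples:

       cv::Mat samples = (cv::Mat_<float>(3, 2) << 1, 2,   3, 4,   5, 6);
       cv::Mat covar, mean;
       cv::calcCovarMatrix(samples, covar, mean,
                           CV_COVAR_NORMAL | CV_COVAR_ROWS | CV_COVAR_SCALE,
                           CV_32F);                  // covar: 2x2, mean: 1x2
*/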

/****************************************************************************************\
*                                       Mahalanobis                                      *
\****************************************************************************************/

double cv::Mahalanobis( InputArray _v1, InputArray _v2, InputArray _icovar )
{
    CV_INSTRUMENT_REGION();

    Mat v1 = _v1.getMat(), v2 = _v2.getMat(), icovar = _icovar.getMat();
    int type = v1.type(), depth = v1.depth();
    Size sz = v1.size();
    int i, j, len = sz.width*sz.height*v1.channels();
    AutoBuffer<double> buf(len);
    double result = 0;

    CV_Assert_N( type == v2.type(), type == icovar.type(),
        sz == v2.size(), len == icovar.rows && len == icovar.cols );

    sz.width *= v1.channels();
    if( v1.isContinuous() && v2.isContinuous() )
    {
        sz.width *= sz.height;
        sz.height = 1;
    }

    if( depth == CV_32F )
    {
        const float* src1 = v1.ptr<float>();
        const float* src2 = v2.ptr<float>();
        size_t step1 = v1.step/sizeof(src1[0]);
        size_t step2 = v2.step/sizeof(src2[0]);
        double* diff = buf.data();
        const float* mat = icovar.ptr<float>();
        size_t matstep = icovar.step/sizeof(mat[0]);

        for( ; sz.height--; src1 += step1, src2 += step2, diff += sz.width )
        {
            for( i = 0; i < sz.width; i++ )
                diff[i] = src1[i] - src2[i];
        }

        diff = buf.data();
        for( i = 0; i < len; i++, mat += matstep )
        {
            double row_sum = 0;
            j = 0;
#if CV_ENABLE_UNROLLED
            for(; j <= len - 4; j += 4 )
                row_sum += diff[j]*mat[j] + diff[j+1]*mat[j+1] +
                           diff[j+2]*mat[j+2] + diff[j+3]*mat[j+3];
#endif
            for( ; j < len; j++ )
                row_sum += diff[j]*mat[j];
            result += row_sum * diff[i];
        }
    }
    else if( depth == CV_64F )
    {
        const double* src1 = v1.ptr<double>();
        const double* src2 = v2.ptr<double>();
        size_t step1 = v1.step/sizeof(src1[0]);
        size_t step2 = v2.step/sizeof(src2[0]);
        double* diff = buf.data();
        const double* mat = icovar.ptr<double>();
        size_t matstep = icovar.step/sizeof(mat[0]);

        for( ; sz.height--; src1 += step1, src2 += step2, diff += sz.width )
        {
            for( i = 0; i < sz.width; i++ )
                diff[i] = src1[i] - src2[i];
        }

        diff = buf.data();
        for( i = 0; i < len; i++, mat += matstep )
        {
            double row_sum = 0;
            j = 0;
#if CV_ENABLE_UNROLLED
            for(; j <= len - 4; j += 4 )
                row_sum += diff[j]*mat[j] + diff[j+1]*mat[j+1] +
                           diff[j+2]*mat[j+2] + diff[j+3]*mat[j+3];
#endif
            for( ; j < len; j++ )
                row_sum += diff[j]*mat[j];
            result += row_sum * diff[i];
        }
    }
    else
        CV_Error( CV_StsUnsupportedFormat, "" );

    return std::sqrt(result);
}
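
/* Usage sketch (illustrative only): Mahalanobis expects the *inverse* covariance
   matrix (icovar), typically obtained via cv::invert(covar, icovar, cv::DECOMP_SVD)
   on the output of calcCovarMatrix. With the identity it degenerates to the
   Euclidean distance:

       cv::Mat v1 = (cv::Mat_<float>(1, 2) << 1.f, 2.f);
       cv::Mat v2 = (cv::Mat_<float>(1, 2) << 3.f, 4.f);
       cv::Mat icovar = cv::Mat::eye(2, 2, CV_32F);
       double d = cv::Mahalanobis(v1, v2, icovar);   // sqrt(2^2 + 2^2) = 2*sqrt(2)
*/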

/****************************************************************************************\
*                                      MulTransposed                                     *
\****************************************************************************************/

namespace cv
{

template<typename sT, typename dT> static void
MulTransposedR( const Mat& srcmat, Mat& dstmat, const Mat& deltamat, double scale )
{
    int i, j, k;
    const sT* src = srcmat.ptr<sT>();
    dT* dst = dstmat.ptr<dT>();
    const dT* delta = deltamat.ptr<dT>();
    size_t srcstep = srcmat.step/sizeof(src[0]);
    size_t dststep = dstmat.step/sizeof(dst[0]);
    size_t deltastep = deltamat.rows > 1 ? deltamat.step/sizeof(delta[0]) : 0;
    int delta_cols = deltamat.cols;
    Size size = srcmat.size();
    dT* tdst = dst;
    dT* col_buf = 0;
    dT* delta_buf = 0;
    int buf_size = size.height*sizeof(dT);
    AutoBuffer<uchar> buf;

    if( delta && delta_cols < size.width )
    {
        assert( delta_cols == 1 );
        buf_size *= 5;
    }
    buf.allocate(buf_size);
    col_buf = (dT*)buf.data();

    if( delta && delta_cols < size.width )
    {
        delta_buf = col_buf + size.height;
        for( i = 0; i < size.height; i++ )
            delta_buf[i*4] = delta_buf[i*4+1] =
                delta_buf[i*4+2] = delta_buf[i*4+3] = delta[i*deltastep];
        delta = delta_buf;
        deltastep = deltastep ? 4 : 0;
    }

    if( !delta )
        for( i = 0; i < size.width; i++, tdst += dststep )
        {
            for( k = 0; k < size.height; k++ )
                col_buf[k] = src[k*srcstep+i];

            for( j = i; j <= size.width - 4; j += 4 )
            {
                double s0 = 0, s1 = 0, s2 = 0, s3 = 0;
                const sT *tsrc = src + j;

                for( k = 0; k < size.height; k++, tsrc += srcstep )
                {
                    double a = col_buf[k];
                    s0 += a * tsrc[0];
                    s1 += a * tsrc[1];
                    s2 += a * tsrc[2];
                    s3 += a * tsrc[3];
                }

                tdst[j] = (dT)(s0*scale);
                tdst[j+1] = (dT)(s1*scale);
                tdst[j+2] = (dT)(s2*scale);
                tdst[j+3] = (dT)(s3*scale);
            }

            for( ; j < size.width; j++ )
            {
                double s0 = 0;
                const sT *tsrc = src + j;

                for( k = 0; k < size.height; k++, tsrc += srcstep )
                    s0 += (double)col_buf[k] * tsrc[0];

                tdst[j] = (dT)(s0*scale);
            }
        }
    else
        for( i = 0; i < size.width; i++, tdst += dststep )
        {
            if( !delta_buf )
                for( k = 0; k < size.height; k++ )
                    col_buf[k] = src[k*srcstep+i] - delta[k*deltastep+i];
            else
                for( k = 0; k < size.height; k++ )
                    col_buf[k] = src[k*srcstep+i] - delta_buf[k*deltastep];

            for( j = i; j <= size.width - 4; j += 4 )
            {
                double s0 = 0, s1 = 0, s2 = 0, s3 = 0;
                const sT *tsrc = src + j;
                const dT *d = delta_buf ? delta_buf : delta + j;

                for( k = 0; k < size.height; k++, tsrc+=srcstep, d+=deltastep )
                {
                    double a = col_buf[k];
                    s0 += a * (tsrc[0] - d[0]);
                    s1 += a * (tsrc[1] - d[1]);
                    s2 += a * (tsrc[2] - d[2]);
                    s3 += a * (tsrc[3] - d[3]);
                }

                tdst[j] = (dT)(s0*scale);
                tdst[j+1] = (dT)(s1*scale);
                tdst[j+2] = (dT)(s2*scale);
                tdst[j+3] = (dT)(s3*scale);
            }

            for( ; j < size.width; j++ )
            {
                double s0 = 0;
                const sT *tsrc = src + j;
                const dT *d = delta_buf ? delta_buf : delta + j;

                for( k = 0; k < size.height; k++, tsrc+=srcstep, d+=deltastep )
                    s0 += (double)col_buf[k] * (tsrc[0] - d[0]);

                tdst[j] = (dT)(s0*scale);
            }
        }
}


template<typename sT, typename dT> static void
MulTransposedL( const Mat& srcmat, Mat& dstmat, const Mat& deltamat, double scale )
{
    int i, j, k;
    const sT* src = srcmat.ptr<sT>();
    dT* dst = dstmat.ptr<dT>();
    const dT* delta = deltamat.ptr<dT>();
    size_t srcstep = srcmat.step/sizeof(src[0]);
    size_t dststep = dstmat.step/sizeof(dst[0]);
    size_t deltastep = deltamat.rows > 1 ? deltamat.step/sizeof(delta[0]) : 0;
    int delta_cols = deltamat.cols;
    Size size = srcmat.size();
    dT* tdst = dst;

    if( !delta )
        for( i = 0; i < size.height; i++, tdst += dststep )
            for( j = i; j < size.height; j++ )
            {
                double s = 0;
                const sT *tsrc1 = src + i*srcstep;
                const sT *tsrc2 = src + j*srcstep;

                for( k = 0; k <= size.width - 4; k += 4 )
                    s += (double)tsrc1[k]*tsrc2[k] + (double)tsrc1[k+1]*tsrc2[k+1] +
                         (double)tsrc1[k+2]*tsrc2[k+2] + (double)tsrc1[k+3]*tsrc2[k+3];
                for( ; k < size.width; k++ )
                    s += (double)tsrc1[k] * tsrc2[k];
                tdst[j] = (dT)(s*scale);
            }
    else
    {
        dT delta_buf[4];
        int delta_shift = delta_cols == size.width ? 4 : 0;
        AutoBuffer<uchar> buf(size.width*sizeof(dT));
        dT* row_buf = (dT*)buf.data();

        for( i = 0; i < size.height; i++, tdst += dststep )
        {
            const sT *tsrc1 = src + i*srcstep;
            const dT *tdelta1 = delta + i*deltastep;

            if( delta_cols < size.width )
                for( k = 0; k < size.width; k++ )
                    row_buf[k] = tsrc1[k] - tdelta1[0];
            else
                for( k = 0; k < size.width; k++ )
                    row_buf[k] = tsrc1[k] - tdelta1[k];

            for( j = i; j < size.height; j++ )
            {
                double s = 0;
                const sT *tsrc2 = src + j*srcstep;
                const dT *tdelta2 = delta + j*deltastep;
                if( delta_cols < size.width )
                {
                    delta_buf[0] = delta_buf[1] =
                        delta_buf[2] = delta_buf[3] = tdelta2[0];
                    tdelta2 = delta_buf;
                }
                for( k = 0; k <= size.width-4; k += 4, tdelta2 += delta_shift )
                    s += (double)row_buf[k]*(tsrc2[k] - tdelta2[0]) +
                         (double)row_buf[k+1]*(tsrc2[k+1] - tdelta2[1]) +
                         (double)row_buf[k+2]*(tsrc2[k+2] - tdelta2[2]) +
                         (double)row_buf[k+3]*(tsrc2[k+3] - tdelta2[3]);
                for( ; k < size.width; k++, tdelta2++ )
                    s += (double)row_buf[k]*(tsrc2[k] - tdelta2[0]);
                tdst[j] = (dT)(s*scale);
            }
        }
    }
}

typedef void (*MulTransposedFunc)(const Mat& src, Mat& dst, const Mat& delta, double scale);

}

void cv::mulTransposed( InputArray _src, OutputArray _dst, bool ata,
                        InputArray _delta, double scale, int dtype )
{
    CV_INSTRUMENT_REGION();

    Mat src = _src.getMat(), delta = _delta.getMat();
    const int gemm_level = 100; // boundary above which GEMM is faster.
    int stype = src.type();
    dtype = std::max(std::max(CV_MAT_DEPTH(dtype >= 0 ? dtype : stype), delta.depth()), CV_32F);
    CV_Assert( src.channels() == 1 );

    if( !delta.empty() )
    {
        CV_Assert_N( delta.channels() == 1,
            (delta.rows == src.rows || delta.rows == 1),
            (delta.cols == src.cols || delta.cols == 1));
        if( delta.type() != dtype )
            delta.convertTo(delta, dtype);
    }

    int dsize = ata ? src.cols : src.rows;
    _dst.create( dsize, dsize, dtype );
    Mat dst = _dst.getMat();

    if( src.data == dst.data || (stype == dtype &&
        (dst.cols >= gemm_level && dst.rows >= gemm_level &&
         src.cols >= gemm_level && src.rows >= gemm_level)))
    {
        Mat src2;
        const Mat* tsrc = &src;
        if( !delta.empty() )
        {
            if( delta.size() == src.size() )
                subtract( src, delta, src2 );
            else
            {
                repeat(delta, src.rows/delta.rows, src.cols/delta.cols, src2);
                subtract( src, src2, src2 );
            }
            tsrc = &src2;
        }
        gemm( *tsrc, *tsrc, scale, Mat(), 0, dst, ata ? GEMM_1_T : GEMM_2_T );
    }
    else
    {
        MulTransposedFunc func = 0;
        if(stype == CV_8U && dtype == CV_32F)
        {
            if(ata)
                func = MulTransposedR<uchar,float>;
            else
                func = MulTransposedL<uchar,float>;
        }
        else if(stype == CV_8U && dtype == CV_64F)
        {
            if(ata)
                func = MulTransposedR<uchar,double>;
            else
                func = MulTransposedL<uchar,double>;
        }
        else if(stype == CV_16U && dtype == CV_32F)
        {
            if(ata)
                func = MulTransposedR<ushort,float>;
            else
                func = MulTransposedL<ushort,float>;
        }
        else if(stype == CV_16U && dtype == CV_64F)
        {
            if(ata)
                func = MulTransposedR<ushort,double>;
            else
                func = MulTransposedL<ushort,double>;
        }
        else if(stype == CV_16S && dtype == CV_32F)
        {
            if(ata)
                func = MulTransposedR<short,float>;
            else
                func = MulTransposedL<short,float>;
        }
        else if(stype == CV_16S && dtype == CV_64F)
        {
            if(ata)
                func = MulTransposedR<short,double>;
            else
                func = MulTransposedL<short,double>;
        }
        else if(stype == CV_32F && dtype == CV_32F)
        {
            if(ata)
                func = MulTransposedR<float,float>;
            else
                func = MulTransposedL<float,float>;
        }
        else if(stype == CV_32F && dtype == CV_64F)
        {
            if(ata)
                func = MulTransposedR<float,double>;
            else
                func = MulTransposedL<float,double>;
        }
        else if(stype == CV_64F && dtype == CV_64F)
        {
            if(ata)
                func = MulTransposedR<double,double>;
            else
                func = MulTransposedL<double,double>;
        }
        if( !func )
            CV_Error( CV_StsUnsupportedFormat, "" );

        func( src, dst, delta, scale );
        completeSymm( dst, false );
    }
}
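
/* Usage sketch (illustrative only): mulTransposed computes
   dst = scale * (src - delta)^T * (src - delta) when ata == true, and
   dst = scale * (src - delta) * (src - delta)^T otherwise; delta may be empty,
   a single row/column to be broadcast, or a full-size matrix:

       cv::Mat A = (cv::Mat_<float>(2, 3) << 1, 2, 3,   4, 5, 6);
       cv::Mat AtA;
       cv::mulTransposed(A, AtA, true);              // AtA = A^T * A, 3x3 CV_32F
*/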

/****************************************************************************************\
*                                      Dot Product                                      *
\****************************************************************************************/

namespace cv
{

template<typename T> double
dotProd_(const T* src1, const T* src2, int len)
{
    int i = 0;
    double result = 0;

#if CV_ENABLE_UNROLLED
    for( ; i <= len - 4; i += 4 )
        result += (double)src1[i]*src2[i] + (double)src1[i+1]*src2[i+1] +
                  (double)src1[i+2]*src2[i+2] + (double)src1[i+3]*src2[i+3];
#endif
    for( ; i < len; i++ )
        result += (double)src1[i]*src2[i];

    return result;
}


static double dotProd_8u(const uchar* src1, const uchar* src2, int len)
{
    double r = 0;
#if ARITHM_USE_IPP
    CV_IPP_RUN(IPP_VERSION_X100 > 201800 || cv::ipp::getIppTopFeatures() != ippCPUID_SSE42, CV_INSTRUMENT_FUN_IPP(ippiDotProd_8u64f_C1R, src1, len*sizeof(uchar), src2, len*sizeof(uchar), ippiSize(len, 1), &r) >= 0, r);
#endif
    int i = 0;

#if CV_SIMD
    int len0 = len & -v_uint16::nlanes, blockSize0 = (1 << 15), blockSize;

    while (i < len0)
    {
        blockSize = std::min(len0 - i, blockSize0);
        v_int32 v_sum = vx_setzero_s32();
        const int cWidth = v_uint16::nlanes;

        int j = 0;
        for (; j <= blockSize - cWidth * 2; j += cWidth * 2)
        {
            v_uint16 v_src10, v_src20, v_src11, v_src21;
            v_expand(vx_load(src1 + j), v_src10, v_src11);
            v_expand(vx_load(src2 + j), v_src20, v_src21);

            v_sum += v_dotprod(v_reinterpret_as_s16(v_src10), v_reinterpret_as_s16(v_src20));
            v_sum += v_dotprod(v_reinterpret_as_s16(v_src11), v_reinterpret_as_s16(v_src21));
        }

        for (; j <= blockSize - cWidth; j += cWidth)
        {
            v_int16 v_src10 = v_reinterpret_as_s16(vx_load_expand(src1 + j));
            v_int16 v_src20 = v_reinterpret_as_s16(vx_load_expand(src2 + j));

            v_sum += v_dotprod(v_src10, v_src20);
        }
        r += (double)v_reduce_sum(v_sum);

        src1 += blockSize;
        src2 += blockSize;
        i += blockSize;
    }
    vx_cleanup();
#elif CV_NEON
    if( cv::checkHardwareSupport(CV_CPU_NEON) )
    {
        int len0 = len & -8, blockSize0 = (1 << 15), blockSize;
        uint32x4_t v_zero = vdupq_n_u32(0u);
        CV_DECL_ALIGNED(16) uint buf[4];

        while( i < len0 )
        {
            blockSize = std::min(len0 - i, blockSize0);
            uint32x4_t v_sum = v_zero;

            int j = 0;
            for( ; j <= blockSize - 16; j += 16 )
            {
                uint8x16_t v_src1 = vld1q_u8(src1 + j), v_src2 = vld1q_u8(src2 + j);

                uint16x8_t v_src10 = vmovl_u8(vget_low_u8(v_src1)), v_src20 = vmovl_u8(vget_low_u8(v_src2));
                v_sum = vmlal_u16(v_sum, vget_low_u16(v_src10), vget_low_u16(v_src20));
                v_sum = vmlal_u16(v_sum, vget_high_u16(v_src10), vget_high_u16(v_src20));

                v_src10 = vmovl_u8(vget_high_u8(v_src1));
                v_src20 = vmovl_u8(vget_high_u8(v_src2));
                v_sum = vmlal_u16(v_sum, vget_low_u16(v_src10), vget_low_u16(v_src20));
                v_sum = vmlal_u16(v_sum, vget_high_u16(v_src10), vget_high_u16(v_src20));
            }

            for( ; j <= blockSize - 8; j += 8 )
            {
                uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + j)), v_src2 = vmovl_u8(vld1_u8(src2 + j));
                v_sum = vmlal_u16(v_sum, vget_low_u16(v_src1), vget_low_u16(v_src2));
                v_sum = vmlal_u16(v_sum, vget_high_u16(v_src1), vget_high_u16(v_src2));
            }

            vst1q_u32(buf, v_sum);
            r += buf[0] + buf[1] + buf[2] + buf[3];

            src1 += blockSize;
            src2 += blockSize;
            i += blockSize;
        }
    }
#endif
    return r + dotProd_(src1, src2, len - i);
}


static double dotProd_8s(const schar* src1, const schar* src2, int len)
{
    double r = 0.0;
    int i = 0;

#if CV_SIMD
    int len0 = len & -v_int16::nlanes, blockSize0 = (1 << 14), blockSize;

    while (i < len0)
    {
        blockSize = std::min(len0 - i, blockSize0);
        v_int32 v_sum = vx_setzero_s32();
        const int cWidth = v_int16::nlanes;

        int j = 0;
        for (; j <= blockSize - cWidth * 2; j += cWidth * 2)
        {
            v_int16 v_src10, v_src20, v_src11, v_src21;
            v_expand(vx_load(src1 + j), v_src10, v_src11);
            v_expand(vx_load(src2 + j), v_src20, v_src21);

            v_sum += v_dotprod(v_src10, v_src20);
            v_sum += v_dotprod(v_src11, v_src21);
        }

        for (; j <= blockSize - cWidth; j += cWidth)
        {
            v_int16 v_src10 = vx_load_expand(src1 + j);
            v_int16 v_src20 = vx_load_expand(src2 + j);

            v_sum += v_dotprod(v_src10, v_src20);
        }
        r += (double)v_reduce_sum(v_sum);

        src1 += blockSize;
        src2 += blockSize;
        i += blockSize;
    }
    vx_cleanup();
#elif CV_NEON
    if( cv::checkHardwareSupport(CV_CPU_NEON) )
    {
        int len0 = len & -8, blockSize0 = (1 << 14), blockSize;
        int32x4_t v_zero = vdupq_n_s32(0);
        CV_DECL_ALIGNED(16) int buf[4];

        while( i < len0 )
        {
            blockSize = std::min(len0 - i, blockSize0);
            int32x4_t v_sum = v_zero;

            int j = 0;
            for( ; j <= blockSize - 16; j += 16 )
            {
                int8x16_t v_src1 = vld1q_s8(src1 + j), v_src2 = vld1q_s8(src2 + j);

                int16x8_t v_src10 = vmovl_s8(vget_low_s8(v_src1)), v_src20 = vmovl_s8(vget_low_s8(v_src2));
                v_sum = vmlal_s16(v_sum, vget_low_s16(v_src10), vget_low_s16(v_src20));
                v_sum = vmlal_s16(v_sum, vget_high_s16(v_src10), vget_high_s16(v_src20));

                v_src10 = vmovl_s8(vget_high_s8(v_src1));
                v_src20 = vmovl_s8(vget_high_s8(v_src2));
                v_sum = vmlal_s16(v_sum, vget_low_s16(v_src10), vget_low_s16(v_src20));
                v_sum = vmlal_s16(v_sum, vget_high_s16(v_src10), vget_high_s16(v_src20));
            }

            for( ; j <= blockSize - 8; j += 8 )
            {
                int16x8_t v_src1 = vmovl_s8(vld1_s8(src1 + j)), v_src2 = vmovl_s8(vld1_s8(src2 + j));
                v_sum = vmlal_s16(v_sum, vget_low_s16(v_src1), vget_low_s16(v_src2));
                v_sum = vmlal_s16(v_sum, vget_high_s16(v_src1), vget_high_s16(v_src2));
            }

            vst1q_s32(buf, v_sum);
            r += buf[0] + buf[1] + buf[2] + buf[3];

            src1 += blockSize;
            src2 += blockSize;
            i += blockSize;
        }
    }
#endif

    return r + dotProd_(src1, src2, len - i);
}

static double dotProd_16u(const ushort* src1, const ushort* src2, int len)
{
#if ARITHM_USE_IPP
    double r = 0;
    CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippiDotProd_16u64f_C1R, src1, len*sizeof(ushort), src2, len*sizeof(ushort), ippiSize(len, 1), &r) >= 0, r);
#endif
    return dotProd_(src1, src2, len);
}

static double dotProd_16s(const short* src1, const short* src2, int len)
{
#if ARITHM_USE_IPP && (IPP_VERSION_X100 != 900) // bug in IPP 9.0.0
    double r = 0;
    CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippiDotProd_16s64f_C1R, src1, len*sizeof(short), src2, len*sizeof(short), ippiSize(len, 1), &r) >= 0, r);
#endif
    return dotProd_(src1, src2, len);
}

static double dotProd_32s(const int* src1, const int* src2, int len)
{
#if ARITHM_USE_IPP
    double r = 0;
    CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippiDotProd_32s64f_C1R, src1, len*sizeof(int), src2, len*sizeof(int), ippiSize(len, 1), &r) >= 0, r);
#endif
    return dotProd_(src1, src2, len);
}

static double dotProd_32f(const float* src1, const float* src2, int len)
{
    double r = 0.0;

#if ARITHM_USE_IPP
    CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippiDotProd_32f64f_C1R, src1, len*sizeof(float), src2, len*sizeof(float), ippiSize(len, 1), &r, ippAlgHintFast) >= 0, r);
#endif
    int i = 0;

#if CV_SIMD
    int len0 = len & -v_float32::nlanes, blockSize0 = (1 << 13), blockSize;

    while (i < len0)
    {
        blockSize = std::min(len0 - i, blockSize0);
        v_float32 v_sum = vx_setzero_f32();

        int j = 0;
        int cWidth = v_float32::nlanes;
        for (; j <= blockSize - cWidth; j += cWidth)
            v_sum = v_muladd(vx_load(src1 + j), vx_load(src2 + j), v_sum);

        r += v_reduce_sum(v_sum);

        src1 += blockSize;
        src2 += blockSize;
        i += blockSize;
    }
    vx_cleanup();
#endif
    return r + dotProd_(src1, src2, len - i);
}

static double dotProd_64f(const double* src1, const double* src2, int len)
{
#if ARITHM_USE_IPP
    double r = 0;
    CV_IPP_RUN_FAST(CV_INSTRUMENT_FUN_IPP(ippsDotProd_64f, src1, src2, len, &r) >= 0, r);
#endif

    return dotProd_(src1, src2, len);
}


typedef double (*DotProdFunc)(const uchar* src1, const uchar* src2, int len);

static DotProdFunc getDotProdFunc(int depth)
{
    static DotProdFunc dotProdTab[] =
    {
        (DotProdFunc)GET_OPTIMIZED(dotProd_8u), (DotProdFunc)GET_OPTIMIZED(dotProd_8s),
        (DotProdFunc)dotProd_16u, (DotProdFunc)dotProd_16s,
        (DotProdFunc)dotProd_32s, (DotProdFunc)GET_OPTIMIZED(dotProd_32f),
        (DotProdFunc)dotProd_64f, 0
    };

    return dotProdTab[depth];
}

double Mat::dot(InputArray _mat) const
{
    CV_INSTRUMENT_REGION();

    Mat mat = _mat.getMat();
    int cn = channels();
    DotProdFunc func = getDotProdFunc(depth());
    CV_Assert_N( mat.type() == type(), mat.size == size, func != 0 );

    if( isContinuous() && mat.isContinuous() )
    {
        size_t len = total()*cn;
        if( len == (size_t)(int)len )
            return func(data, mat.data, (int)len);
    }

    const Mat* arrays[] = {this, &mat, 0};
    uchar* ptrs[2] = {};
    NAryMatIterator it(arrays, ptrs);
    int len = (int)(it.size*cn);
    double r = 0;

    for( size_t i = 0; i < it.nplanes; i++, ++it )
        r += func( ptrs[0], ptrs[1], len );

    return r;
}
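
/* Usage sketch (illustrative only): Mat::dot flattens both arrays (all channels
   included) and accumulates the scalar product in double precision:

       cv::Mat a = (cv::Mat_<float>(1, 3) << 1, 2, 3);
       cv::Mat b = (cv::Mat_<float>(1, 3) << 4, 5, 6);
       double p = a.dot(b);                          // 1*4 + 2*5 + 3*6 = 32
*/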
}

/****************************************************************************************\
*                                       Earlier API                                      *
\****************************************************************************************/

CV_IMPL void cvGEMM( const CvArr* Aarr, const CvArr* Barr, double alpha,
                     const CvArr* Carr, double beta, CvArr* Darr, int flags )
{
    cv::Mat A = cv::cvarrToMat(Aarr), B = cv::cvarrToMat(Barr);
    cv::Mat C, D = cv::cvarrToMat(Darr);

    if( Carr )
        C = cv::cvarrToMat(Carr);

    CV_Assert_N( (D.rows == ((flags & CV_GEMM_A_T) == 0 ? A.rows : A.cols)),
        (D.cols == ((flags & CV_GEMM_B_T) == 0 ? B.cols : B.rows)),
        D.type() == A.type() );

    gemm( A, B, alpha, C, beta, D, flags );
}


CV_IMPL void
cvTransform( const CvArr* srcarr, CvArr* dstarr,
             const CvMat* transmat, const CvMat* shiftvec )
{
    cv::Mat m = cv::cvarrToMat(transmat), src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);

    if( shiftvec )
    {
        cv::Mat v = cv::cvarrToMat(shiftvec).reshape(1,m.rows),
            _m(m.rows, m.cols + 1, m.type()), m1 = _m.colRange(0,m.cols), v1 = _m.col(m.cols);
        m.convertTo(m1, m1.type());
        v.convertTo(v1, v1.type());
        m = _m;
    }

    CV_Assert_N( dst.depth() == src.depth(), dst.channels() == m.rows );
    cv::transform( src, dst, m );
}


CV_IMPL void
cvPerspectiveTransform( const CvArr* srcarr, CvArr* dstarr, const CvMat* mat )
{
    cv::Mat m = cv::cvarrToMat(mat), src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);

    CV_Assert_N( dst.type() == src.type(), dst.channels() == m.rows-1 );
    cv::perspectiveTransform( src, dst, m );
}


CV_IMPL void cvScaleAdd( const CvArr* srcarr1, CvScalar scale,
                         const CvArr* srcarr2, CvArr* dstarr )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);

    CV_Assert_N( src1.size == dst.size, src1.type() == dst.type() );
    cv::scaleAdd( src1, scale.val[0], cv::cvarrToMat(srcarr2), dst );
}


CV_IMPL void
cvCalcCovarMatrix( const CvArr** vecarr, int count,
                   CvArr* covarr, CvArr* avgarr, int flags )
{
    cv::Mat cov0 = cv::cvarrToMat(covarr), cov = cov0, mean0, mean;
    CV_Assert_N( vecarr != 0, count >= 1 );

    if( avgarr )
        mean = mean0 = cv::cvarrToMat(avgarr);

    if( (flags & CV_COVAR_COLS) != 0 || (flags & CV_COVAR_ROWS) != 0 )
    {
        cv::Mat data = cv::cvarrToMat(vecarr[0]);
        cv::calcCovarMatrix( data, cov, mean, flags, cov.type() );
    }
    else
    {
        std::vector<cv::Mat> data(count);
        for( int i = 0; i < count; i++ )
            data[i] = cv::cvarrToMat(vecarr[i]);
        cv::calcCovarMatrix( &data[0], count, cov, mean, flags, cov.type() );
    }

    if( mean.data != mean0.data && mean0.data )
        mean.convertTo(mean0, mean0.type());

    if( cov.data != cov0.data )
        cov.convertTo(cov0, cov0.type());
}


CV_IMPL double
cvMahalanobis( const CvArr* srcAarr, const CvArr* srcBarr, const CvArr* matarr )
{
    return cv::Mahalanobis(cv::cvarrToMat(srcAarr),
        cv::cvarrToMat(srcBarr), cv::cvarrToMat(matarr));
}

CV_IMPL void
cvMulTransposed( const CvArr* srcarr, CvArr* dstarr,
                 int order, const CvArr* deltaarr, double scale )
{
    cv::Mat src = cv::cvarrToMat(srcarr), dst0 = cv::cvarrToMat(dstarr), dst = dst0, delta;
    if( deltaarr )
        delta = cv::cvarrToMat(deltaarr);
    cv::mulTransposed( src, dst, order != 0, delta, scale, dst.type());
    if( dst.data != dst0.data )
        dst.convertTo(dst0, dst0.type());
}

CV_IMPL double cvDotProduct( const CvArr* srcAarr, const CvArr* srcBarr )
{
    return cv::cvarrToMat(srcAarr).dot(cv::cvarrToMat(srcBarr));
}


CV_IMPL void
cvCalcPCA( const CvArr* data_arr, CvArr* avg_arr, CvArr* eigenvals, CvArr* eigenvects, int flags )
{
    cv::Mat data = cv::cvarrToMat(data_arr), mean0 = cv::cvarrToMat(avg_arr);
    cv::Mat evals0 = cv::cvarrToMat(eigenvals), evects0 = cv::cvarrToMat(eigenvects);
    cv::Mat mean = mean0, evals = evals0, evects = evects0;

    cv::PCA pca;
    pca.mean = mean;
    pca.eigenvalues = evals;
    pca.eigenvectors = evects;

    pca(data, (flags & CV_PCA_USE_AVG) ? mean : cv::Mat(),
        flags, !evals.empty() ? evals.rows + evals.cols - 1 : 0);

    if( pca.mean.size() == mean.size() )
        pca.mean.convertTo( mean, mean.type() );
    else
    {
        cv::Mat temp; pca.mean.convertTo( temp, mean.type() );
        transpose( temp, mean );
    }

    evals = pca.eigenvalues;
    evects = pca.eigenvectors;
    int ecount0 = evals0.cols + evals0.rows - 1;
    int ecount = evals.cols + evals.rows - 1;

    CV_Assert_N( (evals0.cols == 1 || evals0.rows == 1),
        ecount0 <= ecount,
        evects0.cols == evects.cols,
        evects0.rows == ecount0 );

    cv::Mat temp = evals0;
    if( evals.rows == 1 )
        evals.colRange(0, ecount0).convertTo(temp, evals0.type());
    else
        evals.rowRange(0, ecount0).convertTo(temp, evals0.type());
    if( temp.data != evals0.data )
        transpose(temp, evals0);
    evects.rowRange(0, ecount0).convertTo( evects0, evects0.type() );

    // otherwise, some of the datatypes or sizes were incorrect and the output arrays have been reallocated
    CV_Assert( mean0.data == mean.data );
}


CV_IMPL void
cvProjectPCA( const CvArr* data_arr, const CvArr* avg_arr,
              const CvArr* eigenvects, CvArr* result_arr )
{
    cv::Mat data = cv::cvarrToMat(data_arr), mean = cv::cvarrToMat(avg_arr);
    cv::Mat evects = cv::cvarrToMat(eigenvects), dst0 = cv::cvarrToMat(result_arr), dst = dst0;

    cv::PCA pca;
    pca.mean = mean;
    int n;
    if( mean.rows == 1 )
    {
        CV_Assert_N(dst.cols <= evects.rows, dst.rows == data.rows);
        n = dst.cols;
    }
    else
    {
        CV_Assert_N(dst.rows <= evects.rows, dst.cols == data.cols);
        n = dst.rows;
    }
    pca.eigenvectors = evects.rowRange(0, n);

    cv::Mat result = pca.project(data);
    if( result.cols != dst.cols )
        result = result.reshape(1, 1);
    result.convertTo(dst, dst.type());

    CV_Assert(dst0.data == dst.data);
}


CV_IMPL void
cvBackProjectPCA( const CvArr* proj_arr, const CvArr* avg_arr,
                  const CvArr* eigenvects, CvArr* result_arr )
{
    cv::Mat data = cv::cvarrToMat(proj_arr), mean = cv::cvarrToMat(avg_arr);
    cv::Mat evects = cv::cvarrToMat(eigenvects), dst0 = cv::cvarrToMat(result_arr), dst = dst0;

    cv::PCA pca;
    pca.mean = mean;
    int n;
    if( mean.rows == 1 )
    {
        CV_Assert_N(data.cols <= evects.rows, dst.rows == data.rows);
        n = data.cols;
    }
    else
    {
        CV_Assert_N(data.rows <= evects.rows, dst.cols == data.cols);
        n = data.rows;
    }
    pca.eigenvectors = evects.rowRange(0, n);

    cv::Mat result = pca.backProject(data);
    result.convertTo(dst, dst.type());

    CV_Assert(dst0.data == dst.data);
}

/* End of file. */