CoCalc -- gfluidcore.cpp

GitHub Repository: Tetragramm/opencv
Path: blob/master/modules/gapi/src/backends/fluid/gfluidcore.cpp
¹⁶³⁴⁵ views
1
// This file is part of OpenCV project.
2
// It is subject to the license terms in the LICENSE file found in the top-level directory
3
// of this distribution and at http://opencv.org/license.html.
4
//
5
// Copyright (C) 2018 Intel Corporation
6

7
#if !defined(GAPI_STANDALONE)
8

9
#include "precomp.hpp"
10

11
#include "opencv2/gapi/own/assert.hpp"
12
#include "opencv2/core/traits.hpp"
13
#include "opencv2/core/hal/intrin.hpp"
14

15
#include "opencv2/gapi/core.hpp"
16

17
#include "opencv2/gapi/fluid/gfluidbuffer.hpp"
18
#include "opencv2/gapi/fluid/gfluidkernel.hpp"
19

20
#include "gfluidbuffer_priv.hpp"
21
#include "gfluidbackend.hpp"
22
#include "gfluidutils.hpp"
23
#include "gfluidcore.hpp"
24

25
#include <cassert>
#include <cmath>
#include <cstdlib>
#include <limits>
28

29
namespace cv {
30
namespace gapi {
31
namespace fluid {
32

33
//---------------------
34
//
35
// Arithmetic functions
36
//
37
//---------------------
38

39
template<typename DST, typename SRC1, typename SRC2>
40
static inline DST absdiff(SRC1 x, SRC2 y)
41
{
42
    auto result = x > y? x - y: y - x;
43
    return saturate<DST>(result, roundf);
44
}
45

46
template<typename DST, typename SRC1, typename SRC2>
47
static inline DST addWeighted(SRC1 src1, SRC2 src2, float alpha, float beta, float gamma)
48
{
49
    float dst = src1*alpha + src2*beta + gamma;
50
    return saturate<DST>(dst, roundf);
51
}
52

53
template<typename DST, typename SRC1, typename SRC2>
54
static inline DST add(SRC1 x, SRC2 y)
55
{
56
    return saturate<DST>(x + y, roundf);
57
}
58

59
template<typename DST, typename SRC1, typename SRC2>
60
static inline DST sub(SRC1 x, SRC2 y)
61
{
62
    return saturate<DST>(x - y, roundf);
63
}
64

65
template<typename DST, typename SRC1, typename SRC2>
66
static inline DST subr(SRC1 x, SRC2 y)
67
{
68
    return saturate<DST>(y - x, roundf); // reverse: y - x
69
}
70

71
template<typename DST, typename SRC1, typename SRC2>
72
static inline DST mul(SRC1 x, SRC2 y, float scale=1)
73
{
74
    auto result = scale * x * y;
75
    return saturate<DST>(result, rintf);
76
}
77

78
template<typename DST, typename SRC1, typename SRC2>
79
static inline DST div(SRC1 x, SRC2 y, float scale=1)
80
{
81
    // like OpenCV: returns 0, if y=0
82
    auto result = y? scale * x / y: 0;
83
    return saturate<DST>(result, rintf);
84
}
85

86
template<typename DST, typename SRC1, typename SRC2>
87
static inline DST divr(SRC1 x, SRC2 y, float scale=1)
88
{
89
    auto result = x? scale * y / x: 0; // reverse: y / x
90
    return saturate<DST>(result, rintf);
91
}
92

93
//---------------------------
94
//
95
// Fluid kernels: addWeighted
96
//
97
//---------------------------
98

99
template<typename DST, typename SRC1, typename SRC2>
100
static void run_addweighted(Buffer &dst, const View &src1, const View &src2,
101
                            double alpha, double beta, double gamma)
102
{
103
    static_assert(std::is_same<SRC1, SRC2>::value, "wrong types");
104

105
    const auto *in1 = src1.InLine<SRC1>(0);
106
    const auto *in2 = src2.InLine<SRC2>(0);
107
          auto *out = dst.OutLine<DST>();
108

109
    int width  = dst.length();
110
    int chan   = dst.meta().chan;
111
    int length = width * chan;
112

113
    // NB: assume in/out types are not 64-bits
114
    auto _alpha = static_cast<float>( alpha );
115
    auto _beta  = static_cast<float>( beta  );
116
    auto _gamma = static_cast<float>( gamma );
117

118
    for (int l=0; l < length; l++)
119
        out[l] = addWeighted<DST>(in1[l], in2[l], _alpha, _beta, _gamma);
120
}
121

122
// Fluid kernel declared for cv::gapi::core::GAddW (weighted sum of two images).
GAPI_FLUID_KERNEL(GFluidAddW, cv::gapi::core::GAddW, false)
{
    static const int Window = 1;

    // Dispatches to run_addweighted<> by the (DST, SRC1, SRC2) type combination;
    // raises StsBadArg when no listed combination is selected.
    static void run(const View &src1, double alpha, const View &src2,
                    double beta, double gamma, int /*dtype*/,
                    Buffer &dst)
    {
        // 8-bit unsigned output
        BINARY_(uchar , uchar , uchar , run_addweighted, dst, src1, src2, alpha, beta, gamma);
        BINARY_(uchar , ushort, ushort, run_addweighted, dst, src1, src2, alpha, beta, gamma);
        BINARY_(uchar ,  short,  short, run_addweighted, dst, src1, src2, alpha, beta, gamma);
        // 16-bit output
        BINARY_( short,  short,  short, run_addweighted, dst, src1, src2, alpha, beta, gamma);
        BINARY_(ushort, ushort, ushort, run_addweighted, dst, src1, src2, alpha, beta, gamma);
        // 32-bit float output
        BINARY_( float, uchar , uchar , run_addweighted, dst, src1, src2, alpha, beta, gamma);
        BINARY_( float, ushort, ushort, run_addweighted, dst, src1, src2, alpha, beta, gamma);
        BINARY_( float,  short,  short, run_addweighted, dst, src1, src2, alpha, beta, gamma);

        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
    }
};
143

144
//--------------------------
145
//
146
// Fluid kernels: +, -, *, /
147
//
148
//--------------------------
149

150
// Selector for the element-wise arithmetic operation performed by run_arithm*()
enum Arithm
{
    ARITHM_ABSDIFF,
    ARITHM_ADD,
    ARITHM_SUBTRACT,
    ARITHM_MULTIPLY,
    ARITHM_DIVIDE
};
151

152
template<typename DST, typename SRC1, typename SRC2>
153
static void run_arithm(Buffer &dst, const View &src1, const View &src2, Arithm arithm,
154
                       double scale=1)
155
{
156
    static_assert(std::is_same<SRC1, SRC2>::value, "wrong types");
157

158
    const auto *in1 = src1.InLine<SRC1>(0);
159
    const auto *in2 = src2.InLine<SRC2>(0);
160
          auto *out = dst.OutLine<DST>();
161

162
    int width  = dst.length();
163
    int chan   = dst.meta().chan;
164
    int length = width * chan;
165

166
    // NB: assume in/out types are not 64-bits
167
    float _scale = static_cast<float>( scale );
168

169
    switch (arithm)
170
    {
171
    case ARITHM_ABSDIFF:
172
        for (int l=0; l < length; l++)
173
            out[l] = absdiff<DST>(in1[l], in2[l]);
174
        break;
175
    case ARITHM_ADD:
176
        for (int l=0; l < length; l++)
177
            out[l] = add<DST>(in1[l], in2[l]);
178
        break;
179
    case ARITHM_SUBTRACT:
180
        for (int l=0; l < length; l++)
181
            out[l] = sub<DST>(in1[l], in2[l]);
182
        break;
183
    case ARITHM_MULTIPLY:
184
        for (int l=0; l < length; l++)
185
            out[l] = mul<DST>(in1[l], in2[l], _scale);
186
        break;
187
    case ARITHM_DIVIDE:
188
        for (int l=0; l < length; l++)
189
            out[l] = div<DST>(in1[l], in2[l], _scale);
190
        break;
191
    default: CV_Error(cv::Error::StsBadArg, "unsupported arithmetic operation");
192
    }
193
}
194

195
// Fluid kernel declared for cv::gapi::core::GAdd (image + image).
GAPI_FLUID_KERNEL(GFluidAdd, cv::gapi::core::GAdd, false)
{
    static const int Window = 1;

    // Dispatches to run_arithm<> with ARITHM_ADD by type combination;
    // raises StsBadArg when no listed combination is selected.
    static void run(const View &src1, const View &src2, int /*dtype*/, Buffer &dst)
    {
        // 8-bit unsigned output
        BINARY_(uchar , uchar , uchar , run_arithm, dst, src1, src2, ARITHM_ADD);
        BINARY_(uchar ,  short,  short, run_arithm, dst, src1, src2, ARITHM_ADD);
        BINARY_(uchar ,  float,  float, run_arithm, dst, src1, src2, ARITHM_ADD);
        // 16-bit signed output
        BINARY_( short,  short,  short, run_arithm, dst, src1, src2, ARITHM_ADD);
        // 32-bit float output
        BINARY_( float, uchar , uchar , run_arithm, dst, src1, src2, ARITHM_ADD);
        BINARY_( float,  short,  short, run_arithm, dst, src1, src2, ARITHM_ADD);
        BINARY_( float,  float,  float, run_arithm, dst, src1, src2, ARITHM_ADD);

        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
    }
};
213

214
// Fluid kernel declared for cv::gapi::core::GSub (image - image).
GAPI_FLUID_KERNEL(GFluidSub, cv::gapi::core::GSub, false)
{
    static const int Window = 1;

    // Dispatches to run_arithm<> with ARITHM_SUBTRACT by type combination;
    // raises StsBadArg when no listed combination is selected.
    static void run(const View &src1, const View &src2, int /*dtype*/, Buffer &dst)
    {
        // 8-bit unsigned output
        BINARY_(uchar , uchar , uchar , run_arithm, dst, src1, src2, ARITHM_SUBTRACT);
        BINARY_(uchar ,  short,  short, run_arithm, dst, src1, src2, ARITHM_SUBTRACT);
        BINARY_(uchar ,  float,  float, run_arithm, dst, src1, src2, ARITHM_SUBTRACT);
        // 16-bit signed output
        BINARY_( short,  short,  short, run_arithm, dst, src1, src2, ARITHM_SUBTRACT);
        // 32-bit float output
        BINARY_( float, uchar , uchar , run_arithm, dst, src1, src2, ARITHM_SUBTRACT);
        BINARY_( float,  short,  short, run_arithm, dst, src1, src2, ARITHM_SUBTRACT);
        BINARY_( float,  float,  float, run_arithm, dst, src1, src2, ARITHM_SUBTRACT);

        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
    }
};
232

233
// Fluid kernel declared for cv::gapi::core::GMul (scale * image * image).
GAPI_FLUID_KERNEL(GFluidMul, cv::gapi::core::GMul, false)
{
    static const int Window = 1;

    // Dispatches to run_arithm<> with ARITHM_MULTIPLY by type combination;
    // raises StsBadArg when no listed combination is selected.
    static void run(const View &src1, const View &src2, double scale, int /*dtype*/, Buffer &dst)
    {
        // 8-bit unsigned output
        BINARY_(uchar , uchar , uchar , run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
        BINARY_(uchar ,  short,  short, run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
        BINARY_(uchar ,  float,  float, run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
        // 16-bit signed output
        BINARY_( short,  short,  short, run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
        // 32-bit float output
        BINARY_( float, uchar , uchar , run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
        BINARY_( float,  short,  short, run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
        BINARY_( float,  float,  float, run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);

        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
    }
};
251

252
// Fluid kernel declared for cv::gapi::core::GDiv (scale * image / image).
GAPI_FLUID_KERNEL(GFluidDiv, cv::gapi::core::GDiv, false)
{
    static const int Window = 1;

    // Dispatches to run_arithm<> with ARITHM_DIVIDE by type combination;
    // raises StsBadArg when no listed combination is selected.
    static void run(const View &src1, const View &src2, double scale, int /*dtype*/, Buffer &dst)
    {
        // 8-bit unsigned output
        BINARY_(uchar , uchar , uchar , run_arithm, dst, src1, src2, ARITHM_DIVIDE, scale);
        BINARY_(uchar ,  short,  short, run_arithm, dst, src1, src2, ARITHM_DIVIDE, scale);
        BINARY_(uchar ,  float,  float, run_arithm, dst, src1, src2, ARITHM_DIVIDE, scale);
        // 16-bit signed output
        BINARY_( short,  short,  short, run_arithm, dst, src1, src2, ARITHM_DIVIDE, scale);
        // 32-bit float output
        BINARY_( float, uchar , uchar , run_arithm, dst, src1, src2, ARITHM_DIVIDE, scale);
        BINARY_( float,  short,  short, run_arithm, dst, src1, src2, ARITHM_DIVIDE, scale);
        BINARY_( float,  float,  float, run_arithm, dst, src1, src2, ARITHM_DIVIDE, scale);

        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
    }
};
270

271
// Fluid kernel declared for cv::gapi::core::GAbsDiff (|image - image|).
GAPI_FLUID_KERNEL(GFluidAbsDiff, cv::gapi::core::GAbsDiff, false)
{
    static const int Window = 1;

    // Dispatches to run_arithm<> with ARITHM_ABSDIFF; only same-type
    // input/output combinations are listed. Raises StsBadArg otherwise.
    static void run(const View &src1, const View &src2, Buffer &dst)
    {
        BINARY_(uchar , uchar , uchar , run_arithm, dst, src1, src2, ARITHM_ABSDIFF);
        BINARY_(ushort, ushort, ushort, run_arithm, dst, src1, src2, ARITHM_ABSDIFF);
        BINARY_( short,  short,  short, run_arithm, dst, src1, src2, ARITHM_ABSDIFF);
        BINARY_( float,  float,  float, run_arithm, dst, src1, src2, ARITHM_ABSDIFF);

        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
    }
};
286

287
//--------------------------------------
288
//
289
// Fluid kernels: +, -, *, / with Scalar
290
//
291
//--------------------------------------
292

293
// Vector (8 x 16-bit unsigned) operations usable as plain function pointers:
// add, subtract, and reversed subtract (y - x).
static inline v_uint16x8 v_add_16u(const v_uint16x8 &x, const v_uint16x8 &y)
{
    return x + y;
}
static inline v_uint16x8 v_sub_16u(const v_uint16x8 &x, const v_uint16x8 &y)
{
    return x - y;
}
static inline v_uint16x8 v_subr_16u(const v_uint16x8 &x, const v_uint16x8 &y)
{
    return y - x;
}
296

297
// Vector (4 x 32-bit float) operations usable as plain function pointers:
// add, subtract, and reversed subtract (y - x).
static inline v_float32x4 v_add_32f(const v_float32x4 &x, const v_float32x4 &y)
{
    return x + y;
}
static inline v_float32x4 v_sub_32f(const v_float32x4 &x, const v_float32x4 &y)
{
    return x - y;
}
static inline v_float32x4 v_subr_32f(const v_float32x4 &x, const v_float32x4 &y)
{
    return y - x;
}
300

301
// Scalar 8-bit unsigned operations returning the non-saturated int result:
// add, subtract, and reversed subtract (y - x).
static inline int s_add_8u(uchar x, uchar y)
{
    return x + y;
}
static inline int s_sub_8u(uchar x, uchar y)
{
    return x - y;
}
static inline int s_subr_8u(uchar x, uchar y)
{
    return y - x;
}
304

305
// Scalar 32-bit float operations: add, subtract, and reversed subtract (y - x).
static inline float s_add_32f(float x, float y)
{
    return x + y;
}
static inline float s_sub_32f(float x, float y)
{
    return x - y;
}
static inline float s_subr_32f(float x, float y)
{
    return y - x;
}
308

309
// manual SIMD if important case 8UC3
310
static void run_arithm_s3(uchar out[], const uchar in[], int width, const uchar scalar[],
311
                          v_uint16x8 (*v_op)(const v_uint16x8&, const v_uint16x8&),
312
                          int (*s_op)(uchar, uchar))
313
{
314
    int w = 0;
315

316
#if CV_SIMD128
317
    for (; w <= width-16; w+=16)
318
    {
319
        v_uint8x16 x, y, z;
320
        v_load_deinterleave(&in[3*w], x, y, z);
321

322
        v_uint16x8 r0, r1;
323

324
        v_expand(x, r0, r1);
325
        r0 = v_op(r0, v_setall_u16(scalar[0])); // x + scalar[0]
326
        r1 = v_op(r1, v_setall_u16(scalar[0]));
327
        x = v_pack(r0, r1);
328

329
        v_expand(y, r0, r1);
330
        r0 = v_op(r0, v_setall_u16(scalar[1])); // y + scalar[1]
331
        r1 = v_op(r1, v_setall_u16(scalar[1]));
332
        y = v_pack(r0, r1);
333

334
        v_expand(z, r0, r1);
335
        r0 = v_op(r0, v_setall_u16(scalar[2])); // z + scalar[2]
336
        r1 = v_op(r1, v_setall_u16(scalar[2]));
337
        z = v_pack(r0, r1);
338

339
        v_store_interleave(&out[3*w], x, y, z);
340
    }
341
#endif
342
    UNUSED(v_op);
343
    for (; w < width; w++)
344
    {
345
        out[3*w    ] = saturate<uchar>( s_op(in[3*w    ], scalar[0]) );
346
        out[3*w + 1] = saturate<uchar>( s_op(in[3*w + 1], scalar[1]) );
347
        out[3*w + 2] = saturate<uchar>( s_op(in[3*w + 2], scalar[2]) );
348
    }
349
}
350

351
// manually SIMD if rounding 32F into 8U, single channel
352
static void run_arithm_s1(uchar out[], const float in[], int width, const float scalar[],
353
                          v_float32x4 (*v_op)(const v_float32x4&, const v_float32x4&),
354
                          float (*s_op)(float, float))
355
{
356
    int w = 0;
357

358
#if CV_SIMD128
359
    for (; w <= width-16; w+=16)
360
    {
361
        v_float32x4 r0, r1, r2, r3;
362
        r0 = v_load(&in[w     ]);
363
        r1 = v_load(&in[w +  4]);
364
        r2 = v_load(&in[w +  8]);
365
        r3 = v_load(&in[w + 12]);
366

367
        r0 = v_op(r0, v_setall_f32(scalar[0])); // r + scalar[0]
368
        r1 = v_op(r1, v_setall_f32(scalar[0]));
369
        r2 = v_op(r2, v_setall_f32(scalar[0]));
370
        r3 = v_op(r3, v_setall_f32(scalar[0]));
371

372
        v_int32x4 i0, i1, i2, i3;
373
        i0 = v_round(r0);
374
        i1 = v_round(r1);
375
        i2 = v_round(r2);
376
        i3 = v_round(r3);
377

378
        v_uint16x8 us0, us1;
379
        us0 = v_pack_u(i0, i1);
380
        us1 = v_pack_u(i2, i3);
381

382
        v_uint8x16 uc;
383
        uc = v_pack(us0, us1);
384

385
        v_store(&out[w], uc);
386
    }
387
#endif
388
    UNUSED(v_op);
389
    for (; w < width; w++)
390
    {
391
        out[w] = saturate<uchar>(s_op(in[w], scalar[0]), std::roundf);
392
    }
393
}
394

395
static void run_arithm_s_add3(uchar out[], const uchar in[], int width, const uchar scalar[])
396
{
397
    run_arithm_s3(out, in, width, scalar, v_add_16u, s_add_8u);
398
}
399

400
static void run_arithm_s_sub3(uchar out[], const uchar in[], int width, const uchar scalar[])
401
{
402
    run_arithm_s3(out, in, width, scalar, v_sub_16u, s_sub_8u);
403
}
404

405
static void run_arithm_s_subr3(uchar out[], const uchar in[], int width, const uchar scalar[])
406
{
407
    run_arithm_s3(out, in, width, scalar, v_subr_16u, s_subr_8u); // reverse: subr
408
}
409

410
static void run_arithm_s_add1(uchar out[], const float in[], int width, const float scalar[])
411
{
412
    run_arithm_s1(out, in, width, scalar, v_add_32f, s_add_32f);
413
}
414

415
static void run_arithm_s_sub1(uchar out[], const float in[], int width, const float scalar[])
416
{
417
    run_arithm_s1(out, in, width, scalar, v_sub_32f, s_sub_32f);
418
}
419

420
static void run_arithm_s_subr1(uchar out[], const float in[], int width, const float scalar[])
421
{
422
    run_arithm_s1(out, in, width, scalar, v_subr_32f, s_subr_32f); // reverse: subr
423
}
424

425
// manually unroll the inner cycle by channels
426
template<typename DST, typename SRC, typename SCALAR, typename FUNC>
427
static void run_arithm_s(DST out[], const SRC in[], int width, int chan,
428
                         const SCALAR scalar[4], FUNC func)
429
{
430
    if (chan == 4)
431
    {
432
        for (int w=0; w < width; w++)
433
        {
434
            out[4*w + 0] = func(in[4*w + 0], scalar[0]);
435
            out[4*w + 1] = func(in[4*w + 1], scalar[1]);
436
            out[4*w + 2] = func(in[4*w + 2], scalar[2]);
437
            out[4*w + 3] = func(in[4*w + 3], scalar[3]);
438
        }
439
    }
440
    else
441
    if (chan == 3)
442
    {
443
        for (int w=0; w < width; w++)
444
        {
445
            out[3*w + 0] = func(in[3*w + 0], scalar[0]);
446
            out[3*w + 1] = func(in[3*w + 1], scalar[1]);
447
            out[3*w + 2] = func(in[3*w + 2], scalar[2]);
448
        }
449
    }
450
    else
451
    if (chan == 2)
452
    {
453
        for (int w=0; w < width; w++)
454
        {
455
            out[2*w + 0] = func(in[2*w + 0], scalar[0]);
456
            out[2*w + 1] = func(in[2*w + 1], scalar[1]);
457
        }
458
    }
459
    else
460
    if (chan == 1)
461
    {
462
        for (int w=0; w < width; w++)
463
        {
464
            out[w] = func(in[w], scalar[0]);
465
        }
466
    }
467
    else
468
        CV_Error(cv::Error::StsBadArg, "unsupported number of channels");
469
}
470

471
template<typename DST, typename SRC>
472
static void run_arithm_s(Buffer &dst, const View &src, const float scalar[4], Arithm arithm,
473
                         float scale=1)
474
{
475
    const auto *in  = src.InLine<SRC>(0);
476
          auto *out = dst.OutLine<DST>();
477

478
    int width  = dst.length();
479
    int chan   = dst.meta().chan;
480

481
    // What if we cast the scalar into the SRC type?
482
    const SRC myscal[4] = { static_cast<SRC>(scalar[0]), static_cast<SRC>(scalar[1]),
483
                            static_cast<SRC>(scalar[2]), static_cast<SRC>(scalar[3]) };
484
    bool usemyscal = (myscal[0] == scalar[0]) && (myscal[1] == scalar[1]) &&
485
                     (myscal[2] == scalar[2]) && (myscal[3] == scalar[3]);
486

487
    switch (arithm)
488
    {
489
    case ARITHM_ABSDIFF:
490
        for (int w=0; w < width; w++)
491
            for (int c=0; c < chan; c++)
492
                out[chan*w + c] = absdiff<DST>(in[chan*w + c], scalar[c]);
493
        break;
494
    case ARITHM_ADD:
495
        if (usemyscal)
496
        {
497
            if (std::is_same<DST,uchar>::value &&
498
                std::is_same<SRC,uchar>::value &&
499
                chan == 3)
500
                run_arithm_s_add3((uchar*)out, (const uchar*)in, width, (const uchar*)myscal);
501
            else if (std::is_same<DST,uchar>::value &&
502
                     std::is_same<SRC,float>::value &&
503
                     chan == 1)
504
                run_arithm_s_add1((uchar*)out, (const float*)in, width, (const float*)myscal);
505
            else
506
                run_arithm_s(out, in, width, chan, myscal, add<DST,SRC,SRC>);
507
        }
508
        else
509
            run_arithm_s(out, in, width, chan, scalar, add<DST,SRC,float>);
510
        break;
511
    case ARITHM_SUBTRACT:
512
        if (usemyscal)
513
        {
514
            if (std::is_same<DST,uchar>::value &&
515
                std::is_same<SRC,uchar>::value &&
516
                chan == 3)
517
                run_arithm_s_sub3((uchar*)out, (const uchar*)in, width, (const uchar*)myscal);
518
            else if (std::is_same<DST,uchar>::value &&
519
                     std::is_same<SRC,float>::value &&
520
                     chan == 1)
521
                run_arithm_s_sub1((uchar*)out, (const float*)in, width, (const float*)myscal);
522
            else
523
                run_arithm_s(out, in, width, chan, myscal, sub<DST,SRC,SRC>);
524
        }
525
        else
526
            run_arithm_s(out, in, width, chan, scalar, sub<DST,SRC,float>);
527
        break;
528
    // TODO: optimize miltiplication and division
529
    case ARITHM_MULTIPLY:
530
        for (int w=0; w < width; w++)
531
            for (int c=0; c < chan; c++)
532
                out[chan*w + c] = mul<DST>(in[chan*w + c], scalar[c], scale);
533
        break;
534
    case ARITHM_DIVIDE:
535
        for (int w=0; w < width; w++)
536
            for (int c=0; c < chan; c++)
537
                out[chan*w + c] = div<DST>(in[chan*w + c], scalar[c], scale);
538
        break;
539
    default: CV_Error(cv::Error::StsBadArg, "unsupported arithmetic operation");
540
    }
541
}
542

543
template<typename DST, typename SRC>
544
static void run_arithm_rs(Buffer &dst, const View &src, const float scalar[4], Arithm arithm,
545
                          float scale=1)
546
{
547
    const auto *in  = src.InLine<SRC>(0);
548
          auto *out = dst.OutLine<DST>();
549

550
    int width  = dst.length();
551
    int chan   = dst.meta().chan;
552

553
    // What if we cast the scalar into the SRC type?
554
    const SRC myscal[4] = { static_cast<SRC>(scalar[0]), static_cast<SRC>(scalar[1]),
555
                            static_cast<SRC>(scalar[2]), static_cast<SRC>(scalar[3]) };
556
    bool usemyscal = (myscal[0] == scalar[0]) && (myscal[1] == scalar[1]) &&
557
                     (myscal[2] == scalar[2]) && (myscal[3] == scalar[3]);
558

559
    switch (arithm)
560
    {
561
    case ARITHM_SUBTRACT:
562
        if (usemyscal)
563
        {
564
            if (std::is_same<DST,uchar>::value &&
565
                std::is_same<SRC,uchar>::value &&
566
                chan == 3)
567
                run_arithm_s_subr3((uchar*)out, (const uchar*)in, width, (const uchar*)myscal);
568
            else if (std::is_same<DST,uchar>::value &&
569
                     std::is_same<SRC,float>::value &&
570
                     chan == 1)
571
                run_arithm_s_subr1((uchar*)out, (const float*)in, width, (const float*)myscal);
572
            else
573
                run_arithm_s(out, in, width, chan, myscal, subr<DST,SRC,SRC>);
574
        }
575
        else
576
            run_arithm_s(out, in, width, chan, scalar, subr<DST,SRC,float>);
577
        break;
578
    // TODO: optimize division
579
    case ARITHM_DIVIDE:
580
        for (int w=0; w < width; w++)
581
            for (int c=0; c < chan; c++)
582
                out[chan*w + c] = div<DST>(scalar[c], in[chan*w + c], scale);
583
        break;
584
    default: CV_Error(cv::Error::StsBadArg, "unsupported arithmetic operation");
585
    }
586
}
587

588
// Fluid kernel declared for cv::gapi::core::GAbsDiffC (|image - scalar|).
GAPI_FLUID_KERNEL(GFluidAbsDiffC, cv::gapi::core::GAbsDiffC, false)
{
    static const int Window = 1;

    // Converts the scalar to float and dispatches to run_arithm_s<> with
    // ARITHM_ABSDIFF; raises StsBadArg when no listed combination is selected.
    static void run(const View &src, const cv::Scalar &_scalar, Buffer &dst)
    {
        float scalar[4];
        for (int i = 0; i < 4; i++)
            scalar[i] = static_cast<float>(_scalar[i]);

        UNARY_(uchar , uchar , run_arithm_s, dst, src, scalar, ARITHM_ABSDIFF);
        UNARY_(ushort, ushort, run_arithm_s, dst, src, scalar, ARITHM_ABSDIFF);
        UNARY_( short,  short, run_arithm_s, dst, src, scalar, ARITHM_ABSDIFF);

        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
    }
};
609

610
// Fluid kernel declared for cv::gapi::core::GAddC (image + scalar).
GAPI_FLUID_KERNEL(GFluidAddC, cv::gapi::core::GAddC, false)
{
    static const int Window = 1;

    // Converts the scalar to float and dispatches to run_arithm_s<> with
    // ARITHM_ADD; raises StsBadArg when no listed combination is selected.
    static void run(const View &src, const cv::Scalar &_scalar, int /*dtype*/, Buffer &dst)
    {
        float scalar[4];
        for (int i = 0; i < 4; i++)
            scalar[i] = static_cast<float>(_scalar[i]);

        // 8-bit unsigned output
        UNARY_(uchar , uchar , run_arithm_s, dst, src, scalar, ARITHM_ADD);
        UNARY_(uchar ,  short, run_arithm_s, dst, src, scalar, ARITHM_ADD);
        UNARY_(uchar ,  float, run_arithm_s, dst, src, scalar, ARITHM_ADD);
        // 16-bit signed output
        UNARY_( short,  short, run_arithm_s, dst, src, scalar, ARITHM_ADD);
        // 32-bit float output
        UNARY_( float, uchar , run_arithm_s, dst, src, scalar, ARITHM_ADD);
        UNARY_( float,  short, run_arithm_s, dst, src, scalar, ARITHM_ADD);
        UNARY_( float,  float, run_arithm_s, dst, src, scalar, ARITHM_ADD);

        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
    }
};
635

636
// Fluid kernel declared for cv::gapi::core::GSubC (image - scalar).
GAPI_FLUID_KERNEL(GFluidSubC, cv::gapi::core::GSubC, false)
{
    static const int Window = 1;

    // Converts the scalar to float and dispatches to run_arithm_s<> with
    // ARITHM_SUBTRACT; raises StsBadArg when no listed combination is selected.
    static void run(const View &src, const cv::Scalar &_scalar, int /*dtype*/, Buffer &dst)
    {
        float scalar[4];
        for (int i = 0; i < 4; i++)
            scalar[i] = static_cast<float>(_scalar[i]);

        // 8-bit unsigned output
        UNARY_(uchar , uchar , run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
        UNARY_(uchar ,  short, run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
        UNARY_(uchar ,  float, run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
        // 16-bit signed output
        UNARY_( short,  short, run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
        // 32-bit float output
        UNARY_( float, uchar , run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
        UNARY_( float,  short, run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
        UNARY_( float,  float, run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);

        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
    }
};
661

662
// Fluid kernel declared for cv::gapi::core::GSubRC (scalar - image).
GAPI_FLUID_KERNEL(GFluidSubRC, cv::gapi::core::GSubRC, false)
{
    static const int Window = 1;

    // Converts the scalar to float and dispatches to run_arithm_rs<> (reversed
    // operands) with ARITHM_SUBTRACT; raises StsBadArg when no listed
    // combination is selected.
    static void run(const cv::Scalar &_scalar, const View &src, int /*dtype*/, Buffer &dst)
    {
        float scalar[4];
        for (int i = 0; i < 4; i++)
            scalar[i] = static_cast<float>(_scalar[i]);

        // 8-bit unsigned output
        UNARY_(uchar , uchar , run_arithm_rs, dst, src, scalar, ARITHM_SUBTRACT);
        UNARY_(uchar ,  short, run_arithm_rs, dst, src, scalar, ARITHM_SUBTRACT);
        UNARY_(uchar ,  float, run_arithm_rs, dst, src, scalar, ARITHM_SUBTRACT);
        // 16-bit signed output
        UNARY_( short,  short, run_arithm_rs, dst, src, scalar, ARITHM_SUBTRACT);
        // 32-bit float output
        UNARY_( float, uchar , run_arithm_rs, dst, src, scalar, ARITHM_SUBTRACT);
        UNARY_( float,  short, run_arithm_rs, dst, src, scalar, ARITHM_SUBTRACT);
        UNARY_( float,  float, run_arithm_rs, dst, src, scalar, ARITHM_SUBTRACT);

        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
    }
};
687

688
// Fluid kernel declared for cv::gapi::core::GMulC (image * scalar).
GAPI_FLUID_KERNEL(GFluidMulC, cv::gapi::core::GMulC, false)
{
    static const int Window = 1;

    // Converts the scalar to float and dispatches to run_arithm_s<> with
    // ARITHM_MULTIPLY and a fixed scale of 1; raises StsBadArg when no listed
    // combination is selected.
    static void run(const View &src, const cv::Scalar &_scalar, int /*dtype*/, Buffer &dst)
    {
        float scalar[4];
        for (int i = 0; i < 4; i++)
            scalar[i] = static_cast<float>(_scalar[i]);

        const float scale = 1.f;

        // 8-bit unsigned output
        UNARY_(uchar , uchar , run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale);
        UNARY_(uchar ,  short, run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale);
        UNARY_(uchar ,  float, run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale);
        // 16-bit signed output
        UNARY_( short,  short, run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale);
        // 32-bit float output
        UNARY_( float, uchar , run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale);
        UNARY_( float,  short, run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale);
        UNARY_( float,  float, run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale);

        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
    }
};
714

715
// Fluid kernel declared for cv::gapi::core::GMulCOld (image * one double value).
GAPI_FLUID_KERNEL(GFluidMulCOld, cv::gapi::core::GMulCOld, false)
{
    static const int Window = 1;

    // Replicates the single multiplier across all four channel slots and
    // dispatches to run_arithm_s<> with ARITHM_MULTIPLY and a fixed scale of 1;
    // raises StsBadArg when no listed combination is selected.
    static void run(const View &src, double _scalar, int /*dtype*/, Buffer &dst)
    {
        const float multiplier = static_cast<float>(_scalar);
        const float scalar[4] = { multiplier, multiplier, multiplier, multiplier };
        const float scale = 1.f;

        // 8-bit unsigned output
        UNARY_(uchar , uchar , run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale);
        UNARY_(uchar ,  short, run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale);
        UNARY_(uchar ,  float, run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale);
        // 16-bit signed output
        UNARY_( short,  short, run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale);
        // 32-bit float output
        UNARY_( float, uchar , run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale);
        UNARY_( float,  short, run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale);
        UNARY_( float,  float, run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale);

        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
    }
};
741

742
// Fluid kernel declared for cv::gapi::core::GDivC (scale * image / scalar).
GAPI_FLUID_KERNEL(GFluidDivC, cv::gapi::core::GDivC, false)
{
    static const int Window = 1;

    // Converts the scalar and scale to float and dispatches to run_arithm_s<>
    // with ARITHM_DIVIDE; raises StsBadArg when no listed combination is selected.
    static void run(const View &src, const cv::Scalar &_scalar, double _scale, int /*dtype*/,
                    Buffer &dst)
    {
        float scalar[4];
        for (int i = 0; i < 4; i++)
            scalar[i] = static_cast<float>(_scalar[i]);

        const float scale = static_cast<float>(_scale);

        // 8-bit unsigned output
        UNARY_(uchar , uchar , run_arithm_s, dst, src, scalar, ARITHM_DIVIDE, scale);
        UNARY_(uchar ,  short, run_arithm_s, dst, src, scalar, ARITHM_DIVIDE, scale);
        UNARY_(uchar ,  float, run_arithm_s, dst, src, scalar, ARITHM_DIVIDE, scale);
        // 16-bit signed output
        UNARY_( short,  short, run_arithm_s, dst, src, scalar, ARITHM_DIVIDE, scale);
        // 32-bit float output
        UNARY_( float, uchar , run_arithm_s, dst, src, scalar, ARITHM_DIVIDE, scale);
        UNARY_( float,  short, run_arithm_s, dst, src, scalar, ARITHM_DIVIDE, scale);
        UNARY_( float,  float, run_arithm_s, dst, src, scalar, ARITHM_DIVIDE, scale);

        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
    }
};
769

770
// Fluid kernel declared for cv::gapi::core::GDivRC (scale * scalar / image).
GAPI_FLUID_KERNEL(GFluidDivRC, cv::gapi::core::GDivRC, false)
{
    static const int Window = 1;

    // Converts the scalar and scale to float and dispatches to run_arithm_rs<>
    // (reversed operands) with ARITHM_DIVIDE; raises StsBadArg when no listed
    // combination is selected.
    static void run(const cv::Scalar &_scalar, const View &src, double _scale, int /*dtype*/,
                    Buffer &dst)
    {
        float scalar[4];
        for (int i = 0; i < 4; i++)
            scalar[i] = static_cast<float>(_scalar[i]);

        const float scale = static_cast<float>(_scale);

        // 8-bit unsigned output
        UNARY_(uchar , uchar , run_arithm_rs, dst, src, scalar, ARITHM_DIVIDE, scale);
        UNARY_(uchar ,  short, run_arithm_rs, dst, src, scalar, ARITHM_DIVIDE, scale);
        UNARY_(uchar ,  float, run_arithm_rs, dst, src, scalar, ARITHM_DIVIDE, scale);
        // 16-bit signed output
        UNARY_( short,  short, run_arithm_rs, dst, src, scalar, ARITHM_DIVIDE, scale);
        // 32-bit float output
        UNARY_( float, uchar , run_arithm_rs, dst, src, scalar, ARITHM_DIVIDE, scale);
        UNARY_( float,  short, run_arithm_rs, dst, src, scalar, ARITHM_DIVIDE, scale);
        UNARY_( float,  float, run_arithm_rs, dst, src, scalar, ARITHM_DIVIDE, scale);

        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
    }
};
797

798
//----------------------------
799
//
800
// Fluid math kernels: bitwise
801
//
802
//----------------------------
803

804
// Selector for the bitwise operation performed by run_bitwise1/run_bitwise2
enum Bitwise
{
    BW_AND, // out = in1 & in2
    BW_OR,  // out = in1 | in2
    BW_XOR, // out = in1 ^ in2
    BW_NOT  // out = ~in
};
805

806
template<typename DST, typename SRC1, typename SRC2>
807
static void run_bitwise2(Buffer &dst, const View &src1, const View &src2, Bitwise bitwise)
808
{
809
    static_assert(std::is_same<DST, SRC1>::value, "wrong types");
810
    static_assert(std::is_same<DST, SRC2>::value, "wrong types");
811

812
    const auto *in1 = src1.InLine<SRC1>(0);
813
    const auto *in2 = src2.InLine<SRC2>(0);
814
          auto *out = dst.OutLine<DST>();
815

816
    int width  = dst.length();
817
    int chan   = dst.meta().chan;
818
    int length = width * chan;
819

820
    switch (bitwise)
821
    {
822
    case BW_AND:
823
        for (int l=0; l < length; l++)
824
            out[l] = in1[l] & in2[l];
825
        break;
826
    case BW_OR:
827
        for (int l=0; l < length; l++)
828
            out[l] = in1[l] | in2[l];
829
        break;
830
    case BW_XOR:
831
        for (int l=0; l < length; l++)
832
            out[l] = in1[l] ^ in2[l];
833
        break;
834
    default: CV_Error(cv::Error::StsBadArg, "unsupported bitwise operation");
835
    }
836
}
837

838
template<typename DST, typename SRC>
839
static void run_bitwise1(Buffer &dst, const View &src, Bitwise bitwise)
840
{
841
    static_assert(std::is_same<DST, SRC>::value, "wrong types");
842

843
    const auto *in  = src.InLine<SRC>(0);
844
          auto *out = dst.OutLine<DST>();
845

846
    int width  = dst.length();
847
    int chan   = dst.meta().chan;
848
    int length = width * chan;
849

850
    switch (bitwise)
851
    {
852
    case BW_NOT:
853
        for (int l=0; l < length; l++)
854
            out[l] = ~in[l];
855
        break;
856
    default: CV_Error(cv::Error::StsBadArg, "unsupported bitwise operation");
857
    }
858
}
859

860
GAPI_FLUID_KERNEL(GFluidAnd, cv::gapi::core::GAnd, false)
{
    static const int Window = 1;

    // Bitwise AND of two inputs: forwards to run_bitwise2 with BW_AND
    static void run(const View &src1, const View &src2, Buffer &dst)
    {
        // Supported types: dst and both sources share the same element type
        BINARY_(uchar,  uchar,  uchar,  run_bitwise2, dst, src1, src2, BW_AND);
        BINARY_(ushort, ushort, ushort, run_bitwise2, dst, src1, src2, BW_AND);
        BINARY_(short,  short,  short,  run_bitwise2, dst, src1, src2, BW_AND);

        // No combination above handled the call
        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
    }
};
875

876
GAPI_FLUID_KERNEL(GFluidOr, cv::gapi::core::GOr, false)
{
    static const int Window = 1;

    // Bitwise OR of two inputs: forwards to run_bitwise2 with BW_OR
    static void run(const View &src1, const View &src2, Buffer &dst)
    {
        // Supported types: dst and both sources share the same element type
        BINARY_(uchar,  uchar,  uchar,  run_bitwise2, dst, src1, src2, BW_OR);
        BINARY_(ushort, ushort, ushort, run_bitwise2, dst, src1, src2, BW_OR);
        BINARY_(short,  short,  short,  run_bitwise2, dst, src1, src2, BW_OR);

        // No combination above handled the call
        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
    }
};
891

892
GAPI_FLUID_KERNEL(GFluidXor, cv::gapi::core::GXor, false)
{
    static const int Window = 1;

    // Bitwise XOR of two inputs: forwards to run_bitwise2 with BW_XOR
    static void run(const View &src1, const View &src2, Buffer &dst)
    {
        // Supported types: dst and both sources share the same element type
        BINARY_(uchar,  uchar,  uchar,  run_bitwise2, dst, src1, src2, BW_XOR);
        BINARY_(ushort, ushort, ushort, run_bitwise2, dst, src1, src2, BW_XOR);
        BINARY_(short,  short,  short,  run_bitwise2, dst, src1, src2, BW_XOR);

        // No combination above handled the call
        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
    }
};
907

908
GAPI_FLUID_KERNEL(GFluidNot, cv::gapi::core::GNot, false)
{
    static const int Window = 1;

    // Bitwise NOT of a single input: forwards to run_bitwise1 with BW_NOT
    static void run(const View &src, Buffer &dst)
    {
        // Supported types: dst and src share the same element type
        UNARY_(uchar,  uchar,  run_bitwise1, dst, src, BW_NOT);
        UNARY_(ushort, ushort, run_bitwise1, dst, src, BW_NOT);
        UNARY_(short,  short,  run_bitwise1, dst, src, BW_NOT);

        // No combination above handled the call
        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
    }
};
922

923
//-------------------
924
//
925
// Fluid kernels: LUT
926
//
927
//-------------------
928

929
GAPI_FLUID_KERNEL(GFluidLUT, cv::gapi::core::GLUT, false)
{
    static const int Window = 1;

    // Look-up-table transform for 8-bit data: every input byte is replaced
    // by the table entry it indexes. The table bytes are read through lut.data.
    static void run(const View &src, const cv::Mat& lut, Buffer &dst)
    {
        // both the input and the output must be 8-bit unsigned
        GAPI_Assert(CV_8U == dst.meta().depth);
        GAPI_Assert(CV_8U == src.meta().depth);

        // debug-only sanity checks: 256-entry 8-bit table, matching line geometry
        GAPI_DbgAssert(CV_8U == lut.type());
        GAPI_DbgAssert(256 == lut.cols * lut.rows);
        GAPI_DbgAssert(dst.length() == src.length());
        GAPI_DbgAssert(dst.meta().chan == src.meta().chan);

        const auto *src_line = src.InLine<uchar>(0);
              auto *dst_line = dst.OutLine<uchar>();

        // number of elements in the line: pixels times channels
        const int total = dst.length() * dst.meta().chan;

        for (int i = 0; i < total; ++i)
        {
            dst_line[i] = lut.data[ src_line[i] ];
        }
    }
};
954

955
//-------------------------
956
//
957
// Fluid kernels: convertTo
958
//
959
//-------------------------
960

961
template<typename DST, typename SRC>
962
static void run_convertto(Buffer &dst, const View &src, double _alpha, double _beta)
963
{
964
    const auto *in  = src.InLine<SRC>(0);
965
          auto *out = dst.OutLine<DST>();
966

967
    int width  = dst.length();
968
    int chan   = dst.meta().chan;
969
    int length = width * chan;
970

971
    // NB: don't do this if SRC or DST is 64-bit
972
    auto alpha = static_cast<float>( _alpha );
973
    auto beta  = static_cast<float>( _beta  );
974

975
    // compute faster if no alpha no beta
976
    if (alpha == 1 && beta == 0)
977
    {
978
        // manual SIMD if need rounding
979
        if (std::is_integral<DST>::value && std::is_floating_point<SRC>::value)
980
        {
981
            GAPI_Assert(( std::is_same<SRC,float>::value ));
982

983
            int l = 0; // cycle index
984

985
        #if CV_SIMD128
986
            if (std::is_same<DST,uchar>::value)
987
            {
988
                for (; l <= length-16; l+=16)
989
                {
990
                    v_int32x4 i0, i1, i2, i3;
991
                    i0 = v_round( v_load( (float*)& in[l     ] ) );
992
                    i1 = v_round( v_load( (float*)& in[l +  4] ) );
993
                    i2 = v_round( v_load( (float*)& in[l +  8] ) );
994
                    i3 = v_round( v_load( (float*)& in[l + 12] ) );
995

996
                    v_uint16x8 us0, us1;
997
                    us0 = v_pack_u(i0, i1);
998
                    us1 = v_pack_u(i2, i3);
999

1000
                    v_uint8x16 uc;
1001
                    uc = v_pack(us0, us1);
1002
                    v_store((uchar*)& out[l], uc);
1003
                }
1004
            }
1005
            if (std::is_same<DST,ushort>::value)
1006
            {
1007
                for (; l <= length-8; l+=8)
1008
                {
1009
                    v_int32x4 i0, i1;
1010
                    i0 = v_round( v_load( (float*)& in[l     ] ) );
1011
                    i1 = v_round( v_load( (float*)& in[l +  4] ) );
1012

1013
                    v_uint16x8 us;
1014
                    us = v_pack_u(i0, i1);
1015
                    v_store((ushort*)& out[l], us);
1016
                }
1017
            }
1018
        #endif
1019

1020
            // tail of SIMD cycle
1021
            for (; l < length; l++)
1022
            {
1023
                out[l] = saturate<DST>(in[l], rintf);
1024
            }
1025
        }
1026
        else if (std::is_integral<DST>::value) // here SRC is integral
1027
        {
1028
            for (int l=0; l < length; l++)
1029
            {
1030
                out[l] = saturate<DST>(in[l]);
1031
            }
1032
        }
1033
        else // DST is floating-point, SRC is any
1034
        {
1035
            for (int l=0; l < length; l++)
1036
            {
1037
                out[l] = static_cast<DST>(in[l]);
1038
            }
1039
        }
1040
    }
1041
    else // if alpha or beta is non-trivial
1042
    {
1043
        // TODO: optimize if alpha and beta and data are integral
1044
        for (int l=0; l < length; l++)
1045
        {
1046
            out[l] = saturate<DST>(in[l]*alpha + beta, rintf);
1047
        }
1048
    }
1049
}
1050

1051
GAPI_FLUID_KERNEL(GFluidConvertTo, cv::gapi::core::GConvertTo, false)
{
    static const int Window = 1;

    // Depth conversion with optional scaling: forwards to run_convertto.
    // The rtype argument is not used here.
    static void run(const View &src, int /*rtype*/, double alpha, double beta, Buffer &dst)
    {
        // Supported (DST, SRC) pairs: every combination of uchar, ushort and float
        UNARY_(uchar,  uchar,  run_convertto, dst, src, alpha, beta);
        UNARY_(uchar,  ushort, run_convertto, dst, src, alpha, beta);
        UNARY_(uchar,  float,  run_convertto, dst, src, alpha, beta);
        UNARY_(ushort, uchar,  run_convertto, dst, src, alpha, beta);
        UNARY_(ushort, ushort, run_convertto, dst, src, alpha, beta);
        UNARY_(ushort, float,  run_convertto, dst, src, alpha, beta);
        UNARY_(float,  uchar,  run_convertto, dst, src, alpha, beta);
        UNARY_(float,  ushort, run_convertto, dst, src, alpha, beta);
        UNARY_(float,  float,  run_convertto, dst, src, alpha, beta);

        // No pair above handled the call
        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
    }
};
1071

1072
//-----------------------------
1073
//
1074
// Fluid math kernels: min, max
1075
//
1076
//-----------------------------
1077

1078
// Selector for the operation performed by run_minmax
enum Minmax
{
    MM_MIN, // element-wise minimum
    MM_MAX  // element-wise maximum
};
1079

1080
template<typename DST, typename SRC1, typename SRC2>
1081
static void run_minmax(Buffer &dst, const View &src1, const View &src2, Minmax minmax)
1082
{
1083
    static_assert(std::is_same<DST, SRC1>::value, "wrong types");
1084
    static_assert(std::is_same<DST, SRC2>::value, "wrong types");
1085

1086
    const auto *in1 = src1.InLine<SRC1>(0);
1087
    const auto *in2 = src2.InLine<SRC2>(0);
1088
          auto *out = dst.OutLine<DST>();
1089

1090
    int width = dst.length();
1091
    int chan  = dst.meta().chan;
1092

1093
    int length = width * chan;
1094

1095
    switch (minmax)
1096
    {
1097
    case MM_MIN:
1098
        for (int l=0; l < length; l++)
1099
            out[l] = in1[l] < in2[l]? in1[l]: in2[l];
1100
        break;
1101
    case MM_MAX:
1102
        for (int l=0; l < length; l++)
1103
            out[l] = in1[l] > in2[l]? in1[l]: in2[l];
1104
        break;
1105
    default: CV_Error(cv::Error::StsBadArg, "unsupported min/max operation");
1106
    }
1107
}
1108

1109
GAPI_FLUID_KERNEL(GFluidMin, cv::gapi::core::GMin, false)
{
    static const int Window = 1;

    // Element-wise minimum of two inputs: forwards to run_minmax with MM_MIN
    static void run(const View &src1, const View &src2, Buffer &dst)
    {
        // Supported types: dst and both sources share the same element type
        BINARY_(uchar,  uchar,  uchar,  run_minmax, dst, src1, src2, MM_MIN);
        BINARY_(ushort, ushort, ushort, run_minmax, dst, src1, src2, MM_MIN);
        BINARY_(short,  short,  short,  run_minmax, dst, src1, src2, MM_MIN);
        BINARY_(float,  float,  float,  run_minmax, dst, src1, src2, MM_MIN);

        // No combination above handled the call
        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
    }
};
1124

1125
GAPI_FLUID_KERNEL(GFluidMax, cv::gapi::core::GMax, false)
{
    static const int Window = 1;

    // Element-wise maximum of two inputs: forwards to run_minmax with MM_MAX
    static void run(const View &src1, const View &src2, Buffer &dst)
    {
        // Supported types: dst and both sources share the same element type
        BINARY_(uchar,  uchar,  uchar,  run_minmax, dst, src1, src2, MM_MAX);
        BINARY_(ushort, ushort, ushort, run_minmax, dst, src1, src2, MM_MAX);
        BINARY_(short,  short,  short,  run_minmax, dst, src1, src2, MM_MAX);
        BINARY_(float,  float,  float,  run_minmax, dst, src1, src2, MM_MAX);

        // No combination above handled the call
        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
    }
};
1140

1141
//-----------------------
1142
//
1143
// Fluid kernels: compare
1144
//
1145
//-----------------------
1146

1147
// Selector for the comparison performed by the run_cmp overloads
enum Compare
{
    CMP_EQ, // ==
    CMP_NE, // !=
    CMP_GE, // >=
    CMP_GT, // >
    CMP_LE, // <=
    CMP_LT  // <
};
1148

1149
template<typename DST, typename SRC1, typename SRC2>
1150
static void run_cmp(Buffer &dst, const View &src1, const View &src2, Compare compare)
1151
{
1152
    static_assert(std::is_same<SRC1, SRC2>::value, "wrong types");
1153
    static_assert(std::is_same<DST, uchar>::value, "wrong types");
1154

1155
    const auto *in1 = src1.InLine<SRC1>(0);
1156
    const auto *in2 = src2.InLine<SRC2>(0);
1157
          auto *out = dst.OutLine<DST>();
1158

1159
    int width = dst.length();
1160
    int chan  = dst.meta().chan;
1161

1162
    int length = width * chan;
1163

1164
    switch (compare)
1165
    {
1166
    case CMP_EQ:
1167
        for (int l=0; l < length; l++)
1168
            out[l] = in1[l] == in2[l]? 255: 0;
1169
        break;
1170
    case CMP_NE:
1171
        for (int l=0; l < length; l++)
1172
            out[l] = in1[l] != in2[l]? 255: 0;
1173
        break;
1174
    case CMP_GE:
1175
        for (int l=0; l < length; l++)
1176
            out[l] = in1[l] >= in2[l]? 255: 0;
1177
        break;
1178
    case CMP_LE:
1179
        for (int l=0; l < length; l++)
1180
            out[l] = in1[l] <= in2[l]? 255: 0;
1181
        break;
1182
    case CMP_GT:
1183
        for (int l=0; l < length; l++)
1184
            out[l] = in1[l] > in2[l]? 255: 0;
1185
        break;
1186
    case CMP_LT:
1187
        for (int l=0; l < length; l++)
1188
            out[l] = in1[l] < in2[l]? 255: 0;
1189
        break;
1190
    default:
1191
        CV_Error(cv::Error::StsBadArg, "unsupported compare operation");
1192
    }
1193
}
1194

1195
GAPI_FLUID_KERNEL(GFluidCmpEQ, cv::gapi::core::GCmpEQ, false)
{
    static const int Window = 1;

    // Element-wise "equal" of two inputs: forwards to run_cmp with CMP_EQ
    static void run(const View &src1, const View &src2, Buffer &dst)
    {
        // Output is always uchar; both sources share one element type
        BINARY_(uchar, uchar, uchar, run_cmp, dst, src1, src2, CMP_EQ);
        BINARY_(uchar, short, short, run_cmp, dst, src1, src2, CMP_EQ);
        BINARY_(uchar, float, float, run_cmp, dst, src1, src2, CMP_EQ);

        // No combination above handled the call
        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
    }
};
1209

1210
GAPI_FLUID_KERNEL(GFluidCmpNE, cv::gapi::core::GCmpNE, false)
{
    static const int Window = 1;

    // Element-wise "not equal" of two inputs: forwards to run_cmp with CMP_NE
    static void run(const View &src1, const View &src2, Buffer &dst)
    {
        // Output is always uchar; both sources share one element type
        BINARY_(uchar, uchar, uchar, run_cmp, dst, src1, src2, CMP_NE);
        BINARY_(uchar, short, short, run_cmp, dst, src1, src2, CMP_NE);
        BINARY_(uchar, float, float, run_cmp, dst, src1, src2, CMP_NE);

        // No combination above handled the call
        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
    }
};
1224

1225
GAPI_FLUID_KERNEL(GFluidCmpGE, cv::gapi::core::GCmpGE, false)
{
    static const int Window = 1;

    // Element-wise "greater or equal" of two inputs: forwards to run_cmp with CMP_GE
    static void run(const View &src1, const View &src2, Buffer &dst)
    {
        // Output is always uchar; both sources share one element type
        BINARY_(uchar, uchar, uchar, run_cmp, dst, src1, src2, CMP_GE);
        BINARY_(uchar, short, short, run_cmp, dst, src1, src2, CMP_GE);
        BINARY_(uchar, float, float, run_cmp, dst, src1, src2, CMP_GE);

        // No combination above handled the call
        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
    }
};
1239

1240
GAPI_FLUID_KERNEL(GFluidCmpGT, cv::gapi::core::GCmpGT, false)
{
    static const int Window = 1;

    // Element-wise "greater than" of two inputs: forwards to run_cmp with CMP_GT
    static void run(const View &src1, const View &src2, Buffer &dst)
    {
        // Output is always uchar; both sources share one element type
        BINARY_(uchar, uchar, uchar, run_cmp, dst, src1, src2, CMP_GT);
        BINARY_(uchar, short, short, run_cmp, dst, src1, src2, CMP_GT);
        BINARY_(uchar, float, float, run_cmp, dst, src1, src2, CMP_GT);

        // No combination above handled the call
        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
    }
};
1254

1255
GAPI_FLUID_KERNEL(GFluidCmpLE, cv::gapi::core::GCmpLE, false)
{
    static const int Window = 1;

    // Element-wise "less or equal" of two inputs: forwards to run_cmp with CMP_LE
    static void run(const View &src1, const View &src2, Buffer &dst)
    {
        // Output is always uchar; both sources share one element type
        BINARY_(uchar, uchar, uchar, run_cmp, dst, src1, src2, CMP_LE);
        BINARY_(uchar, short, short, run_cmp, dst, src1, src2, CMP_LE);
        BINARY_(uchar, float, float, run_cmp, dst, src1, src2, CMP_LE);

        // No combination above handled the call
        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
    }
};
1269

1270
GAPI_FLUID_KERNEL(GFluidCmpLT, cv::gapi::core::GCmpLT, false)
{
    static const int Window = 1;

    // Element-wise "less than" of two inputs: forwards to run_cmp with CMP_LT
    static void run(const View &src1, const View &src2, Buffer &dst)
    {
        // Output is always uchar; both sources share one element type
        BINARY_(uchar, uchar, uchar, run_cmp, dst, src1, src2, CMP_LT);
        BINARY_(uchar, short, short, run_cmp, dst, src1, src2, CMP_LT);
        BINARY_(uchar, float, float, run_cmp, dst, src1, src2, CMP_LT);

        // No combination above handled the call
        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
    }
};
1284

1285
//---------------------
1286
//
1287
// Compare with GScalar
1288
//
1289
//---------------------
1290

1291
template<typename DST, typename SRC, typename SCALAR=double>
1292
static void run_cmp(DST out[], const SRC in[], int length, Compare compare, SCALAR s)
1293
{
1294
    switch (compare)
1295
    {
1296
    case CMP_EQ:
1297
        for (int l=0; l < length; l++)
1298
            out[l] = in[l] == s? 255: 0;
1299
        break;
1300
    case CMP_NE:
1301
        for (int l=0; l < length; l++)
1302
            out[l] = in[l] != s? 255: 0;
1303
        break;
1304
    case CMP_GE:
1305
        for (int l=0; l < length; l++)
1306
            out[l] = in[l] >= s? 255: 0;
1307
        break;
1308
    case CMP_LE:
1309
        for (int l=0; l < length; l++)
1310
            out[l] = in[l] <= s? 255: 0;
1311
        break;
1312
    case CMP_GT:
1313
        for (int l=0; l < length; l++)
1314
            out[l] = in[l] > s? 255: 0;
1315
        break;
1316
    case CMP_LT:
1317
        for (int l=0; l < length; l++)
1318
            out[l] = in[l] < s? 255: 0;
1319
        break;
1320
    default:
1321
        CV_Error(cv::Error::StsBadArg, "unsupported compare operation");
1322
    }
1323
}
1324

1325
template<typename DST, typename SRC>
1326
static void run_cmp(Buffer &dst, const View &src, Compare compare, const cv::Scalar &scalar)
1327
{
1328
    static_assert(std::is_same<DST, uchar>::value, "wrong types");
1329

1330
    const auto *in  = src.InLine<SRC>(0);
1331
          auto *out = dst.OutLine<DST>();
1332

1333
    int width = dst.length();
1334
    int chan  = dst.meta().chan;
1335

1336
    int length = width * chan;
1337

1338
    // compute faster if scalar rounds to SRC
1339
    double d =                   scalar[0]  ;
1340
    SRC    s = static_cast<SRC>( scalar[0] );
1341

1342
    if (s == d)
1343
        run_cmp(out, in, length, compare, s);
1344
    else
1345
        run_cmp(out, in, length, compare, d);
1346
}
1347

1348
GAPI_FLUID_KERNEL(GFluidCmpEQScalar, cv::gapi::core::GCmpEQScalar, false)
{
    static const int Window = 1;

    // "Equal" comparison against a scalar: forwards to run_cmp with CMP_EQ
    static void run(const View &src, const cv::Scalar &scalar, Buffer &dst)
    {
        // Output is always uchar
        UNARY_(uchar, uchar, run_cmp, dst, src, CMP_EQ, scalar);
        UNARY_(uchar, short, run_cmp, dst, src, CMP_EQ, scalar);
        UNARY_(uchar, float, run_cmp, dst, src, CMP_EQ, scalar);

        // No combination above handled the call
        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
    }
};
1362

1363
GAPI_FLUID_KERNEL(GFluidCmpNEScalar, cv::gapi::core::GCmpNEScalar, false)
{
    static const int Window = 1;

    // "Not equal" comparison against a scalar: forwards to run_cmp with CMP_NE
    static void run(const View &src, const cv::Scalar &scalar, Buffer &dst)
    {
        // Output is always uchar
        UNARY_(uchar, uchar, run_cmp, dst, src, CMP_NE, scalar);
        UNARY_(uchar, short, run_cmp, dst, src, CMP_NE, scalar);
        UNARY_(uchar, float, run_cmp, dst, src, CMP_NE, scalar);

        // No combination above handled the call
        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
    }
};
1377

1378
GAPI_FLUID_KERNEL(GFluidCmpGEScalar, cv::gapi::core::GCmpGEScalar, false)
{
    static const int Window = 1;

    // "Greater or equal" comparison against a scalar: forwards to run_cmp with CMP_GE
    static void run(const View &src, const cv::Scalar &scalar, Buffer &dst)
    {
        // Output is always uchar
        UNARY_(uchar, uchar, run_cmp, dst, src, CMP_GE, scalar);
        UNARY_(uchar, short, run_cmp, dst, src, CMP_GE, scalar);
        UNARY_(uchar, float, run_cmp, dst, src, CMP_GE, scalar);

        // No combination above handled the call
        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
    }
};
1392

1393
GAPI_FLUID_KERNEL(GFluidCmpGTScalar, cv::gapi::core::GCmpGTScalar, false)
{
    static const int Window = 1;

    // "Greater than" comparison against a scalar: forwards to run_cmp with CMP_GT
    static void run(const View &src, const cv::Scalar &scalar, Buffer &dst)
    {
        // Output is always uchar
        UNARY_(uchar, uchar, run_cmp, dst, src, CMP_GT, scalar);
        UNARY_(uchar, short, run_cmp, dst, src, CMP_GT, scalar);
        UNARY_(uchar, float, run_cmp, dst, src, CMP_GT, scalar);

        // No combination above handled the call
        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
    }
};
1407

1408
GAPI_FLUID_KERNEL(GFluidCmpLEScalar, cv::gapi::core::GCmpLEScalar, false)
{
    static const int Window = 1;

    // "Less or equal" comparison against a scalar: forwards to run_cmp with CMP_LE
    static void run(const View &src, const cv::Scalar &scalar, Buffer &dst)
    {
        // Output is always uchar
        UNARY_(uchar, uchar, run_cmp, dst, src, CMP_LE, scalar);
        UNARY_(uchar, short, run_cmp, dst, src, CMP_LE, scalar);
        UNARY_(uchar, float, run_cmp, dst, src, CMP_LE, scalar);

        // No combination above handled the call
        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
    }
};
1422

1423
GAPI_FLUID_KERNEL(GFluidCmpLTScalar, cv::gapi::core::GCmpLTScalar, false)
{
    static const int Window = 1;

    // "Less than" comparison against a scalar: forwards to run_cmp with CMP_LT
    static void run(const View &src, const cv::Scalar &scalar, Buffer &dst)
    {
        // Output is always uchar
        UNARY_(uchar, uchar, run_cmp, dst, src, CMP_LT, scalar);
        UNARY_(uchar, short, run_cmp, dst, src, CMP_LT, scalar);
        UNARY_(uchar, float, run_cmp, dst, src, CMP_LT, scalar);

        // No combination above handled the call
        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
    }
};
1437

1438
//-------------------------
1439
//
1440
// Fluid kernels: threshold
1441
//
1442
//-------------------------
1443

1444
template<typename DST, typename SRC>
1445
static void run_threshold(Buffer &dst, const View &src, const cv::Scalar &thresh,
1446
                                                        const cv::Scalar &maxval,
1447
                                                                     int  type)
1448
{
1449
    static_assert(std::is_same<DST, SRC>::value, "wrong types");
1450

1451
    const auto *in  = src.InLine<SRC>(0);
1452
          auto *out = dst.OutLine<DST>();
1453

1454
    int width = dst.length();
1455
    int chan  = dst.meta().chan;
1456

1457
    int length = width * chan;
1458

1459
    DST thresh_ = saturate<DST>(thresh[0], floord);
1460
    DST threshd = saturate<DST>(thresh[0], roundd);
1461
    DST maxvald = saturate<DST>(maxval[0], roundd);
1462

1463
    switch (type)
1464
    {
1465
    case cv::THRESH_BINARY:
1466
        for (int l=0; l < length; l++)
1467
            out[l] = in[l] > thresh_? maxvald: 0;
1468
        break;
1469
    case cv::THRESH_BINARY_INV:
1470
        for (int l=0; l < length; l++)
1471
            out[l] = in[l] > thresh_? 0: maxvald;
1472
        break;
1473
    case cv::THRESH_TRUNC:
1474
        for (int l=0; l < length; l++)
1475
            out[l] = in[l] > thresh_? threshd: in[l];
1476
        break;
1477
    case cv::THRESH_TOZERO:
1478
        for (int l=0; l < length; l++)
1479
            out[l] = in[l] > thresh_? in[l]: 0;
1480
        break;
1481
    case cv::THRESH_TOZERO_INV:
1482
        for (int l=0; l < length; l++)
1483
            out[l] = in[l] > thresh_? 0: in[l];
1484
        break;
1485
    default: CV_Error(cv::Error::StsBadArg, "unsupported threshold type");
1486
    }
1487
}
1488

1489
GAPI_FLUID_KERNEL(GFluidThreshold, cv::gapi::core::GThreshold, false)
{
    static const int Window = 1;

    // Fixed-level threshold: forwards to run_threshold
    static void run(const View &src, const cv::Scalar &thresh,
                                     const cv::Scalar &maxval,
                                                  int  type,
                        Buffer &dst)
    {
        // Supported types: dst and src share the same element type
        UNARY_(uchar,  uchar,  run_threshold, dst, src, thresh, maxval, type);
        UNARY_(ushort, ushort, run_threshold, dst, src, thresh, maxval, type);
        UNARY_(short,  short,  run_threshold, dst, src, thresh, maxval, type);

        // No combination above handled the call
        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
    }
};
1506

1507
//------------------------
1508
//
1509
// Fluid kernels: in-range
1510
//
1511
//------------------------
1512

1513
// In-range test for 3-channel interleaved 8-bit data.
// out[w] is 255 when all three channels of pixel w lie inside their
// [lower[c], upper[c]] bounds (inclusive), and 0 otherwise.
static void run_inrange3(uchar out[], const uchar in[], int width,
                         const uchar lower[], const uchar upper[])
{
    int x = 0; // pixel index, shared by the SIMD loop and the scalar tail

#if CV_SIMD128
    // broadcast the per-channel bounds once, outside the loop
    const v_uint8x16 lo0 = v_setall_u8(lower[0]), hi0 = v_setall_u8(upper[0]);
    const v_uint8x16 lo1 = v_setall_u8(lower[1]), hi1 = v_setall_u8(upper[1]);
    const v_uint8x16 lo2 = v_setall_u8(lower[2]), hi2 = v_setall_u8(upper[2]);

    // 16 pixels per iteration
    for (; x <= width-16; x += 16)
    {
        v_uint8x16 c0, c1, c2;
        v_load_deinterleave(&in[3*x], c0, c1, c2);

        v_uint8x16 mask = (c0 >= lo0) & (c0 <= hi0) &
                          (c1 >= lo1) & (c1 <= hi1) &
                          (c2 >= lo2) & (c2 <= hi2);

        v_store(&out[x], mask);
    }
#endif

    // scalar tail (or the whole line if SIMD is unavailable)
    for (; x < width; ++x)
    {
        const uchar *px = &in[3*x];

        out[x] = px[0] >= lower[0] && px[0] <= upper[0] &&
                 px[1] >= lower[1] && px[1] <= upper[1] &&
                 px[2] >= lower[2] && px[2] <= upper[2] ? 255: 0;
    }
}
1540

1541
template<typename DST, typename SRC>
1542
static void run_inrange(Buffer &dst, const View &src, const cv::Scalar &upperb,
1543
                                                      const cv::Scalar &lowerb)
1544
{
1545
    static_assert(std::is_same<DST, uchar>::value, "wrong types");
1546
    static_assert(std::is_integral<SRC>::value,    "wrong types");
1547

1548
    const auto *in  = src.InLine<SRC>(0);
1549
          auto *out = dst.OutLine<DST>();
1550

1551
    int width = src.length();
1552
    int chan  = src.meta().chan;
1553
    GAPI_Assert(dst.meta().chan == 1);
1554

1555
    // for integral input, in[i] >= lower equals in[i] >= ceil(lower)
1556
    // so we can optimize compare operations by rounding lower/upper
1557
    SRC lower[4], upper[4];
1558
    for (int c=0; c < chan; c++)
1559
    {
1560
        lower[c] = saturate<SRC>(lowerb[c],  ceild);
1561
        upper[c] = saturate<SRC>(upperb[c], floord);
1562
    }
1563

1564
    // manually SIMD for important case if RGB/BGR
1565
    if (std::is_same<SRC,uchar>::value && chan==3)
1566
    {
1567
        run_inrange3((uchar*)out, (const uchar*)in, width,
1568
                     (const uchar*)lower, (const uchar*)upper);
1569
        return;
1570
    }
1571

1572
    // TODO: please manually SIMD if multiple channels:
1573
    // modern compilers would perfectly vectorize this code if one channel,
1574
    // but may need help with de-interleaving channels if RGB/BGR image etc
1575
    switch (chan)
1576
    {
1577
    case 1:
1578
        for (int w=0; w < width; w++)
1579
            out[w] = in[w] >= lower[0] && in[w] <= upper[0]? 255: 0;
1580
        break;
1581
    case 2:
1582
        for (int w=0; w < width; w++)
1583
            out[w] = in[2*w  ] >= lower[0] && in[2*w  ] <= upper[0] &&
1584
                     in[2*w+1] >= lower[1] && in[2*w+1] <= upper[1] ? 255: 0;
1585
        break;
1586
    case 3:
1587
        for (int w=0; w < width; w++)
1588
            out[w] = in[3*w  ] >= lower[0] && in[3*w  ] <= upper[0] &&
1589
                     in[3*w+1] >= lower[1] && in[3*w+1] <= upper[1] &&
1590
                     in[3*w+2] >= lower[2] && in[3*w+2] <= upper[2] ? 255: 0;
1591
        break;
1592
    case 4:
1593
        for (int w=0; w < width; w++)
1594
            out[w] = in[4*w  ] >= lower[0] && in[4*w  ] <= upper[0] &&
1595
                     in[4*w+1] >= lower[1] && in[4*w+1] <= upper[1] &&
1596
                     in[4*w+2] >= lower[2] && in[4*w+2] <= upper[2] &&
1597
                     in[4*w+3] >= lower[3] && in[4*w+3] <= upper[3] ? 255: 0;
1598
        break;
1599
    default: CV_Error(cv::Error::StsBadArg, "unsupported number of channels");
1600
    }
1601
}
1602

1603
GAPI_FLUID_KERNEL(GFluidInRange, cv::gapi::core::GInRange, false)
{
    static const int Window = 1;

    // In-range test: forwards to run_inrange.
    // NB: run_inrange takes the upper bound first, then the lower bound.
    static void run(const View &src, const cv::Scalar &lowerb, const cv::Scalar& upperb,
                        Buffer &dst)
    {
        // Output is always uchar; the source is an integral type
        INRANGE_(uchar, uchar,  run_inrange, dst, src, upperb, lowerb);
        INRANGE_(uchar, ushort, run_inrange, dst, src, upperb, lowerb);
        INRANGE_(uchar, short,  run_inrange, dst, src, upperb, lowerb);

        // No combination above handled the call
        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
    }
};
1618

1619
//----------------------
1620
//
1621
// Fluid kernels: select
1622
//
1623
//----------------------
1624

1625
// manually vectored function for important case if RGB/BGR image
1626
static void run_select_row3(int width, uchar out[], uchar in1[], uchar in2[], uchar in3[])
1627
{
1628
    int w = 0; // cycle index
1629

1630
#if CV_SIMD128
1631
    for (; w <= width-16; w+=16)
1632
    {
1633
        v_uint8x16 a1, b1, c1;
1634
        v_uint8x16 a2, b2, c2;
1635
        v_uint8x16 mask;
1636
        v_uint8x16 a, b, c;
1637

1638
        v_load_deinterleave(&in1[3*w], a1, b1, c1);
1639
        v_load_deinterleave(&in2[3*w], a2, b2, c2);
1640

1641
        mask = v_load(&in3[w]);
1642
        mask = mask != v_setzero_u8();
1643

1644
        a = v_select(mask, a1, a2);
1645
        b = v_select(mask, b1, b2);
1646
        c = v_select(mask, c1, c2);
1647

1648
        v_store_interleave(&out[3*w], a, b, c);
1649
    }
1650
#endif
1651

1652
    for (; w < width; w++)
1653
    {
1654
        out[3*w    ] = in3[w]? in1[3*w    ]: in2[3*w    ];
1655
        out[3*w + 1] = in3[w]? in1[3*w + 1]: in2[3*w + 1];
1656
        out[3*w + 2] = in3[w]? in1[3*w + 2]: in2[3*w + 2];
1657
    }
1658
}
1659

1660
// parameter chan is compile-time known constant, normally chan=1..4
1661
template<int chan, typename DST, typename SRC1, typename SRC2, typename SRC3>
1662
static void run_select_row(int width, DST out[], SRC1 in1[], SRC2 in2[], SRC3 in3[])
1663
{
1664
    if (std::is_same<DST,uchar>::value && chan==3)
1665
    {
1666
        // manually vectored function for important case if RGB/BGR image
1667
        run_select_row3(width, (uchar*)out, (uchar*)in1, (uchar*)in2, (uchar*)in3);
1668
        return;
1669
    }
1670

1671
    // because `chan` is template parameter, its value is known at compilation time,
1672
    // so that modern compilers would efficiently vectorize this cycle if chan==1
1673
    // (if chan>1, compilers may need help with de-interleaving of the channels)
1674
    for (int w=0; w < width; w++)
1675
    {
1676
        for (int c=0; c < chan; c++)
1677
        {
1678
            out[w*chan + c] = in3[w]? in1[w*chan + c]: in2[w*chan + c];
1679
        }
1680
    }
1681
}
1682

1683
template<typename DST, typename SRC1, typename SRC2, typename SRC3>
1684
static void run_select(Buffer &dst, const View &src1, const View &src2, const View &src3)
1685
{
1686
    static_assert(std::is_same<DST ,  SRC1>::value, "wrong types");
1687
    static_assert(std::is_same<DST ,  SRC2>::value, "wrong types");
1688
    static_assert(std::is_same<uchar, SRC3>::value, "wrong types");
1689

1690
    auto *out = dst.OutLine<DST>();
1691

1692
    const auto *in1 = src1.InLine<SRC1>(0);
1693
    const auto *in2 = src2.InLine<SRC2>(0);
1694
    const auto *in3 = src3.InLine<SRC3>(0);
1695

1696
    int width = dst.length();
1697
    int chan  = dst.meta().chan;
1698

1699
    switch (chan)
1700
    {
1701
    case 1: run_select_row<1>(width, out, in1, in2, in3); break;
1702
    case 2: run_select_row<2>(width, out, in1, in2, in3); break;
1703
    case 3: run_select_row<3>(width, out, in1, in2, in3); break;
1704
    case 4: run_select_row<4>(width, out, in1, in2, in3); break;
1705
    default: CV_Error(cv::Error::StsBadArg, "unsupported number of channels");
1706
    }
1707
}
1708

1709
// Fluid kernel bound to cv::gapi::core::GSelect.
// Works on a single input line at a time (Window = 1); the per-type work is
// delegated to run_select through the SELECT_ dispatch macro.
GAPI_FLUID_KERNEL(GFluidSelect, cv::gapi::core::GSelect, false)
{
    static const int Window = 1;

    // src1, src2 - the two candidate inputs
    // src3       - the mask input
    // dst        - output buffer
    static void run(const View &src1,
                    const View &src2,
                    const View &src3,
                    Buffer     &dst)
    {
        // One line per supported type combination; the mask is always uchar.
        // NOTE(review): SELECT_ is defined outside this block; presumably it
        // returns as soon as its combination matches the actual types -- confirm.
        //      DST     SRC1    SRC2    SRC3   OP          __VA_ARGS__
        SELECT_(uchar , uchar , uchar , uchar, run_select, dst, src1, src2, src3);
        SELECT_(ushort, ushort, ushort, uchar, run_select, dst, src1, src2, src3);
        SELECT_( short,  short,  short, uchar, run_select, dst, src1, src2, src3);

        // Reached only if no SELECT_ line above handled the request.
        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
    }
};
1723

1724
//----------------------------------------------------
1725
//
1726
// Fluid kernels: split, merge, polar2cart, cart2polar
1727
//
1728
//----------------------------------------------------
1729

1730
// Fluid kernel bound to cv::gapi::core::GSplit3: de-interleaves one line of a
// 3-channel 8-bit image into three single-channel lines.
GAPI_FLUID_KERNEL(GFluidSplit3, cv::gapi::core::GSplit3, false)
{
    static const int Window = 1;

    // src              - interleaved 3-channel input
    // dst1, dst2, dst3 - outputs receiving channel 0, 1 and 2 respectively
    static void run(const View &src, Buffer &dst1, Buffer &dst2, Buffer &dst3)
    {
        // The rows below are accessed as uchar, so any other depth would be
        // silently mis-read; reject it up front together with a wrong
        // channel count.
        GAPI_Assert(3 == src.meta().chan);
        GAPI_Assert( src.meta().depth == CV_8U);
        GAPI_Assert(dst1.meta().depth == CV_8U);
        GAPI_Assert(dst2.meta().depth == CV_8U);
        GAPI_Assert(dst3.meta().depth == CV_8U);

        const auto *in   =  src.InLine<uchar>(0);
              auto *out1 = dst1.OutLine<uchar>();
              auto *out2 = dst2.OutLine<uchar>();
              auto *out3 = dst3.OutLine<uchar>();

        int width = src.length();

        int w = 0; // pixel index, shared by the SIMD loop and the scalar tail

    #if CV_SIMD128
        // Main loop: 16 pixels per iteration.
        for (; w <= width-16; w+=16)
        {
            v_uint8x16 a, b, c;
            v_load_deinterleave(&in[3*w], a, b, c);
            v_store(&out1[w], a);
            v_store(&out2[w], b);
            v_store(&out3[w], c);
        }
    #endif

        // Scalar tail (or the whole row if SIMD is not available).
        for (; w < width; w++)
        {
            out1[w] = in[3*w    ];
            out2[w] = in[3*w + 1];
            out3[w] = in[3*w + 2];
        }
    }
};
1765

1766
// Fluid kernel bound to cv::gapi::core::GSplit4: de-interleaves one line of a
// 4-channel 8-bit image into four single-channel lines.
GAPI_FLUID_KERNEL(GFluidSplit4, cv::gapi::core::GSplit4, false)
{
    static const int Window = 1;

    // src                    - interleaved 4-channel input
    // dst1, dst2, dst3, dst4 - outputs receiving channel 0, 1, 2 and 3 respectively
    static void run(const View &src, Buffer &dst1, Buffer &dst2, Buffer &dst3, Buffer &dst4)
    {
        // The rows below are accessed as uchar, so any other depth would be
        // silently mis-read; reject it up front together with a wrong
        // channel count.
        GAPI_Assert(4 == src.meta().chan);
        GAPI_Assert( src.meta().depth == CV_8U);
        GAPI_Assert(dst1.meta().depth == CV_8U);
        GAPI_Assert(dst2.meta().depth == CV_8U);
        GAPI_Assert(dst3.meta().depth == CV_8U);
        GAPI_Assert(dst4.meta().depth == CV_8U);

        const auto *in   =  src.InLine<uchar>(0);
              auto *out1 = dst1.OutLine<uchar>();
              auto *out2 = dst2.OutLine<uchar>();
              auto *out3 = dst3.OutLine<uchar>();
              auto *out4 = dst4.OutLine<uchar>();

        int width = src.length();

        int w = 0; // pixel index, shared by the SIMD loop and the scalar tail

    #if CV_SIMD128
        // Main loop: 16 pixels per iteration.
        for (; w <= width-16; w+=16)
        {
            v_uint8x16 a, b, c, d;
            v_load_deinterleave(&in[4*w], a, b, c, d);
            v_store(&out1[w], a);
            v_store(&out2[w], b);
            v_store(&out3[w], c);
            v_store(&out4[w], d);
        }
    #endif

        // Scalar tail (or the whole row if SIMD is not available).
        for (; w < width; w++)
        {
            out1[w] = in[4*w    ];
            out2[w] = in[4*w + 1];
            out3[w] = in[4*w + 2];
            out4[w] = in[4*w + 3];
        }
    }
};
1804

1805
// Fluid kernel bound to cv::gapi::core::GMerge3: interleaves one line of three
// single-channel 8-bit inputs into a 3-channel output line.
GAPI_FLUID_KERNEL(GFluidMerge3, cv::gapi::core::GMerge3, false)
{
    static const int Window = 1;

    // src1, src2, src3 - inputs providing channel 0, 1 and 2 respectively
    // dst              - interleaved 3-channel output
    static void run(const View &src1, const View &src2, const View &src3, Buffer &dst)
    {
        // The rows below are accessed as uchar, so any other depth would be
        // silently mis-processed; reject it up front together with a wrong
        // channel count.
        GAPI_Assert(3 == dst.meta().chan);
        GAPI_Assert(src1.meta().depth == CV_8U);
        GAPI_Assert(src2.meta().depth == CV_8U);
        GAPI_Assert(src3.meta().depth == CV_8U);
        GAPI_Assert( dst.meta().depth == CV_8U);

        const auto *in1 = src1.InLine<uchar>(0);
        const auto *in2 = src2.InLine<uchar>(0);
        const auto *in3 = src3.InLine<uchar>(0);
              auto *out = dst.OutLine<uchar>();

        int width = dst.length();

        int w = 0; // pixel index, shared by the SIMD loop and the scalar tail

    #if CV_SIMD128
        // Main loop: 16 pixels per iteration.
        for (; w <= width-16; w+=16)
        {
            v_uint8x16 a, b, c;
            a = v_load(&in1[w]);
            b = v_load(&in2[w]);
            c = v_load(&in3[w]);
            v_store_interleave(&out[3*w], a, b, c);
        }
    #endif

        // Scalar tail (or the whole row if SIMD is not available).
        for (; w < width; w++)
        {
            out[3*w    ] = in1[w];
            out[3*w + 1] = in2[w];
            out[3*w + 2] = in3[w];
        }
    }
};
1840

1841
// Fluid kernel bound to cv::gapi::core::GMerge4: interleaves one line of four
// single-channel 8-bit inputs into a 4-channel output line.
GAPI_FLUID_KERNEL(GFluidMerge4, cv::gapi::core::GMerge4, false)
{
    static const int Window = 1;

    // src1, src2, src3, src4 - inputs providing channel 0, 1, 2 and 3 respectively
    // dst                    - interleaved 4-channel output
    static void run(const View &src1, const View &src2, const View &src3, const View &src4,
                    Buffer &dst)
    {
        // The rows below are accessed as uchar, so any other depth would be
        // silently mis-processed; reject it up front together with a wrong
        // channel count.
        GAPI_Assert(4 == dst.meta().chan);
        GAPI_Assert(src1.meta().depth == CV_8U);
        GAPI_Assert(src2.meta().depth == CV_8U);
        GAPI_Assert(src3.meta().depth == CV_8U);
        GAPI_Assert(src4.meta().depth == CV_8U);
        GAPI_Assert( dst.meta().depth == CV_8U);

        const auto *in1 = src1.InLine<uchar>(0);
        const auto *in2 = src2.InLine<uchar>(0);
        const auto *in3 = src3.InLine<uchar>(0);
        const auto *in4 = src4.InLine<uchar>(0);
              auto *out = dst.OutLine<uchar>();

        int width = dst.length();

        int w = 0; // pixel index, shared by the SIMD loop and the scalar tail

    #if CV_SIMD128
        // Main loop: 16 pixels per iteration.
        for (; w <= width-16; w+=16)
        {
            v_uint8x16 a, b, c, d;
            a = v_load(&in1[w]);
            b = v_load(&in2[w]);
            c = v_load(&in3[w]);
            d = v_load(&in4[w]);
            v_store_interleave(&out[4*w], a, b, c, d);
        }
    #endif

        // Scalar tail (or the whole row if SIMD is not available).
        for (; w < width; w++)
        {
            out[4*w    ] = in1[w];
            out[4*w + 1] = in2[w];
            out[4*w + 2] = in3[w];
            out[4*w + 3] = in4[w];
        }
    }
};
1880

1881
// Fluid kernel bound to cv::gapi::core::GPolarToCart: converts one line of
// (magnitude, angle) pairs into (x, y) pairs. All four images must be 32-bit float.
GAPI_FLUID_KERNEL(GFluidPolarToCart, cv::gapi::core::GPolarToCart, false)
{
    static const int Window = 1;

    // src1           - magnitudes
    // src2           - angles (degrees if angleInDegrees is set, radians otherwise)
    // angleInDegrees - unit of the angles in src2
    // dst1           - receives x = magnitude * cos(angle)
    // dst2           - receives y = magnitude * sin(angle)
    static void run(const View &src1, const View &src2, bool angleInDegrees,
                    Buffer &dst1, Buffer &dst2)
    {
        GAPI_Assert(src1.meta().depth == CV_32F);
        GAPI_Assert(src2.meta().depth == CV_32F);
        GAPI_Assert(dst1.meta().depth == CV_32F);
        GAPI_Assert(dst2.meta().depth == CV_32F);

        const auto *magnitudes = src1.InLine<float>(0);
        const auto *angles     = src2.InLine<float>(0);
              auto *xs         = dst1.OutLine<float>();
              auto *ys         = dst2.OutLine<float>();

        // Channels are processed uniformly, so the line is treated as a flat
        // array of width*chan values.
        const int total = src1.length() * src2.meta().chan;

        // No manual SIMD here: vectorization is left to the compiler.
        for (int i = 0; i < total; i++)
        {
            float phi = angles[i];
            if (angleInDegrees)
            {
                phi = angles[i] * static_cast<float>(CV_PI / 180); // degrees -> radians
            }

            const float rho = magnitudes[i];
            const float x = rho * std::cos(phi);
            const float y = rho * std::sin(phi);

            xs[i] = x;
            ys[i] = y;
        }
    }
};
1916

1917
// Fluid kernel bound to cv::gapi::core::GCartToPolar: converts one line of
// (x, y) pairs into (magnitude, angle) pairs. All four images must be 32-bit float.
GAPI_FLUID_KERNEL(GFluidCartToPolar, cv::gapi::core::GCartToPolar, false)
{
    static const int Window = 1;

    // src1           - x coordinates
    // src2           - y coordinates
    // angleInDegrees - if set, angles are written in degrees, otherwise in radians
    // dst1           - receives the magnitude, hypot(y, x)
    // dst2           - receives the angle, derived from atan2(y, x)
    static void run(const View &src1, const View &src2, bool angleInDegrees,
                    Buffer &dst1, Buffer &dst2)
    {
        GAPI_Assert(src1.meta().depth == CV_32F);
        GAPI_Assert(src2.meta().depth == CV_32F);
        GAPI_Assert(dst1.meta().depth == CV_32F);
        GAPI_Assert(dst2.meta().depth == CV_32F);

        const auto *xs         = src1.InLine<float>(0);
        const auto *ys         = src2.InLine<float>(0);
              auto *magnitudes = dst1.OutLine<float>();
              auto *angles     = dst2.OutLine<float>();

        // Channels are processed uniformly, so the line is treated as a flat
        // array of width*chan values.
        const int total = src1.length() * src2.meta().chan;

        // No manual SIMD here: vectorization is left to the compiler.
        for (int i = 0; i < total; i++)
        {
            const float x = xs[i];
            const float y = ys[i];

            const float rho     = std::hypot(y, x);
            const float phi_rad = std::atan2(y, x);

            float phi = phi_rad;
            if (angleInDegrees)
            {
                phi = phi_rad * static_cast<float>(180 / CV_PI); // radians -> degrees
            }

            magnitudes[i] = rho;
            angles[i]     = phi;
        }
    }
};
1953

1954
// Fluid kernel bound to cv::gapi::core::GResize.
// Blends two source rows and two source columns per output pixel using
// fixed-point weights. Only 8-bit 3-channel input is accepted (checked in
// initScratch); the fx, fy and interp arguments are not used by this kernel.
GAPI_FLUID_KERNEL(GFluidResize, cv::gapi::core::GResize, true)
{
    static const int Window = 1;
    static const auto Kind = GFluidKernel::Kind::Resize;

    // Interpolation weights are fixed-point numbers with 11 fractional bits.
    constexpr static const int INTER_RESIZE_COEF_BITS = 11;
    constexpr static const int INTER_RESIZE_COEF_SCALE = 1 << INTER_RESIZE_COEF_BITS;
    constexpr static const short ONE = INTER_RESIZE_COEF_SCALE;

    // Interpolation recipe for one output coordinate.
    struct ResizeUnit
    {
        short alpha0; // fixed-point weight of the sample at s0
        short alpha1; // fixed-point weight of the sample at s1
        int   s0;     // index of the first source sample, relative to `start`
        int   s1;     // index of the second source sample, relative to `start`
    };

    // Builds the interpolation recipe for output coordinate `outCoord`.
    //
    // ratio    - input size divided by output size along the axis
    // start    - offset subtracted from the absolute source index
    // max      - input size along the axis
    // outCoord - output coordinate to map
    static ResizeUnit map(double ratio, int start, int max, int outCoord)
    {
        // Pixel-center mapping of the output coordinate into the source axis.
        float f = static_cast<float>((outCoord + 0.5f) * ratio - 0.5f);
        int s = cvFloor(f);
        f -= s; // fractional part, i.e. the weight of the second sample

        ResizeUnit ru;

        ru.s0 = std::max(s - start, 0);
        // The second sample coincides with the first one when the position is
        // exactly on a source sample or the next sample would be past the end.
        ru.s1 = ((f == 0.0) || s + 1 >= max) ? s - start : s - start + 1;

        ru.alpha0 = saturate_cast<short>((1.0f - f) * INTER_RESIZE_COEF_SCALE);
        ru.alpha1 = saturate_cast<short>((f) * INTER_RESIZE_COEF_SCALE);

        return ru;
    }

    // Allocates the scratch buffer and fills it with one ResizeUnit per output
    // column (the horizontal mapping does not change from row to row).
    static void initScratch(const cv::GMatDesc& in,
                            cv::Size outSz, double /*fx*/, double /*fy*/, int /*interp*/,
                            cv::gapi::fluid::Buffer &scratch)
    {
        CV_Assert(in.depth == CV_8U && in.chan == 3);

        // The scratch is a single row of raw bytes large enough to hold
        // outSz.width ResizeUnit records.
        cv::Size scratch_size{static_cast<int>(outSz.width * sizeof(ResizeUnit)), 1};

        cv::GMatDesc desc;
        desc.chan  = 1;
        desc.depth = CV_8U; // a depth constant (the original CV_8UC1 type code has the same value)
        desc.size  = to_own(scratch_size);

        cv::gapi::fluid::Buffer buffer(desc);
        scratch = std::move(buffer);

        ResizeUnit* mapX = scratch.OutLine<ResizeUnit>();
        double hRatio = (double)in.size.width / outSz.width;

        for (int x = 0, w = outSz.width; x < w; x++)
        {
            mapX[x] = map(hRatio, 0, in.size.width, x);
        }
    }

    // Nothing to reset: the scratch only holds the column mapping built in initScratch.
    static void resetScratch(cv::gapi::fluid::Buffer& /*scratch*/)
    {}

    // Produces one output row from two input rows.
    static void run(const cv::gapi::fluid::View& in, cv::Size /*sz*/, double /*fx*/, double /*fy*/, int /*interp*/,
                    cv::gapi::fluid::Buffer& out, cv::gapi::fluid::Buffer &scratch)
    {
        // Vertical mapping for the current output row; source row indices are
        // made relative to in.y().
        double vRatio = (double)in.meta().size.height / out.meta().size.height;
        auto mapY = map(vRatio, in.y(), in.meta().size.height, out.y());

        auto beta0 = mapY.alpha0;
        auto beta1 = mapY.alpha1;

        const auto src0 = in.InLine <unsigned char>(mapY.s0);
        const auto src1 = in.InLine <unsigned char>(mapY.s1);

        auto dst = out.OutLine<unsigned char>();

        // Horizontal mapping prepared by initScratch.
        ResizeUnit* mapX = scratch.OutLine<ResizeUnit>();

        const int width = out.length();

        for (int x = 0; x < width; x++)
        {
            short alpha0 = mapX[x].alpha0;
            short alpha1 = mapX[x].alpha1;
            int sx0 = mapX[x].s0;
            int sx1 = mapX[x].s1;

            // The same arithmetic is applied to each of the three channels.
            for (int c = 0; c < 3; c++)
            {
                // Horizontal blend in both source rows...
                int res0 = src0[3*sx0 + c]*alpha0 + src0[3*sx1 + c]*alpha1;
                int res1 = src1[3*sx0 + c]*alpha0 + src1[3*sx1 + c]*alpha1;

                // ...then the vertical blend; the shifts drop the fixed-point
                // fraction and "+ 2" rounds before the final ">> 2".
                dst[3*x + c] = uchar(( ((beta0 * (res0 >> 4)) >> 16) + ((beta1 * (res1 >> 4)) >> 16) + 2)>>2);
            }
        }
    }
};
2054

2055
} // namespace fluid
2056
} // namespace gapi
2057
} // namespace cv
2058

2059
// Returns the package of Fluid kernels registered by this file.
//
// The following kernels are deliberately left out of the package (they were
// listed in a disabled "#if 0" section of the original kernel list):
//     GFluidMean        -- not fluid
//     GFluidSum         -- not fluid
//     GFluidNormL1      -- not fluid
//     GFluidNormL2      -- not fluid
//     GFluidNormInf     -- not fluid
//     GFluidIntegral    -- not fluid
//     GFluidThresholdOT -- not fluid
//     GFluidResize      -- not fluid (?)   [nevertheless registered below]
//     GFluidRemap       -- not fluid
//     GFluidFlip        -- not fluid
//     GFluidCrop        -- not fluid
//     GFluidConcatHor
//     GFluidConcatVert  -- not fluid
cv::gapi::GKernelPackage cv::gapi::core::fluid::kernels()
{
    using namespace cv::gapi::fluid;

    return cv::gapi::kernels<
        // matrix-matrix arithmetic and bitwise operations
        GFluidAdd, GFluidSub, GFluidMul, GFluidDiv, GFluidAbsDiff,
        GFluidAnd, GFluidOr, GFluidXor,
        GFluidMin, GFluidMax,
        // matrix-matrix comparisons
        GFluidCmpGT, GFluidCmpGE, GFluidCmpLE, GFluidCmpLT, GFluidCmpEQ, GFluidCmpNE,
        GFluidAddW, GFluidNot, GFluidLUT, GFluidConvertTo,
        // channel manipulation, select, coordinate conversion
        GFluidSplit3, GFluidSplit4, GFluidMerge3, GFluidMerge4,
        GFluidSelect,
        GFluidPolarToCart, GFluidCartToPolar,
        // matrix-scalar arithmetic
        GFluidAddC, GFluidSubC, GFluidSubRC,
        GFluidMulC, GFluidMulCOld,
        GFluidDivC, GFluidDivRC,
        GFluidAbsDiffC,
        // matrix-scalar comparisons
        GFluidCmpGTScalar, GFluidCmpGEScalar, GFluidCmpLEScalar,
        GFluidCmpLTScalar, GFluidCmpEQScalar, GFluidCmpNEScalar,
        GFluidThreshold, GFluidInRange,
        GFluidResize
    >();
}
2125

2126
#endif // !defined(GAPI_STANDALONE)
2127

2128
Product

Resources

Company