Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Tetragramm
GitHub Repository: Tetragramm/opencv
Path: blob/master/modules/imgproc/src/blend.cpp
16354 views
1
/*M///////////////////////////////////////////////////////////////////////////////////////
2
//
3
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4
//
5
// By downloading, copying, installing or using the software you agree to this license.
6
// If you do not agree to this license, do not download, install,
7
// copy or use the software.
8
//
9
//
10
// License Agreement
11
// For Open Source Computer Vision Library
12
//
13
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
14
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
15
// Third party copyrights are property of their respective owners.
16
//
17
// @Authors
18
// Nathan, [email protected]
19
//
20
// Redistribution and use in source and binary forms, with or without modification,
21
// are permitted provided that the following conditions are met:
22
//
23
// * Redistribution's of source code must retain the above copyright notice,
24
// this list of conditions and the following disclaimer.
25
//
26
// * Redistribution's in binary form must reproduce the above copyright notice,
27
// this list of conditions and the following disclaimer in the documentation
28
// and/or other materials provided with the distribution.
29
//
30
// * The name of the copyright holders may not be used to endorse or promote products
31
// derived from this software without specific prior written permission.
32
//
33
// This software is provided by the copyright holders and contributors "as is" and
34
// any express or implied warranties, including, but not limited to, the implied
35
// warranties of merchantability and fitness for a particular purpose are disclaimed.
36
// In no event shall the Intel Corporation or contributors be liable for any direct,
37
// indirect, incidental, special, exemplary, or consequential damages
38
// (including, but not limited to, procurement of substitute goods or services;
39
// loss of use, data, or profits; or business interruption) however caused
40
// and on any theory of liability, whether in contract, strict liability,
41
// or tort (including negligence or otherwise) arising in any way out of
42
// the use of this software, even if advised of the possibility of such damage.
43
//
44
//M*/
45
46
#include "precomp.hpp"
47
#include "opencl_kernels_imgproc.hpp"
48
#include "opencv2/core/hal/intrin.hpp"
49
50
namespace cv {
51
#if CV_SIMD128
52
// Weighted average of two float vectors: (s1*w1 + s2*w2) / (w1 + w2 + eps).
// The small epsilon keeps the division well-defined when both weights are zero.
static inline v_float32x4 blend(const v_float32x4& v_src1, const v_float32x4& v_src2, const v_float32x4& v_w1, const v_float32x4& v_w2)
{
    const v_float32x4 v_eps = v_setall_f32(1e-5f);
    v_float32x4 numerator = v_src1 * v_w1 + v_src2 * v_w2;
    v_float32x4 denominator = v_w1 + v_w2 + v_eps;
    return numerator / denominator;
}
58
// Convenience overload: loads the per-pixel weight vectors from memory at the
// given offset and defers to the vector-weight overload above.
static inline v_float32x4 blend(const v_float32x4& v_src1, const v_float32x4& v_src2, const float* w_ptr1, const float* w_ptr2, int offset)
{
    return blend(v_src1, v_src2, v_load(w_ptr1 + offset), v_load(w_ptr2 + offset));
}
64
// Round each float lane to the nearest integer and clamp it into [0, 255];
// the clamp guarantees non-negative values, so the unsigned reinterpretation
// at the end is safe.
static inline v_uint32x4 saturate_f32_u32(const v_float32x4& vec)
{
    const v_int32x4 lo = v_setzero_s32();
    const v_int32x4 hi = v_setall_s32(255);
    v_int32x4 rounded = v_round(vec);
    return v_reinterpret_as_u32(v_min(v_max(rounded, lo), hi));
}
70
// Narrow 16 floats (four vectors) into a single vector of 16 bytes:
// saturate every lane to [0, 255], pack 32->16 bits, then 16->8 bits.
static inline v_uint8x16 pack_f32tou8(v_float32x4& val0, v_float32x4& val1, v_float32x4& val2, v_float32x4& val3)
{
    v_uint16x8 lo = v_pack(saturate_f32_u32(val0), saturate_f32_u32(val1));
    v_uint16x8 hi = v_pack(saturate_f32_u32(val2), saturate_f32_u32(val3));
    return v_pack(lo, hi);
}
80
// Pack four float vectors down to 16 bytes and write them at ptr.
static inline void store_pack_f32tou8(uchar* ptr, v_float32x4& val0, v_float32x4& val1, v_float32x4& val2, v_float32x4& val3)
{
    v_uint8x16 packed = pack_f32tou8(val0, val1, val2, val3);
    v_store(ptr, packed);
}
84
static inline void expand_u8tof32(const v_uint8x16& src, v_float32x4& dst0, v_float32x4& dst1, v_float32x4& dst2, v_float32x4& dst3)
85
{
86
v_uint16x8 a0, a1;
87
v_expand(src, a0, a1);
88
v_uint32x4 b0, b1,b2,b3;
89
v_expand(a0, b0, b1);
90
v_expand(a1, b2, b3);
91
dst0 = v_cvt_f32(v_reinterpret_as_s32(b0));
92
dst1 = v_cvt_f32(v_reinterpret_as_s32(b1));
93
dst2 = v_cvt_f32(v_reinterpret_as_s32(b2));
94
dst3 = v_cvt_f32(v_reinterpret_as_s32(b3));
95
}
96
static inline void load_expand_u8tof32(const uchar* ptr, v_float32x4& dst0, v_float32x4& dst1, v_float32x4& dst2, v_float32x4& dst3)
97
{
98
v_uint8x16 a = v_load((ptr));
99
expand_u8tof32(a, dst0, dst1, dst2, dst3);
100
}
101
// Forward declarations of the SIMD row kernels defined below. Each blends a
// prefix of one row starting at element index x and returns the first index
// left unprocessed, which the scalar fallback in BlendLinearInvoker finishes.
int blendLinearSimd128(const uchar* src1, const uchar* src2, const float* weights1, const float* weights2, uchar* dst, int x, int width, int cn);
int blendLinearSimd128(const float* src1, const float* src2, const float* weights1, const float* weights2, float* dst, int x, int width, int cn);
103
// Vectorized linear blend of CV_8U rows. x and width are channel-element
// indices (pixels * cn); weights1/weights2 hold one float per *pixel*, shared
// by all channels of that pixel. Processes as many full vectors as fit in
// [x, width) and returns the first index left for the scalar fallback.
int blendLinearSimd128(const uchar* src1, const uchar* src2, const float* weights1, const float* weights2, uchar* dst, int x, int width, int cn)
{
    int step = v_uint8x16::nlanes * cn;   // channel elements consumed per iteration
    int weight_step = v_uint8x16::nlanes; // pixels (i.e. weights) consumed per iteration
    switch(cn)
    {
    case 1:
        // 16 pixels per iteration: widen each source to four float vectors,
        // blend each quartet with its matching weight vector, repack to bytes.
        for(int weight_offset = 0 ; x <= width - step; x += step, weight_offset += weight_step)
        {
            v_float32x4 v_src10, v_src11, v_src12, v_src13;
            v_float32x4 v_src20, v_src21, v_src22, v_src23;
            load_expand_u8tof32(src1 + x, v_src10, v_src11, v_src12, v_src13);
            load_expand_u8tof32(src2 + x, v_src20, v_src21, v_src22, v_src23);

            v_float32x4 v_dst0 = blend(v_src10, v_src20, weights1, weights2, weight_offset);
            v_float32x4 v_dst1 = blend(v_src11, v_src21, weights1, weights2, weight_offset + 4);
            v_float32x4 v_dst2 = blend(v_src12, v_src22, weights1, weights2, weight_offset + 8);
            v_float32x4 v_dst3 = blend(v_src13, v_src23, weights1, weights2, weight_offset + 12);

            store_pack_f32tou8(dst + x, v_dst0, v_dst1, v_dst2, v_dst3);
        }
        break;
    case 2:
        // 16 pixels (32 bytes) per iteration: deinterleave the two channel
        // planes, widen both, blend each plane against the same per-pixel
        // weights, then re-interleave the packed result.
        for(int weight_offset = 0 ; x <= width - step; x += step, weight_offset += weight_step)
        {
            v_uint8x16 v_src10, v_src11, v_src20, v_src21;
            v_load_deinterleave(src1 + x, v_src10, v_src11);
            v_load_deinterleave(src2 + x, v_src20, v_src21);
            v_float32x4 v_src100, v_src101, v_src102, v_src103, v_src110, v_src111, v_src112, v_src113;
            v_float32x4 v_src200, v_src201, v_src202, v_src203, v_src210, v_src211, v_src212, v_src213;
            expand_u8tof32(v_src10, v_src100, v_src101, v_src102, v_src103);
            expand_u8tof32(v_src11, v_src110, v_src111, v_src112, v_src113);
            expand_u8tof32(v_src20, v_src200, v_src201, v_src202, v_src203);
            expand_u8tof32(v_src21, v_src210, v_src211, v_src212, v_src213);

            // Channel planes of the same pixel group share one weight offset.
            v_float32x4 v_dst0 = blend(v_src100, v_src200, weights1, weights2, weight_offset);
            v_float32x4 v_dst1 = blend(v_src110, v_src210, weights1, weights2, weight_offset);
            v_float32x4 v_dst2 = blend(v_src101, v_src201, weights1, weights2, weight_offset + 4);
            v_float32x4 v_dst3 = blend(v_src111, v_src211, weights1, weights2, weight_offset + 4);
            v_float32x4 v_dst4 = blend(v_src102, v_src202, weights1, weights2, weight_offset + 8);
            v_float32x4 v_dst5 = blend(v_src112, v_src212, weights1, weights2, weight_offset + 8);
            v_float32x4 v_dst6 = blend(v_src103, v_src203, weights1, weights2, weight_offset + 12);
            v_float32x4 v_dst7 = blend(v_src113, v_src213, weights1, weights2, weight_offset + 12);

            v_uint8x16 v_dsta = pack_f32tou8(v_dst0, v_dst2, v_dst4, v_dst6);
            v_uint8x16 v_dstb = pack_f32tou8(v_dst1, v_dst3, v_dst5, v_dst7);
            v_store_interleave(dst + x, v_dsta, v_dstb);
        }
        break;
    case 3:
        // 16 pixels (48 bytes) per iteration: same plane-wise scheme with
        // three channels; weight vectors are loaded once and reused for every
        // channel plane of the group.
        for(int weight_offset = 0 ; x <= width - step; x += step, weight_offset += weight_step)
        {
            v_uint8x16 v_src10, v_src11, v_src12, v_src20, v_src21, v_src22;
            v_load_deinterleave(src1 + x, v_src10, v_src11, v_src12);
            v_load_deinterleave(src2 + x, v_src20, v_src21, v_src22);

            v_float32x4 v_src100, v_src101, v_src102, v_src103, v_src110, v_src111, v_src112, v_src113, v_src120, v_src121, v_src122, v_src123;
            v_float32x4 v_src200, v_src201, v_src202, v_src203, v_src210, v_src211, v_src212, v_src213, v_src220, v_src221, v_src222, v_src223;
            expand_u8tof32(v_src10, v_src100, v_src101, v_src102, v_src103);
            expand_u8tof32(v_src11, v_src110, v_src111, v_src112, v_src113);
            expand_u8tof32(v_src12, v_src120, v_src121, v_src122, v_src123);
            expand_u8tof32(v_src20, v_src200, v_src201, v_src202, v_src203);
            expand_u8tof32(v_src21, v_src210, v_src211, v_src212, v_src213);
            expand_u8tof32(v_src22, v_src220, v_src221, v_src222, v_src223);

            v_float32x4 v_w10 = v_load(weights1 + weight_offset);
            v_float32x4 v_w11 = v_load(weights1 + weight_offset + 4);
            v_float32x4 v_w12 = v_load(weights1 + weight_offset + 8);
            v_float32x4 v_w13 = v_load(weights1 + weight_offset + 12);
            v_float32x4 v_w20 = v_load(weights2 + weight_offset);
            v_float32x4 v_w21 = v_load(weights2 + weight_offset + 4);
            v_float32x4 v_w22 = v_load(weights2 + weight_offset + 8);
            v_float32x4 v_w23 = v_load(weights2 + weight_offset + 12);
            // Blend in place: results reuse the expanded source registers.
            v_src100 = blend(v_src100, v_src200, v_w10, v_w20);
            v_src110 = blend(v_src110, v_src210, v_w10, v_w20);
            v_src120 = blend(v_src120, v_src220, v_w10, v_w20);
            v_src101 = blend(v_src101, v_src201, v_w11, v_w21);
            v_src111 = blend(v_src111, v_src211, v_w11, v_w21);
            v_src121 = blend(v_src121, v_src221, v_w11, v_w21);
            v_src102 = blend(v_src102, v_src202, v_w12, v_w22);
            v_src112 = blend(v_src112, v_src212, v_w12, v_w22);
            v_src122 = blend(v_src122, v_src222, v_w12, v_w22);
            v_src103 = blend(v_src103, v_src203, v_w13, v_w23);
            v_src113 = blend(v_src113, v_src213, v_w13, v_w23);
            v_src123 = blend(v_src123, v_src223, v_w13, v_w23);

            v_uint8x16 v_dst0 = pack_f32tou8(v_src100, v_src101, v_src102, v_src103);
            v_uint8x16 v_dst1 = pack_f32tou8(v_src110, v_src111, v_src112, v_src113);
            v_uint8x16 v_dst2 = pack_f32tou8(v_src120, v_src121, v_src122, v_src123);
            v_store_interleave(dst + x, v_dst0, v_dst1, v_dst2);
        }
        break;
    case 4:
        // 4 pixels (16 bytes) per iteration. After the transpose each vector
        // holds one channel across the four pixels, so it lines up lane-for-
        // lane with the per-pixel weight vectors; transpose back before the
        // packed store restores the interleaved RGBA layout.
        step = v_uint8x16::nlanes;
        weight_step = v_float32x4::nlanes;
        for(int weight_offset = 0 ; x <= width - step; x += step, weight_offset += weight_step)
        {
            v_float32x4 v_src10, v_src11, v_src12, v_src13, v_src14, v_src15, v_src16, v_src17;
            v_float32x4 v_src20, v_src21, v_src22, v_src23, v_src24, v_src25, v_src26, v_src27;
            load_expand_u8tof32(src1 + x, v_src10, v_src11, v_src12, v_src13);
            load_expand_u8tof32(src2 + x, v_src20, v_src21, v_src22, v_src23);

            v_transpose4x4(v_src10, v_src11, v_src12, v_src13, v_src14, v_src15, v_src16, v_src17);
            v_transpose4x4(v_src20, v_src21, v_src22, v_src23, v_src24, v_src25, v_src26, v_src27);

            v_float32x4 v_w1 = v_load(weights1 + weight_offset);
            v_float32x4 v_w2 = v_load(weights2 + weight_offset);
            v_src10 = blend(v_src14, v_src24, v_w1, v_w2);
            v_src11 = blend(v_src15, v_src25, v_w1, v_w2);
            v_src12 = blend(v_src16, v_src26, v_w1, v_w2);
            v_src13 = blend(v_src17, v_src27, v_w1, v_w2);

            v_float32x4 v_dst0, v_dst1, v_dst2, v_dst3;
            v_transpose4x4(v_src10, v_src11, v_src12, v_src13, v_dst0, v_dst1, v_dst2, v_dst3);
            store_pack_f32tou8(dst + x, v_dst0, v_dst1, v_dst2, v_dst3);
        }
        break;
    default:
        // Unusual channel counts fall through to the scalar loop entirely.
        break;
    }
    return x;
}
226
227
int blendLinearSimd128(const float* src1, const float* src2, const float* weights1, const float* weights2, float* dst, int x, int width, int cn)
228
{
229
int step = v_float32x4::nlanes*cn;
230
switch(cn)
231
{
232
case 1:
233
for(int weight_offset = 0 ; x <= width - step; x += step, weight_offset += v_float32x4::nlanes)
234
{
235
v_float32x4 v_src1 = v_load(src1 + x);
236
v_float32x4 v_src2 = v_load(src2 + x);
237
v_float32x4 v_w1 = v_load(weights1 + weight_offset);
238
v_float32x4 v_w2 = v_load(weights2 + weight_offset);
239
240
v_float32x4 v_dst = blend(v_src1, v_src2, v_w1, v_w2);
241
242
v_store(dst + x, v_dst);
243
}
244
break;
245
case 2:
246
for(int weight_offset = 0 ; x <= width - step; x += step, weight_offset += v_float32x4::nlanes)
247
{
248
v_float32x4 v_src10, v_src11, v_src20, v_src21;
249
v_load_deinterleave(src1 + x, v_src10, v_src11);
250
v_load_deinterleave(src2 + x, v_src20, v_src21);
251
v_float32x4 v_w1 = v_load(weights1 + weight_offset);
252
v_float32x4 v_w2 = v_load(weights2 + weight_offset);
253
254
v_float32x4 v_dst0 = blend(v_src10, v_src20, v_w1, v_w2);
255
v_float32x4 v_dst1 = blend(v_src11, v_src21, v_w1, v_w2);
256
257
v_store_interleave(dst + x, v_dst0, v_dst1);
258
}
259
break;
260
case 3:
261
for(int weight_offset = 0 ; x <= width - step; x += step, weight_offset += v_float32x4::nlanes)
262
{
263
v_float32x4 v_src10, v_src11, v_src12, v_src20, v_src21, v_src22;
264
v_load_deinterleave(src1 + x, v_src10, v_src11, v_src12);
265
v_load_deinterleave(src2 + x, v_src20, v_src21, v_src22);
266
v_float32x4 v_w1 = v_load(weights1 + weight_offset);
267
v_float32x4 v_w2 = v_load(weights2 + weight_offset);
268
269
v_float32x4 v_dst0 = blend(v_src10, v_src20, v_w1, v_w2);
270
v_float32x4 v_dst1 = blend(v_src11, v_src21, v_w1, v_w2);
271
v_float32x4 v_dst2 = blend(v_src12, v_src22, v_w1, v_w2);
272
273
v_store_interleave(dst + x, v_dst0, v_dst1, v_dst2);
274
}
275
break;
276
case 4:
277
for(int weight_offset = 0 ; x <= width - step; x += step, weight_offset += v_float32x4::nlanes)
278
{
279
v_float32x4 v_src10, v_src11, v_src12, v_src13, v_src20, v_src21, v_src22, v_src23;
280
v_load_deinterleave(src1 + x, v_src10, v_src11, v_src12, v_src13);
281
v_load_deinterleave(src2 + x, v_src20, v_src21, v_src22, v_src23);
282
v_float32x4 v_w1 = v_load(weights1 + weight_offset);
283
v_float32x4 v_w2 = v_load(weights2 + weight_offset);
284
285
v_float32x4 v_dst0 = blend(v_src10, v_src20, v_w1, v_w2);
286
v_float32x4 v_dst1 = blend(v_src11, v_src21, v_w1, v_w2);
287
v_float32x4 v_dst2 = blend(v_src12, v_src22, v_w1, v_w2);
288
v_float32x4 v_dst3 = blend(v_src13, v_src23, v_w1, v_w2);
289
290
v_store_interleave(dst + x, v_dst0, v_dst1, v_dst2, v_dst3);
291
}
292
break;
293
default:
294
break;
295
}
296
return x;
297
}
298
#endif
299
300
template <typename T>
301
class BlendLinearInvoker :
302
public ParallelLoopBody
303
{
304
public:
305
BlendLinearInvoker(const Mat & _src1, const Mat & _src2, const Mat & _weights1,
306
const Mat & _weights2, Mat & _dst) :
307
src1(&_src1), src2(&_src2), weights1(&_weights1), weights2(&_weights2), dst(&_dst)
308
{
309
}
310
311
virtual void operator() (const Range & range) const CV_OVERRIDE
312
{
313
int cn = src1->channels(), width = src1->cols * cn;
314
315
for (int y = range.start; y < range.end; ++y)
316
{
317
const float * const weights1_row = weights1->ptr<float>(y);
318
const float * const weights2_row = weights2->ptr<float>(y);
319
const T * const src1_row = src1->ptr<T>(y);
320
const T * const src2_row = src2->ptr<T>(y);
321
T * const dst_row = dst->ptr<T>(y);
322
323
int x = 0;
324
#if CV_SIMD128
325
x = blendLinearSimd128(src1_row, src2_row, weights1_row, weights2_row, dst_row, x, width, cn);
326
#endif
327
328
for ( ; x < width; ++x)
329
{
330
int x1 = x / cn;
331
float w1 = weights1_row[x1], w2 = weights2_row[x1];
332
float den = (w1 + w2 + 1e-5f);
333
float num = (src1_row[x] * w1 + src2_row[x] * w2);
334
335
dst_row[x] = saturate_cast<T>(num / den);
336
}
337
}
338
}
339
340
private:
341
const BlendLinearInvoker & operator= (const BlendLinearInvoker &);
342
BlendLinearInvoker(const BlendLinearInvoker &);
343
344
const Mat * src1, * src2, * weights1, * weights2;
345
Mat * dst;
346
};
347
348
#ifdef HAVE_OPENCL
349
350
// Dispatches the OpenCL "blendLinear" kernel. Returns false when the kernel
// cannot be built or fails to run, letting the caller fall back to CPU code.
static bool ocl_blendLinear( InputArray _src1, InputArray _src2, InputArray _weights1, InputArray _weights2, OutputArray _dst )
{
    const int type = _src1.type();
    const int depth = CV_MAT_DEPTH(type);
    const int cn = CV_MAT_CN(type);

    // Build options pin the element type, channel count and the float->T
    // conversion used inside the kernel source.
    char cvt[30];
    ocl::Kernel kernel("blendLinear", ocl::imgproc::blend_linear_oclsrc,
                       format("-D T=%s -D cn=%d -D convertToT=%s", ocl::typeToStr(depth),
                              cn, ocl::convertTypeStr(CV_32F, depth, 1, cvt)));
    if (kernel.empty())
        return false;

    UMat src1 = _src1.getUMat();
    UMat src2 = _src2.getUMat();
    UMat weights1 = _weights1.getUMat();
    UMat weights2 = _weights2.getUMat();
    UMat dst = _dst.getUMat();

    kernel.args(ocl::KernelArg::ReadOnlyNoSize(src1), ocl::KernelArg::ReadOnlyNoSize(src2),
                ocl::KernelArg::ReadOnlyNoSize(weights1), ocl::KernelArg::ReadOnlyNoSize(weights2),
                ocl::KernelArg::WriteOnly(dst));

    // One work item per destination pixel.
    size_t globalsize[2] = { (size_t)dst.cols, (size_t)dst.rows };
    return kernel.run(2, globalsize, NULL, false);
}
371
372
#endif
373
374
}
375
376
// Public entry point: per-pixel weighted blend of two images of identical
// size/type (CV_8U or CV_32F, any channel count) with CV_32FC1 weight maps.
// Tries the OpenCL path first for UMat destinations, then runs the
// row-parallel CPU invoker.
void cv::blendLinear( InputArray _src1, InputArray _src2, InputArray _weights1, InputArray _weights2, OutputArray _dst )
{
    CV_INSTRUMENT_REGION();

    const int type = _src1.type(), depth = CV_MAT_DEPTH(type);
    const Size size = _src1.size();

    CV_Assert(depth == CV_8U || depth == CV_32F);
    CV_Assert(size == _src2.size() && size == _weights1.size() && size == _weights2.size());
    CV_Assert(type == _src2.type() && _weights1.type() == CV_32FC1 && _weights2.type() == CV_32FC1);

    _dst.create(size, type);

    CV_OCL_RUN(_dst.isUMat(),
               ocl_blendLinear(_src1, _src2, _weights1, _weights2, _dst))

    Mat src1 = _src1.getMat();
    Mat src2 = _src2.getMat();
    Mat weights1 = _weights1.getMat();
    Mat weights2 = _weights2.getMat();
    Mat dst = _dst.getMat();

    // Stripe granularity for parallel_for_: roughly 64K elements per stripe.
    const double nstripes = dst.total() / (double)(1 << 16);
    if (depth == CV_8U)
    {
        BlendLinearInvoker<uchar> body(src1, src2, weights1, weights2, dst);
        parallel_for_(Range(0, src1.rows), body, nstripes);
    }
    else if (depth == CV_32F)
    {
        BlendLinearInvoker<float> body(src1, src2, weights1, weights2, dst);
        parallel_for_(Range(0, src1.rows), body, nstripes);
    }
}
406
407