CoCalc -- dot_product.cpp

GitHub Repository: Tetragramm/opencv
Path: blob/master/3rdparty/carotene/src/dot_product.cpp
¹⁶³³⁷ views
1
/*
2
 * By downloading, copying, installing or using the software you agree to this license.
3
 * If you do not agree to this license, do not download, install,
4
 * copy or use the software.
5
 *
6
 *
7
 *                           License Agreement
8
 *                For Open Source Computer Vision Library
9
 *                        (3-clause BSD License)
10
 *
11
 * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
12
 * Third party copyrights are property of their respective owners.
13
 *
14
 * Redistribution and use in source and binary forms, with or without modification,
15
 * are permitted provided that the following conditions are met:
16
 *
17
 *   * Redistributions of source code must retain the above copyright notice,
18
 *     this list of conditions and the following disclaimer.
19
 *
20
 *   * Redistributions in binary form must reproduce the above copyright notice,
21
 *     this list of conditions and the following disclaimer in the documentation
22
 *     and/or other materials provided with the distribution.
23
 *
24
 *   * Neither the names of the copyright holders nor the names of the contributors
25
 *     may be used to endorse or promote products derived from this software
26
 *     without specific prior written permission.
27
 *
28
 * This software is provided by the copyright holders and contributors "as is" and
29
 * any express or implied warranties, including, but not limited to, the implied
30
 * warranties of merchantability and fitness for a particular purpose are disclaimed.
31
 * In no event shall copyright holders or contributors be liable for any direct,
32
 * indirect, incidental, special, exemplary, or consequential damages
33
 * (including, but not limited to, procurement of substitute goods or services;
34
 * loss of use, data, or profits; or business interruption) however caused
35
 * and on any theory of liability, whether in contract, strict liability,
36
 * or tort (including negligence or otherwise) arising in any way out of
37
 * the use of this software, even if advised of the possibility of such damage.
38
 */
39

40
#include "common.hpp"
41

42
namespace CAROTENE_NS {
43

44
// Dot product of two 2D arrays of unsigned 8-bit values, returned as f64.
//
//   _size       width/height of both arrays, in elements
//   src0Base    first array; rows are located with internal::getRowPtr
//   src0Stride  row stride of the first array
//   src1Base    second array
//   src1Stride  row stride of the second array
//
// When CAROTENE_NEON is not defined the arguments are ignored and 0 is returned.
f64 dotProduct(const Size2D &_size,
               const u8 * src0Base, ptrdiff_t src0Stride,
               const u8 * src1Base, ptrdiff_t src1Stride)
{
    internal::assertSupportedConfiguration();
#ifndef CAROTENE_NEON
    (void)_size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;

    return 0;
#else
    Size2D size(_size);

    // Equal strides that match the row width mean the rows follow each other
    // without gaps, so the whole image can be walked as one long row.
    const bool rowsAreContiguous = (src0Stride == src1Stride) &&
                                   (src0Stride == (ptrdiff_t)(size.width));
    if (rowsAreContiguous)
    {
        size.width *= size.height;
        size.height = 1;
    }

// A uint32 lane can hold up to 66051 products of two uchar values without overflow.
// Each 16-element step adds two products to every lane, hence the block length
// (in elements) used before the 32-bit partial sums are folded into 64 bits.
#define DOT_UINT_BLOCKSIZE 66050*8
    f64 total = 0.0;
    for (size_t y = 0; y < size.height; ++y)
    {
        const u8 * rowA = internal::getRowPtr(src0Base, src0Stride, y);
        const u8 * rowB = internal::getRowPtr(src1Base, src1Stride, y);

        size_t x = 0;
        uint64x2_t acc64 = vmovq_n_u64(0);

        // Full 16-element vectors, processed block by block.
        while (x + 16 <= size.width)
        {
            // Last start index of a 16-element load belonging to this block.
            const size_t lastStart = std::min(x + DOT_UINT_BLOCKSIZE, size.width) - 16;

            uint32x4_t accLow = vmovq_n_u32(0);
            uint32x4_t accHigh = vmovq_n_u32(0);

            while (x <= lastStart)
            {
                internal::prefetch(rowA + x);
                internal::prefetch(rowB + x);

                const uint8x16_t a = vld1q_u8(rowA + x);
                const uint8x16_t b = vld1q_u8(rowB + x);

                // Widening 8x8 -> 16 bit products of the low and high halves.
                const uint16x8_t prodLow = vmull_u8(vget_low_u8(a), vget_low_u8(b));
                const uint16x8_t prodHigh = vmull_u8(vget_high_u8(a), vget_high_u8(b));

                // Pairwise add the products into the 32-bit accumulators.
                accLow = vpadalq_u16(accLow, prodLow);
                accHigh = vpadalq_u16(accHigh, prodHigh);

                x += 16;
            }

            // Fold the 32-bit partial sums of this block into the 64-bit accumulator.
            acc64 = vpadalq_u32(acc64, accLow);
            acc64 = vpadalq_u32(acc64, accHigh);
        }

        // At most one remaining group of 8 elements.
        if (x + 8 <= size.width)
        {
            const uint8x8_t a = vld1_u8(rowA + x);
            const uint8x8_t b = vld1_u8(rowB + x);

            acc64 = vpadalq_u32(acc64, vpaddlq_u16(vmull_u8(a, b)));
            x += 8;
        }

        // Horizontal sum of the two 64-bit lanes.
        const uint64x1_t rowSum = vadd_u64(vget_low_u64(acc64), vget_high_u64(acc64));
        total += (double)vget_lane_u64(rowSum, 0);

        // Scalar tail (fewer than 8 elements).
        for (; x < size.width; ++x)
            total += s32(rowA[x]) * s32(rowB[x]);
    }
    return total;
#endif
}
121

122
// Dot product of two 2D arrays of signed 8-bit values, returned as f64.
//
//   _size       width/height of both arrays, in elements
//   src0Base    first array; rows are located with internal::getRowPtr
//   src0Stride  row stride of the first array
//   src1Base    second array
//   src1Stride  row stride of the second array
//
// When CAROTENE_NEON is not defined the arguments are ignored and 0 is returned.
f64 dotProduct(const Size2D &_size,
               const s8 * src0Base, ptrdiff_t src0Stride,
               const s8 * src1Base, ptrdiff_t src1Stride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    // Equal strides that match the row width mean the rows follow each other
    // without gaps, so the whole image can be walked as one long row.
    if (src0Stride == src1Stride &&
        src0Stride == (ptrdiff_t)(size.width))
    {
        size.width *= size.height;
        size.height = 1;
    }

// It is possible to accumulate up to 131071 schar multiplication results in sint32 without overflow
// We process 16 elements and accumulate two new elements per step. So we could handle 131071/2*16 elements
#define DOT_INT_BLOCKSIZE 131070*8
    f64 result = 0.0;
    for (size_t row = 0; row < size.height; ++row)
    {
        const s8 * src0 = internal::getRowPtr(src0Base, src0Stride, row);
        const s8 * src1 = internal::getRowPtr(src1Base, src1Stride, row);

        size_t i = 0;
        int64x2_t ws = vmovq_n_s64(0);

        // Full 16-element vectors, processed block by block.
        while(i + 16 <= size.width)
        {
            // Use the signed block size defined for this function; the original
            // referenced the macro of the unsigned overload instead, leaving
            // DOT_INT_BLOCKSIZE unused.
            size_t lim = std::min(i + DOT_INT_BLOCKSIZE, size.width) - 16;

            int32x4_t s1 = vmovq_n_s32(0);
            int32x4_t s2 = vmovq_n_s32(0);

            for (; i <= lim; i += 16)
            {
                internal::prefetch(src0 + i);
                internal::prefetch(src1 + i);

                int8x16_t vs1 = vld1q_s8(src0 + i);
                int8x16_t vs2 = vld1q_s8(src1 + i);

                // Widening 8x8 -> 16 bit products of the low and high halves.
                int16x8_t vdot1 = vmull_s8(vget_low_s8(vs1), vget_low_s8(vs2));
                int16x8_t vdot2 = vmull_s8(vget_high_s8(vs1), vget_high_s8(vs2));

                // Pairwise add the products into the 32-bit accumulators.
                s1 = vpadalq_s16(s1, vdot1);
                s2 = vpadalq_s16(s2, vdot2);
            }

            // Fold the 32-bit partial sums of this block into the 64-bit accumulator.
            ws = vpadalq_s32(ws, s1);
            ws = vpadalq_s32(ws, s2);
        }

        // At most one remaining group of 8 elements.
        if(i + 8 <= size.width)
        {
            int8x8_t vs1 = vld1_s8(src0 + i);
            int8x8_t vs2 = vld1_s8(src1 + i);

            ws = vpadalq_s32(ws, vpaddlq_s16(vmull_s8(vs1, vs2)));
            i += 8;
        }

        // Horizontal sum of the two 64-bit lanes.
        result += (double)vget_lane_s64(vadd_s64(vget_low_s64(ws), vget_high_s64(ws)), 0);

        // Scalar tail (fewer than 8 elements).
        for (; i < size.width; ++i)
            result += s32(src0[i]) * s32(src1[i]);
    }
    return result;
#else
    (void)_size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;

    return 0;
#endif
}
199

200
// Dot product of two 2D arrays of f32 values, returned as f64.
//
//   _size       width/height of both arrays, in elements
//   src0Base    first array; rows are located with internal::getRowPtr
//   src0Stride  row stride of the first array, in bytes
//   src1Base    second array
//   src1Stride  row stride of the second array, in bytes
//
// When CAROTENE_NEON is not defined the arguments are ignored and 0 is returned.
f64 dotProduct(const Size2D &_size,
               const f32 * src0Base, ptrdiff_t src0Stride,
               const f32 * src1Base, ptrdiff_t src1Stride)
{
    internal::assertSupportedConfiguration();
#ifndef CAROTENE_NEON
    (void)_size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;

    return 0;
#else
    Size2D size(_size);

    // Equal strides that match the byte length of a row mean the rows follow
    // each other without gaps, so the whole image can be walked as one long row.
    const bool rowsAreContiguous = (src0Stride == src1Stride) &&
                                   (src0Stride == (ptrdiff_t)(size.width * sizeof(f32)));
    if (rowsAreContiguous)
    {
        size.width *= size.height;
        size.height = 1;
    }

// Number of elements accumulated in single precision before the partial
// sum is added to the double precision total.
#define DOT_FLOAT_BLOCKSIZE (1 << 13)
    f64 total = 0.0;
    for (size_t y = 0; y < size.height; ++y)
    {
        const f32 * rowA = internal::getRowPtr(src0Base, src0Stride, y);
        const f32 * rowB = internal::getRowPtr(src1Base, src1Stride, y);

        size_t x = 0;

        // Full 4-element vectors, processed block by block.
        while (x + 4 <= size.width)
        {
            // Last start index of a 4-element load belonging to this block.
            const size_t lastStart = std::min(x + DOT_FLOAT_BLOCKSIZE, size.width) - 4;
            float32x4_t blockAcc = vdupq_n_f32(0.0f);

            while (x <= lastStart)
            {
                internal::prefetch(rowA + x);
                internal::prefetch(rowB + x);
                blockAcc = vmlaq_f32(blockAcc, vld1q_f32(rowA + x), vld1q_f32(rowB + x));
                x += 4;
            }

            // Reduce the four lanes to two, then add both to the total.
            const float32x2_t halves = vpadd_f32(vget_low_f32(blockAcc), vget_high_f32(blockAcc));
            total += vget_lane_f32(halves, 0) + vget_lane_f32(halves, 1);
        }

        // At most one remaining pair of elements.
        if (x + 2 <= size.width)
        {
            const float32x2_t products = vmul_f32(vld1_f32(rowA + x), vld1_f32(rowB + x));
            total += vget_lane_f32(products, 0) + vget_lane_f32(products, 1);
            x += 2;
        }

        // Scalar tail (at most one element).
        for (; x < size.width; ++x)
            total += rowA[x] * rowB[x];
    }
    return total;
#endif
}
259

260
} // namespace CAROTENE_NS
261

262
Product

Resources

Company