Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Tetragramm
GitHub Repository: Tetragramm/opencv
Path: blob/master/3rdparty/carotene/src/dot_product.cpp
16337 views
1
/*
2
* By downloading, copying, installing or using the software you agree to this license.
3
* If you do not agree to this license, do not download, install,
4
* copy or use the software.
5
*
6
*
7
* License Agreement
8
* For Open Source Computer Vision Library
9
* (3-clause BSD License)
10
*
11
* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
12
* Third party copyrights are property of their respective owners.
13
*
14
* Redistribution and use in source and binary forms, with or without modification,
15
* are permitted provided that the following conditions are met:
16
*
17
* * Redistributions of source code must retain the above copyright notice,
18
* this list of conditions and the following disclaimer.
19
*
20
* * Redistributions in binary form must reproduce the above copyright notice,
21
* this list of conditions and the following disclaimer in the documentation
22
* and/or other materials provided with the distribution.
23
*
24
* * Neither the names of the copyright holders nor the names of the contributors
25
* may be used to endorse or promote products derived from this software
26
* without specific prior written permission.
27
*
28
* This software is provided by the copyright holders and contributors "as is" and
29
* any express or implied warranties, including, but not limited to, the implied
30
* warranties of merchantability and fitness for a particular purpose are disclaimed.
31
* In no event shall copyright holders or contributors be liable for any direct,
32
* indirect, incidental, special, exemplary, or consequential damages
33
* (including, but not limited to, procurement of substitute goods or services;
34
* loss of use, data, or profits; or business interruption) however caused
35
* and on any theory of liability, whether in contract, strict liability,
36
* or tort (including negligence or otherwise) arising in any way out of
37
* the use of this software, even if advised of the possibility of such damage.
38
*/
39
40
#include "common.hpp"
41
42
namespace CAROTENE_NS {
43
44
// Dot product of two 2D arrays of u8 elements, returned as f64.
//
//   _size                 - width and height shared by both inputs, in elements
//   src0Base, src0Stride  - base pointer and row stride of the first input
//   src1Base, src1Stride  - base pointer and row stride of the second input
//
// Only the CAROTENE_NEON build computes anything; otherwise the arguments are
// ignored and 0 is returned.
f64 dotProduct(const Size2D &_size,
               const u8 * src0Base, ptrdiff_t src0Stride,
               const u8 * src1Base, ptrdiff_t src1Stride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D dims(_size);

    // When both strides are equal to the row width the rows are handled as a
    // single long row (presumably because they are packed back-to-back).
    const bool sameStride = (src0Stride == src1Stride);
    if (sameStride && src0Stride == static_cast<ptrdiff_t>(dims.width))
    {
        dims.width *= dims.height;
        dims.height = 1;
    }

// It is possible to accumulate up to 66051 uchar multiplication results in uint32 without overflow
// We process 16 elements and accumulate two new elements per step. So we could handle 66051/2*16 elements
// (Deliberately left as a macro: macros stay visible to the rest of the translation unit.)
#define DOT_UINT_BLOCKSIZE 66050*8
    f64 total = 0.0;
    for (size_t y = 0; y < dims.height; ++y)
    {
        const u8 * rowA = internal::getRowPtr(src0Base, src0Stride, y);
        const u8 * rowB = internal::getRowPtr(src1Base, src1Stride, y);

        // 64-bit accumulator for the vectorized part of this row.
        uint64x2_t wide = vmovq_n_u64(0);
        size_t x = 0;

        // Main path: 16 elements per step, in blocks short enough that the
        // 32-bit lane accumulators cannot overflow.
        while (x + 16 <= dims.width)
        {
            const size_t blockEnd = std::min(x + DOT_UINT_BLOCKSIZE, dims.width);
            const size_t lastStart = blockEnd - 16; // last offset with 16 elements left in the block

            uint32x4_t accLo = vmovq_n_u32(0);
            uint32x4_t accHi = vmovq_n_u32(0);

            while (x <= lastStart)
            {
                internal::prefetch(rowA + x);
                internal::prefetch(rowB + x);

                const uint8x16_t a = vld1q_u8(rowA + x);
                const uint8x16_t b = vld1q_u8(rowB + x);

                // Widening 8x8->16 bit multiplies, then pairwise add-accumulate into 32-bit lanes.
                accLo = vpadalq_u16(accLo, vmull_u8(vget_low_u8(a), vget_low_u8(b)));
                accHi = vpadalq_u16(accHi, vmull_u8(vget_high_u8(a), vget_high_u8(b)));

                x += 16;
            }

            // Fold the block's 32-bit partial sums into the 64-bit accumulator.
            wide = vpadalq_u32(vpadalq_u32(wide, accLo), accHi);
        }

        // At most one remaining group of 8 elements.
        if (x + 8 <= dims.width)
        {
            const uint16x8_t prod = vmull_u8(vld1_u8(rowA + x), vld1_u8(rowB + x));
            wide = vpadalq_u32(wide, vpaddlq_u16(prod));
            x += 8;
        }

        // Horizontal add of the two 64-bit lanes.
        const uint64x1_t folded = vadd_u64(vget_low_u64(wide), vget_high_u64(wide));
        total += static_cast<double>(vget_lane_u64(folded, 0));

        // Scalar tail (fewer than 8 elements).
        while (x < dims.width)
        {
            total += s32(rowA[x]) * s32(rowB[x]);
            ++x;
        }
    }
    return total;
#else
    (void)_size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;

    return 0;
#endif
}
121
122
// Dot product of two 2D arrays of s8 elements, returned as f64.
//
//   _size                 - width and height shared by both inputs, in elements
//   src0Base, src0Stride  - base pointer and row stride of the first input
//   src1Base, src1Stride  - base pointer and row stride of the second input
//
// Only the CAROTENE_NEON build computes anything; otherwise the arguments are
// ignored and 0 is returned.
f64 dotProduct(const Size2D &_size,
               const s8 * src0Base, ptrdiff_t src0Stride,
               const s8 * src1Base, ptrdiff_t src1Stride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    // When both strides are equal to the row width the rows are handled as a
    // single long row (presumably because they are packed back-to-back).
    if (src0Stride == src1Stride &&
        src0Stride == (ptrdiff_t)(size.width))
    {
        size.width *= size.height;
        size.height = 1;
    }

// It is possible to accumulate up to 131071 schar multiplication results in sint32 without overflow
// We process 16 elements and accumulate two new elements per step. So we could handle 131071/2*16 elements
#define DOT_INT_BLOCKSIZE (131070*8)
    f64 result = 0.0;
    for (size_t row = 0; row < size.height; ++row)
    {
        const s8 * src0 = internal::getRowPtr(src0Base, src0Stride, row);
        const s8 * src1 = internal::getRowPtr(src1Base, src1Stride, row);

        size_t i = 0;
        // 64-bit accumulator for the vectorized part of this row.
        int64x2_t ws = vmovq_n_s64(0);

        // Main path: 16 elements per step, in blocks short enough that the
        // 32-bit lane accumulators cannot overflow.
        while(i + 16 <= size.width)
        {
            // Use the signed block size defined above; this overload must not
            // depend on the macro that belongs to the unsigned overload.
            size_t lim = std::min(i + DOT_INT_BLOCKSIZE, size.width) - 16;

            int32x4_t s1 = vmovq_n_s32(0);
            int32x4_t s2 = vmovq_n_s32(0);

            for (; i <= lim; i += 16)
            {
                internal::prefetch(src0 + i);
                internal::prefetch(src1 + i);

                int8x16_t vs1 = vld1q_s8(src0 + i);
                int8x16_t vs2 = vld1q_s8(src1 + i);

                // Widening 8x8->16 bit multiplies of the low and high halves.
                int16x8_t vdot1 = vmull_s8(vget_low_s8(vs1), vget_low_s8(vs2));
                int16x8_t vdot2 = vmull_s8(vget_high_s8(vs1), vget_high_s8(vs2));

                // Pairwise add-accumulate into 32-bit lanes.
                s1 = vpadalq_s16(s1, vdot1);
                s2 = vpadalq_s16(s2, vdot2);
            }

            // Fold the block's 32-bit partial sums into the 64-bit accumulator.
            ws = vpadalq_s32(ws, s1);
            ws = vpadalq_s32(ws, s2);
        }

        // At most one remaining group of 8 elements.
        if(i + 8 <= size.width)
        {
            int8x8_t vs1 = vld1_s8(src0 + i);
            int8x8_t vs2 = vld1_s8(src1 + i);

            ws = vpadalq_s32(ws, vpaddlq_s16(vmull_s8(vs1, vs2)));
            i += 8;
        }

        // Horizontal add of the two 64-bit lanes.
        result += (double)vget_lane_s64(vadd_s64(vget_low_s64(ws), vget_high_s64(ws)), 0);

        // Scalar tail (fewer than 8 elements).
        for (; i < size.width; ++i)
            result += s32(src0[i]) * s32(src1[i]);
    }
    return result;
#else
    (void)_size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;

    return 0;
#endif
}
199
200
// Dot product of two 2D arrays of f32 elements, returned as f64.
//
//   _size                 - width and height shared by both inputs, in elements
//   src0Base, src0Stride  - base pointer and row stride of the first input
//   src1Base, src1Stride  - base pointer and row stride of the second input
//
// Only the CAROTENE_NEON build computes anything; otherwise the arguments are
// ignored and 0 is returned.
f64 dotProduct(const Size2D &_size,
               const f32 * src0Base, ptrdiff_t src0Stride,
               const f32 * src1Base, ptrdiff_t src1Stride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D dims(_size);

    // When both strides equal width * sizeof(f32) the rows are handled as a
    // single long row.
    const bool sameStride = (src0Stride == src1Stride);
    if (sameStride && src0Stride == static_cast<ptrdiff_t>(dims.width * sizeof(f32)))
    {
        dims.width *= dims.height;
        dims.height = 1;
    }

// Number of elements summed in f32 lanes before the partial sum is added to
// the f64 total (presumably chosen to bound single-precision rounding error).
#define DOT_FLOAT_BLOCKSIZE (1 << 13)
    f64 total = 0.0;
    for (size_t y = 0; y < dims.height; ++y)
    {
        const f32 * rowA = internal::getRowPtr(src0Base, src0Stride, y);
        const f32 * rowB = internal::getRowPtr(src1Base, src1Stride, y);

        size_t x = 0;

        // Main path: 4 elements per step, one f32 vector accumulator per block.
        while (x + 4 <= dims.width)
        {
            const size_t blockEnd = std::min(x + DOT_FLOAT_BLOCKSIZE, dims.width);
            const size_t lastStart = blockEnd - 4; // last offset with 4 elements left in the block

            float32x4_t acc = vdupq_n_f32(0.0f);
            while (x <= lastStart)
            {
                internal::prefetch(rowA + x);
                internal::prefetch(rowB + x);
                acc = vmlaq_f32(acc, vld1q_f32(rowA + x), vld1q_f32(rowB + x));
                x += 4;
            }

            // Reduce the four lanes to two, then add the pair in f32 before
            // widening into the f64 total.
            const float32x2_t pair = vpadd_f32(vget_low_f32(acc), vget_high_f32(acc));
            total += vget_lane_f32(pair, 0) + vget_lane_f32(pair, 1);
        }

        // At most one remaining group of 2 elements.
        if (x + 2 <= dims.width)
        {
            const float32x2_t prod = vmul_f32(vld1_f32(rowA + x), vld1_f32(rowB + x));
            total += vget_lane_f32(prod, 0) + vget_lane_f32(prod, 1);
            x += 2;
        }

        // Scalar tail (at most one element).
        while (x < dims.width)
        {
            total += rowA[x] * rowB[x];
            ++x;
        }
    }
    return total;
#else
    (void)_size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;

    return 0;
#endif
}
259
260
} // namespace CAROTENE_NS
261
262