Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Tetragramm
GitHub Repository: Tetragramm/opencv
Path: blob/master/3rdparty/carotene/src/norm.cpp
16337 views
/*
 * By downloading, copying, installing or using the software you agree to this license.
 * If you do not agree to this license, do not download, install,
 * copy or use the software.
 *
 *
 *                           License Agreement
 *                For Open Source Computer Vision Library
 *                        (3-clause BSD License)
 *
 * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
 * Third party copyrights are property of their respective owners.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *
 *   * Redistributions in binary form must reproduce the above copyright notice,
 *     this list of conditions and the following disclaimer in the documentation
 *     and/or other materials provided with the distribution.
 *
 *   * Neither the names of the copyright holders nor the names of the contributors
 *     may be used to endorse or promote products derived from this software
 *     without specific prior written permission.
 *
 * This software is provided by the copyright holders and contributors "as is" and
 * any express or implied warranties, including, but not limited to, the implied
 * warranties of merchantability and fitness for a particular purpose are disclaimed.
 * In no event shall copyright holders or contributors be liable for any direct,
 * indirect, incidental, special, exemplary, or consequential damages
 * (including, but not limited to, procurement of substitute goods or services;
 * loss of use, data, or profits; or business interruption) however caused
 * and on any theory of liability, whether in contract, strict liability,
 * or tort (including negligence or otherwise) arising in any way out of
 * the use of this software, even if advised of the possibility of such damage.
 */
39
40
#include "common.hpp"
41
42
namespace CAROTENE_NS {
43
44
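// Block length, in elements, for partial sums kept in f32 vector lanes: at most
// this many elements are accumulated in f32 before the partial sums are added
// to the f64 result. Must be a multiple of 4.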
45
#define NORM32F_BLOCK_SIZE 2048
46
47
/*
 * Infinity norm of a u8 region: the largest element value.
 * Row k is obtained through internal::getRowPtr(srcBase, srcStride, k).
 * When CAROTENE_NEON is not defined the arguments are ignored and 0 is returned.
 */
s32 normInf(const Size2D &_size,
            const u8 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D roi(_size);
    // When the stride equals the width, process the region as one long row.
    if ((ptrdiff_t)(roi.width) == srcStride)
    {
        roi.width *= roi.height;
        roi.height = 1;
    }

    const size_t width = roi.width;
    s32 best = 0;
    for (size_t row = 0; row < roi.height; ++row)
    {
        const u8* line = internal::getRowPtr(srcBase, srcStride, row);
        size_t x = 0;

        if (width >= 16)
        {
            // Seed the running maximum with the first 16 elements.
            uint8x16_t vbest = vld1q_u8(line);
            x = 16;
            while (x <= width - 16)
            {
                internal::prefetch(line + x);
                vbest = vmaxq_u8(vld1q_u8(line + x), vbest);
                x += 16;
            }

            // Fold 16 lanes down to 8 and finish the reduction in scalar code.
            u8 lanes[8];
            vst1_u8(lanes, vmax_u8(vget_low_u8(vbest), vget_high_u8(vbest)));
            for (u32 j = 0; j < 8; j++)
                best = std::max((s32)(lanes[j]), best);
        }

        // Leftover elements (or the whole row when it is shorter than 16).
        while (x < width)
        {
            best = std::max((s32)(line[x]), best);
            ++x;
        }
    }
    return best;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;

    return 0;
#endif
}
90
91
/*
 * Infinity norm of an s8 region: the largest absolute element value.
 * Row k is obtained through internal::getRowPtr(srcBase, srcStride, k).
 * When CAROTENE_NEON is not defined the arguments are ignored and 0 is returned.
 */
s32 normInf(const Size2D &_size,
            const s8 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D roi(_size);
    // When the stride equals the width, process the region as one long row.
    if ((ptrdiff_t)(roi.width) == srcStride)
    {
        roi.width *= roi.height;
        roi.height = 1;
    }

    const size_t width = roi.width;
    s32 best = 0;
    for (size_t row = 0; row < roi.height; ++row)
    {
        const s8* line = internal::getRowPtr(srcBase, srcStride, row);
        size_t x = 0;

        if (width >= 16)
        {
            // Absolute values are reinterpreted as unsigned lanes before taking maxima.
            uint8x16_t vbest = vreinterpretq_u8_s8(vabsq_s8(vld1q_s8(line)));
            x = 16;
            while (x <= width - 16)
            {
                internal::prefetch(line + x);
                const uint8x16_t vmag = vreinterpretq_u8_s8(vabsq_s8(vld1q_s8(line + x)));
                vbest = vmaxq_u8(vmag, vbest);
                x += 16;
            }

            // Fold 16 lanes down to 8 and finish the reduction in scalar code.
            u8 lanes[8];
            vst1_u8(lanes, vmax_u8(vget_low_u8(vbest), vget_high_u8(vbest)));
            for (u32 j = 0; j < 8; j++)
                best = std::max((s32)(lanes[j]), best);
        }

        // Leftover elements (or the whole row when it is shorter than 16).
        while (x < width)
        {
            best = std::max((s32)(std::abs(line[x])), best);
            ++x;
        }
    }
    return best;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;

    return 0;
#endif
}
134
135
/*
 * Infinity norm of a u16 region: the largest element value.
 * Row k is obtained through internal::getRowPtr(srcBase, srcStride, k).
 * When CAROTENE_NEON is not defined the arguments are ignored and 0 is returned.
 */
s32 normInf(const Size2D &_size,
            const u16 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D roi(_size);
    // When the stride equals the width, process the region as one long row.
    if ((ptrdiff_t)(roi.width) == srcStride)
    {
        roi.width *= roi.height;
        roi.height = 1;
    }

    const size_t width = roi.width;
    s32 best = 0;
    for (size_t row = 0; row < roi.height; ++row)
    {
        const u16* line = internal::getRowPtr(srcBase, srcStride, row);
        size_t x = 0;

        if (width >= 8)
        {
            // Seed the running maximum with the first 8 elements.
            uint16x8_t vbest = vld1q_u16(line);
            x = 8;
            while (x <= width - 8)
            {
                internal::prefetch(line + x);
                vbest = vmaxq_u16(vld1q_u16(line + x), vbest);
                x += 8;
            }

            // Fold 8 lanes down to 4 and finish the reduction in scalar code.
            u16 lanes[4];
            vst1_u16(lanes, vmax_u16(vget_low_u16(vbest), vget_high_u16(vbest)));
            for (u32 j = 0; j < 4; j++)
                best = std::max((s32)(lanes[j]), best);
        }

        // Leftover elements (or the whole row when it is shorter than 8).
        while (x < width)
        {
            best = std::max((s32)(line[x]), best);
            ++x;
        }
    }
    return best;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;

    return 0;
#endif
}
178
179
/*
 * Infinity norm of an s16 region: the largest absolute element value.
 * Row k is obtained through internal::getRowPtr(srcBase, srcStride, k).
 * When CAROTENE_NEON is not defined the arguments are ignored and 0 is returned.
 */
s32 normInf(const Size2D &_size,
            const s16 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D roi(_size);
    // When the stride equals the width, process the region as one long row.
    if ((ptrdiff_t)(roi.width) == srcStride)
    {
        roi.width *= roi.height;
        roi.height = 1;
    }

    const size_t width = roi.width;
    s32 best = 0;
    for (size_t row = 0; row < roi.height; ++row)
    {
        const s16* line = internal::getRowPtr(srcBase, srcStride, row);
        size_t x = 0;

        if (width >= 8)
        {
            // Absolute values are reinterpreted as unsigned lanes before taking maxima.
            uint16x8_t vbest = vreinterpretq_u16_s16(vabsq_s16(vld1q_s16(line)));
            x = 8;
            while (x <= width - 8)
            {
                internal::prefetch(line + x);
                const uint16x8_t vmag = vreinterpretq_u16_s16(vabsq_s16(vld1q_s16(line + x)));
                vbest = vmaxq_u16(vmag, vbest);
                x += 8;
            }

            // Fold 8 lanes down to 4 and finish the reduction in scalar code.
            u16 lanes[4];
            vst1_u16(lanes, vmax_u16(vget_low_u16(vbest), vget_high_u16(vbest)));
            for (u32 j = 0; j < 4; j++)
                best = std::max((s32)(lanes[j]), best);
        }

        // Leftover elements (or the whole row when it is shorter than 8).
        while (x < width)
        {
            best = std::max(std::abs((s32)(line[x])), best);
            ++x;
        }
    }
    return best;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;

    return 0;
#endif
}
222
223
/*
 * Infinity norm of an s32 region: the largest absolute element value.
 * Row k is obtained through internal::getRowPtr(srcBase, srcStride, k).
 *
 * The magnitude of the most negative s32 value does not fit in the s32
 * return type, so such elements saturate to 0x7fFFffFF.  A plain
 * (wrapping) absolute value would leave them negative, making them lose
 * every max() comparison, and std::abs() on that value is undefined.
 *
 * When CAROTENE_NEON is not defined the arguments are ignored and 0 is returned.
 */
s32 normInf(const Size2D &_size,
            const s32 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    // When the stride equals the width, process the region as one long row.
    if (srcStride == (ptrdiff_t)(size.width))
    {
        size.width *= size.height;
        size.height = 1;
    }
    s32 result = 0;
    for(size_t k = 0; k < size.height; ++k)
    {
        const s32* src = internal::getRowPtr( srcBase, srcStride, k);
        size_t i = 0;
        if (size.width >= 4)
        {
            // Saturating abs keeps every lane in [0, 0x7fFFffFF], so signed maxima are safe.
            int32x4_t s = vqabsq_s32(vld1q_s32(src));
            for (i = 4; i <= size.width - 4; i += 4)
            {
                internal::prefetch(src + i);
                int32x4_t s1 = vqabsq_s32(vld1q_s32(src + i));
                s = vmaxq_s32(s1, s);
            }
            // Fold 4 lanes down to 2 and finish the reduction in scalar code.
            s32 s2[2];
            int32x2_t s3 = vmax_s32(vget_low_s32(s), vget_high_s32(s));
            vst1_s32(s2, s3);
            for (u32 j = 0; j < 2; j++)
                result = std::max(s2[j], result);
        }
        for ( ; i < size.width; i++)
        {
            const s32 v = src[i];
            // Same saturating magnitude as the vector path; never negates the most negative value.
            const s32 a = v < -0x7fFFffFF ? 0x7fFFffFF : (v < 0 ? -v : v);
            result = std::max(a, result);
        }
    }
    return result;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;

    return 0;
#endif
}
266
267
/*
 * Infinity norm of an f32 region: the largest absolute element value.
 * Row k is obtained through internal::getRowPtr(srcBase, srcStride, k).
 * When CAROTENE_NEON is not defined the arguments are ignored and 0 is returned.
 */
f32 normInf(const Size2D &_size,
            const f32 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D roi(_size);
    // When the stride equals the width, process the region as one long row.
    if ((ptrdiff_t)(roi.width) == srcStride)
    {
        roi.width *= roi.height;
        roi.height = 1;
    }

    const size_t width = roi.width;
    f32 best = 0;
    for (size_t row = 0; row < roi.height; ++row)
    {
        const f32* line = internal::getRowPtr(srcBase, srcStride, row);
        size_t x = 0;

        if (width >= 4)
        {
            // Seed the running maximum with the magnitudes of the first 4 elements.
            float32x4_t vbest = vabsq_f32(vld1q_f32(line));
            x = 4;
            while (x <= width - 4)
            {
                internal::prefetch(line + x);
                const float32x4_t vmag = vabsq_f32(vld1q_f32(line + x));
                vbest = vmaxq_f32(vmag, vbest);
                x += 4;
            }

            // Fold 4 lanes down to 2 and finish the reduction in scalar code.
            f32 lanes[2];
            vst1_f32(lanes, vmax_f32(vget_low_f32(vbest), vget_high_f32(vbest)));
            for (u32 j = 0; j < 2; j++)
                best = std::max(lanes[j], best);
        }

        // Leftover elements (or the whole row when it is shorter than 4).
        while (x < width)
        {
            best = std::max(std::abs(line[x]), best);
            ++x;
        }
    }
    return best;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;

    return 0.;
#endif
}
311
312
/*
 * L1 norm of a u8 region: the sum of all element values, accumulated in s32.
 * Row k is obtained through internal::getRowPtr(srcBase, srcStride, k).
 * When CAROTENE_NEON is not defined the arguments are ignored and 0 is returned.
 */
s32 normL1(const Size2D &_size,
           const u8 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D roi(_size);
    // When the stride equals the width, process the region as one long row.
    if ((ptrdiff_t)(roi.width) == srcStride)
    {
        roi.width *= roi.height;
        roi.height = 1;
    }

    const size_t width = roi.width;
    // A vector block may only start where at least 8 elements remain.
    const size_t vecEnd = width >= 7 ? width - 7 : 0;
    s32 total = 0;
    for (size_t row = 0; row < roi.height; ++row)
    {
        const u8* line = internal::getRowPtr(srcBase, srcStride, row);
        size_t x = 0;
        uint32x4_t vtotal = vmovq_n_u32(0);

        while (x < vecEnd)
        {
            // At most 256 elements are summed in 16-bit lanes before widening to 32 bits.
            const size_t last = std::min(width, x + 256) - 8;
            uint16x8_t vpart = vmovl_u8(vld1_u8(line + x));

            for (x += 8; x <= last; x += 8)
            {
                internal::prefetch(line + x);
                vpart = vaddw_u8(vpart, vld1_u8(line + x));
            }

            const uint16x4_t vhalf = vadd_u16(vget_low_u16(vpart), vget_high_u16(vpart));
            vtotal = vaddw_u16(vtotal, vhalf);
        }

        // Horizontal reduction of the 32-bit accumulator for this row.
        u32 lanes[2];
        vst1_u32(lanes, vadd_u32(vget_low_u32(vtotal), vget_high_u32(vtotal)));
        total += (s32)(lanes[0] + lanes[1]);

        // Leftover elements.
        while (x < width)
        {
            total += (s32)(line[x]);
            ++x;
        }
    }
    return total;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;

    return 0;
#endif
}
365
366
/*
 * L1 norm of an s8 region: the sum of absolute element values, accumulated in s32.
 * Row k is obtained through internal::getRowPtr(srcBase, srcStride, k).
 * When CAROTENE_NEON is not defined the arguments are ignored and 0 is returned.
 */
s32 normL1(const Size2D &_size,
           const s8 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D roi(_size);
    // When the stride equals the width, process the region as one long row.
    if ((ptrdiff_t)(roi.width) == srcStride)
    {
        roi.width *= roi.height;
        roi.height = 1;
    }

    const size_t width = roi.width;
    // A vector block may only start where at least 8 elements remain.
    const size_t vecEnd = width >= 7 ? width - 7 : 0;
    s32 total = 0;
    for (size_t row = 0; row < roi.height; ++row)
    {
        const s8* line = internal::getRowPtr(srcBase, srcStride, row);
        size_t x = 0;
        uint32x4_t vtotal = vmovq_n_u32(0);

        while (x < vecEnd)
        {
            // At most 256 magnitudes are summed in 16-bit lanes before widening to 32 bits.
            const size_t last = std::min(width, x + 256) - 8;
            uint16x8_t vpart = vmovl_u8(vreinterpret_u8_s8(vabs_s8(vld1_s8(line + x))));

            for (x += 8; x <= last; x += 8)
            {
                internal::prefetch(line + x);
                const uint8x8_t vmag = vreinterpret_u8_s8(vabs_s8(vld1_s8(line + x)));
                vpart = vaddw_u8(vpart, vmag);
            }

            const uint16x4_t vhalf = vadd_u16(vget_low_u16(vpart), vget_high_u16(vpart));
            vtotal = vaddw_u16(vtotal, vhalf);
        }

        // Horizontal reduction of the 32-bit accumulator for this row.
        u32 lanes[2];
        vst1_u32(lanes, vadd_u32(vget_low_u32(vtotal), vget_high_u32(vtotal)));
        total += (s32)(lanes[0] + lanes[1]);

        // Leftover elements.
        while (x < width)
        {
            total += (s32)(std::abs(line[x]));
            ++x;
        }
    }
    return total;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;

    return 0;
#endif
}
420
421
/*
 * L1 norm of a u16 region: the sum of all element values, accumulated in s32.
 * Row k is obtained through internal::getRowPtr(srcBase, srcStride, k).
 * When CAROTENE_NEON is not defined the arguments are ignored and 0 is returned.
 */
s32 normL1(const Size2D &_size,
           const u16 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D roi(_size);
    // When the stride equals the width, process the region as one long row.
    if ((ptrdiff_t)(roi.width) == srcStride)
    {
        roi.width *= roi.height;
        roi.height = 1;
    }

    const size_t width = roi.width;
    // A 4-element vector load may only start where at least 4 elements remain.
    const size_t vecEnd = width >= 3 ? width - 3 : 0;
    s32 total = 0;
    for (size_t row = 0; row < roi.height; ++row)
    {
        const u16* line = internal::getRowPtr(srcBase, srcStride, row);
        size_t x = 0;

        // Widening add: 16-bit inputs go straight into 32-bit lanes.
        uint32x4_t vtotal = vmovq_n_u32(0);
        while (x < vecEnd)
        {
            internal::prefetch(line + x);
            vtotal = vaddw_u16(vtotal, vld1_u16(line + x));
            x += 4;
        }

        u32 lanes[4];
        vst1q_u32(lanes, vtotal);
        for (u32 j = 0; j < 4; j++)
            total += lanes[j];

        // Leftover elements.
        while (x < width)
        {
            total += (s32)(line[x]);
            ++x;
        }
    }
    return total;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;

    return 0;
#endif
}
461
462
/*
 * L1 norm of an s16 region: the sum of absolute element values, accumulated in s32.
 * Row k is obtained through internal::getRowPtr(srcBase, srcStride, k).
 * When CAROTENE_NEON is not defined the arguments are ignored and 0 is returned.
 */
s32 normL1(const Size2D &_size,
           const s16 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D roi(_size);
    // When the stride equals the width, process the region as one long row.
    if ((ptrdiff_t)(roi.width) == srcStride)
    {
        roi.width *= roi.height;
        roi.height = 1;
    }

    const size_t width = roi.width;
    // A 4-element vector load may only start where at least 4 elements remain.
    const size_t vecEnd = width >= 3 ? width - 3 : 0;
    s32 total = 0;
    for (size_t row = 0; row < roi.height; ++row)
    {
        const s16* line = internal::getRowPtr(srcBase, srcStride, row);
        size_t x = 0;

        // Magnitudes are reinterpreted as unsigned and widened into 32-bit lanes.
        uint32x4_t vtotal = vmovq_n_u32(0);
        while (x < vecEnd)
        {
            internal::prefetch(line + x);
            const uint16x4_t vmag = vreinterpret_u16_s16(vabs_s16(vld1_s16(line + x)));
            vtotal = vaddw_u16(vtotal, vmag);
            x += 4;
        }

        u32 lanes[4];
        vst1q_u32(lanes, vtotal);
        for (u32 j = 0; j < 4; j++)
            total += lanes[j];

        // Leftover elements.
        while (x < width)
        {
            total += (s32)(std::abs(line[x]));
            ++x;
        }
    }
    return total;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;

    return 0;
#endif
}
502
503
/*
 * L1 norm of an s32 region: the sum of absolute element values, returned as f64.
 * Row k is obtained through internal::getRowPtr(srcBase, srcStride, k).
 *
 * Partial sums are kept in f32 lanes for at most NORM32F_BLOCK_SIZE elements
 * and then added to the f64 result.
 *
 * The most negative s32 value is handled explicitly: vabsq_s32 leaves its bit
 * pattern unchanged (0x80000000), so the lanes are converted as *unsigned*
 * integers to obtain +2^31 rather than -2^31, and the scalar tail takes the
 * absolute value after widening to f64 instead of calling the integer
 * std::abs (undefined for that value).
 *
 * When CAROTENE_NEON is not defined the arguments are ignored and 0 is returned.
 */
f64 normL1(const Size2D &_size,
           const s32 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    // When the stride equals the width, process the region as one long row.
    if (srcStride == (ptrdiff_t)(size.width))
    {
        size.width *= size.height;
        size.height = 1;
    }
    // A 4-element vector load may only start where at least 4 elements remain.
    size_t roiw4 = size.width >= 3 ? size.width - 3 : 0;
    f64 result = 0;
    for(size_t k = 0; k < size.height; ++k)
    {
        const s32* src = internal::getRowPtr( srcBase, srcStride, k);
        size_t i = 0;
        for (; i < roiw4;)
        {
            size_t limit = std::min(size.width, i + NORM32F_BLOCK_SIZE) - 4;
            // Unsigned conversion: |x| is in [0, 2^31] once reinterpreted as u32.
            float32x4_t s = vcvtq_f32_u32(vreinterpretq_u32_s32(vabsq_s32(vld1q_s32(src + i))));
            for (i += 4; i <= limit; i += 4 )
            {
                internal::prefetch(src + i);
                float32x4_t s1 = vcvtq_f32_u32(vreinterpretq_u32_s32(vabsq_s32(vld1q_s32(src + i))));
                s = vaddq_f32(s, s1);
            }

            // Flush the f32 partial sums of this block into the f64 result.
            f32 s2[4];
            vst1q_f32(s2, s);

            for (u32 j = 0; j < 4; j++)
                result += (f64)(s2[j]);
        }
        for ( ; i < size.width; i++)
            result += std::abs((f64)(src[i]));
    }
    return result;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;

    return 0.;
#endif
}
549
550
/*
 * L1 norm of an f32 region: the sum of absolute element values, returned as f64.
 * Partial sums are kept in f32 lanes for at most NORM32F_BLOCK_SIZE elements
 * and then added to the f64 result.
 * Row k is obtained through internal::getRowPtr(srcBase, srcStride, k).
 * When CAROTENE_NEON is not defined the arguments are ignored and 0 is returned.
 */
f64 normL1(const Size2D &_size,
           const f32 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D roi(_size);
    // When the stride equals the width, process the region as one long row.
    if ((ptrdiff_t)(roi.width) == srcStride)
    {
        roi.width *= roi.height;
        roi.height = 1;
    }

    const size_t width = roi.width;
    // A 4-element vector load may only start where at least 4 elements remain.
    const size_t vecEnd = width >= 3 ? width - 3 : 0;
    f64 total = 0;
    for (size_t row = 0; row < roi.height; ++row)
    {
        const f32* line = internal::getRowPtr(srcBase, srcStride, row);
        size_t x = 0;

        while (x < vecEnd)
        {
            const size_t last = std::min(width, x + NORM32F_BLOCK_SIZE) - 4;
            float32x4_t vpart = vabsq_f32(vld1q_f32(line + x));
            for (x += 4; x <= last; x += 4)
            {
                internal::prefetch(line + x);
                const float32x4_t vmag = vabsq_f32(vld1q_f32(line + x));
                vpart = vaddq_f32(vmag, vpart);
            }

            // Flush the f32 partial sums of this block into the f64 result.
            f32 lanes[4];
            vst1q_f32(lanes, vpart);
            for (u32 j = 0; j < 4; j++)
                total += (f64)(lanes[j]);
        }

        // Leftover elements.
        while (x < width)
        {
            total += std::abs((f64)(line[x]));
            ++x;
        }
    }
    return total;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;

    return 0.;
#endif
}
598
599
/*
 * Sum of squared elements of a u8 region, accumulated in s32
 * (no square root is taken here).
 * Row k is obtained through internal::getRowPtr(srcBase, srcStride, k).
 * When CAROTENE_NEON is not defined the arguments are ignored and 0 is returned.
 */
s32 normL2(const Size2D &_size,
           const u8 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D roi(_size);
    // When the stride equals the width, process the region as one long row.
    if ((ptrdiff_t)(roi.width) == srcStride)
    {
        roi.width *= roi.height;
        roi.height = 1;
    }

    const size_t width = roi.width;
    // An 8-element vector load may only start where at least 8 elements remain.
    const size_t vecEnd = width >= 7 ? width - 7 : 0;
    s32 total = 0;
    for (size_t row = 0; row < roi.height; ++row)
    {
        const u8* line = internal::getRowPtr(srcBase, srcStride, row);
        size_t x = 0;

        // Two 32-bit accumulators: one per half of the 8 widened squares.
        uint32x4_t vlo = vmovq_n_u32(0);
        uint32x4_t vhi = vmovq_n_u32(0);

        while (x < vecEnd)
        {
            internal::prefetch(line + x);
            const uint8x8_t v = vld1_u8(line + x);
            const uint16x8_t vsq = vmull_u8(v, v);

            vlo = vaddw_u16(vlo, vget_low_u16(vsq));
            vhi = vaddw_u16(vhi, vget_high_u16(vsq));
            x += 8;
        }

        // Horizontal reduction of both accumulators.
        const uint32x4_t vsum = vaddq_u32(vlo, vhi);
        u32 lanes[2];
        vst1_u32(lanes, vadd_u32(vget_low_u32(vsum), vget_high_u32(vsum)));
        total += (s32)(lanes[0] + lanes[1]);

        // Leftover elements.
        while (x < width)
        {
            total += (s32)(line[x]) * (s32)(line[x]);
            ++x;
        }
    }
    return total;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;

    return 0;
#endif
}
650
651
/*
 * Sum of squared elements of an s8 region, accumulated in s32
 * (no square root is taken here).
 * Row k is obtained through internal::getRowPtr(srcBase, srcStride, k).
 * When CAROTENE_NEON is not defined the arguments are ignored and 0 is returned.
 */
s32 normL2(const Size2D &_size,
           const s8 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D roi(_size);
    // When the stride equals the width, process the region as one long row.
    if ((ptrdiff_t)(roi.width) == srcStride)
    {
        roi.width *= roi.height;
        roi.height = 1;
    }

    const size_t width = roi.width;
    // An 8-element vector load may only start where at least 8 elements remain.
    const size_t vecEnd = width >= 7 ? width - 7 : 0;
    s32 total = 0;
    for (size_t row = 0; row < roi.height; ++row)
    {
        const s8* line = internal::getRowPtr(srcBase, srcStride, row);
        size_t x = 0;

        // Two 32-bit accumulators: one per half of the 8 widened squares.
        int32x4_t vlo = vmovq_n_s32(0);
        int32x4_t vhi = vmovq_n_s32(0);

        while (x < vecEnd)
        {
            internal::prefetch(line + x);
            const int8x8_t v = vld1_s8(line + x);
            const int16x8_t vsq = vmull_s8(v, v);

            vlo = vaddw_s16(vlo, vget_low_s16(vsq));
            vhi = vaddw_s16(vhi, vget_high_s16(vsq));
            x += 8;
        }

        // Horizontal reduction of both accumulators.
        const int32x4_t vsum = vaddq_s32(vlo, vhi);
        s32 lanes[2];
        vst1_s32(lanes, vadd_s32(vget_low_s32(vsum), vget_high_s32(vsum)));
        total += lanes[0] + lanes[1];

        // Leftover elements.
        while (x < width)
        {
            total += (s32)(line[x]) * (s32)(line[x]);
            ++x;
        }
    }
    return total;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;

    return 0;
#endif
}
702
703
/*
 * Sum of squared elements of a u16 region, returned as f64
 * (no square root is taken here).
 * Squares are formed as 32-bit integers, converted to f32 and summed in f32
 * lanes for at most NORM32F_BLOCK_SIZE elements before being added to the
 * f64 result.
 * Row k is obtained through internal::getRowPtr(srcBase, srcStride, k).
 * When CAROTENE_NEON is not defined the arguments are ignored and 0 is returned.
 */
f64 normL2(const Size2D &_size,
           const u16 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D roi(_size);
    // When the stride equals the width, process the region as one long row.
    if ((ptrdiff_t)(roi.width) == srcStride)
    {
        roi.width *= roi.height;
        roi.height = 1;
    }

    const size_t width = roi.width;
    // A 4-element vector load may only start where at least 4 elements remain.
    const size_t vecEnd = width >= 3 ? width - 3 : 0;
    f64 total = 0;
    for (size_t row = 0; row < roi.height; ++row)
    {
        const u16* line = internal::getRowPtr(srcBase, srcStride, row);
        size_t x = 0;

        while (x < vecEnd)
        {
            const size_t last = std::min(width, x + NORM32F_BLOCK_SIZE) - 4;
            const uint16x4_t vfirst = vld1_u16(line + x);
            float32x4_t vpart = vcvtq_f32_u32(vmull_u16(vfirst, vfirst));
            for (x += 4; x <= last; x += 4)
            {
                internal::prefetch(line + x);
                const uint16x4_t v = vld1_u16(line + x);
                const float32x4_t vsq = vcvtq_f32_u32(vmull_u16(v, v));
                vpart = vaddq_f32(vpart, vsq);
            }

            // Flush the f32 partial sums of this block into the f64 result.
            f32 lanes[4];
            vst1q_f32(lanes, vpart);
            for (u32 j = 0; j < 4; j++)
                total += (f64)(lanes[j]);
        }

        // Leftover elements.
        while (x < width)
        {
            total += (f64)(line[x]) * (f64)(line[x]);
            ++x;
        }
    }
    return total;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;

    return 0.;
#endif
}
750
751
/*
 * Sum of squared elements of an s16 region, returned as f64
 * (no square root is taken here).
 * Squares are formed as 32-bit integers, converted to f32 and summed in f32
 * lanes for at most NORM32F_BLOCK_SIZE elements before being added to the
 * f64 result.
 * Row k is obtained through internal::getRowPtr(srcBase, srcStride, k).
 * When CAROTENE_NEON is not defined the arguments are ignored and 0 is returned.
 */
f64 normL2(const Size2D &_size,
           const s16 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D roi(_size);
    // When the stride equals the width, process the region as one long row.
    if ((ptrdiff_t)(roi.width) == srcStride)
    {
        roi.width *= roi.height;
        roi.height = 1;
    }

    const size_t width = roi.width;
    // A 4-element vector load may only start where at least 4 elements remain.
    const size_t vecEnd = width >= 3 ? width - 3 : 0;
    f64 total = 0;
    for (size_t row = 0; row < roi.height; ++row)
    {
        const s16* line = internal::getRowPtr(srcBase, srcStride, row);
        size_t x = 0;

        while (x < vecEnd)
        {
            const size_t last = std::min(width, x + NORM32F_BLOCK_SIZE) - 4;
            const int16x4_t vfirst = vld1_s16(line + x);
            float32x4_t vpart = vcvtq_f32_s32(vmull_s16(vfirst, vfirst));
            for (x += 4; x <= last; x += 4)
            {
                internal::prefetch(line + x);
                const int16x4_t v = vld1_s16(line + x);
                const float32x4_t vsq = vcvtq_f32_s32(vmull_s16(v, v));
                vpart = vaddq_f32(vpart, vsq);
            }

            // Flush the f32 partial sums of this block into the f64 result.
            f32 lanes[4];
            vst1q_f32(lanes, vpart);
            for (u32 j = 0; j < 4; j++)
                total += (f64)(lanes[j]);
        }

        // Leftover elements.
        while (x < width)
        {
            total += (f64)(line[x]) * (f64)(line[x]);
            ++x;
        }
    }
    return total;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;

    return 0.;
#endif
}
798
799
/*
 * Sum of squared elements of an s32 region, returned as f64
 * (no square root is taken here).
 * Elements are converted to f32, squared and summed in f32 lanes for at most
 * NORM32F_BLOCK_SIZE elements before being added to the f64 result.
 * Row k is obtained through internal::getRowPtr(srcBase, srcStride, k).
 * When CAROTENE_NEON is not defined the arguments are ignored and 0 is returned.
 */
f64 normL2(const Size2D &_size,
           const s32 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D roi(_size);
    // When the stride equals the width, process the region as one long row.
    if ((ptrdiff_t)(roi.width) == srcStride)
    {
        roi.width *= roi.height;
        roi.height = 1;
    }

    const size_t width = roi.width;
    // A 4-element vector load may only start where at least 4 elements remain.
    const size_t vecEnd = width >= 3 ? width - 3 : 0;
    f64 total = 0;
    for (size_t row = 0; row < roi.height; ++row)
    {
        const s32* line = internal::getRowPtr(srcBase, srcStride, row);
        size_t x = 0;

        while (x < vecEnd)
        {
            const size_t last = std::min(width, x + NORM32F_BLOCK_SIZE) - 4;
            float32x4_t vpart = vcvtq_f32_s32(vld1q_s32(line + x));
            vpart = vmulq_f32(vpart, vpart);
            for (x += 4; x <= last; x += 4)
            {
                internal::prefetch(line + x);
                const float32x4_t v = vcvtq_f32_s32(vld1q_s32(line + x));
                vpart = vmlaq_f32(vpart, v, v);
            }

            // Flush the f32 partial sums of this block into the f64 result.
            f32 lanes[4];
            vst1q_f32(lanes, vpart);
            for (u32 j = 0; j < 4; j++)
                total += (f64)(lanes[j]);
        }

        // Leftover elements.
        while (x < width)
        {
            total += (f64)(line[x]) * (f64)(line[x]);
            ++x;
        }
    }
    return total;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;

    return 0.;
#endif
}
846
847
/*
 * Sum of squared elements of an f32 region, returned as f64
 * (no square root is taken here).
 * Squares are summed in f32 lanes for at most NORM32F_BLOCK_SIZE elements
 * before being added to the f64 result.
 * Row k is obtained through internal::getRowPtr(srcBase, srcStride, k).
 * When CAROTENE_NEON is not defined the arguments are ignored and 0 is returned.
 */
f64 normL2(const Size2D &_size,
           const f32 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D roi(_size);
    // When the stride equals the width, process the region as one long row.
    if ((ptrdiff_t)(roi.width) == srcStride)
    {
        roi.width *= roi.height;
        roi.height = 1;
    }

    const size_t width = roi.width;
    // A 4-element vector load may only start where at least 4 elements remain.
    const size_t vecEnd = width >= 3 ? width - 3 : 0;
    f64 total = 0;
    for (size_t row = 0; row < roi.height; ++row)
    {
        const f32* line = internal::getRowPtr(srcBase, srcStride, row);
        size_t x = 0;

        while (x < vecEnd)
        {
            const size_t last = std::min(width, x + NORM32F_BLOCK_SIZE) - 4;
            float32x4_t vpart = vld1q_f32(line + x);
            vpart = vmulq_f32(vpart, vpart);
            for (x += 4; x <= last; x += 4)
            {
                internal::prefetch(line + x);
                const float32x4_t v = vld1q_f32(line + x);
                vpart = vmlaq_f32(vpart, v, v);
            }

            // Flush the f32 partial sums of this block into the f64 result.
            f32 lanes[4];
            vst1q_f32(lanes, vpart);
            for (u32 j = 0; j < 4; j++)
                total += (f64)(lanes[j]);
        }

        // Leftover elements.
        while (x < width)
        {
            total += (f64)(line[x]) * (f64)(line[x]);
            ++x;
        }
    }
    return total;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;

    return 0.;
#endif
}
894
895
/*
 * Infinity norm of the difference of two u8 regions: the largest
 * |src0 - src1| over all element positions.
 * Row k of each input is obtained through internal::getRowPtr with that
 * input's base pointer and stride.
 *
 * The vector loop runs while a full 16-element block remains
 * (i <= width - 16), matching the other kernels in this file; a strict
 * '<' bound would push the last full block through the scalar tail.
 *
 * When CAROTENE_NEON is not defined the arguments are ignored and 0 is returned.
 */
s32 diffNormInf(const Size2D &_size,
                const u8 * src0Base, ptrdiff_t src0Stride,
                const u8 * src1Base, ptrdiff_t src1Stride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    // When both strides equal the width, process the region as one long row.
    if (src0Stride == src1Stride &&
        src0Stride == (ptrdiff_t)(size.width))
    {
        size.width *= size.height;
        size.height = 1;
    }
    s32 result = 0;
    for(size_t k = 0; k < size.height; ++k)
    {
        const u8* src1 = internal::getRowPtr( src0Base, src0Stride, k);
        const u8* src2 = internal::getRowPtr( src1Base, src1Stride, k);
        size_t i = 0;

        if (size.width >= 16)
        {
            uint8x16_t vs3 = vdupq_n_u8(0);
            // size.width >= 16 here, so size.width - 16 cannot wrap around.
            for (; i <= size.width - 16; i += 16)
            {
                internal::prefetch(src1 + i);
                internal::prefetch(src2 + i);

                uint8x16_t vs1 = vld1q_u8(src1 + i);
                uint8x16_t vs2 = vld1q_u8(src2 + i);

                vs3 = vmaxq_u8(vs3, vabdq_u8(vs1, vs2));
            }

            // Pairwise max folds 16 lanes down to 8; the rest is reduced in scalar code.
            u8 s2[8];
            vst1_u8(s2, vpmax_u8(vget_low_u8(vs3), vget_high_u8(vs3)));

            for (u32 j = 0; j < 8; j++)
                result = std::max((s32)(s2[j]), result);
        }

        // Leftover elements (or the whole row when it is shorter than 16).
        for (; i < size.width; i++)
        {
            result = std::max(std::abs((s32)(src1[i]) - (s32)(src2[i])), result);
        }
    }
    return result;
#else
    (void)_size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;

    return 0;
#endif
}
952
953
/*
 * Infinity norm of the difference of two f32 regions: the largest
 * |src0 - src1| over all element positions.
 * Row k of each input is obtained through internal::getRowPtr with that
 * input's base pointer and stride.
 * When CAROTENE_NEON is not defined the arguments are ignored and 0 is returned.
 */
f32 diffNormInf(const Size2D &_size,
                const f32 * src0Base, ptrdiff_t src0Stride,
                const f32 * src1Base, ptrdiff_t src1Stride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D roi(_size);
    // When both strides equal the width, process the region as one long row.
    if (src0Stride == (ptrdiff_t)(roi.width) &&
        src1Stride == src0Stride)
    {
        roi.width *= roi.height;
        roi.height = 1;
    }

    const size_t width = roi.width;
    f32 best = 0;
    for (size_t row = 0; row < roi.height; ++row)
    {
        const f32* lineA = internal::getRowPtr(src0Base, src0Stride, row);
        const f32* lineB = internal::getRowPtr(src1Base, src1Stride, row);
        size_t x = 0;

        if (width >= 4)
        {
            // Seed the running maximum with the first 4 absolute differences.
            float32x4_t vbest = vabdq_f32(vld1q_f32(lineA), vld1q_f32(lineB));

            x += 4;
            while (x <= width - 4)
            {
                internal::prefetch(lineA + x);
                internal::prefetch(lineB + x);

                const float32x4_t va = vld1q_f32(lineA + x);
                const float32x4_t vb = vld1q_f32(lineB + x);

                vbest = vmaxq_f32(vbest, vabdq_f32(vb, va));
                x += 4;
            }

            // Reduce the 4 lanes in scalar code.
            f32 lanes[4];
            vst1q_f32(lanes, vbest);

            for (u32 j = 0; j < 4; j++)
            {
                if (lanes[j] > best)
                    best = lanes[j];
            }
        }

        // Leftover elements (or the whole row when it is shorter than 4).
        while (x < width)
        {
            const f32 diff = std::abs(lineA[x] - lineB[x]);
            if (diff > best)
                best = diff;
            ++x;
        }
    }
    return best;
#else
    (void)_size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;

    return 0.;
#endif
}
1015
1016
/*
 * L1 norm of the difference of two u8 regions: the sum of |src0 - src1|
 * over all element positions, saturated at 0x7fFFffFF.
 * Row k of each input is obtained through internal::getRowPtr with that
 * input's base pointer and stride.
 * When CAROTENE_NEON is not defined the arguments are ignored and 0 is returned.
 */
s32 diffNormL1(const Size2D &_size,
               const u8 * src0Base, ptrdiff_t src0Stride,
               const u8 * src1Base, ptrdiff_t src1Stride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D roi(_size);
    // When both strides equal the width, process the region as one long row.
    if (src0Stride == (ptrdiff_t)(roi.width) &&
        src1Stride == src0Stride)
    {
        roi.width *= roi.height;
        roi.height = 1;
    }

    const size_t width = roi.width;
    s32 total = 0;
    for (size_t row = 0; row < roi.height; ++row)
    {
        const u8* lineA = internal::getRowPtr(src0Base, src0Stride, row);
        const u8* lineB = internal::getRowPtr(src1Base, src1Stride, row);
        size_t x = 0;

        if (width >= 16)
        {
            while (x <= width - 16)
            {
                // Up to 512 elements per block are accumulated in 16-bit lanes.
                const size_t last = std::min(width, x + 2*256) - 16;
                uint16x8_t vloSum = vmovq_n_u16(0);
                uint16x8_t vhiSum = vmovq_n_u16(0);

                for (; x <= last; x += 16)
                {
                    internal::prefetch(lineA + x);
                    internal::prefetch(lineB + x);

                    const uint8x16_t va = vld1q_u8(lineA + x);
                    const uint8x16_t vb = vld1q_u8(lineB + x);

                    vloSum = vabal_u8(vloSum, vget_low_u8(va), vget_low_u8(vb));
                    vhiSum = vabal_u8(vhiSum, vget_high_u8(va), vget_high_u8(vb));
                }

                // Widen the block sums to 32 bits and fold them into the result.
                u32 lanes[4];
                vst1q_u32(lanes, vaddq_u32(vpaddlq_u16(vloSum), vpaddlq_u16(vhiSum)));

                for (u32 j = 0; j < 4; j++)
                {
                    // Stop as soon as adding this lane would reach the s32 maximum.
                    if ((s32)(0x7fFFffFFu - lanes[j]) <= total)
                        return 0x7fFFffFF; //result already saturated
                    total = (s32)((u32)(total) + lanes[j]);
                }
            }
        }

        // Leftover elements (or the whole row when it is shorter than 16).
        while (x < width)
        {
            const u32 diff = std::abs((s32)(lineA[x]) - (s32)(lineB[x]));

            if ((s32)(0x7fFFffFFu - diff) <= total)
                return 0x7fFFffFF; //result already saturated
            total = (s32)((u32)(total) + diff);
            ++x;
        }
    }
    return total;
#else
    (void)_size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;

    return 0;
#endif
}
1093
1094
/*
 * L1 norm of the difference of two f32 regions: the sum of |src0 - src1|
 * over all element positions, returned as f64.
 * Partial sums are kept in f32 lanes for at most NORM32F_BLOCK_SIZE elements
 * and then added to the f64 result.
 * Row k of each input is obtained through internal::getRowPtr with that
 * input's base pointer and stride.
 * When CAROTENE_NEON is not defined the arguments are ignored and 0 is returned.
 */
f64 diffNormL1(const Size2D &_size,
               const f32 * src0Base, ptrdiff_t src0Stride,
               const f32 * src1Base, ptrdiff_t src1Stride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D roi(_size);
    // When both strides equal the width, process the region as one long row.
    if (src0Stride == (ptrdiff_t)(roi.width) &&
        src1Stride == src0Stride)
    {
        roi.width *= roi.height;
        roi.height = 1;
    }

    const size_t width = roi.width;
    f64 total = 0;
    for (size_t row = 0; row < roi.height; ++row)
    {
        const f32* lineA = internal::getRowPtr(src0Base, src0Stride, row);
        const f32* lineB = internal::getRowPtr(src1Base, src1Stride, row);
        size_t x = 0;

        if (width >= 4)
        {
            while (x <= width - 4)
            {
                const size_t last = std::min(width, x + NORM32F_BLOCK_SIZE) - 4;
                float32x4_t vpart = vmovq_n_f32(0.0f);

                for (; x <= last; x += 4)
                {
                    internal::prefetch(lineA + x);
                    internal::prefetch(lineB + x);

                    const float32x4_t va = vld1q_f32(lineA + x);
                    const float32x4_t vb = vld1q_f32(lineB + x);

                    vpart = vaddq_f32(vpart, vabdq_f32(vb, va));
                }

                // Flush the f32 partial sums of this block into the f64 result.
                f32 lanes[4];
                vst1q_f32(lanes, vpart);

                for (u32 j = 0; j < 4; j++)
                    total += (f64)(lanes[j]);
            }
        }

        // Leftover elements (or the whole row when it is shorter than 4).
        while (x < width)
        {
            const f32 diff = std::abs(lineA[x] - lineB[x]);
            total += (f64)(diff);
            ++x;
        }
    }
    return total;
#else
    (void)_size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;

    return 0.;
#endif
}
1158
1159
/*
 * Sum of squared differences of two u8 regions, saturated at 0x7fFFffFF
 * (no square root is taken here).
 * Row k of each input is obtained through internal::getRowPtr with that
 * input's base pointer and stride.
 * When CAROTENE_NEON is not defined the arguments are ignored and 0 is returned.
 */
s32 diffNormL2(const Size2D &_size,
               const u8 * src0Base, ptrdiff_t src0Stride,
               const u8 * src1Base, ptrdiff_t src1Stride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D roi(_size);
    // When both strides equal the width, process the region as one long row.
    if (src0Stride == (ptrdiff_t)(roi.width) &&
        src1Stride == src0Stride)
    {
        roi.width *= roi.height;
        roi.height = 1;
    }

    const size_t width = roi.width;
    s32 total = 0;
    for (size_t row = 0; row < roi.height; ++row)
    {
        const u8* lineA = internal::getRowPtr(src0Base, src0Stride, row);
        const u8* lineB = internal::getRowPtr(src1Base, src1Stride, row);
        size_t x = 0;

#define NORML28U_BLOCK_SIZE (33024*2) //bigger block size can result in integer overflow
        if (width >= 16)
        {
            while (x <= width - 16)
            {
                // Squares of one block are accumulated in two 32-bit vectors.
                const size_t last = std::min(width, x + NORML28U_BLOCK_SIZE) - 16;
                uint32x4_t vsumA = vmovq_n_u32(0);
                uint32x4_t vsumB = vmovq_n_u32(0);

                for (; x <= last; x += 16)
                {
                    internal::prefetch(lineA + x);
                    internal::prefetch(lineB + x);

                    const uint8x16_t va = vld1q_u8(lineA + x);
                    const uint8x16_t vb = vld1q_u8(lineB + x);

                    // Absolute differences widened to 16 bits, low and high halves.
                    const uint16x8_t vdLow = vabdl_u8(vget_low_u8(va), vget_low_u8(vb));
                    const uint16x8_t vdHigh = vabdl_u8(vget_high_u8(va), vget_high_u8(vb));

                    vsumA = vmlal_u16(vsumA, vget_low_u16(vdLow), vget_low_u16(vdLow));
                    vsumB = vmlal_u16(vsumB, vget_high_u16(vdLow), vget_high_u16(vdLow));

                    vsumA = vmlal_u16(vsumA, vget_low_u16(vdHigh), vget_low_u16(vdHigh));
                    vsumB = vmlal_u16(vsumB, vget_high_u16(vdHigh), vget_high_u16(vdHigh));
                }

                u32 lanes[4];
                vst1q_u32(lanes, vqaddq_u32(vsumA, vsumB));

                for (u32 j = 0; j < 4; j++)
                {
                    // Stop as soon as adding this lane would reach the s32 maximum.
                    if ((s32)(0x7fFFffFFu - lanes[j]) <= total)
                        return 0x7fFFffFF; //result already saturated
                    total += (s32)lanes[j];
                }
            }
        }

        // Leftover elements (or the whole row when it is shorter than 16).
        while (x < width)
        {
            s32 sq = (s32)(lineA[x]) - (s32)(lineB[x]);
            sq *= sq;

            if ((s32)(0x7fFFffFFu - (u32)(sq)) <= total)
                return 0x7fFFffFF; //result already saturated
            total += sq;
            ++x;
        }
    }
    return total;
#else
    (void)_size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;

    return 0;
#endif
}
1244
1245
/*
 * Sum of squared differences of two f32 regions, returned as f64
 * (no square root is taken here).
 * Partial sums are kept in f32 lanes for at most NORM32F_BLOCK_SIZE elements
 * and then added to the f64 result.
 * Row k of each input is obtained through internal::getRowPtr with that
 * input's base pointer and stride.
 * When CAROTENE_NEON is not defined the arguments are ignored and 0 is returned.
 */
f64 diffNormL2(const Size2D &_size,
               const f32 * src0Base, ptrdiff_t src0Stride,
               const f32 * src1Base, ptrdiff_t src1Stride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D roi(_size);
    // When both strides equal the width, process the region as one long row.
    if (src0Stride == (ptrdiff_t)(roi.width) &&
        src1Stride == src0Stride)
    {
        roi.width *= roi.height;
        roi.height = 1;
    }

    const size_t width = roi.width;
    f64 total = 0;
    for (size_t row = 0; row < roi.height; ++row)
    {
        const f32* lineA = internal::getRowPtr(src0Base, src0Stride, row);
        const f32* lineB = internal::getRowPtr(src1Base, src1Stride, row);
        size_t x = 0;

        if (width >= 4)
        {
            while (x <= width - 4)
            {
                const size_t last = std::min(width, x + NORM32F_BLOCK_SIZE) - 4;
                float32x4_t vpart = vmovq_n_f32(0.0f);

                for (; x <= last; x += 4)
                {
                    internal::prefetch(lineA + x);
                    internal::prefetch(lineB + x);

                    const float32x4_t va = vld1q_f32(lineA + x);
                    const float32x4_t vb = vld1q_f32(lineB + x);

                    const float32x4_t vdiff = vsubq_f32(vb, va);
                    vpart = vmlaq_f32(vpart, vdiff, vdiff);
                }

                // Flush the f32 partial sums of this block into the f64 result.
                f32 lanes[4];
                vst1q_f32(lanes, vpart);

                for (u32 j = 0; j < 4; j++)
                    total += (f64)(lanes[j]);
            }
        }

        // Leftover elements: the square is formed in f32 before being added.
        while (x < width)
        {
            const f32 diff = lineA[x] - lineB[x];
            total += diff * diff;
            ++x;
        }
    }
    return total;
#else
    (void)_size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;

    return 0.;
#endif
}
1309
1310
} // namespace CAROTENE_NS
1311
1312