Path: blob/master/3rdparty/carotene/src/dot_product.cpp
16337 views
/*1* By downloading, copying, installing or using the software you agree to this license.2* If you do not agree to this license, do not download, install,3* copy or use the software.4*5*6* License Agreement7* For Open Source Computer Vision Library8* (3-clause BSD License)9*10* Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.11* Third party copyrights are property of their respective owners.12*13* Redistribution and use in source and binary forms, with or without modification,14* are permitted provided that the following conditions are met:15*16* * Redistributions of source code must retain the above copyright notice,17* this list of conditions and the following disclaimer.18*19* * Redistributions in binary form must reproduce the above copyright notice,20* this list of conditions and the following disclaimer in the documentation21* and/or other materials provided with the distribution.22*23* * Neither the names of the copyright holders nor the names of the contributors24* may be used to endorse or promote products derived from this software25* without specific prior written permission.26*27* This software is provided by the copyright holders and contributors "as is" and28* any express or implied warranties, including, but not limited to, the implied29* warranties of merchantability and fitness for a particular purpose are disclaimed.30* In no event shall copyright holders or contributors be liable for any direct,31* indirect, incidental, special, exemplary, or consequential damages32* (including, but not limited to, procurement of substitute goods or services;33* loss of use, data, or profits; or business interruption) however caused34* and on any theory of liability, whether in contract, strict liability,35* or tort (including negligence or otherwise) arising in any way out of36* the use of this software, even if advised of the possibility of such damage.37*/3839#include "common.hpp"4041namespace CAROTENE_NS {4243f64 dotProduct(const Size2D &_size,44const u8 * src0Base, ptrdiff_t src0Stride,45const u8 * src1Base, ptrdiff_t src1Stride)46{47internal::assertSupportedConfiguration();48#ifdef CAROTENE_NEON49Size2D size(_size);50if (src0Stride == src1Stride &&51src0Stride == (ptrdiff_t)(size.width))52{53size.width *= size.height;54size.height = 1;55}5657// It is possible to accumulate up to 66051 uchar multiplication results in uint32 without overflow58// We process 16 elements and accumulate two new elements per step. So we could handle 66051/2*16 elements59#define DOT_UINT_BLOCKSIZE 66050*860f64 result = 0.0;61for (size_t row = 0; row < size.height; ++row)62{63const u8 * src0 = internal::getRowPtr(src0Base, src0Stride, row);64const u8 * src1 = internal::getRowPtr(src1Base, src1Stride, row);6566size_t i = 0;67uint64x2_t ws = vmovq_n_u64(0);6869while(i + 16 <= size.width)70{71size_t lim = std::min(i + DOT_UINT_BLOCKSIZE, size.width) - 16;7273uint32x4_t s1 = vmovq_n_u32(0);74uint32x4_t s2 = vmovq_n_u32(0);7576for (; i <= lim; i += 16)77{78internal::prefetch(src0 + i);79internal::prefetch(src1 + i);8081uint8x16_t vs1 = vld1q_u8(src0 + i);82uint8x16_t vs2 = vld1q_u8(src1 + i);8384uint16x8_t vdot1 = vmull_u8(vget_low_u8(vs1), vget_low_u8(vs2));85uint16x8_t vdot2 = vmull_u8(vget_high_u8(vs1), vget_high_u8(vs2));8687s1 = vpadalq_u16(s1, vdot1);88s2 = vpadalq_u16(s2, vdot2);89}9091ws = vpadalq_u32(ws, s1);92ws = vpadalq_u32(ws, s2);93}9495if(i + 8 <= size.width)96{97uint8x8_t vs1 = vld1_u8(src0 + i);98uint8x8_t vs2 = vld1_u8(src1 + i);99100ws = vpadalq_u32(ws, vpaddlq_u16(vmull_u8(vs1, vs2)));101i += 8;102}103104result += (double)vget_lane_u64(vadd_u64(vget_low_u64(ws), vget_high_u64(ws)), 0);105106for (; i < size.width; ++i)107result += s32(src0[i]) * s32(src1[i]);108}109return result;110#else111(void)_size;112(void)src0Base;113(void)src0Stride;114(void)src1Base;115(void)src1Stride;116117return 0;118#endif119}120121f64 dotProduct(const Size2D &_size,122const s8 * src0Base, ptrdiff_t src0Stride,123const s8 * src1Base, ptrdiff_t src1Stride)124{125internal::assertSupportedConfiguration();126#ifdef CAROTENE_NEON127Size2D size(_size);128if (src0Stride == src1Stride &&129src0Stride == (ptrdiff_t)(size.width))130{131size.width *= size.height;132size.height = 1;133}134135// It is possible to accumulate up to 131071 schar multiplication results in sint32 without overflow136// We process 16 elements and accumulate two new elements per step. So we could handle 131071/2*16 elements137#define DOT_INT_BLOCKSIZE 131070*8138f64 result = 0.0;139for (size_t row = 0; row < size.height; ++row)140{141const s8 * src0 = internal::getRowPtr(src0Base, src0Stride, row);142const s8 * src1 = internal::getRowPtr(src1Base, src1Stride, row);143144size_t i = 0;145int64x2_t ws = vmovq_n_s64(0);146147while(i + 16 <= size.width)148{149size_t lim = std::min(i + DOT_UINT_BLOCKSIZE, size.width) - 16;150151int32x4_t s1 = vmovq_n_s32(0);152int32x4_t s2 = vmovq_n_s32(0);153154for (; i <= lim; i += 16)155{156internal::prefetch(src0 + i);157internal::prefetch(src1 + i);158159int8x16_t vs1 = vld1q_s8(src0 + i);160int8x16_t vs2 = vld1q_s8(src1 + i);161162int16x8_t vdot1 = vmull_s8(vget_low_s8(vs1), vget_low_s8(vs2));163int16x8_t vdot2 = vmull_s8(vget_high_s8(vs1), vget_high_s8(vs2));164165s1 = vpadalq_s16(s1, vdot1);166s2 = vpadalq_s16(s2, vdot2);167}168169ws = vpadalq_s32(ws, s1);170ws = vpadalq_s32(ws, s2);171}172173if(i + 8 <= size.width)174{175int8x8_t vs1 = vld1_s8(src0 + i);176int8x8_t vs2 = vld1_s8(src1 + i);177178ws = vpadalq_s32(ws, vpaddlq_s16(vmull_s8(vs1, vs2)));179i += 8;180}181182result += (double)vget_lane_s64(vadd_s64(vget_low_s64(ws), vget_high_s64(ws)), 0);183184for (; i < size.width; ++i)185result += s32(src0[i]) * s32(src1[i]);186}187return result;188#else189(void)_size;190(void)src0Base;191(void)src0Stride;192(void)src1Base;193(void)src1Stride;194195return 0;196#endif197}198199f64 dotProduct(const Size2D &_size,200const f32 * src0Base, ptrdiff_t src0Stride,201const f32 * src1Base, ptrdiff_t src1Stride)202{203internal::assertSupportedConfiguration();204#ifdef CAROTENE_NEON205Size2D size(_size);206if (src0Stride == src1Stride &&207src0Stride == (ptrdiff_t)(size.width * sizeof(f32)))208{209size.width *= size.height;210size.height = 1;211}212213#define DOT_FLOAT_BLOCKSIZE (1 << 13)214f64 result = 0.0;215for (size_t row = 0; row < size.height; ++row)216{217const f32 * src0 = internal::getRowPtr(src0Base, src0Stride, row);218const f32 * src1 = internal::getRowPtr(src1Base, src1Stride, row);219220size_t i = 0;221while(i + 4 <= size.width)222{223size_t lim = std::min(i + DOT_FLOAT_BLOCKSIZE, size.width) - 4;224float32x4_t v_sum = vdupq_n_f32(0.0f);225226for( ; i <= lim; i += 4 )227{228internal::prefetch(src0 + i);229internal::prefetch(src1 + i);230v_sum = vmlaq_f32(v_sum, vld1q_f32(src0 + i), vld1q_f32(src1 + i));231}232233float32x2_t vres = vpadd_f32(vget_low_f32(v_sum),vget_high_f32(v_sum));234result += vget_lane_f32(vres, 0) + vget_lane_f32(vres, 1);235}236237if(i + 2 <= size.width)238{239float32x2_t vres = vmul_f32(vld1_f32(src0 + i), vld1_f32(src1 + i));240result += vget_lane_f32(vres, 0) + vget_lane_f32(vres, 1);241i += 2;242}243244for (; i < size.width; ++i)245result += src0[i] * src1[i];246}247return result;248#else249(void)_size;250(void)src0Base;251(void)src0Stride;252(void)src1Base;253(void)src1Stride;254255return 0;256#endif257}258259} // namespace CAROTENE_NS260261262