// Path: blob/master/thirdparty/libwebp/src/dsp/enc_neon.c
// (9913 views)
// Copyright 2012 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// ARM NEON version of speed-critical encoding functions.
//
// adapted from libvpx (https://www.webmproject.org/code/)

#include "src/dsp/dsp.h"

#if defined(WEBP_USE_NEON)

#include <assert.h>

#include "src/dsp/neon.h"
#include "src/enc/vp8i_enc.h"

//------------------------------------------------------------------------------
// Transforms (Paragraph 14.4)

// Inverse transform.
// This code is pretty much the same as TransformOne in the dec_neon.c, except
// for subtraction to *ref. See the comments there for algorithmic explanations.

static const int16_t kC1 = WEBP_TRANSFORM_AC3_C1;
static const int16_t kC2 =
    WEBP_TRANSFORM_AC3_C2 / 2;  // half of kC2, actually. See comment above.

// This code works but is *slower* than the inlined-asm version below
// (with gcc-4.6). So we disable it for now. Later, it'll be conditional to
// WEBP_USE_INTRINSICS define.
// With gcc-4.8, it's a little faster speed than inlined-assembly.
#if defined(WEBP_USE_INTRINSICS)

// Treats 'v' as an uint8x8_t and zero extends to an int16x8_t.
static WEBP_INLINE int16x8_t ConvertU8ToS16_NEON(uint32x2_t v) {
  return vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(v)));
}

// Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result
// to the corresponding rows of 'dst'.
static WEBP_INLINE void SaturateAndStore4x4_NEON(uint8_t* const dst,
                                                 const int16x8_t dst01,
                                                 const int16x8_t dst23) {
  // Unsigned saturate to 8b.
  const uint8x8_t dst01_u8 = vqmovun_s16(dst01);
  const uint8x8_t dst23_u8 = vqmovun_s16(dst23);

  // Store the results.
  vst1_lane_u32((uint32_t*)(dst + 0 * BPS), vreinterpret_u32_u8(dst01_u8), 0);
  vst1_lane_u32((uint32_t*)(dst + 1 * BPS), vreinterpret_u32_u8(dst01_u8), 1);
  vst1_lane_u32((uint32_t*)(dst + 2 * BPS), vreinterpret_u32_u8(dst23_u8), 0);
  vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1);
}

// Adds the rounded, descaled residual rows 'row01'/'row23' to the 4x4
// reference block at 'ref' and stores the saturated result at 'dst'.
static WEBP_INLINE void Add4x4_NEON(const int16x8_t row01,
                                    const int16x8_t row23,
                                    const uint8_t* WEBP_RESTRICT const ref,
                                    uint8_t* WEBP_RESTRICT const dst) {
  uint32x2_t dst01 = vdup_n_u32(0);
  uint32x2_t dst23 = vdup_n_u32(0);

  // Load the source pixels.
  dst01 = vld1_lane_u32((uint32_t*)(ref + 0 * BPS), dst01, 0);
  dst23 = vld1_lane_u32((uint32_t*)(ref + 2 * BPS), dst23, 0);
  dst01 = vld1_lane_u32((uint32_t*)(ref + 1 * BPS), dst01, 1);
  dst23 = vld1_lane_u32((uint32_t*)(ref + 3 * BPS), dst23, 1);

  {
    // Convert to 16b.
    const int16x8_t dst01_s16 = ConvertU8ToS16_NEON(dst01);
    const int16x8_t dst23_s16 = ConvertU8ToS16_NEON(dst23);

    // Descale with rounding.
    const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);
    const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3);
    // Add the inverse transform.
    SaturateAndStore4x4_NEON(dst, out01, out23);
  }
}

static WEBP_INLINE void Transpose8x2_NEON(const int16x8_t in0,
                                          const int16x8_t in1,
                                          int16x8x2_t* const out) {
  // a0 a1 a2 a3 | b0 b1 b2 b3  =>  a0 b0 c0 d0 | a1 b1 c1 d1
  // c0 c1 c2 c3 | d0 d1 d2 d3      a2 b2 c2 d2 | a3 b3 c3 d3
  const int16x8x2_t tmp0 = vzipq_s16(in0, in1);   // a0 c0 a1 c1 a2 c2 ...
                                                  // b0 d0 b1 d1 b2 d2 ...
  *out = vzipq_s16(tmp0.val[0], tmp0.val[1]);
}

// One 1-D pass of the inverse transform over both rows held in 'rows',
// leaving the transposed result in place (two passes = full 2-D transform).
static WEBP_INLINE void TransformPass_NEON(int16x8x2_t* const rows) {
  // {rows} = in0 | in4
  //          in8 | in12
  // B1 = in4 | in12
  const int16x8_t B1 =
      vcombine_s16(vget_high_s16(rows->val[0]), vget_high_s16(rows->val[1]));
  // C0 = kC1 * in4 | kC1 * in12
  // C1 = kC2 * in4 | kC2 * in12
  const int16x8_t C0 = vsraq_n_s16(B1, vqdmulhq_n_s16(B1, kC1), 1);
  const int16x8_t C1 = vqdmulhq_n_s16(B1, kC2);
  const int16x4_t a = vqadd_s16(vget_low_s16(rows->val[0]),
                                vget_low_s16(rows->val[1]));   // in0 + in8
  const int16x4_t b = vqsub_s16(vget_low_s16(rows->val[0]),
                                vget_low_s16(rows->val[1]));   // in0 - in8
  // c = kC2 * in4 - kC1 * in12
  // d = kC1 * in4 + kC2 * in12
  const int16x4_t c = vqsub_s16(vget_low_s16(C1), vget_high_s16(C0));
  const int16x4_t d = vqadd_s16(vget_low_s16(C0), vget_high_s16(C1));
  const int16x8_t D0 = vcombine_s16(a, b);      // D0 = a | b
  const int16x8_t D1 = vcombine_s16(d, c);      // D1 = d | c
  const int16x8_t E0 = vqaddq_s16(D0, D1);      // a+d | b+c
  const int16x8_t E_tmp = vqsubq_s16(D0, D1);   // a-d | b-c
  const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp));
  Transpose8x2_NEON(E0, E1, rows);
}

static void ITransformOne_NEON(const uint8_t* WEBP_RESTRICT ref,
                               const int16_t* WEBP_RESTRICT in,
                               uint8_t* WEBP_RESTRICT dst) {
  int16x8x2_t rows;
  INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
  TransformPass_NEON(&rows);
  TransformPass_NEON(&rows);
  Add4x4_NEON(rows.val[0], rows.val[1], ref, dst);
}

#else

static void ITransformOne_NEON(const uint8_t* WEBP_RESTRICT ref,
                               const int16_t* WEBP_RESTRICT in,
                               uint8_t* WEBP_RESTRICT dst) {
  const int kBPS = BPS;
  const int16_t kC1C2[] = { kC1, kC2, 0, 0 };

  __asm__ volatile (
    "vld1.16         {q1, q2}, [%[in]]           \n"
    "vld1.16         {d0}, [%[kC1C2]]            \n"

    // d2: in[0]
    // d3: in[8]
    // d4: in[4]
    // d5: in[12]
    "vswp            d3, d4                      \n"

    // q8 = {in[4], in[12]} * kC1 * 2 >> 16
    // q9 = {in[4], in[12]} * kC2 >> 16
    "vqdmulh.s16     q8, q2, d0[0]               \n"
    "vqdmulh.s16     q9, q2, d0[1]               \n"

    // d22 = a = in[0] + in[8]
    // d23 = b = in[0] - in[8]
    "vqadd.s16       d22, d2, d3                 \n"
    "vqsub.s16       d23, d2, d3                 \n"

    // q8 = in[4]/[12] * kC1 >> 16
    "vshr.s16        q8, q8, #1                  \n"

    // Add {in[4], in[12]} back after the multiplication.
    "vqadd.s16       q8, q2, q8                  \n"

    // d20 = c = in[4]*kC2 - in[12]*kC1
    // d21 = d = in[4]*kC1 + in[12]*kC2
    "vqsub.s16       d20, d18, d17               \n"
    "vqadd.s16       d21, d19, d16               \n"

    // d2 = tmp[0] = a + d
    // d3 = tmp[1] = b + c
    // d4 = tmp[2] = b - c
    // d5 = tmp[3] = a - d
    "vqadd.s16       d2, d22, d21                \n"
    "vqadd.s16       d3, d23, d20                \n"
    "vqsub.s16       d4, d23, d20                \n"
    "vqsub.s16       d5, d22, d21                \n"

    "vzip.16         q1, q2                      \n"
    "vzip.16         q1, q2                      \n"

    "vswp            d3, d4                      \n"

    // q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16
    // q9 = {tmp[4], tmp[12]} * kC2 >> 16
    "vqdmulh.s16     q8, q2, d0[0]               \n"
    "vqdmulh.s16     q9, q2, d0[1]               \n"

    // d22 = a = tmp[0] + tmp[8]
    // d23 = b = tmp[0] - tmp[8]
    "vqadd.s16       d22, d2, d3                 \n"
    "vqsub.s16       d23, d2, d3                 \n"

    "vshr.s16        q8, q8, #1                  \n"
    "vqadd.s16       q8, q2, q8                  \n"

    // d20 = c = in[4]*kC2 - in[12]*kC1
    // d21 = d = in[4]*kC1 + in[12]*kC2
    "vqsub.s16       d20, d18, d17               \n"
    "vqadd.s16       d21, d19, d16               \n"

    // d2 = tmp[0] = a + d
    // d3 = tmp[1] = b + c
    // d4 = tmp[2] = b - c
    // d5 = tmp[3] = a - d
    "vqadd.s16       d2, d22, d21                \n"
    "vqadd.s16       d3, d23, d20                \n"
    "vqsub.s16       d4, d23, d20                \n"
    "vqsub.s16       d5, d22, d21                \n"

    "vld1.32         d6[0], [%[ref]], %[kBPS]    \n"
    "vld1.32         d6[1], [%[ref]], %[kBPS]    \n"
    "vld1.32         d7[0], [%[ref]], %[kBPS]    \n"
    "vld1.32         d7[1], [%[ref]], %[kBPS]    \n"

    "sub         %[ref], %[ref], %[kBPS], lsl #2 \n"

    // (val) + 4 >> 3
    "vrshr.s16       d2, d2, #3                  \n"
    "vrshr.s16       d3, d3, #3                  \n"
    "vrshr.s16       d4, d4, #3                  \n"
    "vrshr.s16       d5, d5, #3                  \n"

    "vzip.16         q1, q2                      \n"
    "vzip.16         q1, q2                      \n"

    // Must accumulate before saturating
    "vmovl.u8        q8, d6                      \n"
    "vmovl.u8        q9, d7                      \n"

    "vqadd.s16       q1, q1, q8                  \n"
    "vqadd.s16       q2, q2, q9                  \n"

    "vqmovun.s16     d0, q1                      \n"
    "vqmovun.s16     d1, q2                      \n"

    "vst1.32         d0[0], [%[dst]], %[kBPS]    \n"
    "vst1.32         d0[1], [%[dst]], %[kBPS]    \n"
    "vst1.32         d1[0], [%[dst]], %[kBPS]    \n"
    "vst1.32         d1[1], [%[dst]]             \n"

    : [in] "+r"(in), [dst] "+r"(dst)  // modified registers
    : [kBPS] "r"(kBPS), [kC1C2] "r"(kC1C2), [ref] "r"(ref)  // constants
    : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11"  // clobbered
  );
}

#endif    // WEBP_USE_INTRINSICS

// Inverse transform: dst = ref + idct(in) for one 4x4 block, or two
// side-by-side blocks when 'do_two' is set.
static void ITransform_NEON(const uint8_t* WEBP_RESTRICT ref,
                            const int16_t* WEBP_RESTRICT in,
                            uint8_t* WEBP_RESTRICT dst, int do_two) {
  ITransformOne_NEON(ref, in, dst);
  if (do_two) {
    ITransformOne_NEON(ref + 4, in + 16, dst + 4);
  }
}

// Load all 4x4 pixels into a single uint8x16_t variable.
static uint8x16_t Load4x4_NEON(const uint8_t* src) {
  uint32x4_t out = vdupq_n_u32(0);
  out = vld1q_lane_u32((const uint32_t*)(src + 0 * BPS), out, 0);
  out = vld1q_lane_u32((const uint32_t*)(src + 1 * BPS), out, 1);
  out = vld1q_lane_u32((const uint32_t*)(src + 2 * BPS), out, 2);
  out = vld1q_lane_u32((const uint32_t*)(src + 3 * BPS), out, 3);
  return vreinterpretq_u8_u32(out);
}

// Forward transform.

#if defined(WEBP_USE_INTRINSICS)

static WEBP_INLINE void Transpose4x4_S16_NEON(const int16x4_t A,
                                              const int16x4_t B,
                                              const int16x4_t C,
                                              const int16x4_t D,
                                              int16x8_t* const out01,
                                              int16x8_t* const out32) {
  const int16x4x2_t AB = vtrn_s16(A, B);
  const int16x4x2_t CD = vtrn_s16(C, D);
  const int32x2x2_t tmp02 = vtrn_s32(vreinterpret_s32_s16(AB.val[0]),
                                     vreinterpret_s32_s16(CD.val[0]));
  const int32x2x2_t tmp13 =
vtrn_s32(vreinterpret_s32_s16(AB.val[1]),
                                     vreinterpret_s32_s16(CD.val[1]));
  *out01 = vreinterpretq_s16_s64(
      vcombine_s64(vreinterpret_s64_s32(tmp02.val[0]),
                   vreinterpret_s64_s32(tmp13.val[0])));
  *out32 = vreinterpretq_s16_s64(
      vcombine_s64(vreinterpret_s64_s32(tmp13.val[1]),
                   vreinterpret_s64_s32(tmp02.val[1])));
}

// Widening difference: (a - b) per byte lane, as signed 16-bit.
static WEBP_INLINE int16x8_t DiffU8ToS16_NEON(const uint8x8_t a,
                                              const uint8x8_t b) {
  return vreinterpretq_s16_u16(vsubl_u8(a, b));
}

// Forward DCT of the 4x4 block of (src - ref) differences, output to 'out'.
static void FTransform_NEON(const uint8_t* WEBP_RESTRICT src,
                            const uint8_t* WEBP_RESTRICT ref,
                            int16_t* WEBP_RESTRICT out) {
  int16x8_t d0d1, d3d2;   // working 4x4 int16 variables
  {
    const uint8x16_t S0 = Load4x4_NEON(src);
    const uint8x16_t R0 = Load4x4_NEON(ref);
    const int16x8_t D0D1 = DiffU8ToS16_NEON(vget_low_u8(S0), vget_low_u8(R0));
    const int16x8_t D2D3 = DiffU8ToS16_NEON(vget_high_u8(S0), vget_high_u8(R0));
    const int16x4_t D0 = vget_low_s16(D0D1);
    const int16x4_t D1 = vget_high_s16(D0D1);
    const int16x4_t D2 = vget_low_s16(D2D3);
    const int16x4_t D3 = vget_high_s16(D2D3);
    Transpose4x4_S16_NEON(D0, D1, D2, D3, &d0d1, &d3d2);
  }
  {    // 1rst pass
    const int32x4_t kCst937 = vdupq_n_s32(937);
    const int32x4_t kCst1812 = vdupq_n_s32(1812);
    const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2);   // d0+d3 | d1+d2   (=a0|a1)
    const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2);   // d0-d3 | d1-d2   (=a3|a2)
    const int16x8_t a0a1_2 = vshlq_n_s16(a0a1, 3);
    const int16x4_t tmp0 = vadd_s16(vget_low_s16(a0a1_2),
                                    vget_high_s16(a0a1_2));
    const int16x4_t tmp2 = vsub_s16(vget_low_s16(a0a1_2),
                                    vget_high_s16(a0a1_2));
    const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217);
    const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217);
    const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352);
    const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
    const int16x4_t tmp1 = vshrn_n_s32(vaddq_s32(a2_p_a3, kCst1812), 9);
    const int16x4_t tmp3 = vshrn_n_s32(vaddq_s32(a3_m_a2, kCst937), 9);
    Transpose4x4_S16_NEON(tmp0, tmp1, tmp2, tmp3, &d0d1, &d3d2);
  }
  {    // 2nd pass
    // the (1<<16) addition is for the replacement: a3!=0  <->  1-(a3==0)
    const int32x4_t kCst12000 = vdupq_n_s32(12000 + (1 << 16));
    const int32x4_t kCst51000 = vdupq_n_s32(51000);
    const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2);   // d0+d3 | d1+d2   (=a0|a1)
    const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2);   // d0-d3 | d1-d2   (=a3|a2)
    const int16x4_t a0_k7 = vadd_s16(vget_low_s16(a0a1), vdup_n_s16(7));
    const int16x4_t out0 = vshr_n_s16(vadd_s16(a0_k7, vget_high_s16(a0a1)), 4);
    const int16x4_t out2 = vshr_n_s16(vsub_s16(a0_k7, vget_high_s16(a0a1)), 4);
    const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217);
    const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217);
    const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352);
    const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
    const int16x4_t tmp1 = vaddhn_s32(a2_p_a3, kCst12000);
    const int16x4_t out3 = vaddhn_s32(a3_m_a2, kCst51000);
    const int16x4_t a3_eq_0 =
        vreinterpret_s16_u16(vceq_s16(vget_low_s16(a3a2), vdup_n_s16(0)));
    const int16x4_t out1 = vadd_s16(tmp1, a3_eq_0);
    vst1_s16(out +  0, out0);
    vst1_s16(out +  4, out1);
    vst1_s16(out +  8, out2);
    vst1_s16(out + 12, out3);
  }
}

#else

// adapted from vp8/encoder/arm/neon/shortfdct_neon.asm
static const int16_t kCoeff16[] = {
  5352, 5352, 5352, 5352, 2217, 2217, 2217, 2217
};
static const int32_t kCoeff32[] = {
   1812,  1812,  1812,  1812,
    937,   937,   937,   937,
  12000, 12000, 12000, 12000,
  51000, 51000, 51000, 51000
};

static void FTransform_NEON(const uint8_t* WEBP_RESTRICT src,
                            const uint8_t* WEBP_RESTRICT ref,
                            int16_t* WEBP_RESTRICT out) {
  const int kBPS = BPS;
  const uint8_t* src_ptr = src;
  const uint8_t* ref_ptr = ref;
  const int16_t* coeff16 = kCoeff16;
  const int32_t* coeff32 = kCoeff32;

  __asm__ volatile (
    // load src into q4, q5 in high half
    "vld1.8 {d8},  [%[src_ptr]], %[kBPS]      \n"
    "vld1.8 {d10}, [%[src_ptr]], %[kBPS]      \n"
    "vld1.8 {d9},  [%[src_ptr]], %[kBPS]      \n"
    "vld1.8 {d11}, [%[src_ptr]]               \n"

    // load ref into q6, q7 in high half
    "vld1.8 {d12}, [%[ref_ptr]], %[kBPS]      \n"
    "vld1.8 {d14}, [%[ref_ptr]], %[kBPS]      \n"
    "vld1.8 {d13}, [%[ref_ptr]], %[kBPS]      \n"
    "vld1.8 {d15}, [%[ref_ptr]]               \n"

    // Pack the high values in to q4 and q6
    "vtrn.32     q4, q5                       \n"
    "vtrn.32     q6, q7                       \n"

    // d[0-3] = src - ref
    "vsubl.u8    q0, d8, d12                  \n"
    "vsubl.u8    q1, d9, d13                  \n"

    // load coeff16 into q8(d16=5352, d17=2217)
    "vld1.16     {q8}, [%[coeff16]]           \n"

    // load coeff32 high half into q9 = 1812, q10 = 937
    "vld1.32     {q9, q10}, [%[coeff32]]!     \n"

    // load coeff32 low half into q11=12000, q12=51000
    "vld1.32     {q11,q12}, [%[coeff32]]      \n"

    // part 1
    // Transpose. Register dN is the same as dN in C
    "vtrn.32         d0, d2                   \n"
    "vtrn.32         d1, d3                   \n"
    "vtrn.16         d0, d1                   \n"
    "vtrn.16         d2, d3                   \n"

    "vadd.s16        d4, d0, d3               \n" // a0 = d0 + d3
    "vadd.s16        d5, d1, d2               \n" // a1 = d1 + d2
    "vsub.s16        d6, d1, d2               \n" // a2 = d1 - d2
    "vsub.s16        d7, d0, d3               \n" // a3 = d0 - d3

    "vadd.s16        d0, d4, d5               \n" // a0 + a1
    "vshl.s16        d0, d0, #3               \n" // temp[0+i*4] = (a0+a1) << 3
    "vsub.s16        d2, d4, d5               \n" // a0 - a1
    "vshl.s16        d2, d2, #3               \n" // (temp[2+i*4] = (a0-a1) << 3

    "vmlal.s16       q9, d7, d16              \n" // a3*5352 + 1812
    "vmlal.s16       q10, d7, d17             \n" // a3*2217 + 937
    "vmlal.s16       q9, d6, d17              \n" // a2*2217 + a3*5352 + 1812
    "vmlsl.s16       q10, d6, d16             \n" // a3*2217 + 937 - a2*5352

    // temp[1+i*4] = (d2*2217 + d3*5352 + 1812) >> 9
    // temp[3+i*4] = (d3*2217 +  937 - d2*5352) >> 9
    "vshrn.s32       d1, q9, #9               \n"
    "vshrn.s32       d3, q10, #9              \n"

    // part 2
    // transpose d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
    "vtrn.32         d0, d2                   \n"
    "vtrn.32         d1, d3                   \n"
    "vtrn.16         d0, d1                   \n"
    "vtrn.16         d2, d3                   \n"

    "vmov.s16        d26, #7                  \n"

    "vadd.s16        d4, d0, d3               \n" // a1 = ip[0] + ip[12]
    "vadd.s16        d5, d1, d2               \n" // b1 = ip[4] + ip[8]
    "vsub.s16        d6, d1, d2               \n" // c1 = ip[4] - ip[8]
    "vadd.s16        d4, d4, d26              \n" // a1 + 7
    "vsub.s16        d7, d0, d3               \n" // d1 = ip[0] - ip[12]

    "vadd.s16        d0, d4, d5               \n" // op[0] = a1 + b1 + 7
    "vsub.s16        d2, d4, d5               \n" // op[8] = a1 - b1 + 7

    "vmlal.s16       q11, d7, d16             \n" // d1*5352 + 12000
    "vmlal.s16       q12, d7, d17             \n" // d1*2217 + 51000

    "vceq.s16        d4, d7, #0               \n"

    "vshr.s16        d0, d0, #4               \n"
    "vshr.s16        d2, d2, #4               \n"

    "vmlal.s16       q11, d6, d17             \n" // c1*2217 + d1*5352 + 12000
    "vmlsl.s16       q12, d6, d16             \n" // d1*2217 - c1*5352 + 51000

    "vmvn            d4, d4                   \n" // !(d1 == 0)
    // op[4] = (c1*2217 + d1*5352 + 12000)>>16
    "vshrn.s32       d1, q11, #16             \n"
    // op[4] += (d1!=0)
    "vsub.s16        d1, d1, d4               \n"
    // op[12]= (d1*2217 - c1*5352 + 51000)>>16
    "vshrn.s32       d3, q12, #16             \n"

    // set result to out array
    "vst1.16         {q0, q1}, [%[out]]       \n"
    : [src_ptr] "+r"(src_ptr), [ref_ptr] "+r"(ref_ptr),
      [coeff32] "+r"(coeff32)          // modified registers
    : [kBPS] "r"(kBPS), [coeff16] "r"(coeff16),
      [out] "r"(out)                   // constants
    : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
      "q10", "q11", "q12", "q13"       // clobbered
  );
}

#endif

// Loads one int16 lane from 'src' then advances 'src' by 'stride' elements.
#define LOAD_LANE_16b(VALUE, LANE) do {            \
  (VALUE) = vld1_lane_s16(src, (VALUE), (LANE));   \
  src += stride;                                   \
} while (0)

// Walsh-Hadamard transform of the 16 DC values sampled from 'src' with a
// stride of 16, result written to 'out'.
static void FTransformWHT_NEON(const int16_t* WEBP_RESTRICT src,
                               int16_t* WEBP_RESTRICT out) {
  const int stride = 16;
  const int16x4_t zero = vdup_n_s16(0);
  int32x4x4_t tmp0;
  int16x4x4_t in;
  INIT_VECTOR4(in, zero, zero, zero, zero);
  LOAD_LANE_16b(in.val[0], 0);
  LOAD_LANE_16b(in.val[1], 0);
  LOAD_LANE_16b(in.val[2], 0);
  LOAD_LANE_16b(in.val[3], 0);
  LOAD_LANE_16b(in.val[0], 1);
  LOAD_LANE_16b(in.val[1], 1);
  LOAD_LANE_16b(in.val[2], 1);
  LOAD_LANE_16b(in.val[3], 1);
  LOAD_LANE_16b(in.val[0], 2);
  LOAD_LANE_16b(in.val[1], 2);
  LOAD_LANE_16b(in.val[2], 2);
  LOAD_LANE_16b(in.val[3], 2);
  LOAD_LANE_16b(in.val[0],
3);
  LOAD_LANE_16b(in.val[1], 3);
  LOAD_LANE_16b(in.val[2], 3);
  LOAD_LANE_16b(in.val[3], 3);

  {
    // a0 = in[0 * 16] + in[2 * 16]
    // a1 = in[1 * 16] + in[3 * 16]
    // a2 = in[1 * 16] - in[3 * 16]
    // a3 = in[0 * 16] - in[2 * 16]
    const int32x4_t a0 = vaddl_s16(in.val[0], in.val[2]);
    const int32x4_t a1 = vaddl_s16(in.val[1], in.val[3]);
    const int32x4_t a2 = vsubl_s16(in.val[1], in.val[3]);
    const int32x4_t a3 = vsubl_s16(in.val[0], in.val[2]);
    tmp0.val[0] = vaddq_s32(a0, a1);
    tmp0.val[1] = vaddq_s32(a3, a2);
    tmp0.val[2] = vsubq_s32(a3, a2);
    tmp0.val[3] = vsubq_s32(a0, a1);
  }
  {
    const int32x4x4_t tmp1 = Transpose4x4_NEON(tmp0);
    // a0 = tmp[0 + i] + tmp[ 8 + i]
    // a1 = tmp[4 + i] + tmp[12 + i]
    // a2 = tmp[4 + i] - tmp[12 + i]
    // a3 = tmp[0 + i] - tmp[ 8 + i]
    const int32x4_t a0 = vaddq_s32(tmp1.val[0], tmp1.val[2]);
    const int32x4_t a1 = vaddq_s32(tmp1.val[1], tmp1.val[3]);
    const int32x4_t a2 = vsubq_s32(tmp1.val[1], tmp1.val[3]);
    const int32x4_t a3 = vsubq_s32(tmp1.val[0], tmp1.val[2]);
    const int32x4_t b0 = vhaddq_s32(a0, a1);   // (a0 + a1) >> 1
    const int32x4_t b1 = vhaddq_s32(a3, a2);   // (a3 + a2) >> 1
    const int32x4_t b2 = vhsubq_s32(a3, a2);   // (a3 - a2) >> 1
    const int32x4_t b3 = vhsubq_s32(a0, a1);   // (a0 - a1) >> 1
    const int16x4_t out0 = vmovn_s32(b0);
    const int16x4_t out1 = vmovn_s32(b1);
    const int16x4_t out2 = vmovn_s32(b2);
    const int16x4_t out3 = vmovn_s32(b3);

    vst1_s16(out +  0, out0);
    vst1_s16(out +  4, out1);
    vst1_s16(out +  8, out2);
    vst1_s16(out + 12, out3);
  }
}
#undef LOAD_LANE_16b

//------------------------------------------------------------------------------
// Texture distortion
//
// We try to match the spectral content (weighted) between source and
// reconstructed samples.

// a 0123, b 0123
// a 4567, b 4567
// a 89ab, b 89ab
// a cdef, b cdef
//
// transpose
//
// a 048c, b 048c
// a 159d, b 159d
// a 26ae, b 26ae
// a 37bf, b 37bf
//
static WEBP_INLINE int16x8x4_t DistoTranspose4x4S16_NEON(int16x8x4_t q4_in) {
  const int16x8x2_t q2_tmp0 = vtrnq_s16(q4_in.val[0], q4_in.val[1]);
  const int16x8x2_t q2_tmp1 = vtrnq_s16(q4_in.val[2], q4_in.val[3]);
  const int32x4x2_t q2_tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[0]),
                                        vreinterpretq_s32_s16(q2_tmp1.val[0]));
  const int32x4x2_t q2_tmp3 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[1]),
                                        vreinterpretq_s32_s16(q2_tmp1.val[1]));
  q4_in.val[0] = vreinterpretq_s16_s32(q2_tmp2.val[0]);
  q4_in.val[2] = vreinterpretq_s16_s32(q2_tmp2.val[1]);
  q4_in.val[1] = vreinterpretq_s16_s32(q2_tmp3.val[0]);
  q4_in.val[3] = vreinterpretq_s16_s32(q2_tmp3.val[1]);
  return q4_in;
}

static WEBP_INLINE int16x8x4_t DistoHorizontalPass_NEON(
    const int16x8x4_t q4_in) {
  // {a0, a1} = {in[0] + in[2], in[1] + in[3]}
  // {a3, a2} = {in[0] - in[2], in[1] - in[3]}
  const int16x8_t q_a0 = vaddq_s16(q4_in.val[0], q4_in.val[2]);
  const int16x8_t q_a1 = vaddq_s16(q4_in.val[1], q4_in.val[3]);
  const int16x8_t q_a3 = vsubq_s16(q4_in.val[0], q4_in.val[2]);
  const int16x8_t q_a2 = vsubq_s16(q4_in.val[1], q4_in.val[3]);
  int16x8x4_t q4_out;
  // tmp[0] = a0 + a1
  // tmp[1] = a3 + a2
  // tmp[2] = a3 - a2
  // tmp[3] = a0 - a1
  INIT_VECTOR4(q4_out,
               vabsq_s16(vaddq_s16(q_a0, q_a1)),
               vabsq_s16(vaddq_s16(q_a3, q_a2)),
               vabdq_s16(q_a3, q_a2), vabdq_s16(q_a0, q_a1));
  return q4_out;
}

static WEBP_INLINE int16x8x4_t DistoVerticalPass_NEON(const uint8x8x4_t q4_in) {
  const int16x8_t q_a0 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[0],
                                                        q4_in.val[2]));
  const int16x8_t q_a1 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[1],
                                                        q4_in.val[3]));
  const int16x8_t q_a2 = vreinterpretq_s16_u16(vsubl_u8(q4_in.val[1],
                                                        q4_in.val[3]));
  const int16x8_t q_a3 = vreinterpretq_s16_u16(vsubl_u8(q4_in.val[0],
                                                        q4_in.val[2]));
  int16x8x4_t q4_out;

  INIT_VECTOR4(q4_out,
               vaddq_s16(q_a0, q_a1), vaddq_s16(q_a3, q_a2),
               vsubq_s16(q_a3, q_a2), vsubq_s16(q_a0, q_a1));
  return q4_out;
}

// Loads the 4x4 weight matrix w[0..15] as four int16x4 lanes.
static WEBP_INLINE int16x4x4_t DistoLoadW_NEON(const uint16_t* w) {
  const uint16x8_t q_w07 = vld1q_u16(&w[0]);
  const uint16x8_t q_w8f = vld1q_u16(&w[8]);
  int16x4x4_t d4_w;
  INIT_VECTOR4(d4_w,
               vget_low_s16(vreinterpretq_s16_u16(q_w07)),
               vget_high_s16(vreinterpretq_s16_u16(q_w07)),
               vget_low_s16(vreinterpretq_s16_u16(q_w8f)),
               vget_high_s16(vreinterpretq_s16_u16(q_w8f)));
  return d4_w;
}

// Weighted sum of the transformed coefficients: low halves accumulate
// positively, high halves negatively (sum1 - sum2 folded in one pass).
static WEBP_INLINE int32x2_t DistoSum_NEON(const int16x8x4_t q4_in,
                                           const int16x4x4_t d4_w) {
  int32x2_t d_sum;
  // sum += w[ 0] * abs(b0);
  // sum += w[ 4] * abs(b1);
  // sum += w[ 8] * abs(b2);
  // sum += w[12] * abs(b3);
  int32x4_t q_sum0 = vmull_s16(d4_w.val[0], vget_low_s16(q4_in.val[0]));
  int32x4_t q_sum1 = vmull_s16(d4_w.val[1], vget_low_s16(q4_in.val[1]));
  int32x4_t q_sum2 = vmull_s16(d4_w.val[2], vget_low_s16(q4_in.val[2]));
  int32x4_t q_sum3 = vmull_s16(d4_w.val[3], vget_low_s16(q4_in.val[3]));
  q_sum0 = vmlsl_s16(q_sum0, d4_w.val[0], vget_high_s16(q4_in.val[0]));
  q_sum1 = vmlsl_s16(q_sum1, d4_w.val[1], vget_high_s16(q4_in.val[1]));
  q_sum2 = vmlsl_s16(q_sum2, d4_w.val[2], vget_high_s16(q4_in.val[2]));
  q_sum3 = vmlsl_s16(q_sum3, d4_w.val[3], vget_high_s16(q4_in.val[3]));

  q_sum0 = vaddq_s32(q_sum0, q_sum1);
  q_sum2 = vaddq_s32(q_sum2, q_sum3);
  q_sum2 = vaddq_s32(q_sum0, q_sum2);
  d_sum = vpadd_s32(vget_low_s32(q_sum2), vget_high_s32(q_sum2));
  d_sum = vpadd_s32(d_sum, d_sum);
  return d_sum;
}

#define LOAD_LANE_32b(src, VALUE, LANE) \
    (VALUE) = vld1_lane_u32((const uint32_t*)(src), (VALUE), (LANE))

// Hadamard transform
// Returns the weighted sum of the absolute value of transformed coefficients.
// w[] contains a row-major 4 by 4 symmetric matrix.
static int Disto4x4_NEON(const uint8_t* WEBP_RESTRICT const a,
                         const uint8_t* WEBP_RESTRICT const b,
                         const uint16_t* WEBP_RESTRICT const w) {
  uint32x2_t d_in_ab_0123 = vdup_n_u32(0);
  uint32x2_t
d_in_ab_4567 = vdup_n_u32(0);
  uint32x2_t d_in_ab_89ab = vdup_n_u32(0);
  uint32x2_t d_in_ab_cdef = vdup_n_u32(0);
  uint8x8x4_t d4_in;

  // load data a, b
  LOAD_LANE_32b(a + 0 * BPS, d_in_ab_0123, 0);
  LOAD_LANE_32b(a + 1 * BPS, d_in_ab_4567, 0);
  LOAD_LANE_32b(a + 2 * BPS, d_in_ab_89ab, 0);
  LOAD_LANE_32b(a + 3 * BPS, d_in_ab_cdef, 0);
  LOAD_LANE_32b(b + 0 * BPS, d_in_ab_0123, 1);
  LOAD_LANE_32b(b + 1 * BPS, d_in_ab_4567, 1);
  LOAD_LANE_32b(b + 2 * BPS, d_in_ab_89ab, 1);
  LOAD_LANE_32b(b + 3 * BPS, d_in_ab_cdef, 1);
  INIT_VECTOR4(d4_in,
               vreinterpret_u8_u32(d_in_ab_0123),
               vreinterpret_u8_u32(d_in_ab_4567),
               vreinterpret_u8_u32(d_in_ab_89ab),
               vreinterpret_u8_u32(d_in_ab_cdef));

  {
    // Vertical pass first to avoid a transpose (vertical and horizontal passes
    // are commutative because w/kWeightY is symmetric) and subsequent
    // transpose.
    const int16x8x4_t q4_v = DistoVerticalPass_NEON(d4_in);
    const int16x4x4_t d4_w = DistoLoadW_NEON(w);
    // horizontal pass
    const int16x8x4_t q4_t = DistoTranspose4x4S16_NEON(q4_v);
    const int16x8x4_t q4_h = DistoHorizontalPass_NEON(q4_t);
    int32x2_t d_sum = DistoSum_NEON(q4_h, d4_w);

    // abs(sum2 - sum1) >> 5
    d_sum = vabs_s32(d_sum);
    d_sum = vshr_n_s32(d_sum, 5);
    return vget_lane_s32(d_sum, 0);
  }
}
#undef LOAD_LANE_32b

static int Disto16x16_NEON(const uint8_t* WEBP_RESTRICT const a,
                           const uint8_t* WEBP_RESTRICT const b,
                           const uint16_t* WEBP_RESTRICT const w) {
  int D = 0;
  int x, y;
  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
    for (x = 0; x < 16; x += 4) {
      D += Disto4x4_NEON(a + x + y, b + x + y, w);
    }
  }
  return D;
}

//------------------------------------------------------------------------------

static void CollectHistogram_NEON(const uint8_t* WEBP_RESTRICT ref,
                                  const uint8_t* WEBP_RESTRICT pred,
                                  int start_block, int end_block,
                                  VP8Histogram* WEBP_RESTRICT const histo) {
  const uint16x8_t max_coeff_thresh = vdupq_n_u16(MAX_COEFF_THRESH);
  int j;
  int distribution[MAX_COEFF_THRESH + 1] = { 0 };
  for (j = start_block; j < end_block; ++j) {
    int16_t out[16];
    FTransform_NEON(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
    {
      int k;
      const int16x8_t a0 = vld1q_s16(out + 0);
      const int16x8_t b0 = vld1q_s16(out + 8);
      const uint16x8_t a1 = vreinterpretq_u16_s16(vabsq_s16(a0));
      const uint16x8_t b1 = vreinterpretq_u16_s16(vabsq_s16(b0));
      const uint16x8_t a2 = vshrq_n_u16(a1, 3);
      const uint16x8_t b2 = vshrq_n_u16(b1, 3);
      const uint16x8_t a3 = vminq_u16(a2, max_coeff_thresh);
      const uint16x8_t b3 = vminq_u16(b2, max_coeff_thresh);
      vst1q_s16(out + 0, vreinterpretq_s16_u16(a3));
      vst1q_s16(out + 8, vreinterpretq_s16_u16(b3));
      // Convert coefficients to bin.
      for (k = 0; k < 16; ++k) {
        ++distribution[out[k]];
      }
    }
  }
  VP8SetHistogramData(distribution, histo);
}

//------------------------------------------------------------------------------

static WEBP_INLINE void AccumulateSSE16_NEON(
    const uint8_t* WEBP_RESTRICT const a, const uint8_t* WEBP_RESTRICT const b,
    uint32x4_t* const sum) {
  const uint8x16_t a0 = vld1q_u8(a);
  const uint8x16_t b0 = vld1q_u8(b);
  const uint8x16_t abs_diff = vabdq_u8(a0, b0);
  const uint16x8_t prod1 = vmull_u8(vget_low_u8(abs_diff),
                                    vget_low_u8(abs_diff));
  const uint16x8_t prod2 = vmull_u8(vget_high_u8(abs_diff),
                                    vget_high_u8(abs_diff));
  /* pair-wise adds and widen */
  const uint32x4_t sum1 = vpaddlq_u16(prod1);
  const uint32x4_t sum2 = vpaddlq_u16(prod2);
  *sum = vaddq_u32(*sum, vaddq_u32(sum1, sum2));
}

// Horizontal sum of all four uint32_t values in 'sum'.
static int SumToInt_NEON(uint32x4_t sum) {
#if WEBP_AARCH64
  return (int)vaddvq_u32(sum);
#else
  const uint64x2_t sum2 = vpaddlq_u32(sum);
  const uint32x2_t sum3 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(sum2)),
                                   vreinterpret_u32_u64(vget_high_u64(sum2)));
  return (int)vget_lane_u32(sum3, 0);
#endif
}

static int SSE16x16_NEON(const uint8_t* WEBP_RESTRICT a,
                         const uint8_t* WEBP_RESTRICT b) {
  uint32x4_t sum = vdupq_n_u32(0);
  int y;
  for (y = 0; y < 16; ++y) {
    AccumulateSSE16_NEON(a + y * BPS, b + y * BPS, &sum);
  }
  return SumToInt_NEON(sum);
}

static int SSE16x8_NEON(const uint8_t* WEBP_RESTRICT a,
                        const uint8_t* WEBP_RESTRICT b) {
  uint32x4_t sum = vdupq_n_u32(0);
  int y;
  for (y = 0; y < 8; ++y) {
    AccumulateSSE16_NEON(a + y * BPS, b + y * BPS, &sum);
  }
  return SumToInt_NEON(sum);
}

static int SSE8x8_NEON(const uint8_t* WEBP_RESTRICT a,
                       const uint8_t* WEBP_RESTRICT b) {
  uint32x4_t sum = vdupq_n_u32(0);
  int y;
  for (y = 0; y < 8; ++y) {
    const uint8x8_t a0 = vld1_u8(a + y * BPS);
    const uint8x8_t b0 = vld1_u8(b + y * BPS);
    const uint8x8_t abs_diff = vabd_u8(a0, b0);
    const uint16x8_t prod = vmull_u8(abs_diff, abs_diff);
    sum = vpadalq_u16(sum, prod);
  }
  return SumToInt_NEON(sum);
}

static int SSE4x4_NEON(const uint8_t* WEBP_RESTRICT a,
                       const uint8_t* WEBP_RESTRICT b) {
  const uint8x16_t a0 = Load4x4_NEON(a);
  const uint8x16_t b0 = Load4x4_NEON(b);
  const uint8x16_t abs_diff = vabdq_u8(a0, b0);
  const uint16x8_t prod1 = vmull_u8(vget_low_u8(abs_diff),
                                    vget_low_u8(abs_diff));
  const uint16x8_t prod2 = vmull_u8(vget_high_u8(abs_diff),
                                    vget_high_u8(abs_diff));
  /* pair-wise adds and widen */
  const uint32x4_t sum1 = vpaddlq_u16(prod1);
  const uint32x4_t sum2 = vpaddlq_u16(prod2);
  return SumToInt_NEON(vaddq_u32(sum1, sum2));
}

//------------------------------------------------------------------------------

// Compilation with gcc-4.6.x is problematic for now.
#if !defined(WORK_AROUND_GCC)

static int16x8_t Quantize_NEON(int16_t* WEBP_RESTRICT const in,
                               const VP8Matrix* WEBP_RESTRICT const mtx,
                               int offset) {
  const uint16x8_t sharp = vld1q_u16(&mtx->sharpen_[offset]);
  const uint16x8_t q = vld1q_u16(&mtx->q_[offset]);
  const uint16x8_t iq = vld1q_u16(&mtx->iq_[offset]);
  const uint32x4_t bias0 =
vld1q_u32(&mtx->bias_[offset + 0]);
  const uint32x4_t bias1 = vld1q_u32(&mtx->bias_[offset + 4]);

  const int16x8_t a = vld1q_s16(in + offset);                // in
  const uint16x8_t b = vreinterpretq_u16_s16(vabsq_s16(a));  // coeff = abs(in)
  const int16x8_t sign = vshrq_n_s16(a, 15);                 // sign
  const uint16x8_t c = vaddq_u16(b, sharp);                  // + sharpen
  const uint32x4_t m0 = vmull_u16(vget_low_u16(c), vget_low_u16(iq));
  const uint32x4_t m1 = vmull_u16(vget_high_u16(c), vget_high_u16(iq));
  const uint32x4_t m2 = vhaddq_u32(m0, bias0);
  const uint32x4_t m3 = vhaddq_u32(m1, bias1);    // (coeff * iQ + bias) >> 1
  const uint16x8_t c0 = vcombine_u16(vshrn_n_u32(m2, 16),
                                     vshrn_n_u32(m3, 16));   // QFIX=17 = 16+1
  const uint16x8_t c1 = vminq_u16(c0, vdupq_n_u16(MAX_LEVEL));
  const int16x8_t c2 = veorq_s16(vreinterpretq_s16_u16(c1), sign);
  const int16x8_t c3 = vsubq_s16(c2, sign);                  // restore sign
  const int16x8_t c4 = vmulq_s16(c3, vreinterpretq_s16_u16(q));
  vst1q_s16(in + offset, c4);
  assert(QFIX == 17);  // this function can't work as is if QFIX != 16+1
  return c3;
}

static const uint8_t kShuffles[4][8] = {
  {  0,  1,  2,  3,  8,  9, 16, 17 },
  { 10, 11,  4,  5,  6,  7, 12, 13 },
  { 18, 19, 24, 25, 26, 27, 20, 21 },
  { 14, 15, 22, 23, 28, 29, 30, 31 }
};

static int QuantizeBlock_NEON(int16_t in[16], int16_t out[16],
                              const VP8Matrix* WEBP_RESTRICT const mtx) {
  const int16x8_t out0 = Quantize_NEON(in, mtx, 0);
  const int16x8_t out1 = Quantize_NEON(in, mtx, 8);
  uint8x8x4_t shuffles;
  // vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use
  // non-standard versions there.
#if defined(__APPLE__) && WEBP_AARCH64 && \
    defined(__apple_build_version__) && (__apple_build_version__ < 6020037)
  uint8x16x2_t all_out;
  INIT_VECTOR2(all_out, vreinterpretq_u8_s16(out0), vreinterpretq_u8_s16(out1));
  INIT_VECTOR4(shuffles,
               vtbl2q_u8(all_out, vld1_u8(kShuffles[0])),
               vtbl2q_u8(all_out, vld1_u8(kShuffles[1])),
               vtbl2q_u8(all_out, vld1_u8(kShuffles[2])),
               vtbl2q_u8(all_out, vld1_u8(kShuffles[3])));
#else
  uint8x8x4_t all_out;
  INIT_VECTOR4(all_out,
               vreinterpret_u8_s16(vget_low_s16(out0)),
               vreinterpret_u8_s16(vget_high_s16(out0)),
               vreinterpret_u8_s16(vget_low_s16(out1)),
               vreinterpret_u8_s16(vget_high_s16(out1)));
  INIT_VECTOR4(shuffles,
               vtbl4_u8(all_out, vld1_u8(kShuffles[0])),
               vtbl4_u8(all_out, vld1_u8(kShuffles[1])),
               vtbl4_u8(all_out, vld1_u8(kShuffles[2])),
               vtbl4_u8(all_out, vld1_u8(kShuffles[3])));
#endif
  // Zigzag reordering
  vst1_u8((uint8_t*)(out +  0), shuffles.val[0]);
  vst1_u8((uint8_t*)(out +  4), shuffles.val[1]);
  vst1_u8((uint8_t*)(out +  8), shuffles.val[2]);
  vst1_u8((uint8_t*)(out + 12), shuffles.val[3]);
  // test zeros
  if (*(uint64_t*)(out +  0) != 0) return 1;
  if (*(uint64_t*)(out +  4) != 0) return 1;
  if (*(uint64_t*)(out +  8) != 0) return 1;
  if (*(uint64_t*)(out + 12) != 0) return 1;
  return 0;
}

static int Quantize2Blocks_NEON(int16_t in[32], int16_t out[32],
                                const VP8Matrix* WEBP_RESTRICT const mtx) {
  int nz;
  nz  = QuantizeBlock_NEON(in + 0 * 16, out + 0 * 16, mtx) << 0;
  nz |= QuantizeBlock_NEON(in + 1 * 16, out + 1 * 16, mtx) << 1;
  return nz;
}

#endif   // !WORK_AROUND_GCC

#if WEBP_AARCH64

#if BPS == 32
#define DC4_VE4_HE4_TM4_NEON(dst, tbl, res, lane)                       \
  do {                                                                  \
    uint8x16_t r;                                                       \
    r = vqtbl2q_u8(qcombined, tbl);                                     \
    r = vreinterpretq_u8_u32(                                           \
        vsetq_lane_u32(vget_lane_u32(vreinterpret_u32_u8(res), lane),   \
                       vreinterpretq_u32_u8(r), 1));                    \
    vst1q_u8(dst, r);                                                   \
  } while (0)

#define RD4_VR4_LD4_VL4_NEON(dst, tbl)   \
  do {                                   \
    uint8x16_t r;                        \
    r = vqtbl2q_u8(qcombined, tbl);      \
    vst1q_u8(dst, r);                    \
  } while (0)

static void Intra4Preds_NEON(uint8_t* WEBP_RESTRICT dst,
                             const uint8_t* WEBP_RESTRICT top) {
  // 0   1   2   3   4   5   6   7   8   9   10  11  12  13
  // L   K   J   I   X   A   B   C   D   E   F   G   H
  // -5  -4  -3  -2  -1  0   1   2   3   4   5   6   7
  static const uint8_t kLookupTbl1[64] = {
     0,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 12, 12,
     3,  3,  3,  3,  2,  2,  2,  2,  1,  1,  1,  1,  0,  0,  0,  0,
     4, 20, 21, 22,  3, 18,  2, 17,  3, 19,  4, 20,  2, 17,  1, 16,
     2, 18,  3, 19,  1, 16, 31, 31,  1, 17,  2, 18, 31, 31, 31, 31
  };

  static const uint8_t kLookupTbl2[64] = {
    20, 21, 22, 23,  5,  6,  7,  8, 22, 23, 24, 25,  6,  7,  8,  9,
    19, 20, 21, 22, 20, 21, 22, 23, 23, 24, 25, 26, 22, 23, 24, 25,
    18, 19, 20, 21, 19,  5,  6,  7, 24, 25, 26, 27,  7,  8,  9, 26,
    17, 18, 19, 20, 18, 20, 21, 22, 25, 26, 27, 28, 23, 24, 25, 27
  };

  static const uint8_t kLookupTbl3[64] = {
    30, 30, 30, 30,  0,  0,  0,  0, 21, 22, 23, 24, 19, 19, 19, 19,
    30, 30, 30, 30,  0,  0,  0,  0, 21, 22, 23, 24, 18, 18, 18, 18,
    30, 30, 30, 30,  0,  0,  0,  0, 21, 22, 23, 24, 17, 17, 17, 17,
    30, 30, 30, 30,  0,  0,  0,  0, 21, 22, 23, 24, 16, 16, 16, 16
  };

  const uint8x16x4_t lookup_avgs1 = vld1q_u8_x4(kLookupTbl1);
  const uint8x16x4_t lookup_avgs2 = vld1q_u8_x4(kLookupTbl2);
  const uint8x16x4_t lookup_avgs3 = vld1q_u8_x4(kLookupTbl3);

  const uint8x16_t preload = vld1q_u8(top - 5);
  uint8x16x2_t qcombined;
  uint8x16_t result0, result1;

  uint8x16_t a = vqtbl1q_u8(preload, lookup_avgs1.val[0]);
  uint8x16_t b = preload;
  uint8x16_t c = vextq_u8(a, a, 2);

  uint8x16_t avg3_all = vrhaddq_u8(vhaddq_u8(a, c), b);
  uint8x16_t avg2_all = vrhaddq_u8(a, b);

  uint8x8_t preload_x8, sub_a, sub_c;
  uint8_t result_u8;
  uint8x8_t res_lo, res_hi;
  uint8x16_t full_b;
  uint16x8_t sub, sum_lo, sum_hi;

  preload_x8 = vget_low_u8(c);
  preload_x8 = vset_lane_u8(vgetq_lane_u8(preload, 0), preload_x8, 3);

  result_u8 = (vaddlv_u8(preload_x8) + 4) >> 3;

  avg3_all = vsetq_lane_u8(vgetq_lane_u8(preload, 0), avg3_all, 15);
  avg3_all = vsetq_lane_u8(result_u8, avg3_all, 14);

  qcombined.val[0] = avg2_all;
  qcombined.val[1] = avg3_all;

  sub_a = vdup_laneq_u8(preload, 4);

  // preload = {a,b,c,d,...} => full_b = {d,d,d,d,c,c,c,c,b,b,b,b,a,a,a,a}
  full_b = vqtbl1q_u8(preload, lookup_avgs1.val[1]);
  // preload = {a,b,c,d,...} => sub_c = {a,b,c,d,a,b,c,d,a,b,c,d,a,b,c,d}
  sub_c = vreinterpret_u8_u32(vdup_n_u32(
      vgetq_lane_u32(vreinterpretq_u32_u8(vextq_u8(preload, preload, 5)), 0)));

  sub = vsubl_u8(sub_c, sub_a);
  sum_lo = vaddw_u8(sub, vget_low_u8(full_b));
  res_lo = vqmovun_s16(vreinterpretq_s16_u16(sum_lo));

  sum_hi = vaddw_u8(sub, vget_high_u8(full_b));
  res_hi = vqmovun_s16(vreinterpretq_s16_u16(sum_hi));

  // DC4, VE4, HE4, TM4
  DC4_VE4_HE4_TM4_NEON(dst + I4DC4 + BPS * 0, lookup_avgs3.val[0], res_lo, 0);
  DC4_VE4_HE4_TM4_NEON(dst + I4DC4 + BPS * 1, lookup_avgs3.val[1], res_lo, 1);
  DC4_VE4_HE4_TM4_NEON(dst + I4DC4 + BPS * 2, lookup_avgs3.val[2], res_hi, 0);
  DC4_VE4_HE4_TM4_NEON(dst + I4DC4 + BPS * 3, lookup_avgs3.val[3], res_hi, 1);

  // RD4, VR4, LD4, VL4
  RD4_VR4_LD4_VL4_NEON(dst + I4RD4 + BPS * 0, lookup_avgs2.val[0]);
  RD4_VR4_LD4_VL4_NEON(dst + I4RD4 + BPS * 1, lookup_avgs2.val[1]);
  RD4_VR4_LD4_VL4_NEON(dst + I4RD4 + BPS * 2, lookup_avgs2.val[2]);
  RD4_VR4_LD4_VL4_NEON(dst + I4RD4 + BPS * 3, lookup_avgs2.val[3]);

  // HD4, HU4
  result0 = vqtbl2q_u8(qcombined, lookup_avgs1.val[2]);
  result1 = vqtbl2q_u8(qcombined, lookup_avgs1.val[3]);

  vst1_u8(dst + I4HD4 + BPS * 0, vget_low_u8(result0));
  vst1_u8(dst + I4HD4 + BPS * 1, vget_high_u8(result0));
  vst1_u8(dst + I4HD4 + BPS * 2, vget_low_u8(result1));
  vst1_u8(dst + I4HD4 + BPS * 3, vget_high_u8(result1));
}
#endif  // BPS == 32

static WEBP_INLINE void Fill_NEON(uint8_t* dst, const uint8_t value) {
  uint8x16_t a = vdupq_n_u8(value);
  int i;
  for (i = 0; i < 16; i++) {
    vst1q_u8(dst + BPS * i, a);
  }
}

static WEBP_INLINE void Fill16_NEON(uint8_t* dst, const uint8_t* src) {
  uint8x16_t a = vld1q_u8(src);
  int i;
  for (i = 0; i < 16; i++) {
    vst1q_u8(dst + BPS * i, a);
  }
}

static WEBP_INLINE void HorizontalPred16_NEON(uint8_t* dst,
                                              const uint8_t* left) {
  uint8x16_t a;

  if (left ==
NULL) {1064Fill_NEON(dst, 129);1065return;1066}10671068a = vld1q_u8(left + 0);1069vst1q_u8(dst + BPS * 0, vdupq_laneq_u8(a, 0));1070vst1q_u8(dst + BPS * 1, vdupq_laneq_u8(a, 1));1071vst1q_u8(dst + BPS * 2, vdupq_laneq_u8(a, 2));1072vst1q_u8(dst + BPS * 3, vdupq_laneq_u8(a, 3));1073vst1q_u8(dst + BPS * 4, vdupq_laneq_u8(a, 4));1074vst1q_u8(dst + BPS * 5, vdupq_laneq_u8(a, 5));1075vst1q_u8(dst + BPS * 6, vdupq_laneq_u8(a, 6));1076vst1q_u8(dst + BPS * 7, vdupq_laneq_u8(a, 7));1077vst1q_u8(dst + BPS * 8, vdupq_laneq_u8(a, 8));1078vst1q_u8(dst + BPS * 9, vdupq_laneq_u8(a, 9));1079vst1q_u8(dst + BPS * 10, vdupq_laneq_u8(a, 10));1080vst1q_u8(dst + BPS * 11, vdupq_laneq_u8(a, 11));1081vst1q_u8(dst + BPS * 12, vdupq_laneq_u8(a, 12));1082vst1q_u8(dst + BPS * 13, vdupq_laneq_u8(a, 13));1083vst1q_u8(dst + BPS * 14, vdupq_laneq_u8(a, 14));1084vst1q_u8(dst + BPS * 15, vdupq_laneq_u8(a, 15));1085}10861087static WEBP_INLINE void VerticalPred16_NEON(uint8_t* dst, const uint8_t* top) {1088if (top != NULL) {1089Fill16_NEON(dst, top);1090} else {1091Fill_NEON(dst, 127);1092}1093}10941095static WEBP_INLINE void DCMode_NEON(uint8_t* dst, const uint8_t* left,1096const uint8_t* top) {1097uint8_t s;10981099if (top != NULL) {1100uint16_t dc;1101dc = vaddlvq_u8(vld1q_u8(top));1102if (left != NULL) {1103// top and left present.1104dc += vaddlvq_u8(vld1q_u8(left));1105s = vqrshrnh_n_u16(dc, 5);1106} else {1107// top but no left.1108s = vqrshrnh_n_u16(dc, 4);1109}1110} else {1111if (left != NULL) {1112uint16_t dc;1113// left but no top.1114dc = vaddlvq_u8(vld1q_u8(left));1115s = vqrshrnh_n_u16(dc, 4);1116} else {1117// No top, no left, nothing.1118s = 0x80;1119}1120}1121Fill_NEON(dst, s);1122}11231124static WEBP_INLINE void TrueMotionHelper_NEON(uint8_t* dst,1125const uint8x8_t outer,1126const uint8x8x2_t inner,1127const uint16x8_t a, int i,1128const int n) {1129uint8x8_t d1, d2;1130uint16x8_t r1, r2;11311132r1 = vaddl_u8(outer, inner.val[0]);1133r1 = vqsubq_u16(r1, a);1134d1 = 
vqmovun_s16(vreinterpretq_s16_u16(r1));1135r2 = vaddl_u8(outer, inner.val[1]);1136r2 = vqsubq_u16(r2, a);1137d2 = vqmovun_s16(vreinterpretq_s16_u16(r2));1138vst1_u8(dst + BPS * (i * 4 + n), d1);1139vst1_u8(dst + BPS * (i * 4 + n) + 8, d2);1140}11411142static WEBP_INLINE void TrueMotion_NEON(uint8_t* dst, const uint8_t* left,1143const uint8_t* top) {1144int i;1145uint16x8_t a;1146uint8x8x2_t inner;11471148if (left == NULL) {1149// True motion without left samples (hence: with default 129 value) is1150// equivalent to VE prediction where you just copy the top samples.1151// Note that if top samples are not available, the default value is then1152// 129, and not 127 as in the VerticalPred case.1153if (top != NULL) {1154VerticalPred16_NEON(dst, top);1155} else {1156Fill_NEON(dst, 129);1157}1158return;1159}11601161// left is not NULL.1162if (top == NULL) {1163HorizontalPred16_NEON(dst, left);1164return;1165}11661167// Neither left nor top are NULL.1168a = vdupq_n_u16(left[-1]);1169inner = vld1_u8_x2(top);11701171for (i = 0; i < 4; i++) {1172const uint8x8x4_t outer = vld4_dup_u8(&left[i * 4]);11731174TrueMotionHelper_NEON(dst, outer.val[0], inner, a, i, 0);1175TrueMotionHelper_NEON(dst, outer.val[1], inner, a, i, 1);1176TrueMotionHelper_NEON(dst, outer.val[2], inner, a, i, 2);1177TrueMotionHelper_NEON(dst, outer.val[3], inner, a, i, 3);1178}1179}11801181static void Intra16Preds_NEON(uint8_t* WEBP_RESTRICT dst,1182const uint8_t* WEBP_RESTRICT left,1183const uint8_t* WEBP_RESTRICT top) {1184DCMode_NEON(I16DC16 + dst, left, top);1185VerticalPred16_NEON(I16VE16 + dst, top);1186HorizontalPred16_NEON(I16HE16 + dst, left);1187TrueMotion_NEON(I16TM16 + dst, left, top);1188}11891190#endif // WEBP_AARCH6411911192//------------------------------------------------------------------------------1193// Entry point11941195extern void VP8EncDspInitNEON(void);11961197WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitNEON(void) {1198VP8ITransform = ITransform_NEON;1199VP8FTransform = 
FTransform_NEON;12001201VP8FTransformWHT = FTransformWHT_NEON;12021203VP8TDisto4x4 = Disto4x4_NEON;1204VP8TDisto16x16 = Disto16x16_NEON;1205VP8CollectHistogram = CollectHistogram_NEON;12061207VP8SSE16x16 = SSE16x16_NEON;1208VP8SSE16x8 = SSE16x8_NEON;1209VP8SSE8x8 = SSE8x8_NEON;1210VP8SSE4x4 = SSE4x4_NEON;12111212#if WEBP_AARCH641213#if BPS == 321214VP8EncPredLuma4 = Intra4Preds_NEON;1215#endif1216VP8EncPredLuma16 = Intra16Preds_NEON;1217#endif12181219#if !defined(WORK_AROUND_GCC)1220VP8EncQuantizeBlock = QuantizeBlock_NEON;1221VP8EncQuantize2Blocks = Quantize2Blocks_NEON;1222VP8EncQuantizeBlockWHT = QuantizeBlock_NEON;1223#endif1224}12251226#else // !WEBP_USE_NEON12271228WEBP_DSP_INIT_STUB(VP8EncDspInitNEON)12291230#endif // WEBP_USE_NEON123112321233