Path: blob/master/thirdparty/libwebp/src/dsp/dec_neon.c
9913 views
// Copyright 2012 Google Inc. All Rights Reserved.1//2// Use of this source code is governed by a BSD-style license3// that can be found in the COPYING file in the root of the source4// tree. An additional intellectual property rights grant can be found5// in the file PATENTS. All contributing project authors may6// be found in the AUTHORS file in the root of the source tree.7// -----------------------------------------------------------------------------8//9// ARM NEON version of dsp functions and loop filtering.10//11// Authors: Somnath Banerjee ([email protected])12// Johann Koenig ([email protected])1314#include "src/dsp/dsp.h"1516#if defined(WEBP_USE_NEON)1718#include "src/dsp/neon.h"19#include "src/dec/vp8i_dec.h"2021//------------------------------------------------------------------------------22// NxM Loading functions2324#if !defined(WORK_AROUND_GCC)2526// This intrinsics version makes gcc-4.6.3 crash during Load4x??() compilation27// (register alloc, probably). The variants somewhat mitigate the problem, but28// not quite. HFilter16i() remains problematic.29static WEBP_INLINE uint8x8x4_t Load4x8_NEON(const uint8_t* const src,30int stride) {31const uint8x8_t zero = vdup_n_u8(0);32uint8x8x4_t out;33INIT_VECTOR4(out, zero, zero, zero, zero);34out = vld4_lane_u8(src + 0 * stride, out, 0);35out = vld4_lane_u8(src + 1 * stride, out, 1);36out = vld4_lane_u8(src + 2 * stride, out, 2);37out = vld4_lane_u8(src + 3 * stride, out, 3);38out = vld4_lane_u8(src + 4 * stride, out, 4);39out = vld4_lane_u8(src + 5 * stride, out, 5);40out = vld4_lane_u8(src + 6 * stride, out, 6);41out = vld4_lane_u8(src + 7 * stride, out, 7);42return out;43}4445static WEBP_INLINE void Load4x16_NEON(const uint8_t* const src, int stride,46uint8x16_t* const p1,47uint8x16_t* const p0,48uint8x16_t* const q0,49uint8x16_t* const q1) {50// row0 = p1[0..7]|p0[0..7]|q0[0..7]|q1[0..7]51// row8 = p1[8..15]|p0[8..15]|q0[8..15]|q1[8..15]52const uint8x8x4_t row0 = Load4x8_NEON(src - 2 + 0 * stride, stride);53const uint8x8x4_t row8 = Load4x8_NEON(src - 2 + 8 * stride, stride);54*p1 = vcombine_u8(row0.val[0], row8.val[0]);55*p0 = vcombine_u8(row0.val[1], row8.val[1]);56*q0 = vcombine_u8(row0.val[2], row8.val[2]);57*q1 = vcombine_u8(row0.val[3], row8.val[3]);58}5960#else // WORK_AROUND_GCC6162#define LOADQ_LANE_32b(VALUE, LANE) do { \63(VALUE) = vld1q_lane_u32((const uint32_t*)src, (VALUE), (LANE)); \64src += stride; \65} while (0)6667static WEBP_INLINE void Load4x16_NEON(const uint8_t* src, int stride,68uint8x16_t* const p1,69uint8x16_t* const p0,70uint8x16_t* const q0,71uint8x16_t* const q1) {72const uint32x4_t zero = vdupq_n_u32(0);73uint32x4x4_t in;74INIT_VECTOR4(in, zero, zero, zero, zero);75src -= 2;76LOADQ_LANE_32b(in.val[0], 0);77LOADQ_LANE_32b(in.val[1], 0);78LOADQ_LANE_32b(in.val[2], 0);79LOADQ_LANE_32b(in.val[3], 0);80LOADQ_LANE_32b(in.val[0], 1);81LOADQ_LANE_32b(in.val[1], 1);82LOADQ_LANE_32b(in.val[2], 1);83LOADQ_LANE_32b(in.val[3], 1);84LOADQ_LANE_32b(in.val[0], 2);85LOADQ_LANE_32b(in.val[1], 2);86LOADQ_LANE_32b(in.val[2], 2);87LOADQ_LANE_32b(in.val[3], 2);88LOADQ_LANE_32b(in.val[0], 3);89LOADQ_LANE_32b(in.val[1], 3);90LOADQ_LANE_32b(in.val[2], 3);91LOADQ_LANE_32b(in.val[3], 3);92// Transpose four 4x4 parts:93{94const uint8x16x2_t row01 = vtrnq_u8(vreinterpretq_u8_u32(in.val[0]),95vreinterpretq_u8_u32(in.val[1]));96const uint8x16x2_t row23 = vtrnq_u8(vreinterpretq_u8_u32(in.val[2]),97vreinterpretq_u8_u32(in.val[3]));98const uint16x8x2_t row02 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[0]),99vreinterpretq_u16_u8(row23.val[0]));100const uint16x8x2_t row13 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[1]),101vreinterpretq_u16_u8(row23.val[1]));102*p1 = vreinterpretq_u8_u16(row02.val[0]);103*p0 = vreinterpretq_u8_u16(row13.val[0]);104*q0 = vreinterpretq_u8_u16(row02.val[1]);105*q1 = vreinterpretq_u8_u16(row13.val[1]);106}107}108#undef LOADQ_LANE_32b109110#endif // !WORK_AROUND_GCC111112static WEBP_INLINE void Load8x16_NEON(113const uint8_t* const src, int stride,114uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,115uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,116uint8x16_t* const q2, uint8x16_t* const q3) {117Load4x16_NEON(src - 2, stride, p3, p2, p1, p0);118Load4x16_NEON(src + 2, stride, q0, q1, q2, q3);119}120121static WEBP_INLINE void Load16x4_NEON(const uint8_t* const src, int stride,122uint8x16_t* const p1,123uint8x16_t* const p0,124uint8x16_t* const q0,125uint8x16_t* const q1) {126*p1 = vld1q_u8(src - 2 * stride);127*p0 = vld1q_u8(src - 1 * stride);128*q0 = vld1q_u8(src + 0 * stride);129*q1 = vld1q_u8(src + 1 * stride);130}131132static WEBP_INLINE void Load16x8_NEON(133const uint8_t* const src, int stride,134uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,135uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,136uint8x16_t* const q2, uint8x16_t* const q3) {137Load16x4_NEON(src - 2 * stride, stride, p3, p2, p1, p0);138Load16x4_NEON(src + 2 * stride, stride, q0, q1, q2, q3);139}140141static WEBP_INLINE void Load8x8x2_NEON(142const uint8_t* const u, const uint8_t* const v, int stride,143uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,144uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,145uint8x16_t* const q2, uint8x16_t* const q3) {146// We pack the 8x8 u-samples in the lower half of the uint8x16_t destination147// and the v-samples on the higher half.148*p3 = vcombine_u8(vld1_u8(u - 4 * stride), vld1_u8(v - 4 * stride));149*p2 = vcombine_u8(vld1_u8(u - 3 * stride), vld1_u8(v - 3 * stride));150*p1 = vcombine_u8(vld1_u8(u - 2 * stride), vld1_u8(v - 2 * stride));151*p0 = vcombine_u8(vld1_u8(u - 1 * stride), vld1_u8(v - 1 * stride));152*q0 = vcombine_u8(vld1_u8(u + 0 * stride), vld1_u8(v + 0 * stride));153*q1 = vcombine_u8(vld1_u8(u + 1 * stride), vld1_u8(v + 1 * stride));154*q2 = vcombine_u8(vld1_u8(u + 2 * stride), vld1_u8(v + 2 * stride));155*q3 = vcombine_u8(vld1_u8(u + 3 * stride), vld1_u8(v + 3 * stride));156}157158#if !defined(WORK_AROUND_GCC)159160#define LOAD_UV_8(ROW) \161vcombine_u8(vld1_u8(u - 4 + (ROW) * stride), vld1_u8(v - 4 + (ROW) * stride))162163static WEBP_INLINE void Load8x8x2T_NEON(164const uint8_t* const u, const uint8_t* const v, int stride,165uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,166uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,167uint8x16_t* const q2, uint8x16_t* const q3) {168// We pack the 8x8 u-samples in the lower half of the uint8x16_t destination169// and the v-samples on the higher half.170const uint8x16_t row0 = LOAD_UV_8(0);171const uint8x16_t row1 = LOAD_UV_8(1);172const uint8x16_t row2 = LOAD_UV_8(2);173const uint8x16_t row3 = LOAD_UV_8(3);174const uint8x16_t row4 = LOAD_UV_8(4);175const uint8x16_t row5 = LOAD_UV_8(5);176const uint8x16_t row6 = LOAD_UV_8(6);177const uint8x16_t row7 = LOAD_UV_8(7);178// Perform two side-by-side 8x8 transposes179// u00 u01 u02 u03 u04 u05 u06 u07 | v00 v01 v02 v03 v04 v05 v06 v07180// u10 u11 u12 u13 u14 u15 u16 u17 | v10 v11 v12 ...181// u20 u21 u22 u23 u24 u25 u26 u27 | v20 v21 ...182// u30 u31 u32 u33 u34 u35 u36 u37 | ...183// u40 u41 u42 u43 u44 u45 u46 u47 | ...184// u50 u51 u52 u53 u54 u55 u56 u57 | ...185// u60 u61 u62 u63 u64 u65 u66 u67 | v60 ...186// u70 u71 u72 u73 u74 u75 u76 u77 | v70 v71 v72 ...187const uint8x16x2_t row01 = vtrnq_u8(row0, row1); // u00 u10 u02 u12 ...188// u01 u11 u03 u13 ...189const uint8x16x2_t row23 = vtrnq_u8(row2, row3); // u20 u30 u22 u32 ...190// u21 u31 u23 u33 ...191const uint8x16x2_t row45 = vtrnq_u8(row4, row5); // ...192const uint8x16x2_t row67 = vtrnq_u8(row6, row7); // ...193const uint16x8x2_t row02 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[0]),194vreinterpretq_u16_u8(row23.val[0]));195const uint16x8x2_t row13 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[1]),196vreinterpretq_u16_u8(row23.val[1]));197const uint16x8x2_t row46 = vtrnq_u16(vreinterpretq_u16_u8(row45.val[0]),198vreinterpretq_u16_u8(row67.val[0]));199const uint16x8x2_t row57 = vtrnq_u16(vreinterpretq_u16_u8(row45.val[1]),200vreinterpretq_u16_u8(row67.val[1]));201const uint32x4x2_t row04 = vtrnq_u32(vreinterpretq_u32_u16(row02.val[0]),202vreinterpretq_u32_u16(row46.val[0]));203const uint32x4x2_t row26 = vtrnq_u32(vreinterpretq_u32_u16(row02.val[1]),204vreinterpretq_u32_u16(row46.val[1]));205const uint32x4x2_t row15 = vtrnq_u32(vreinterpretq_u32_u16(row13.val[0]),206vreinterpretq_u32_u16(row57.val[0]));207const uint32x4x2_t row37 = vtrnq_u32(vreinterpretq_u32_u16(row13.val[1]),208vreinterpretq_u32_u16(row57.val[1]));209*p3 = vreinterpretq_u8_u32(row04.val[0]);210*p2 = vreinterpretq_u8_u32(row15.val[0]);211*p1 = vreinterpretq_u8_u32(row26.val[0]);212*p0 = vreinterpretq_u8_u32(row37.val[0]);213*q0 = vreinterpretq_u8_u32(row04.val[1]);214*q1 = vreinterpretq_u8_u32(row15.val[1]);215*q2 = vreinterpretq_u8_u32(row26.val[1]);216*q3 = vreinterpretq_u8_u32(row37.val[1]);217}218#undef LOAD_UV_8219220#endif // !WORK_AROUND_GCC221222static WEBP_INLINE void Store2x8_NEON(const uint8x8x2_t v,223uint8_t* const dst, int stride) {224vst2_lane_u8(dst + 0 * stride, v, 0);225vst2_lane_u8(dst + 1 * stride, v, 1);226vst2_lane_u8(dst + 2 * stride, v, 2);227vst2_lane_u8(dst + 3 * stride, v, 3);228vst2_lane_u8(dst + 4 * stride, v, 4);229vst2_lane_u8(dst + 5 * stride, v, 5);230vst2_lane_u8(dst + 6 * stride, v, 6);231vst2_lane_u8(dst + 7 * stride, v, 7);232}233234static WEBP_INLINE void Store2x16_NEON(const uint8x16_t p0, const uint8x16_t q0,235uint8_t* const dst, int stride) {236uint8x8x2_t lo, hi;237lo.val[0] = vget_low_u8(p0);238lo.val[1] = vget_low_u8(q0);239hi.val[0] = vget_high_u8(p0);240hi.val[1] = vget_high_u8(q0);241Store2x8_NEON(lo, dst - 1 + 0 * stride, stride);242Store2x8_NEON(hi, dst - 1 + 8 * stride, stride);243}244245#if !defined(WORK_AROUND_GCC)246static WEBP_INLINE void Store4x8_NEON(const uint8x8x4_t v,247uint8_t* const dst, int stride) {248vst4_lane_u8(dst + 0 * stride, v, 0);249vst4_lane_u8(dst + 1 * stride, v, 1);250vst4_lane_u8(dst + 2 * stride, v, 2);251vst4_lane_u8(dst + 3 * stride, v, 3);252vst4_lane_u8(dst + 4 * stride, v, 4);253vst4_lane_u8(dst + 5 * stride, v, 5);254vst4_lane_u8(dst + 6 * stride, v, 6);255vst4_lane_u8(dst + 7 * stride, v, 7);256}257258static WEBP_INLINE void Store4x16_NEON(const uint8x16_t p1, const uint8x16_t p0,259const uint8x16_t q0, const uint8x16_t q1,260uint8_t* const dst, int stride) {261uint8x8x4_t lo, hi;262INIT_VECTOR4(lo,263vget_low_u8(p1), vget_low_u8(p0),264vget_low_u8(q0), vget_low_u8(q1));265INIT_VECTOR4(hi,266vget_high_u8(p1), vget_high_u8(p0),267vget_high_u8(q0), vget_high_u8(q1));268Store4x8_NEON(lo, dst - 2 + 0 * stride, stride);269Store4x8_NEON(hi, dst - 2 + 8 * stride, stride);270}271#endif // !WORK_AROUND_GCC272273static WEBP_INLINE void Store16x2_NEON(const uint8x16_t p0, const uint8x16_t q0,274uint8_t* const dst, int stride) {275vst1q_u8(dst - stride, p0);276vst1q_u8(dst, q0);277}278279static WEBP_INLINE void Store16x4_NEON(const uint8x16_t p1, const uint8x16_t p0,280const uint8x16_t q0, const uint8x16_t q1,281uint8_t* const dst, int stride) {282Store16x2_NEON(p1, p0, dst - stride, stride);283Store16x2_NEON(q0, q1, dst + stride, stride);284}285286static WEBP_INLINE void Store8x2x2_NEON(const uint8x16_t p0,287const uint8x16_t q0,288uint8_t* const u, uint8_t* const v,289int stride) {290// p0 and q0 contain the u+v samples packed in low/high halves.291vst1_u8(u - stride, vget_low_u8(p0));292vst1_u8(u, vget_low_u8(q0));293vst1_u8(v - stride, vget_high_u8(p0));294vst1_u8(v, vget_high_u8(q0));295}296297static WEBP_INLINE void Store8x4x2_NEON(const uint8x16_t p1,298const uint8x16_t p0,299const uint8x16_t q0,300const uint8x16_t q1,301uint8_t* const u, uint8_t* const v,302int stride) {303// The p1...q1 registers contain the u+v samples packed in low/high halves.304Store8x2x2_NEON(p1, p0, u - stride, v - stride, stride);305Store8x2x2_NEON(q0, q1, u + stride, v + stride, stride);306}307308#if !defined(WORK_AROUND_GCC)309310#define STORE6_LANE(DST, VAL0, VAL1, LANE) do { \311vst3_lane_u8((DST) - 3, (VAL0), (LANE)); \312vst3_lane_u8((DST) + 0, (VAL1), (LANE)); \313(DST) += stride; \314} while (0)315316static WEBP_INLINE void Store6x8x2_NEON(317const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0,318const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2,319uint8_t* u, uint8_t* v, int stride) {320uint8x8x3_t u0, u1, v0, v1;321INIT_VECTOR3(u0, vget_low_u8(p2), vget_low_u8(p1), vget_low_u8(p0));322INIT_VECTOR3(u1, vget_low_u8(q0), vget_low_u8(q1), vget_low_u8(q2));323INIT_VECTOR3(v0, vget_high_u8(p2), vget_high_u8(p1), vget_high_u8(p0));324INIT_VECTOR3(v1, vget_high_u8(q0), vget_high_u8(q1), vget_high_u8(q2));325STORE6_LANE(u, u0, u1, 0);326STORE6_LANE(u, u0, u1, 1);327STORE6_LANE(u, u0, u1, 2);328STORE6_LANE(u, u0, u1, 3);329STORE6_LANE(u, u0, u1, 4);330STORE6_LANE(u, u0, u1, 5);331STORE6_LANE(u, u0, u1, 6);332STORE6_LANE(u, u0, u1, 7);333STORE6_LANE(v, v0, v1, 0);334STORE6_LANE(v, v0, v1, 1);335STORE6_LANE(v, v0, v1, 2);336STORE6_LANE(v, v0, v1, 3);337STORE6_LANE(v, v0, v1, 4);338STORE6_LANE(v, v0, v1, 5);339STORE6_LANE(v, v0, v1, 6);340STORE6_LANE(v, v0, v1, 7);341}342#undef STORE6_LANE343344static WEBP_INLINE void Store4x8x2_NEON(const uint8x16_t p1,345const uint8x16_t p0,346const uint8x16_t q0,347const uint8x16_t q1,348uint8_t* const u, uint8_t* const v,349int stride) {350uint8x8x4_t u0, v0;351INIT_VECTOR4(u0,352vget_low_u8(p1), vget_low_u8(p0),353vget_low_u8(q0), vget_low_u8(q1));354INIT_VECTOR4(v0,355vget_high_u8(p1), vget_high_u8(p0),356vget_high_u8(q0), vget_high_u8(q1));357vst4_lane_u8(u - 2 + 0 * stride, u0, 0);358vst4_lane_u8(u - 2 + 1 * stride, u0, 1);359vst4_lane_u8(u - 2 + 2 * stride, u0, 2);360vst4_lane_u8(u - 2 + 3 * stride, u0, 3);361vst4_lane_u8(u - 2 + 4 * stride, u0, 4);362vst4_lane_u8(u - 2 + 5 * stride, u0, 5);363vst4_lane_u8(u - 2 + 6 * stride, u0, 6);364vst4_lane_u8(u - 2 + 7 * stride, u0, 7);365vst4_lane_u8(v - 2 + 0 * stride, v0, 0);366vst4_lane_u8(v - 2 + 1 * stride, v0, 1);367vst4_lane_u8(v - 2 + 2 * stride, v0, 2);368vst4_lane_u8(v - 2 + 3 * stride, v0, 3);369vst4_lane_u8(v - 2 + 4 * stride, v0, 4);370vst4_lane_u8(v - 2 + 5 * stride, v0, 5);371vst4_lane_u8(v - 2 + 6 * stride, v0, 6);372vst4_lane_u8(v - 2 + 7 * stride, v0, 7);373}374375#endif // !WORK_AROUND_GCC376377// Zero extend 'v' to an int16x8_t.378static WEBP_INLINE int16x8_t ConvertU8ToS16_NEON(uint8x8_t v) {379return vreinterpretq_s16_u16(vmovl_u8(v));380}381382// Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result383// to the corresponding rows of 'dst'.384static WEBP_INLINE void SaturateAndStore4x4_NEON(uint8_t* const dst,385const int16x8_t dst01,386const int16x8_t dst23) {387// Unsigned saturate to 8b.388const uint8x8_t dst01_u8 = vqmovun_s16(dst01);389const uint8x8_t dst23_u8 = vqmovun_s16(dst23);390391// Store the results.392vst1_lane_u32((uint32_t*)(dst + 0 * BPS), vreinterpret_u32_u8(dst01_u8), 0);393vst1_lane_u32((uint32_t*)(dst + 1 * BPS), vreinterpret_u32_u8(dst01_u8), 1);394vst1_lane_u32((uint32_t*)(dst + 2 * BPS), vreinterpret_u32_u8(dst23_u8), 0);395vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1);396}397398static WEBP_INLINE void Add4x4_NEON(const int16x8_t row01,399const int16x8_t row23,400uint8_t* const dst) {401uint32x2_t dst01 = vdup_n_u32(0);402uint32x2_t dst23 = vdup_n_u32(0);403404// Load the source pixels.405dst01 = vld1_lane_u32((uint32_t*)(dst + 0 * BPS), dst01, 0);406dst23 = vld1_lane_u32((uint32_t*)(dst + 2 * BPS), dst23, 0);407dst01 = vld1_lane_u32((uint32_t*)(dst + 1 * BPS), dst01, 1);408dst23 = vld1_lane_u32((uint32_t*)(dst + 3 * BPS), dst23, 1);409410{411// Convert to 16b.412const int16x8_t dst01_s16 = ConvertU8ToS16_NEON(vreinterpret_u8_u32(dst01));413const int16x8_t dst23_s16 = ConvertU8ToS16_NEON(vreinterpret_u8_u32(dst23));414415// Descale with rounding.416const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);417const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3);418// Add the inverse transform.419SaturateAndStore4x4_NEON(dst, out01, out23);420}421}422423//-----------------------------------------------------------------------------424// Simple In-loop filtering (Paragraph 15.2)425426static uint8x16_t NeedsFilter_NEON(const uint8x16_t p1, const uint8x16_t p0,427const uint8x16_t q0, const uint8x16_t q1,428int thresh) {429const uint8x16_t thresh_v = vdupq_n_u8((uint8_t)thresh);430const uint8x16_t a_p0_q0 = vabdq_u8(p0, q0); // abs(p0-q0)431const uint8x16_t a_p1_q1 = vabdq_u8(p1, q1); // abs(p1-q1)432const uint8x16_t a_p0_q0_2 = vqaddq_u8(a_p0_q0, a_p0_q0); // 2 * abs(p0-q0)433const uint8x16_t a_p1_q1_2 = vshrq_n_u8(a_p1_q1, 1); // abs(p1-q1) / 2434const uint8x16_t sum = vqaddq_u8(a_p0_q0_2, a_p1_q1_2);435const uint8x16_t mask = vcgeq_u8(thresh_v, sum);436return mask;437}438439static int8x16_t FlipSign_NEON(const uint8x16_t v) {440const uint8x16_t sign_bit = vdupq_n_u8(0x80);441return vreinterpretq_s8_u8(veorq_u8(v, sign_bit));442}443444static uint8x16_t FlipSignBack_NEON(const int8x16_t v) {445const int8x16_t sign_bit = vdupq_n_s8(0x80);446return vreinterpretq_u8_s8(veorq_s8(v, sign_bit));447}448449static int8x16_t GetBaseDelta_NEON(const int8x16_t p1, const int8x16_t p0,450const int8x16_t q0, const int8x16_t q1) {451const int8x16_t q0_p0 = vqsubq_s8(q0, p0); // (q0-p0)452const int8x16_t p1_q1 = vqsubq_s8(p1, q1); // (p1-q1)453const int8x16_t s1 = vqaddq_s8(p1_q1, q0_p0); // (p1-q1) + 1 * (q0 - p0)454const int8x16_t s2 = vqaddq_s8(q0_p0, s1); // (p1-q1) + 2 * (q0 - p0)455const int8x16_t s3 = vqaddq_s8(q0_p0, s2); // (p1-q1) + 3 * (q0 - p0)456return s3;457}458459static int8x16_t GetBaseDelta0_NEON(const int8x16_t p0, const int8x16_t q0) {460const int8x16_t q0_p0 = vqsubq_s8(q0, p0); // (q0-p0)461const int8x16_t s1 = vqaddq_s8(q0_p0, q0_p0); // 2 * (q0 - p0)462const int8x16_t s2 = vqaddq_s8(q0_p0, s1); // 3 * (q0 - p0)463return s2;464}465466//------------------------------------------------------------------------------467468static void ApplyFilter2NoFlip_NEON(const int8x16_t p0s, const int8x16_t q0s,469const int8x16_t delta,470int8x16_t* const op0,471int8x16_t* const oq0) {472const int8x16_t kCst3 = vdupq_n_s8(0x03);473const int8x16_t kCst4 = vdupq_n_s8(0x04);474const int8x16_t delta_p3 = vqaddq_s8(delta, kCst3);475const int8x16_t delta_p4 = vqaddq_s8(delta, kCst4);476const int8x16_t delta3 = vshrq_n_s8(delta_p3, 3);477const int8x16_t delta4 = vshrq_n_s8(delta_p4, 3);478*op0 = vqaddq_s8(p0s, delta3);479*oq0 = vqsubq_s8(q0s, delta4);480}481482#if defined(WEBP_USE_INTRINSICS)483484static void ApplyFilter2_NEON(const int8x16_t p0s, const int8x16_t q0s,485const int8x16_t delta,486uint8x16_t* const op0, uint8x16_t* const oq0) {487const int8x16_t kCst3 = vdupq_n_s8(0x03);488const int8x16_t kCst4 = vdupq_n_s8(0x04);489const int8x16_t delta_p3 = vqaddq_s8(delta, kCst3);490const int8x16_t delta_p4 = vqaddq_s8(delta, kCst4);491const int8x16_t delta3 = vshrq_n_s8(delta_p3, 3);492const int8x16_t delta4 = vshrq_n_s8(delta_p4, 3);493const int8x16_t sp0 = vqaddq_s8(p0s, delta3);494const int8x16_t sq0 = vqsubq_s8(q0s, delta4);495*op0 = FlipSignBack_NEON(sp0);496*oq0 = FlipSignBack_NEON(sq0);497}498499static void DoFilter2_NEON(const uint8x16_t p1, const uint8x16_t p0,500const uint8x16_t q0, const uint8x16_t q1,501const uint8x16_t mask,502uint8x16_t* const op0, uint8x16_t* const oq0) {503const int8x16_t p1s = FlipSign_NEON(p1);504const int8x16_t p0s = FlipSign_NEON(p0);505const int8x16_t q0s = FlipSign_NEON(q0);506const int8x16_t q1s = FlipSign_NEON(q1);507const int8x16_t delta0 = GetBaseDelta_NEON(p1s, p0s, q0s, q1s);508const int8x16_t delta1 = vandq_s8(delta0, vreinterpretq_s8_u8(mask));509ApplyFilter2_NEON(p0s, q0s, delta1, op0, oq0);510}511512static void SimpleVFilter16_NEON(uint8_t* p, int stride, int thresh) {513uint8x16_t p1, p0, q0, q1, op0, oq0;514Load16x4_NEON(p, stride, &p1, &p0, &q0, &q1);515{516const uint8x16_t mask = NeedsFilter_NEON(p1, p0, q0, q1, thresh);517DoFilter2_NEON(p1, p0, q0, q1, mask, &op0, &oq0);518}519Store16x2_NEON(op0, oq0, p, stride);520}521522static void SimpleHFilter16_NEON(uint8_t* p, int stride, int thresh) {523uint8x16_t p1, p0, q0, q1, oq0, op0;524Load4x16_NEON(p, stride, &p1, &p0, &q0, &q1);525{526const uint8x16_t mask = NeedsFilter_NEON(p1, p0, q0, q1, thresh);527DoFilter2_NEON(p1, p0, q0, q1, mask, &op0, &oq0);528}529Store2x16_NEON(op0, oq0, p, stride);530}531532#else533534// Load/Store vertical edge535#define LOAD8x4(c1, c2, c3, c4, b1, b2, stride) \536"vld4.8 {" #c1 "[0]," #c2 "[0]," #c3 "[0]," #c4 "[0]}," #b1 "," #stride "\n" \537"vld4.8 {" #c1 "[1]," #c2 "[1]," #c3 "[1]," #c4 "[1]}," #b2 "," #stride "\n" \538"vld4.8 {" #c1 "[2]," #c2 "[2]," #c3 "[2]," #c4 "[2]}," #b1 "," #stride "\n" \539"vld4.8 {" #c1 "[3]," #c2 "[3]," #c3 "[3]," #c4 "[3]}," #b2 "," #stride "\n" \540"vld4.8 {" #c1 "[4]," #c2 "[4]," #c3 "[4]," #c4 "[4]}," #b1 "," #stride "\n" \541"vld4.8 {" #c1 "[5]," #c2 "[5]," #c3 "[5]," #c4 "[5]}," #b2 "," #stride "\n" \542"vld4.8 {" #c1 "[6]," #c2 "[6]," #c3 "[6]," #c4 "[6]}," #b1 "," #stride "\n" \543"vld4.8 {" #c1 "[7]," #c2 "[7]," #c3 "[7]," #c4 "[7]}," #b2 "," #stride "\n"544545#define STORE8x2(c1, c2, p, stride) \546"vst2.8 {" #c1 "[0], " #c2 "[0]}," #p "," #stride " \n" \547"vst2.8 {" #c1 "[1], " #c2 "[1]}," #p "," #stride " \n" \548"vst2.8 {" #c1 "[2], " #c2 "[2]}," #p "," #stride " \n" \549"vst2.8 {" #c1 "[3], " #c2 "[3]}," #p "," #stride " \n" \550"vst2.8 {" #c1 "[4], " #c2 "[4]}," #p "," #stride " \n" \551"vst2.8 {" #c1 "[5], " #c2 "[5]}," #p "," #stride " \n" \552"vst2.8 {" #c1 "[6], " #c2 "[6]}," #p "," #stride " \n" \553"vst2.8 {" #c1 "[7], " #c2 "[7]}," #p "," #stride " \n"554555#define QRegs "q0", "q1", "q2", "q3", \556"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"557558#define FLIP_SIGN_BIT2(a, b, s) \559"veor " #a "," #a "," #s " \n" \560"veor " #b "," #b "," #s " \n" \561562#define FLIP_SIGN_BIT4(a, b, c, d, s) \563FLIP_SIGN_BIT2(a, b, s) \564FLIP_SIGN_BIT2(c, d, s) \565566#define NEEDS_FILTER(p1, p0, q0, q1, thresh, mask) \567"vabd.u8 q15," #p0 "," #q0 " \n" /* abs(p0 - q0) */ \568"vabd.u8 q14," #p1 "," #q1 " \n" /* abs(p1 - q1) */ \569"vqadd.u8 q15, q15, q15 \n" /* abs(p0 - q0) * 2 */ \570"vshr.u8 q14, q14, #1 \n" /* abs(p1 - q1) / 2 */ \571"vqadd.u8 q15, q15, q14 \n" /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ \572"vdup.8 q14, " #thresh " \n" \573"vcge.u8 " #mask ", q14, q15 \n" /* mask <= thresh */574575#define GET_BASE_DELTA(p1, p0, q0, q1, o) \576"vqsub.s8 q15," #q0 "," #p0 " \n" /* (q0 - p0) */ \577"vqsub.s8 " #o "," #p1 "," #q1 " \n" /* (p1 - q1) */ \578"vqadd.s8 " #o "," #o ", q15 \n" /* (p1 - q1) + 1 * (p0 - q0) */ \579"vqadd.s8 " #o "," #o ", q15 \n" /* (p1 - q1) + 2 * (p0 - q0) */ \580"vqadd.s8 " #o "," #o ", q15 \n" /* (p1 - q1) + 3 * (p0 - q0) */581582#define DO_SIMPLE_FILTER(p0, q0, fl) \583"vmov.i8 q15, #0x03 \n" \584"vqadd.s8 q15, q15, " #fl " \n" /* filter1 = filter + 3 */ \585"vshr.s8 q15, q15, #3 \n" /* filter1 >> 3 */ \586"vqadd.s8 " #p0 "," #p0 ", q15 \n" /* p0 += filter1 */ \587\588"vmov.i8 q15, #0x04 \n" \589"vqadd.s8 q15, q15, " #fl " \n" /* filter1 = filter + 4 */ \590"vshr.s8 q15, q15, #3 \n" /* filter2 >> 3 */ \591"vqsub.s8 " #q0 "," #q0 ", q15 \n" /* q0 -= filter2 */592593// Applies filter on 2 pixels (p0 and q0)594#define DO_FILTER2(p1, p0, q0, q1, thresh) \595NEEDS_FILTER(p1, p0, q0, q1, thresh, q9) /* filter mask in q9 */ \596"vmov.i8 q10, #0x80 \n" /* sign bit */ \597FLIP_SIGN_BIT4(p1, p0, q0, q1, q10) /* convert to signed value */ \598GET_BASE_DELTA(p1, p0, q0, q1, q11) /* get filter level */ \599"vand q9, q9, q11 \n" /* apply filter mask */ \600DO_SIMPLE_FILTER(p0, q0, q9) /* apply filter */ \601FLIP_SIGN_BIT2(p0, q0, q10)602603static void SimpleVFilter16_NEON(uint8_t* p, int stride, int thresh) {604__asm__ volatile (605"sub %[p], %[p], %[stride], lsl #1 \n" // p -= 2 * stride606607"vld1.u8 {q1}, [%[p]], %[stride] \n" // p1608"vld1.u8 {q2}, [%[p]], %[stride] \n" // p0609"vld1.u8 {q3}, [%[p]], %[stride] \n" // q0610"vld1.u8 {q12}, [%[p]] \n" // q1611612DO_FILTER2(q1, q2, q3, q12, %[thresh])613614"sub %[p], %[p], %[stride], lsl #1 \n" // p -= 2 * stride615616"vst1.u8 {q2}, [%[p]], %[stride] \n" // store op0617"vst1.u8 {q3}, [%[p]] \n" // store oq0618: [p] "+r"(p)619: [stride] "r"(stride), [thresh] "r"(thresh)620: "memory", QRegs621);622}623624static void SimpleHFilter16_NEON(uint8_t* p, int stride, int thresh) {625__asm__ volatile (626"sub r4, %[p], #2 \n" // base1 = p - 2627"lsl r6, %[stride], #1 \n" // r6 = 2 * stride628"add r5, r4, %[stride] \n" // base2 = base1 + stride629630LOAD8x4(d2, d3, d4, d5, [r4], [r5], r6)631LOAD8x4(d24, d25, d26, d27, [r4], [r5], r6)632"vswp d3, d24 \n" // p1:q1 p0:q3633"vswp d5, d26 \n" // q0:q2 q1:q4634"vswp q2, q12 \n" // p1:q1 p0:q2 q0:q3 q1:q4635636DO_FILTER2(q1, q2, q12, q13, %[thresh])637638"sub %[p], %[p], #1 \n" // p - 1639640"vswp d5, d24 \n"641STORE8x2(d4, d5, [%[p]], %[stride])642STORE8x2(d24, d25, [%[p]], %[stride])643644: [p] "+r"(p)645: [stride] "r"(stride), [thresh] "r"(thresh)646: "memory", "r4", "r5", "r6", QRegs647);648}649650#undef LOAD8x4651#undef STORE8x2652653#endif // WEBP_USE_INTRINSICS654655static void SimpleVFilter16i_NEON(uint8_t* p, int stride, int thresh) {656uint32_t k;657for (k = 3; k != 0; --k) {658p += 4 * stride;659SimpleVFilter16_NEON(p, stride, thresh);660}661}662663static void SimpleHFilter16i_NEON(uint8_t* p, int stride, int thresh) {664uint32_t k;665for (k = 3; k != 0; --k) {666p += 4;667SimpleHFilter16_NEON(p, stride, thresh);668}669}670671//------------------------------------------------------------------------------672// Complex In-loop filtering (Paragraph 15.3)673674static uint8x16_t NeedsHev_NEON(const uint8x16_t p1, const uint8x16_t p0,675const uint8x16_t q0, const uint8x16_t q1,676int hev_thresh) {677const uint8x16_t hev_thresh_v = vdupq_n_u8((uint8_t)hev_thresh);678const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0); // abs(p1 - p0)679const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0); // abs(q1 - q0)680const uint8x16_t a_max = vmaxq_u8(a_p1_p0, a_q1_q0);681const uint8x16_t mask = vcgtq_u8(a_max, hev_thresh_v);682return mask;683}684685static uint8x16_t NeedsFilter2_NEON(const uint8x16_t p3, const uint8x16_t p2,686const uint8x16_t p1, const uint8x16_t p0,687const uint8x16_t q0, const uint8x16_t q1,688const uint8x16_t q2, const uint8x16_t q3,689int ithresh, int thresh) {690const uint8x16_t ithresh_v = vdupq_n_u8((uint8_t)ithresh);691const uint8x16_t a_p3_p2 = vabdq_u8(p3, p2); // abs(p3 - p2)692const uint8x16_t a_p2_p1 = vabdq_u8(p2, p1); // abs(p2 - p1)693const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0); // abs(p1 - p0)694const uint8x16_t a_q3_q2 = vabdq_u8(q3, q2); // abs(q3 - q2)695const uint8x16_t a_q2_q1 = vabdq_u8(q2, q1); // abs(q2 - q1)696const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0); // abs(q1 - q0)697const uint8x16_t max1 = vmaxq_u8(a_p3_p2, a_p2_p1);698const uint8x16_t max2 = vmaxq_u8(a_p1_p0, a_q3_q2);699const uint8x16_t max3 = vmaxq_u8(a_q2_q1, a_q1_q0);700const uint8x16_t max12 = vmaxq_u8(max1, max2);701const uint8x16_t max123 = vmaxq_u8(max12, max3);702const uint8x16_t mask2 = vcgeq_u8(ithresh_v, max123);703const uint8x16_t mask1 = NeedsFilter_NEON(p1, p0, q0, q1, thresh);704const uint8x16_t mask = vandq_u8(mask1, mask2);705return mask;706}707708// 4-points filter709710static void ApplyFilter4_NEON(711const int8x16_t p1, const int8x16_t p0,712const int8x16_t q0, const int8x16_t q1,713const int8x16_t delta0,714uint8x16_t* const op1, uint8x16_t* const op0,715uint8x16_t* const oq0, uint8x16_t* const oq1) {716const int8x16_t kCst3 = vdupq_n_s8(0x03);717const int8x16_t kCst4 = vdupq_n_s8(0x04);718const int8x16_t delta1 = vqaddq_s8(delta0, kCst4);719const int8x16_t delta2 = vqaddq_s8(delta0, kCst3);720const int8x16_t a1 = vshrq_n_s8(delta1, 3);721const int8x16_t a2 = vshrq_n_s8(delta2, 3);722const int8x16_t a3 = vrshrq_n_s8(a1, 1); // a3 = (a1 + 1) >> 1723*op0 = FlipSignBack_NEON(vqaddq_s8(p0, a2)); // clip(p0 + a2)724*oq0 = FlipSignBack_NEON(vqsubq_s8(q0, a1)); // clip(q0 - a1)725*op1 = FlipSignBack_NEON(vqaddq_s8(p1, a3)); // clip(p1 + a3)726*oq1 = FlipSignBack_NEON(vqsubq_s8(q1, a3)); // clip(q1 - a3)727}728729static void DoFilter4_NEON(730const uint8x16_t p1, const uint8x16_t p0,731const uint8x16_t q0, const uint8x16_t q1,732const uint8x16_t mask, const uint8x16_t hev_mask,733uint8x16_t* const op1, uint8x16_t* const op0,734uint8x16_t* const oq0, uint8x16_t* const oq1) {735// This is a fused version of DoFilter2() calling ApplyFilter2 directly736const int8x16_t p1s = FlipSign_NEON(p1);737int8x16_t p0s = FlipSign_NEON(p0);738int8x16_t q0s = FlipSign_NEON(q0);739const int8x16_t q1s = FlipSign_NEON(q1);740const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask);741742// do_filter2 part (simple loopfilter on pixels with hev)743{744const int8x16_t delta = GetBaseDelta_NEON(p1s, p0s, q0s, q1s);745const int8x16_t simple_lf_delta =746vandq_s8(delta, vreinterpretq_s8_u8(simple_lf_mask));747ApplyFilter2NoFlip_NEON(p0s, q0s, simple_lf_delta, &p0s, &q0s);748}749750// do_filter4 part (complex loopfilter on pixels without hev)751{752const int8x16_t delta0 = GetBaseDelta0_NEON(p0s, q0s);753// we use: (mask & hev_mask) ^ mask = mask & !hev_mask754const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask);755const int8x16_t complex_lf_delta =756vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask));757ApplyFilter4_NEON(p1s, p0s, q0s, q1s, complex_lf_delta, op1, op0, oq0, oq1);758}759}760761// 6-points filter762763static void ApplyFilter6_NEON(764const int8x16_t p2, const int8x16_t p1, const int8x16_t p0,765const int8x16_t q0, const int8x16_t q1, const int8x16_t q2,766const int8x16_t delta,767uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0,768uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) {769// We have to compute: X = (9*a+63) >> 7, Y = (18*a+63)>>7, Z = (27*a+63) >> 7770// Turns out, there's a common sub-expression S=9 * a - 1 that can be used771// with the special vqrshrn_n_s16 rounding-shift-and-narrow instruction:772// X = (S + 64) >> 7, Y = (S + 32) >> 6, Z = (18 * a + S + 64) >> 7773const int8x8_t delta_lo = vget_low_s8(delta);774const int8x8_t delta_hi = vget_high_s8(delta);775const int8x8_t kCst9 = vdup_n_s8(9);776const int16x8_t kCstm1 = vdupq_n_s16(-1);777const int8x8_t kCst18 = vdup_n_s8(18);778const int16x8_t S_lo = vmlal_s8(kCstm1, kCst9, delta_lo); // S = 9 * a - 1779const int16x8_t S_hi = vmlal_s8(kCstm1, kCst9, delta_hi);780const int16x8_t Z_lo = vmlal_s8(S_lo, kCst18, delta_lo); // S + 18 * a781const int16x8_t Z_hi = vmlal_s8(S_hi, kCst18, delta_hi);782const int8x8_t a3_lo = vqrshrn_n_s16(S_lo, 7); // (9 * a + 63) >> 7783const int8x8_t a3_hi = vqrshrn_n_s16(S_hi, 7);784const int8x8_t a2_lo = vqrshrn_n_s16(S_lo, 6); // (9 * a + 31) >> 6785const int8x8_t a2_hi = vqrshrn_n_s16(S_hi, 6);786const int8x8_t a1_lo = vqrshrn_n_s16(Z_lo, 7); // (27 * a + 63) >> 7787const int8x8_t a1_hi = vqrshrn_n_s16(Z_hi, 7);788const int8x16_t a1 = vcombine_s8(a1_lo, a1_hi);789const int8x16_t a2 = vcombine_s8(a2_lo, a2_hi);790const int8x16_t a3 = vcombine_s8(a3_lo, a3_hi);791792*op0 = FlipSignBack_NEON(vqaddq_s8(p0, a1)); // clip(p0 + a1)793*oq0 = FlipSignBack_NEON(vqsubq_s8(q0, a1)); // clip(q0 - q1)794*oq1 = FlipSignBack_NEON(vqsubq_s8(q1, a2)); // clip(q1 - a2)795*op1 = FlipSignBack_NEON(vqaddq_s8(p1, a2)); // clip(p1 + a2)796*oq2 = FlipSignBack_NEON(vqsubq_s8(q2, a3)); // clip(q2 - a3)797*op2 = FlipSignBack_NEON(vqaddq_s8(p2, a3)); // clip(p2 + a3)798}799800static void DoFilter6_NEON(801const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0,802const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2,803const uint8x16_t mask, const uint8x16_t hev_mask,804uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0,805uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) {806// This is a fused version of DoFilter2() calling ApplyFilter2 directly807const int8x16_t p2s = FlipSign_NEON(p2);808const int8x16_t p1s = FlipSign_NEON(p1);809int8x16_t p0s = FlipSign_NEON(p0);810int8x16_t q0s = FlipSign_NEON(q0);811const int8x16_t q1s = FlipSign_NEON(q1);812const int8x16_t q2s = FlipSign_NEON(q2);813const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask);814const int8x16_t delta0 = GetBaseDelta_NEON(p1s, p0s, q0s, q1s);815816// do_filter2 part (simple loopfilter on pixels with hev)817{818const int8x16_t simple_lf_delta =819vandq_s8(delta0, vreinterpretq_s8_u8(simple_lf_mask));820ApplyFilter2NoFlip_NEON(p0s, q0s, simple_lf_delta, &p0s, &q0s);821}822823// do_filter6 part (complex loopfilter on pixels without hev)824{825// we use: (mask & hev_mask) ^ mask = mask & !hev_mask826const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask);827const int8x16_t complex_lf_delta =828vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask));829ApplyFilter6_NEON(p2s, p1s, p0s, q0s, q1s, q2s, complex_lf_delta,830op2, op1, op0, oq0, oq1, oq2);831}832}833834// on macroblock edges835836static void VFilter16_NEON(uint8_t* p, int stride,837int thresh, int ithresh, int hev_thresh) {838uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;839Load16x8_NEON(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);840{841const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,842ithresh, thresh);843const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);844uint8x16_t op2, op1, op0, oq0, oq1, oq2;845DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,846&op2, &op1, &op0, &oq0, &oq1, &oq2);847Store16x2_NEON(op2, op1, p - 2 * stride, stride);848Store16x2_NEON(op0, oq0, p + 0 * stride, stride);849Store16x2_NEON(oq1, oq2, p + 2 * stride, stride);850}851}852853static void HFilter16_NEON(uint8_t* p, int stride,854int thresh, int ithresh, int hev_thresh) {855uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;856Load8x16_NEON(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);857{858const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,859ithresh, thresh);860const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);861uint8x16_t op2, op1, op0, oq0, oq1, oq2;862DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,863&op2, &op1, &op0, &oq0, &oq1, &oq2);864Store2x16_NEON(op2, op1, p - 2, stride);865Store2x16_NEON(op0, oq0, p + 0, stride);866Store2x16_NEON(oq1, oq2, p + 2, stride);867}868}869870// on three inner edges871static void VFilter16i_NEON(uint8_t* p, int stride,872int thresh, int ithresh, int hev_thresh) {873uint32_t k;874uint8x16_t p3, p2, p1, p0;875Load16x4_NEON(p + 2 * stride, stride, &p3, &p2, &p1, &p0);876for (k = 3; k != 0; --k) {877uint8x16_t q0, q1, q2, q3;878p += 4 * stride;879Load16x4_NEON(p + 2 * stride, stride, &q0, &q1, &q2, &q3);880{881const uint8x16_t mask =882NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);883const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);884// p3 and p2 are not just temporary variables here: they will be885// re-used for next span. And q2/q3 will become p1/p0 accordingly.886DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);887Store16x4_NEON(p1, p0, p3, p2, p, stride);888p1 = q2;889p0 = q3;890}891}892}893894#if !defined(WORK_AROUND_GCC)895static void HFilter16i_NEON(uint8_t* p, int stride,896int thresh, int ithresh, int hev_thresh) {897uint32_t k;898uint8x16_t p3, p2, p1, p0;899Load4x16_NEON(p + 2, stride, &p3, &p2, &p1, &p0);900for (k = 3; k != 0; --k) {901uint8x16_t q0, q1, q2, q3;902p += 4;903Load4x16_NEON(p + 2, stride, &q0, &q1, &q2, &q3);904{905const uint8x16_t mask =906NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);907const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);908DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);909Store4x16_NEON(p1, p0, p3, p2, p, stride);910p1 = q2;911p0 = q3;912}913}914}915#endif // !WORK_AROUND_GCC916917// 8-pixels wide variant, for chroma filtering918static void VFilter8_NEON(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,919int stride, int thresh, int ithresh, int hev_thresh) {920uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;921Load8x8x2_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);922{923const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,924ithresh, thresh);925const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);926uint8x16_t op2, op1, op0, oq0, oq1, oq2;927DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,928&op2, &op1, &op0, &oq0, &oq1, &oq2);929Store8x2x2_NEON(op2, op1, u - 2 * stride, v - 2 * stride, stride);930Store8x2x2_NEON(op0, oq0, u + 0 * stride, v + 0 * stride, stride);931Store8x2x2_NEON(oq1, oq2, u + 2 * stride, v + 2 * stride, stride);932}933}934static void VFilter8i_NEON(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,935int stride,936int thresh, int ithresh, int hev_thresh) {937uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;938u += 4 * stride;939v += 4 * stride;940Load8x8x2_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);941{942const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,943ithresh, thresh);944const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);945uint8x16_t op1, op0, oq0, oq1;946DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);947Store8x4x2_NEON(op1, op0, oq0, oq1, u, v, stride);948}949}950951#if !defined(WORK_AROUND_GCC)952static void HFilter8_NEON(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,953int stride, int thresh, int ithresh, int hev_thresh) {954uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;955Load8x8x2T_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);956{957const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,958ithresh, thresh);959const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);960uint8x16_t op2, op1, op0, oq0, oq1, oq2;961DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,962&op2, &op1, &op0, &oq0, &oq1, &oq2);963Store6x8x2_NEON(op2, op1, op0, oq0, oq1, oq2, u, v, stride);964}965}966967static void HFilter8i_NEON(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,968int stride,969int thresh, int ithresh, int hev_thresh) {970uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;971u += 4;972v += 4;973Load8x8x2T_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);974{975const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,976ithresh, thresh);977const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);978uint8x16_t op1, op0, oq0, oq1;979DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);980Store4x8x2_NEON(op1, op0, oq0, oq1, u, v, stride);981}982}983#endif // !WORK_AROUND_GCC984985//-----------------------------------------------------------------------------986// Inverse transforms (Paragraph 14.4)987988// Technically these are unsigned but vqdmulh is only available in signed.989// vqdmulh returns high half (effectively >> 16) but also doubles the value,990// changing the >> 16 to >> 15 and requiring an additional >> 1.991// We use this to our advantage with kC2. The canonical value is 35468.992// However, the high bit is set so treating it as signed will give incorrect993// results. We avoid this by down shifting by 1 here to clear the highest bit.994// Combined with the doubling effect of vqdmulh we get >> 16.995// This can not be applied to kC1 because the lowest bit is set. Down shifting996// the constant would reduce precision.997998// libwebp uses a trick to avoid some extra addition that libvpx does.999// Instead of:1000// temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16);1001// libwebp adds 1 << 16 to cospi8sqrt2minus1 (kC1). However, this causes the1002// same issue with kC1 and vqdmulh that we work around by down shifting kC210031004static const int16_t kC1 = WEBP_TRANSFORM_AC3_C1;1005static const int16_t kC2 =1006WEBP_TRANSFORM_AC3_C2 / 2; // half of kC2, actually. See comment above.10071008#if defined(WEBP_USE_INTRINSICS)1009static WEBP_INLINE void Transpose8x2_NEON(const int16x8_t in0,1010const int16x8_t in1,1011int16x8x2_t* const out) {1012// a0 a1 a2 a3 | b0 b1 b2 b3 => a0 b0 c0 d0 | a1 b1 c1 d11013// c0 c1 c2 c3 | d0 d1 d2 d3 a2 b2 c2 d2 | a3 b3 c3 d31014const int16x8x2_t tmp0 = vzipq_s16(in0, in1); // a0 c0 a1 c1 a2 c2 ...1015// b0 d0 b1 d1 b2 d2 ...1016*out = vzipq_s16(tmp0.val[0], tmp0.val[1]);1017}10181019static WEBP_INLINE void TransformPass_NEON(int16x8x2_t* const rows) {1020// {rows} = in0 | in41021// in8 | in121022// B1 = in4 | in121023const int16x8_t B1 =1024vcombine_s16(vget_high_s16(rows->val[0]), vget_high_s16(rows->val[1]));1025// C0 = kC1 * in4 | kC1 * in121026// C1 = kC2 * in4 | kC2 * in121027const int16x8_t C0 = vsraq_n_s16(B1, vqdmulhq_n_s16(B1, kC1), 1);1028const int16x8_t C1 = vqdmulhq_n_s16(B1, kC2);1029const int16x4_t a = vqadd_s16(vget_low_s16(rows->val[0]),1030vget_low_s16(rows->val[1])); // in0 + in81031const int16x4_t b = vqsub_s16(vget_low_s16(rows->val[0]),1032vget_low_s16(rows->val[1])); // in0 - in81033// c = kC2 * in4 - kC1 * in121034// d = kC1 * in4 + kC2 * in121035const int16x4_t c = vqsub_s16(vget_low_s16(C1), vget_high_s16(C0));1036const int16x4_t d = vqadd_s16(vget_low_s16(C0), vget_high_s16(C1));1037const int16x8_t D0 = vcombine_s16(a, b); // D0 = a | b1038const int16x8_t D1 = vcombine_s16(d, c); // D1 = d | c1039const int16x8_t E0 = vqaddq_s16(D0, D1); // a+d | b+c1040const int16x8_t E_tmp = vqsubq_s16(D0, D1); // a-d | b-c1041const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp));1042Transpose8x2_NEON(E0, E1, rows);1043}10441045static void TransformOne_NEON(const int16_t* WEBP_RESTRICT in,1046uint8_t* WEBP_RESTRICT dst) {1047int16x8x2_t rows;1048INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));1049TransformPass_NEON(&rows);1050TransformPass_NEON(&rows);1051Add4x4_NEON(rows.val[0], rows.val[1], dst);1052}10531054#else10551056static void TransformOne_NEON(const int16_t* WEBP_RESTRICT in,1057uint8_t* WEBP_RESTRICT dst) {1058const int kBPS = BPS;1059// kC1, kC2. Padded because vld1.16 loads 8 bytes1060const int16_t constants[4] = { kC1, kC2, 0, 0 };1061/* Adapted from libvpx: vp8/common/arm/neon/shortidct4x4llm_neon.asm */1062__asm__ volatile (1063"vld1.16 {q1, q2}, [%[in]] \n"1064"vld1.16 {d0}, [%[constants]] \n"10651066/* d2: in[0]1067* d3: in[8]1068* d4: in[4]1069* d5: in[12]1070*/1071"vswp d3, d4 \n"10721073/* q8 = {in[4], in[12]} * kC1 * 2 >> 161074* q9 = {in[4], in[12]} * kC2 >> 161075*/1076"vqdmulh.s16 q8, q2, d0[0] \n"1077"vqdmulh.s16 q9, q2, d0[1] \n"10781079/* d22 = a = in[0] + in[8]1080* d23 = b = in[0] - in[8]1081*/1082"vqadd.s16 d22, d2, d3 \n"1083"vqsub.s16 d23, d2, d3 \n"10841085/* The multiplication should be x * kC1 >> 161086* However, with vqdmulh we get x * kC1 * 2 >> 161087* (multiply, double, return high half)1088* We avoided this in kC2 by pre-shifting the constant.1089* q8 = in[4]/[12] * kC1 >> 161090*/1091"vshr.s16 q8, q8, #1 \n"10921093/* Add {in[4], in[12]} back after the multiplication. This is handled by1094* adding 1 << 16 to kC1 in the libwebp C code.1095*/1096"vqadd.s16 q8, q2, q8 \n"10971098/* d20 = c = in[4]*kC2 - in[12]*kC11099* d21 = d = in[4]*kC1 + in[12]*kC21100*/1101"vqsub.s16 d20, d18, d17 \n"1102"vqadd.s16 d21, d19, d16 \n"11031104/* d2 = tmp[0] = a + d1105* d3 = tmp[1] = b + c1106* d4 = tmp[2] = b - c1107* d5 = tmp[3] = a - d1108*/1109"vqadd.s16 d2, d22, d21 \n"1110"vqadd.s16 d3, d23, d20 \n"1111"vqsub.s16 d4, d23, d20 \n"1112"vqsub.s16 d5, d22, d21 \n"11131114"vzip.16 q1, q2 \n"1115"vzip.16 q1, q2 \n"11161117"vswp d3, d4 \n"11181119/* q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 161120* q9 = {tmp[4], tmp[12]} * kC2 >> 161121*/1122"vqdmulh.s16 q8, q2, d0[0] \n"1123"vqdmulh.s16 q9, q2, d0[1] \n"11241125/* d22 = a = tmp[0] + tmp[8]1126* d23 = b = tmp[0] - tmp[8]1127*/1128"vqadd.s16 d22, d2, d3 \n"1129"vqsub.s16 d23, d2, d3 \n"11301131/* See long winded explanations prior */1132"vshr.s16 q8, q8, #1 \n"1133"vqadd.s16 q8, q2, q8 \n"11341135/* d20 = c = in[4]*kC2 - in[12]*kC11136* d21 = d = in[4]*kC1 + in[12]*kC21137*/1138"vqsub.s16 d20, d18, d17 \n"1139"vqadd.s16 d21, d19, d16 \n"11401141/* d2 = tmp[0] = a + d1142* d3 = tmp[1] = b + c1143* d4 = tmp[2] = b - c1144* d5 = tmp[3] = a - d1145*/1146"vqadd.s16 d2, d22, d21 \n"1147"vqadd.s16 d3, d23, d20 \n"1148"vqsub.s16 d4, d23, d20 \n"1149"vqsub.s16 d5, d22, d21 \n"11501151"vld1.32 d6[0], [%[dst]], %[kBPS] \n"1152"vld1.32 d6[1], [%[dst]], %[kBPS] \n"1153"vld1.32 d7[0], [%[dst]], %[kBPS] \n"1154"vld1.32 d7[1], [%[dst]], %[kBPS] \n"11551156"sub %[dst], %[dst], %[kBPS], lsl #2 \n"11571158/* (val) + 4 >> 3 */1159"vrshr.s16 d2, d2, #3 \n"1160"vrshr.s16 d3, d3, #3 \n"1161"vrshr.s16 d4, d4, #3 \n"1162"vrshr.s16 d5, d5, #3 \n"11631164"vzip.16 q1, q2 \n"1165"vzip.16 q1, q2 \n"11661167/* Must accumulate before saturating */1168"vmovl.u8 q8, d6 \n"1169"vmovl.u8 q9, d7 \n"11701171"vqadd.s16 q1, q1, q8 \n"1172"vqadd.s16 q2, q2, q9 \n"11731174"vqmovun.s16 d0, q1 \n"1175"vqmovun.s16 d1, q2 \n"11761177"vst1.32 d0[0], [%[dst]], %[kBPS] \n"1178"vst1.32 d0[1], [%[dst]], %[kBPS] \n"1179"vst1.32 d1[0], [%[dst]], %[kBPS] \n"1180"vst1.32 d1[1], [%[dst]] \n"11811182: [in] "+r"(in), [dst] "+r"(dst) /* modified registers */1183: [kBPS] "r"(kBPS), [constants] "r"(constants) /* constants */1184: "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11" /* clobbered */1185);1186}11871188#endif // WEBP_USE_INTRINSICS11891190static void TransformTwo_NEON(const int16_t* WEBP_RESTRICT in,1191uint8_t* WEBP_RESTRICT dst, int do_two) {1192TransformOne_NEON(in, dst);1193if (do_two) {1194TransformOne_NEON(in + 16, dst + 4);1195}1196}11971198static void TransformDC_NEON(const int16_t* WEBP_RESTRICT in,1199uint8_t* WEBP_RESTRICT dst) {1200const int16x8_t DC = vdupq_n_s16(in[0]);1201Add4x4_NEON(DC, DC, dst);1202}12031204//------------------------------------------------------------------------------12051206#define STORE_WHT(dst, col, rows) do { \1207*dst = vgetq_lane_s32(rows.val[0], col); (dst) += 16; \1208*dst = vgetq_lane_s32(rows.val[1], col); (dst) += 16; \1209*dst = vgetq_lane_s32(rows.val[2], col); (dst) += 16; \1210*dst = vgetq_lane_s32(rows.val[3], col); (dst) += 16; \1211} while (0)12121213static void TransformWHT_NEON(const int16_t* WEBP_RESTRICT in,1214int16_t* WEBP_RESTRICT out) {1215int32x4x4_t tmp;12161217{1218// Load the source.1219const int16x4_t in00_03 = vld1_s16(in + 0);1220const int16x4_t in04_07 = vld1_s16(in + 4);1221const int16x4_t in08_11 = vld1_s16(in + 8);1222const int16x4_t in12_15 = vld1_s16(in + 12);1223const int32x4_t a0 = vaddl_s16(in00_03, in12_15); // in[0..3] + in[12..15]1224const int32x4_t a1 = vaddl_s16(in04_07, in08_11); // in[4..7] + in[8..11]1225const int32x4_t a2 = vsubl_s16(in04_07, in08_11); // in[4..7] - in[8..11]1226const int32x4_t a3 = vsubl_s16(in00_03, in12_15); // in[0..3] - in[12..15]1227tmp.val[0] = vaddq_s32(a0, a1);1228tmp.val[1] = vaddq_s32(a3, a2);1229tmp.val[2] = vsubq_s32(a0, a1);1230tmp.val[3] = vsubq_s32(a3, a2);1231// Arrange the temporary results column-wise.1232tmp = Transpose4x4_NEON(tmp);1233}12341235{1236const int32x4_t kCst3 = vdupq_n_s32(3);1237const int32x4_t dc = vaddq_s32(tmp.val[0], kCst3); // add rounder1238const int32x4_t a0 = vaddq_s32(dc, tmp.val[3]);1239const int32x4_t a1 = vaddq_s32(tmp.val[1], tmp.val[2]);1240const int32x4_t a2 = vsubq_s32(tmp.val[1], tmp.val[2]);1241const int32x4_t a3 = vsubq_s32(dc, tmp.val[3]);12421243tmp.val[0] = vaddq_s32(a0, a1);1244tmp.val[1] = vaddq_s32(a3, a2);1245tmp.val[2] = vsubq_s32(a0, a1);1246tmp.val[3] = vsubq_s32(a3, a2);12471248// right shift the results by 3.1249tmp.val[0] = vshrq_n_s32(tmp.val[0], 3);1250tmp.val[1] = vshrq_n_s32(tmp.val[1], 3);1251tmp.val[2] = vshrq_n_s32(tmp.val[2], 3);1252tmp.val[3] = vshrq_n_s32(tmp.val[3], 3);12531254STORE_WHT(out, 0, tmp);1255STORE_WHT(out, 1, tmp);1256STORE_WHT(out, 2, tmp);1257STORE_WHT(out, 3, tmp);1258}1259}12601261#undef STORE_WHT12621263//------------------------------------------------------------------------------12641265static void TransformAC3_NEON(const int16_t* WEBP_RESTRICT in,1266uint8_t* WEBP_RESTRICT dst) {1267const int16x4_t A = vld1_dup_s16(in);1268const int16x4_t c4 = vdup_n_s16(WEBP_TRANSFORM_AC3_MUL2(in[4]));1269const int16x4_t d4 = vdup_n_s16(WEBP_TRANSFORM_AC3_MUL1(in[4]));1270const int c1 = WEBP_TRANSFORM_AC3_MUL2(in[1]);1271const int d1 = WEBP_TRANSFORM_AC3_MUL1(in[1]);1272const uint64_t cd = (uint64_t)( d1 & 0xffff) << 0 |1273(uint64_t)( c1 & 0xffff) << 16 |1274(uint64_t)(-c1 & 0xffff) << 32 |1275(uint64_t)(-d1 & 0xffff) << 48;1276const int16x4_t CD = vcreate_s16(cd);1277const int16x4_t B = vqadd_s16(A, CD);1278const int16x8_t m0_m1 = vcombine_s16(vqadd_s16(B, d4), vqadd_s16(B, c4));1279const int16x8_t m2_m3 = vcombine_s16(vqsub_s16(B, c4), vqsub_s16(B, d4));1280Add4x4_NEON(m0_m1, m2_m3, dst);1281}12821283//------------------------------------------------------------------------------1284// 4x412851286static void DC4_NEON(uint8_t* dst) { // DC1287const uint8x8_t A = vld1_u8(dst - BPS); // top row1288const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top1289const uint16x4_t p1 = vpadd_u16(p0, p0);1290const uint8x8_t L0 = vld1_u8(dst + 0 * BPS - 1);1291const uint8x8_t L1 = vld1_u8(dst + 1 * BPS - 1);1292const uint8x8_t L2 = vld1_u8(dst + 2 * BPS - 1);1293const uint8x8_t L3 = vld1_u8(dst + 3 * BPS - 1);1294const uint16x8_t s0 = vaddl_u8(L0, L1);1295const uint16x8_t s1 = vaddl_u8(L2, L3);1296const uint16x8_t s01 = vaddq_u16(s0, s1);1297const uint16x8_t sum = vaddq_u16(s01, vcombine_u16(p1, p1));1298const uint8x8_t dc0 = vrshrn_n_u16(sum, 3); // (sum + 4) >> 31299const uint8x8_t dc = vdup_lane_u8(dc0, 0);1300int i;1301for (i = 0; i < 4; ++i) {1302vst1_lane_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(dc), 0);1303}1304}13051306// TrueMotion (4x4 + 8x8)1307static WEBP_INLINE void TrueMotion_NEON(uint8_t* dst, int size) {1308const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1); // top-left pixel 'A[-1]'1309const uint8x8_t T = vld1_u8(dst - BPS); // top row 'A[0..3]'1310const uint16x8_t d = vsubl_u8(T, TL); // A[c] - A[-1]1311int y;1312for (y = 0; y < size; y += 4) {1313// left edge1314const uint8x8_t L0 = vld1_dup_u8(dst + 0 * BPS - 1);1315const uint8x8_t L1 = vld1_dup_u8(dst + 1 * BPS - 1);1316const uint8x8_t L2 = vld1_dup_u8(dst + 2 * BPS - 1);1317const uint8x8_t L3 = vld1_dup_u8(dst + 3 * BPS - 1);1318// L[r] + A[c] - A[-1]1319const int16x8_t r0 = vreinterpretq_s16_u16(vaddw_u8(d, L0));1320const int16x8_t r1 = vreinterpretq_s16_u16(vaddw_u8(d, L1));1321const int16x8_t r2 = vreinterpretq_s16_u16(vaddw_u8(d, L2));1322const int16x8_t r3 = vreinterpretq_s16_u16(vaddw_u8(d, L3));1323// Saturate and store the result.1324const uint32x2_t r0_u32 = vreinterpret_u32_u8(vqmovun_s16(r0));1325const uint32x2_t r1_u32 = vreinterpret_u32_u8(vqmovun_s16(r1));1326const uint32x2_t r2_u32 = vreinterpret_u32_u8(vqmovun_s16(r2));1327const uint32x2_t r3_u32 = vreinterpret_u32_u8(vqmovun_s16(r3));1328if (size == 4) {1329vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0_u32, 0);1330vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1_u32, 0);1331vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2_u32, 0);1332vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3_u32, 0);1333} else {1334vst1_u32((uint32_t*)(dst + 0 * BPS), r0_u32);1335vst1_u32((uint32_t*)(dst + 1 * BPS), r1_u32);1336vst1_u32((uint32_t*)(dst + 2 * BPS), r2_u32);1337vst1_u32((uint32_t*)(dst + 3 * BPS), r3_u32);1338}1339dst += 4 * BPS;1340}1341}13421343static void TM4_NEON(uint8_t* dst) { TrueMotion_NEON(dst, 4); }13441345static void VE4_NEON(uint8_t* dst) { // vertical1346// NB: avoid vld1_u64 here as an alignment hint may be added -> SIGBUS.1347const uint64x1_t A0 = vreinterpret_u64_u8(vld1_u8(dst - BPS - 1)); // top row1348const uint64x1_t A1 = vshr_n_u64(A0, 8);1349const uint64x1_t A2 = vshr_n_u64(A0, 16);1350const uint8x8_t ABCDEFGH = vreinterpret_u8_u64(A0);1351const uint8x8_t BCDEFGH0 = vreinterpret_u8_u64(A1);1352const uint8x8_t CDEFGH00 = vreinterpret_u8_u64(A2);1353const uint8x8_t b = vhadd_u8(ABCDEFGH, CDEFGH00);1354const uint8x8_t avg = vrhadd_u8(b, BCDEFGH0);1355int i;1356for (i = 0; i < 4; ++i) {1357vst1_lane_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(avg), 0);1358}1359}13601361static void RD4_NEON(uint8_t* dst) { // Down-right1362const uint8x8_t XABCD_u8 = vld1_u8(dst - BPS - 1);1363const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8);1364const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32);1365const uint32_t I = dst[-1 + 0 * BPS];1366const uint32_t J = dst[-1 + 1 * BPS];1367const uint32_t K = dst[-1 + 2 * BPS];1368const uint32_t L = dst[-1 + 3 * BPS];1369const uint64x1_t LKJI____ =1370vcreate_u64((uint64_t)L | (K << 8) | (J << 16) | (I << 24));1371const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC);1372const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8));1373const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16));1374const uint8_t D = vget_lane_u8(XABCD_u8, 4);1375const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6);1376const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC);1377const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8);1378const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_);1379const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);1380const uint32x2_t r3 = vreinterpret_u32_u8(avg2);1381const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));1382const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));1383const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));1384vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0, 0);1385vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1, 0);1386vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2, 0);1387vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3, 0);1388}13891390static void LD4_NEON(uint8_t* dst) { // Down-left1391// Note using the same shift trick as VE4() is slower here.1392const uint8x8_t ABCDEFGH = vld1_u8(dst - BPS + 0);1393const uint8x8_t BCDEFGH0 = vld1_u8(dst - BPS + 1);1394const uint8x8_t CDEFGH00 = vld1_u8(dst - BPS + 2);1395const uint8x8_t CDEFGHH0 = vset_lane_u8(dst[-BPS + 7], CDEFGH00, 6);1396const uint8x8_t avg1 = vhadd_u8(ABCDEFGH, CDEFGHH0);1397const uint8x8_t avg2 = vrhadd_u8(avg1, BCDEFGH0);1398const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);1399const uint32x2_t r0 = vreinterpret_u32_u8(avg2);1400const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));1401const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));1402const uint32x2_t r3 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));1403vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0, 0);1404vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1, 0);1405vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2, 0);1406vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3, 0);1407}14081409//------------------------------------------------------------------------------1410// Chroma14111412static void VE8uv_NEON(uint8_t* dst) { // vertical1413const uint8x8_t top = vld1_u8(dst - BPS);1414int j;1415for (j = 0; j < 8; ++j) {1416vst1_u8(dst + j * BPS, top);1417}1418}14191420static void HE8uv_NEON(uint8_t* dst) { // horizontal1421int j;1422for (j = 0; j < 8; ++j) {1423const uint8x8_t left = vld1_dup_u8(dst - 1);1424vst1_u8(dst, left);1425dst += BPS;1426}1427}14281429static WEBP_INLINE void DC8_NEON(uint8_t* dst, int do_top, int do_left) {1430uint16x8_t sum_top;1431uint16x8_t sum_left;1432uint8x8_t dc0;14331434if (do_top) {1435const uint8x8_t A = vld1_u8(dst - BPS); // top row1436#if WEBP_AARCH641437const uint16_t p2 = vaddlv_u8(A);1438sum_top = vdupq_n_u16(p2);1439#else1440const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top1441const uint16x4_t p1 = vpadd_u16(p0, p0);1442const uint16x4_t p2 = vpadd_u16(p1, p1);1443sum_top = vcombine_u16(p2, p2);1444#endif1445}14461447if (do_left) {1448const uint8x8_t L0 = vld1_u8(dst + 0 * BPS - 1);1449const uint8x8_t L1 = vld1_u8(dst + 1 * BPS - 1);1450const uint8x8_t L2 = vld1_u8(dst + 2 * BPS - 1);1451const uint8x8_t L3 = vld1_u8(dst + 3 * BPS - 1);1452const uint8x8_t L4 = vld1_u8(dst + 4 * BPS - 1);1453const uint8x8_t L5 = vld1_u8(dst + 5 * BPS - 1);1454const uint8x8_t L6 = vld1_u8(dst + 6 * BPS - 1);1455const uint8x8_t L7 = vld1_u8(dst + 7 * BPS - 1);1456const uint16x8_t s0 = vaddl_u8(L0, L1);1457const uint16x8_t s1 = vaddl_u8(L2, L3);1458const uint16x8_t s2 = vaddl_u8(L4, L5);1459const uint16x8_t s3 = vaddl_u8(L6, L7);1460const uint16x8_t s01 = vaddq_u16(s0, s1);1461const uint16x8_t s23 = vaddq_u16(s2, s3);1462sum_left = vaddq_u16(s01, s23);1463}14641465if (do_top && do_left) {1466const uint16x8_t sum = vaddq_u16(sum_left, sum_top);1467dc0 = vrshrn_n_u16(sum, 4);1468} else if (do_top) {1469dc0 = vrshrn_n_u16(sum_top, 3);1470} else if (do_left) {1471dc0 = vrshrn_n_u16(sum_left, 3);1472} else {1473dc0 = vdup_n_u8(0x80);1474}14751476{1477const uint8x8_t dc = vdup_lane_u8(dc0, 0);1478int i;1479for (i = 0; i < 8; ++i) {1480vst1_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(dc));1481}1482}1483}14841485static void DC8uv_NEON(uint8_t* dst) { DC8_NEON(dst, 1, 1); }1486static void DC8uvNoTop_NEON(uint8_t* dst) { DC8_NEON(dst, 0, 1); }1487static void DC8uvNoLeft_NEON(uint8_t* dst) { DC8_NEON(dst, 1, 0); }1488static void DC8uvNoTopLeft_NEON(uint8_t* dst) { DC8_NEON(dst, 0, 0); }14891490static void TM8uv_NEON(uint8_t* dst) { TrueMotion_NEON(dst, 8); }14911492//------------------------------------------------------------------------------1493// 16x1614941495static void VE16_NEON(uint8_t* dst) { // vertical1496const uint8x16_t top = vld1q_u8(dst - BPS);1497int j;1498for (j = 0; j < 16; ++j) {1499vst1q_u8(dst + j * BPS, top);1500}1501}15021503static void HE16_NEON(uint8_t* dst) { // horizontal1504int j;1505for (j = 0; j < 16; ++j) {1506const uint8x16_t left = vld1q_dup_u8(dst - 1);1507vst1q_u8(dst, left);1508dst += BPS;1509}1510}15111512static WEBP_INLINE void DC16_NEON(uint8_t* dst, int do_top, int do_left) {1513uint16x8_t sum_top;1514uint16x8_t sum_left;1515uint8x8_t dc0;15161517if (do_top) {1518const uint8x16_t A = vld1q_u8(dst - BPS); // top row1519#if WEBP_AARCH641520const uint16_t p3 = vaddlvq_u8(A);1521sum_top = vdupq_n_u16(p3);1522#else1523const uint16x8_t p0 = vpaddlq_u8(A); // cascading summation of the top1524const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));1525const uint16x4_t p2 = vpadd_u16(p1, p1);1526const uint16x4_t p3 = vpadd_u16(p2, p2);1527sum_top = vcombine_u16(p3, p3);1528#endif1529}15301531if (do_left) {1532int i;1533sum_left = vdupq_n_u16(0);1534for (i = 0; i < 16; i += 8) {1535const uint8x8_t L0 = vld1_u8(dst + (i + 0) * BPS - 1);1536const uint8x8_t L1 = vld1_u8(dst + (i + 1) * BPS - 1);1537const uint8x8_t L2 = vld1_u8(dst + (i + 2) * BPS - 1);1538const uint8x8_t L3 = vld1_u8(dst + (i + 3) * BPS - 1);1539const uint8x8_t L4 = vld1_u8(dst + (i + 4) * BPS - 1);1540const uint8x8_t L5 = vld1_u8(dst + (i + 5) * BPS - 1);1541const uint8x8_t L6 = vld1_u8(dst + (i + 6) * BPS - 1);1542const uint8x8_t L7 = vld1_u8(dst + (i + 7) * BPS - 1);1543const uint16x8_t s0 = vaddl_u8(L0, L1);1544const uint16x8_t s1 = vaddl_u8(L2, L3);1545const uint16x8_t s2 = vaddl_u8(L4, L5);1546const uint16x8_t s3 = vaddl_u8(L6, L7);1547const uint16x8_t s01 = vaddq_u16(s0, s1);1548const uint16x8_t s23 = vaddq_u16(s2, s3);1549const uint16x8_t sum = vaddq_u16(s01, s23);1550sum_left = vaddq_u16(sum_left, sum);1551}1552}15531554if (do_top && do_left) {1555const uint16x8_t sum = vaddq_u16(sum_left, sum_top);1556dc0 = vrshrn_n_u16(sum, 5);1557} else if (do_top) {1558dc0 = vrshrn_n_u16(sum_top, 4);1559} else if (do_left) {1560dc0 = vrshrn_n_u16(sum_left, 4);1561} else {1562dc0 = vdup_n_u8(0x80);1563}15641565{1566const uint8x16_t dc = vdupq_lane_u8(dc0, 0);1567int i;1568for (i = 0; i < 16; ++i) {1569vst1q_u8(dst + i * BPS, dc);1570}1571}1572}15731574static void DC16TopLeft_NEON(uint8_t* dst) { DC16_NEON(dst, 1, 1); }1575static void DC16NoTop_NEON(uint8_t* dst) { DC16_NEON(dst, 0, 1); }1576static void DC16NoLeft_NEON(uint8_t* dst) { DC16_NEON(dst, 1, 0); }1577static void DC16NoTopLeft_NEON(uint8_t* dst) { DC16_NEON(dst, 0, 0); }15781579static void TM16_NEON(uint8_t* dst) {1580const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1); // top-left pixel 'A[-1]'1581const uint8x16_t T = vld1q_u8(dst - BPS); // top row 'A[0..15]'1582// A[c] - A[-1]1583const uint16x8_t d_lo = vsubl_u8(vget_low_u8(T), TL);1584const uint16x8_t d_hi = vsubl_u8(vget_high_u8(T), TL);1585int y;1586for (y = 0; y < 16; y += 4) {1587// left edge1588const uint8x8_t L0 = vld1_dup_u8(dst + 0 * BPS - 1);1589const uint8x8_t L1 = vld1_dup_u8(dst + 1 * BPS - 1);1590const uint8x8_t L2 = vld1_dup_u8(dst + 2 * BPS - 1);1591const uint8x8_t L3 = vld1_dup_u8(dst + 3 * BPS - 1);1592// L[r] + A[c] - A[-1]1593const int16x8_t r0_lo = vreinterpretq_s16_u16(vaddw_u8(d_lo, L0));1594const int16x8_t r1_lo = vreinterpretq_s16_u16(vaddw_u8(d_lo, L1));1595const int16x8_t r2_lo = vreinterpretq_s16_u16(vaddw_u8(d_lo, L2));1596const int16x8_t r3_lo = vreinterpretq_s16_u16(vaddw_u8(d_lo, L3));1597const int16x8_t r0_hi = vreinterpretq_s16_u16(vaddw_u8(d_hi, L0));1598const int16x8_t r1_hi = vreinterpretq_s16_u16(vaddw_u8(d_hi, L1));1599const int16x8_t r2_hi = vreinterpretq_s16_u16(vaddw_u8(d_hi, L2));1600const int16x8_t r3_hi = vreinterpretq_s16_u16(vaddw_u8(d_hi, L3));1601// Saturate and store the result.1602const uint8x16_t row0 = vcombine_u8(vqmovun_s16(r0_lo), vqmovun_s16(r0_hi));1603const uint8x16_t row1 = vcombine_u8(vqmovun_s16(r1_lo), vqmovun_s16(r1_hi));1604const uint8x16_t row2 = vcombine_u8(vqmovun_s16(r2_lo), vqmovun_s16(r2_hi));1605const uint8x16_t row3 = vcombine_u8(vqmovun_s16(r3_lo), vqmovun_s16(r3_hi));1606vst1q_u8(dst + 0 * BPS, row0);1607vst1q_u8(dst + 1 * BPS, row1);1608vst1q_u8(dst + 2 * BPS, row2);1609vst1q_u8(dst + 3 * BPS, row3);1610dst += 4 * BPS;1611}1612}16131614//------------------------------------------------------------------------------1615// Entry point16161617extern void VP8DspInitNEON(void);16181619WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitNEON(void) {1620VP8Transform = TransformTwo_NEON;1621VP8TransformAC3 = TransformAC3_NEON;1622VP8TransformDC = TransformDC_NEON;1623VP8TransformWHT = TransformWHT_NEON;16241625VP8VFilter16 = VFilter16_NEON;1626VP8VFilter16i = VFilter16i_NEON;1627VP8HFilter16 = HFilter16_NEON;1628#if !defined(WORK_AROUND_GCC)1629VP8HFilter16i = HFilter16i_NEON;1630#endif1631VP8VFilter8 = VFilter8_NEON;1632VP8VFilter8i = VFilter8i_NEON;1633#if !defined(WORK_AROUND_GCC)1634VP8HFilter8 = HFilter8_NEON;1635VP8HFilter8i = HFilter8i_NEON;1636#endif1637VP8SimpleVFilter16 = SimpleVFilter16_NEON;1638VP8SimpleHFilter16 = SimpleHFilter16_NEON;1639VP8SimpleVFilter16i = SimpleVFilter16i_NEON;1640VP8SimpleHFilter16i = SimpleHFilter16i_NEON;16411642VP8PredLuma4[0] = DC4_NEON;1643VP8PredLuma4[1] = TM4_NEON;1644VP8PredLuma4[2] = VE4_NEON;1645VP8PredLuma4[4] = RD4_NEON;1646VP8PredLuma4[6] = LD4_NEON;16471648VP8PredLuma16[0] = DC16TopLeft_NEON;1649VP8PredLuma16[1] = TM16_NEON;1650VP8PredLuma16[2] = VE16_NEON;1651VP8PredLuma16[3] = HE16_NEON;1652VP8PredLuma16[4] = DC16NoTop_NEON;1653VP8PredLuma16[5] = DC16NoLeft_NEON;1654VP8PredLuma16[6] = DC16NoTopLeft_NEON;16551656VP8PredChroma8[0] = DC8uv_NEON;1657VP8PredChroma8[1] = TM8uv_NEON;1658VP8PredChroma8[2] = VE8uv_NEON;1659VP8PredChroma8[3] = HE8uv_NEON;1660VP8PredChroma8[4] = DC8uvNoTop_NEON;1661VP8PredChroma8[5] = DC8uvNoLeft_NEON;1662VP8PredChroma8[6] = DC8uvNoTopLeft_NEON;1663}16641665#else // !WEBP_USE_NEON16661667WEBP_DSP_INIT_STUB(VP8DspInitNEON)16681669#endif // WEBP_USE_NEON167016711672